====Basics====
* Basics borrowed from Pablo Cordero
# R is best used as an interactive environment for statistical analysis.
# Don't think of it primarily as a compiler/interpreter for scripts!
# You should be spending most of your time in the REPL (Read-Eval-Print Loop).
# R as a calculator: type an expression and the REPL prints its value.
log2(32)
sqrt(2)
# R as a graphics tool
# Define a vector named my.vector with five values
my.vector <- c(1, 3, 6, 4, 9)
# R also comes with some ready-made data to play with, such as `cars`.
head(cars)
dim(cars)
# Scatter plot of the dist column against the speed column
plot(cars$speed, cars$dist,
     main = "cars", xlab = "speed", ylab = "dist")
# Histogram of the speed column
hist(cars$speed)
# Primitive data types
# Numeric - floating point
# Integers
# Boolean values - TRUE, FALSE
# Special values - NA, Inf, -Inf
x <- 3.14159
# Dividing by zero is not an error in R: x is rebound to Inf here, and Inf is
# the value that the comparisons and is.* tests below operate on.
x <- 1 / 0
y <- TRUE
# `!` negates a boolean value
!y
z <- NA
u <- 2.71828
# A character string
v <- "The quick brown fox jumped over the lazy dog"
# You can compare values using the usual binary infix operators, which return TRUE, FALSE or NA
x > u
x == u
# Comparing anything with NA gives NA, not TRUE or FALSE
x == z
# There are also some handy tests you can use to detect special values.
is.na(x)
is.na(y)
is.na(z)
is.infinite(x)
is.infinite(y)
# Compound data - vectors, matrices, lists, data frames
# The most basic type of compound data in R is a vector.
# Vectors of numeric values
x <- c(1,2,3,4,5,6)
# Can also have vectors of boolean or string values.
# A comparison is applied to every element, so this yields a boolean vector.
x >= 3
y <- c("the", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "dog")
# You can specify a range using the ':' operator
x <- 1:6
# c(...) is a handy function for building vectors from other vectors
# Note: the next line rebinds y, replacing the vector of words with the
# integers 1 to 6.
y <- c(1:6)
z <- c(1:3,c(4:6))
s <- c(x,y)
c(x, y, s)
# Referencing elements of a vector using [...]
# Indices start at 1.
x[1]
x[2]
x[3]
# You can use variables as indices.
i <- 4
x[i]
# You can reference elements of a vector using
# a vector of indices:
x[1:3]
selection <- c(4:6)
x[selection]
# The elements of a vector can have symbolic names
names(x) <- c('a', 'b', 'c', 'd', 'e', 'f')
# Now you can reference elements of the vector using
# the symbolic names. This can be very handy when you
# have a big vector and you don't want to remember, e.g.
# the index that corresponds to your gene of interest.
x['c']
# A vector of names selects several elements at once.
x[c('a', 'c')]
# Some special ways to build vectors
# numeric(n) gives n zeros; character(n) gives n empty strings.
numeric(10)
character(10)
# rep() repeats a value, or a whole vector, the given number of times.
rep(NA, 10)
rep(1, 10)
rep(1:2, 10)
# seq() builds an arithmetic sequence from `from` to `to` in steps of `by`.
seq(from=1,to=10, by=2)
seq(from=0, to=10, by=0.1)
# MODIFYING VECTORS
# Modifying elements of a vector
x[3] <- NA
x[5] <- 1/0
# You can modify multiple elements of a vector using
# a vector of the indices of the elements you want to modify.
# A single value on the right-hand side is assigned to every selected element.
x[selection] <- 10
# A vector on the right-hand side is assigned element by element.
x[selection] <- c(10:12)
# DOING STUFF TO VECTORS
# Sometimes, you want to know the indices of the elements of a vector
# which are NA, or Infinite, or whatever. You can get a vector of the
# indices of the elements which are NA like this:
x['e'] <- NA
# is.na() gives one TRUE/FALSE per element; which() turns that into indices.
is.na(x)
which(is.na(x))
# A boolean vector can also be used directly as an index, here to replace
# every NA with 0.
x[is.na(x)] <- 0
# Sorting
sort(x)
# Order
# order() returns the indices that would sort x, so x[order(x)] gives the
# elements of x in sorted order.
order(x)
x[order(x)]
# Matching the elements of two vectors to each other
a <- c('a', 'b', 'c', 'd', 'e', 'f')
b <- c('d', 'e', 'a')
# match(b, a) gives, for each element of b, its position in a.
match(b, a)
# Indexing a with those positions reproduces the elements of b.
a[match(b,a)]
# Some convenient functions for operating on strings and vectors of strings
# Use an explicit vector of words: `y` was rebound to the integers 1:6 earlier
# in this script, so pasting `y` would join numbers rather than words.
words <- c("the", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "dog")
# paste(..., collapse=) joins the elements of one vector into a single string.
z <- paste(words, collapse="_")
# strsplit() goes the other way. It returns a list holding one character
# vector per input string.
strsplit(z, split="_")
# Vectorized operations - many operations operate on vectors
# in an element-wise fashion, returning vectors.
x <- c(1:6)
z <- rev(x)
# Put NAs in positions 3 and 6 to show how missing values behave below.
x[c(3,6)] <- NA
# Arithmetic and comparisons work element by element; an NA element gives an
# NA result in that position.
x + 1
x + z
x - z
x * z
x > 3
!(x > 3)
# Summaries return NA as soon as any element is NA ...
sum(x)
# ... unless the NAs are dropped first, either by hand or with na.rm=TRUE.
sum(x[!is.na(x)])
sum(x, na.rm=TRUE)
mean(x)
mean(x, na.rm=TRUE)
var(x, na.rm=TRUE)
# MATRICES - n x m tables
# A 2 x 2 matrix filled with zeros
m <- matrix(0, nrow=2, ncol=2)
# Build a matrix from a vector; by default the values fill it column by column.
m <- matrix(x, nrow=2)
# byrow=TRUE fills it row by row instead.
m <- matrix(x, nrow=2, byrow=TRUE)
# Transpose, and dimensions (rows, columns)
t(m)
dim(m)
# rbind() appends a row, cbind() appends a column.
m <- rbind(m, c(10:12))
m <- cbind(m, c(13:15))
# Referencing elements of a matrix
# The index order is m[row, column].
m[1,1]
m[1:3,1:3]
# Leaving an index empty selects everything along that dimension.
m <- m[,1:3]
m[1,]
m[,1]
# A boolean matrix can be used as an index; here every NA is replaced by 0.
m[ is.na(m) ] <- 0
# Read a tab-separated table from a URL (this needs network access) and
# convert it to a matrix. Later sections of this script reuse my.data.
my.data <- as.matrix(read.table("http://www.stanford.edu/~kjung/my.data.txt", sep="\t"))
# The rows and columns of a matrix can have symbolic names
rownames(my.data)
colnames(my.data)
# Names can be used as indices, just like with vectors.
# NOTE(review): assumes the file has a row named 'Shaggy' and a column named
# 'B_4' - confirm against the data.
my.data['Shaggy', 'B_4']
my.data['Shaggy',]
# LISTS - Generally used like a hash table / associative map,
# though it is also an ordered list
# Making a new list
# Unlike a vector, the elements of a list can have different types and lengths.
x <- list(a=c(1,2,3), b=c('d', 'e', 'f'), c="foo")
names(x)
# Referencing elements of a list
# By position, by name, or with the $ shorthand for a name.
x[[1]]
x[['a']]
x$a
# Modifying elements of a list
x$a <- 3.14159
# unlist()
# Some R functions return lists, and we want to get vectors - we can
# convert a list into a vector using unlist(...)
strsplit("a_b_c", split="_")
unlist(strsplit("a_b_c", split="_"))
# FACTORS - R's representation of categorical values.
# Build a factor from a character vector of 20 values alternating 'a' and 'b'.
f <- factor(rep(c('a', 'b'), 10))
# levels() lists the distinct categories of the factor.
levels(f)
# DATA FRAMES
# Tables with heterogeneous columns.
# Read a tab-separated table from a URL (this needs network access).
pheno.data <- read.table("http://www.stanford.edu/~kjung/pheno.data.txt", sep="\t")
# Reference elements of a data frame like it was a matrix, or using
# column names
pheno.data[1:3,1:3]
# NOTE(review): assumes the file has a column named 'plays.tuba' - confirm
# against the data.
pheno.data$plays.tuba
# ITERATING OVER DATA
# for loops
# Preallocate the result vector, then fill it in one row at a time.
means <- numeric(nrow(my.data))
names(means) <- rownames(my.data)
# seq_len() is used rather than 1:nrow(...) so that the loop body is skipped
# when my.data has no rows (1:0 would iterate over c(1, 0)).
for (i in seq_len(nrow(my.data))) {
  means[i] <- mean(my.data[i,], na.rm=TRUE)
}
# apply
# apply() calls FUN on every row (MARGIN=1) and passes the extra arguments
# (here na.rm=TRUE) on to FUN. No preallocation is needed: apply() builds and
# returns the result itself.
means <- apply(my.data, MARGIN=1, FUN=mean, na.rm=TRUE)
# rowMeans, colMeans
# Ready-made shortcuts for the most common row-wise / column-wise summary.
means <- rowMeans(my.data, na.rm=TRUE)
# lapply
# lapply() calls a function on every element of a list and returns a list
# of the results, keeping the element names.
lapply(list(a=c(1,2,3), b=c(4,5,6)), mean)
# Printing stuff out
# cat - write the arguments to STDOUT; `sep` is placed between them
cat("a", "b", "\n")
cat("a", "b", "\n", sep = "_")
cat(1:4, "\n")
# sprintf - build a string from a C-style format and some values
u <- 3
v <- 3.14
s <- "Hello"
sprintf("%g %f %s", u, v, s)
sprintf("%g %.3f %s", u, v, s)
# sprintf only returns the string; wrap it in cat to print it
cat(sprintf("%g %.3f %s\n", u, v, s))
# Getting stuff into and out of R
# read.table, write.table
# We saw read.table above - read through the help pages for these functions
# to get a feel for the available options.
# read.table
# write.table
# save, load - use these with large data structures: save() writes R objects
# to a file and load() restores them under their original names.
save(my.data, file="my.data.RData")
rm(my.data)
# my.data is gone now. Check with exists() rather than by evaluating the name:
# evaluating a removed binding raises an error, which would abort this script
# when it is run with source().
exists("my.data")
load(file="my.data.RData")
my.data
# If you want to save everything in your session, use save.image
save.image(file="my.image.RData")
ls() # Lists all bindings in your session
# WARNING: the next line wipes every binding in your workspace. It is safe
# here only because save.image() has just written them all to disk.
rm(list=ls()) # Remove all bindings
load(file="my.image.RData")
ls()
====Problem Set 1 Hints and Other Tricks====
# NOTE(review): hard-coded, machine-specific path - change it to the directory
# that holds the homework files before running.
setwd('/Users/liuyipei/BMI215/Module 1 - Nick/DrugSafety-Homework')
# Event frequencies; the code below uses its `singlet` and `freq` columns.
dz <- read.csv('single_drug_event_frequencies.csv')
head(dz)
# List of cholesterol drugs, one name per line.
# header=FALSE keeps the first line as data and col.names names the single
# column. (Passing col.names=F does not switch the header off: read.csv still
# consumes the first line as a header, silently dropping the first drug.)
# NOTE(review): assumes the file has no header row - confirm against the file.
cd1 <- read.csv('cholesterol_drugs.txt', header=FALSE, col.names='singlet')
cd1$chole <- 'choles'
# Every drug that appears in dz but is not in the cholesterol list.
cd0 <- data.frame(singlet=setdiff(as.character(unique(dz$singlet)), cd1$singlet))
cd0$chole <- 'noncholes'
# Stack the two labelled lists into one lookup table: drug -> chole label.
cd.table <- rbind(cd1, cd0)
head(cd1)
head(cd0)
head(cd.table)
# merge() joins the two tables on the columns they have in common
# (presumably just `singlet` - verify with the head() output below).
q1.table <- merge(cd.table, dz)
# Label each row by whether its frequency exceeds 0.1.
q1.table$hifreq <- ifelse(q1.table$freq > 0.1, 'hifr', 'lofr')
head(q1.table)
nrow(q1.table)
ncol(q1.table)
# Row counts per drug class, then the 2 x 2 class-by-frequency table.
sum(q1.table$chole == 'choles')
sum(q1.table$chole == 'noncholes')
table(q1.table$chole, q1.table$hifreq)
library(plyr)
# Summarise one piece of a data frame.
#
# x: a data frame with a `chole` column.
# Returns an unnamed vector of three values: the number of rows of x, the
# number of columns of x, and whether the first `chole` value is 'choles'
# (the logical is coerced to 1/0 by c()).
hard.work <- function(x) {
  n.rows <- nrow(x)
  n.cols <- ncol(x)
  is.chole <- x$chole[1] == 'choles'
  c(n.rows, n.cols, is.chole)
}
# Split q1.table by drug (singlet), run hard.work on each piece, and collect
# the per-drug results into one data frame.
lets.talk.about.what.happened <- ddply(q1.table, .(singlet), .fun=hard.work)
head(lets.talk.about.what.happened)
dim(lets.talk.about.what.happened)
summary(lets.talk.about.what.happened)
# Give the result columns descriptive names.
colnames(lets.talk.about.what.happened) <- c('singlet', 'row.count', 'col.count', 'ch.drug')
library(caTools)
# Eleven evenly spaced x values from 0 to 1, and a y value for each
my.x <- (0:10) * 0.1
my.y <- c(0, 1, 3, 5, 6, 7, 7, 9, 9, 10, 10) * 0.1
plot(my.x, my.y)
# trapz() is called with the x values first and the y values second
trapz(my.x, my.y)
# A 2 x 2 table of counts, held in a data frame.
# NOTE: the name `t` is also the name of base R's transpose function; calls
# such as t(m) still work because R skips non-functions when looking up a call.
t <- data.frame(a = c(8, 12), b = c(31, 41))
t
# Fisher's exact test on the table; the result is a list-like object.
f.t <- fisher.test(t)
# List the components of the result, then pull out two of them.
ls(f.t)
f.t$p.value
f.t$conf.int