# ====Basics==== * Basics borrowed from Pablo Cordero


# R is best used as an interactive environment for statistical analysis.
# Don't think of it primarily as a compiler/interpreter for scripts!
# You should be spending most of your time in the REPL (Read-Eval-Print Loop).

# R as a calculator
log2(32)
sqrt(2)

# R as a graphics tool
# Define a vector named my.vector with five values
my.vector <- c(1, 3, 6, 4, 9)

# `cars` is an example data set that comes with R; peek at it first
head(cars)
dim(cars)

# Scatter plot of the two columns, then a histogram of one of them
plot(cars$speed, cars$dist, main = "cars", xlab = "speed", ylab = "dist")
hist(cars$speed)

# Primitive data types
# Numeric - floating point
# Integers
# Boolean values - TRUE, FALSE
# Special values - NA, Inf, -Inf
x <- 3.14159
x <- 1 / 0  # division by zero yields Inf; this replaces the value assigned above
y <- TRUE
!y  # logical negation
z <- NA  # NA marks a missing value
u <- 2.71828
v <- "The quick brown fox jumped over the lazy dog"  # a character string

# You can compare values using the usual binary infix operators, which return TRUE, FALSE or NA
x > u
x == u
x == z  # comparing against NA gives NA, not TRUE or FALSE

# There are also some handy tests you can use to detect special values.
is.na(x)
is.na(y)
is.na(z)
is.infinite(x)
is.infinite(y)

# Compound data - vectors, matrices, lists, data frames

# The most basic type of compound data in R is a vector.
# Vectors of numeric values
x <- c(1,2,3,4,5,6)

# Can also have vectors of boolean or string values.
x >= 3  # element-wise comparison gives a logical vector
y <- c("the", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "dog")

# You can specify a range using the ':' operator
x <- 1:6

# c(...) is a handy function for building vectors from other vectors
y <- c(1:6)  # note: this replaces the vector of words stored in y above
z <- c(1:3,c(4:6))  # nested c() calls are flattened into a single vector
s <- c(x,y)
c(x, y, s)

# Referencing elements of a vector using [...]
# Indices start at 1, not 0.
x[1]
x[2]
x[3]

# You can use variables as indices.
i <- 4
x[i]

# You can reference elements of a vector using
# a vector of indices:
x[1:3]
selection <- c(4:6)  # kept in a variable so it can be reused as an index later
x[selection]

# The elements of a vector can have symbolic names
# (one name per element, in order).
names(x) <- c('a', 'b', 'c', 'd', 'e', 'f')

# Now you can reference elements of the vector using
# the symbolic names.  This can be very handy when you
# have a big vector and you don't want to remember, e.g.
# the index that corresponds to your gene of interest.
x['c']
x[c('a', 'c')]  # a vector of names selects several elements at once

# Some special ways to build vectors
# Zero-filled / empty-string vectors of a given length
numeric(length = 10)
character(length = 10)
# Repeat a value (or a whole vector) a number of times
rep(NA, times = 10)
rep(1, times = 10)
rep(1:2, times = 10)
# Regular sequences with an explicit step size
seq(1, 10, by = 2)
seq(0, 10, by = 0.1)

# MODIFYING VECTORS
# Modifying elements of a vector
x[3] <- NA
x[5] <- 1/0

# You can modify multiple elements of a vector using
# a vector of the indices of the elements you want to modify.
x[selection] <- 10  # a single value is recycled into every selected position
x[selection] <- c(10:12)  # or supply one replacement value per selected position


# DOING STUFF TO VECTORS

# Sometimes, you want to know the indices of the elements of a vector
# which are NA, or Infinite, or whatever.  You can get a vector of the
# indices of the elements which are NA like this:
x['e'] <- NA
is.na(x)  # logical vector: TRUE where the element is NA
which(is.na(x))  # positions of the TRUE entries
x[is.na(x)] <- 0  # a logical vector also works as an index; here it replaces every NA

# Sorting
sort(x)  # returns the values in increasing order

# Order
order(x)  # returns the permutation of indices that sorts x
x[order(x)]  # indexing with that permutation gives the sorted values

# Matching the elements of two vectors to each other:
# match() reports, for each element of its first argument, the position
# at which it occurs in the second.
a <- letters[1:6]
b <- c("d", "e", "a")
match(x = b, table = a)
a[match(x = b, table = a)]

# Some convenient functions for operating on strings and vectors of strings.
# y was reassigned to the numbers 1:6 earlier in this script, so rebuild the
# vector of words here; otherwise paste() would join "1".."6" rather than words.
y <- c("the", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "dog")
# paste(..., collapse=) joins the elements of one vector into a single string.
z <- paste(y, collapse="_")
# strsplit() undoes the join; it returns a list holding one character vector.
strsplit(z, split="_")


# Vectorized operations - arithmetic, comparison and logical operators
# work element by element on whole vectors and return vectors.
x <- 1:6
z <- rev(x)
x[c(3, 6)] <- NA
x + 1
x + z
x - z
x * z
x > 3
!(x > 3)
# Summaries collapse a vector to one value; NA propagates unless it is removed.
sum(x)
sum(x[!is.na(x)])
sum(x, na.rm = TRUE)
mean(x)
mean(x, na.rm = TRUE)
var(x, na.rm = TRUE)


# MATRICES - n x m tables
m <- matrix(0, nrow=2, ncol=2)  # 2 x 2 matrix filled with zeros
m <- matrix(x, nrow=2)  # fill from x, column by column (the default)
m <- matrix(x, nrow=2, byrow=TRUE)  # fill from x, row by row
t(m)  # transpose
dim(m)  # number of rows and columns
m <- rbind(m, c(10:12))  # append a row
m <- cbind(m, c(13:15))  # append a column

# Referencing elements of a matrix
m[1,1]
m[1:3,1:3]
m <- m[,1:3]  # an empty index keeps everything along that dimension
m[1,]  # first row
m[,1]  # first column
m[ is.na(m) ] <- 0  # replace every NA entry with 0

# Read a tab-separated table from a URL and convert it to a matrix.
my.data <- as.matrix(read.table("http://www.stanford.edu/~kjung/my.data.txt", sep="\t"))

# The rows and columns of a matrix can have symbolic names
rownames(my.data)
colnames(my.data)
# Names can be used as indices in place of row/column numbers.
my.data['Shaggy', 'B_4']
my.data['Shaggy',]


# LISTS - generally used like a hash table / associative map,
# though a list is also ordered.
# Making a new list
x <- list(a = c(1, 2, 3), b = c("d", "e", "f"), c = "foo")
names(x)

# Referencing elements of a list: by position, by name, or with $
x[[1]]
x[["a"]]
x$a

# Modifying elements of a list
x[["a"]] <- 3.14159

# unlist()
# Some R functions return lists when what we want is a vector;
# unlist(...) flattens a list into a vector.
strsplit("a_b_c", split = "_")
unlist(strsplit("a_b_c", split = "_"))


# FACTORS - R's representation of categorical values.
# levels() lists the distinct categories of a factor.
f <- factor(rep(c("a", "b"), times = 10))
levels(f)

# DATA FRAMES
# Tables with heterogeneous columns.
# Read a tab-separated table from a URL; read.table returns a data frame.
pheno.data <- read.table("http://www.stanford.edu/~kjung/pheno.data.txt", sep="\t")

# Reference elements of a data frame like it was a matrix, or using
# column names
pheno.data[1:3,1:3]
pheno.data$plays.tuba  # $ extracts a single column by name

# ITERATING OVER DATA
# Three ways to compute the mean of every row of my.data, ignoring NAs.

# for loops
# Preallocate the result so the loop fills slots instead of growing a vector.
means <- numeric(nrow(my.data))
names(means) <- rownames(my.data)
# seq_len() yields an empty sequence for a zero-row matrix, whereas
# 1:nrow(my.data) would yield c(1, 0) and index out of bounds.
for (i in seq_len(nrow(my.data))) {
  means[i] <- mean(my.data[i,], na.rm=TRUE)
}

# apply
# MARGIN=1 applies FUN to each row; extra arguments (na.rm) are passed to FUN.
# No preallocation is needed: apply() builds and returns the result itself.
means <- apply(my.data, MARGIN=1, FUN=mean, na.rm=TRUE)

# rowMeans, colMeans
# Dedicated helpers for the common row/column mean case.
means <- rowMeans(my.data, na.rm=TRUE)

# lapply

# Printing stuff out

# cat - write its arguments to STDOUT, separated by `sep` (a space by default)
cat("a", "b", "\n")
cat("a", "b", "\n", sep = "_")
cat(1:4, "\n")


# sprintf - build a formatted string; wrap it in cat() to print it raw
u <- 3
v <- 3.14
s <- "Hello"
sprintf("%g %f %s", u, v, s)
sprintf("%g %.3f %s", u, v, s)
cat(sprintf("%g %.3f %s\n", u, v, s))

# Getting stuff into and out of R

# read.table, write.table
# We saw these above - read through the help page for these functions to get
# a feel for the available options.
# read.table
# write.table

# save, load - use these with large data structures
save(my.data, file="my.data.RData")  # write the object to a file
rm(my.data)  # remove the binding from the session
my.data  # expected to fail here: the object no longer exists
load(file="my.data.RData")  # restores the object under its original name
my.data


# If you want to save everything in your session, use save.image
save.image(file="my.image.RData")
ls()  # Lists all bindings in your session
rm(list=ls()) # Remove all bindings
load(file="my.image.RData") 
ls()  # the bindings saved by save.image are back

# ====Problem Set 1 Hints and Other Tricks====


# NOTE(review): hard-coded, machine-specific working directory; point this at
# wherever the homework files live on your machine.
setwd('/Users/liuyipei/BMI215/Module 1 - Nick/DrugSafety-Homework')
# Drug/event frequency table; later code uses its `singlet` and `freq` columns.
dz<-read.csv('single_drug_event_frequencies.csv')
head(dz)

# Cholesterol drugs, labelled 'choles'.
# header=FALSE keeps the first line of the file as data. (col.names is for
# supplying a vector of names; it does not switch the header off, so with
# read.csv's default header=TRUE the first line would be consumed as a header.)
# NOTE(review): assumes cholesterol_drugs.txt has no header line - confirm.
cd1<-read.csv('cholesterol_drugs.txt', header=FALSE)
colnames(cd1)<-c('singlet')
cd1$chole<-'choles'
head(cd1)  # cd0 does not exist yet at this point, so inspect cd1 here

# Every other drug that appears in dz is labelled 'noncholes'.
# chole=0 is only a placeholder column; it is replaced on the next line.
cd0<-data.frame(singlet=setdiff(as.character(unique(dz$singlet)), cd1$singlet),chole=0)
cd0$chole<-'noncholes'
# Stack both groups into one lookup table: drug name -> label.
cd.table<-rbind(cd1, cd0)
head(cd1)
head(cd0)
head(cd.table)

# Join the drug labels onto the frequency table. merge() joins on the columns
# the two data frames have in common (presumably just `singlet` - verify).
q1.table <- merge(cd.table, dz)
# Flag each row as high or low frequency using a 0.1 cutoff on `freq`.
q1.table$hifreq <- ifelse(q1.table$freq > 0.1, 'hifr', 'lofr')
head(q1.table)
nrow(q1.table)
ncol(q1.table)

# Summing a logical vector counts its TRUE entries.
sum(q1.table$chole == 'choles')
sum(q1.table$chole == 'noncholes')
# Cross-tabulate drug label against frequency flag.
table(q1.table$chole, q1.table$hifreq)

library(plyr)
# Summarise one piece of a data frame.
#   x: a data frame with a `chole` column.
# Returns a vector of three numbers: the row count, the column count, and
# 1/0 for whether the first `chole` value equals 'choles' (the logical is
# coerced to a number by c()).
hard.work <- function(x) {
  first.is.choles <- x$chole[1] == 'choles'
  c(nrow(x), ncol(x), first.is.choles)
}
# Split q1.table by drug (singlet), run hard.work on each piece, and collect
# the results into one data frame.
lets.talk.about.what.happened <- ddply(q1.table, .(singlet), .fun=hard.work)
# Inspect the raw result before its columns are given meaningful names.
head(lets.talk.about.what.happened)
dim(lets.talk.about.what.happened)
summary(lets.talk.about.what.happened)
# Name the grouping column and the three values returned by hard.work.
colnames(lets.talk.about.what.happened) <- c('singlet', 'row.count', 'col.count', 'ch.drug')

library(caTools)
# Eleven evenly spaced x values from 0 to 1 and a curve over them;
# trapz() integrates the curve with the trapezoid rule.
my.x <- (0:10) * 0.1
my.y <- c(0, 1, 3, 5, 6, 7, 7, 9, 9, 10, 10) * 0.1
plot(my.x, my.y)
trapz(my.x, my.y)

# Fisher's exact test on a 2 x 2 table of counts.
t <- data.frame(a = c(8, 12), b = c(31, 41))
t
f.t <- fisher.test(t)
# The result is a list-like object; ls() shows the names of its components.
ls(f.t)
f.t$p.value
f.t$conf.int