# ====Basics====
# * Basics borrowed from Pablo Cordero
#
# R is best used as an interactive environment for statistical analysis.
# Don't think of it primarily as a compiler/interpreter for scripts!
# You should be spending most of your time in the REPL (Read-Eval-Print Loop).

# R as a calculator
log2(32)
sqrt(2)

# R as a graphics tool
# Define a vector named my.vector with five values
my.vector <- c(1, 3, 6, 4, 9)

# Some basic variables that come with R to play with
# (`cars` is a data set that ships with R).
head(cars)
dim(cars)
plot(x = cars$speed, y = cars$dist, main = "cars", xlab = "speed", ylab = "dist")
hist(cars$speed)

# Primitive data types
# Numeric - floating point
# Integers
# Boolean values - TRUE, FALSE
# Special values - NA, Inf, -Inf
x <- 3.14159
x <- 1 / 0  # x is now Inf; the previous value is overwritten
y <- TRUE
!y
z <- NA
u <- 2.71828
v <- "The quick brown fox jumped over the lazy dog"

# You can compare values using the usual binary infix operators,
# which return TRUE, FALSE or NA
x > u
x == u
x == z  # comparing against NA yields NA

# There are also some handy tests you can use to detect special values.
is.na(x)
is.na(y)
is.na(z)
is.infinite(x)
is.infinite(y)

# Compound data - vectors, matrices, lists, data frames.
# The most basic type of compound data in R is a vector.
# Vectors of numeric values
x <- c(1, 2, 3, 4, 5, 6)

# Can also have vectors of boolean or string values.
x >= 3
y <- c("the", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "dog")

# You can specify a range using the ':' operator
x <- 1:6

# c(...) is a handy function for building vectors from other vectors
y <- c(1:6)
z <- c(1:3, c(4:6))
s <- c(x, y)
c(x, y, s)

# Referencing elements of a vector using [...]
x[1]
x[2]
x[3]

# You can use variables as indices.
i <- 4
x[i]

# You can reference elements of a vector using
# a vector of indices:
x[1:3]
selection <- c(4:6)
x[selection]

# The elements of a vector can have symbolic names
names(x) <- c('a', 'b', 'c', 'd', 'e', 'f')

# Now you can reference elements of the vector using
# the symbolic names.
# This can be very handy when you
# have a big vector and you don't want to remember, e.g.
# the index that corresponds to your gene of interest.
x['c']
x[c('a', 'c')]

# Some special ways to build vectors
numeric(10)
character(10)
rep(NA, 10)
rep(1, 10)
rep(1:2, 10)
seq(from = 1, to = 10, by = 2)
seq(from = 0, to = 10, by = 0.1)

# MODIFYING VECTORS
# Modifying elements of a vector
x[3] <- NA
x[5] <- 1 / 0

# You can modify multiple elements of a vector using
# a vector of the indices of elements you want to modify.
x[selection] <- 10        # a single value is recycled over all selected slots
x[selection] <- c(10:12)  # or supply one value per selected slot

# DOING STUFF TO VECTORS
# Sometimes, you want to know the indices of the elements of a vector
# which are NA, or Infinite, or whatever. You can get a vector of the
# indices of the elements which are NA like this:
x['e'] <- NA
is.na(x)         # logical mask, one entry per element
which(is.na(x))  # positions of the TRUE entries
x[is.na(x)] <- 0

# Sorting
sort(x)

# Order
order(x)
x[order(x)]

# Matching the elements of two vectors to each other
a <- c('a', 'b', 'c', 'd', 'e', 'f')
b <- c('d', 'e', 'a')
match(b, a)
a[match(b, a)]

# Some convenient functions for operating on strings and vectors of strings
# NOTE(review): y was last assigned c(1:6) earlier in this file, so z becomes
# "1_2_3_4_5_6"; the vector of words was probably intended here - confirm.
z <- paste(y, collapse = "_")
strsplit(z, split = "_")

# Vectorized operations - many operations operate on vectors
# in an element-wise fashion, returning vectors.
x <- c(1:6)
z <- rev(x)
x[c(3, 6)] <- NA
x + 1
x + z
x - z
x * z
x > 3
!(x > 3)

# Aggregates propagate NA unless the missing values are dropped explicitly.
sum(x)
sum(x[!is.na(x)])
sum(x, na.rm = TRUE)
mean(x)
mean(x, na.rm = TRUE)
var(x, na.rm = TRUE)

# MATRICES - n x m tables
m <- matrix(0, nrow = 2, ncol = 2)
m <- matrix(x, nrow = 2)                # filled column by column
m <- matrix(x, nrow = 2, byrow = TRUE)  # filled row by row
t(m)
dim(m)
m <- rbind(m, c(10:12))  # append a row    -> 3 x 3
m <- cbind(m, c(13:15))  # append a column -> 3 x 4

# Referencing elements of a matrix
m[1, 1]
m[1:3, 1:3]
m <- m[, 1:3]
m[1, ]
m[, 1]
m[is.na(m)] <- 0

# Reads a tab-separated table over the network (requires internet access).
my.data <- as.matrix(
  read.table("http://www.stanford.edu/~kjung/my.data.txt", sep = "\t")
)

# The rows and columns of a matrix can have symbolic names
rownames(my.data)
colnames(my.data)
my.data['Shaggy', 'B_4']
my.data['Shaggy', ]

# LISTS - Generally used like a hash table / associative map,
# though it is also an ordered list
# Making a new list
x <- list(a = c(1, 2, 3), b = c('d', 'e', 'f'), c = "foo")
names(x)

# Referencing elements of a list
x[[1]]
x[['a']]
x$a

# Modifying elements of a list
x$a <- 3.14159

# unlist()
# Some R functions return lists, and we want to get vectors - we can
# convert a list into a vector using unlist(...)
strsplit("a_b_c", split = "_")
unlist(strsplit("a_b_c", split = "_"))

# FACTORS - R's representation of categorical values.
f <- factor(rep(c('a', 'b'), 10))
levels(f)

# DATA FRAMES
# Tables with heterogeneous columns.
# Reads a tab-separated table over the network (requires internet access).
pheno.data <- read.table("http://www.stanford.edu/~kjung/pheno.data.txt", sep = "\t")

# Reference elements of a data frame like it was a matrix, or using
# column names
pheno.data[1:3, 1:3]
pheno.data$plays.tuba

# ITERATING OVER DATA
# for loops
means <- numeric(nrow(my.data))  # preallocate the result vector
names(means) <- rownames(my.data)
# seq_len() yields an empty sequence for a 0-row matrix, whereas 1:0 would
# iterate over c(1, 0).
for (i in seq_len(nrow(my.data))) {
  means[i] <- mean(my.data[i, ], na.rm = TRUE)
}

# apply (MARGIN = 1 applies the function to each row)
means <- apply(my.data, MARGIN = 1, FUN = mean, na.rm = TRUE)

# rowMeans, colMeans
means <- rowMeans(my.data, na.rm = TRUE)

# lapply

# Printing stuff out
# cat - print something to STDOUT
cat('a', 'b', '\n')
cat('a', 'b', '\n', sep = "_")
cat(c(1:4), "\n")

# sprintf
u <- 3
v <- 3.14
s <- "Hello"
sprintf("%g %f %s", u, v, s)
sprintf("%g %.3f %s", u, v, s)
cat(sprintf("%g %.3f %s\n", u, v, s))

# Getting stuff into and out of R
# read.table, write.table
# We saw these above - read through the help page for these functions to get
# a feel for the available options.
# read.table
# write.table

# save, load - use these with large data structures.
save(my.data, file = "my.data.RData")
rm(my.data)
# The binding is gone now. exists() reports that without raising the
# "object not found" error that evaluating `my.data` would, which would
# abort a non-interactive run.
exists("my.data")
load(file = "my.data.RData")
my.data

# If you want to save everything in your session, use save.image
save.image(file = "my.image.RData")
ls()             # Lists all bindings in your session
rm(list = ls())  # Remove all bindings
load(file = "my.image.RData")
ls()

# ====Problem Set 1 Hints and Other Tricks====
# NOTE(review): hard-coded, machine-specific working directory - adjust this
# path to wherever the homework files live before running.
setwd('/Users/liuyipei/BMI215/Module 1 - Nick/DrugSafety-Homework')
dz <- read.csv('single_drug_event_frequencies.csv')
head(dz)

# NOTE(review): `col.names` expects a vector of column names, not a flag, and
# read.csv() still treats the first line as a header here. If the file has no
# header row, `header = FALSE` was probably intended - confirm against the file.
cd1 <- read.csv('cholesterol_drugs.txt', col.names = FALSE)
colnames(cd1) <- c('singlet')
cd1$chole <- 'choles'
head(cd1)  # cd0 does not exist yet at this point, so inspect cd1

# Every drug in dz that is not in the cholesterol list is labelled 'noncholes'.
cd0 <- data.frame(
  singlet = setdiff(as.character(unique(dz$singlet)), cd1$singlet),
  chole = 0
)
cd0$chole <- 'noncholes'
cd.table <- rbind(cd1, cd0)
head(cd1)
head(cd0)
head(cd.table)

# merge() joins on the columns the two data frames have in common.
q1.table <- merge(cd.table, dz)
q1.table$hifreq <- ifelse(q1.table$freq > 0.1, 'hifr', 'lofr')
head(q1.table)
nrow(q1.table)
ncol(q1.table)
sum(q1.table$chole == 'choles')
sum(q1.table$chole == 'noncholes')
table(q1.table$chole, q1.table$hifreq)

library(plyr)

# Per-group summary passed to ddply() below.
# x: the subset of q1.table for one group.
# Returns a vector of three values: the number of rows in x, the number of
# columns in x, and whether the first `chole` entry equals 'choles'
# (c() coerces that logical to 1/0).
hard.work <- function(x) {
  c(nrow(x), ncol(x), x$chole[1] == 'choles')
}

# Split q1.table by `singlet`, apply hard.work to each piece, and combine the
# results into one data frame.
lets.talk.about.what.happened <- ddply(q1.table, .(singlet), .fun = hard.work)
head(lets.talk.about.what.happened)
dim(lets.talk.about.what.happened)
summary(lets.talk.about.what.happened)
colnames(lets.talk.about.what.happened) <-
  c('singlet', 'row.count', 'col.count', 'ch.drug')

library(caTools)

# Area under a curve via trapz(), given matching x and y vectors.
my.x <- c(0:10) * 0.1
my.y <- c(0, 1, 3, 5, 6, 7, 7, 9, 9, 10, 10) * 0.1
plot(my.x, my.y)
trapz(my.x, my.y)

# Fisher's exact test on a 2 x 2 table of counts.
# NOTE(review): `t` shares its name with base R's transpose function t();
# the name is kept here in case later code refers to it.
t <- data.frame(a = c(8, 12), b = c(31, 41))
t
f.t <- fisher.test(t)
ls(f.t)  # lists the names of the components of the test result
f.t$p.value
f.t$conf.int