#some doodles: an intro to R programming #assigning variable > b <- 21 > b [1] 21 #vector > l <- c(21, 10,5) > l [1] 21 10 5 ##indexing is 1-based; l[0] returns an empty vector > l [1] 21 10 5 > l[0] numeric(0) > l[1] [1] 21 > > l[2] [1] 10 # > l [1] 21 10 5 > l[c(1,2)] [1] 21 10 #sequence > seq(from=0, to=20, by=1.0) [1] 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 #repeat > rep(c(4,2), each=3) [1] 4 4 4 2 2 2 > > rep(c(4,2), times=3) [1] 4 2 4 2 4 2 > #random permutation of the values 0 to 5 > sample(0:5) [1] 1 3 0 2 5 4 > sample(0:5) [1] 2 3 4 0 1 5 #vector access #getting values > l [1] 21 10 5 > l[l==5] [1] 5 > l[l <= 10] [1] 10 5 #getting indices > which(l <= 10) [1] 2 3 > which.min(l) [1] 3 > which.max(l) [1] 1 > l [1] 21 10 5 > length(l) [1] 3 #matrix > dat <- c(21, 10, 5, 25, 30, 45) > dat [1] 21 10 5 25 30 45 > m1 <- matrix(data=dat, nrow=2, ncol=3, byrow=TRUE, dimnames=list(rows=c("rowA","rowB"), cols=c("colA", "colB", "colC")) ) > > m1 cols rows colA colB colC rowA 21 10 5 rowB 25 30 45 #accessing matrix entries > m1 cols rows colA colB colC rowA 21 10 5 rowB 25 30 45 > m1[1,1] [1] 21 > m1[2,1:2] colA colB 25 30 > m1[,"colB"] rowA rowB 10 30 > > m1["rowB",] colA colB colC 25 30 45 > m1["rowB", c(1,2)] colA colB 25 30 > > m1 cols rows colA colB colC rowA 21 10 5 rowB 25 30 45 > m1 cols rows colA colB colC rowA 21 10 5 rowB 25 30 45 > > dim(m1) [1] 2 3 > nrow(m1) [1] 2 > ncol(m1) [1] 3 > colnames(m1) [1] "colA" "colB" "colC" > rownames(m1) [1] "rowA" "rowB" #matrix multiplication > m1 <- matrix( c(1,0,1,1), 2,2, byrow=TRUE) > m1 [,1] [,2] [1,] 1 0 [2,] 1 1 > m2 <- matrix( c(1,1,0,1), 2,2, byrow=TRUE) > m2 [,1] [,2] [1,] 1 1 [2,] 0 1 > > m1 %*% m2 [,1] [,2] [1,] 1 1 [2,] 1 2 #transpose #rows become columns > m2 [,1] [,2] [1,] 1 1 [2,] 0 1 > t(m2) #transpose [,1] [,2] [1,] 1 0 [2,] 1 1 #summing parts of matrix > apply(m2, 1, sum) #sum of each row [1] 2 1 > > apply(m2, 2, sum) #sum of each column [1] 1 2 > m2 [,1] [,2] [1,] 1 1 [2,] 0 1 #applying operation to matrix > m1 <- matrix(data=dat, nrow=2, 
ncol=3, byrow=TRUE, dimnames=list(rows=c("rowA","rowB"), cols=c("colA", "colB", "colC")) ) > > m1 cols rows colA colB colC rowA 21 10 5 rowB 25 30 45 > m1 cols rows colA colB colC rowA 21 10 5 rowB 25 30 45 > apply(m1, 1, function(x) return(sum(x)+5) ) rowA rowB 41 105 ## > m2 <- matrix( c(2,3,1,0), 2, 2, byrow=TRUE) > > m2 [,1] [,2] [1,] 2 3 [2,] 1 0 > apply(m2, 1, function(n) return(sum(n^2)) ) #first square each element, then sum across each row [1] 13 1 > apply(m2, 1, function(n) return(sum(n)^2) ) #first sum across each row, then square the result [1] 25 1 #joining matrices > m2 [,1] [,2] [1,] 2 3 [2,] 1 0 > > m3 <- matrix(c(5, 2, 4, 10), 2, 2, byrow=TRUE) > m3 [,1] [,2] [1,] 5 2 [2,] 4 10 > rbind(m2, m3) [,1] [,2] [1,] 2 3 [2,] 1 0 [3,] 5 2 [4,] 4 10 # > m2 [,1] [,2] [1,] 2 3 [2,] 1 0 > m3 [,1] [,2] [1,] 5 2 [2,] 4 10 > > cbind(m2, m3) [,1] [,2] [,3] [,4] [1,] 2 3 5 2 [2,] 1 0 4 10 #convert matrix to vector > m2 [,1] [,2] [1,] 2 3 [2,] 1 0 > as.vector(m2) [1] 2 1 3 0 #list of matrices > lmulti <- list( matrix(3,3,3), matrix(2,3,3)) > lmulti [[1]] [,1] [,2] [,3] [1,] 3 3 3 [2,] 3 3 3 [3,] 3 3 3 [[2]] [,1] [,2] [,3] [1,] 2 2 2 [2,] 2 2 2 [3,] 2 2 2 > lmulti[[1]] [,1] [,2] [,3] [1,] 3 3 3 [2,] 3 3 3 [3,] 3 3 3 > lmulti[[2]] [,1] [,2] [,3] [1,] 2 2 2 [2,] 2 2 2 [3,] 2 2 2 #data frame: like a matrix with labeled rows/columns > varA <- c(3,5) > varB <- c(1,0) > varC <- c(21,10) > d1 <- data.frame(varA, varB, varC) > > d1 varA varB varC 1 3 1 21 2 5 0 10 > rownames(d1) <- c("rowA", "rowB") > d1 varA varB varC rowA 3 1 21 rowB 5 0 10 > #getting a row or column by variable name > d1 varA varB varC rowA 3 1 21 rowB 5 0 10 > d1["rowB", "varC"] [1] 10 > d1[,"varC"] [1] 21 10 > d1 varA varB varC rowA 3 1 21 rowB 5 0 10 > d1[order(d1[,"varC"]), ] #order by 3rd column smallest to largest varA varB varC rowB 5 0 10 rowA 3 1 21 > order(d1[, "varC"]) #gives the row indices that sort varC in ascending order [1] 2 1 > d1 varA varB varC rowA 3 1 21 rowB 5 0 10 > d1[,"varB"] #get varB column [1] 1 0 > d1[, c("varB","varC")] #get 
two columns varB varC rowA 1 21 rowB 0 10 > d1 varA varB varC rowA 3 1 21 rowB 5 0 10 #accessing element like matrix > d1 varA varB varC rowA 3 1 21 rowB 5 0 10 > > d1["rowB", "varA"] [1] 5 #getting elements of data frame > id <- c(0,1) > varA <- c(25,10) > varB <- c(3, 2) > varC <- c(21, 3) > d2 <- data.frame(id, varA, varB, varC) > d2 id varA varB varC 1 0 25 3 21 2 1 10 2 3 > d2[d2$id==1, ] id varA varB varC 2 1 10 2 3 > > d2[d2$id<=1, ] id varA varB varC 1 0 25 3 21 2 1 10 2 3 #ifelse (elementwise: 1 where the value is > 10, otherwise 0) > d2 id varA varB varC 1 0 25 3 21 2 1 10 2 3 > > ifelse( d2 > 10, 1, 0) id varA varB varC [1,] 0 1 0 1 [2,] 0 0 0 0 #loop > for(n in 0:5){print(n)} [1] 0 [1] 1 [1] 2 [1] 3 [1] 4 [1] 5 #if/else > a <- 10 > if( a == 10 ){ print("a is 10")}else{ print("a not 10") } [1] "a is 10" #function > fun <- function(arg){print(arg)} > fun("yaaay! a function") [1] "yaaay! a function" #writing dataframe to a textfile > d2 id varA varB varC 1 0 25 3 21 2 1 10 2 3 > write.table(d2, file="/Users/Nathaniel/Documents/src_r/data/temp.txt", append=FALSE, quote=FALSE, sep="\t", row.names=FALSE, col.names=TRUE) > ## nathanismacbook:data Nathaniel$ cat temp.txt id varA varB varC 0 25 3 21 1 10 2 3 ## #reading into data frame > d3 = read.table(file="/Users/Nathaniel/Documents/src_r/data/temp.txt", header=TRUE, sep="\t", skip=0) > d3 id varA varB varC 1 0 25 3 21 2 1 10 2 3 #plotting (scatter plot and histogram) > par(mfrow=c(2,1)) #2 by 1 graphs > a <- 0:10 > b <- 5:15 > length(a) [1] 11 > length(b) [1] 11 > plot(a,b, type="p", col="black", main="Test Plot", xlab="x label", ylab="y label") > > lines(a,b) > c <- rnorm(50, mean=10, sd=2) > hist(c, main="Histogram", xlab="c label", ylab="counts") > #standard deviation: by-hand formula vs sd() (a is c(10,3,2) here; it is assigned in the section just below) > sqrt( ((10-5)^2 + (3-5)^2 + (2-5)^2)/(3-1) ) [1] 4.358899 > a [1] 10 3 2 > sd(a) [1] 4.358899 #summary statistics (product, sum, mean) > a <- c(10,3,2) > prod(a) [1] 60 > sum(a) [1] 15 > mean(a) [1] 5 > (10+3+2)/3 [1] 5 #min and max > a [1] 10 3 2 > range(a) [1] 2 10 #boolean (logical) indexing > a[c(TRUE,FALSE,TRUE)] [1] 10 2 > a [1] 10 3 2 
#doodling: summary() of a data frame > d2 id varA varB varC 1 0 25 3 21 2 1 10 2 3 > > summary(d2) id varA varB varC Min. :0.00 Min. :10.00 Min. :2.00 Min. : 3.0 1st Qu.:0.25 1st Qu.:13.75 1st Qu.:2.25 1st Qu.: 7.5 Median :0.50 Median :17.50 Median :2.50 Median :12.0 Mean :0.50 Mean :17.50 Mean :2.50 Mean :12.0 3rd Qu.:0.75 3rd Qu.:21.25 3rd Qu.:2.75 3rd Qu.:16.5 Max. :1.00 Max. :25.00 Max. :3.00 Max. :21.0 > > summary(d2[,-1]) varA varB varC Min. :10.00 Min. :2.00 Min. : 3.0 1st Qu.:13.75 1st Qu.:2.25 1st Qu.: 7.5 Median :17.50 Median :2.50 Median :12.0 Mean :17.50 Mean :2.50 Mean :12.0 3rd Qu.:21.25 3rd Qu.:2.75 3rd Qu.:16.5 Max. :25.00 Max. :3.00 Max. :21.0 > #correlation > cor( sample(0:5), sample(0:5) ) [1] 0.4857143 #logistic regression > y=round(runif(10, 0,1)) > x = y+5+rnorm(10, mean=0, sd=0.5) > d = data.frame( x,y ) > d x y 1 5.531844 0 2 6.575968 1 3 6.110443 1 4 4.038395 0 5 4.961647 0 6 6.390256 1 7 5.399595 0 8 4.613426 0 9 5.217149 0 10 5.754785 1 > logistic = glm( y ~ x, data=d, family=binomial) Warning messages: 1: glm.fit: algorithm did not converge 2: glm.fit: fitted probabilities numerically 0 or 1 occurred #note: these warnings appear because x perfectly separates y in this sample (every y=1 has x > 5.75, every y=0 has x < 5.54), so the fit cannot converge > summary(logistic) Call: glm(formula = y ~ x, family = binomial, data = d) Deviance Residuals: Min 1Q Median 3Q -2.412e-05 -2.110e-08 -2.110e-08 2.110e-08 Max 2.224e-05 Coefficients: Estimate Std. 
Error z value Pr(>|z|) (Intercept) -1115.7 1326088.2 -0.001 0.999 x 197.7 235313.4 0.001 0.999 (Dispersion parameter for binomial family taken to be 1) Null deviance: 1.3460e+01 on 9 degrees of freedom Residual deviance: 1.0764e-09 on 8 degrees of freedom AIC: 4 Number of Fisher Scoring iterations: 25 #fitted probabilities P(y=1 | x) > predict(logistic, type="response") 1 2 3 4 2.908054e-10 1.000000e+00 1.000000e+00 2.220446e-16 5 6 7 8 2.220446e-16 1.000000e+00 2.220446e-16 2.220446e-16 9 10 2.220446e-16 1.000000e+00 #inspired by #Cosma #http://www.science.smith.edu/~jcrouser/SDS293/labs/lab4-r.html #linear regression > x <- sample(0:5) > y <- x+rnorm(6, mean=0, sd=1) > > x [1] 2 1 0 5 4 3 > y [1] -1.078103 1.260372 2.112437 4.518225 4.596164 [6] 3.582079 > > d1 <- data.frame(x,y) > d1 x y 1 2 -1.078103 2 1 1.260372 3 0 2.112437 4 5 4.518225 5 4 4.596164 6 3 3.582079 > plot(d1[,"x"],d1[,"y"], type="p") > reg <- lm(y ~ x, data=d1) > reg Call: lm(formula = y ~ x, data = d1) Coefficients: (Intercept) x 0.5916 0.7628 > summary(reg) Call: lm(formula = y ~ x, data = d1) Residuals: 1 2 3 4 5 6 -3.19525 -0.09402 1.52080 0.11280 0.95350 0.70217 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 0.5916 1.3514 0.438 0.684 x 0.7628 0.4464 1.709 0.163 Residual standard error: 1.867 on 4 degrees of freedom Multiple R-squared: 0.422, Adjusted R-squared: 0.2775 F-statistic: 2.92 on 1 and 4 DF, p-value: 0.1627 > > attributes(reg) $names [1] "coefficients" "residuals" "effects" [4] "rank" "fitted.values" "assign" [7] "qr" "df.residual" "xlevels" [10] "call" "terms" "model" $class [1] "lm" > reg$residuals 1 2 3 4 -3.19525357 -0.09402159 1.52080078 0.11280307 5 6 0.95349945 0.70217186 > hist(reg$residuals) > par(mfrow=c(2,1)) > plot(reg) #plotting variables in data set > pairs(d1[,c("x","y")]) > plot(d1[,"x"], d1[,"y"], pch=16) #closed circles > abline(0,1, lwd=2, col=2) #adding the line y = x (intercept 0, slope 1) to the plot #inspired by #https://www.stat.cmu.edu/~larry/=stat401/Stat401Rlab2016.pdf Happy Sketching! 
#inspired by #https://www.cs.cmu.edu/~02710/Lectures/R_recitation.pdf #https://www.stat.cmu.edu/~hseltman/Rclass/R1.R #https://www.andrew.cmu.edu/user/achoulde/94842/lectures/lecture09/lecture09-94842.html #inspired by #Larry Wasserman #Brian Junker #Cosma Shalizi #Joel Greenhouse #Howard Seltman