#some doodles: an intro to R programming (console transcript; lines starting with ">" are input, the rest is output)
#assigning variable
> b <- 21
> b
[1] 21
#vector
> l <- c(21, 10,5)
> l
[1] 21 10 5
##indexing is 1-based; l[0] returns an empty vector
> l
[1] 21 10 5
> l[0]
numeric(0)
> l[1]
[1] 21
>
> l[2]
[1] 10
#
> l
[1] 21 10 5
> l[c(1,2)]
[1] 21 10
#sequence
> seq(from=0, to=20, by=1.0)
[1] 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
#repeat
> rep(c(4,2), each=3)
[1] 4 4 4 2 2 2
>
> rep(c(4,2), times=3)
[1] 4 2 4 2 4 2
>
#random permutation of a vector (sampling without replacement)
> sample(0:5)
[1] 1 3 0 2 5 4
> sample(0:5)
[1] 2 3 4 0 1 5
#vector access
#getting values
> l
[1] 21 10 5
> l[l==5]
[1] 5
> l[l <= 10]
[1] 10 5
#getting indexes
> which(l <= 10)
[1] 2 3
> which.min(l)
[1] 3
> which.max(l)
[1] 1
> l
[1] 21 10 5
> length(l)
[1] 3
#matrix
> dat <- c(21, 10, 5, 25, 30, 45)
> dat
[1] 21 10 5 25 30 45
> m1 <- matrix(data=dat, nrow=2, ncol=3, byrow=TRUE, dimnames=list(rows=c("rowA","rowB"), cols=c("colA", "colB", "colC")) )
>
> m1
cols
rows colA colB colC
rowA 21 10 5
rowB 25 30 45
#accessing matrix entries
> m1
cols
rows colA colB colC
rowA 21 10 5
rowB 25 30 45
> m1[1,1]
[1] 21
> m1[2,1:2]
colA colB
25 30
> m1[,"colB"]
rowA rowB
10 30
>
> m1["rowB",]
colA colB colC
25 30 45
> m1["rowB", c(1,2)]
colA colB
25 30
>
> m1
cols
rows colA colB colC
rowA 21 10 5
rowB 25 30 45
> m1
cols
rows colA colB colC
rowA 21 10 5
rowB 25 30 45
>
> dim(m1)
[1] 2 3
> nrow(m1)
[1] 2
> ncol(m1)
[1] 3
> colnames(m1)
[1] "colA" "colB" "colC"
> rownames(m1)
[1] "rowA" "rowB"
#matrix multiplication
> m1 <- matrix( c(1,0,1,1), 2,2, byrow=TRUE)
> m1
[,1] [,2]
[1,] 1 0
[2,] 1 1
> m2 <- matrix( c(1,1,0,1), 2,2, byrow=TRUE)
> m2
[,1] [,2]
[1,] 1 1
[2,] 0 1
>
> m1 %*% m2
[,1] [,2]
[1,] 1 1
[2,] 1 2
#transpose: rows become columns
> m2
[,1] [,2]
[1,] 1 1
[2,] 0 1
> t(m2) #transpose
[,1] [,2]
[1,] 1 0
[2,] 1 1
#summing parts of matrix
> apply(m2, 1, sum) #sum of each row
[1] 2 1
>
> apply(m2, 2, sum) #sum of each column
[1] 1 2
> m2
[,1] [,2]
[1,] 1 1
[2,] 0 1
#applying operation to matrix
> m1 <- matrix(data=dat, nrow=2, ncol=3, byrow=TRUE, dimnames=list(rows=c("rowA","rowB"), cols=c("colA", "colB", "colC")) )
>
> m1
cols
rows colA colB colC
rowA 21 10 5
rowB 25 30 45
> m1
cols
rows colA colB colC
rowA 21 10 5
rowB 25 30 45
> apply(m1, 1, function(x) return(sum(x)+5) )
rowA rowB
41 105
##
> m2 <- matrix( c(2,3,1,0), 2, 2, byrow=TRUE)
>
> m2
[,1] [,2]
[1,] 2 3
[2,] 1 0
> apply(m2, 1, function(n) return(sum(n^2)) ) #first square each element, then sum across each row
[1] 13 1
> apply(m2, 1, function(n) return(sum(n)^2) ) #first sum across each row, then square the result
[1] 25 1
#joining matrices
> m2
[,1] [,2]
[1,] 2 3
[2,] 1 0
>
> m3 <- matrix(c(5, 2, 4, 10), 2, 2, byrow=TRUE)
> m3
[,1] [,2]
[1,] 5 2
[2,] 4 10
> rbind(m2, m3)
[,1] [,2]
[1,] 2 3
[2,] 1 0
[3,] 5 2
[4,] 4 10
#
> m2
[,1] [,2]
[1,] 2 3
[2,] 1 0
> m3
[,1] [,2]
[1,] 5 2
[2,] 4 10
>
> cbind(m2, m3)
[,1] [,2] [,3] [,4]
[1,] 2 3 5 2
[2,] 1 0 4 10
#convert matrix to vector (elements are read column by column)
> m2
[,1] [,2]
[1,] 2 3
[2,] 1 0
> as.vector(m2)
[1] 2 1 3 0
#list of matrices
> lmulti <- list( matrix(3,3,3), matrix(2,3,3))
> lmulti
[[1]]
[,1] [,2] [,3]
[1,] 3 3 3
[2,] 3 3 3
[3,] 3 3 3
[[2]]
[,1] [,2] [,3]
[1,] 2 2 2
[2,] 2 2 2
[3,] 2 2 2
> lmulti[[1]]
[,1] [,2] [,3]
[1,] 3 3 3
[2,] 3 3 3
[3,] 3 3 3
> lmulti[[2]]
[,1] [,2] [,3]
[1,] 2 2 2
[2,] 2 2 2
[3,] 2 2 2
#data frame: like a matrix with labeled rows/columns
> varA <- c(3,5)
> varB <- c(1,0)
> varC <- c(21,10)
> d1 <- data.frame(varA, varB, varC)
>
> d1
varA varB varC
1 3 1 21
2 5 0 10
> rownames(d1) <- c("rowA", "rowB")
> d1
varA varB varC
rowA 3 1 21
rowB 5 0 10
>
#getting a row or column by variable name
> d1
varA varB varC
rowA 3 1 21
rowB 5 0 10
> d1["rowB", "varC"]
[1] 10
> d1[,"varC"]
[1] 21 10
> d1
varA varB varC
rowA 3 1 21
rowB 5 0 10
> d1[order(d1[,"varC"]), ] #order by 3rd column smallest to biggest
varA varB varC
rowB 5 0 10
rowA 3 1 21
> order(d1[, "varC"]) #gives the row indexes in sorted (ascending) order
[1] 2 1
> d1
varA varB varC
rowA 3 1 21
rowB 5 0 10
> d1[,"varB"] #get varB column
[1] 1 0
> d1[, c("varB","varC")] #get two columns
varB varC
rowA 1 21
rowB 0 10
> d1
varA varB varC
rowA 3 1 21
rowB 5 0 10
#accessing element like matrix
> d1
varA varB varC
rowA 3 1 21
rowB 5 0 10
>
> d1["rowB", "varA"]
[1] 5
#getting elements of data frame
> id <- c(0,1)
> varA <- c(25,10)
> varB <- c(3, 2)
> varC <- c(21, 3)
> d2 <- data.frame(id, varA, varB, varC)
> d2
id varA varB varC
1 0 25 3 21
2 1 10 2 3
> d2[d2$id==1, ]
id varA varB varC
2 1 10 2 3
>
> d2[d2$id<=1, ]
id varA varB varC
1 0 25 3 21
2 1 10 2 3
#ifelse
> d2
id varA varB varC
1 0 25 3 21
2 1 10 2 3
>
> ifelse( d2 > 10, 1, 0)
id varA varB varC
[1,] 0 1 0 1
[2,] 0 0 0 0
#loop
> for(n in 0:5){print(n)}
[1] 0
[1] 1
[1] 2
[1] 3
[1] 4
[1] 5
> a <- 10
> if( a == 10 ){ print("a is 10")}else{ print("a not 10") }
[1] "a is 10"
#function
> fun <- function(arg){print(arg)}
> fun("yaaay! a function")
[1] "yaaay! a function"
#writing dataframe to a textfile
> d2
id varA varB varC
1 0 25 3 21
2 1 10 2 3
> write.table(d2, file="/Users/Nathaniel/Documents/src_r/data/temp.txt", append=FALSE, quote=FALSE, sep="\t", row.names=FALSE, col.names=TRUE)
>
##
nathanismacbook:data Nathaniel$ cat temp.txt
id varA varB varC
0 25 3 21
1 10 2 3
##
#reading into data frame
> d3 = read.table(file="/Users/Nathaniel/Documents/src_r/data/temp.txt", header=TRUE, sep="\t", skip=0)
> d3
id varA varB varC
1 0 25 3 21
2 1 10 2 3
#plotting (scatter plot and histogram)
> par(mfrow=c(2,1)) #plot grid with 2 rows and 1 column
> a <- 0:10
> b <- 5:15
> length(a)
[1] 11
> length(b)
[1] 11
> plot(a,b, type="p", col="black", main="Test Plot", xlab="x label", ylab="y label")
>
> lines(a,b)
> c <- rnorm(50, mean=10, sd=2)
> hist(c, main="Histogram", xlab="c label", ylab="counts")
>
#standard deviation (sample sd, divides by n-1; here a is c(10,3,2), which is assigned in the next section)
> sqrt( ((10-5)^2 + (3-5)^2 + (2-5)^2)/(3-1) )
[1] 4.358899
> a
[1] 10 3 2
> sd(a)
[1] 4.358899
#some summary items
> a <- c(10,3,2)
> prod(a)
[1] 60
> sum(a)
[1] 15
> mean(a)
[1] 5
> (10+3+2)/3
[1] 5
#min and max
> a
[1] 10 3 2
> range(a)
[1] 2 10
#boolean (logical) indexing
> a[c(TRUE,FALSE,TRUE)]
[1] 10 2
> a
[1] 10 3 2
#doodling
> d2
id varA varB varC
1 0 25 3 21
2 1 10 2 3
>
> summary(d2)
id varA varB varC
Min. :0.00 Min. :10.00 Min. :2.00 Min. : 3.0
1st Qu.:0.25 1st Qu.:13.75 1st Qu.:2.25 1st Qu.: 7.5
Median :0.50 Median :17.50 Median :2.50 Median :12.0
Mean :0.50 Mean :17.50 Mean :2.50 Mean :12.0
3rd Qu.:0.75 3rd Qu.:21.25 3rd Qu.:2.75 3rd Qu.:16.5
Max. :1.00 Max. :25.00 Max. :3.00 Max. :21.0
>
> summary(d2[,-1])
varA varB varC
Min. :10.00 Min. :2.00 Min. : 3.0
1st Qu.:13.75 1st Qu.:2.25 1st Qu.: 7.5
Median :17.50 Median :2.50 Median :12.0
Mean :17.50 Mean :2.50 Mean :12.0
3rd Qu.:21.25 3rd Qu.:2.75 3rd Qu.:16.5
Max. :25.00 Max. :3.00 Max. :21.0
>
#correlation
> cor( sample(0:5), sample(0:5) )
[1] 0.4857143
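#logistic regression (in this sample every y=1 row has a larger x than every y=0 row, i.e. perfect separation, which is why glm warns below)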
> y=round(runif(10, 0,1))
> x = y+5+rnorm(10, mean=0, sd=0.5)
> d = data.frame( x,y )
> d
x y
1 5.531844 0
2 6.575968 1
3 6.110443 1
4 4.038395 0
5 4.961647 0
6 6.390256 1
7 5.399595 0
8 4.613426 0
9 5.217149 0
10 5.754785 1
> logistic = glm( y ~ x, data=d, family=binomial)
Warning messages:
1: glm.fit: algorithm did not converge
2: glm.fit: fitted probabilities numerically 0 or 1 occurred
> summary(logistic)
Call:
glm(formula = y ~ x, family = binomial, data = d)
Deviance Residuals:
Min 1Q Median 3Q
-2.412e-05 -2.110e-08 -2.110e-08 2.110e-08
Max
2.224e-05
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1115.7 1326088.2 -0.001 0.999
x 197.7 235313.4 0.001 0.999
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1.3460e+01 on 9 degrees of freedom
Residual deviance: 1.0764e-09 on 8 degrees of freedom
AIC: 4
Number of Fisher Scoring iterations: 25
>
#fitted probabilities P(y = 1 | x)
> predict(logistic, type="response")
1 2 3 4
2.908054e-10 1.000000e+00 1.000000e+00 2.220446e-16
5 6 7 8
2.220446e-16 1.000000e+00 2.220446e-16 2.220446e-16
9 10
2.220446e-16 1.000000e+00
#inspired by
#Cosma
#http://www.science.smith.edu/~jcrouser/SDS293/labs/lab4-r.html
#linear regression
> x <- sample(0:5)
> y <- x+rnorm(6, mean=0, sd=1)
>
> x
[1] 2 1 0 5 4 3
> y
[1] -1.078103 1.260372 2.112437 4.518225 4.596164
[6] 3.582079
>
> d1 <- data.frame(x,y)
> d1
x y
1 2 -1.078103
2 1 1.260372
3 0 2.112437
4 5 4.518225
5 4 4.596164
6 3 3.582079
> plot(d1[,"x"],d1[,"y"], type="p")
> reg <- lm(y ~ x, data=d1)
> reg
Call:
lm(formula = y ~ x, data = d1)
Coefficients:
(Intercept) x
0.5916 0.7628
> summary(reg)
Call:
lm(formula = y ~ x, data = d1)
Residuals:
1 2 3 4 5 6
-3.19525 -0.09402 1.52080 0.11280 0.95350 0.70217
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.5916 1.3514 0.438 0.684
x 0.7628 0.4464 1.709 0.163
Residual standard error: 1.867 on 4 degrees of freedom
Multiple R-squared: 0.422, Adjusted R-squared: 0.2775
F-statistic: 2.92 on 1 and 4 DF, p-value: 0.1627
>
> attributes(reg)
$names
[1] "coefficients" "residuals" "effects"
[4] "rank" "fitted.values" "assign"
[7] "qr" "df.residual" "xlevels"
[10] "call" "terms" "model"
$class
[1] "lm"
> reg$residuals
1 2 3 4
-3.19525357 -0.09402159 1.52080078 0.11280307
5 6
0.95349945 0.70217186
> hist(reg$residuals)
> par(mfrow=c(2,1))
> plot(reg)
#plotting variables in data set
> pairs(d1[,c("x","y")])
> plot(d1[,"x"], d1[,"y"], pch=16) #closed circles
> abline(0,1, lwd=2, col=2) #adding the line y = x (intercept 0, slope 1) to the plot
#inspired by
#https://www.stat.cmu.edu/~larry/=stat401/Stat401Rlab2016.pdf
Happy Sketching!
#inspired by
#https://www.cs.cmu.edu/~02710/Lectures/R_recitation.pdf
#https://www.stat.cmu.edu/~hseltman/Rclass/R1.R
#https://www.andrew.cmu.edu/user/achoulde/94842/lectures/lecture09/lecture09-94842.html
#inspired by
#Larry Wasserman
#Brian Junker
#Cosma Shalizi
#Joel Greenhouse
#Howard Seltman