Wednesday, February 14, 2024

doodles intro to R programming

#some doodles intro R programming

#assigning variable
> b <- 21
> b
[1] 21

#vector
> l <- c(21, 10,5)
> l
[1] 21 10  5

##1 based for access
> l
[1] 21 10  5
> l[0]
numeric(0)
> l[1]
[1] 21
> 
> l[2]
[1] 10

#
> l
[1] 21 10  5
> l[c(1,2)]
[1] 21 10


#sequence
> seq(from=0, to=20, by=1.0)
 [1]  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
 
#repeat
> rep(c(4,2), each=3)
[1] 4 4 4 2 2 2
> 
> rep(c(4,2), times=3)
[1] 4 2 4 2 4 2
> 

#random numbers from a list
> sample(0:5)
[1] 1 3 0 2 5 4
> sample(0:5)
[1] 2 3 4 0 1 5


#vector access
#getting values
> l
[1] 21 10  5
> l[l==5]
[1] 5
> l[l <= 10]
[1] 10  5

#getting indexes
> which(l <= 10)
[1] 2 3

> which.min(l)
[1] 3
> which.max(l)
[1] 1
> l
[1] 21 10  5

> length(l)
[1] 3


#matrix
> dat <- c(21, 10, 5, 25, 30, 45)
> dat
[1] 21 10  5 25 30 45

> m1 <- matrix(data=dat, nrow=2, ncol=3, byrow=TRUE, dimnames=list(rows=c("rowA","rowB"), cols=c("colA", "colB", "colC")) )
> 
> m1
      cols
rows   colA colB colC
  rowA   21   10    5
  rowB   25   30   45

#accessing matrix entries
> m1
      cols
rows   colA colB colC
  rowA   21   10    5
  rowB   25   30   45
> m1[1,1]
[1] 21
> m1[2,1:2]
colA colB 
  25   30 
  
> m1[,"colB"]
rowA rowB 
  10   30 
> 
> m1["rowB",]
colA colB colC 
  25   30   45 
> m1["rowB", c(1,2)]
colA colB 
  25   30 
> 
> m1
      cols
rows   colA colB colC
  rowA   21   10    5
  rowB   25   30   45


> m1
      cols
rows   colA colB colC
  rowA   21   10    5
  rowB   25   30   45
> 
> dim(m1)
[1] 2 3
> nrow(m1)
[1] 2

> ncol(m1)
[1] 3

> colnames(m1)
[1] "colA" "colB" "colC"
> rownames(m1)
[1] "rowA" "rowB"


#matrix multiplication

> m1 <- matrix( c(1,0,1,1), 2,2, byrow=TRUE)
> m1
     [,1] [,2]
[1,]    1    0
[2,]    1    1
> m2 <- matrix( c(1,1,0,1), 2,2, byrow=TRUE)
> m2
     [,1] [,2]
[1,]    1    1
[2,]    0    1
> 
> m1 %*% m2
     [,1] [,2]
[1,]    1    1
[2,]    1    2

#transpose #rows become columns
> m2
     [,1] [,2]
[1,]    1    1
[2,]    0    1
> t(m2) #transpose
     [,1] [,2]
[1,]    1    0
[2,]    1    1

#summing parts of matrix
> apply(m2, 1, sum) #sum of each row
[1] 2 1
> 
> apply(m2, 2, sum) #sum of each column
[1] 1 2
> m2
     [,1] [,2]
[1,]    1    1
[2,]    0    1

#applying operation to matrix
> m1 <- matrix(data=dat, nrow=2, ncol=3, byrow=TRUE, dimnames=list(rows=c("rowA","rowB"), cols=c("colA", "colB", "colC")) )
> 
> m1
      cols
rows   colA colB colC
  rowA   21   10    5
  rowB   25   30   45
  
> m1
      cols
rows   colA colB colC
  rowA   21   10    5
  rowB   25   30   45
> apply(m1, 1, function(x) return(sum(x)+5) )
rowA rowB 
  41  105 
  
##
> m2 <- matrix( c(2,3,1,0), 2, 2, byrow=TRUE)
> 
> m2
     [,1] [,2]
[1,]    2    3
[2,]    1    0
> apply(m2, 1, function(n) return(sum(n^2)) ) #first square element then add columns
[1] 13  1

> apply(m2, 1, function(n) return(sum(n)^2) ) #first add columns then square result
[1] 25  1


#joining matrices
> m2
     [,1] [,2]
[1,]    2    3
[2,]    1    0
> 
> m3 <- matrix(c(5, 2, 4, 10), 2, 2, byrow=TRUE)
> m3
     [,1] [,2]
[1,]    5    2
[2,]    4   10
> rbind(m2, m3)
     [,1] [,2]
[1,]    2    3
[2,]    1    0
[3,]    5    2
[4,]    4   10

#
> m2
     [,1] [,2]
[1,]    2    3
[2,]    1    0
> m3
     [,1] [,2]
[1,]    5    2
[2,]    4   10
> 
> cbind(m2, m3)
     [,1] [,2] [,3] [,4]
[1,]    2    3    5    2
[2,]    1    0    4   10

#convert matrix to vector
> m2
     [,1] [,2]
[1,]    2    3
[2,]    1    0
> as.vector(m2)
[1] 2 1 3 0


#list of matrices
> lmulti <- list( matrix(3,3,3), matrix(2,3,3))
> lmulti
[[1]]
     [,1] [,2] [,3]
[1,]    3    3    3
[2,]    3    3    3
[3,]    3    3    3

[[2]]
     [,1] [,2] [,3]
[1,]    2    2    2
[2,]    2    2    2
[3,]    2    2    2

> lmulti[[1]]
     [,1] [,2] [,3]
[1,]    3    3    3
[2,]    3    3    3
[3,]    3    3    3
> lmulti[[2]]
     [,1] [,2] [,3]
[1,]    2    2    2
[2,]    2    2    2
[3,]    2    2    2

#dataframe like a matrix with labeled rows/columns
> varA <- c(3,5)
> varB <- c(1,0)
> varC <- c(21,10)
> d1 <- data.frame(varA, varB, varC)
> 
> d1
  varA varB varC
1    3    1   21
2    5    0   10
> rownames(d1) <- c("rowA", "rowB")
> d1
     varA varB varC
rowA    3    1   21
rowB    5    0   10
> 

#getting a row or column by variable name
> d1
     varA varB varC
rowA    3    1   21
rowB    5    0   10
> d1["rowB", "varC"]
[1] 10
> d1[,"varC"]
[1] 21 10


> d1
     varA varB varC
rowA    3    1   21
rowB    5    0   10
> d1[order(d1[,"varC"]), ] #order by 3rd column smallest to biggest
     varA varB varC
rowB    5    0   10
rowA    3    1   21

> order(d1[, "varC"]) #gives indexes after did ordering
[1] 2 1
> d1
     varA varB varC
rowA    3    1   21
rowB    5    0   10

> d1[,"varB"] #get varB column
[1] 1 0

> d1[, c("varB","varC")] #get two columns
     varB varC
rowA    1   21
rowB    0   10
> d1
     varA varB varC
rowA    3    1   21
rowB    5    0   10

#accessing element like matrix
> d1
     varA varB varC
rowA    3    1   21
rowB    5    0   10
> 
> d1["rowB", "varA"]
[1] 5

#getting elements of data frame
> id <- c(0,1)
> varA <- c(25,10)
> varB <- c(3, 2)
> varC <- c(21, 3)
> d2 <- data.frame(id, varA, varB, varC)
> d2
  id varA varB varC
1  0   25    3   21
2  1   10    2    3
> d2[d2$id==1, ]
  id varA varB varC
2  1   10    2    3
> 
> d2[d2$id<=1, ]
  id varA varB varC
1  0   25    3   21
2  1   10    2    3


#ifelse
> d2
  id varA varB varC
1  0   25    3   21
2  1   10    2    3
> 
> ifelse( d2 > 10, 1, 0)
     id varA varB varC
[1,]  0    1    0    1
[2,]  0    0    0    0

#loop
> for(n in 0:5){print(n)}
[1] 0
[1] 1
[1] 2
[1] 3
[1] 4
[1] 5

> a <- 10
> if( a == 10 ){ print("a is 10")}else{ print("a not 10") }
[1] "a is 10"

#function
> fun <- function(arg){print(arg)}
> fun("yaaay! a function")
[1] "yaaay! a function"

#writing dataframe to a textfile
> d2
  id varA varB varC
1  0   25    3   21
2  1   10    2    3
> write.table(d2, file="/Users/Nathaniel/Documents/src_r/data/temp.txt", append=FALSE, quote=FALSE, sep="\t", row.names=FALSE, col.names=TRUE)
> 
##
nathanismacbook:data Nathaniel$ cat temp.txt 
id	varA	varB	varC
0	25	3	21
1	10	2	3
##

#reading into data frame
> d3 = read.table(file="/Users/Nathaniel/Documents/src_r/data/temp.txt", header=TRUE, sep="\t", skip=0)
> d3
  id varA varB varC
1  0   25    3   21
2  1   10    2    3

#plotting (scatter plot and histogram)
> par(mfrow=c(2,1)) #2 by 1 graphs
> a <- 0:10
> b <- 5:15
> length(a)
[1] 11
> length(b)
[1] 11
> plot(a,b, type="p", col="black", main="Test Plot", xlab="x label", ylab="y label")
> 
> lines(a,b)
> c <- rnorm(50, mean=10, sd=2)
> hist(c, main="Histogram", xlab="c label", ylab="counts")
> 



#standard deviation
> sqrt( ((10-5)^2 + (3-5)^2 + (2-5)^2)/(3-1) )
[1] 4.358899
> a
[1] 10  3  2
> sd(a)
[1] 4.358899

#sum summary items
> a <- c(10,3,2)
> prod(a)
[1] 60
> sum(a)
[1] 15
> mean(a)
[1] 5
> (10+3+2)/3
[1] 5


#max min
> a
[1] 10  3  2
> range(a)
[1]  2 10

#boolean
> a[c(TRUE,FALSE,TRUE)]
[1] 10  2
> a
[1] 10  3  2


#doodling
> d2
  id varA varB varC
1  0   25    3   21
2  1   10    2    3
> 
> summary(d2)
       id            varA            varB           varC     
 Min.   :0.00   Min.   :10.00   Min.   :2.00   Min.   : 3.0  
 1st Qu.:0.25   1st Qu.:13.75   1st Qu.:2.25   1st Qu.: 7.5  
 Median :0.50   Median :17.50   Median :2.50   Median :12.0  
 Mean   :0.50   Mean   :17.50   Mean   :2.50   Mean   :12.0  
 3rd Qu.:0.75   3rd Qu.:21.25   3rd Qu.:2.75   3rd Qu.:16.5  
 Max.   :1.00   Max.   :25.00   Max.   :3.00   Max.   :21.0  
> 
> summary(d2[,-1])
      varA            varB           varC     
 Min.   :10.00   Min.   :2.00   Min.   : 3.0  
 1st Qu.:13.75   1st Qu.:2.25   1st Qu.: 7.5  
 Median :17.50   Median :2.50   Median :12.0  
 Mean   :17.50   Mean   :2.50   Mean   :12.0  
 3rd Qu.:21.25   3rd Qu.:2.75   3rd Qu.:16.5  
 Max.   :25.00   Max.   :3.00   Max.   :21.0  
> 

#correlation
> cor( sample(0:5), sample(0:5) )
[1] 0.4857143


#logistic regression
> y=round(runif(10, 0,1))
> x = y+5+rnorm(10, mean=0, sd=0.5)
> d = data.frame( x,y )
> d
          x y
1  5.531844 0
2  6.575968 1
3  6.110443 1
4  4.038395 0
5  4.961647 0
6  6.390256 1
7  5.399595 0
8  4.613426 0
9  5.217149 0
10 5.754785 1
> logistic = glm( y ~ x, data=d, family=binomial)
Warning messages:
1: glm.fit: algorithm did not converge 
2: glm.fit: fitted probabilities numerically 0 or 1 occurred 
> summary(logistic)

Call:
glm(formula = y ~ x, family = binomial, data = d)

Deviance Residuals: 
       Min          1Q      Median          3Q  
-2.412e-05  -2.110e-08  -2.110e-08   2.110e-08  
       Max  
 2.224e-05  

Coefficients:
             Estimate Std. Error z value Pr(>|z|)
(Intercept)   -1115.7  1326088.2  -0.001    0.999
x               197.7   235313.4   0.001    0.999

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 1.3460e+01  on 9  degrees of freedom
Residual deviance: 1.0764e-09  on 8  degrees of freedom
AIC: 4

Number of Fisher Scoring iterations: 25

> 

(prob y=1 | x)
> predict(logistic, type="response")
           1            2            3            4 
2.908054e-10 1.000000e+00 1.000000e+00 2.220446e-16 
           5            6            7            8 
2.220446e-16 1.000000e+00 2.220446e-16 2.220446e-16 
           9           10 
2.220446e-16 1.000000e+00 

#inspired by
#Cosma
#http://www.science.smith.edu/~jcrouser/SDS293/labs/lab4-r.html



#linear regression
> x <- sample(0:5)
> y <- x+rnorm(6, mean=0, sd=1)
> 
> x
[1] 2 1 0 5 4 3
> y
[1] -1.078103  1.260372  2.112437  4.518225  4.596164
[6]  3.582079
> 
> d1 <- data.frame(x,y)
> d1
  x         y
1 2 -1.078103
2 1  1.260372
3 0  2.112437
4 5  4.518225
5 4  4.596164
6 3  3.582079

> plot(d1[,"x"],d1[,"y"], type="p")

> reg <- lm(y ~ x, data=d1)
> reg

Call:
lm(formula = y ~ x, data = d1)

Coefficients:
(Intercept)            x  
     0.5916       0.7628  

> summary(reg)

Call:
lm(formula = y ~ x, data = d1)

Residuals:
       1        2        3        4        5        6 
-3.19525 -0.09402  1.52080  0.11280  0.95350  0.70217 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)   0.5916     1.3514   0.438    0.684
x             0.7628     0.4464   1.709    0.163

Residual standard error: 1.867 on 4 degrees of freedom
Multiple R-squared:  0.422,	Adjusted R-squared:  0.2775 
F-statistic:  2.92 on 1 and 4 DF,  p-value: 0.1627

> 

> attributes(reg)
$names
 [1] "coefficients"  "residuals"     "effects"      
 [4] "rank"          "fitted.values" "assign"       
 [7] "qr"            "df.residual"   "xlevels"      
[10] "call"          "terms"         "model"        

$class
[1] "lm"

> reg$residuals
          1           2           3           4 
-3.19525357 -0.09402159  1.52080078  0.11280307 
          5           6 
 0.95349945  0.70217186 
> hist(reg$residuals)

> par(mfrow=c(2,1))
> plot(reg)

#plotting variables in data set
> pairs(d1[,c("x","y")])


> plot(d1[,"x"], d1[,"y"], pch=16) #closed circles
> abline(0,1, lwd=2, col=2) #adding a line to the plot

#inspired by
#https://www.stat.cmu.edu/~larry/=stat401/Stat401Rlab2016.pdf


Happy Sketching!

#inspired by
#https://www.cs.cmu.edu/~02710/Lectures/R_recitation.pdf
#https://www.stat.cmu.edu/~hseltman/Rclass/R1.R
#https://www.andrew.cmu.edu/user/achoulde/94842/lectures/lecture09/lecture09-94842.html

#inspired by
#Larry Wasserman
#Brian Junker
#Cosma Shalizi
#Joel Greenhouse
#Howard Seltman