DPI R Bootcamp
Jared Knowles
In this lesson we hope to learn: - Coding Style - Write a `for’ loop - Write a basic function - Optimize R with parallelization - Mixed effect models - Data mining with R - Animations - Git, GitHub, and add-on packages
-
# Loop to calculate number of students per grade
nstudents <- rep(NA, 6)
for (i in unique(df$grade)) {
nstudents[[i - 2]] <- length(df$stuid[df$grade == i])
}
nstudents
## [1] 500 400 500 400 500 400
summary(factor(df$grade))
## 3 4 5 6 7 8
## 500 400 500 400 500 400
A = matrix(as.numeric(1:1e+05))
system.time({
Sum = 0
for (i in seq_along(A)) {
Sum = Sum + A[[i]]
}
Sum
})
## user system elapsed
## 0.11 0.00 0.11
system.time({
sum(A)
})
## user system elapsed
## 0 0 0
rm(A)
print(myfunction)
print(mean) #bytecode, we can't see it
## function (x, ...)
## UseMethod("mean")
## <bytecode: 0x0ea57bd8>
## <environment: namespace:base>
print(order)
## function (..., na.last = TRUE, decreasing = FALSE)
## {
## z <- list(...)
## if (any(unlist(lapply(z, is.object)))) {
## z <- lapply(z, function(x) if (is.object(x))
## xtfrm(x)
## else x)
## if (!is.na(na.last))
## return(do.call("order", c(z, na.last = na.last, decreasing = decreasing)))
## }
## else if (!is.na(na.last))
## return(.Internal(order(na.last, decreasing, ...)))
## if (any(diff(l.z <- vapply(z, length, 1L)) != 0L))
## stop("argument lengths differ")
## ans <- vapply(z, is.na, rep.int(NA, l.z[1L]))
## ok <- if (is.matrix(ans))
## !apply(ans, 1, any)
## else !any(ans)
## if (all(!ok))
## return(integer())
## z[[1L]][!ok] <- NA
## ans <- do.call("order", c(z, decreasing = decreasing))
## keep <- seq_along(ok)[ok]
## ans[ans %in% keep]
## }
## <bytecode: 0x0deed500>
## <environment: namespace:base>
x<-as.character(x)
is really obnoxiousdefac <- function(x) {
# assign function a name, and list its arguments
x <- as.character(x) # what does function do?
x # last line is output of function
}
a <- factor(letters)
summary(a)
summary(defac(a))
summary(as.character(a))
foo
which has both a character and a missing value?extractN <- function(x) {
x <- suppressWarnings(as.numeric(x))
# ignore warnings because we don't care
x <- x[!is.na(x)]
x
}
extractN(foo)
A <- extractN(foo)
lme4
package to do this work, or in a Bayesian framework (fancy) we can use winBUGS or JAGSlme4
by Doug Bates (a retired UW-Madison Professor of Statistics)mymod_me<-lmer(readSS~factor(grade)+factor(race)+female+disab+ell+(1|dist)+(1|stuid),data=df)
(1|dist)
denote a random effect. In this case we are measuring a random effect for the variable dist
in our data.lm
formula (by designlibrary(lme4)
mymod_me <- lmer(readSS ~ factor(grade) + factor(race) + female + disab + ell +
(1 | dist) + (1 | stuid), data = df)
print(mymod_me, correlation = FALSE)
## Linear mixed model fit by REML
## Formula: readSS ~ factor(grade) + factor(race) + female + disab + ell + (1 | dist) + (1 | stuid)
## Data: df
## AIC BIC logLik deviance REMLdev
## 30731 30826 -15350 30774 30699
## Random effects:
## Groups Name Variance Std.Dev.
## stuid (Intercept) 10626 103.1
## dist (Intercept) 487 22.1
## Residual 1564 39.5
## Number of obs: 2700, groups: stuid, 1200; dist, 3
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 378.24 31.03 12.2
## factor(grade)4 62.72 3.05 20.6
## factor(grade)5 124.67 3.28 38.0
## factor(grade)6 182.67 3.99 45.8
## factor(grade)7 236.64 4.20 56.4
## factor(grade)8 301.41 4.74 63.5
## factor(race)B -61.87 28.05 -2.2
## factor(race)H -34.29 27.54 -1.2
## factor(race)I -6.09 43.93 -0.1
## factor(race)W 10.37 28.00 0.4
## female 8.76 6.21 1.4
## disab -5.26 8.50 -0.6
## ell -18.81 13.10 -1.4
mymod <- lm(readSS ~ factor(grade) + factor(race) + female + disab + ell, data = df)
qplot(readSS, predict(mymod), data = df, alpha = I(0.3), color = I("blue")) +
geom_point(aes(x = df$readSS, y = fitted(mymod_me)), alpha = I(0.4), color = "dark green") +
theme_dpi() + xlab("Observed") + ylab("Predicted") + geom_text(aes(x = 370,
y = 700), label = "Green is Results of the \n Mixed Model")
caret
provides best in class data mining tools to uscaret
package looks likelibrary(caret)
# Set aside test set
testset <- sample(unique(df$stuid), 500)
df$case <- 0
df$case[df$stuid %in% testset] <- 1
# Draw a training set of data (random subset of students)
training <- subset(df, case == 0)
testing <- subset(df, case == 1)
training <- training[, c(3, 6:16, 21, 22, 28, 29, 30)] # subset vars
trainX <- training[, 1:15]
refac <- function(x) as.factor(as.character(x))
trainX$stuid <- refac(trainX$stuid)
trainX$dist <- refac(trainX$dist)
trainX$year <- refac(trainX$year)
# Parameters
ctrl <- trainControl(method = "repeatedcv", number = 7, repeats = 3, summaryFunction = defaultSummary)
# Search grid
grid <- expand.grid(.interaction.depth = seq(2, 6, by = 1), .n.trees = seq(200,
700, by = 100), .shrinkage = c(0.05, 0.1))
# Boosted tree search
gbmTune <- train(x = trainX, y = training$mathSS, method = "gbm", metric = "RMSE",
trControl = ctrl, tuneGrid = grid, verbose = FALSE)
# gbmPred<-predict(gbmTune,testing[,names(trainX)])
plot(gbmTune)
n <- 10000
rep <- 5
# tLoop <- replicate(rep, system.time( integLoop(func, xint, yint, n) ))
# summary(tLoop[3,])
tVec <- replicate(rep, system.time(integVec(func, xint, yint, n)))
summary(tVec[3, ])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.004 0.000 0.020
tApply <- replicate(rep, system.time(integApply(func, xint, yint, n)))
summary(tApply[3, ])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.96 5.97 5.98 6.01 6.06 6.08
# 2 Core Cluster
library(snow)
c1 <- makeCluster(c("localhost", "localhost"), type = "SOCK")
tSnow1 <- replicate(rep, system.time(integSnow(c1, func, xint, yint, n)))
summary(tSnow1[3])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.75 8.75 8.75 8.75 8.75 8.75
stopCluster(c1)
git
and GitHub can helpIt is good to include the session info, e.g. this document is produced with knitr version 0.8
. Here is my session info:
print(sessionInfo(), locale = FALSE)
## R version 2.15.2 (2012-10-26)
## Platform: i386-w64-mingw32/i386 (32-bit)
##
## attached base packages:
## [1] splines grid stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] snow_0.3-10 gbm_1.6-3.2 survival_2.36-14 caret_5.15-044
## [5] foreach_1.4.0 cluster_1.14.3 reshape_0.8.4 lme4_0.999999-0
## [9] Matrix_1.0-10 lattice_0.20-10 xtable_1.7-0 gridExtra_0.9.1
## [13] sandwich_2.2-9 quantreg_4.91 SparseM_0.96 mgcv_1.7-22
## [17] eeptools_0.1 mapproj_1.1-8.3 maps_2.2-6 proto_0.3-9.2
## [21] stringr_0.6.1 plyr_1.7.1 ggplot2_0.9.2.1 lmtest_0.9-30
## [25] zoo_1.7-9 knitr_0.8
##
## loaded via a namespace (and not attached):
## [1] codetools_0.2-8 colorspace_1.2-0 compiler_2.15.2
## [4] dichromat_1.2-4 digest_0.5.2 evaluate_0.4.2
## [7] formatR_0.6 gtable_0.1.1 iterators_1.0.6
## [10] labeling_0.1 MASS_7.3-22 memoise_0.1
## [13] munsell_0.4 nlme_3.1-105 RColorBrewer_1.0-5
## [16] reshape2_1.2.1 scales_0.2.2 stats4_2.15.1
## [19] tools_2.15.1
This work (R Tutorial for Education, by Jared E. Knowles), in service of the Wisconsin Department of Public Instruction, is free of known copyright restrictions.