##-------------------------------------## ## Script for Basic R Programming ## ## John Fox ## ## Programming in R ## ## ICPSR Summer Program 2009 ## ##-------------------------------------## # Preliminary example library(car) # for data mod.duncan <- lm(prestige ~ income + education, data=Duncan) cook <- sqrt(cookd(mod.duncan)) plot(hatvalues(mod.duncan), rstudent(mod.duncan), cex=(10/max(cook))*cook) abline(h=c(-2, 0, 2), lty=2) abline(v=c(2, 3)*length(coef(mod.duncan))/length(rstudent(mod.duncan)), lty=1) identify(hatvalues(mod.duncan), rstudent(mod.duncan), rownames(Duncan)) inflPlot <- function (model, scale=10, col=c(1, 2), identify=TRUE, labels=names(rstud), ... ) { # Plot hatvalues, studentized residuals, and Cook's distances # for a linear or generalized linear model # Arguments: # model: an lm or glm model object # scale: a scaling factor for the circles representing Cook's D # col: colors for non-noteworthy and noteworthy points # identify points: label noteworthy points (TRUE or FALSE) # labels: for identified points hatval <- hatvalues(model) rstud <- rstudent(model) cook <- sqrt(cookd(model)) scale <- scale/max(cook, na.rm = TRUE) p <- length(coef(model)) n <- length(rstud) cutoff <- sqrt(4/(n - p)) plot(hatval, rstud, xlab = "Hat-Values", ylab = "Studentized Residuals", cex=scale*cook, col=ifelse(cook > cutoff, col[2], col[1]), ...) abline(v = c(2, 3)*p/n, lty = 2) bonf <- qt(.025/n, df = n - p - 1, lower.tail=FALSE) abline(h=c(-bonf, -2, 0, 2, bonf), lty=2) if (identify) { noteworthy <- cook > cutoff | abs(rstud) > bonf | hatval > 2*p/n pos <- ifelse(hatval - mean(range(hatval, na.rm=TRUE)) <= 0, 4, 2) text(hatval[noteworthy], rstud[noteworthy], labels[noteworthy], pos = pos[noteworthy]) return(which(noteworthy)) } else return(invisible(NULL)) } inflPlot(mod.duncan) inflPlot(mod.duncan, ylim=c(-3, 4), col=gray(c(0.5, 0))) # Basic matrix operations A <- matrix(c(1, 2, -4, 3, 5, 0), 2, 3) B <- matrix(1:6, 2, 3) C <- matrix(c(2, -2, 0, 1, -1, 1, 4 ,4, -4), 3, 3, byrow=TRUE) A B C A + B # addition A - B # subtraction 2*A # product of a scalar and a matrix -A # negation A %*% C # matrix product A %*% B # not conformable for matrix multiplication # vectors are treated as row or column vectors as needed a <- rep(1, 3) b <- c(1, 5, 3) C %*% a a %*% C a %*% b # inner product outer(a, b) # outer product t(B) # matrix transpose a %*% t(b) # also outer product solve(C) # matrix inverse library(MASS) # for fractions() fractions(solve(C)) solve(C, b) # solve matrix equation Cx = b solve(C) %*% b # equivalent # illustration: naive computation of LS regression X <- cbind(1, as.matrix(Prestige[,1:3])) # attach the constant y <- Prestige[,4] head(X) # first 6 rows head(y) head(Prestige[,4, drop=FALSE]) # as a one-column matrix solve(t(X) %*% X) %*% t(X) %*% y # LS coefficients lm(prestige ~ education + income + women, data=Prestige) # check? # eigenvalues and eigenvectors R <- with(Prestige, cor(cbind(education, income, women))) R # correlation matrix eigen(R) # principal-components analysis det(R) # determinant diag(R) # extract diagonal diag(R) <- NA # set diagonal R diag(1:3) # make diagonal matrix diag(3) # order-3 identity matrix # Control structures # conditionals # if ... else abs1 <- function(x) if (x < 0) -x else x abs1(-5) abs1(5) abs1(-3:3) # the first element, -3, controls the result # ifelse (vectorized) abs2 <- function(x) ifelse(x < 0, -x, x) abs2(-3:3) # cascading conditionals sign1 <- function(x) { if (x < 0) -1 else if (x > 0) 1 else 0 } sign1(-5) sign2 <- function(x) { ifelse (x < 0, -1, ifelse(x > 0, 1, 0)) } sign2(c(-5, 0, 10)) # switch convert2meters <- function(x, units=c("inches", "feet", "yards", "miles")) { units <- match.arg(units) switch(units, inches = x * 0.0254, feet = x * 0.3048, yards = x * 0.9144, miles = x * 1609.344) } convert2meters(10, "inches") convert2meters(10, "in") convert2meters(3, "feet") convert2meters(100, "yards") convert2meters(5, "miles") convert2meters(7, "fathoms") # iteration # for loops prod(1:5) # ok for small numbers gamma(5 + 1) # better factorial(5) # best factorial fact1 <- function (x) { if (x <= 1) return(1) f <- 1 # initialize for (i in 1:x) f <- f * i # accumulate product f # return result } fact1(5) # while loops fact2 <- function (x) { i <- f <- 1 # initialize while (i <= x) { f <- f * i # accumulate product i <- i + 1 # increment counter } f # return result } fact2(5) # repeat loops fact3 <- function(x) { if ((!is.numeric(x)) || (x != floor(x)) || (x < 0) || (length(x) > 1)) stop("argument must be a non-negative integer") i <- f <- 1 # initialize repeat { f <- f * i # accumulate product i <- i + 1 # increment counter if (i > x) break # termination test } f # return result } fact3(5) # recursion fact4 <- function(x) { if (x <= 1) 1 # termination condition else x * fact4(x - 1) # recursive call } fact4(5) fact5 <- fact4 remove(fact4) fact5(5) fact6 <- function(x) { if (x <= 1) 1 else x * Recall(x - 1) # recursive call } fact6(5) fact7 <- fact6 remove(fact6) fact7(5) # still works with fact6 removed # Avoiding loops # the "apply" family # apply (arrays) head(DavisThin, 10) # first 10 rows dim(DavisThin) DavisThin$thin.drive <- apply(DavisThin, 1, sum) # row sums head(DavisThin$thin.drive, 10) apply(DavisThin, 2, mean) # column means colMeans(DavisThin) # more efficient (rowMeans, colSums, rowSums too) DavisThin$thin.drive <- NULL # remove thin.drive DavisThin[1, 2] <- DavisThin[2, 4] <- DavisThin[10, 3] <- NA # some missing data head(DavisThin, 10) head(apply(DavisThin, 1, sum), 10) head(apply(DavisThin, 1, function(x) 7*mean(x, na.rm=TRUE)), 10) DavisThin[1, 2:5] <- NA # create some more missing data head(DavisThin, 10) make.scale <- function(items) { if(sum(is.na(items)) >= 4) NA else 7*mean(items, na.rm=TRUE) } head(apply(DavisThin, 1, make.scale), 10) # lapply, sapply (lists) thin.list <- as.list(DavisThin) str(thin.list) # structure of the result lapply(thin.list, mean, na.rm=TRUE) # returns list sapply(thin.list, mean, na.rm=TRUE) # simplifies result # mapply (multiple arguments integrate(dnorm, lower=-1.96, upper=1.96) # integrate normal density pnorm(1.96) - pnorm(-1.96) # check low <- c(-Inf, -3:3) low high <- c(-3:3, Inf) high P <- mapply(function(lo, hi) integrate(dnorm, lo, hi)$value, low, high) P sum(P) pnorm(high) - pnorm(low) # check # Vectorize Integrate <- Vectorize( function(fn, lower, upper) integrate(fn, lower, upper)$value, vectorize.args=c("lower", "upper") ) Integrate(dnorm, lower=low, upper=high) # tapply (tables of statistics) some(Moore) # randomly sample 10 observations with(Moore, tapply(conformity, list(Status=partner.status, Authoritarianism=fcategory), mean)) Moore$fcategory <- factor(Moore$fcategory, levels=c("low", "medium", "high")) Moore$partner.status <- factor(Moore$partner.status, levels=c("low", "high")) with(Moore, tapply(conformity, list(Status=partner.status, Authoritarianism=fcategory), mean)) # to loop or not to loop? time1 <- function(n){ # inefficient! a <- NULL for(i in 1:n) a <- c(a, i^2) a } system.time(time1(30000)) time2 <- function(n){ # better a <- numeric(n) # initialize to final length for(i in 1:n) a[i] <- i^2 a } system.time(time2(30000)) time3 <- function(n){ # best a <- (1:n)^2 # vectorize a } system.time(time3(30000)) time4 <- function(n){ # (slightly) inefficient! a <- numeric(n) for(i in 1:n) a[i] <- 2 * pi * sin(i) a } system.time(time4(100000)) time5 <- function(n){ # better a <- numeric(n) for(i in 1:n) a[i] <- sin(i) 2 * pi * a # move outside loop } system.time(time5(100000)) time6 <- function(n){ # best 2 * pi * sin(1:n) # fully vectorized } system.time(time6(100000)) # don't vectorize for its own sake matrices <- vector(mode="list", length=10000) # allocate for (i in seq_along(matrices)) matrices[[i]] <- matrix(rnorm(10000), 100, 100) # simple system.time({ S1 <- matrix(0, 100, 100) # initialize for (M in matrices) S1 <- S1 + M # accumulate }) # clever system.time(S2 <- apply(array(unlist(matrices), dim = c(100, 100, 10000)), 1:2, sum)) # a smaller problem system.time({ S1 <- matrix(0, 100, 100) # initialize for (M in matrices[1:1000]) S1 <- S1 + M # accumulate }) # clever system.time(S2 <- apply(array(unlist(matrices[1:1000]), dim = c(100, 100, 1000)), 1:2, sum)) all(S1 == S2) # are answers EXACTLY equal (bad idea!)? max(abs(S1 - S2)) # how different? all.equal(S1, S2) # tests near-equality # Non-trivial programming examples # estimation of logistic regression by Newton-Raphson lreg1 <- function(X, y, max.iter=10, tol=1E-6, verbose=FALSE) { # X is the model matrix # y is the response vector of 0s and 1s # max.iter is the maximum number of iterations # tol is a convergence criterion # verbose: show iteration history? X <- cbind(1, X) # add constant b <- b.last <- rep(0, ncol(X)) # initialize coefficients it <- 1 # initialize iteration counter while (it <= max.iter){ if (verbose) cat("\niteration = ", it, ": ", b) p <- as.vector(1/(1 + exp(-X %*% b))) V <- diag(p * (1 - p)) var.b <- solve(t(X) %*% V %*% X) b <- b + var.b %*% t(X) %*% (y - p) # update coefficients if (max(abs(b - b.last)/(abs(b.last) + 0.01*tol)) < tol) break b.last <- b # update previous coefficients it <- it + 1 # increment counter } if (verbose) cat("\n") # newline if (it > max.iter) warning("maximum iterations exceeded") list(coefficients=as.vector(b), var=var.b, iterations=it) } head(Mroz) # first 6 observations Mroz$lfp <- with(Mroz, ifelse(lfp == "yes", 1, 0)) Mroz$wc <- with(Mroz, ifelse(wc == "yes", 1, 0)) Mroz$hc <- with(Mroz, ifelse(hc == "yes", 1, 0)) mod.mroz.1 <- with(Mroz, lreg1(cbind(k5, k618, age, wc, hc, lwg, inc), lfp)) mod.mroz.1$coefficients sqrt(diag(mod.mroz.1$var)) # std. errors mod.mroz.glm <- glm(lfp ~ ., family=binomial, data=Mroz) # check coef(mod.mroz.glm) - mod.mroz.1$coefficients sqrt(diag(vcov(mod.mroz.glm))) - sqrt(diag(mod.mroz.1$var)) # estimation of logistic regression by optimization lreg2 <- function(X, y, method="BFGS") { X <- cbind(1, X) negLogL <- function(b, X, y) { p <- as.vector(1/(1 + exp(-X %*% b))) - sum(y*log(p) + (1 - y)*log(1 - p)) } grad <- function(b, X, y) { p <- as.vector(1/(1 + exp(-X %*% b))) - colSums((y - p)*X) } result <- optim(rep(0, ncol(X)), negLogL, gr=grad, hessian=TRUE, method=method, X=X, y=y) list(coefficients=result$par, var=solve(result$hessian), deviance=2*result$value, converged=result$convergence == 0) } mod.mroz.2 <- with(Mroz, lreg2(cbind(k5, k618, age, wc, hc, lwg, inc), lfp)) mod.mroz.2$coefficients sqrt(diag(mod.mroz.2$var)) mod.mroz.2$converged coef(mod.mroz.glm) - mod.mroz.2$coefficients # check sqrt(diag(vcov(mod.mroz.glm))) - sqrt(diag(mod.mroz.2$var)) # translating numbers into words makeDigits <- function(x) strsplit(as.character(x), "")[[1]] makeDigits(123456) makeDigits(-123456) # whoops! makeDigits(1000000000) # whoops! options("scipen") ?options options(scipen=100) makeDigits(1000000000) makeNumber <- function(x) as.numeric(paste(x, collapse="")) makeNumber(c("1", "2", "3", "4", "5")) ones <- c("zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine") teens <- c("ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", " seventeen", "eighteen", "nineteen") names(ones) <- names(teens) <- 0:9 tens <- c("twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety") names(tens) <- 2:9 suffixes <- c("thousand,", "million,", "billion,", "trillion,") ones["5"] teens["3"] tens["7"] trim <- function(text) { gsub("(^\ *)|((\ *|-|,\ zero|-zero)$)", "", text) } trim("twenty-one") trim("twenty-zero") number2words <- function(x) { negative <- x < 0 x <- abs(x) digits <- makeDigits(x) nDigits <- length(digits) result <- if (nDigits == 1) as.vector(ones[digits]) # 1-digit number else if (nDigits == 2) # 2-digit number if (x <= 19) as.vector(teens[digits[2]]) # <= 19 else trim(paste(tens[digits[1]], "-", ones[digits[2]], sep="")) # > 19 else if (nDigits == 3) { # 3-digit number tail <- makeNumber(digits[2:3]) # tens and units digits if (tail == 0) paste(ones[digits[1]], "hundred") else trim(paste(ones[digits[1]], "hundred", number2words(tail))) # recursion } else { # 3 or more digits nSuffix <- ((nDigits + 2) %/% 3) - 1 # power of 1000 if (nSuffix > length(suffixes) || nDigits > 15) stop(paste(x, "is too large!")) pick <- 1:(nDigits - 3*nSuffix) trim(paste(number2words(makeNumber(digits[pick])), suffixes[nSuffix], number2words(makeNumber(digits[-pick])))) # recursion } if (negative) paste("minus", result) else result # negative? } number2words(123456789) number2words(-123456789) number2words(-123456000) debug(number2words) number2words(123456789) remove(makeDigits, makeNumber, number2words, trim, ones, teens, tens, suffixes) options(scipen=0) # back to default numbers2words <- function(x, billion=c("US", "UK"), and=if (billion == "US") "" else "and") { billion <- match.arg(billion) # check argument, allows abbreviation trim <- function(text) # local function { gsub("(^\ *)|((\ *|-|,\ zero|-zero)$)", "", text) } makeNumber <- function(x) as.numeric(paste(x, collapse="")) makeDigits <- function(x) strsplit(as.character(x), "")[[1]] helper <- function(x) # for recursion { negative <- x < 0 x <- abs(x) digits <- makeDigits(x) nDigits <- length(digits) result <- if (nDigits == 1) as.vector(ones[digits]) else if (nDigits == 2) if (x <= 19) as.vector(teens[digits[2]]) else trim(paste(tens[digits[1]], "-", ones[digits[2]], sep="")) else if (nDigits == 3) { tail <- makeNumber(digits[2:3]) if (tail == 0) paste(ones[digits[1]], "hundred") else trim(paste(ones[digits[1]], trim(paste("hundred", and)), helper(tail))) } else { nSuffix <- ((nDigits + 2) %/% 3) - 1 if (nSuffix > length(suffixes) || nDigits > 15) stop(paste(x, "is too large!")) pick <- 1:(nDigits - 3*nSuffix) trim(paste(helper(makeNumber(digits[pick])), suffixes[nSuffix], helper(makeNumber(digits[-pick])))) } if (billion == "UK"){ words <- strsplit(result, " ")[[1]] if (length(grep("million,", words)) > 1) result <- sub(" million, ", ", ", result) } if (negative) paste("minus", result) else result } opts <- options(scipen=100) on.exit(options(opts)) # return scipen to previous value on exit ones <- c("zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine") # local variable teens <- c("ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", " seventeen", "eighteen", "nineteen") names(ones) <- names(teens) <- 0:9 tens <- c("twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety") names(tens) <- 2:9 suffixes <- if (billion == "US") c("thousand,", "million,", "billion,", "trillion,") else c("thousand,", "million,", "thousand million,", "billion,") x <- round(x) # round x to an integer if (length(x) > 1) sapply(x, helper) else helper(x) # vectorize } numbers2words(c(1234567890123, -0123, 1000)) numbers2words(c(1234567890123, -0123, 1000), billion="UK") # Simulation example: # efficiency of LS and robust regression when errors are normal and non-normal X <- as.matrix(Prestige[,c("education", "income", "women")]) mod <- lm(prestige ~ education + income + women, data=Prestige) beta <- coef(mod) # taken as "true" parameters sigma <- summary(mod)$sigma # taken as true scale of errors Ey <- fitted(mod) # taken as expectation of response # initialize: robust.t <- robust.normal <- LS.t <- LS.normal <- matrix(0, 1000, 4) (seed <- sample(1e6, 1)) set.seed(seed) # for reproducibility system.time( for (i in 1:1000){ normal.errors <- sigma*rnorm(102) # sample errors t.errors <- sigma*rt(102, 2) y.normal <- Ey + normal.errors # construct observed response y.t <- Ey + t.errors LS.normal[i,] <- lsfit(X, y.normal)$coef - beta # LS sampling error LS.t[i,] <- lsfit(X, y.t)$coef - beta robust.normal[i,] <- coef(rlm(y.normal ~ X, method="MM", maxit=100)) - beta robust.t[i,] <- coef(rlm(y.t ~ X, method="MM", maxit=100)) - beta } ) # estimated relative bias (should be close to 0): colMeans(LS.normal)/beta colMeans(LS.t)/beta colMeans(robust.normal)/beta colMeans(robust.t)/beta # estimated relative RMSE: sqrt(colMeans(LS.normal^2))/abs(beta) sqrt(colMeans(robust.normal^2))/abs(beta) sqrt(colMeans(LS.t^2))/abs(beta) sqrt(colMeans(robust.t^2))/abs(beta)