##--------------------------------------------------## ## Script for Improving and Debugging R Programs ## ## John Fox ## ## Programming in R ## ## ICPSR Summer Program 2009 ## ##--------------------------------------------------## # Debugging library(car) # for data Mroz$lfp <- with(Mroz, ifelse(lfp == "yes", 1, 0)) Mroz$wc <- with(Mroz, ifelse(wc == "yes", 1, 0)) Mroz$hc <- with(Mroz, ifelse(hc == "yes", 1, 0)) # bugged! lreg1b <- function(X, y, max.iter=10, tol=1E-6, verbose=FALSE) { X <- cbind(1, X) b <- b.last <- rep(0, ncol(X)) it <- 1 while (it <= max.iter){ if (verbose) cat("\niteration = ", it, ": ", b) p <- 1/(1 + exp(-X %*% b)) V <- diag(p * (1 - p)) var.b <- solve(t(X) %*% V %*% X) b <- b + var.b %*% t(X) %*% (y - p) if (max(abs(b - b.last)/(abs(b.last) + 0.01*tol)) < tol) break b.last <- b it <- it + 1 } if (it > max.iter) warning("maximum iterations exceeded") if (verbose) cat("\n") list(coefficients=as.vector(b), var=var.b, iterations=it) } mod.mroz.1b <- with(Mroz, lreg1b(cbind(k5, k618, age, wc, hc, lwg, inc), lfp)) # call stack to error: traceback traceback() # interrupt execution, examine environment of function: browser ?browser # bugged! lreg1b <- function(X, y, max.iter=10, tol=1E-6, verbose=FALSE) { X <- cbind(1, X) b <- b.last <- rep(0, ncol(X)) it <- 1 while (it <= max.iter){ if (verbose) cat("\niteration = ", it, ": ", b) p <- 1/(1 + exp(-X %*% b)) V <- diag(p * (1 - p)) browser() var.b <- solve(t(X) %*% V %*% X) b <- b + var.b %*% t(X) %*% (y - p) if (max(abs(b - b.last)/(abs(b.last) + 0.01*tol)) < tol) break b.last <- b it <- it + 1 } if (it > max.iter) warning("maximum iterations exceeded") if (verbose) cat("\n") list(coefficients=as.vector(b), var=var.b, iterations=it) } mod.mroz.1b <- with(Mroz, lreg1b(cbind(k5, k618, age, wc, hc, lwg, inc), lfp)) # t(X) %*% V %*% X # str(X) # head(X) # str(V) # V # str(p) # head(p) # p <- as.vector(p) # str(diag(p * (1 - p))) # V <- diag(p * (1 - p)) # V[1:10, 1:10] # # Q # step-through debugger: debug ?debug # bugged! (original bugged function) lreg1b <- function(X, y, max.iter=10, tol=1E-6, verbose=FALSE) { X <- cbind(1, X) b <- b.last <- rep(0, ncol(X)) it <- 1 while (it <= max.iter){ if (verbose) cat("\niteration = ", it, ": ", b) p <- 1/(1 + exp(-X %*% b)) V <- diag(p * (1 - p)) var.b <- solve(t(X) %*% V %*% X) b <- b + var.b %*% t(X) %*% (y - p) if (max(abs(b - b.last)/(abs(b.last) + 0.01*tol)) < tol) break b.last <- b it <- it + 1 } if (it > max.iter) warning("maximum iterations exceeded") if (verbose) cat("\n") list(coefficients=as.vector(b), var=var.b, iterations=it) } debug(lreg1b) # set debug flag mod.mroz.1b <- with(Mroz, lreg1b(cbind(k5, k618, age, wc, hc, lwg, inc), lfp)) # postmortem debugger: debugger undebug(lreg1b) ?debugger options(error=dump.frames) mod.mroz.1b <- with(Mroz, lreg1b(cbind(k5, k618, age, wc, hc, lwg, inc), lfp)) debugger() # Measuring time and memory allocation options(error=NULL) # restore default # original debugged function lreg1 <- function(X, y, max.iter=10, tol=1E-6, verbose=FALSE) { X <- cbind(1, X) b <- b.last <- rep(0, ncol(X)) it <- 1 while (it <= max.iter){ if (verbose) cat("\niteration = ", it, ": ", b) p <- as.vector(1/(1 + exp(-X %*% b))) V <- diag(p * (1 - p)) var.b <- solve(t(X) %*% V %*% X) b <- b + var.b %*% t(X) %*% (y - p) if (max(abs(b - b.last)/(abs(b.last) + 0.01*tol)) < tol) break b.last <- b it <- it + 1 } if (verbose) cat("\n") if (it > max.iter) warning("maximum iterations exceeded") list(coefficients=as.vector(b), var=var.b, iterations=it) } # alternative using optim() lreg2 <- function(X, y, method="BFGS") { X <- cbind(1, X) negLogL <- function(b, X, y) { p <- as.vector(1/(1 + exp(-X %*% b))) - sum(y*log(p) + (1 - y)*log(1 - p)) } grad <- function(b, X, y) { p <- as.vector(1/(1 + exp(-X %*% b))) - apply(((y - p)*X), 2, sum) } result <- optim(rep(0, ncol(X)), negLogL, gr=grad, hessian=TRUE, method=method, X=X, y=y) list(coefficients=result$par, var=solve(result$hessian), deviance=2*result$value, converged=result$convergence == 0) } # a larger (but not large!) problem set.seed(12345) # for reproducibility X <- matrix(rnorm(5000*10), 5000, 10) y <- rbinom(5000, 1, prob=1/(1 + exp(-cbind(1, X) %*% rep(1, 11)))) # measuring time: system.time system.time(mod.1 <- lreg1(X, y)) mod.1$coef system.time(mod.2 <- lreg2(X, y)) system.time(mod.glm <- glm(y ~ X, family=binomial)) # profiling time and memory use: Rprof tmp <- tempfile() # create temporary file Rprof(tmp, memory.profiling=TRUE) # turn on profiling mod.1 <- lreg1(X, y) Rprof() # turn off profiling summaryRprof(tmp, memory="both") # summarize results unlink(tmp) # delete temporary file # new version of function lreg1c <- function(X, y, max.iter=10, tol=1E-6) { X <- cbind(1, X) b <- b.last <- rep(0, ncol(X)) it <- 1 while (it <= max.iter){ p <- as.vector(1/(1 + exp(-X %*% b))) var.b <- solve(crossprod(X, p * (1 - p) * X)) # never form V! b <- b + var.b %*% crossprod(X, y - p) if (max(abs(b - b.last)/(abs(b.last) + 0.01*tol)) < tol) break b.last <- b it <- it + 1 } if (it > max.iter) warning("maximum iterations exceeded") list(coefficients=as.vector(b), var=var.b, iterations=it) } # explanation: XX <- matrix(1, 5, 3) # don't want to overwrite X XX V <- diag(1:5) V V %*% XX 1:5 * XX # same thing! system.time(mod.1c <- lreg1c(X, y)) mod.1c$coef # a much larger problem set.seed(12345) # for reproducibility X <- matrix(rnorm(100000*10), 100000, 10) y <- rbinom(100000, 1, prob=1/(1 + exp(-cbind(1, X) %*% rep(1, 11)))) system.time(mod.1 <- lreg1(X, y)) system.time(mod.1c <- lreg1c(X, y)) system.time(mod.2 <- lreg2(X, y)) system.time(mod.glm <- glm(y ~ X, family=binomial)) max(abs(mod.1c$coef - coef(mod.glm)))