##------------------------------------------------------------------------## ## Script for Chapter 6, An R and S-PLUS Companion to Applied Regression ## ## John Fox ## ## Sage Publications, 2002 ## ##------------------------------------------------------------------------## # preliminaries options(width=65) options(digits=5) # outliers, leverage, and influence diagnostics library(car) data(Duncan) attach(Duncan) mod.duncan <- lm(prestige~income + education) summary(mod.duncan) qq.plot(mod.duncan, simulate=T, labels=row.names(Duncan)) outlier.test(mod.duncan, labels=row.names(Duncan)) row.names(Duncan)[6] plot(hatvalues(mod.duncan)) abline(h=c(2, 3)*3/45, lty=2) identify(1:45, hatvalues(mod.duncan), row.names(Duncan)) dfbs.duncan <- dfbetas(mod.duncan) dfbs.duncan[1:5,] plot(dfbs.duncan[,c(2,3)]) identify(dfbs.duncan[,2], dfbs.duncan[,3], row.names(Duncan)) plot(cookd(mod.duncan)) abline(h=4/42, lty=2) identify(1:45, cookd(mod.duncan), row.names(Duncan)) plot(hatvalues(mod.duncan), rstudent(mod.duncan), type='n') cook <- sqrt(cookd(mod.duncan)) points(hatvalues(mod.duncan), rstudent(mod.duncan), cex=10*cook/max(cook)) abline(h=c(-2, 0, 2), lty=2) abline(v=c(2, 3)*3/45, lty=2) identify(hatvalues(mod.duncan), rstudent(mod.duncan), row.names(Duncan)) ## S-PLUS alternative plot(hatvalues(mod.duncan), rstudent(mod.duncan), type='n') cook <- sqrt(as.vector(cookd(mod.duncan))) cook <- sqrt(cookd(mod.duncan)) max.cook <- max(cook) hat <- hatvalues(mod.duncan) rstud <- rstudent(mod.duncan) for (i in 1:length(cook)) points(hat[i], rstud[i], cex=10*cook[i]/max.cook) abline(h=c(-2, 0, 2), lty=2) abline(v=c(2, 3)*3/45, lty=2) identify(hatvalues(mod.duncan), rstudent(mod.duncan), row.names(Duncan)) par(mfrow=c(1,2)) av.plots(mod.duncan, labels=row.names(Duncan)) # non-normality detach(Duncan) data(Ornstein) attach(Ornstein) mod.ornstein <- lm(interlocks + 1 ~ assets + nation + sector) summary(mod.ornstein) Anova(mod.ornstein) par(mfrow=c(1,1)) qq.plot(mod.ornstein, sim=T) plot(density(rstudent(mod.ornstein)), main='rstudent') ## S-PLUS: plot(density(rstudent(mod.ornstein)), type='l') library(MASS) boxcox(mod.ornstein) boxcox(mod.ornstein, lambda=seq(.1, .5, by=.01)) # non-constant error variance par(mfrow=c(1,1)) plot(fitted.values(mod.ornstein), rstudent(mod.ornstein)) abline(h=0, lty=2) spread.level.plot(mod.ornstein) ncv.test(mod.ornstein) ncv.test(mod.ornstein, ~ assets + nation + sector) mod.ornstein.2 <- lm((interlocks + 1)^(1/3) ~ sqrt(assets) + nation + sector) mod.ornstein.cv <- update(mod.ornstein, . ~ . + box.cox.var(interlocks + 1)) summary(mod.ornstein.cv) av.plots(mod.ornstein.cv, 'box.cox.var(interlocks + 1)') # for the reader summary(mod.ornstein.2) Anova(mod.ornstein.2) qq.plot(mod.ornstein.2) spread.level.plot(mod.ornstein.2) ncv.test(mod.ornstein.2) ncv.test(mod.ornstein.2, ~ sqrt(assets) + nation + sector) Anova(mod.ornstein, white.adjust="hc3") lm(interlocks ~ assets + nation + sector, weights=1/assets) # nonlinearity detach(Ornstein) data(Prestige) attach(Prestige) mod.prestige <- lm(prestige ~ income + education + women) par(mfrow=c(2,2)) cr.plots(mod.prestige) Ask(p, function(p) cr.plots(lm(prestige ~ box.cox(income, p) + education + poly(women,2)), "box.cox(income, p)")) ## for S-PLUS: Ask(p, function(p) { assign('income.p', box.cox(income, p), frame=1) cr.plots(lm(prestige ~ income.p + education + poly(women,2)), "income.p") }) mod.prestige.2 <- lm(prestige ~ log(income, 10) + education + poly(women,2)) summary(mod.prestige.2) box.tidwell(prestige ~ income + education, other.x= ~ poly(women, 2)) mod.prestige.cv <- lm(prestige ~ income + education + poly(women, 2) + I(income*log(income)) + I(education*log(education))) summary(mod.prestige.cv) par(mfrow=c(1,2)) av.plots(mod.prestige.cv) # collinearity detach(Prestige) data(Ericksen) Ericksen mod.census <- lm(undercount ~ ., data=Ericksen) summary(mod.census) vif(mod.census) vif(mod.ornstein) census.step <- step(mod.census) summary(census.step) census.step.bic <- step(mod.census, k=log(66)) census.step.forward <- step(lm(undercount ~ 1, data=Ericksen), scope= ~ minority + crime + poverty + language + highschool + housing + city + conventional, k=log(66)) par(mfrow=c(1,2)) library(leaps) census.subsets <- regsubsets(undercount ~ ., nbest=10, data=Ericksen) plot.subsets(census.subsets) plot.subsets(census.subsets, min.size=3, max.size=5, legend=F, ylim=c(-53, -48)) # diagnostics for glms # influence diagnostics data(Womenlf) attach(Womenlf) mod.working <- glm(partic != 'not.work' ~ hincome + children, family=binomial) summary(mod.working) plot(cookd(mod.working)) identify(1:length(partic), cookd(mod.working)) dfb <- dfbeta(mod.working) dfb[1:5,] par(mfrow=c(1,2)) plot(dfb[,2], ylab='dfbeta(hincome)') identify(1:length(partic), dfb[,2]) plot(dfb[,3], ylab='dfbeta(children)') summary(update(mod.working, subset=-c(76, 77))) # nonlinearity diagnostics # Ornstein's Poisson regression par(mfrow=c(1,1)) detach(Womenlf) attach(Ornstein) mod.ornstein.pois <- glm(interlocks ~ assets + nation + sector, family=poisson) cr.plots(mod.ornstein.pois) mod.ornstein.pois.2 <- glm(interlocks ~ log(assets) + nation + sector, family=poisson) cr.plots(mod.ornstein.pois.2, 'log(assets)') mod.ornstein.pois.cv <- update(mod.ornstein.pois, . ~ . + I(assets*log(assets))) summary(mod.ornstein.pois.cv) av.plots(mod.ornstein.pois.cv, 'I(assets * log(assets))') 1 + -2.1773e-05/2.0851e-05 # Long's regression detach(Ornstein) data(Mroz) attach(Mroz) mod.mroz <- glm(lfp~k5+k618+age+wc+hc+lwg+inc, family=binomial) cr.plots(mod.mroz, 'lwg')