##--------------------------------------------##
##  Introduction to Nonparametric Regression  ##
##                John Fox                    ##
##        ESRC Oxford Spring School           ##
##                May 2005                    ##
##--------------------------------------------##

# nonparametric simple regression

    # local polynomial regression

        # UN data
            
library(car)
data(UN)
head(UN, 10)

UN <- na.omit(UN) # not strictly necessary
head(UN, 10)

attach(UN)
search()

plot(gdp, infant.mortality)
identify(gdp, infant.mortality, rownames(UN))

lines(loess.smooth(gdp, infant.mortality, span=.5, family="gaussian"), lwd=3)
abline(lm(infant.mortality ~ gdp), lty=2, lwd=3)

        # transforming to approximate linearity
            
plot(log10(gdp), log10(infant.mortality))
identify(log10(gdp), log10(infant.mortality), rownames(UN))
lines(loess.smooth(log10(gdp), log10(infant.mortality), span=.5,
    family="gaussian"), lwd=3)
abline(lm(log10(infant.mortality) ~ log10(gdp)), lty=2, lwd=3)

        # robust fit
            
plot(log10(gdp), log10(infant.mortality))
lines(loess.smooth(log10(gdp), log10(infant.mortality), span=.4,
    family="gaussian"), lwd=2, lty=2) # non-robust fit
lines(loess.smooth(log10(gdp), log10(infant.mortality), span=.4), 
    lwd=2) # robust fit
    
        # Canadian occupational prestige data
        
detach(UN)
rm(list=objects())
data(Prestige)
some(Prestige)
attach(Prestige)
search()

        # selecting the span
        
par(mfrow=c(2,3)) # 2 x 3 array of plots (or turn on plot history)
for (span in seq(.1, .9, by=.2)){
    plot(income, prestige, main=paste("s =", span))
    lines(lowess(income, prestige, f=span, iter=0), lwd=3)
    }

        # statistical inference

library(gam)

            # test of nonlinearity
            
mod.linear <- lm(prestige ~ income)
summary(mod.linear)

mod.nonpar <- loess(prestige ~ income, span=.6, degree=1)
summary(mod.nonpar)

RSS.1 <- sum(residuals(mod.linear)^2)
RSS.2 <- sum(residuals(mod.nonpar)^2)

f <- ((RSS.1 - RSS.2)/(4.3 - 2)) / (RSS.2/(102 - 4.3))
pf(f, 4.3 - 3, 102 - 4.3, lower.tail=FALSE)

            # test of income effect
            
TSS <- sum((prestige - mean(prestige))^2)
f <- ((TSS - RSS.2)/(4.3 - 1)) / (RSS.2/(102 - 4.3))
pf(f, 4.3 - 1, 102 - 4.3, lower.tail=FALSE)

            # confidence bands
            
inc <- seq(min(income), max(income), length=500)
fit <- predict(mod.nonpar, data.frame(income=inc), se=TRUE)
str(fit)

plot(income, prestige, ylim=c(15,95))
lines(inc, fit$fit, lwd=3)
lines(inc, fit$fit + 2*fit$se.fit, lty=2, lwd=2)
lines(inc, fit$fit - 2*fit$se.fit, lty=2, lwd=2)

        # regression and smoothing splines

            # regression spline
        
library(splines)
plot(income, prestige)
mod.regspline <- lm(prestige ~ ns(income, knots=10000))
fit <- predict(mod.regspline, data.frame(income=inc))
lines(inc, fit, lwd=3)
lines(loess.smooth(income, prestige, span=.6), lty=2, lwd=2) # cf.

            # smoothing spline
            
plot(income, prestige)
lines(smooth.spline(income, prestige, df=4.3), lwd=3)
lines(loess.smooth(income, prestige, span=.6), lty=2, lwd=2) # cf.

    # nonlinearity diagnostics: component+residual plots
    
mod.prestige <- lm(prestige ~ income + education, data=Prestige)
cr.plots(mod.prestige)


# nonparametric multiple regression

    # general model

mod.loess <- loess(prestige ~ income*education, span=.5, degree=1)
summary(mod.loess)

        # 3D perspective plot of fitted surface
    
new.prestige <- expand.grid(
    income = inc <- seq(0, 25000, length=20),
    education = ed <- seq(6, 16, length=20)
    )
fit <- predict(mod.loess, new.prestige)
graph <- persp(x=inc, y=ed, fit,
    xlab="Income", ylab="Education", zlab="Prestige", expand=0.5,
    col="gray70", ticktype="detailed", theta=-45)

trans3d <- function(x,y,z, pmat) {  # this function from the persp() help file
  tr <- cbind(x,y,z,1) %*% pmat
  list(x = tr[,1]/tr[,4], y= tr[,2]/tr[,4])
  }

points(trans3d(x=income, y=education, z=prestige, pmat=graph), col=2, pch=16)

        # contour plot of fitted surface
    
contour(x=inc, y=ed, fit, xlab="Income", ylab="Education",
    levels=seq(20, 80, by=10), labcex=1.2)
abline(h=seq(6, 16, by=2), v=seq(0, 25000, by=5000), col="gray80")

        # tests for terms in the model
    
mod.loess.inc <- loess(prestige ~ income, span=.7, degree=1)
summary(mod.loess.inc)
anova(mod.loess.inc, mod.loess) # test of education effect

mod.loess.ed <- loess(prestige ~ education, span=.7, degree=1)
summary(mod.loess.ed)
anova(mod.loess.ed, mod.loess) # test of income effect

    # additive regression model

library(gam)
prestige.additive <- gam(prestige ~ lo(income, span=.7) +
    lo(education, span=.7))
summary(prestige.additive)
par(mfrow=c(1,2))
plot(prestige.additive, ylab="Prestige", residuals=TRUE, se=TRUE, ask=FALSE)

        # inference (note: problem with df in gam package)
        

            # tests for terms in the model
            
prestige.inc <- gam(prestige ~ lo(income, span=.7))
prestige.ed <- gam(prestige ~ lo(education, span=.7))
anova(prestige.inc, prestige.additive, test="F") # test of education effect
anova(prestige.ed, prestige.additive, test="F") # test of income effect

            # test for interaction between income and education
            
prestige.general <- gam(prestige ~ lo(income, education, span=.5))
anova(prestige.additive, prestige.general, test="F")

            # linearity tests
            
prestige.inc.lin <- gam(prestige ~ income + lo(education, span=.7))
prestige.ed.lin <- gam(prestige ~ lo(income, span=.7) + education)
anova(prestige.inc.lin, prestige.additive)  # for income
anova(prestige.ed.lin, prestige.additive)  # for education

        # using the mgcv package
        
detach("package:gam")
library(mgcv)

prestige.additive.mgcv <- gam(prestige ~ s(income) + s(education))
summary(prestige.additive.mgcv)
par(mfrow=c(1,2))
plot(prestige.additive.mgcv, ylab="Prestige", residuals=TRUE, pch=1)

            # fixing the df for the smooth terms
            
prestige.additive.fixdf <- gam(prestige ~ s(income, fx=TRUE, k=4)
    + s(education, fx=TRUE, k=4))
summary(prestige.additive.fixdf)
par(mfrow=c(1,2))
plot(prestige.additive.fixdf, ylab="Prestige", residuals=TRUE, pch=1)

# generalized nonparametric regression

    # scatterplot smoothing

detach(Prestige)
rm(list=objects())
data(Mroz)
attach(Mroz)
some(Mroz)

par(mfrow=c(1,1))
plot(lwg, jitter(as.numeric(lfp == "yes"), factor=.125),
    xlab="Log Estimated Wages", ylab="Labour-Force Participation")
abline(h=c(0,1))
mod.lin.logit <- glm(lfp ~ lwg, family=binomial)
x <- seq(-2, 3, length=500)
lines(x, predict(mod.lin.logit, list(lwg=x), type="response"),
    lty=2, lwd=3)
mod.nonpar.logit <- gam(lfp ~ s(lwg), family=binomial)
lines(x, predict(mod.nonpar.logit, list(lwg=x), type="response"), lwd=3)

    # generalized additive model

table(k5)
table(k618)

mod.mroz.glm <- gam(lfp ~ age + inc + k5 +  # fits the glm via gam()
    + k618 + wc + hc, family=binomial)      # cf., same model fit with glm()

mod.mroz.gam <- gam(lfp ~ s(age) + s(inc) + factor(k5)
    + factor(k618) + wc + hc, family=binomial)
summary(mod.mroz.gam)
plot(mod.mroz.gam, residuals=TRUE, all.terms=TRUE, pch=1)

        # tests of nonlinearity
        
             # overall test of nonlinearity
        
anova(mod.mroz.glm, mod.mroz.gam, test="Chisq")

            # tests for nonlinearity of individual terms (e.g.)
            
mod.mroz.gam.age.lin <- gam(lfp ~ age + s(inc) + factor(k5)
    + factor(k618) + wc + hc, family=binomial)
anova(mod.mroz.gam.age.lin, mod.mroz.gam,
    test="Chisq") # test for linearity of age
    
mod.mroz.gam.k5.lin <- gam(lfp ~ s(age) + s(inc) + k5
    + factor(k618) + wc + hc, family=binomial)
anova(mod.mroz.gam.k5.lin, mod.mroz.gam,
    test="Chisq") # test for linearity of k5; note df != 2 exactly (but close)
    
        # tests for individual terms (e.g.)
        
mod.mroz.gam.minus.age <- gam(lfp ~ s(inc) + factor(k5)
    + factor(k618) + wc + hc, family=binomial)
anova(mod.mroz.gam.minus.age, mod.mroz.gam, test="Chisq") # test for age

mod.mroz.gam.minus.k5 <- gam(lfp ~ s(age) + s(inc)
    + factor(k618) + wc + hc, family=binomial)
anova(mod.mroz.gam.minus.k5, mod.mroz.gam, test="Chisq") # test for k5