##------------------------------------------------------------##
##              Script for Session 3: Data in R               ##
##                       John Fox                             ##
##  Introduction to the R Statistical Computing Environment   ##
##                ICPSR Summer Program                        ##
##                         2010                               ##
##------------------------------------------------------------##


# Data in R

    # reading data

        # entering data at the keyboard

# Wrapping an assignment in parentheses makes R print the assigned value.
(x <- c(1, 2, 3, 4)) # numeric data
# Single and double quotes both delimit character strings.
# NOTE: this creates a workspace variable called "names"; it is removed
# again further down in the script.
(names <- c("John", "Sandy", 'Mary')) # character data
(v <- c(TRUE, FALSE)) # logical data


# scan() with no arguments reads whitespace-separated numbers from the
# console until it meets an empty line, so the 20 values below must follow
# the call directly (no comment lines in between) and end with a blank
# line. This is meant to be pasted into an interactive session.
cooperation <- scan()
    49 64 37 52 68 54
    61 79 64 29
    27 58 52 41 30 40 39
    44 34 44

cooperation
    
        # patterned data
    
# a single count repeats the whole first argument that many times
rep(5, times=3)
rep(c(1, 2, 3), times=2)
# a vector of counts repeats each element by its own count
rep(1:3, times=3:1)

# 10 "public" followed by 10 "anonymous"
(condition <- rep(c("public", "anonymous"), each=10))
# 5 "male" then 5 "female", and that pattern of 10 twice over
(sex <- rep(c("male", "female"), each=5, times=2))

# Combine the three vectors into a data frame, one column per variable.
# stringsAsFactors=TRUE is given explicitly because data.frame() stopped
# converting character vectors to factors by default in R 4.0.0; later
# parts of this script (is.factor(Guyer$condition), summary(Guyer)) rely
# on condition and sex being factors. In older versions of R the argument
# simply restates the default.
(Guyer <- data.frame(cooperation, condition, sex, stringsAsFactors=TRUE))


        # reading data from a file into a data frame
        
# header=TRUE: the first line of the file supplies the variable names
Prestige <- read.table(file="d:/data/Prestige.txt", header=TRUE)
Prestige

# alternative: choose the file interactively instead of typing its path
Prestige <- read.table(file=file.choose(), header=TRUE)

        # reading data via the clipboard (e.g., from Excel)

# the special file name "clipboard" reads what was last copied
# (NOTE(review): this file name is presumably Windows-specific -- confirm)
Duncan <- read.table(file="clipboard", header=TRUE)
Duncan

rm(Duncan)

    
    # importing data
    
        # from a spreadsheet
    
library(RODBC)
# open the workbook, fetch the whole "Prestige" sheet, release the handle
channel <- odbcConnectExcel("D:/data/Datasets.xls")
Prestige <- sqlQuery(channel, "select * from [Prestige$]")
odbcClose(channel)
head(Prestige, n=6) # first 6 rows
# column F1 supplies the row names and is then dropped from the data
row.names(Prestige) <- Prestige[["F1"]]
Prestige[["F1"]] <- NULL
head(Prestige, n=6)

rm(Prestige)
        
        # from another package.
        
library(foreign)
# read an SPSS portable file and return it as a data frame
Nations <- read.spss(file="d:/data/nations.por", to.data.frame=TRUE)
View(Nations)

# convert the variable names to lower case
colnames(Nations) <- tolower(colnames(Nations))
head(Nations, n=6)

rm(Nations)

    # accessing data in a package

library(car)
# data() places a copy of the data set in the workspace
data(Duncan) # not necessary (if package supports "lazy loading" of data)
Duncan
# this deletes only the workspace copy created by data() above
remove(Duncan)

Prestige # from car via the lazy-loading mechanism

        # from a package that's not loaded:
        
# the package argument names the package to take the data set from;
# no library(MASS) call is needed
data(Animals, package="MASS")
Animals

# list what is in the workspace, then delete objects created so far that
# are no longer needed
objects()
remove(Animals, channel, names, v, x)

    # writing data to a file
    
# write the data frame to a plain-text file using write.table()'s defaults
write.table(Duncan, file="c:/temp/Duncan.txt")

    # working with data frames

        # The search path

# packages and data frames currently on the search path
search()
# Duncan has not been attached yet, so this is expected to fail with an
# "object not found" error (unless a variable called prestige already
# exists in the workspace)
prestige

# the column can always be reached through the data frame itself
Duncan[, "prestige"]

# attach() puts the data frame on the search path, which makes its
# columns visible by name
attach(Duncan)
prestige
search()

# a data frame attached later sits ahead of those attached earlier
attach(Prestige)
search()
prestige    # prestige in Prestige shadows prestige in Duncan


Duncan[, "prestige"]  # still there!

detach(Prestige)
search()
prestige  # now from Duncan


mean(prestige)
mean(prestige, trim=0.1) # trim=0.1: drop 10% of the values at each end first

# Deliberately naive stand-in for mean(): it accepts only x (no trim or
# na.rm arguments), issues a warning every time it is called, and returns
# the sum of x divided by the number of elements in x.
mean <- function(x){
    warning("the mean function in the base package is shadowed")
    total <- sum(x)
    count <- length(x)
    total/count
}

# the version of mean() defined in the workspace is found before the one
# in the base package, so this call triggers its warning
mean(prestige)
# expected to fail: the workspace version of mean() has no trim argument
mean(prestige, trim=0.1)

# deleting the workspace version uncovers the base-package mean() again
remove(mean)
mean(prestige, trim=.1)

mean <- mean(prestige)  # variable named "mean" --  no problem!
mean
# in a function call R looks for a function of that name, so the numeric
# variable called mean does not get in the way
mean(prestige)

# tidy up: delete the variable and take Duncan off the search path
remove(mean)
detach(Duncan)

        # avoiding attach()
        
# Three ways of reaching the columns of Duncan without attaching it.
mean(Duncan$prestige) # explicit indexing

# variables named in the model formula are looked up in the data frame
(lm(prestige ~ income + education, data=Duncan)) # using the data argument

# with() evaluates its second argument with the columns of Duncan visible
with(Duncan, mean(prestige))  # using with()
with(Duncan, lm(prestige ~ income + education))

    # missing data

# NOTE(review): Freedman and some() are presumably provided by the car
# package loaded earlier in the script -- confirm.
head(Freedman, 10)  # first 10 rows
tail(Freedman)  # last 6 rows
some(Freedman)  # 10 randomly sampled rows

Freedman$density
median(Freedman$density) # result is NA if any value is missing
median(Freedman$density, na.rm=TRUE) # drop the NAs before computing

with(Freedman, {
    plot(density, crime) # NAs removed
    identify(density, crime, row.names(Freedman)) # interactive: click points to label them
})

log(c(1, 10, NA, 100), base=10) # NAs propagated

# start a new plot with density on the log-10 scale; the abline() and
# lines() calls below add to this plot
with(Freedman, plot(log(density, base=10), crime))

lm(crime ~ log(density, base=10), data=Freedman) # NAs handled by na.action
getOption("na.action")

# add the least-squares line to the current plot
abline(lm(crime ~ log(density, base=10), data=Freedman), lty="dashed")

# logical vector: TRUE for rows where both crime and density are observed
good <- with(Freedman, complete.cases(crime, density))
head(good, 20)  # first 20 values
with(Freedman,  # NAs handled by indexing:
    lines(lowess(log(density[good], base=10), crime[good], f=1.0)))

# na.omit() drops every row that has an NA in any column
Freedman.good <- na.omit(Freedman) # filtering NAs
head(Freedman.good)  # first 6 rows
dim(Freedman.good)   # number of rows and columns
        
        # testing for NAs
        
# comparing with NA using == gives NA for every element, never TRUE
NA == c(1, 2, NA, 4) # wrong!
is.na(c(1, 2, NA, 4))
sum(is.na(Freedman)) # total number of NAs in the whole data frame

objects()
remove(good, Freedman.good)

    # numeric variables, character variables, and factors

# the vector built earlier with rep() holds character strings
condition
is.character(condition)

# as.factor() turns the distinct strings into the levels of a factor
condition <- as.factor(condition)
condition

# delete the free-standing vectors; the columns stored in Guyer remain
remove(cooperation, condition, sex)

Guyer$condition
# Which of the next two calls returns TRUE depends on whether data.frame()
# converted the character vector to a factor when Guyer was built: that
# conversion was the default before R 4.0.0 and has to be requested with
# stringsAsFactors=TRUE from R 4.0.0 on.
is.character(Guyer$condition)
is.factor(Guyer$condition)

summary(Guyer)

    # modifying data

# new column: 100 times cooperation divided by 120, stored in the data frame
Guyer[["perc.coop"]] <- with(Guyer, 100*cooperation/120)
head(Guyer, n=6)  # first 6 rows

# overwrite cooperation with the log-odds computed from perc.coop
Guyer[["cooperation"]] <- log(Guyer$perc.coop/(100 - Guyer$perc.coop))
head(Guyer, n=6)


    # matrices, arrays, and lists

# matrices are filled column by column unless byrow=TRUE is given
(A <- matrix(1:12, nrow=3, ncol=4))
(B <- matrix(c("a", "b", "c"), nrow=4, ncol=3, byrow=TRUE))
dim(A)
dim(B)

attributes(A)

str(A) # structure of an object
str(B)

(v <- sample(10, size=10)) # permutation of 1:10
dim(v)  # a plain vector has no dim attribute

vv <- v  # make a copy
dim(vv) <- c(5, 2) # reshape into a matrix
vv

(array.3 <- array(1:24, dim=c(4, 3, 2)))  # 3D array
dim(array.3)

# list elements may be of different kinds: two matrices and a vector here
(list.1 <- list(mat.1=A, mat.2=B, vec=v))  # a list

    # indexing
    
        # vectors

# v is the permutation of 1:10 created earlier in the script
v
v[2]        # one element
v[c(4, 2, 6)] # several elements
v[c(4, 2, 4)] # elements may be repeated

# negative indices drop the elements at those positions
v[-c(2, 4, 6, 8, 10)]   # omitting elements

# attach the names "a" through "j" to the ten elements
names(v) <- letters[1:10]
v
names(v)
v[c("f", "i", "g")]   # indexing by names

# the comparison gives a logical vector, which then selects the elements
# for which it is TRUE
v < 6
v[v < 6]    # logical indexing

(vv <- v)  # make a copy

vv[c(1, 3, 5)] <- c(1, 2, 3)    # replacing elements
vv

# the single value 0 is assigned to each of the five named elements
vv[c("b", "d", "f", "h", "j")] <- 0
vv

remove(vv)

        # matrices

# Matrix subscripts are [rows, columns]; leaving one empty selects all.
A
A[2, 3]              # a single element
A[c(1, 2), 2]        # rows 1 and 2 of column 2; the result is a vector
A[c(1, 2), c(2, 3)]  # a 2 x 2 submatrix
A[c(1, 2), ]         # rows 1 and 2, all columns

A[c(1, 2), 2, drop=FALSE]    # retain column dimension

A[ , -c(1, 3)]  # delete columns 1 and 3
A[-1, -2]       # delete row 1 and column 2

rownames(A) <- c("one", "two", "three")
colnames(A) <- c("w", "x", "y", "z")
A

A[c("one", "two"), c("x", "y")]  # indexing by row and column names
A[c(TRUE, FALSE, TRUE), ]        # logical indexing: rows 1 and 3

(AA <- A)  # make a copy

AA[1, ] <- 0  # replace the whole first row
AA

remove(AA)

        # lists

# Single brackets return a list; double brackets or $ return the element.
list.1

list.1[c(2, 3)]  # a list made up of elements 2 and 3

list.1[2]   # a one-element list
class(list.1[2])

list.1[[2]] # a list element
class(list.1[[2]])

list.1["mat.1"]    # by name: still a list
list.1[["mat.1"]]  # by name: the element itself

list.1$mat.1  # $ with a literal name also extracts the element

list.1$mat.1 <- matrix(1, 2, 2)     # replacing a list element
list.1$title <- "an arbitrary list" # adding an element
list.1$mat.2 <- NULL                  # removing an element
list.1

        # data frames

# Data frames accept matrix-style [rows, columns] and list-style indexing.
head(Guyer)

Guyer[, 1] # first column
Guyer[, "cooperation"] # by name
Guyer[c(1, 2), ] # first two rows
Guyer[c("1", "2"), "cooperation"] # rows selected by their row names
Guyer[-(6:20), ] # remove rows

# the logical vector on the first line is used to pick rows on the second
Guyer$sex=="female" & Guyer$condition=="public"
Guyer[Guyer$sex=="female" & Guyer$condition=="public", ] # selected rows

Guyer$cooperation       # the column as a vector
Guyer[["cooperation"]]  # the same, list-style
Guyer["cooperation"]  # a one-column data frame

    # large data sets
    
set.seed(123456789) # for reproducibility
# 100,000 x 100 matrix of standard-normal draws, filled column by column
X <- matrix(rnorm(100000*100), nrow=100000, ncol=100)

# response: constant 10, plus the sum of the 100 columns of X (every
# coefficient equal to 1), plus normal errors with standard deviation 10
y <- 10 + as.vector(X %*% rep(1, times=100) + rnorm(100000, sd=10))

# memory.size() and memory.limit() existed only on Windows and have been
# non-functional stubs since R 4.2.0, so memory use is reported with gc()
# instead, which works on every platform: the "(Mb)" columns of its result
# give the memory currently used and the maximum used so far. There is no
# portable replacement for memory.limit(), so that query is dropped.
gc() # memory used in Mb

# system.time() reports how long the model fit takes
system.time(m <- lm(formula=y ~ X)) # a linear least-squares regression
head(coef(m), n=6)  # first 6 coefficients

# success probabilities from a logistic model in which every column of X
# has coefficient 0.25; then one 0/1 draw per observation
p <- as.vector(1/(1 + exp(-X %*% rep(0.25, times=100))))
summary(p)
yy <- rbinom(n=100000, size=1, prob=p)
table(yy)

system.time(m <- glm(formula=yy ~ X, family=binomial)) # logistic regression
head(coef(m), n=6)

# list the workspace, delete EVERYTHING in it (including objects that were
# not created by this script), then list it again to show that it is empty
ls()
rm(list=ls())
ls()