##------------------------------------------------------------##
##  Script for Session 3: Data in R                           ##
##  John Fox                                                  ##
##  Introduction to the R Statistical Computing Environment   ##
##  ICPSR Summer Program                                      ##
##  2010                                                      ##
##------------------------------------------------------------##

# Data in R
#
# This part of the script builds small data objects by hand and reads data
# from files, the clipboard, and a spreadsheet. Expressions wrapped in
# parentheses, e.g. (x <- ...), assign and print in one step.

# reading data

# entering data at the keyboard

(x <- c(1, 2, 3, 4))                    # numeric data
(names <- c("John", "Sandy", 'Mary'))   # character data
(v <- c(TRUE, FALSE))                   # logical data

# At the console, `cooperation <- scan()` followed by the values and a blank
# line reads them from the keyboard. The text= form supplies the same 20
# values and also works when this file is parsed or source()d.
cooperation <- scan(
  text = "49 64 37 52 68 54 61 79 64 29 27 58 52 41 30 40 39 44 34 44"
)
cooperation

# patterned data

rep(5, 3)
rep(c(1, 2, 3), 2)
rep(1:3, 3:1)

(condition <- rep(c("public", "anonymous"), c(10, 10)))
(sex <- rep(rep(c("male", "female"), c(5, 5)), 2))

# stringsAsFactors = TRUE keeps condition and sex as factors in the data
# frame on every R version (the default became FALSE in R 4.0.0).
(Guyer <- data.frame(cooperation, condition, sex, stringsAsFactors = TRUE))

# reading data from a file into a data frame

Prestige <- read.table("d:/data/Prestige.txt", header = TRUE)
Prestige

Prestige <- read.table(file.choose(), header = TRUE)  # alternative

# reading data via the clipboard (e.g., from Excel)

Duncan <- read.table("clipboard", header = TRUE)
Duncan

remove(Duncan)

# importing data

# from a spreadsheet
# NOTE(review): odbcConnectExcel() appears to be Windows-only -- confirm
# before running this section elsewhere.

library(RODBC)
channel <- odbcConnectExcel("D:/data/Datasets.xls")
Prestige <- sqlQuery(channel, "select * from [Prestige$]")
odbcClose(channel)
head(Prestige)  # first 6 rows

# move the F1 column into the row names, then drop it
rownames(Prestige) <- Prestige$F1
Prestige$F1 <- NULL
head(Prestige)

remove(Prestige)

# from another package.
# Read an SPSS portable file with the foreign package, then lower-case the
# variable names.
library(foreign)
Nations <- read.spss("d:/data/nations.por", to.data.frame = TRUE)
View(Nations)
names(Nations) <- tolower(names(Nations))
head(Nations)

remove(Nations)

# accessing data in a package

library(car)
data(Duncan)  # not necessary (if package supports "lazy loading" of data)
Duncan

remove(Duncan)

Prestige  # from car via the lazy-loading mechanism

# from a package that's not loaded:

data(Animals, package = "MASS")
Animals

objects()
remove(Animals, channel, names, v, x)

# writing data to a file

write.table(Duncan, "c:/temp/Duncan.txt")

# working with data frames

# The search path

search()
# Expected to fail at this point unless some object named prestige already
# exists: Duncan is only attached further down.
prestige
Duncan[, "prestige"]

attach(Duncan)
prestige
search()

attach(Prestige)
search()
prestige  # prestige in Prestige shadows prestige in Duncan
Duncan[, "prestige"]  # still there!

detach(Prestige)
search()
prestige  # now from Duncan

mean(prestige)
mean(prestige, trim = 0.1)

# A deliberately naive one-argument replacement, defined in the workspace to
# show what happens when a base function is shadowed.
mean <- function(x) {
  warning("the mean function in the base package is shadowed")
  sum(x) / length(x)
}
mean(prestige)
# Errors: the shadowing function above takes only x, so trim is unused.
mean(prestige, trim = 0.1)

remove(mean)
mean(prestige, trim = 0.1)

mean <- mean(prestige)  # variable named "mean" -- no problem!
# `mean` is now also a numeric variable in the workspace; printing it shows
# the value, while the call below still finds the function because function
# lookup skips non-function objects.
mean
mean(prestige)

remove(mean)

detach(Duncan)

# avoiding attach()

mean(Duncan$prestige)  # explicit indexing

(lm(prestige ~ income + education, data = Duncan))  # using the data argument

with(Duncan, mean(prestige))  # using with()

with(Duncan, lm(prestige ~ income + education))

# missing data

head(Freedman, 10)  # first 10 rows
tail(Freedman)      # last 6 rows
some(Freedman)      # 10 randomly sampled rows

Freedman$density
median(Freedman$density)
median(Freedman$density, na.rm = TRUE)

with(Freedman, {
  plot(density, crime)  # NAs removed
  # interactive: click points on the plot to label them with their row names
  identify(density, crime, row.names(Freedman))
})

log(c(1, 10, NA, 100), base = 10)  # NAs propagated

with(Freedman, plot(log(density, base = 10), crime))

lm(crime ~ log(density, base = 10), data = Freedman)  # NAs handled by na.action
getOption("na.action")

# add the least-squares line to the current plot
abline(lm(crime ~ log(density, base = 10), data = Freedman), lty = "dashed")

# logical vector: TRUE for rows where both crime and density are observed
good <- with(Freedman, complete.cases(crime, density))
head(good, 20)  # first 20 values

with(Freedman,  # NAs handled by indexing:
  lines(lowess(log(density[good], base = 10), crime[good], f = 1.0)))

Freedman.good <- na.omit(Freedman)  # filtering NAs
head(Freedman.good)  # first 6 rows
dim(Freedman.good)   # number of rows and columns

# testing for NAs

NA == c(1, 2, NA, 4)  # wrong!
is.na(c(1, 2, NA, 4))  # the correct elementwise test for missing values
sum(is.na(Freedman))   # total count of NAs

objects()
remove(good, Freedman.good)

# numeric variables, character variables, and factors

condition
is.character(condition)

condition <- as.factor(condition)
condition

# drop the workspace copies; the columns stored inside Guyer are unaffected
remove(cooperation, condition, sex)

Guyer$condition
is.character(Guyer$condition)
is.factor(Guyer$condition)

summary(Guyer)

# modifying data

Guyer$perc.coop <- 100 * Guyer$cooperation / 120  # assign to data frame
head(Guyer)  # first 6 rows

# overwrite cooperation with the logit of the percentage
Guyer$cooperation <- with(Guyer, log(perc.coop / (100 - perc.coop)))  # replace
head(Guyer)

# matrices, arrays, and lists

(A <- matrix(1:12, 3, 4))
(B <- matrix(c("a", "b", "c"), 4, 3, byrow = TRUE))
dim(A)
dim(B)
attributes(A)
str(A)  # structure of an object
str(B)

(v <- sample(10, 10))  # permutation of 1:10
dim(v)

vv <- v             # make a copy
dim(vv) <- c(5, 2)  # reshape into a matrix
vv

(array.3 <- array(1:24, c(4, 3, 2)))  # 3D array
dim(array.3)

(list.1 <- list(mat.1 = A, mat.2 = B, vec = v))  # a list

# indexing

# vectors

v
v[2]                   # one element
v[c(4, 2, 6)]          # several elements
v[c(4, 2, 4)]          # elements may be repeated
v[-c(2, 4, 6, 8, 10)]  # omitting elements

names(v) <- letters[1:10]
v
names(v)
v[c("f", "i", "g")]  # indexing by names

v < 6
v[v < 6]  # logical indexing

(vv <- v)  # make a copy
vv[c(1, 3, 5)] <- c(1, 2, 3)  # replacing elements
vv
vv[c("b", "d", "f", "h", "j")] <- 0
vv

remove(vv)

# matrices

A
A[2, 3]
A[c(1, 2), 2]
A[c(1, 2), c(2, 3)]
A[c(1, 2), ]
A[c(1, 2), 2, drop = FALSE]  # retain column dimension
A[, -c(1, 3)]                # delete columns 1 and 3
A[-1, -2]                    # delete row 1 and column 2

rownames(A) <- c("one", "two", "three")
colnames(A) <- c("w", "x", "y", "z")
A
A[c("one", "two"), c("x", "y")]
A[c(TRUE, FALSE, TRUE), ]

(AA <- A)
AA[1, ] <- 0
AA

remove(AA)

# lists

list.1
list.1[c(2, 3)]
list.1[2]           # a one-element list
class(list.1[2])
list.1[[2]]         # a list element
class(list.1[[2]])

list.1["mat.1"]
list.1[["mat.1"]]
list.1$mat.1

list.1$mat.1 <- matrix(1, 2, 2)      # replacing a list element
list.1$title <- "an arbitrary list"  # adding an element
list.1$mat.2 <- NULL                 # removing an element
list.1

# data frames

head(Guyer)

Guyer[, 1]              # first column
Guyer[, "cooperation"]  # by name
Guyer[c(1, 2), ]        # first two rows
Guyer[c("1", "2"), "cooperation"]
Guyer[-(6:20), ]        # remove rows

Guyer$sex == "female" & Guyer$condition == "public"
Guyer[Guyer$sex == "female" & Guyer$condition == "public", ]  # selected rows

Guyer$cooperation
Guyer[["cooperation"]]
Guyer["cooperation"]  # a one-column data frame

# large data sets

set.seed(123456789)  # for reproducibility

# 100,000 x 100 matrix of standard-normal predictors and a linear response
X <- rnorm(100000 * 100)
X <- matrix(X, 100000, 100)
y <- 10 + as.vector(X %*% rep(1, 100) + rnorm(100000, sd = 10))

# memory.size() and memory.limit() were Windows-only and are stubs from
# R 4.2.0 on, so call them only where they still report something and fall
# back to gc(), which reports memory in use (in Mb) on every platform.
if (.Platform$OS.type == "windows" && getRversion() < "4.2.0") {
  print(memory.size())  # memory used in Mb
  print(memory.limit())
} else {
  print(gc())
}

system.time(m <- lm(y ~ X))  # a linear least-squares regression
head(coef(m))  # first 6 coefficients

# success probabilities from a logistic model, then simulated 0/1 responses
p <- as.vector(1 / (1 + exp(-X %*% rep(0.25, 100))))
summary(p)
yy <- rbinom(100000, 1, prob = p)
table(yy)

system.time(m <- glm(yy ~ X, family = binomial))
head(coef(m))

# final cleanup: this deletes every object in the workspace
objects()
remove(list = objects())
objects()