####################################################
###
### Boston - Prelim
###   ver 0.0.4
###
####################################################

#-------------------------------------------------
###--- 0. Preliminary

# The Boston housing data ships with the MASS package.
library(MASS)      # install.packages("MASS")
data("Boston")     # load the data set into the workspace
head(Boston)       # quick look at the first rows
help("Boston")     # documentation page explaining the variables

# Variables:
#  1  crim     per capita crime rate by town.
#  2  zn       proportion of residential land zoned for lots over 25,000 sq.ft.
#  3  indus    proportion of non-retail business acres per town.
#  4  chas     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
#  5  nox      nitrogen oxides concentration (parts per 10 million).
#  6  rm       average number of rooms per dwelling.
#  7  age      proportion of owner-occupied units built prior to 1940.
#  8  dis      weighted mean of distances to five Boston employment centres.
#  9  rad      index of accessibility to radial highways.
# 10  tax      full-value property-tax rate per $10,000.
# 11  ptratio  pupil-teacher ratio by town.
# 12  black    1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town.
# 13  lstat    lower status of the population (percent).
# 14  medv     median value of owner-occupied homes in $1000s.
#
# More on 13, lstat = lower status of the population (percent):
#   Proportion of population that is lower status
#     = 1/2 (proportion of adults without some high school education
#            and proportion of male workers classified as laborers).
#   The logarithmic specification implies that socioeconomic status
#   distinctions mean more in the upper brackets of society than in
#   the lower classes.
#   Source: 1970 U. S. Census

#--- Turn the data into a tibble ---
library(tidyverse)   # install.packages("tidyverse")

Boston <- Boston %>% as_tibble()
Boston

class(Boston)
is_tibble(Boston)       # is Boston a tibble now?
is.data.frame(Boston)   # it is still a data.frame as well

Boston                  # only the first 10 rows are printed
print(Boston, n = 100)  # ask for more rows explicitly

# fix(Boston)  # spreadsheet view. NOT RECOMMENDED. CLOSE TO MOVE FORWARD.
#--- Referring to columns ---
Boston[, 1]       # 1st column
Boston[, "age"]   # column "age"
Boston$age        # column "age"

# A bare column name is not an object in the workspace. Running the line
# below stops the script with "object 'age' not found", so it is kept only
# as a commented-out illustration:
# age

# attach(Boston) would make bare column names resolvable, but it changes the
# search path for the rest of the session. with() gives the same convenience,
# limited to a single expression.
with(Boston, age)
with(Boston, indus)

### Column "medv" is the response variable.

# Scatter plots of selected predictors against the response
with(Boston, {
  plot(crim, medv)
  plot(zn, medv)
  plot(age, medv)
  plot(rad, medv)
  plot(ptratio, medv)
  plot(black, medv)
  plot(chas, medv)
})

# Histogram of the response variable
with(Boston, hist(medv))

### Setting the response variable

# Rename the "medv" column to "resp" to streamline the analysis.
Boston2 <- Boston %>%
  rename(resp = medv)
Boston2

# Move the "resp" column to the 1st position.
Boston2 <- Boston2 %>%
  relocate(resp)
Boston2

# Turn the "chas" column into a 0/1 factor.
Boston3 <- Boston2 %>%
  mutate(chas = as.factor(chas))
Boston3

#-------------------------------------------------
###--- 1. Routine Exploratory Analysis (class of resp should be "dbl")

Orig <- Boston2
resp.col.name <- "resp"

#- Check for NA in the data. Remove the affected rows if there are any.
summary(Orig)
sum(is.na(Orig))

# If there are NAs in the data, the rows containing them are dropped here;
# compare dim() before and after.
dim(Orig)
Orig <- Orig %>%
  na.omit()
dim(Orig)

##----------
##- Correlation check

library(corrplot)   # install.packages("corrplot")

# Keep only the numeric columns, and compute the correlation matrix once.
Orig_num <- Orig %>%
  select(where(is.numeric))
cor_mat <- cor(Orig_num)
cor_mat

corrplot::corrplot(cor_mat)
corrplot::corrplot(cor_mat, method = "number")

##----------
##- Visualization

# pairs(Orig) equivalent; the response (column 1) is placed last in each panel.
library(GGally)   # install.packages("GGally")
GGally::ggpairs(Orig[, c(2:7, 1)], aes(alpha = 1))
GGally::ggpairs(Orig[, c(8:14, 1)], aes(alpha = 1))

###----------
###- Chi-sq test of association for each column

# Pick the columns that have more than 1 unique value.
n_unique <- vapply(Orig, function(x) length(unique(x)), integer(1))
list_cols <- names(n_unique)[n_unique > 1]

# Apply chisq.test() to exactly those columns, each against the response.
# The result is a one-row tibble whose columns follow the order of list_cols,
# so positions in ChiSq.pval and entries of list_cols refer to the same
# variables.
ChiSq.pval <- Orig %>%
  summarise(
    across(
      all_of(list_cols),
      function(x) chisq.test(x, Orig[[resp.col.name]])$p.value
    )
  )
ChiSq.pval
# Bar chart of the chi-square p-values, one bar per column of ChiSq.pval,
# with a red reference line at the 0.05 level.
pval_mat <- as.matrix(ChiSq.pval)   # one row, one named column per variable
barplot(
  pval_mat,
  las = 2,
  cex.names = 1,
  main = "p-values from Chi-sq test of association"
)
abline(h = 0.05, col = "red")

# Variable names are read from ChiSq.pval itself, so each column number and
# the name reported for it always come from the same table.
pval_names <- colnames(pval_mat)

# Variables with p-value < .05
low_idx <- which(pval_mat < 0.05)
low_idx                # column numbers
pval_names[low_idx]    # top list of 'important' variables

# Variables with p-value > .6
high_idx <- which(pval_mat > 0.6)
high_idx               # column numbers
pval_names[high_idx]   # top list of 'unimportant' variables

### End of Routine Exploratory Analysis.
### Try repeating with Boston3.