###
###
###  Boston - Prelim
###   ver 0.0.4
###
####################################################


#-------------------------------------------------
###--- 0. Preliminary
# The Boston housing data set is shipped with the MASS package.
library("MASS")                    # install.packages("MASS")
data("Boston", package = "MASS")   # load the data set into the workspace
head(Boston)                       # peek at the first few rows

help("Boston")  # open the help page that explains each variable

  # 1  crim     per capita crime rate by town.
  # 2  zn       proportion of residential land zoned for lots over 25,000 sq.ft.
  # 3  indus    proportion of non-retail business acres per town.
  # 4  chas     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
  # 5  nox      nitrogen oxides concentration (parts per 10 million).
  # 6  rm       average number of rooms per dwelling.
  # 7  age      proportion of owner-occupied units built prior to 1940.
  # 8  dis      weighted mean of distances to five Boston employment centres.
  # 9  rad      index of accessibility to radial highways.
  # 10 tax      full-value property-tax rate per $10,000.
  # 11 ptratio  pupil-teacher ratio by town.
  # 12 black    1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town.
  # 13 lstat    lower status of the population (percent).
  # 14 medv     median value of owner-occupied homes in $1000s.

  # 13 lstat = lower status of the population (percent)
  #      Proportion of population that is lower status
  #      = 1/2 (proportion of adults without
  #      some high school education and proportion of male
  #      workers classified as laborers). The logarithmic
  #      specification implies that socioeconomic status
  #      distinctions mean more in the upper brackets of
  #      society than in the lower classes. Source: 1970 U. S. Census


#--- Turn the data into tibble ---
library("tidyverse")          # install.packages("tidyverse")
Boston <- as_tibble(Boston)   # overwrite the data.frame with its tibble version
print(Boston)

class(Boston)                 # class vector of the converted object
is_tibble(Boston)             # is Boston a tibble now?
is.data.frame(Boston)         # a tibble is still a data.frame as well

print(Boston)                 # default printing shows only the first 10 rows
print(Boston, n = 100)        # ask explicitly when more rows are wanted

# fix(Boston)   # spreadsheet view NOT RECOMMENDED. CLOSE TO MOVE FORWARD.

# Three ways to reach a column
Boston[, 1]         # 1st column (by position)
Boston[, "age"]     # column "age" (by name)
Boston$age          # column "age" (with $)

# Deliberate failure: before attach() a bare column name is not an object.
age
# attach() puts the columns on the search path, so "Boston$" can be dropped.
# NOTE: the attached columns are a snapshot; later changes to Boston are not
# reflected in them.
attach(Boston)

age                 # now this works
indus


### column "medv" is the response variable.

# Scatter plots: response (y axis) against a selection of predictors (x axis).
# Axis labels are set explicitly so they read the same as plot(crim, medv) etc.
for (predictor in c("crim", "zn", "age", "rad", "ptratio", "black", "chas")) {
  plot(Boston[[predictor]], Boston[["medv"]], xlab = predictor, ylab = "medv")
}

# Histogram of the response variable
hist(Boston[["medv"]], main = "Histogram of medv", xlab = "medv")


### setting response variable
# Give "medv" the generic name "resp" so the analysis below does not depend
# on the data-set-specific column name.
Boston2 <- rename(Boston, resp = medv)
print(Boston2)

# Bring the "resp" column to the front (1st column)
Boston2 <- relocate(Boston2, resp)
print(Boston2)

# Boston3: same as Boston2, but with "chas" recoded as a 0/1 factor
Boston3 <- mutate(Boston2, chas = as.factor(chas))
print(Boston3)


#-------------------------------------------------
###--- 1. Routine Exploratory Analysis (class of resp should be "dbl")
Orig <- Boston2             # working copy analysed throughout this section
resp.col.name <- "resp"     # name of the response column inside Orig

  #- Missing-value check: inspect first, then drop incomplete rows.
  print(summary(Orig))
  sum(is.na(Orig))          # total number of NA cells in the data
  # Dimensions before and after removing every row that contains an NA;
  # identical dimensions mean nothing was removed.
  dim(Orig)
  Orig <- na.omit(Orig)
  dim(Orig)


  ##----------
  ##- Correlation Check

  # Keep only the numeric columns: cor() requires numeric input.
  library(corrplot)    # install.packages("corrplot")
    # dplyr::select is spelled out because MASS (loaded earlier) also exports
    # a function called select(); where() replaces the superseded select_if().
    Orig_num <- Orig %>% dplyr::select(where(is.numeric))
    # Compute the correlation matrix once and reuse it for every display.
    cor_mat <- cor(Orig_num)
    cor_mat
    corrplot::corrplot(cor_mat)                     # default glyph display
    corrplot::corrplot(cor_mat, method = "number")  # same matrix as numbers


  ##----------
  ##- Visualization

  # Pairwise plot matrix, the GGally counterpart of pairs(Orig).
  # The columns are drawn in two batches (2-7 and 8-14); column 1 is appended
  # as the last column of each batch.
  library(GGally)      # install.packages("GGally")
    GGally::ggpairs(data = Orig[c(2:7, 1)], mapping = aes(alpha = 1))
    GGally::ggpairs(data = Orig[c(8:14, 1)], mapping = aes(alpha = 1))


  ###----------
  ###- Chi-sq test of association for each column
  # chisq.test(x, y) with two vectors coerces both to factors and tests the
  # resulting contingency table. With many distinct values the table is
  # sparse, so "Chi-squared approximation may be incorrect" warnings are to
  # be expected; treat the p-values as a rough screening device only.

  # Response values that every candidate column is tested against.
  resp_values <- Orig[[resp.col.name]]

  # Candidate columns: those with more than 1 unique value, excluding the
  # response itself (testing the response against itself is meaningless and
  # would always put it on the 'important' list).
  has_variation <- vapply(Orig, function(x) length(unique(x)) > 1, logical(1))
  list_cols <- setdiff(names(Orig)[has_variation], resp.col.name)

  # Apply chisq.test to exactly those columns. The result is a one-row tibble
  # holding one p-value per column, in the same order as list_cols, so the
  # positions returned by which() below index list_cols correctly.
  ChiSq.pval <- Orig %>%
    summarise(across(all_of(list_cols),
                     function(col) chisq.test(col, resp_values)$p.value))
  ChiSq.pval
  barplot(as.matrix(ChiSq.pval), las = 2, cex.names = 1,
          main = "p-values from Chi-sq test of association")
  abline(h = .05, col = 'red')   # conventional 5% significance threshold

  important_cols <- list_cols[which(ChiSq.pval < .05)]
  match(important_cols, names(Orig))    # Col num (in Orig) of variables w <.05 p-value
  important_cols                        # Top list of 'important' variables

  unimportant_cols <- list_cols[which(ChiSq.pval > .6)]
  match(unimportant_cols, names(Orig))  # Col num (in Orig) of variables w >.6 p-value
  unimportant_cols                      # Top list of 'unimportant' variables

### End of Routine Exploratory Analysis.
### Try repeating with Boston3.