###
###
###  Digits - Prelim
###
###
####################################################


#-------------------------------------------------
###--- 0. Preliminary
library(tidyverse)
# Read the NIST digits data set straight from the author's website
Digits <- read_csv("https://nmimoto.github.io/datasets/NISTdigits.csv")
Digits

names(Digits)


# Drop the "index" column, then expose "digit" as a factor response
# column named "resp"
Digits2 <- Digits %>%
    select(-"index") %>%
    rename(resp=digit) %>%
    mutate(resp=factor(resp))
Digits2


dim(Digits2)
#  [1] 1797   65


###----------
###- Visualize digits
library(ggplot2)
library(grid)
    # Pick one observation at random. The row count is taken from the data
    # instead of being hard-coded.
    n_row <- sample(nrow(Digits2), 1)
    print(n_row)

    # Coordinates of the 8x8 tiles: Y runs fastest, X slowest.
    grid <- expand.grid(Y=0:7, X=0:7)[, c("X", "Y")]

    # Every column except the first (resp) holds a pixel value. Flatten the
    # selected row to a plain numeric vector so Z is an ordinary column
    # rather than a 64x1 matrix.
    grid$Z <- as.numeric(unlist(Digits2[n_row, -1]))

    # Heatmap
    # The label is extracted as a character string: pasting the one-cell
    # tibble Digits2[n_row, 1] would print the factor's internal level code
    # instead of the digit label.
    pplot <- ggplot(grid, aes(X, Y, fill= Z)) +
        geom_tile() +
        ggtitle(paste("Digit =", as.character(Digits2$resp[n_row])))

    # Draw inside a viewport rotated by -90 degrees
    # (presumably so the digit appears upright -- confirm visually).
    print(pplot, vp=grid::viewport(angle=-90))

    #there's also rotate() options from library(ggpubr)


####----------
###--- Routine Exploratory Analysis (resp should be factor)
Orig <- Digits2
resp.col.name <- "resp"

    #- Missing-value check: look at the per-column summaries and at the
    #  total number of NA cells. If any show up, uncomment the two lines
    #  below to drop the affected rows.
    summary(Orig)
    Orig %>% is.na() %>% sum()
    # Orig <- Orig %>% na.omit()
    # dim(Orig)

    #- Frequency of each level of the response column
    table(Orig[resp.col.name])

    ##----------
    ##- Correlation Check
    # Turn the response into a number. resp is a factor of digit labels, so
    # go through as.character() to recover the label itself rather than the
    # internal level code. (Comparing against "Yes", as one would for a
    # Yes/No response, yields an all-zero column for this data.)
    Orig2 <- Orig %>% mutate(resp=as.numeric(as.character(resp)))

    # Pick columns that are numeric
    Orig2_num <- Orig2 %>% select_if(is.numeric)

    # Drop constant columns: their standard deviation is zero, so cor()
    # would return NA for them (with a warning).
    has_spread <- vapply(Orig2_num,
                         function(x) isTRUE(sd(x, na.rm=TRUE) > 0),
                         logical(1))
    Orig2_num <- Orig2_num[, has_spread]

    # cor(Orig2_num)
    # corrplot::corrplot(cor(Orig2_num), method="number")
    corrplot::corrplot(cor(Orig2_num))

    ##----------
    ##- Visualization

    # Scatter-plot matrices (the ggplot2 counterpart of pairs(Orig)).
    # Each panel shows column 1 (resp) together with five further columns,
    # coloured by the response level.
    library(GGally)      # install.packages("GGally")
      pair_aes <- aes(colour=factor(resp), alpha=1)
      GGally::ggpairs(Orig[c(1,  2:6)],  mapping=pair_aes)
      GGally::ggpairs(Orig[c(1,  7:11)], mapping=pair_aes)
      GGally::ggpairs(Orig[c(1, 12:16)], mapping=pair_aes)
      GGally::ggpairs(Orig[c(1, 17:21)], mapping=pair_aes)
      GGally::ggpairs(Orig[c(1, 22:26)], mapping=pair_aes)
      GGally::ggpairs(Orig[c(1, 27:31)], mapping=pair_aes)


    ###----------
    ###- Chi-sq test of association for each column

    # Pick predictor columns that have more than 1 unique value. The
    # response itself is left out: testing it against itself says nothing
    # about the predictors and would always be flagged as 'important'.
    n_unique <- vapply(Orig, function(x) length(unique(x)), integer(1))
    list_cols <- setdiff(names(n_unique)[n_unique > 1], resp.col.name)

    # Apply chisq.test to those columns; the result is a one-row table with
    # one p-value per tested column, in the same order as list_cols.
    # A plain function is used instead of the deprecated funs().
    ChiSq.pval <- Orig[list_cols] %>%
        summarise_all(function(x) chisq.test(x, Orig[[resp.col.name]])$p.value)

    ChiSq.pval
    which(ChiSq.pval < .05)             # Position in list_cols of variables w <.05 p-value
    list_cols[which(ChiSq.pval < .05)]  # Top list of 'important' variables

    which(ChiSq.pval > .6)              # Position in list_cols of variables w >.6 p-value
    list_cols[which(ChiSq.pval > .6)]   # Top list of 'unimportant' variables

### End of Routine Exploratory Analysis