###
###
###   Digits - Prelim
###
###
####################################################

#-------------------------------------------------
###--- 0. Preliminary
library(tidyverse)

# Load data from the author's website.
Digits <- read_csv(file = "https://nmimoto.github.io/datasets/NISTdigits.csv")
Digits
names(Digits)

# Remove "index", change class of "digit" to factor, rename "digit" to "resp".
Digits2 <- Digits %>%
  select(-"index") %>%
  mutate(digit = factor(digit)) %>%
  rename(resp = digit)
Digits2
dim(Digits2)   # [1] 1797   65


###----------
###- Visualize digits
library(ggplot2)
library(grid)

# Pick one observation at random. The row count is taken from the data
# instead of being hard-coded, so the script keeps working if rows are
# added or removed.
n_row <- sample.int(nrow(Digits2), 1)
print(n_row)

# Lay the non-response columns of the chosen row out on an 8 x 8 grid.
# The k-th pixel column (k = 1..64) gets X = (k - 1) %/% 8 and
# Y = (k - 1) %% 8, i.e. Y runs fastest.
# NOTE: this object is named `grid` like the package loaded above; the
# package is still reachable through the `grid::` prefix used below.
pixel_cols <- names(Digits2) != "resp"
grid <- expand.grid(Y = 0:7, X = 0:7)[, c("X", "Y")]
grid$Z <- as.numeric(unlist(Digits2[n_row, pixel_cols]))

# Heatmap.
# The title is built from the factor *label*. Pasting the 1 x 1 tibble
# `Digits2[n_row, 1]` would show the factor's internal level code
# (label + 1 for levels "0".."9") instead of the digit itself.
pplot <- ggplot(grid, aes(X, Y, fill = Z)) +
  geom_tile() +
  ggtitle(paste("Digit =", as.character(Digits2$resp[n_row])))

# Rotate the viewport so the image is drawn in its natural orientation.
print(pplot, vp = grid::viewport(angle = -90))
# there's also rotate() options from library(ggpubr)


####----------
###--- Routine Exploratory Analysis (resp should be factor)

Orig <- Digits2
resp.col.name <- "resp"

#- Check for N/A in data. Remove if there's any.
# Per-column summaries and the total number of missing cells in `Orig`.
summary(Orig)
sum(is.na(Orig))
# Orig <- Orig %>% na.omit()
# dim(Orig)

# Number of observations in each response class.
table(Orig[[resp.col.name]])

# TRUE when a column takes more than one distinct value. Constant columns
# carry no information for the correlation or association checks below.
is_varying <- function(x) length(unique(x)) > 1


##----------
##- Correlation Check
# Turn the response to numeric. `resp` is a factor of digit labels, so it is
# converted through as.character() to get the digit value rather than the
# internal level code. (Comparing against "Yes", as for a Yes/No response,
# would give a column of zeros here.)
# NOTE(review): assumes every level of `resp` parses as a number.
Orig2 <- Orig %>% mutate(resp = as.numeric(as.character(resp)))

# Pick columns that are numeric
Orig2_num <- Orig2 %>% select(where(is.numeric))

# Correlation is undefined for zero-variance columns (cor() returns NA for
# them, with a warning), so only varying columns are plotted.
cor_cols <- names(Orig2_num)[vapply(Orig2_num, is_varying, logical(1))]
# cor(Orig2_num[, cor_cols])
# corrplot::corrplot(cor(Orig2_num[, cor_cols]), method="number")
corrplot::corrplot(cor(Orig2_num[, cor_cols]))


##----------
##- Visualization
# pairs(Orig) equivalent
library(GGally)   # install.packages("GGally")

# Response (column 1) against five predictors at a time: columns 2-6, 7-11,
# ..., 27-31. print() is required because plots are not auto-printed inside
# a loop.
for (first_col in seq(2, 27, by = 5)) {
  print(
    GGally::ggpairs(
      Orig[, c(1, first_col:(first_col + 4))],
      mapping = aes(colour = factor(resp), alpha = 1)
    )
  )
}


###----------
###- Chi-sq test of association for each column

# Pick predictor columns that have more than 1 unique value. The response
# itself is excluded; testing it against itself would always be "significant".
list_cols <- setdiff(
  names(Orig)[vapply(Orig, is_varying, logical(1))],
  resp.col.name
)

# Apply chisq.test to those columns; the result is a one-row tibble of
# p-values with one column per entry of list_cols.
# chisq.test() may warn that the chi-squared approximation is inaccurate
# when some cells of the contingency table have small expected counts.
resp_values <- Orig[[resp.col.name]]
ChiSq.pval <- Orig %>%
  summarise(across(all_of(list_cols), ~ chisq.test(.x, resp_values)$p.value))
ChiSq.pval

# Positions below index into list_cols (not into the columns of Orig).
which(ChiSq.pval < 0.05)              # Positions of variables w <.05 p-value
list_cols[which(ChiSq.pval < 0.05)]   # Top list of 'important' variables

which(ChiSq.pval > 0.6)               # Positions of variables w >.6 p-value
list_cols[which(ChiSq.pval > 0.6)]    # Top list of 'unimportant' variables

### End of Routine Exploratory Analysis