###
###  BCancer - Prelim
###      ver 0.0.4
###
####################################################


#-------------------------------------------------
###--- 0. Preliminary
library(tidyverse)
# Read the breast-cancer data set from the course site, then echo it twice:
# once with the default layout and once with width = 1000 to show all columns.
BCancer <- read_csv(file = "https://nmimoto.github.io/datasets/bcancer.csv")
BCancer
print(x = BCancer, width = 1000)

#  569 × 32
#  1  id                      "numeric"   "dbl"
#  2  diagnosis               "character" "chr"   <--- Response Variable. M=malignant, B=benign
#  3  mean_radius             "numeric"   "dbl"   (mean of distances from center to points on the perimeter)
#  4  mean_texture            "numeric"   "dbl"
#  5  mean_perimeter          "numeric"   "dbl"
#  6  mean_area               "numeric"   "dbl"
#  7  mean_smoothness         "numeric"   "dbl"
#  8  mean_compactness        "numeric"   "dbl"
#  9  mean_concavity          "numeric"   "dbl"
#  10 mean_concave_points     "numeric"   "dbl"
#  11 mean_symmetry           "numeric"   "dbl"
#  12 mean_fractal dimension  "numeric"   "dbl"
#  13 radius_error            "numeric"   "dbl"
#  14 texture_error           "numeric"   "dbl"   (standard deviation of gray-scale values)
#  15 perimeter_error         "numeric"   "dbl"
#  16 area_error              "numeric"   "dbl"
#  17 smoothness_error        "numeric"   "dbl"   (local variation in radius lengths)
#  18 compactness_error       "numeric"   "dbl"   (perimeter^2 / area - 1.0)
#  19 concavity_error         "numeric"   "dbl"   (severity of concave portions of the contour)
#  20 concave_points_error    "numeric"   "dbl"   (number of concave portions of the contour)
#  21 symmetry_error          "numeric"   "dbl"
#  22 fractal_dimension_error "numeric"   "dbl"   ("coastline approximation" - 1)
#  23 worst_radius            "numeric"   "dbl"
#  24 worst_texture           "numeric"   "dbl"
#  25 worst_perimeter         "numeric"   "dbl"
#  26 worst_area              "numeric"   "dbl"
#  27 worst_smoothness        "numeric"   "dbl"
#  28 worst_compactness       "numeric"   "dbl"
#  29 worst_concavity         "numeric"   "dbl"
#  30 worst_concave_points    "numeric"   "dbl"
#  31 worst_symmetry          "numeric"   "dbl"
#  32 worst_fractal_dimension "numeric"   "dbl"

# Size of the raw data: rows, columns
dim(BCancer)

# For every column, show its R class next to its tibble type abbreviation.
# Only the first few rows are needed to read off classes and types.
Cls <- unlist(lapply(head(BCancer), class))
Type <- unlist(lapply(head(BCancer), type_sum))
cbind(Cls, Type)


#- Frequency table of the response variable
table(BCancer[["diagnosis"]])


# Build the analysis table: drop "id", rename diagnosis to resp and move it
# to the first column, then recode M/B as a Yes/No factor.
BCancer2 <- BCancer %>%
  select(-id) %>%
  rename(resp = diagnosis) %>%
  relocate(resp) %>%
  mutate(resp = as.factor(ifelse(resp == "M", "Yes", "No")))
BCancer2

# Factor levels are sorted, so the counts print as No/Yes rather than Yes/No
table(BCancer2[["resp"]])


#-------------------------------------------------
###--- 1. Routine Exploratory Analysis (resp is a factor here, with levels "No"/"Yes")
Orig <- BCancer2
resp.col.name <- "resp"

# Look for missing values: per-column summary, total NA count, and the
# dimensions before any rows are removed.
summary(Orig)
sum(is.na(Orig))
dim(Orig)
# Drop every row containing an NA (nothing is removed when there are none),
# then show the dimensions again for comparison.
Orig <- na.omit(Orig)
dim(Orig)


  ##----------
  ##- Correlation Check

  # Keep only the numeric columns, print their correlation matrix,
  # and draw it as a correlation plot.
  library(corrplot)    # install.packages("corrplot")
  Orig_num <- select(Orig, where(is.numeric))
  cor(Orig_num)
  corrplot::corrplot(cor(Orig_num))
  # corrplot::corrplot(cor(Orig_num), method="number")


  ##----------
  ##- Visualization

  # pairs(Orig) equivalent, drawn six predictors at a time. Column 1 (resp)
  # is appended to every chunk so the panels can be coloured by class.
  # Orig has 31 columns (resp + 30 predictors) once "id" is dropped, so the
  # last chunk stops at column 31; asking a tibble for column 32 is an error.
  library(GGally)      # install.packages("GGally")
    GGally::ggpairs(Orig[, c(2:7,   1)], aes(color=resp, alpha=1))
    GGally::ggpairs(Orig[, c(8:13,  1)], aes(color=resp, alpha=1))
    GGally::ggpairs(Orig[, c(14:19, 1)], aes(color=resp, alpha=1))
    GGally::ggpairs(Orig[, c(20:25, 1)], aes(color=resp, alpha=1))
    GGally::ggpairs(Orig[, c(26:31, 1)], aes(color=resp, alpha=1))

  ###----------
  ###- Chi-sq test of association for each column

  # Pick columns that have more than 1 unique value, in column order.
  # chisq.test() needs at least two levels in each variable, so constant
  # columns must be excluded before testing.
  list_cols <- names(Orig)[
    vapply(Orig, function(x) length(unique(x)) > 1, logical(1))
  ]

  # Apply chisq.test against resp to exactly those columns. The result is a
  # one-row tibble whose columns are in the same order as list_cols, so the
  # positions returned by which() below index list_cols correctly.
  # NOTE(review): chisq.test(x, y) coerces both vectors to factors, so each
  # distinct numeric value is treated as its own category -- confirm this is
  # the intended screening statistic for continuous predictors.
  ChiSq.pval <- Orig %>%
    summarise(across(all_of(list_cols),
                     function(x) chisq.test(x, Orig$resp)$p.value))
  ChiSq.pval

  # One bar per tested column, with the 0.05 threshold marked in red
  barplot(as.matrix(ChiSq.pval), las = 2, cex.names = 1,
          main = "p-values from Chi-sq test of association")
  abline(h = .05, col = "red")

  which(ChiSq.pval < .05)             # Col num of variables w <.05 p-value
  list_cols[which(ChiSq.pval < .05)]  # Top list of 'important' variables

  which(ChiSq.pval > .6)              # Col num of variables w >.6 p-value
  list_cols[which(ChiSq.pval > .6)]   # Top list of 'unimportant' variables


#  which(ChiSq.pval < .05)             # Col num of variables w <.05 p-value
# [1]  1  2 22 29
# [1] "resp"  "mean_radius" "worst_radius"  "worst_concave_points"
# which(ChiSq.pval > .6)               # Col num of variables w >.6 p-value
# [1]  6 20
# [1] "mean_smoothness" "symmetry_error"


### End of Routine Exploratory Analysis.