####################################################################
###  K-NN - Default Data (in ISLR package) - with Oversampling   ###
####################################################################

###--- 1. Preliminary
library(ISLR)
data(Default)
names(Default)   # "default" "student" "balance" "income"
dim(Default)     # 10000 4

library(tidyverse)
Default <- as_tibble(Default)
Default

## Change class of the student column from factor to numeric
## (kNN needs numeric predictors; the response "default" stays a factor)
Default3 <- Default %>% mutate( student = as.numeric(student == "Yes") )
Default3

###--- 2. Data Separation with OVERSAMPLING
###--- Divide dataset into training and testing sets and set up k-fold CV
Orig          <- Default3   # entire data set (has to be a data frame)
train.size    <- 9000       # number of rows for training set
test.size     <- 1000       # number of rows for testing set
resp.col.name <- "default"  # name of response column
num.folds     <- 5          # k for k-fold CV
my.seed       <- 8346       # seed for reproducibility
ovspl.size    <- 5000       # size of oversample to add to training set
#---
set.seed(my.seed)
ix    <- sample(1:nrow(Orig))   # shuffle the rows
Orig2 <- Orig[ix, ]
Train.set  <- Orig2[1:train.size, ]
Train.resp <- Orig2[1:train.size, resp.col.name]
Test.set   <- Orig2[(train.size+1):(train.size+test.size), ]
Test.resp  <- Orig2[(train.size+1):(train.size+test.size), resp.col.name]

#--- Oversample (we use Default3 from here on)
set.seed(my.seed)
ix.oversample   <- sample(which(Train.resp == "Yes"), ovspl.size, replace=TRUE)
Oversampled.Yes <- Train.set[ix.oversample, ]
#- Add the oversample to the training set (does not affect Test.set)
Train.set  <- bind_rows(Train.set,  Oversampled.Yes)
Train.resp <- bind_rows(Train.resp, Oversampled.Yes[ , resp.col.name])

# K-fold Cross Validation
library(cvTools)   # install.packages("cvTools")
set.seed(my.seed)
folds <- cvFolds( nrow(Train.set), K=num.folds )   # k-fold CV (random assignment)
CV.train      <- list(Train.set[ folds$which != 1, ])
CV.train.resp <- list(Train.resp[folds$which != 1, 1])
CV.valid      <- list(Train.set[ folds$which == 1, ])
CV.valid.resp <- list(Train.resp[folds$which == 1, 1])
for (k in 2:num.folds) {
  CV.train[[k]]      <- Train.set[ folds$which != k, ]
  CV.train.resp[[k]] <- Train.resp[folds$which != k, 1]
  CV.valid[[k]]      <- Train.set[ folds$which == k, ]
  CV.valid.resp[[k]] <- Train.resp[folds$which == k, 1]
}
# Output (all data.frame):
#   Train.set / Train.resp
#   Test.set / Test.resp
#   CV.train[[k]] / CV.train.resp[[k]]
#   CV.valid[[k]] / CV.valid.resp[[k]]

# Train.set is oversampled
table(Train.set[ , "default"])
table(Train.resp)
# Test.set is not
table(Test.set[ , "default"])
table(Test.resp)

###--- 3. K-NN with 5-fold CV for Training Set
library(DMwR)   # for kNN()
library(caret)  # for confusionMatrix()
library(pROC)   # for auc() and plot.roc()

k.value.list <- c(1, 10, 50, 100, 150, 200, 250, 300, 400)  # numbers of neighbours to try
threshold    <- .05   # use values within 0.01 - 0.15

ER.test.record  <- matrix(0, num.folds, length(k.value.list))
AUC.test.record <- matrix(0, num.folds, length(k.value.list))
for (i in 1:length(k.value.list)) {
  for (k in 1:num.folds) {
    k.value <- k.value.list[i]
    nn3 <- kNN(default ~ ., as.data.frame(CV.train[[k]]), as.data.frame(CV.valid[[k]]),
               norm=FALSE, k=k.value, prob=TRUE)
    Valid.prob <- 1 - as.numeric(attributes(nn3)$prob)   # kNN reports prob of "No"; flip to get P("Yes")
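    # Valid.prob now approximates P(default == "Yes") for each validation row.
    # The ifelse() below turns these probabilities into hard labels, so the
    # error rate and confusion matrix (but not the AUC) depend on the threshold.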
    Valid.pred <- ifelse(Valid.prob > threshold, "Yes", "No")
    CM.test <- confusionMatrix(factor(Valid.pred), factor(as.matrix(CV.valid.resp[[k]])),
                               positive="Yes")
    # CM.test$table
    AUC.test <- auc(factor(as.matrix(CV.valid.resp[[k]])), Valid.prob, levels=c("No", "Yes"))
    # plot.roc(factor(as.matrix(CV.valid.resp[[k]])), Valid.prob, levels=c("No", "Yes"))
    # # mark the point on the ROC curve corresponding to CM.test:
    # abline(h=CM.test[["byClass"]][["Sensitivity"]],
    #        v=CM.test[["byClass"]][["Specificity"]], col="red")
    # text(.2, .2, paste("AUC=", round(AUC.test, 3)))
    ER.test <- mean(Valid.pred != CV.valid.resp[[k]])   # error rate on the validation fold
    ER.test.record[k, i]  <- ER.test
    AUC.test.record[k, i] <- AUC.test
    # note that ERs (unlike AUCs) are tied to the threshold value
  }
}
ER.test.record
AUC.test.record
apply(AUC.test.record, 2, mean)   # mean CV AUC for each value of k
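
###--- 4. Pick k and evaluate on the Test Set (sketch)
## A minimal sketch of one possible next step, not part of the original script:
## take the k with the highest mean CV AUC, refit on the full (oversampled)
## Train.set, and score the untouched Test.set. It reuses kNN(), auc() and
## confusionMatrix() exactly as above; best.i, best.k, nn.final, Test.prob and
## Test.pred are illustrative names introduced here.
best.i <- which.max(apply(AUC.test.record, 2, mean))
best.k <- k.value.list[best.i]
best.k

nn.final  <- kNN(default ~ ., as.data.frame(Train.set), as.data.frame(Test.set),
                 norm=FALSE, k=best.k, prob=TRUE)
Test.prob <- 1 - as.numeric(attributes(nn.final)$prob)   # same flip as in the CV loop
Test.pred <- ifelse(Test.prob > threshold, "Yes", "No")

confusionMatrix(factor(Test.pred), factor(as.matrix(Test.resp)), positive="Yes")
auc(factor(as.matrix(Test.resp)), Test.prob, levels=c("No", "Yes"))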