####################################################################
###  K-NN - Default Data (in ISLR package) - with Oversampling   ###
####################################################################

###--- 1. Preliminary
library(ISLR)
data(Default)
names(Default)   # "default" "student" "balance" "income"
dim(Default)     # 10000 4

library(tidyverse)
Default <- as_tibble(Default)
Default

## Change class of the student column from factor to numeric
## (kNN needs numeric predictors; the response "default" stays a factor)
Default3 <- Default %>% mutate( student = as.numeric(student == "Yes") )
Default3

###--- 2. Data Separation with OVERSAMPLING
###--- Divide dataset into training and testing sets and set up k-fold CV
Orig          <- Default3   # entire data set (has to be a data frame)
train.size    <- 9000       # number of rows for training set
test.size     <- 1000       # number of rows for testing set
resp.col.name <- "default"  # name of response column
num.folds     <- 5          # k for k-fold CV
my.seed       <- 8346       # seed for reproducibility
ovspl.size    <- 5000       # size of oversample to add to training set
#---
set.seed(my.seed)
ix    <- sample(1:nrow(Orig))   # shuffle the rows
Orig2 <- Orig[ix, ]
Train.set  <- Orig2[1:train.size, ]
Train.resp <- Orig2[1:train.size, resp.col.name]
Test.set   <- Orig2[(train.size+1):(train.size+test.size), ]
Test.resp  <- Orig2[(train.size+1):(train.size+test.size), resp.col.name]

#--- Oversample (we use Default3 from here on)
set.seed(my.seed)
ix.oversample   <- sample(which(Train.resp == "Yes"), ovspl.size, replace=TRUE)
Oversampled.Yes <- Train.set[ix.oversample, ]
#- Add the oversample to the training set (does not affect Test.set)
Train.set  <- bind_rows(Train.set,  Oversampled.Yes)
Train.resp <- bind_rows(Train.resp, Oversampled.Yes[ , resp.col.name])

# K-fold Cross Validation
library(cvTools)   # install.packages("cvTools")
set.seed(my.seed)
folds <- cvFolds( nrow(Train.set), K=num.folds )   # k-fold CV (random assignment)
CV.train      <- list(Train.set[ folds$which != 1, ])
CV.train.resp <- list(Train.resp[folds$which != 1, 1])
CV.valid      <- list(Train.set[ folds$which == 1, ])
CV.valid.resp <- list(Train.resp[folds$which == 1, 1])
for (k in 2:num.folds) {
  CV.train[[k]]      <- Train.set[ folds$which != k, ]
  CV.train.resp[[k]] <- Train.resp[folds$which != k, 1]
  CV.valid[[k]]      <- Train.set[ folds$which == k, ]
  CV.valid.resp[[k]] <- Train.resp[folds$which == k, 1]
}
# Output (all data.frame):
#   Train.set / Train.resp
#   Test.set / Test.resp
#   CV.train[[k]] / CV.train.resp[[k]]
#   CV.valid[[k]] / CV.valid.resp[[k]]

# Train.set is oversampled
table(Train.set[ , "default"])
table(Train.resp)
# Test.set is not
table(Test.set[ , "default"])
table(Test.resp)

###--- 3. K-NN with 5-fold CV for Training Set
library(DMwR)   # for kNN()
library(caret)  # for confusionMatrix()
library(pROC)   # for auc() and plot.roc()

k.value.list <- c(1, 10, 50, 100, 150, 200, 250, 300, 400)  # numbers of neighbours to try
threshold    <- .05   # use values within 0.01 - 0.15

ER.test.record  <- matrix(0, num.folds, length(k.value.list))
AUC.test.record <- matrix(0, num.folds, length(k.value.list))
for (i in 1:length(k.value.list)) {
  for (k in 1:num.folds) {
    k.value <- k.value.list[i]
    nn3 <- kNN(default ~ ., as.data.frame(CV.train[[k]]), as.data.frame(CV.valid[[k]]),
               norm=FALSE, k=k.value, prob=TRUE)
    Valid.prob <- 1 - as.numeric(attributes(nn3)$prob)   # kNN reports prob of "No"; flip to get P("Yes")
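    # Valid.prob now approximates P(default == "Yes") for each validation row.
    # The ifelse() below turns these probabilities into hard labels, so the
    # error rate and confusion matrix (but not the AUC) depend on the threshold.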
    Valid.pred <- ifelse(Valid.prob > threshold, "Yes", "No")
    CM.test <- confusionMatrix(factor(Valid.pred), factor(as.matrix(CV.valid.resp[[k]])),
                               positive="Yes")
    # CM.test$table
    AUC.test <- auc(factor(as.matrix(CV.valid.resp[[k]])), Valid.prob, levels=c("No", "Yes"))
    # plot.roc(factor(as.matrix(CV.valid.resp[[k]])), Valid.prob, levels=c("No", "Yes"))
    # # mark the point on the ROC curve corresponding to CM.test:
    # abline(h=CM.test[["byClass"]][["Sensitivity"]],
    #        v=CM.test[["byClass"]][["Specificity"]], col="red")
    # text(.2, .2, paste("AUC=", round(AUC.test, 3)))
    ER.test <- mean(Valid.pred != CV.valid.resp[[k]])   # error rate on the validation fold
    ER.test.record[k, i]  <- ER.test
    AUC.test.record[k, i] <- AUC.test
    # note that ERs (unlike AUCs) are tied to the threshold value
  }
}
ER.test.record
AUC.test.record
apply(AUC.test.record, 2, mean)   # mean CV AUC for each value of k
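
###--- 4. Pick k and evaluate on the Test Set (sketch)
## A minimal sketch of one possible next step, not part of the original script:
## take the k with the highest mean CV AUC, refit on the full (oversampled)
## Train.set, and score the untouched Test.set. It reuses kNN(), auc() and
## confusionMatrix() exactly as above; best.i, best.k, nn.final, Test.prob and
## Test.pred are illustrative names introduced here.
best.i <- which.max(apply(AUC.test.record, 2, mean))
best.k <- k.value.list[best.i]
best.k

nn.final  <- kNN(default ~ ., as.data.frame(Train.set), as.data.frame(Test.set),
                 norm=FALSE, k=best.k, prob=TRUE)
Test.prob <- 1 - as.numeric(attributes(nn.final)$prob)   # same flip as in the CV loop
Test.pred <- ifelse(Test.prob > threshold, "Yes", "No")

confusionMatrix(factor(Test.pred), factor(as.matrix(Test.resp)), positive="Yes")
auc(factor(as.matrix(Test.resp)), Test.prob, levels=c("No", "Yes"))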