####################################################
###  K-NN - Default Data (in ISLR package)
###  with SMOTE
####################################################

###--- 1. Preliminary
library(ISLR)
library(tidyverse)
library(DMwR)     # SMOTE(), kNN()
library(cvTools)  # cvFolds()   install.packages("cvTools")
library(caret)    # confusionMatrix()
library(pROC)     # auc(), plot.roc()

data(Default)
names(Default)    # "default" "student" "balance" "income"
dim(Default)      # 10000 4

Default <- as_tibble(Default)
Default

## Recode the student factor as a 0/1 numeric column (kNN needs numeric
## predictors). The response column `default` is left as a factor.
Default3 <- Default %>%
  mutate(student = as.numeric(student == "Yes"))
Default3

table(Default3$default)


###--- 2. Data Separation with SMOTE
###--- Divide Dataset to Training and Testing and Set up k fold CV
Orig          <- Default3    # Entire data set
train.size    <- 8325        # num of (original) rows used for training
test.size     <- 1000        # num of rows for testing set
resp.col.name <- "default"   # name of response column
num.folds     <- 5           # k for k-fold CV
my.seed       <- 8346        # give a seed

# The train and test row ranges below must both fit inside the data.
stopifnot(train.size + test.size <= nrow(Orig))

#--- Shuffle once, then carve out disjoint training and testing rows.
set.seed(my.seed)
ix    <- sample(seq_len(nrow(Orig)))
Orig2 <- Orig[ix, ]

train.rows <- seq_len(train.size)
test.rows  <- train.size + seq_len(test.size)

Test.set  <- Orig2[test.rows, ]
Test.resp <- Orig2[test.rows, resp.col.name]

#- Oversample the Training Set with SMOTE.
#  Only the training rows are passed in, so that no test row (or a synthetic
#  neighbour of one) ends up in Train.set. SMOTE() is given a plain
#  data.frame rather than a tibble.
set.seed(my.seed)
Train.set <- DMwR::SMOTE(default ~ .,
                         data       = as.data.frame(Orig2[train.rows, ]),
                         perc.over  = 1200,
                         perc.under = 100)
Train.set  <- as_tibble(Train.set)
Train.resp <- Train.set[, resp.col.name]

# K-fold Cross Validation (random assignment)
set.seed(my.seed)
folds <- cvFolds(nrow(Train.set), K = num.folds)

# folds$which is aligned with the *permuted* indices in folds$subsets, i.e.
# row folds$subsets[j] belongs to fold folds$which[j]. Map it back to one
# fold label per row of Train.set before using it as a row mask.
fold.id <- integer(nrow(Train.set))
fold.id[as.matrix(folds$subsets)[, 1]] <- folds$which

CV.train      <- vector("list", num.folds)
CV.train.resp <- vector("list", num.folds)
CV.valid      <- vector("list", num.folds)
CV.valid.resp <- vector("list", num.folds)
for (k in seq_len(num.folds)) {
  in.fold <- fold.id == k
  CV.train[[k]]      <- Train.set[!in.fold, ]
  CV.train.resp[[k]] <- Train.resp[!in.fold, 1]
  CV.valid[[k]]      <- Train.set[in.fold, ]
  CV.valid.resp[[k]] <- Train.resp[in.fold, 1]
}

# Output (all tibbles / lists of tibbles):
#   Train.set     / Train.resp
#   Test.set      / Test.resp
#   CV.train[[k]] / CV.train.resp[[k]]
#   CV.valid[[k]] / CV.valid.resp[[k]]

# Train.set is oversampled
table(Train.set[, "default"])
table(Train.resp)

# Test.set is not
table(Test.set[, "default"])
table(Test.resp)

# Visualize SMOTE
plot(Train.set$balance, Train.set$income,
     col  = ifelse(Train.set$default == "Yes", "red", "blue"),
     pch  = ifelse(Train.set$default == "Yes", 4, 3),
     xlab = "Balance", ylab = "Income")


###--- 3. K-NN with k-fold CV for Training Set
k.value.list <- c(1, 10, 50, 100, 150, 200, 250, 300, 400)  # number of neighbours to include
# Cut-off on P(default = "Yes").
# NOTE(review): the 0.01 - 0.15 range was suggested for the earlier score
# definition; re-tune it against the corrected Valid.prob below.
threshold    <- .05

class.levels <- c("No", "Yes")

# One row per fold, one column per candidate k.
ER.test.record  <- matrix(0, num.folds, length(k.value.list))
AUC.test.record <- matrix(0, num.folds, length(k.value.list))

for (i in seq_along(k.value.list)) {
  print(i)
  k.value <- k.value.list[i]

  for (k in seq_len(num.folds)) {
    # True labels of the validation fold as a plain factor with fixed levels.
    valid.truth <- factor(CV.valid.resp[[k]][[1]], levels = class.levels)

    nn3 <- kNN(default ~ .,
               as.data.frame(CV.train[[k]]),
               as.data.frame(CV.valid[[k]]),
               norm = FALSE, k = k.value, prob = TRUE)

    # The "prob" attribute is the vote share of the *winning* class, not of
    # a fixed class, so convert it to P("Yes") according to the prediction.
    win.prob   <- as.numeric(attr(nn3, "prob", exact = TRUE))
    Valid.prob <- ifelse(nn3 == "Yes", win.prob, 1 - win.prob)
    Valid.pred <- ifelse(Valid.prob > threshold, "Yes", "No")

    # Fix the levels on both sides so the call still works when every
    # prediction falls into a single class.
    CM.test <- confusionMatrix(factor(Valid.pred, levels = class.levels),
                               valid.truth, positive = "Yes")
    # CM.test$table

    # direction = "<": controls ("No") are expected to score lower than
    # cases ("Yes"); fixing it stops pROC from choosing the direction per fold.
    AUC.test <- auc(valid.truth, Valid.prob,
                    levels = class.levels, direction = "<")
    plot.roc(valid.truth, Valid.prob,
             levels = class.levels, direction = "<")
    # plot.roc(factor(as.matrix(Test.resp)), Valid.prob, levels=c("No", "Yes"))
    # point corresponding to CM.test
    # abline(h=CM.test[["byClass"]][["Sensitivity"]], v=CM.test[["byClass"]][["Specificity"]], col="red")
    # text(.2, .2, paste("AUC=",round(auc.test, 3)))

    # Error rate on the validation fold (tied to the threshold value).
    ER.test <- mean(Valid.pred != as.character(valid.truth))

    ER.test.record[k, i]  <- ER.test
    AUC.test.record[k, i] <- as.numeric(AUC.test)
  }
}

ER.test.record
AUC.test.record
colMeans(AUC.test.record)   # mean AUC over folds, per candidate k