###
###  Heart Data - Support Vector Machine
###    ver 0.0.4
###
####################################################
#--- 0. Preliminary
#--- 1. Support Vector Machine with Linear Kernel using In-House CV
#--- 2. SVM with Linear Kernel using Auto-CV
#--- 3. SVM with Radial Kernel using Auto-CV
#--- 4. SVM with Polynomial Kernel using Auto-CV
#--- 5. Threshold Picker


###-------------------------------------------------------
###--- 0. Preliminary

# Using Heart2 from https://nmimoto.github.io/R/heart0.txt
library(tidyverse)
Heart <- read_csv(file="https://nmimoto.github.io/datasets/heart.csv")

# Rename AHD column as resp.
Heart2 <- Heart %>%
    select(-"index") %>%              # remove col named "index"
    rename(resp=AHD) %>%              # rename the column
    relocate(resp) %>%                # move "resp" to 1st column
    mutate(resp=as.factor(resp),      # turn these columns into <fct> instead of <chr>
           ChestPain=as.factor(ChestPain),
           Thal=as.factor(Thal),
           Sex=as.factor(Sex),
           Fbs=as.factor(Fbs),
           RestECG=as.factor(RestECG),
           ExAng=as.factor(ExAng))
print(Heart2, width=1000)

table(Heart2$resp)    # note that first level is "No"


########################################
###
###  When we are doing SVM or NN, we need to turn all variables into numbers.
###  The data will be scaled inside svm() automatically.
###  (A sketch of dummy-coding the factor columns instead of dropping them
###   is given at the end of Section 0b below.)
###
########################################

print(Heart, width=1000)

# Rename AHD column as resp.
Heart3 <- Heart %>%
    select(-"index") %>%              # remove col named "index"
    rename(resp=AHD) %>%              # rename the column
    relocate(resp) %>%                # move "resp" to 1st column
    mutate(resp=as.factor(resp))      # turn resp into <fct>
print(Heart3, width=1000)

Heart3 <- Heart3 %>% select(-c("ChestPain", "Thal"))   # remove the non-numeric columns
print(Heart3, width=1000)


###--------------------------------------------------------------------
# Need to remove rows with NA.
Orig <- Heart3

# Check for NA in data. Remove if there are any.
summary(Orig)
dim(Orig)
sum(is.na(Orig))

# If there are NAs in the data, run below
Orig <- Orig %>% na.omit()
dim(Orig)


###--------------------------------------------------------------------
###--- 0b. Divide Dataset into Training and Testing and Set up k-fold CV

Orig <- Orig          # Entire data set (must be a data.frame)
train.size <- 250     # num of rows for training set
test.size  <- 47      # num of rows for testing set
my.seed    <- 7211    # give a seed

###
### This line replaces the AAAA to BBBB chunk
source('https://nmimoto.github.io/R/ML-00.txt')
###
# Output (all data.frame):
#   Train.set     / Train.resp
#   Test.set      / Test.resp
#   CV.train[[k]] / CV.train.resp[[k]]
#   CV.valid[[k]] / CV.valid.resp[[k]]
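###
###  (Not part of the original script.)  A minimal sketch of dummy-coding the factor
###  columns ChestPain and Thal into numeric 0/1 indicator columns with model.matrix(),
###  as an alternative to dropping them above.  Heart2c, dummies and Heart3b are
###  hypothetical names used only in this sketch.
###
Heart2c <- Heart2 %>% na.omit()       # drop NA rows first so row counts match below
dummies <- as.data.frame(model.matrix(~ ChestPain + Thal, data=Heart2c))[, -1]   # 0/1 indicators (first level dropped)
Heart3b <- Heart2c %>%
    select(-c("ChestPain", "Thal")) %>%
    bind_cols(dummies)
print(Heart3b, width=1000)            # numeric version that keeps the ChestPain / Thal information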
###-------------------------------------------------
###--- 1. Support Vector Machine with Linear Kernel using In-House CV

library(e1071)    # install.packages("e1071", repos='https://cran.case.edu/')

AUCs <- matrix(0, 5, 2)
colnames(AUCs) = c("Train AUC", "Valid AUC")
for (k in 1:5) {

    Fit00 <- e1071::svm(resp~., data=CV.train[[k]],        # <----- Change model here
                        kernel="linear", cost=5,
                        # kernel="radial", gamma=.5, cost=1,
                        scale=TRUE)
    summary(Fit00)

    #- Decision values in Training Set
    Train.prob0 = predict(Fit00, CV.train[[k]], decision.values=TRUE)
    Train.prob  = attributes(Train.prob0)$decision.values

    #- Predict in Validation Set
    Valid.prob0 = predict(Fit00, CV.valid[[k]], decision.values=TRUE)
    Valid.prob  = attributes(Valid.prob0)$decision.values

    #- Training and validation AUC for this fold
    library(caret)
    library(pROC)
    Train.prob1 = as.numeric(Train.prob)   # (Train.prob-min(Train.prob))/(max(Train.prob)-min(Train.prob))
    Valid.prob1 = as.numeric(Valid.prob)   # (Valid.prob-min(Valid.prob))/(max(Valid.prob)-min(Valid.prob))

    AUCs[k,] <- round(c(auc(factor(as.matrix(CV.train.resp[[k]])), as.vector(Train.prob), levels=c("No", "Yes")),
                        auc(factor(as.matrix(CV.valid.resp[[k]])), as.vector(Valid.prob), levels=c("No", "Yes"))), 4)
}
AUCs

Av.AUCs = apply(AUCs, 2, mean)
names(Av.AUCs) = c("Av.Train AUC", "Av.Valid AUC")
Av.AUCs

###
###  Make decision about best model based on Av.Valid AUC.
###  A better Cost parameter value must be found.
###  After the best Cost value is found, the final Train/Test fit must be performed.
###  (A sketch of looping this CV over a grid of Cost values follows below.)
###
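###
###  (Not part of the original script.)  A minimal sketch of wrapping the in-house CV
###  loop above over a grid of Cost values, so the best Cost can be chosen by Av.Valid AUC.
###  cost.grid, AUCs.j and Av.Valid.AUC are hypothetical names, and the grid is an assumption.
###
cost.grid <- c(0.01, 0.1, 1, 5, 10, 100)
Av.Valid.AUC <- numeric(length(cost.grid))
for (j in seq_along(cost.grid)) {
    AUCs.j <- matrix(0, 5, 2)
    for (k in 1:5) {
        Fit.jk <- e1071::svm(resp~., data=CV.train[[k]],
                             kernel="linear", cost=cost.grid[j], scale=TRUE)
        Tr <- attributes(predict(Fit.jk, CV.train[[k]], decision.values=TRUE))$decision.values
        Va <- attributes(predict(Fit.jk, CV.valid[[k]], decision.values=TRUE))$decision.values
        AUCs.j[k, ] <- c(pROC::auc(factor(as.matrix(CV.train.resp[[k]])), as.vector(Tr), levels=c("No", "Yes")),
                         pROC::auc(factor(as.matrix(CV.valid.resp[[k]])), as.vector(Va), levels=c("No", "Yes")))
    }
    Av.Valid.AUC[j] <- mean(AUCs.j[, 2])
}
cbind(cost=cost.grid, Av.Valid.AUC)    # pick the Cost with the largest Av.Valid.AUC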
###-------------------------------------------------
###--- 2. SVM with Linear Kernel using Auto-CV

library(e1071)    # install.packages("e1071", repos='https://cran.case.edu/')

## [ Train.set 250 ] [ Test.set 47 ]
## [50][50][50][50][50]

# Pick best Cost with automatic 5-fold CV
set.seed(my.seed)
Tuned01 = e1071::tune(svm, resp~., data=Train.set, kernel="linear",
                      ranges=list(
                          cost=c(0.01, 0.1, 1, 5, 10, 100, 1000)),   # <- try these cost parameters
                      scale=TRUE,
                      tunecontrol=tune.control(cross=5))
summary(Tuned01)

#--- Extract the model with the best cost
summary(Tuned01$best.model)
# plot(resp~., Tuned01$best.model, data=Train.set)

#------------------
# Final Fit with Training Set vs Test Set using the best model
Final.fit <- Tuned01$best.model    # result from CV tuning

#- Training Confusion Matrix
Train.prob0 = predict(Final.fit, Train.set, decision.values=TRUE)
Train.prob  = attributes(Train.prob0)$decision.values
head(Train.prob)    # Check col name for No/Yes or Yes/No

#- Testing Confusion Matrix
Test.prob0 = predict(Final.fit, Test.set, decision.values=TRUE)
Test.prob  = attributes(Test.prob0)$decision.values
head(Test.prob)     # Check col name for No/Yes or Yes/No

#- Pick a threshold
threshold <- .7
Train.pred = ifelse(Train.prob > threshold, "No", "Yes")
#Train.pred = ifelse(Train.prob > threshold, "Yes", "No")
#-
Test.pred = ifelse(Test.prob > threshold, "No", "Yes")
#Test.pred = ifelse(Test.prob > threshold, "Yes", "No")

CM.train <- caret::confusionMatrix(factor(Train.pred), factor(as.matrix(Train.resp)), positive="Yes")
CM.train
CM.test <- caret::confusionMatrix(factor(Test.pred), factor(as.matrix(Test.resp)), positive="Yes")
CM.test

#- ROC curve and AUC
layout(matrix(1:2, 1, 2))
#- Training
pROC::plot.roc(factor(as.matrix(Train.resp)), as.vector(Train.prob), levels=c("No", "Yes"))
# point corresponding to CM.train
abline(h=CM.train[["byClass"]][["Sensitivity"]], v=1-CM.train[["byClass"]][["Specificity"]], col="red")
auc.train = pROC::auc(factor(as.matrix(Train.resp)), as.vector(Train.prob), levels=c("No", "Yes"))
text(.2, .2, paste("AUC=", round(auc.train, 3)))

#- Testing
pROC::plot.roc(factor(as.matrix(Test.resp)), as.vector(Test.prob), levels=c("No", "Yes"))
# point corresponding to CM.test
abline(h=CM.test[["byClass"]][["Sensitivity"]], v=1-CM.test[["byClass"]][["Specificity"]], col="red")
auc.test = pROC::auc(factor(as.matrix(Test.resp)), as.vector(Test.prob), levels=c("No", "Yes"))
text(.2, .2, paste("AUC=", round(auc.test, 3)))
layout(1)

c(auc.train, auc.test)

###
###  auc.test is the estimate of this model's prediction power.
###  The ROC curve represents the overall capability of the model;
###  by changing the threshold, we can be anywhere on the curve
###  (the confusion matrices above used threshold = .7).
###


###-------------------------------------------------
###--- 3. SVM with Radial Kernel using Auto-CV

library(e1071)    # install.packages("e1071", repos='https://cran.case.edu/')

## [ Train.set 250 ] [ Test.set 47 ]
## [50][50][50][50][50]

# Pick best gamma and Cost with automatic 5-fold CV
set.seed(my.seed)
Tuned01 = e1071::tune(svm, resp~., data=Train.set, kernel="radial",
                      ranges=list(
                          gamma = 2^(-1:4),
                          cost  = c(0.001, 0.01, 0.1, 1, 5, 10, 100, 1000)),
                      scale=TRUE,
                      tunecontrol=tune.control(cross=5))
summary(Tuned01)

#--- Extract the model with the best parameters
summary(Tuned01$best.model)
# plot(resp~., Tuned01$best.model, data=Train.set)

#------------------
# Final Fit with Training Set vs Test Set using the best model
Final.fit <- Tuned01$best.model    # result from CV tuning

#- Training Confusion Matrix
Train.prob0 = predict(Final.fit, Train.set, decision.values=TRUE)
Train.prob  = attributes(Train.prob0)$decision.values
head(Train.prob)    # Check col name for No/Yes or Yes/No

#- Testing Confusion Matrix
Test.prob0 = predict(Final.fit, Test.set, decision.values=TRUE)
Test.prob  = attributes(Test.prob0)$decision.values
head(Test.prob)     # Check col name for No/Yes or Yes/No

#- Pick a threshold
threshold <- .7
Train.pred = ifelse(Train.prob > threshold, "No", "Yes")
#Train.pred = ifelse(Train.prob > threshold, "Yes", "No")
#-
Test.pred = ifelse(Test.prob > threshold, "No", "Yes")
#Test.pred = ifelse(Test.prob > threshold, "Yes", "No")

CM.train <- caret::confusionMatrix(factor(Train.pred), factor(as.matrix(Train.resp)), positive="Yes")
CM.train
CM.test <- caret::confusionMatrix(factor(Test.pred), factor(as.matrix(Test.resp)), positive="Yes")
CM.test

#- ROC curve and AUC
layout(matrix(1:2, 1, 2))
#- Training
pROC::plot.roc(factor(as.matrix(Train.resp)), as.vector(Train.prob), levels=c("No", "Yes"))
# point corresponding to CM.train
abline(h=CM.train[["byClass"]][["Sensitivity"]], v=1-CM.train[["byClass"]][["Specificity"]], col="red")
auc.train = pROC::auc(factor(as.matrix(Train.resp)), as.vector(Train.prob), levels=c("No", "Yes"))
text(.2, .2, paste("AUC=", round(auc.train, 3)))

#- Testing
pROC::plot.roc(factor(as.matrix(Test.resp)), as.vector(Test.prob), levels=c("No", "Yes"))
# point corresponding to CM.test
abline(h=CM.test[["byClass"]][["Sensitivity"]], v=1-CM.test[["byClass"]][["Specificity"]], col="red")
auc.test = pROC::auc(factor(as.matrix(Test.resp)), as.vector(Test.prob), levels=c("No", "Yes"))
text(.2, .2, paste("AUC=", round(auc.test, 3)))
layout(1)

c(auc.train, auc.test)

###
###  (A sketch of inspecting the gamma/cost tuning surface follows below.)
###
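###
###  (Not part of the original script.)  A minimal sketch of inspecting the 5-fold CV
###  error over the gamma/cost grid tuned in Section 3.  It assumes Tuned01 is still
###  the radial-kernel tune() object created above.
###
Tuned01$performances      # CV error for every (gamma, cost) combination tried
plot(Tuned01)             # e1071's plot method for "tune" objects: error over the grid
Tuned01$best.parameters   # combination actually selected
Tuned01$best.performance  # its CV error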
###-------------------------------------------------
###--- 4. SVM with Polynomial Kernel using Auto-CV

library(e1071)    # install.packages("e1071", repos='https://cran.case.edu/')

## [ Train.set 250 ] [ Test.set 47 ]
## [50][50][50][50][50]

# Pick best degree, gamma, coef0 and Cost with automatic 5-fold CV
set.seed(my.seed)
Tuned01 = e1071::tune(svm, resp~., data=Train.set, kernel="polynomial",
                      ranges=list(
                          degree = 1:4,            # polynomial degree (positive integers)
                          gamma  = 2^(2:5),
                          coef0  = c(-2, 0, 2^(-1:2)),
                          cost   = c(0.0001, 0.001, 0.01, 0.1, 1, 5, 10, 100)),
                      scale=TRUE,
                      tunecontrol=tune.control(cross=5))
summary(Tuned01)

#--- Extract the model with the best parameters
summary(Tuned01$best.model)
# plot(resp~., Tuned01$best.model, data=Train.set)

#------------------
# Final Fit with Training Set vs Test Set using the best model
Final.fit <- Tuned01$best.model    # result from CV tuning

#- Training Confusion Matrix
Train.prob0 = predict(Final.fit, Train.set, decision.values=TRUE)
Train.prob  = attributes(Train.prob0)$decision.values
head(Train.prob)    # Check col name for No/Yes or Yes/No

#- Testing Confusion Matrix
Test.prob0 = predict(Final.fit, Test.set, decision.values=TRUE)
Test.prob  = attributes(Test.prob0)$decision.values
head(Test.prob)     # Check col name for No/Yes or Yes/No

#- Pick a threshold
threshold <- .7
Train.pred = ifelse(Train.prob > threshold, "No", "Yes")
#Train.pred = ifelse(Train.prob > threshold, "Yes", "No")
#-
Test.pred = ifelse(Test.prob > threshold, "No", "Yes")
#Test.pred = ifelse(Test.prob > threshold, "Yes", "No")

CM.train <- caret::confusionMatrix(factor(Train.pred), factor(as.matrix(Train.resp)), positive="Yes")
CM.train
CM.test <- caret::confusionMatrix(factor(Test.pred), factor(as.matrix(Test.resp)), positive="Yes")
CM.test

#- ROC curve and AUC
layout(matrix(1:2, 1, 2))
#- Training
pROC::plot.roc(factor(as.matrix(Train.resp)), as.vector(Train.prob), levels=c("No", "Yes"))
# point corresponding to CM.train
abline(h=CM.train[["byClass"]][["Sensitivity"]], v=1-CM.train[["byClass"]][["Specificity"]], col="red")
auc.train = pROC::auc(factor(as.matrix(Train.resp)), as.vector(Train.prob), levels=c("No", "Yes"))
text(.2, .2, paste("AUC=", round(auc.train, 3)))

#- Testing
pROC::plot.roc(factor(as.matrix(Test.resp)), as.vector(Test.prob), levels=c("No", "Yes"))
# point corresponding to CM.test
abline(h=CM.test[["byClass"]][["Sensitivity"]], v=1-CM.test[["byClass"]][["Specificity"]], col="red")
auc.test = pROC::auc(factor(as.matrix(Test.resp)), as.vector(Test.prob), levels=c("No", "Yes"))
text(.2, .2, paste("AUC=", round(auc.test, 3)))
layout(1)

c(auc.train, auc.test)

###
###  (A sketch of comparing the three tuned kernels side by side follows below.)
###
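###
###  (Not part of the original script.)  Sections 2-4 all overwrite Tuned01, so this
###  minimal sketch of comparing the three kernels assumes the three tune() results were
###  saved under separate, hypothetical names right after each section, e.g.
###      Tuned.lin  <- Tuned01   # after Section 2
###      Tuned.rad  <- Tuned01   # after Section 3
###      Tuned.poly <- Tuned01   # after Section 4
###  get.test.auc is also a hypothetical helper name.
###
get.test.auc <- function(tuned) {
    prob <- attributes(predict(tuned$best.model, Test.set, decision.values=TRUE))$decision.values
    pROC::auc(factor(as.matrix(Test.resp)), as.vector(prob), levels=c("No", "Yes"))
}
c(linear     = get.test.auc(Tuned.lin),
  radial     = get.test.auc(Tuned.rad),
  polynomial = get.test.auc(Tuned.poly))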
###-----------------------------------------------------------
###--- 5. Threshold Picker

#cost.list = c(0,0,3,1)/4    # order of (TP, TN, FP, FN)
#cost.list = c(0,0,2,1)/3    # order of (TP, TN, FP, FN)
#cost.list = c(0,0,1,1)/2    # order of (TP, TN, FP, FN)
#cost.list = c(0,0,1,2)/3    # order of (TP, TN, FP, FN)
cost.list = c(0,0,1,3)/4     # order of (TP, TN, FP, FN)

threshold.list = seq(.01, .99, .01)    # grid for threshold

#- Pick a model
svm.fit <- Tuned01$best.model    # most recently tuned model (rename if you saved the fits separately)
Test.prob0 = predict(svm.fit, Test.set, decision.values=TRUE)
Test.prob  = attributes(Test.prob0)$decision.values
head(Test.prob)    # Check col name for No/Yes or Yes/No

library(caret)    # for confusionMatrix()
cost = numeric(length(threshold.list))
for (i in 1:length(threshold.list)) {

    threshold = threshold.list[i]

    #- Test set confusion matrix at this threshold
    Test.pred = ifelse(Test.prob > threshold, "Yes", "No")
    CM.test <- confusionMatrix(factor(Test.pred, levels=c("No", "Yes")),   # fix levels in case only one class is predicted
                               factor(as.matrix(Test.resp)), positive="Yes")

    TP = CM.test$table[2,2]    # True Pos
    TN = CM.test$table[1,1]    # True Neg
    FP = CM.test$table[2,1]    # False Pos
    FN = CM.test$table[1,2]    # False Neg
    cost[i] = sum(c(TP, TN, FP, FN) * cost.list)
}
plot(threshold.list, cost, xlab="threshold")

cost.list
which.min(cost)
min(cost)
threshold.list[which.min(cost)]
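###
###  (Not part of the original script.)  A minimal sketch of applying the cost-minimizing
###  threshold found above.  best.threshold and Test.pred.best are hypothetical names.
###
best.threshold <- threshold.list[which.min(cost)]
abline(v=best.threshold, col="red")    # mark the chosen threshold on the cost plot above

Test.pred.best <- ifelse(Test.prob > best.threshold, "Yes", "No")
caret::confusionMatrix(factor(Test.pred.best, levels=c("No", "Yes")),
                       factor(as.matrix(Test.resp)), positive="Yes")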