### ### ### Digits - SVM ### ### ###
####################################################

#-------------------------------------------------
###--- 0. Preliminary
library(tidyverse)

# Load data from the author's website.
Digits <- read_csv(file = "https://nmimoto.github.io/datasets/NISTdigits.csv")

# Drop the "index" column, turn the class label "digit" into a factor,
# and rename it to "resp" (the response column used by everything below).
Digits2 <- Digits %>%
  select(-"index") %>%
  mutate(digit = factor(digit)) %>%
  rename(resp = digit)

Digits2
dim(Digits2)
# [1] 1797   65


###--------------------------------------------------------------------
###--- 0b. Divide Dataset to Training and Testing and Set up k fold CV
Orig <- Digits2            # Entire data set (has to be a data.frame)
resp.col.name <- "resp"    # name of response column
train.size <- 1497         # num of rows for training set
test.size <- 300           # num of rows for testing set
# train.size <- 500        # num of rows for training set
# test.size <- 100         # num of rows for testing set
num.folds <- 5             # k for k-fold CV
my.seed <- 55622           # give a seed

# Fail early instead of indexing past the end of the data.
stopifnot(
  train.size >= 1, test.size >= 1, num.folds >= 1,
  train.size + test.size <= nrow(Orig)
)

#--- Shuffle the rows once, then cut the shuffled data into train / test.
set.seed(my.seed)
ix <- sample(seq_len(nrow(Orig)))
Orig2 <- Orig[ix, ]

train.rows <- seq_len(train.size)
test.rows <- train.size + seq_len(test.size)

Train.set <- Orig2[train.rows, ]
Train.resp <- Orig2[train.rows, resp.col.name]
Test.set <- Orig2[test.rows, ]
Test.resp <- Orig2[test.rows, resp.col.name]

# K-fold Cross Validation
library(cvTools)           # install.packages("cvTools")
set.seed(my.seed)
folds <- cvFolds(nrow(Train.set), K = num.folds)   # k-fold CV

# folds$which is used directly as the fold label of each training row.
# NOTE(review): cvFolds() also returns folds$subsets; confirm whether rows
# were meant to be indexed through it. The rows of Train.set were already
# shuffled above, so the folds are a random partition either way.
fold.ids <- seq_len(num.folds)
CV.train <- lapply(fold.ids, function(k) Train.set[folds$which != k, ])
CV.train.resp <- lapply(fold.ids, function(k) Train.resp[folds$which != k, 1])
CV.valid <- lapply(fold.ids, function(k) Train.set[folds$which == k, ])
CV.valid.resp <- lapply(fold.ids, function(k) Train.resp[folds$which == k, 1])

# Output (all data.frame):
#   Train.set      /  Train.resp
#   Test.set       /  Test.resp
#   CV.train[[k]]  /  CV.train.resp[[k]]
#   CV.valid[[k]]  /  CV.valid.resp[[k]]

###-------------------------------------------------------------------
###--- 1. Support Vector Machine - Linear Kernel

##--- SVM on training set (single fit with a fixed cost)
library(e1071)
Fit01 <- e1071::svm(
  resp ~ .,
  data = Train.set, kernel = "linear", cost = 5, scale = FALSE
)
summary(Fit01)


##--- c. Pick best Cost with automatic 5-fold CV
set.seed(my.seed)
Tuned01 <- e1071::tune(
  svm, resp ~ .,
  data = Train.set, kernel = "linear",
  ranges = list(cost = c(0.0001, 0.001, 0.01, 0.1, 1, 10)),
  scale = FALSE,
  tunecontrol = tune.control(cross = 5)
)
summary(Tuned01)

#--- Extract the model with best cost
summary(Tuned01$best.model)

# plot(resp~., Tuned01$best.model, data=Train.set)

#- Use the CV-tuned model for all predictions below
svm.fit <- Tuned01$best.model   # results from CV tuning
svm.fit


#------------------
#- Training predictions
Train.pred <- predict(svm.fit, Train.set, decision.values = FALSE)
head(Train.pred)

#- Testing predictions
Test.pred <- predict(svm.fit, Test.set, decision.values = FALSE)
head(Test.pred)

#- Training / Testing Confusion Matrices
#  Train.resp / Test.resp are one-column tables, so pull() extracts the
#  factor vector that confusionMatrix() compares against the predictions.
CM.train <- caret::confusionMatrix(Train.pred, pull(Train.resp, resp))
CM.test <- caret::confusionMatrix(Test.pred, pull(Test.resp, resp))
CM.train
CM.test


#- Same test predictions, this time keeping the pairwise decision values
Test.pred2 <- predict(svm.fit, Test.set, decision.values = TRUE)
head(Test.pred2)
attributes(attributes(Test.pred2)$decision.values)$dimnames

# One column per pair of classes: (10 choose 2) = 45 columns
# "5/6" "5/9" "5/7" "5/3" "5/4" "5/1" "5/8" "5/0" "5/2"
# "6/9" "6/7" "6/3" "6/4" "6/1" "6/8" "6/0" "6/2"
# "9/7" "9/3" "9/4" "9/1" "9/8" "9/0" "9/2"
# "7/3" "7/4" "7/1" "7/8" "7/0" "7/2"
# "3/4" "3/1" "3/8" "3/0" "3/2"
# "4/1" "4/8" "4/0" "4/2"
# "1/8" "1/0" "1/2"
# "8/0" "8/2"
# "0/2"


#- Multiclass Voting Scheme
#  NOTE(review): presumably a positive value in column "i/j" is a vote for
#  class i and a negative value a vote for class j, the predicted class being
#  the one with the most votes -- confirm against the e1071 documentation.
A <- attributes(Test.pred2)$decision.values

# First test observation: decision values, their signs, and the prediction
A[1, ]
sign(A[1, ])
Test.pred[1]
# Recorded output of sign(A[1, ]):
# 5/6 5/9 5/7 5/3 5/4 5/1 5/8 5/0 5/2 6/9 6/7 6/3 6/4 6/1 6/8 6/0 6/2 9/7 9/3 9/4
#   1   1   1   1   1  -1  -1   1   1  -1  -1   1  -1  -1  -1  -1  -1   1   1   1
# 9/1 9/8 9/0 9/2 7/3 7/4 7/1 7/8 7/0 7/2 3/4 3/1 3/8 3/0 3/2 4/1 4/8 4/0 4/2 1/8
#  -1  -1  -1   1   1  -1  -1  -1   1   1  -1  -1  -1  -1   1  -1  -1  -1   1  -1
# 1/0 1/2 8/0 8/2 0/2
#   1   1   1   1   1

# Second test observation
A[2, ]
sign(A[2, ])
Test.pred[2]
# Recorded output of sign(A[2, ]):
# 5/6 5/9 5/7 5/3 5/4 5/1 5/8 5/0 5/2 6/9 6/7 6/3 6/4 6/1 6/8 6/0 6/2 9/7 9/3 9/4
#   1  -1  -1  -1  -1   1  -1   1   1  -1  -1  -1  -1  -1  -1  -1  -1  -1   1   1
# 9/1 9/8 9/0 9/2 7/3 7/4 7/1 7/8 7/0 7/2 3/4 3/1 3/8 3/0 3/2 4/1 4/8 4/0 4/2 1/8
#   1   1   1   1   1   1   1   1   1   1  -1   1  -1   1   1  -1  -1   1   1  -1
# 1/0 1/2 8/0 8/2 0/2
#   1   1   1   1  -1