####
###  Default Data (ISLR) - Logistic Regression
###  ver 0.0.4
###
####################################################
###--- 0. Preliminary
###--- 1. Data Separation (Copied from Rtut-CV)
###--- 2. Logistic Regression on Training Set
###--- 3. Logistic Regression with CV
###--- 4. Final Training/Test fit using best model
###--- 5. Determine the Best Threshold


###-------------------------------------------------------
###--- 0. Preliminary
library(ISLR)        # install.packages('ISLR', repos='https://cran.case.edu/')

head(Default)        # see the data
dim(Default)
# ?Default           # see explanation for variables

# A data frame with 10000 observations on the following 4 variables.
#
#  1 default (factor)  A factor with levels No and Yes indicating whether the customer defaulted on their debt
#  2 student (factor)  A factor with levels No and Yes indicating whether the customer is a student
#  3 balance (numeric) The average balance that the customer has remaining on their credit card after making their monthly payment
#  4 income  (numeric) Income of customer
#
# Column "default" is the response variable.

#--- Turn the data into a tibble ---
library(tidyverse)   # install.packages("tidyverse")
Default <- as_tibble(Default)
Default

Default2 <- Default %>%
    rename(resp=default) %>%   # rename "default" column as "resp"
    relocate(resp)             # move "resp" column to 1st
Default2


###-------------------------------------------------------
###--- 1. Data Separation (Copied from Rtut-CV)

###--- Divide the dataset into Training and Testing sets, and set up k-fold CV
Orig <- Default2        # entire data set (must be a data.frame)
train.size <- 8500      # num of rows for training set
test.size  <- 1500      # num of rows for testing set
my.seed    <- 8346      # give a seed

#---AAAA---
resp.col.name <- "resp" # name of response column
num.folds     <- 5      # k for k-fold CV

set.seed(my.seed)
ix    = sample(1:nrow(Orig))
Orig2 = Orig[ix, ]      # shuffle the rows

Train.set  = Orig2[1:train.size, ]
Train.resp = Orig2[1:train.size, resp.col.name]
Test.set   = Orig2[(train.size+1):(train.size+test.size), ]
Test.resp  = Orig2[(train.size+1):(train.size+test.size), resp.col.name]

# K-fold Cross Validation
library(cvTools)        # install.packages("cvTools")
set.seed(my.seed)
folds = cvFolds( nrow(Train.set), K=num.folds )   # k-fold CV (random assignment)

CV.train      = list(Train.set[ folds$which!=1, ])
CV.train.resp = list(Train.resp[folds$which!=1, 1])
CV.valid      = list(Train.set[ folds$which==1, ])
CV.valid.resp = list(Train.resp[folds$which==1, 1])
for (k in 2:num.folds) {
    CV.train[[k]]      = Train.set[ folds$which!=k, ]
    CV.train.resp[[k]] = Train.resp[folds$which!=k, 1]
    CV.valid[[k]]      = Train.set[ folds$which==k, ]
    CV.valid.resp[[k]] = Train.resp[folds$which==k, 1]
}
#---BBBB---

### The single line below can replace the chunk above (#---AAAA to #---BBBB):
###    source('https://nmimoto.github.io/R/ML-00.txt')
###
# Output (all data.frame):
#   Train.set       Train.resp
#   Test.set        Test.resp
#   CV.train[[k]]   CV.train.resp[[k]]
#   CV.valid[[k]]   CV.valid.resp[[k]]
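###----------------------------
#- (Optional sanity check; not part of the original workflow.)
#- A minimal sketch verifying that the k folds are roughly equal in size
#- and in the proportion of "Yes" responses, using only objects created above.
sapply(1:num.folds, function(k) nrow(CV.valid[[k]]))    # rows per validation fold
sapply(1:num.folds, function(k)
       mean(as.matrix(CV.valid.resp[[k]]) == "Yes"))    # % of "Yes" in each fold
mean(as.matrix(Train.resp) == "Yes")                    # overall training-set %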
###-------------------------------------------------------
###--- 2. Logistic Regression on Training Set

###----------------------------
###--- 2.1 Model1: balance only
Fit01 <- glm(resp ~ balance, family=binomial, data=Train.set)
summary(Fit01)

#--- Plot the result for visualization
plot(Train.set$balance, Train.set$resp=="Yes", ylab="Resp", xlab="Balance")
lines(Train.set$balance, Fit01$fitted, lwd=2, col="red", type="p")

###----------------------------
###--- 2.2 Model2: all three predictors
Fit02 <- glm(resp ~ ., family=binomial, data=Train.set)
summary(Fit02)

coef(Fit02)
summary(Fit02)$coef        # extract estimated parameters
summary(Fit02)$coef[,4]    # extract 4th column (P-value) of the above

#--- Plot the result for visualization
plot(Train.set$balance, Train.set$resp=="Yes", ylab="Resp", xlab="Balance")
lines(Train.set$balance, Fit02$fitted, lwd=2, col="red", type="p")

###----------------------------
###--- 2.3 Model3: remove income
Fit03 <- glm(resp ~ balance + student, family=binomial, data=Train.set)
summary(Fit03)

#--- Plot the result for visualization
plot(Train.set$balance, Train.set$resp=="Yes", ylab="Resp", xlab="Balance")
lines(Train.set$balance, Fit03$fitted, lwd=2, col="red", type="p")
# the two red bands are for student = Yes/No; highlight students in blue
ix = (Train.set$student=="Yes")
lines(Train.set$balance[ix], Fit03$fitted[ix], lwd=2, col="blue", type="p")

#- Extract fitted response (training)
Train.prob = predict(Fit03, type="response")
head(Train.prob)

#- Predict in Test Set
Test.prob = predict(Fit03, newdata=Test.set, type="response")
head(Test.prob)

###----------------------------
#- 2.3a Output result for a given threshold value
threshold = .9    # pick a threshold

#- Check the training and test set accuracy
library(caret)    # install.packages("caret")
Train.pred = ifelse(Train.prob > threshold, "Yes", "No")  # turn fitted probabilities into Yes/No using the threshold
Test.pred  = ifelse(Test.prob  > threshold, "Yes", "No")

CM.train <- confusionMatrix(factor(Train.pred), factor(as.matrix(Train.resp)), positive="Yes")
CM.test  <- confusionMatrix(factor(Test.pred),  factor(as.matrix(Test.resp)),  positive="Yes")

CM.train          # Training set result
CM.train$table    # output just the table
CM.train[["byClass"]][["Sensitivity"]]
CM.train[["byClass"]][["Specificity"]]

CM.test           # Testing set result
CM.test$table     # output just the table

# Test set result:
#              Reference
#  Prediction    No   Yes
#      No      1437    62     Specificity = TrueNeg / sum of "No" column
#      Yes        0     1     Sensitivity = TruePos / sum of "Yes" column

colSums(CM.test$table) / sum(CM.test$table)   # % of actual No/Yes
rowSums(CM.test$table) / sum(CM.test$table)   # % of predicted No/Yes

###----------------------------
#- 2.3b Output ROC curve and AUC over all thresholds
library(pROC)     # install.packages("pROC")

#- Training Set
plot.roc(factor(as.matrix(Train.resp)), Train.prob, levels=c("No", "Yes"))
# mark the operating point of CM.train (x-axis is specificity on pROC's default plot)
abline(h=CM.train[["byClass"]][["Sensitivity"]],
       v=CM.train[["byClass"]][["Specificity"]], col="red")
auc.train = auc(factor(as.matrix(Train.resp)), Train.prob, levels=c("No", "Yes"))
text(.2, .2, paste("Train AUC=", round(auc.train, 3)))
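###----------------------------
#- (Optional illustration; not part of the original script.)
#- The fitted values from glm(..., family=binomial) are just the logistic
#- function applied to the linear predictor.  A minimal sketch with Model1
#- (Fit01), whose only predictor is balance:
b        <- coef(Fit01)
eta      <- b[1] + b[2]*Train.set$balance   # linear predictor b0 + b1*balance
p.manual <- 1 / (1 + exp(-eta))             # logistic transform
head(cbind(manual=p.manual, glm=Fit01$fitted))   # the two columns should agree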
###-------------------------------------------------------
###--- 3. Logistic Regression with CV

AUCs <- matrix(0, num.folds, 2)
colnames(AUCs) = c("Train AUC", "Valid AUC")
library(caret)
for (k in 1:num.folds) {

    Fit00 <- glm(resp ~ ., family=binomial, data=CV.train[[k]])   # <----- Change model here

    #- Extract fitted response (training)
    Train.prob = predict(Fit00, type="response")    # fitted responses

    #- Predict in Validation Set
    Valid.prob = predict(Fit00, newdata=CV.valid[[k]], type="response")

    #- Turn the fitted probabilities into Yes/No (uses threshold from Sec 2.3a;
    #- only needed if the confusion matrices below are uncommented)
    Train.pred = ifelse(Train.prob > threshold, "Yes", "No")
    Valid.pred = ifelse(Valid.prob > threshold, "Yes", "No")
    # CM.train <- confusionMatrix(factor(Train.pred), factor(as.matrix(CV.train.resp[[k]])), positive="Yes")
    # CM.valid <- confusionMatrix(factor(Valid.pred), factor(as.matrix(CV.valid.resp[[k]])), positive="Yes")

    AUCs[k,] <- round(c(auc(factor(as.matrix(CV.train.resp[[k]])), Train.prob, levels=c("No", "Yes")),
                        auc(factor(as.matrix(CV.valid.resp[[k]])), Valid.prob, levels=c("No", "Yes"))), 4)
}
AUCs
Av.AUCs = apply(AUCs, 2, mean)
names(Av.AUCs) = c("Av.Train AUC", "Av.Valid AUC")
Av.AUCs

###
### Make a decision about the best model based on Av.Valid AUC.
###


###-------------------------------------------------------
###--- 4. Final Training/Test fit using best model

### Best model
Fit03 <- glm(resp ~ balance + student, family=binomial, data=Train.set)
summary(Fit03)
###---
Fit00 = Fit03

#- Extract fitted response (training)
Train.prob = predict(Fit00, type="response")
head(Train.prob)

#- Predict in Test Set
Test.prob = predict(Fit00, newdata=Test.set, type="response")
head(Test.prob)

###----------------------------
#- 4a Output result for a given threshold value
threshold = .9    # pick a threshold

#- Check the training and test set accuracy
library(caret)
Train.pred = ifelse(Train.prob > threshold, "Yes", "No")  # turn fitted probabilities into Yes/No using the threshold
Test.pred  = ifelse(Test.prob  > threshold, "Yes", "No")

CM.train <- confusionMatrix(factor(Train.pred), factor(as.matrix(Train.resp)), positive="Yes")
CM.test  <- confusionMatrix(factor(Test.pred),  factor(as.matrix(Test.resp)),  positive="Yes")

CM.train          # Training set result
CM.train$table    # output just the table
CM.train[["byClass"]][["Sensitivity"]]
CM.train[["byClass"]][["Specificity"]]

CM.test           # Testing set result
CM.test$table     # output just the table

# Test set result:
#              Reference
#  Prediction    No   Yes
#      No      1437    62     Specificity = TrueNeg / sum of "No" column
#      Yes        0     1     Sensitivity = TruePos / sum of "Yes" column

colSums(CM.test$table) / sum(CM.test$table)   # % of actual No/Yes
rowSums(CM.test$table) / sum(CM.test$table)   # % of predicted No/Yes

###----------------------------
#- 4b Output ROC curve and AUC over all thresholds
library(pROC)

#- Training Set
plot.roc(factor(as.matrix(Train.resp)), Train.prob, levels=c("No", "Yes"))
# mark the operating point of CM.train
abline(h=CM.train[["byClass"]][["Sensitivity"]],
       v=CM.train[["byClass"]][["Specificity"]], col="red")
auc.train = auc(factor(as.matrix(Train.resp)), Train.prob, levels=c("No", "Yes"))
text(.2, .2, paste("Train AUC=", round(auc.train, 3)))

#- Test Set
plot.roc(factor(as.matrix(Test.resp)), Test.prob, levels=c("No", "Yes"))
# mark the operating point of CM.test
abline(h=CM.test[["byClass"]][["Sensitivity"]],
       v=CM.test[["byClass"]][["Specificity"]], col="red")
auc.test = auc(factor(as.matrix(Test.resp)), Test.prob, levels=c("No", "Yes"))
text(.2, .2, paste("Test AUC=", round(auc.test, 3)))
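###----------------------------
#- (Optional add-on; not part of the original script.  Assumes pROC's
#-  roc() and ci.auc(), which compute a DeLong confidence interval.)
#- A 95% CI for the test-set AUC gives a sense of its sampling error:
library(pROC)
Roc.test = roc(factor(as.matrix(Test.resp)), Test.prob, levels=c("No", "Yes"))
ci.auc(Roc.test)    # 95% CI for Test AUC (DeLong method)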
###-------------------------------------------------------
###--- 5. Determine the Best Threshold

#--- Decide on the threshold penalty (cost per outcome)
#cost.list = c(1,1,1,1)/4   # order of (TP, TN, FP, FN)
cost.list  = c(0,0,3,1)/4   # order of (TP, TN, FP, FN)
#cost.list = c(0,0,1,1)/2   # order of (TP, TN, FP, FN)
#cost.list = c(0,0,1,2)/3   # order of (TP, TN, FP, FN)
#cost.list = c(0,0,1,3)/4   # order of (TP, TN, FP, FN)

threshold.list = seq(0.01, .99, .01)     # grid for threshold
cost = numeric(length(threshold.list))
library(caret)                           # for confusionMatrix()
for (i in 1:length(threshold.list)) {

    threshold = threshold.list[i]

    #- Classify the test set at this threshold
    #  (fix the factor levels so extreme thresholds, where all predictions
    #   fall in one class, do not break confusionMatrix)
    Test.pred = ifelse(Test.prob > threshold, "Yes", "No")
    CM.test <- confusionMatrix(factor(Test.pred, levels=c("No","Yes")),
                               factor(as.matrix(Test.resp)), positive="Yes")

    TP = CM.test$table[2,2]   # True Pos
    TN = CM.test$table[1,1]   # True Neg
    FP = CM.test$table[2,1]   # False Pos
    FN = CM.test$table[1,2]   # False Neg
    cost[i] = sum(c(TP, TN, FP, FN) * cost.list)
}
plot(threshold.list, cost, xlab="threshold")

cost.list
which.min(cost)                   # index of the cheapest threshold
min(cost)                         # minimum total cost
threshold.list[which.min(cost)]   # best threshold for this cost structure
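###----------------------------
#- (Optional cross-check; not part of the original script.  Assumes
#-  pROC's coords().)  For comparison with the cost-based search above,
#- the threshold maximizing Youden's J = Sensitivity + Specificity - 1,
#- i.e. the best threshold under equal misclassification costs:
library(pROC)
Roc.test = roc(factor(as.matrix(Test.resp)), Test.prob, levels=c("No", "Yes"))
coords(Roc.test, "best", best.method="youden")   # threshold, specificity, sensitivity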