###
###  Breast Cancer Data - Logistic Regression
###      ver 0.0.4
###
####################################################
#--- 0. Preliminary
#--- 1. Logistic Regression with CV
#--- 2. Final Training/Test fit using best model
#--- 3. Threshold Picker


#-------------------------------------------------
###--- 0. Preliminary
library(tidyverse)
# Read the breast cancer data from the course web site and take a first look.
BCancer <- read_csv("https://nmimoto.github.io/datasets/bcancer.csv")
BCancer
print(BCancer, width = 1000)  # wide enough to show every column

# Build the modelling data set:
#   * drop the "id" column,
#   * rename diagnosis to resp and recode it from M/B to a Yes/No factor,
#   * put resp in the first column.
BCancer2 <- BCancer %>%
    select(-"id") %>%
    rename(resp = diagnosis) %>%
    mutate(resp = as.factor(ifelse(resp == "M", "Yes", "No"))) %>%
    relocate(resp)
BCancer2

# Class counts. Factor levels are sorted alphabetically, so the order is
# No/Yes rather than Yes/No.
table(BCancer2$resp)


#  which(ChiSq.pval < .05)             # Col num of variables w <.05 p-value
# [1]  1  2 22 29
# [1] "resp"  "mean_radius" "worst_radius"  "worst_concave_points"
# which(ChiSq.pval > .6)               # Col num of variables w >.6 p-value
# [1]  6 20
# [1] "mean_smoothness" "symmetry_error"


###--------------------------------------------------------------------
###--- 0b. Divide Dataset to Training and Testing and Set up k fold CV
# Settings for the train/test split. These globals are presumably read by the
# helper script sourced below (it is not visible here -- TODO confirm the exact
# names it expects). NOTE(review): train.size + test.size = 569 presumably
# equals nrow(Orig); verify against the data.
Orig <- BCancer2             # Entire data set (has to be a data.frame)
train.size <- 500            # number of rows for the training set
test.size <-  69             # number of rows for the testing set
my.seed <-1534               # seed, presumably for the random split -- verify in ML-00.txt

  ###
  ### This line replaces the AAAA to BBBB chunk
  ### (NOTE(review): the AAAA/BBBB chunk is not in this file; presumably it is
  ###  the inline version of the split code that the remote script now provides.)
  source('https://nmimoto.github.io/R/ML-00.txt')
  ###

# Output (all data.frame):
#   Train.set      /  Train.resp
#   Test.set       /  Test.resp
#   CV.train[[k]]  /  CV.train.resp[[k]]
#   CV.valid[[k]]  /  CV.valid.resp[[k]]


###-------------------------------------------------------
###--- 1. Logistic Regression with CV
#- Cross-validated AUC of the logistic regression.
#  For each fold k the model is fit on CV.train[[k]]; the AUC is then computed
#  on that same training part and on the held-out part CV.valid[[k]].
#  Result: AUCs (one row per fold) and Av.AUCs (column means).
library(caret)
library(pROC)       # auc()

n.folds <- 5        # must match the number of folds stored in CV.train / CV.valid
AUCs <- matrix(0, n.folds, 2,
               dimnames = list(NULL, c("Train AUC", "Valid AUC")))
for (k in seq_len(n.folds)) {

    Fit00 <- glm(resp ~ ., family = binomial, data = CV.train[[k]])   # <----- Change model here

    #- Fitted probabilities on the training part of the fold
    Train.prob <- predict(Fit00, type = "response")
    #- Predicted probabilities on the validation part of the fold
    Valid.prob <- predict(Fit00, newdata = CV.valid[[k]], type = "response")

    #- AUC on both parts.
    #  direction = "<" states that controls ("No") have lower scores than cases
    #  ("Yes"). Leaving it on pROC's default ("auto") lets every fold pick the
    #  more favourable direction, which biases resampled AUCs upward.
    auc.tr <- auc(factor(as.matrix(CV.train.resp[[k]])), Train.prob,
                  levels = c("No", "Yes"), direction = "<")
    auc.va <- auc(factor(as.matrix(CV.valid.resp[[k]])), Valid.prob,
                  levels = c("No", "Yes"), direction = "<")
    AUCs[k, ] <- round(c(as.numeric(auc.tr), as.numeric(auc.va)), 4)

}
AUCs

#- Average over folds; Av.Valid AUC is the number used for model selection
Av.AUCs <- colMeans(AUCs)
names(Av.AUCs) <- c("Av.Train AUC", "Av.Valid AUC")
Av.AUCs


###
###   Make decision about best model based on Av Valid AUC
###   Try the model:  resp ~ mean_radius + worst_radius + worst_concave_points
###


###----------------------------
###--- 2. Final Training/Test fit using best model

### Best model: three predictors, fit on the whole training set
Fit01 <- glm(
    resp ~ mean_radius + worst_radius + worst_concave_points,
    family = binomial,
    data = Train.set
)
summary(Fit01)
coef(Fit01)


###--- The code below refers to the chosen model as Fit00
Fit00 <- Fit01

#- Fitted probabilities on the training set
Train.prob <- predict(Fit00, type = "response")
head(Train.prob)

#- Predicted probabilities on the test set
Test.prob <- predict(Fit00, newdata = Test.set, type = "response")
head(Test.prob)


###----------------------------
#- 2a Output result for given threshold value
threshold <- .9   # probability cutoff: predict "Yes" when the probability exceeds it

    #- Confusion matrices for the training and test sets at this threshold
    library(caret)
    Train.pred <- ifelse(Train.prob > threshold, "Yes", "No")  # probabilities -> Yes/No labels
    Test.pred  <- ifelse(Test.prob  > threshold, "Yes", "No")
    # Levels are given explicitly so both classes are always present, even when
    # the threshold puts every prediction in one class. Without this, factor()
    # drops the missing level and confusionMatrix() warns (or, in older caret
    # versions, stops) because data and reference levels differ.
    CM.train <- confusionMatrix(factor(Train.pred, levels = c("No", "Yes")),
                                factor(as.matrix(Train.resp), levels = c("No", "Yes")),
                                positive = "Yes")
    CM.test  <- confusionMatrix(factor(Test.pred, levels = c("No", "Yes")),
                                factor(as.matrix(Test.resp), levels = c("No", "Yes")),
                                positive = "Yes")

    CM.train            # Training set result
    CM.train$table      # output just the table

    CM.train[["byClass"]][["Sensitivity"]]
    CM.train[["byClass"]][["Specificity"]]

    CM.test             # Testing set
    CM.test$table       # output just the table

    # Layout of the table (rows = predicted, columns = actual):
    #               Reference
    # Prediction      No   Yes
    #        No       TN    FN      Specificity = TN / (TN + FP)   (first column)
    #        Yes      FP    TP      Sensitivity = TP / (TP + FN)   (second column)
    # High threshold -> Specificity goes to 1. Very conservative. Everyone is flagged neg.
    # Low threshold  -> Sensitivity goes to 1. Very aggressive. Everyone is flagged pos.


    colSums(CM.test$table) / sum(colSums(CM.test$table))    # % of Actual Yes/No
    rowSums(CM.test$table) / sum(rowSums(CM.test$table))    # % of predicted Yes/No


###----------------------------
#- 2b Output ROC curve and AUC for all threshold
library(pROC)
    # Each ROC curve is computed once: plot.roc() draws it and returns the roc
    # object invisibly, which is then reused for the AUC and for the
    # side-by-side figure below.
    # direction = "<" fixes the comparison as "No" scores lower than "Yes"
    # instead of letting pROC pick the more favourable direction per data set.

    #- Training Set
    roc.train <- plot.roc(factor(as.matrix(Train.resp)), Train.prob,
                          levels = c("No", "Yes"), direction = "<")
    # point corresponding to CM.train
    abline(h = CM.train[["byClass"]][["Sensitivity"]],
           v = CM.train[["byClass"]][["Specificity"]], col = "red")
    auc.train <- auc(roc.train)
    text(.2, .2, paste("Train AUC=", round(auc.train, 3)))

    #- Test Set
    roc.test <- plot.roc(factor(as.matrix(Test.resp)), Test.prob,
                         levels = c("No", "Yes"), direction = "<")
    # point corresponding to CM.test
    abline(h = CM.test[["byClass"]][["Sensitivity"]],
           v = CM.test[["byClass"]][["Specificity"]], col = "red")
    auc.test <- auc(roc.test)
    text(.2, .2, paste("Test AUC=", round(auc.test, 3)))

    c(auc.train, auc.test)


#- Training and test ROC curves side by side
layout(matrix(1:2, 1, 2))
    plot(roc.train)
    text(.2, .2, paste("Train AUC=", round(auc.train, 3)))
    plot(roc.test)
    text(.2, .2, paste("Test AUC=", round(auc.test, 3)))
    layout(1)           # back to a single-panel device


#- Per-fold CV AUCs against the final test AUC:
#  red = validation AUC, black = training AUC, horizontal line = test AUC
plot(AUCs[,2], col = "red", ylim = c(0.5, 1))
points(AUCs[,1])
abline(h = auc.test)

#- Numeric summary of the same quantities
AUCs
Av.AUCs
c(auc.train, auc.test)


###-----------------------------------------------------------
###--- 3. Threshold Picker

# Uses Test.prob from last section.

#- Weights applied to the four confusion-matrix cells, in the order
#  (TP, TN, FP, FN). Uncomment exactly one line.
#cost.list = c(1,1,1,1)/4           # order of (TP, TN, FP, FN)
cost.list <- c(0,0,3,1)/4           # order of (TP, TN, FP, FN)
#cost.list = c(0,0,1,1)/2           # order of (TP, TN, FP, FN)
#cost.list = c(0,0,1,2)/3           # order of (TP, TN, FP, FN)
#cost.list = c(0,0,1,3)/4           # order of (TP, TN, FP, FN)


threshold.list <- seq(0.01, .99, .01)       # grid for threshold
cost <- numeric(length(threshold.list))     # preallocated: one cost per threshold
library(caret)      # for confusionMatrix
# NOTE: the loop overwrites threshold, Test.pred and CM.test from section 2a;
# after it finishes they hold the values for the last grid point.
for (i in seq_along(threshold.list)) {

    threshold <- threshold.list[i]

    #- Confusion matrix of the test set at this threshold.
    #  Explicit levels keep the table 2x2 in No/Yes order even when every
    #  prediction falls in one class (likely near the ends of the grid).
    Test.pred <- ifelse(Test.prob > threshold, "Yes", "No")
    CM.test <- confusionMatrix(factor(Test.pred, levels = c("No", "Yes")),
                               factor(as.matrix(Test.resp), levels = c("No", "Yes")),
                               positive = "Yes")
    # table rows = prediction, columns = reference
    TP <- CM.test$table["Yes", "Yes"]   # True  Pos
    TN <- CM.test$table["No",  "No"]    # True  Neg
    FP <- CM.test$table["Yes", "No"]    # False Pos
    FN <- CM.test$table["No",  "Yes"]   # False Neg

    cost[i] <- sum(c(TP, TN, FP, FN) * cost.list)
}
plot(threshold.list, cost, xlab = "threshold")

#- Cheapest threshold (which.min returns the first one in case of ties)
cost.list
which.min(cost)
min(cost)
threshold.list[which.min(cost)]