###
###   Boston Data (MASS) - Multiple Regression
###   with 5-fold Cross Validation
###   ver 0.0.4
###
##################################################

###-------------------------------------------------
###--- 0. Preliminary
#     Using Boston3 from https://nmimoto.github.io/R/boston0.txt

library(MASS)        # install.packages("MASS")
library(tidyverse)   # install.packages("tidyverse")

Boston <- as_tibble(Boston)
Boston

Boston2 <- Boston %>%
    rename(resp=medv) %>%   # rename "medv" column as "resp" to streamline analysis
    relocate(resp)          # move "resp" column to 1st
Boston2

# turn "chas" column into 0/1 factor
Boston3 <- Boston2 %>% mutate( chas=as.factor(chas) )
Boston3

# Importance List by Chisq test
#   top list:    c(3, 4, 6, 10) / "zn" "indus" "nox" "rad"
#   bottom list: c(7, 8, 9, 13) / "rm" "age" "dis" "black"

#1  resp    (dbl) <- used to be "medv". Response variable (Y).
#2  crim    (dbl)
#3  zn      (dbl)
#4  indus   (dbl)
#5  chas    (factor)
#6  nox     (dbl)
#7  rm      (dbl)
#8  age     (dbl)
#9  dis     (dbl)
#10 rad     (dbl)
#11 tax     (dbl)
#12 ptratio (dbl)
#13 black   (dbl)
#14 lstat   (dbl)


###-------------------------------------------------
###--- 1. Data Separation (Copied from Rtut-CV)
###     Divide the dataset into Training and Testing sets, and set up k-fold CV

Orig <- Boston3      # entire data set (must be a data.frame)
train.size <- 400    # num of rows for training set
test.size  <- 106    # num of rows for testing set
my.seed <- 3231      # set a seed

### This line replaces the AAAA to BBBB chunk
source('https://nmimoto.github.io/R/ML-00.txt')

# Output (all data.frame):
#   Train.set     / Train.resp
#   Test.set      / Test.resp
#   CV.train[[k]] / CV.train.resp[[k]]
#   CV.valid[[k]] / CV.valid.resp[[k]]
#
#   [      Train.set 400      ] [Test.set 106]
#   [80][80][80][80][80]

#--- plot one Training and Validation fold for visualization
fold = 2
plot( CV.train[[fold]]$lstat, CV.train[[fold]]$resp, xlab="lstat", ylab="Response (medv)")
lines(CV.valid[[fold]]$lstat, CV.valid[[fold]]$resp, type="p", col="red", pch=19)


###-------------------------------------------------
###--- 2. Multiple Regression with all variables, no Cross Validation (all data)

Reg01 <- lm(medv ~ . , data=Boston)
summary(Reg01)
plot(Reg01)

##
## Search for a better model without CV
##

## subtracting variables
Reg02 <- lm(medv ~ . -rm -age -dis -black, data=Boston)
summary(Reg02)

Reg02 <- lm(medv ~ . -rm -age -dis, data=Boston)
summary(Reg02)

Reg02 <- lm(medv ~ . -age -black, data=Boston)
summary(Reg02)

Reg02 <- lm(medv ~ . -age -black -indus, data=Boston)
summary(Reg02)
plot(Reg02)

## adding variables
Reg03 <- lm(medv ~ zn + indus + nox + rad, data=Boston)
summary(Reg03)

Reg03 <- lm(medv ~ crim + zn + indus + nox + rad, data=Boston)
summary(Reg03)

#2  crim    (dbl)
#3  zn      (dbl)
#4  indus   (dbl)
#5  chas    (factor)
#6  nox     (dbl)
#7  rm      (dbl)
#8  age     (dbl)
#9  dis     (dbl)
#10 rad     (dbl)
#11 tax     (dbl)
#12 ptratio (dbl)
#13 black   (dbl)
#14 lstat   (dbl)

# Importance List by Chisq test
#   top list:    c(3, 4, 6, 10) / "zn" "indus" "nox" "rad"
#   bottom list: c(7, 8, 9, 13) / "rm" "age" "dis" "black"
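
## The model search above is judged only from the summary() output. As a quick
## non-CV sanity check, AIC can compare the candidate fits (lower is better).
## This is a sketch added for illustration, assuming Reg01, Reg02, Reg03 are
## the fits from above; stepAIC() from MASS automates the same kind of search.
AIC(Reg01, Reg02, Reg03)
# Reg.step <- MASS::stepAIC(Reg01, direction="both")   # "Reg.step" is a hypothetical name
# summary(Reg.step)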

###-------------------------------------------------
###--- 3. Multiple Regression with Cross Validation (use all columns)

##
## Example for 1 fold
##
k=1   # CV fold number
Reg01 <- lm(resp ~ ., data=CV.train[[k]])
summary(Reg01)
# plot(Reg01)   # plots 4 diagnostic plots

#--- Get training / validation fit
Train.fitted = predict(Reg01, newdata=CV.train[[k]])
Valid.pred   = predict(Reg01, newdata=CV.valid[[k]])

#--- Plot Y vs Yhat
plot( Train.fitted, as.matrix(CV.train.resp[[k]]), xlab="Fitted", ylab="Actual")
lines(Valid.pred, as.matrix(CV.valid.resp[[k]]), type="p", col="red", pch=20)
abline(0,1)

library(caret)   # install.packages("caret")
CVFitDiagnosis <- data.frame(
    tr.RMSE     = caret::RMSE(Train.fitted, as.matrix(CV.train.resp[[k]])),
    tr.Rsquare  = caret::R2(  Train.fitted, CV.train.resp[[k]]),
    val.RMSE    = caret::RMSE(Valid.pred,   as.matrix(CV.valid.resp[[k]])),
    val.Rsquare = caret::R2(  Valid.pred,   CV.valid.resp[[k]])
)
CVFitDiagnosis


###-------------------------------------------------
###--- 4. Multiple Regression with CV (search for a better model)

##
## Change the model and search for a better model
##
layout(matrix(1:6, 2, 3, byrow=TRUE))   # to plot the 5 folds on 1 page
CVFitDiagnosis <- numeric(0)
for (k in 1:5) {

    Reg01 <- lm(resp ~ ., data=CV.train[[k]])   ## <====== Change the model here
    # summary(Reg01)
    # plot(Reg01)   # plots 4 diagnostic plots

    #--- Get training / validation fit
    Train.fitted = predict(Reg01, newdata=CV.train[[k]])
    Valid.pred   = predict(Reg01, newdata=CV.valid[[k]])

    #--- Plot Y vs Yhat
    plot( Train.fitted, as.matrix(CV.train.resp[[k]]), xlab="Fitted", ylab="Actual", main=paste("K=", k))
    lines(Valid.pred, as.matrix(CV.valid.resp[[k]]), type="p", col="red", pch=20)
    abline(0,1)

    library(caret)   # install.packages("caret")
    CVFitDiagnosis1 <- data.frame(
        tr.RMSE     = caret::RMSE(Train.fitted, as.matrix(CV.train.resp[[k]])),
        tr.Rsquare  = caret::R2(  Train.fitted, CV.train.resp[[k]]),
        val.RMSE    = caret::RMSE(Valid.pred,   as.matrix(CV.valid.resp[[k]])),
        val.Rsquare = caret::R2(  Valid.pred,   CV.valid.resp[[k]])
    )
    CVFitDiagnosis <- rbind(CVFitDiagnosis, CVFitDiagnosis1)
}
layout(1)
CVFitDiagnosis

Av.CVFitDiagnosis = apply(CVFitDiagnosis, 2, mean)
Av.CVFitDiagnosis

##
## Av. val.RMSE summarizes how well this model fits over all CV folds.
##


###-------------------------------------------------
###--- 5. Final Fit with Train.set

##   [      Train.set 400      ] [Test.set 106]
##   [80][80][80][80][80]

Reg01 <- lm(resp ~ ., data=Train.set)   ## <====== Best model you found in 4
summary(Reg01)
plot(Reg01)   # plots 4 diagnostic plots

#--- Get training / testing fit
Train.fitted = predict(Reg01, newdata=Train.set)
Test.pred    = predict(Reg01, newdata=Test.set)

#--- Plot Y vs Yhat
plot( Train.fitted, as.matrix(Train.resp), xlab="Fitted", ylab="Actual", main="Final Test.set fit")
lines(Test.pred, as.matrix(Test.resp), type="p", col="red", pch=20)
abline(0,1)

library(caret)   # install.packages("caret")
FinalFitDiagnosis <- data.frame(
    tr.RMSE      = caret::RMSE(Train.fitted, as.matrix(Train.resp)),
    tr.Rsquare   = caret::R2(  Train.fitted, Train.resp),
    test.RMSE    = caret::RMSE(Test.pred,    as.matrix(Test.resp)),
    test.Rsquare = caret::R2(  Test.pred,    Test.resp)
)
FinalFitDiagnosis

##
## Use test.RMSE or test.Rsquare as the summary of the model fit.
## These numbers carry no penalty for having more parameters.
##
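
## For reference, the caret metrics above are simple formulas; here is a
## manual check (a sketch, assuming Test.pred and Test.resp from above, and
## that caret::R2() is left at its default of squared correlation):
sqrt(mean((as.matrix(Test.resp) - Test.pred)^2))   # should match test.RMSE
cor(Test.pred, as.matrix(Test.resp))^2             # should match test.Rsquare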