###
###
###  Concrete Data - Multiple Regression (Simple version)
###    ver 0.0.4
###
##################################################


#-------------------------------------------------
###--- 0. Preliminary
library(tidyverse)
Concrete <- read_csv("https://nmimoto.github.io/datasets/concrete.csv")
Concrete

print(Concrete, n=20)  # if you want to see more rows

  # Cement           (component 1)                    -- quantitative -- kg in a m3 mixture
  # Slag             (component 2) Blast Furnace Slag -- quantitative -- kg in a m3 mixture
  # Fly              (component 3) Fly Ash            -- quantitative -- kg in a m3 mixture
  # Water            (component 4)                    -- quantitative -- kg in a m3 mixture
  # Superplasticizer (component 5)                    -- quantitative -- kg in a m3 mixture
  # Coarse           (component 6) Coarse Aggregate   -- quantitative -- kg in a m3 mixture
  # Fine             (component 7) Fine Aggregate     -- quantitative -- kg in a m3 mixture
  # Age                                               -- quantitative -- Day (1~365)
  # CCS              Concrete compressive strength    -- quantitative -- MPa -- response variable

# Rename "CCS" column as "resp" to streamline analysis.
Concrete2 <- Concrete %>% rename(resp=CCS) %>% relocate(resp)
Concrete2


#-------------------------------------------------
###--- 1. Routine Exploratory Analysis (class of resp should be "dbl")
Orig <- Concrete2    # "resp" column is required

  #- Check for N/A in data. Remove if there's any.
  summary(Orig)
  sum(is.na(Orig))
  # If there is na in the data, run below
  dim(Orig)
  Orig <- Orig %>% na.omit()
  dim(Orig)


###-------------------------------------------------
###--- 1. Data Separation (Copied from Rtut-CV)
### Divide Dataset to Training and Testing and Set up k fold CV
Orig <- Concrete2            # Entire Data set (have to be data.frame)
train.size <- 850            # num of rows for training set
test.size <- 180             # num of rows for testing set
my.seed <- 4345              # give a seed

  ###
  ### This line replaces the AAAA to BBBB chunk
  source('https://nmimoto.github.io/R/ML-00.txt')
  ###

# Output (all data.frame):
#   Train.set      /  Train.resp
#   Test.set       /  Test.resp
#   CV.train[[k]]  /  CV.train.resp[[k]]
#   CV.valid[[k]]  /  CV.valid.resp[[k]]

# [  Train.set 850        ]  [Test.set 180]  1030 total
# [170][170][170][170][170]


#--- plot Training and Test for visualization
fold = 2
plot(CV.train[[fold]]$Age,  CV.train[[fold]]$resp, xlab="lstat", ylab="Response (medv)")
lines(CV.valid[[fold]]$Age, CV.valid[[fold]]$resp, type="p", col="red",pch=19)


###-------------------------------------------------
###--- 2. Multiple Regression with manual CV
layout(matrix(1:6, 2, 3, byrow=TRUE))    # to plot 5 in 1 page
CVFitDiagnosis <- numeric(0)
for (k in 1:5) {

    set.seed(my.seed)
    Fit00 = lm(resp ~.,  data=CV.train[[k]]) # <===== Try using different set of columns here

    # summary(Fit00)
    # plot(Fit00)  # Use only if NN is small enough (hidden <10)

    #--- Get training / validation fit
    Train.fitted = predict(Fit00, newdata=CV.train[[k]])
    Valid.pred   = predict(Fit00, newdata=CV.valid[[k]])

    #--- Plot Y vs Yhat
    plot( Train.fitted, as.matrix(CV.train[[k]]$resp), xlab="Fitted", ylab="Actual",main=paste("K=",k))
    lines(Valid.pred,   as.matrix(CV.valid[[k]]$resp), type="p", xlab="Fitted", ylab="Actual", col="red", pch=20)
    abline(0,1)

    library(caret)            # install.packages("caret")
    CVFitDiagnosis1 <- data.frame(
        tr.RMSE     = caret::RMSE(Train.fitted, as.matrix(CV.train[[k]]$resp)),
        tr.Rsquare  = caret::R2(  Train.fitted,           CV.train[[k]]$resp),
        val.RMSE    = caret::RMSE(Valid.pred,   as.matrix(CV.valid[[k]]$resp)),
        val.Rsquare = caret::R2(  Valid.pred,             CV.valid[[k]]$resp)
    )
    CVFitDiagnosis <- rbind(CVFitDiagnosis, CVFitDiagnosis1)
}
layout(1)

CVFitDiagnosis
Av.CVFitDiagnosis = apply(CVFitDiagnosis, 2, mean)
Av.CVFitDiagnosis


###-------------------------------------------------
###--- 3. Ordinary Regression (w Entire Training set)
Fit01 = lm(resp ~. , data=Train.set)
summary(Fit01)

# get training / validation fit
Train.fitted = predict(Fit01, newdata = Train.set)
Test.pred    = predict(Fit01, newdata = Test.set)

#--- Plot Y vs Yhat
plot( Train.fitted, as.matrix(CV.train[[k]]$resp), xlab="Fitted", ylab="Actual",main=paste("K=",k))
lines(Valid.pred,   as.matrix(CV.valid[[k]]$resp), type="p", xlab="Fitted", ylab="Actual", col="red", pch=20)
abline(0,1)

library(caret)          # install.packages("caret")
OLS <- data.frame(
  RMSE.tr      = caret::RMSE(Train.fitted, as.matrix(Train.resp)),
  Rsquare.tr   = caret::R2(  Train.fitted,           Train.resp),
  RMSE.test    = caret::RMSE(Test.pred,    as.matrix(Test.resp)),
  Rsquare.test = caret::R2(  Test.pred,              Test.resp)
)
OLS


## Also try
## lm(resp ~ . -Fine, data=Concrete2)