###
###   Boston Data (MASS) - Multiple Regression
###   with 5-fold Cross Validation
###   ver 0.0.4
###
##################################################

###-------------------------------------------------
###--- 0. Preliminary
#     Using Boston3 from https://nmimoto.github.io/R/boston0.txt

library(MASS)        # install.packages("MASS")
library(tidyverse)   # install.packages("tidyverse")

Boston <- as_tibble(Boston)
Boston

Boston2 <- Boston %>%
    rename(resp=medv) %>%   # rename "medv" column as "resp" to streamline analysis
    relocate(resp)          # move "resp" column to 1st
Boston2

# turn "chas" column into 0/1 factor
Boston3 <- Boston2 %>% mutate( chas=as.factor(chas) )
Boston3

# Importance List by Chisq test
#   top list:    c(3, 4, 6, 10) / "zn" "indus" "nox" "rad"
#   bottom list: c(7, 8, 9, 13) / "rm" "age" "dis" "black"

#1  resp    (dbl) <- used to be "medv". Response variable (Y).
#2  crim    (dbl)
#3  zn      (dbl)
#4  indus   (dbl)
#5  chas    (factor)
#6  nox     (dbl)
#7  rm      (dbl)
#8  age     (dbl)
#9  dis     (dbl)
#10 rad     (dbl)
#11 tax     (dbl)
#12 ptratio (dbl)
#13 black   (dbl)
#14 lstat   (dbl)


###-------------------------------------------------
###--- 1. Data Separation (Copied from Rtut-CV)
###     Divide the dataset into Training and Testing sets, and set up k-fold CV

Orig <- Boston3      # entire data set (must be a data.frame)
train.size <- 400    # num of rows for training set
test.size  <- 106    # num of rows for testing set
my.seed <- 3231      # set a seed

### This line replaces the AAAA to BBBB chunk
source('https://nmimoto.github.io/R/ML-00.txt')

# Output (all data.frame):
#   Train.set     / Train.resp
#   Test.set      / Test.resp
#   CV.train[[k]] / CV.train.resp[[k]]
#   CV.valid[[k]] / CV.valid.resp[[k]]
#
#   [      Train.set 400      ] [Test.set 106]
#   [80][80][80][80][80]

#--- plot one Training and Validation fold for visualization
fold = 2
plot( CV.train[[fold]]$lstat, CV.train[[fold]]$resp, xlab="lstat", ylab="Response (medv)")
lines(CV.valid[[fold]]$lstat, CV.valid[[fold]]$resp, type="p", col="red", pch=19)


###-------------------------------------------------
###--- 2. Multiple Regression with all variables, no Cross Validation (all data)

Reg01 <- lm(medv ~ . , data=Boston)
summary(Reg01)
plot(Reg01)

##
## Search for a better model without CV
##

## subtracting variables
Reg02 <- lm(medv ~ . -rm -age -dis -black, data=Boston)
summary(Reg02)

Reg02 <- lm(medv ~ . -rm -age -dis, data=Boston)
summary(Reg02)

Reg02 <- lm(medv ~ . -age -black, data=Boston)
summary(Reg02)

Reg02 <- lm(medv ~ . -age -black -indus, data=Boston)
summary(Reg02)
plot(Reg02)

## adding variables
Reg03 <- lm(medv ~ zn + indus + nox + rad, data=Boston)
summary(Reg03)

Reg03 <- lm(medv ~ crim + zn + indus + nox + rad, data=Boston)
summary(Reg03)

#2  crim    (dbl)
#3  zn      (dbl)
#4  indus   (dbl)
#5  chas    (factor)
#6  nox     (dbl)
#7  rm      (dbl)
#8  age     (dbl)
#9  dis     (dbl)
#10 rad     (dbl)
#11 tax     (dbl)
#12 ptratio (dbl)
#13 black   (dbl)
#14 lstat   (dbl)

# Importance List by Chisq test
#   top list:    c(3, 4, 6, 10) / "zn" "indus" "nox" "rad"
#   bottom list: c(7, 8, 9, 13) / "rm" "age" "dis" "black"
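
## The model search above is judged only from the summary() output. As a quick
## non-CV sanity check, AIC can compare the candidate fits (lower is better).
## This is a sketch added for illustration, assuming Reg01, Reg02, Reg03 are
## the fits from above; stepAIC() from MASS automates the same kind of search.
AIC(Reg01, Reg02, Reg03)
# Reg.step <- MASS::stepAIC(Reg01, direction="both")   # "Reg.step" is a hypothetical name
# summary(Reg.step)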

###-------------------------------------------------
###--- 3. Multiple Regression with Cross Validation (use all columns)

##
## Example for 1 fold
##
k=1   # CV fold number
Reg01 <- lm(resp ~ ., data=CV.train[[k]])
summary(Reg01)
# plot(Reg01)   # plots 4 diagnostic plots

#--- Get training / validation fit
Train.fitted = predict(Reg01, newdata=CV.train[[k]])
Valid.pred   = predict(Reg01, newdata=CV.valid[[k]])

#--- Plot Y vs Yhat
plot( Train.fitted, as.matrix(CV.train.resp[[k]]), xlab="Fitted", ylab="Actual")
lines(Valid.pred, as.matrix(CV.valid.resp[[k]]), type="p", col="red", pch=20)
abline(0,1)

library(caret)   # install.packages("caret")
CVFitDiagnosis <- data.frame(
    tr.RMSE     = caret::RMSE(Train.fitted, as.matrix(CV.train.resp[[k]])),
    tr.Rsquare  = caret::R2(  Train.fitted, CV.train.resp[[k]]),
    val.RMSE    = caret::RMSE(Valid.pred,   as.matrix(CV.valid.resp[[k]])),
    val.Rsquare = caret::R2(  Valid.pred,   CV.valid.resp[[k]])
)
CVFitDiagnosis


###-------------------------------------------------
###--- 4. Multiple Regression with CV (search for a better model)

##
## Change the model and search for a better model
##
layout(matrix(1:6, 2, 3, byrow=TRUE))   # to plot the 5 folds on 1 page
CVFitDiagnosis <- numeric(0)
for (k in 1:5) {

    Reg01 <- lm(resp ~ ., data=CV.train[[k]])   ## <====== Change the model here
    # summary(Reg01)
    # plot(Reg01)   # plots 4 diagnostic plots

    #--- Get training / validation fit
    Train.fitted = predict(Reg01, newdata=CV.train[[k]])
    Valid.pred   = predict(Reg01, newdata=CV.valid[[k]])

    #--- Plot Y vs Yhat
    plot( Train.fitted, as.matrix(CV.train.resp[[k]]), xlab="Fitted", ylab="Actual", main=paste("K=", k))
    lines(Valid.pred, as.matrix(CV.valid.resp[[k]]), type="p", col="red", pch=20)
    abline(0,1)

    library(caret)   # install.packages("caret")
    CVFitDiagnosis1 <- data.frame(
        tr.RMSE     = caret::RMSE(Train.fitted, as.matrix(CV.train.resp[[k]])),
        tr.Rsquare  = caret::R2(  Train.fitted, CV.train.resp[[k]]),
        val.RMSE    = caret::RMSE(Valid.pred,   as.matrix(CV.valid.resp[[k]])),
        val.Rsquare = caret::R2(  Valid.pred,   CV.valid.resp[[k]])
    )
    CVFitDiagnosis <- rbind(CVFitDiagnosis, CVFitDiagnosis1)
}
layout(1)
CVFitDiagnosis

Av.CVFitDiagnosis = apply(CVFitDiagnosis, 2, mean)
Av.CVFitDiagnosis

##
## Av. val.RMSE summarizes how well this model fits over all CV folds.
##


###-------------------------------------------------
###--- 5. Final Fit with Train.set

##   [      Train.set 400      ] [Test.set 106]
##   [80][80][80][80][80]

Reg01 <- lm(resp ~ ., data=Train.set)   ## <====== Best model you found in 4
summary(Reg01)
plot(Reg01)   # plots 4 diagnostic plots

#--- Get training / testing fit
Train.fitted = predict(Reg01, newdata=Train.set)
Test.pred    = predict(Reg01, newdata=Test.set)

#--- Plot Y vs Yhat
plot( Train.fitted, as.matrix(Train.resp), xlab="Fitted", ylab="Actual", main="Final Test.set fit")
lines(Test.pred, as.matrix(Test.resp), type="p", col="red", pch=20)
abline(0,1)

library(caret)   # install.packages("caret")
FinalFitDiagnosis <- data.frame(
    tr.RMSE      = caret::RMSE(Train.fitted, as.matrix(Train.resp)),
    tr.Rsquare   = caret::R2(  Train.fitted, Train.resp),
    test.RMSE    = caret::RMSE(Test.pred,    as.matrix(Test.resp)),
    test.Rsquare = caret::R2(  Test.pred,    Test.resp)
)
FinalFitDiagnosis

##
## Use test.RMSE or test.Rsquare as the summary of the model fit.
## These numbers carry no penalty for having more parameters.
##
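
## For reference, the caret metrics above are simple formulas; here is a
## manual check (a sketch, assuming Test.pred and Test.resp from above, and
## that caret::R2() is left at its default of squared correlation):
sqrt(mean((as.matrix(Test.resp) - Test.pred)^2))   # should match test.RMSE
cor(Test.pred, as.matrix(Test.resp))^2             # should match test.Rsquare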