### ### ### Concrete Data - Multiple Regression (Simple version) ### ver 0.0.5 ### ################################################## #------------------------------------------------- ###--- 0. Preliminary library(tidyverse) Concrete <- read_csv("https://nmimoto.github.io/datasets/concrete.csv") Concrete print(Concrete, n=20) # if you want to see more rows # Cement (component 1) -- quantitative -- kg in a m3 mixture # Slag (component 2) Blast Furnace Slag -- quantitative -- kg in a m3 mixture # Fly (component 3) Fly Ash -- quantitative -- kg in a m3 mixture # Water (component 4) -- quantitative -- kg in a m3 mixture # Superplasticizer (component 5) -- quantitative -- kg in a m3 mixture # Coarse (component 6) Coarse Aggregate -- quantitative -- kg in a m3 mixture # Fine (component 7) Fine Aggregate -- quantitative -- kg in a m3 mixture # Age -- quantitative -- Day (1~365) # CCS Concrete compressive strength -- quantitative -- MPa -- response variable # Rename "CCS" column as "resp" to streamline analysis. Concrete2 <- Concrete %>% rename(resp=CCS) %>% relocate(resp) Concrete2 #------------------------------------------------- ###--- 1. Routine Exploratory Analysis (class of resp should be "dbl") Orig <- Concrete2 # "resp" column is required #- Check for N/A in data. Remove if there's any. summary(Orig) sum(is.na(Orig)) # If there is na in the data, run below dim(Orig) Orig <- Orig %>% na.omit() dim(Orig) ###------------------------------------------------------- ###--- 1. Data Separation (Copied from Ch3-Lab) # Load the file from my website source("https://nmimoto.github.io/R/ML-00.txt") # load CreateCV() # Use this function to create CV dataset CreateCV(Orig, numFolds=5, seed=3231) # Global variables created: # Train.set Train.resp # Test.set Test.resp # CV.train[[k]] CV.train.resp[[k]] # CV.valid[[k]] CV.valid.resp[[k]] #--- plot Training and Test for visualization fold = 2 plot(CV.train[[fold]]$lstat, CV.train[[fold]]$resp, xlab="lstat", ylab="Response (medv)") lines(CV.valid[[fold]]$lstat, CV.valid[[fold]]$resp, type="p", col="red", pch=19) ###------------------------------------------------- ###--- 2. Multiple Regression with manual CV layout(matrix(1:6, 2, 3, byrow=TRUE)) # to plot 5 in 1 page CVFitDiagnosis <- numeric(0) for (k in 1:5) { set.seed(my.seed) Fit00 = lm(resp ~., data=CV.train[[k]]) # <===== Try using different set of columns here # summary(Fit00) # plot(Fit00) # Use only if NN is small enough (hidden <10) #--- Get training / validation fit Train.fitted = predict(Fit00, newdata=CV.train[[k]]) Valid.pred = predict(Fit00, newdata=CV.valid[[k]]) #--- Plot Y vs Yhat plot( Train.fitted, as.matrix(CV.train[[k]]$resp), xlab="Fitted", ylab="Actual",main=paste("K=",k)) lines(Valid.pred, as.matrix(CV.valid[[k]]$resp), type="p", xlab="Fitted", ylab="Actual", col="red", pch=20) abline(0,1) library(caret) # install.packages("caret") CVFitDiagnosis1 <- data.frame( tr.RMSE = caret::RMSE(Train.fitted, as.matrix(CV.train[[k]]$resp)), tr.Rsquare = caret::R2( Train.fitted, CV.train[[k]]$resp), val.RMSE = caret::RMSE(Valid.pred, as.matrix(CV.valid[[k]]$resp)), val.Rsquare = caret::R2( Valid.pred, CV.valid[[k]]$resp) ) CVFitDiagnosis <- rbind(CVFitDiagnosis, CVFitDiagnosis1) } layout(1) CVFitDiagnosis Av.CVFitDiagnosis = apply(CVFitDiagnosis, 2, mean) Av.CVFitDiagnosis ###------------------------------------------------- ###--- 3. Ordinary Regression (w Entire Training set) Fit01 = lm(resp ~. , data=Train.set) summary(Fit01) # get training / validation fit Train.fitted = predict(Fit01, newdata = Train.set) Test.pred = predict(Fit01, newdata = Test.set) library(caret) # install.packages("caret") FinalFitDiagnosis <- data.frame( tr.RMSE = caret::RMSE(Train.fitted, as.matrix(Train.resp)), tr.Rsquare = caret::R2( Train.fitted, Train.resp), test.RMSE = caret::RMSE(Test.pred, as.matrix(Test.resp)), test.Rsquare = caret::R2( Test.pred, Test.resp) ) FinalFitDiagnosis #--- Plot Y vs Yhat plot( Train.fitted, as.matrix(Train.resp), xlab="Fitted", ylab="Actual",main="Final Test.set fit") lines(Test.pred, as.matrix(Test.resp), type="p", xlab="Fitted", ylab="Actual", col="red", pch=20) abline(0,1) #--- Plot Residual vs Yhat in Test Set plot(Test.pred, as.matrix(Test.set$resp-Test.pred), type="p", xlab="Fitted", ylab="Actual", col="red", pch=20) abline(h=0) ## Also try ## Fit02 = lm(resp ~ . -Fine, data=Concrete2)