### ### ### Boston data (MASS) - Polynomial Regression (X=lstat only) ### with 5-fold Cross Validation ### ver 0.0.2 ### ################################################################# ###------------------------------------------------- ###--- 0. Preliminary # Using Boston3 from https://nmimoto.github.io/R/boston0.txt library(MASS) # install.packages("MASS") library(tidyverse) # install.packages("tidyverse") Boston <- as_tibble(Boston) Boston # Rename "medv" column as "resp" to streamline analysis. Boston2 <- Boston %>% rename(resp=medv) Boston2 # move "resp" columnm to 1st Boston2 <- Boston2 %>% relocate(resp) Boston2 # turn "chas" column into 0/1 factor Boston3 <- Boston2 %>% mutate( chas=as.factor(chas) ) Boston3 # Importance List by Chisq test # top list: # 3 4 6 10 # "zn" "indus" "nox" "rad" # bottom list # c(7, 8, 9, 13) # "rm" "age" "dis" "black" #1 resp (dbl) <- used to be "medv". Response variable (Y). #2 crim (dbl) #3 zn (dbl) #4 indus (dbl) #5 chas (factor) #6 nox (dbl) #7 rm (dbl) #8 age (dbl) #9 dis (dbl) #10 rad (dbl) #11 tax (dbl) #12 ptratio (dbl) #13 black (dbl) #14 lstat (dbl) ###------------------------------------------------- ###--- 2. Data Separation (Copied from Rtut-CV) ### Divide Dataset to Training and Testing and Set up k fold CV Orig <- Boston3 # Entire Data set (have to be data.frame) train.size <- 400 # num of rows for training set test.size <- 106 # num of rows for testing set resp.col.name <- "resp" # name of response column num.folds <- 5 # k for k-fold CV my.seed <- 3231 # give a seed # [ training 400 ] [testing 106] # [80][80][80][80][80] #--- set.seed(my.seed) ix = sample(1:nrow(Orig)) Orig2 = Orig[ix, ] Train.set = Orig2[1:train.size, ] Train.resp = Orig2[1:train.size, resp.col.name] Test.set = Orig2[(train.size+1):(train.size+test.size), ] Test.resp = Orig2[(train.size+1):(train.size+test.size), resp.col.name] # K-fold Cross Validation library(cvTools) # install.packages("cvTools") set.seed(my.seed) folds = cvFolds( nrow(Train.set), K=num.folds ) # k-fold CV (random assignment) CV.train = list(Train.set[ folds$which!=1, ]) CV.train.resp = list(Train.resp[folds$which!=1,1]) CV.valid = list(Train.set[ folds$which==1, ]) CV.valid.resp = list(Train.resp[folds$which==1,1]) for (k in 2:num.folds) { CV.train[[k]] = Train.set[ folds$which!=k, ] CV.train.resp[[k]] = Train.resp[folds$which!=k,1] CV.valid[[k]] = Train.set[ folds$which==k, ] CV.valid.resp[[k]] = Train.resp[folds$which==k,1] } # Output (all data.frame): # Train.set [400x14] / Train.resp [400x1] # Test.set [106x14] / Test.resp [106x1] # CV.train[[k]] / CV.train.resp[[k]] k goes from 1 to num.folds # CV.valid[[k]] / CV.valid.resp[[k]] k goes from 1 to num.folds ###------------------------------------------------- ###--- 3. Polynomial Regression using 5-fold CV (X=lstat only) # deg.poly 1 to 10 # CV fold k 1 to 5 #--- plot Training and Test for visualization fold = 2 plot(CV.train[[fold]]$lstat, CV.train[[fold]]$resp, xlab="lstat", ylab="Response (medv)") lines(CV.valid[[fold]]$lstat, CV.valid[[fold]]$resp, type="p", col="red",pch=19) #--- If all data is used deg.poly = 1 Fit00 <- lm( resp ~ poly(lstat, deg.poly), data=Boston3) summary(Fit00) ###--- 3.a Polynomial Regression for 1 d and 1 k deg.poly = 3 k=3 Fit01 <- lm( resp ~ poly(lstat, deg.poly), data=CV.train[[k]] ) summary(Fit01) #- CV Training MSE MSE.train <- mean(Fit01$residuals^2) #- CV Validation MSE Fit01.pred<- predict(Fit01, newdata=CV.valid[[k]]) MSE.valid <- mean((CV.valid[[k]]$resp - Fit01.pred)^2) c(MSE.train, MSE.valid) # Plot the fit plot(CV.train[[k]]$lstat, CV.train[[k]]$resp, xlab="X", ylab="Y") lines(CV.valid[[k]]$lstat, CV.valid[[k]]$resp, col="red", type="p", pch=19) ix = sort(CV.train[[k]]$lstat, index.return=TRUE)$ix lines(CV.train[[k]]$lstat[ix], Fit01$fitted[ix], lwd=2, col="red" ) text(30,49, paste("d=",deg.poly,": MSE.tr=",round(MSE.train,2))) text(30,46, paste("k=",k, ": MSE.va=",round(MSE.valid,2)), col="red") # layout( matrix(1:5, 1,5) ) # layout(1) ### ### Now change deg.poly and k and run again ### ###--- 3.b Polynomial Regression for d=1:10 and k=1:5 MSE.train <- MSE.valid <- matrix(0, 5, 10) for (deg.poly in 1:10) { for (k in 1:5) { Fit01 <- lm( resp ~ poly(lstat, deg.poly), data=CV.train[[k]] ) summary(Fit01) #--- CV Training MSE MSE.train[k,deg.poly] <- mean(Fit01$residuals^2) #--- CV Validation MSE Fit01.pred<- predict(Fit01, newdata=CV.valid[[k]]) MSE.valid[k,deg.poly] <- mean((CV.valid[[k]]$resp - Fit01.pred)^2) } } MSE.train MSE.valid Av.MSE.train = apply(MSE.train, 2, mean) Av.MSE.valid = apply(MSE.valid, 2, mean) # Plot Average MSE plot(Av.MSE.train, type="o", ylab="MSE") lines(Av.MSE.valid, type="o", col="red") legend(2, 40, lty=1, c("Av. Train MSE","Av. Valid MSE"), col=c("black", "red")) ### ### Make decision about best deg.poly to use from the Av MSE plot ### ###------------------------------------------------- ###--- 4. Best Polynomial Regression Model for entire Training set (X=lstat only) # [80][80][80][80][80] <- 1 of these were red, 4 others were white # [ training 400 ] [testing 106] <- now all training is white, all testing is blue deg.poly = 6 # You need to choose this value based on what you saw in 3b. Fit05 <- lm( resp ~ poly(lstat, deg.poly), data=Train.set) summary(Fit05) #- CV Training MSE MSE.train <- mean(Fit05$residuals^2) #- CV Validation MSE pred<- predict(Fit05, newdata=Test.set) MSE.test <- mean((Test.set$resp - pred)^2) c(MSE.train, MSE.test) # Plot the fit plot(Train.set$lstat, Train.set$resp, xlab="lstat", ylab="resp", main="Final Model") lines(Test.set$lstat, Test.set$resp, col="blue", type="p", pch=19) ix = sort(Train.set$lstat, index.return=TRUE)$ix lines(Train.set$lstat[ix], Fit05$fitted[ix], lwd=2, col="blue" ) text(30,49, paste("d=",deg.poly,": MSE.train=",round(MSE.train,2))) text(30,46, paste(" MSE.test=",round(MSE.test,2)), col="blue")