We load the Boston data as we did in the Ch2 Lab.
# Section 1: Load the Boston housing data ----
library(MASS) # install.packages("MASS")
data(Boston) # load data (506 rows x 14 columns, per the output below)
head(Boston, 10) # see first 10 lines of data
## crim zn indus chas nox rm age dis rad tax ptratio black
## 1 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90
## 2 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90
## 3 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83
## 4 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63
## 5 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90
## 6 0.02985 0.0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12
## 7 0.08829 12.5 7.87 0 0.524 6.012 66.6 5.5605 5 311 15.2 395.60
## 8 0.14455 12.5 7.87 0 0.524 6.172 96.1 5.9505 5 311 15.2 396.90
## 9 0.21124 12.5 7.87 0 0.524 5.631 100.0 6.0821 5 311 15.2 386.63
## 10 0.17004 12.5 7.87 0 0.524 6.004 85.9 6.5921 5 311 15.2 386.71
## lstat medv
## 1 4.98 24.0
## 2 9.14 21.6
## 3 4.03 34.7
## 4 2.94 33.4
## 5 5.33 36.2
## 6 5.21 28.7
## 7 12.43 22.9
## 8 19.15 27.1
## 9 29.93 16.5
## 10 17.10 18.9
## [1] "data.frame"
## [1] 506 14
## [1] "crim" "zn" "indus" "chas" "nox" "rm" "age"
## [8] "dis" "rad" "tax" "ptratio" "black" "lstat" "medv"
?Boston # open the help page explaining each variable
# Variable descriptions (from the ?Boston help page):
# 1 crim per capita crime rate by town.
# 2 zn proportion of residential land zoned for lots over 25,000 sq.ft.
# 3 indus proportion of non-retail business acres per town.
# 4 chas Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
# 5 nox nitrogen oxides concentration (parts per 10 million).
# 6 rm average number of rooms per dwelling.
# 7 age proportion of owner-occupied units built prior to 1940.
# 8 dis weighted mean of distances to five Boston employment centres.
# 9 rad index of accessibility to radial highways.
# 10 tax full-value property-tax rate per $10,000.
# 11 ptratio pupil-teacher ratio by town.
# 12 black 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town.
# 13 lstat lower status of the population (percent).
# 14 medv (response Y) median value of owner-occupied homes in $1000s.
I’ve written a function and posted it on my website. You can load the function with a single line using the source() command.
The function is called CreateCV(). The dataframe you feed into the function must contain the “resp” column.
# Turn Boston into a tibble (nicer printing; no partial-matching surprises)
library(tidyverse) # install.packages("tidyverse")
Boston <- as_tibble(Boston)
Boston
## # A tibble: 506 × 14
## crim zn indus chas nox rm age dis rad tax ptratio
## <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl>
## 1 0.00632 18 2.31 0 0.538 6.58 65.2 4.09 1 296 15.3
## 2 0.0273 0 7.07 0 0.469 6.42 78.9 4.97 2 242 17.8
## 3 0.0273 0 7.07 0 0.469 7.18 61.1 4.97 2 242 17.8
## 4 0.0324 0 2.18 0 0.458 7.00 45.8 6.06 3 222 18.7
## 5 0.0690 0 2.18 0 0.458 7.15 54.2 6.06 3 222 18.7
## 6 0.0298 0 2.18 0 0.458 6.43 58.7 6.06 3 222 18.7
## 7 0.0883 12.5 7.87 0 0.524 6.01 66.6 5.56 5 311 15.2
## 8 0.145 12.5 7.87 0 0.524 6.17 96.1 5.95 5 311 15.2
## 9 0.211 12.5 7.87 0 0.524 5.63 100 6.08 5 311 15.2
## 10 0.170 12.5 7.87 0 0.524 6.00 85.9 6.59 5 311 15.2
## # ℹ 496 more rows
## # ℹ 3 more variables: black <dbl>, lstat <dbl>, medv <dbl>
# Rename the response column "medv" to "resp" (required by CreateCV())
# and move it to the first column position
Boston <- Boston %>%
  rename(resp = medv) %>%
  relocate(resp)
Boston
## # A tibble: 506 × 14
## resp crim zn indus chas nox rm age dis rad tax
## <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int> <dbl>
## 1 24 0.00632 18 2.31 0 0.538 6.58 65.2 4.09 1 296
## 2 21.6 0.0273 0 7.07 0 0.469 6.42 78.9 4.97 2 242
## 3 34.7 0.0273 0 7.07 0 0.469 7.18 61.1 4.97 2 242
## 4 33.4 0.0324 0 2.18 0 0.458 7.00 45.8 6.06 3 222
## 5 36.2 0.0690 0 2.18 0 0.458 7.15 54.2 6.06 3 222
## 6 28.7 0.0298 0 2.18 0 0.458 6.43 58.7 6.06 3 222
## 7 22.9 0.0883 12.5 7.87 0 0.524 6.01 66.6 5.56 5 311
## 8 27.1 0.145 12.5 7.87 0 0.524 6.17 96.1 5.95 5 311
## 9 16.5 0.211 12.5 7.87 0 0.524 5.63 100 6.08 5 311
## 10 18.9 0.170 12.5 7.87 0 0.524 6.00 85.9 6.59 5 311
## # ℹ 496 more rows
## # ℹ 3 more variables: ptratio <dbl>, black <dbl>, lstat <dbl>
## [1] 0
## # A tibble: 506 × 14
## resp crim zn indus chas nox rm age dis rad tax
## <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int> <dbl>
## 1 24 0.00632 18 2.31 0 0.538 6.58 65.2 4.09 1 296
## 2 21.6 0.0273 0 7.07 0 0.469 6.42 78.9 4.97 2 242
## 3 34.7 0.0273 0 7.07 0 0.469 7.18 61.1 4.97 2 242
## 4 33.4 0.0324 0 2.18 0 0.458 7.00 45.8 6.06 3 222
## 5 36.2 0.0690 0 2.18 0 0.458 7.15 54.2 6.06 3 222
## 6 28.7 0.0298 0 2.18 0 0.458 6.43 58.7 6.06 3 222
## 7 22.9 0.0883 12.5 7.87 0 0.524 6.01 66.6 5.56 5 311
## 8 27.1 0.145 12.5 7.87 0 0.524 6.17 96.1 5.95 5 311
## 9 16.5 0.211 12.5 7.87 0 0.524 5.63 100 6.08 5 311
## 10 18.9 0.170 12.5 7.87 0 0.524 6.00 85.9 6.59 5 311
## # ℹ 496 more rows
## # ℹ 3 more variables: ptratio <dbl>, black <dbl>, lstat <dbl>
# Load the CreateCV() helper from the course website.
# NOTE(review): this sources remote code and needs an internet connection.
source("https://nmimoto.github.io/R/ML-00.txt") # load CreateCV()
# Split the data into a Train/Test pair plus 5 CV folds.
# Creates the global objects listed in the printed output below;
# seed is fixed for reproducibility.
CreateCV(Boston, numFolds=5, seed=5480)
##
## number of folds: 5
## number of obs: 506
## size of training data: 420
## size of test data: 86
## size of each fold : 84
##
## Global variables created:
## Train.set Train.resp
## Test.set Test.resp
## CV.train[[k]] CV.train.resp[[k]]
## CV.valid[[k]] CV.valid.resp[[k]]
# Global objects created by CreateCV() (all data.frame):
# Train.set Train.resp
# Test.set Test.resp
# CV.train[[k]] CV.train.resp[[k]]
# CV.valid[[k]] CV.valid.resp[[k]]
Train.set
## # A tibble: 422 × 14
## resp crim zn indus chas nox rm age dis rad tax
## <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int> <dbl>
## 1 24 0.00632 18 2.31 0 0.538 6.58 65.2 4.09 1 296
## 2 21.6 0.0273 0 7.07 0 0.469 6.42 78.9 4.97 2 242
## 3 34.7 0.0273 0 7.07 0 0.469 7.18 61.1 4.97 2 242
## 4 33.4 0.0324 0 2.18 0 0.458 7.00 45.8 6.06 3 222
## 5 36.2 0.0690 0 2.18 0 0.458 7.15 54.2 6.06 3 222
## 6 28.7 0.0298 0 2.18 0 0.458 6.43 58.7 6.06 3 222
## 7 22.9 0.0883 12.5 7.87 0 0.524 6.01 66.6 5.56 5 311
## 8 27.1 0.145 12.5 7.87 0 0.524 6.17 96.1 5.95 5 311
## 9 18.9 0.170 12.5 7.87 0 0.524 6.00 85.9 6.59 5 311
## 10 18.9 0.117 12.5 7.87 0 0.524 6.01 82.9 6.23 5 311
## # ℹ 412 more rows
## # ℹ 3 more variables: ptratio <dbl>, black <dbl>, lstat <dbl>
## # A tibble: 84 × 14
## resp crim zn indus chas nox rm age dis rad tax
## <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int> <dbl>
## 1 16.5 0.211 12.5 7.87 0 0.524 5.63 100 6.08 5 311
## 2 15 0.225 12.5 7.87 0 0.524 6.38 94.3 6.35 5 311
## 3 18.2 0.638 0 8.14 0 0.538 6.10 84.5 4.46 4 307
## 4 13.6 1.25 0 8.14 0 0.538 5.57 98.1 3.80 4 307
## 5 19.6 0.852 0 8.14 0 0.538 5.96 89.2 4.01 4 307
## 6 14.5 0.988 0 8.14 0 0.538 5.81 100 4.10 4 307
## 7 13.5 1.61 0 8.14 0 0.538 6.10 96.9 3.76 4 307
## 8 35.4 0.0131 90 1.22 0 0.403 7.25 21.9 8.70 5 226
## 9 16 0.172 25 5.13 0 0.453 5.97 93.4 6.82 8 284
## 10 19.4 0.0438 80 3.37 0 0.398 5.79 31.1 6.61 4 337
## # ℹ 74 more rows
## # ℹ 3 more variables: ptratio <dbl>, black <dbl>, lstat <dbl>
## # A tibble: 337 × 14
## resp crim zn indus chas nox rm age dis rad tax
## <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int> <dbl>
## 1 24 0.00632 18 2.31 0 0.538 6.58 65.2 4.09 1 296
## 2 21.6 0.0273 0 7.07 0 0.469 6.42 78.9 4.97 2 242
## 3 34.7 0.0273 0 7.07 0 0.469 7.18 61.1 4.97 2 242
## 4 33.4 0.0324 0 2.18 0 0.458 7.00 45.8 6.06 3 222
## 5 28.7 0.0298 0 2.18 0 0.458 6.43 58.7 6.06 3 222
## 6 22.9 0.0883 12.5 7.87 0 0.524 6.01 66.6 5.56 5 311
## 7 27.1 0.145 12.5 7.87 0 0.524 6.17 96.1 5.95 5 311
## 8 18.9 0.170 12.5 7.87 0 0.524 6.00 85.9 6.59 5 311
## 9 18.9 0.117 12.5 7.87 0 0.524 6.01 82.9 6.23 5 311
## 10 21.7 0.0938 12.5 7.87 0 0.524 5.89 39 5.45 5 311
## # ℹ 327 more rows
## # ℹ 3 more variables: ptratio <dbl>, black <dbl>, lstat <dbl>
## # A tibble: 85 × 14
## resp crim zn indus chas nox rm age dis rad tax
## <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int> <dbl>
## 1 36.2 0.0690 0 2.18 0 0.458 7.15 54.2 6.06 3 222
## 2 20.4 0.630 0 8.14 0 0.538 5.95 61.8 4.71 4 307
## 3 15.2 1.23 0 8.14 0 0.538 6.14 91.7 3.98 4 307
## 4 30.8 0.0276 75 2.95 0 0.428 6.60 21.8 5.40 3 252
## 5 19.3 0.171 0 6.91 0 0.448 5.68 33.8 5.10 3 233
## 6 19.7 0.0887 21 5.64 0 0.439 5.96 45.7 6.81 4 243
## 7 19.6 0.103 25 5.13 0 0.453 5.93 47.2 6.93 8 284
## 8 18.7 0.149 25 5.13 0 0.453 5.74 66.2 7.23 8 284
## 9 24.2 0.0883 0 10.8 0 0.413 6.42 6.6 5.29 4 305
## 10 22.8 0.0916 0 10.8 0 0.413 6.06 7.8 5.29 4 305
## # ℹ 75 more rows
## # ℹ 3 more variables: ptratio <dbl>, black <dbl>, lstat <dbl>
#- Baseline fit: degree-3 polynomial using ALL of the data (no split)
deg.poly <- 3
Fit05 <- NULL # placeholder removed; fit below
Fit00 <- lm(resp ~ poly(lstat, deg.poly), data = Boston)
summary(Fit00)
##
## Call:
## lm(formula = resp ~ poly(lstat, deg.poly), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.5441 -3.7122 -0.5145 2.4846 26.4153
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.5328 0.2399 93.937 < 2e-16 ***
## poly(lstat, deg.poly)1 -152.4595 5.3958 -28.255 < 2e-16 ***
## poly(lstat, deg.poly)2 64.2272 5.3958 11.903 < 2e-16 ***
## poly(lstat, deg.poly)3 -27.0511 5.3958 -5.013 7.43e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.396 on 502 degrees of freedom
## Multiple R-squared: 0.6578, Adjusted R-squared: 0.6558
## F-statistic: 321.7 on 3 and 502 DF, p-value: < 2.2e-16
# Scatter plot with the degree-3 fitted curve overlaid
plot(Boston$lstat, Boston$resp, xlab="X", ylab="Y")
# sort by lstat so the fitted curve is drawn left-to-right
ix = sort(Boston$lstat, index.return=TRUE)$ix
lines(Boston$lstat[ix], Fit00$fitted[ix], lwd=2, col="black" )
# k=1 2 3 4 5 Test
# [84][84][84][84][84] x[86]
# Fit a degree-3 polynomial on fold k's training portion only
deg.poly = 3
k=5
Fit01 <- lm( resp ~ poly(lstat, deg.poly), data=CV.train[[k]] )
summary(Fit01)
##
## Call:
## lm(formula = resp ~ poly(lstat, deg.poly), data = CV.train[[k]])
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.3992 -3.7879 -0.8421 2.7662 26.5361
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.7068 0.3031 74.903 < 2e-16 ***
## poly(lstat, deg.poly)1 -121.4322 5.5733 -21.788 < 2e-16 ***
## poly(lstat, deg.poly)2 54.1723 5.5733 9.720 < 2e-16 ***
## poly(lstat, deg.poly)3 -23.6973 5.5733 -4.252 2.75e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.573 on 334 degrees of freedom
## Multiple R-squared: 0.6375, Adjusted R-squared: 0.6342
## F-statistic: 195.8 on 3 and 334 DF, p-value: < 2.2e-16
#- CV Training MSE (mean squared residual on fold k's training portion)
MSE.train <- mean(Fit01$residuals^2)
#- CV Validation MSE (prediction error on fold k's held-out portion)
Fit01.pred<- predict(Fit01, newdata=CV.valid[[k]])
MSE.valid <- mean((CV.valid[[k]]$resp - Fit01.pred)^2)
c(MSE.train, MSE.valid)
## [1] 30.69423 36.14407
# Plot the fit: training points (black), validation points (red),
# fitted curve (black line); annotate with the two MSEs
plot(CV.train[[k]]$lstat, CV.train[[k]]$resp, xlab="X", ylab="Y")
lines(CV.valid[[k]]$lstat, CV.valid[[k]]$resp, col="red", type="p", pch=19)
# sort by lstat so the fitted curve is drawn left-to-right
ix = sort(CV.train[[k]]$lstat, index.return=TRUE)$ix
lines(CV.train[[k]]$lstat[ix], Fit01$fitted[ix], lwd=2, col="black" )
text(30,49, paste("d=",deg.poly,": MSE.tr=",round(MSE.train,2)))
text(30,46, paste("k=",k, ": MSE.va=",round(MSE.valid,2)), col="red")
# k=1 2 3 4 5 Test
# [84][84][84][84][84] x[86]
# Fill a 5x10 grid of CV errors: rows = folds k=1..5, columns = degree d=1..10
MSE.train <- MSE.valid <- matrix(0, 5, 10)
for (deg.poly in 1:10) {
  for (k in 1:5) {
    Fit01 <- lm(resp ~ poly(lstat, deg.poly), data = CV.train[[k]])
    # (removed a dead summary(Fit01) call: inside a loop its value is
    #  discarded, so it only wasted computation)
    #--- CV Training MSE for (k, d)
    MSE.train[k, deg.poly] <- mean(Fit01$residuals^2)
    #--- CV Validation MSE for (k, d)
    Fit01.pred <- predict(Fit01, newdata = CV.valid[[k]])
    MSE.valid[k, deg.poly] <- mean((CV.valid[[k]]$resp - Fit01.pred)^2)
  }
}
MSE.train
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] 43.05375 34.74909 33.01309 32.02085 31.23260 31.22268 31.22062
## [2,] 40.05560 32.19746 30.92455 29.83802 28.92146 28.81751 28.78919
## [3,] 43.13570 32.58281 30.40547 28.45294 27.40293 27.21876 27.21876
## [4,] 43.14692 35.09344 33.33417 31.68624 30.64266 30.56833 30.48808
## [5,] 41.03800 32.35565 30.69423 28.99434 27.66631 27.51818 27.48698
## [,8] [,9] [,10]
## [1,] 31.05750 30.90124 30.56510
## [2,] 28.76656 28.37541 28.18858
## [3,] 27.15422 27.06685 26.54419
## [4,] 30.39161 30.03558 29.80547
## [5,] 27.39787 27.03285 26.72318
MSE.valid # (restored: this print statement was missing; its output is below)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] 39.10088 28.60760 27.19558 23.77802 22.76167 22.31615 22.55017
## [2,] 50.55547 38.73248 35.51312 32.62050 31.12400 31.01640 31.02691
## [3,] 38.31102 37.62793 38.22876 39.29035 38.76847 39.37066 39.36591
## [4,] 38.14640 26.99644 25.55631 24.90907 23.96566 23.79805 24.09205
## [5,] 46.99887 38.01775 36.14407 35.70672 36.10602 36.25276 36.25668
## [,8] [,9] [,10]
## [1,] 25.61753 20.99249 52.83200
## [2,] 30.83004 30.85357 30.60280
## [3,] 38.97614 37.36199 44.64711
## [4,] 24.09848 23.87373 23.79805
## [5,] 36.24200 36.11202 36.39063
# Average the fold-by-degree MSE matrices over the 5 folds (one value per degree)
Av.MSE.train <- colMeans(MSE.train)
Av.MSE.valid <- colMeans(MSE.valid)
# Plot average training MSE (black) and validation MSE (red) against degree
plot(Av.MSE.train, type = "o", ylab = "MSE")
lines(Av.MSE.valid, type = "o", col = "red")
legend(2, 40, lty = 1, c("Av. Train MSE", "Av. Valid MSE"), col = c("black", "red"))
# Using the Av MSE plot, choose d=5 as the best degree
deg.poly = 5
# Train Test
# [ 420 ] [ 86 ]
# Refit the chosen model on the entire training set
Fit05 <- lm( resp ~ poly(lstat, deg.poly), data=Train.set)
summary(Fit05)
##
## Call:
## lm(formula = resp ~ poly(lstat, deg.poly), data = Train.set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.3808 -3.1680 -0.8885 2.0396 27.1750
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.7723 0.2654 85.809 < 2e-16 ***
## poly(lstat, deg.poly)1 -137.9509 5.4517 -25.304 < 2e-16 ***
## poly(lstat, deg.poly)2 60.5427 5.4517 11.105 < 2e-16 ***
## poly(lstat, deg.poly)3 -26.7740 5.4517 -4.911 1.30e-06 ***
## poly(lstat, deg.poly)4 24.7932 5.4517 4.548 7.12e-06 ***
## poly(lstat, deg.poly)5 -20.5613 5.4517 -3.772 0.000186 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.452 on 416 degrees of freedom
## Multiple R-squared: 0.6642, Adjusted R-squared: 0.6601
## F-statistic: 164.5 on 5 and 416 DF, p-value: < 2.2e-16
#- Training MSE of the final model on the full training set
#  (comments previously mislabeled these as "CV" MSEs)
MSE.train <- mean(Fit05$residuals^2)
#- Test MSE on the held-out test set
pred <- predict(Fit05, newdata = Test.set)
MSE.test <- mean((Test.set$resp - pred)^2)
c(MSE.train, MSE.test) # Final Fit Result
## [1] 29.29834 14.89711
c(Av.MSE.train[5], Av.MSE.valid[5]) # Recall from CV section (restored; output below)
## [1] 29.17319 30.54516
# Plot the final fit: training points (black), test points (red),
# fitted curve (blue); annotate with the two MSEs
plot(Train.set$lstat, Train.set$resp, xlab="lstat", ylab="resp", main="Final Model")
lines(Test.set$lstat, Test.set$resp, col="red", type="p", pch=19)
# sort by lstat so the fitted curve is drawn left-to-right
ix = sort(Train.set$lstat, index.return=TRUE)$ix
lines(Train.set$lstat[ix], Fit05$fitted[ix], lwd=2, col="blue" )
text(30,49, paste("d=",deg.poly,": MSE.train=",round(MSE.train,2)))
text(30,46, paste(" MSE.test=",round(MSE.test,2)), col="red")
###--------------------------
### 1. Load Boston data from package MASS
# As we did in Ch2-Lab.
library(MASS) # install.packages("MASS")
data(Boston) # load data
head(Boston, 10) # see first 10 lines of data
class(Boston) # it is a data frame
dim(Boston) # (num of row, num of col): 506 x 14
names(Boston) # column names
?Boston # see explanation for variables
# Variable descriptions (from the ?Boston help page):
# 1 crim per capita crime rate by town.
# 2 zn proportion of residential land zoned for lots over 25,000 sq.ft.
# 3 indus proportion of non-retail business acres per town.
# 4 chas Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
# 5 nox nitrogen oxides concentration (parts per 10 million).
# 6 rm average number of rooms per dwelling.
# 7 age proportion of owner-occupied units built prior to 1940.
# 8 dis weighted mean of distances to five Boston employment centres.
# 9 rad index of accessibility to radial highways.
# 10 tax full-value property-tax rate per $10,000.
# 11 ptratio pupil-teacher ratio by town.
# 12 black 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town.
# 13 lstat lower status of the population (percent).
# 14 medv (response Y) median value of owner-occupied homes in $1000s.
###--------------------------
### 2. Create Cross Validation Data Set
# I've written a function and posted it on my website.
# You can load the function with a single line using the source() command.
#
# The function is called CreateCV().
# The dataframe you feed into the function must contain the "resp" column.
# Turn Boston into a tibble (nicer printing; no partial-matching surprises)
library(tidyverse) # install.packages("tidyverse")
Boston <- as_tibble(Boston)
Boston
# Rename "medv" column as "resp" and move "resp" column to 1st
Boston = Boston %>% rename(resp=medv) %>% relocate(resp)
Boston
# Check if there's NA in data
sum(is.na(Boston))
Boston = Boston %>% na.omit() # remove rows with NA (if applicable)
Boston
# Load the file from the course website
# NOTE(review): this sources remote code and needs an internet connection.
source("https://nmimoto.github.io/R/ML-00.txt") # load CreateCV()
# Use this function to create the CV dataset (seed fixed for reproducibility)
CreateCV(Boston, numFolds=5, seed=5480)
# Output (all data.frame), created as global objects:
# Train.set Train.resp
# Test.set Test.resp
# CV.train[[k]] CV.train.resp[[k]]
# CV.valid[[k]] CV.valid.resp[[k]]
Train.set
Test.set
CV.train[[1]]
CV.valid[[1]]
###--------------------------
### 3. Polynomial Regression using 5-fold CV
#- Baseline: degree-3 polynomial fit using ALL of the data (no split)
deg.poly = 3
Fit00 <- lm( resp ~ poly(lstat, deg.poly), data=Boston)
summary(Fit00)
# Scatter plot with the fitted curve overlaid
plot(Boston$lstat, Boston$resp, xlab="X", ylab="Y")
# sort by lstat so the fitted curve is drawn left-to-right
ix = sort(Boston$lstat, index.return=TRUE)$ix
lines(Boston$lstat[ix], Fit00$fitted[ix], lwd=2, col="black" )
##--------------------------
## 3a. Polynomial Regression for d=3 and k=5
# k=1 2 3 4 5 Test
# [84][84][84][84][84] x[86]
deg.poly = 3
k=5
# Fit on fold k's training portion only
Fit01 <- lm( resp ~ poly(lstat, deg.poly), data=CV.train[[k]] )
summary(Fit01)
#- CV Training MSE (mean squared residual on fold k's training portion)
MSE.train <- mean(Fit01$residuals^2)
#- CV Validation MSE (prediction error on fold k's held-out portion)
Fit01.pred<- predict(Fit01, newdata=CV.valid[[k]])
MSE.valid <- mean((CV.valid[[k]]$resp - Fit01.pred)^2)
c(MSE.train, MSE.valid)
# Plot the fit: training points (black), validation points (red)
plot(CV.train[[k]]$lstat, CV.train[[k]]$resp, xlab="X", ylab="Y")
lines(CV.valid[[k]]$lstat, CV.valid[[k]]$resp, col="red", type="p", pch=19)
# sort by lstat so the fitted curve is drawn left-to-right
ix = sort(CV.train[[k]]$lstat, index.return=TRUE)$ix
lines(CV.train[[k]]$lstat[ix], Fit01$fitted[ix], lwd=2, col="black" )
text(30,49, paste("d=",deg.poly,": MSE.tr=",round(MSE.train,2)))
text(30,46, paste("k=",k, ": MSE.va=",round(MSE.valid,2)), col="red")
# run this section for different d, and different k.
# layout( matrix(1:5, 1,5) )
# layout(1)
##--------------------------
## 3b. Polynomial Regression for d=1:10 and k=1:5
# k=1 2 3 4 5 Test
# [84][84][84][84][84] x[86]
# Fill a 5x10 grid of CV errors: rows = folds k=1..5, columns = degree d=1..10
MSE.train <- MSE.valid <- matrix(0, 5, 10)
for (deg.poly in 1:10) {
  for (k in 1:5) {
    Fit01 <- lm(resp ~ poly(lstat, deg.poly), data = CV.train[[k]])
    # (removed a dead summary(Fit01) call: inside a loop its value is
    #  discarded, so it only wasted computation)
    #--- CV Training MSE for (k, d)
    MSE.train[k, deg.poly] <- mean(Fit01$residuals^2)
    #--- CV Validation MSE for (k, d)
    Fit01.pred <- predict(Fit01, newdata = CV.valid[[k]])
    MSE.valid[k, deg.poly] <- mean((CV.valid[[k]]$resp - Fit01.pred)^2)
  }
}
MSE.train
MSE.valid
# Average the fold-by-degree MSE matrices over the 5 folds (one value per degree)
Av.MSE.train <- colMeans(MSE.train)
Av.MSE.valid <- colMeans(MSE.valid)
# Plot average training MSE (black) and validation MSE (red) against degree
plot(Av.MSE.train, type = "o", ylab = "MSE")
lines(Av.MSE.valid, type = "o", col = "red")
legend(2, 40, lty = 1, c("Av. Train MSE", "Av. Valid MSE"), col = c("black", "red"))
###--- Make decision about best deg.poly to use from the Av MSE plot
###--------------------------
### 4. Best Polynomial Regression Model for entire Training set
# Using the Av MSE plot, choose d=5 as the best degree
deg.poly = 5
# Train Test
# [ 420 ] [ 86 ]
# Refit the chosen model on the entire training set
Fit05 <- lm( resp ~ poly(lstat, deg.poly), data=Train.set)
summary(Fit05)
#- Training MSE of the final model on the full training set
MSE.train <- mean(Fit05$residuals^2)
#- Test MSE on the held-out test set
pred<- predict(Fit05, newdata=Test.set)
MSE.test <- mean((Test.set$resp - pred)^2)
c(MSE.train, MSE.test) # Final Fit Result
c(Av.MSE.train[5], Av.MSE.valid[5]) # Recall from CV section
# Plot the final fit: training points (black), test points (red),
# fitted curve (blue); annotate with the two MSEs
plot(Train.set$lstat, Train.set$resp, xlab="lstat", ylab="resp", main="Final Model")
lines(Test.set$lstat, Test.set$resp, col="red", type="p", pch=19)
# sort by lstat so the fitted curve is drawn left-to-right
ix = sort(Train.set$lstat, index.return=TRUE)$ix
lines(Train.set$lstat[ix], Fit05$fitted[ix], lwd=2, col="blue" )
text(30,49, paste("d=",deg.poly,": MSE.train=",round(MSE.train,2)))
text(30,46, paste(" MSE.test=",round(MSE.test,2)), col="red")