As we did in Ch2-Lab.
library(MASS) # install.packages("MASS")
data(Boston) # load data
head(Boston, 10) # see first 10 lines of data
## crim zn indus chas nox rm age dis rad
## 1 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1
## 2 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2
## 3 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2
## 4 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.0622 3
## 5 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.0622 3
## 6 0.02985 0.0 2.18 0 0.458 6.430 58.7 6.0622 3
## 7 0.08829 12.5 7.87 0 0.524 6.012 66.6 5.5605 5
## 8 0.14455 12.5 7.87 0 0.524 6.172 96.1 5.9505 5
## 9 0.21124 12.5 7.87 0 0.524 5.631 100.0 6.0821 5
## 10 0.17004 12.5 7.87 0 0.524 6.004 85.9 6.5921 5
## tax ptratio black lstat medv
## 1 296 15.3 396.90 4.98 24.0
## 2 242 17.8 396.90 9.14 21.6
## 3 242 17.8 392.83 4.03 34.7
## 4 222 18.7 394.63 2.94 33.4
## 5 222 18.7 396.90 5.33 36.2
## 6 222 18.7 394.12 5.21 28.7
## 7 311 15.2 395.60 12.43 22.9
## 8 311 15.2 396.90 19.15 27.1
## 9 311 15.2 386.63 29.93 16.5
## 10 311 15.2 386.71 17.10 18.9
## [1] "data.frame"
## [1] 506 14
## [1] "crim" "zn" "indus" "chas" "nox"
## [6] "rm" "age" "dis" "rad" "tax"
## [11] "ptratio" "black" "lstat" "medv"
?Boston # see explanation for variables
# 1 crim per capita crime rate by town.
# 2 zn proportion of residential land zoned for lots over 25,000 sq.ft.
# 3 indus proportion of non-retail business acres per town.
# 4 chas Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
# 5 nox nitrogen oxides concentration (parts per 10 million).
# 6 rm average number of rooms per dwelling.
# 7 age proportion of owner-occupied units built prior to 1940.
# 8 dis weighted mean of distances to five Boston employment centres.
# 9 rad index of accessibility to radial highways.
# 10 tax full-value property-tax rate per $10,000.
# 11 ptratio pupil-teacher ratio by town.
# 12 black 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town.
# 13 lstat lower status of the population (percent).
# 14 medv (response Y) median value of owner-occupied homes in $1000s.
I’ve written a function and posed on my website. You can load the function using single line with sourse() command.
The function is called CreateCV(). The dataframe you feed into the funciton must contain the “resp” column.
## Warning: package 'ggplot2' was built under R version
## 4.4.1
## ── Attaching core tidyverse packages ───────────────────
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## # A tibble: 506 × 14
## crim zn indus chas nox rm age dis
## <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl>
## 1 0.00632 18 2.31 0 0.538 6.58 65.2 4.09
## 2 0.0273 0 7.07 0 0.469 6.42 78.9 4.97
## 3 0.0273 0 7.07 0 0.469 7.18 61.1 4.97
## 4 0.0324 0 2.18 0 0.458 7.00 45.8 6.06
## 5 0.0690 0 2.18 0 0.458 7.15 54.2 6.06
## 6 0.0298 0 2.18 0 0.458 6.43 58.7 6.06
## 7 0.0883 12.5 7.87 0 0.524 6.01 66.6 5.56
## 8 0.145 12.5 7.87 0 0.524 6.17 96.1 5.95
## 9 0.211 12.5 7.87 0 0.524 5.63 100 6.08
## 10 0.170 12.5 7.87 0 0.524 6.00 85.9 6.59
## # ℹ 496 more rows
## # ℹ 6 more variables: rad <int>, tax <dbl>,
## # ptratio <dbl>, black <dbl>, lstat <dbl>, medv <dbl>
# Rename "medv" column as "resp" and move "resp" columnm to 1st
Boston = Boston %>% rename(resp=medv) %>% relocate(resp)
Boston
## # A tibble: 506 × 14
## resp crim zn indus chas nox rm age
## <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 24 0.00632 18 2.31 0 0.538 6.58 65.2
## 2 21.6 0.0273 0 7.07 0 0.469 6.42 78.9
## 3 34.7 0.0273 0 7.07 0 0.469 7.18 61.1
## 4 33.4 0.0324 0 2.18 0 0.458 7.00 45.8
## 5 36.2 0.0690 0 2.18 0 0.458 7.15 54.2
## 6 28.7 0.0298 0 2.18 0 0.458 6.43 58.7
## 7 22.9 0.0883 12.5 7.87 0 0.524 6.01 66.6
## 8 27.1 0.145 12.5 7.87 0 0.524 6.17 96.1
## 9 16.5 0.211 12.5 7.87 0 0.524 5.63 100
## 10 18.9 0.170 12.5 7.87 0 0.524 6.00 85.9
## # ℹ 496 more rows
## # ℹ 6 more variables: dis <dbl>, rad <int>, tax <dbl>,
## # ptratio <dbl>, black <dbl>, lstat <dbl>
## [1] 0
## # A tibble: 506 × 14
## resp crim zn indus chas nox rm age
## <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 24 0.00632 18 2.31 0 0.538 6.58 65.2
## 2 21.6 0.0273 0 7.07 0 0.469 6.42 78.9
## 3 34.7 0.0273 0 7.07 0 0.469 7.18 61.1
## 4 33.4 0.0324 0 2.18 0 0.458 7.00 45.8
## 5 36.2 0.0690 0 2.18 0 0.458 7.15 54.2
## 6 28.7 0.0298 0 2.18 0 0.458 6.43 58.7
## 7 22.9 0.0883 12.5 7.87 0 0.524 6.01 66.6
## 8 27.1 0.145 12.5 7.87 0 0.524 6.17 96.1
## 9 16.5 0.211 12.5 7.87 0 0.524 5.63 100
## 10 18.9 0.170 12.5 7.87 0 0.524 6.00 85.9
## # ℹ 496 more rows
## # ℹ 6 more variables: dis <dbl>, rad <int>, tax <dbl>,
## # ptratio <dbl>, black <dbl>, lstat <dbl>
# Load the file from my website
source("https://nmimoto.github.io/R/ML-00.txt") # load CreateCV()
# Use this function to create CV dataset
CreateCV(Boston, numFolds=5, seed=5480)
##
## number of folds: 5
## number of obs: 506
## training data: 84 x 5 = 420
## size of test data: 86
##
## Global variables created:
## Train.set Train.resp
## Test.set Test.resp
## CV.train[[k]] CV.train.resp[[k]]
## CV.valid[[k]] CV.valid.resp[[k]]
# Output (all data.frame):
# Train.set Train.resp
# Test.set Test.resp
# CV.train[[[[k]]]] CV.train.resp[[[[k]]]]
# CV.valid[[[[k]]]] CV.valid.resp[[[[k]]]]
Train.set
## # A tibble: 420 × 14
## resp crim zn indus chas nox rm age
## <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 24 0.00632 18 2.31 0 0.538 6.58 65.2
## 2 21.6 0.0273 0 7.07 0 0.469 6.42 78.9
## 3 34.7 0.0273 0 7.07 0 0.469 7.18 61.1
## 4 33.4 0.0324 0 2.18 0 0.458 7.00 45.8
## 5 28.7 0.0298 0 2.18 0 0.458 6.43 58.7
## 6 22.9 0.0883 12.5 7.87 0 0.524 6.01 66.6
## 7 27.1 0.145 12.5 7.87 0 0.524 6.17 96.1
## 8 16.5 0.211 12.5 7.87 0 0.524 5.63 100
## 9 18.9 0.170 12.5 7.87 0 0.524 6.00 85.9
## 10 21.7 0.0938 12.5 7.87 0 0.524 5.89 39
## # ℹ 410 more rows
## # ℹ 6 more variables: dis <dbl>, rad <int>, tax <dbl>,
## # ptratio <dbl>, black <dbl>, lstat <dbl>
## # A tibble: 86 × 14
## resp crim zn indus chas nox rm age
## <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 36.2 0.0690 0 2.18 0 0.458 7.15 54.2
## 2 15 0.225 12.5 7.87 0 0.524 6.38 94.3
## 3 18.9 0.117 12.5 7.87 0 0.524 6.01 82.9
## 4 20.4 0.630 0 8.14 0 0.538 5.95 61.8
## 5 19.6 0.852 0 8.14 0 0.538 5.96 89.2
## 6 18.9 0.0642 0 5.96 0 0.499 5.93 68.2
## 7 34.9 0.0336 75 2.95 0 0.428 7.02 15.8
## 8 26.6 0.127 0 6.91 0 0.448 6.77 2.9
## 9 25.3 0.142 0 6.91 0 0.448 6.17 6.6
## 10 19.3 0.171 0 6.91 0 0.448 5.68 33.8
## # ℹ 76 more rows
## # ℹ 6 more variables: dis <dbl>, rad <int>, tax <dbl>,
## # ptratio <dbl>, black <dbl>, lstat <dbl>
## # A tibble: 336 × 14
## resp crim zn indus chas nox rm age
## <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 24 0.00632 18 2.31 0 0.538 6.58 65.2
## 2 33.4 0.0324 0 2.18 0 0.458 7.00 45.8
## 3 28.7 0.0298 0 2.18 0 0.458 6.43 58.7
## 4 22.9 0.0883 12.5 7.87 0 0.524 6.01 66.6
## 5 27.1 0.145 12.5 7.87 0 0.524 6.17 96.1
## 6 21.7 0.0938 12.5 7.87 0 0.524 5.89 39
## 7 18.2 0.638 0 8.14 0 0.538 6.10 84.5
## 8 19.9 0.627 0 8.14 0 0.538 5.83 56.5
## 9 23.1 1.05 0 8.14 0 0.538 5.94 29.3
## 10 17.5 0.784 0 8.14 0 0.538 5.99 81.7
## # ℹ 326 more rows
## # ℹ 6 more variables: dis <dbl>, rad <int>, tax <dbl>,
## # ptratio <dbl>, black <dbl>, lstat <dbl>
## # A tibble: 84 × 14
## resp crim zn indus chas nox rm age
## <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 21.6 0.0273 0 7.07 0 0.469 6.42 78.9
## 2 34.7 0.0273 0 7.07 0 0.469 7.18 61.1
## 3 16.5 0.211 12.5 7.87 0 0.524 5.63 100
## 4 18.9 0.170 12.5 7.87 0 0.524 6.00 85.9
## 5 13.9 0.841 0 8.14 0 0.538 5.60 85.7
## 6 12.7 1.13 0 8.14 0 0.538 5.71 94.1
## 7 13.1 1.15 0 8.14 0 0.538 5.70 95
## 8 21 0.0801 0 5.96 0 0.499 5.85 41.5
## 9 16.6 0.229 0 6.91 0 0.448 6.03 85.5
## 10 23.3 0.154 25 5.13 0 0.453 6.14 29.2
## # ℹ 74 more rows
## # ℹ 6 more variables: dis <dbl>, rad <int>, tax <dbl>,
## # ptratio <dbl>, black <dbl>, lstat <dbl>
#- If all data is used
deg.poly = 3
Fit00 <- lm( resp ~ poly(lstat, deg.poly, raw=TRUE), data=Boston)
summary(Fit00)
##
## Call:
## lm(formula = resp ~ poly(lstat, deg.poly, raw = TRUE), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.5441 -3.7122 -0.5145 2.4846 26.4153
##
## Coefficients:
## Estimate
## (Intercept) 48.6496253
## poly(lstat, deg.poly, raw = TRUE)1 -3.8655928
## poly(lstat, deg.poly, raw = TRUE)2 0.1487385
## poly(lstat, deg.poly, raw = TRUE)3 -0.0020039
## Std. Error t value
## (Intercept) 1.4347240 33.909
## poly(lstat, deg.poly, raw = TRUE)1 0.3287861 -11.757
## poly(lstat, deg.poly, raw = TRUE)2 0.0212987 6.983
## poly(lstat, deg.poly, raw = TRUE)3 0.0003997 -5.013
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## poly(lstat, deg.poly, raw = TRUE)1 < 2e-16 ***
## poly(lstat, deg.poly, raw = TRUE)2 9.18e-12 ***
## poly(lstat, deg.poly, raw = TRUE)3 7.43e-07 ***
## ---
## Signif. codes:
## 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.396 on 502 degrees of freedom
## Multiple R-squared: 0.6578, Adjusted R-squared: 0.6558
## F-statistic: 321.7 on 3 and 502 DF, p-value: < 2.2e-16
plot(Boston$lstat, Boston$resp, xlab="X", ylab="Y")
ix = sort(Boston$lstat, index.return=TRUE)$ix
lines(Boston$lstat[ix], Fit00$fitted[ix], lwd=2, col="black" )
# k=1 2 3 4 5 Teat
# [84][84][84][84][84] x[86]
deg.poly = 3
k=5
Fit01 <- lm( resp ~ poly(lstat, deg.poly, raw=TRUE), data=CV.train[[k]] )
summary(Fit01)
##
## Call:
## lm(formula = resp ~ poly(lstat, deg.poly, raw = TRUE), data = CV.train[[k]])
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.0501 -3.4933 -0.4797 2.6795 22.5574
##
## Coefficients:
## Estimate
## (Intercept) 51.7159242
## poly(lstat, deg.poly, raw = TRUE)1 -4.4918795
## poly(lstat, deg.poly, raw = TRUE)2 0.1847999
## poly(lstat, deg.poly, raw = TRUE)3 -0.0026298
## Std. Error t value
## (Intercept) 1.6942811 30.524
## poly(lstat, deg.poly, raw = TRUE)1 0.4040236 -11.118
## poly(lstat, deg.poly, raw = TRUE)2 0.0270514 6.831
## poly(lstat, deg.poly, raw = TRUE)3 0.0005222 -5.036
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## poly(lstat, deg.poly, raw = TRUE)1 < 2e-16 ***
## poly(lstat, deg.poly, raw = TRUE)2 4.03e-11 ***
## poly(lstat, deg.poly, raw = TRUE)3 7.81e-07 ***
## ---
## Signif. codes:
## 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.08 on 332 degrees of freedom
## Multiple R-squared: 0.7156, Adjusted R-squared: 0.713
## F-statistic: 278.5 on 3 and 332 DF, p-value: < 2.2e-16
#- CV Training MSE
MSE.train <- mean(Fit01$residuals^2)
#- CV Validation MSE
Fit01.pred<- predict(Fit01, newdata=CV.valid[[k]])
MSE.valid <- mean((CV.valid[[k]]$resp - Fit01.pred)^2)
c(MSE.train, MSE.valid)
## [1] 25.49821 28.58537
# Plot the fit
plot(CV.train[[k]]$lstat, CV.train[[k]]$resp, xlab="X", ylab="Y")
lines(CV.valid[[k]]$lstat, CV.valid[[k]]$resp, col="red", type="p", pch=19)
ix = sort(CV.train[[k]]$lstat, index.return=TRUE)$ix
lines(CV.train[[k]]$lstat[ix], Fit01$fitted[ix], lwd=2, col="black" )
text(30,49, paste("d=",deg.poly,": MSE.tr=",round(MSE.train,2)))
text(30,46, paste("k=",k, ": MSE.va=",round(MSE.valid,2)), col="red")
# k=1 2 3 4 5 Test
# [84][84][84][84][84] x[86]
MSE.train <- MSE.valid <- matrix(0, 5, 10)
for (deg.poly in 1:10) {
for (k in 1:5) {
Fit01 <- lm( resp ~ poly(lstat, deg.poly, raw=TRUE), data=CV.train[[k]])
summary(Fit01)
#--- CV Training MSE
MSE.train[k, deg.poly] <- mean(Fit01$residuals^2)
#--- CV Validation MSE
Fit01.pred<- predict(Fit01, newdata=CV.valid[[k]])
MSE.valid[k, deg.poly] <- mean((CV.valid[[k]]$resp - Fit01.pred)^2)
}
}
MSE.train
## [,1] [,2] [,3] [,4] [,5]
## [1,] 36.24899 27.08953 25.84182 24.21842 23.55096
## [2,] 34.56513 26.53025 24.65327 23.08652 22.44040
## [3,] 38.52559 28.92240 26.71601 24.79355 24.29306
## [4,] 36.78962 28.76672 27.05489 25.50835 24.91792
## [5,] 37.76581 27.44607 25.49821 23.50494 23.02862
## [,6] [,7] [,8] [,9] [,10]
## [1,] 23.43695 23.43695 23.43062 23.21270 23.11963
## [2,] 22.36687 22.23037 22.20459 21.91391 21.91374
## [3,] 24.06215 24.02984 24.02645 23.77189 23.70269
## [4,] 24.70608 24.64202 24.60266 24.35297 24.35027
## [5,] 22.86241 22.74223 22.69809 22.57818 22.54991
## [,1] [,2] [,3] [,4] [,5]
## [1,] 39.08241 30.64301 27.09155 24.81139 24.75677
## [2,] 46.13266 33.26572 32.01973 29.49029 29.35147
## [3,] 30.00595 23.35932 23.61148 22.78120 22.05074
## [4,] 36.99215 24.09156 22.12401 19.49744 18.99647
## [5,] 33.47810 29.78657 28.58537 27.88259 26.87975
## [,6] [,7] [,8] [,9] [,10]
## [1,] 24.44796 24.45193 24.52577 24.36412 24.63450
## [2,] 28.87407 29.12594 29.55401 31.35284 31.46868
## [3,] 22.35141 22.16244 22.14949 22.14141 22.24910
## [4,] 19.12167 19.06789 19.33934 19.70460 19.62039
## [5,] 26.76655 26.98511 27.27634 26.78139 26.70438
Av.MSE.train = apply(MSE.train, 2, mean)
Av.MSE.valid = apply(MSE.valid, 2, mean)
# Plot Average MSE
plot(Av.MSE.train, type="o", ylab="MSE")
lines(Av.MSE.valid, type="o", col="red")
legend(2, 40, lty=1, c("Av. Train MSE","Av. Valid MSE"), col=c("black", "red"))
# Using Av MSE plot, I choose d=5 is the best
deg.poly = 5
# Train Test
# [ 420 ] [ 86 ]
Fit05 <- lm( resp ~ poly(lstat, deg.poly, raw=TRUE), data=Train.set)
summary(Fit05)
##
## Call:
## lm(formula = resp ~ poly(lstat, deg.poly, raw = TRUE), data = Train.set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.0360 -2.9462 -0.7869 2.1442 24.4587
##
## Coefficients:
## Estimate
## (Intercept) 7.174e+01
## poly(lstat, deg.poly, raw = TRUE)1 -1.351e+01
## poly(lstat, deg.poly, raw = TRUE)2 1.475e+00
## poly(lstat, deg.poly, raw = TRUE)3 -8.068e-02
## poly(lstat, deg.poly, raw = TRUE)4 2.082e-03
## poly(lstat, deg.poly, raw = TRUE)5 -2.015e-05
## Std. Error t value
## (Intercept) 4.088e+00 17.550
## poly(lstat, deg.poly, raw = TRUE)1 1.805e+00 -7.488
## poly(lstat, deg.poly, raw = TRUE)2 2.763e-01 5.340
## poly(lstat, deg.poly, raw = TRUE)3 1.869e-02 -4.317
## poly(lstat, deg.poly, raw = TRUE)4 5.703e-04 3.650
## poly(lstat, deg.poly, raw = TRUE)5 6.397e-06 -3.150
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## poly(lstat, deg.poly, raw = TRUE)1 4.26e-13 ***
## poly(lstat, deg.poly, raw = TRUE)2 1.54e-07 ***
## poly(lstat, deg.poly, raw = TRUE)3 1.98e-05 ***
## poly(lstat, deg.poly, raw = TRUE)4 0.000295 ***
## poly(lstat, deg.poly, raw = TRUE)5 0.001750 **
## ---
## Signif. codes:
## 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.905 on 414 degrees of freedom
## Multiple R-squared: 0.7186, Adjusted R-squared: 0.7152
## F-statistic: 211.5 on 5 and 414 DF, p-value: < 2.2e-16
#- CV Training MSE
MSE.train <- mean(Fit05$residuals^2)
#- CV Validation MSE
pred<- predict(Fit05, newdata=Test.set)
MSE.test <- mean((Test.set$resp - pred)^2)
c(MSE.train, MSE.test) # Final Fit Result
## [1] 23.72002 43.09228
## [1] 23.64619 24.40704
# Plot the fit
plot(Train.set$lstat, Train.set$resp, xlab="lstat", ylab="resp", main="Final Model")
lines(Test.set$lstat, Test.set$resp, col="red", type="p", pch=19)
ix = sort(Train.set$lstat, index.return=TRUE)$ix
lines(Train.set$lstat[ix], Fit05$fitted[ix], lwd=2, col="blue" )
text(30,49, paste("d=",deg.poly,": MSE.train=",round(MSE.train,2)))
text(30,46, paste(" MSE.test=",round(MSE.test,2)), col="red")
###--------------------------
### 1. Load Boston data from package MASS
# As we did in Ch2-Lab.
library(MASS) # install.packages("MASS")
data(Boston) # load data
head(Boston, 10) # see first 10 lines of data
class(Boston) # it is a data frame
dim(Boston) # (num of row, num of col)
names(Boston) # column names
?Boston # see explanation for variables
# 1 crim per capita crime rate by town.
# 2 zn proportion of residential land zoned for lots over 25,000 sq.ft.
# 3 indus proportion of non-retail business acres per town.
# 4 chas Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
# 5 nox nitrogen oxides concentration (parts per 10 million).
# 6 rm average number of rooms per dwelling.
# 7 age proportion of owner-occupied units built prior to 1940.
# 8 dis weighted mean of distances to five Boston employment centres.
# 9 rad index of accessibility to radial highways.
# 10 tax full-value property-tax rate per $10,000.
# 11 ptratio pupil-teacher ratio by town.
# 12 black 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town.
# 13 lstat lower status of the population (percent).
# 14 medv (response Y) median value of owner-occupied homes in $1000s.
###--------------------------
### 2. Create Cross Validation Data Set
# I've written a function and posed on my website.
# You can load the function using single line with sourse() command.
#
# The function is called CreateCV().
# The dataframe you feed into the funciton must contain the "resp" column.
# Turn Boston into Tibble
library(tidyverse) # install.packages(tidyverse")
Boston <- as_tibble(Boston)
Boston
# Rename "medv" column as "resp" and move "resp" columnm to 1st
Boston = Boston %>% rename(resp=medv) %>% relocate(resp)
Boston
# Check if there's NA in data
sum(is.na(Boston))
Boston = Boston %>% na.omit() # remove rows with NA (if applicable)
Boston
# Load the file from my website
source("https://nmimoto.github.io/R/ML-00.txt") # load CreateCV()
# Use this function to create CV dataset
CreateCV(Boston, numFolds=5, seed=5480)
# Output (all data.frame):
# Train.set Train.resp
# Test.set Test.resp
# CV.train[[[[k]]]] CV.train.resp[[[[k]]]]
# CV.valid[[[[k]]]] CV.valid.resp[[[[k]]]]
Train.set
Test.set
CV.train[[1]]
CV.valid[[1]]
###--------------------------
### 3. Polynomial Regression using 5-fold CV
#- If all data is used
deg.poly = 3
Fit00 <- lm( resp ~ poly(lstat, deg.poly, raw=TRUE), data=Boston)
summary(Fit00)
plot(Boston$lstat, Boston$resp, xlab="X", ylab="Y")
ix = sort(Boston$lstat, index.return=TRUE)$ix
lines(Boston$lstat[ix], Fit00$fitted[ix], lwd=2, col="black" )
##--------------------------
## 3a. Polynomial Regression for d=3 and k=5
# k=1 2 3 4 5 Teat
# [84][84][84][84][84] x[86]
deg.poly = 3
k=5
Fit01 <- lm( resp ~ poly(lstat, deg.poly, raw=TRUE), data=CV.train[[k]] )
summary(Fit01)
#- CV Training MSE
MSE.train <- mean(Fit01$residuals^2)
#- CV Validation MSE
Fit01.pred<- predict(Fit01, newdata=CV.valid[[k]])
MSE.valid <- mean((CV.valid[[k]]$resp - Fit01.pred)^2)
c(MSE.train, MSE.valid)
# Plot the fit
plot(CV.train[[k]]$lstat, CV.train[[k]]$resp, xlab="X", ylab="Y")
lines(CV.valid[[k]]$lstat, CV.valid[[k]]$resp, col="red", type="p", pch=19)
ix = sort(CV.train[[k]]$lstat, index.return=TRUE)$ix
lines(CV.train[[k]]$lstat[ix], Fit01$fitted[ix], lwd=2, col="black" )
text(30,49, paste("d=",deg.poly,": MSE.tr=",round(MSE.train,2)))
text(30,46, paste("k=",k, ": MSE.va=",round(MSE.valid,2)), col="red")
# run this section for different d, and different k.
# layout( matrix(1:5, 1,5) )
# layout(1)
##--------------------------
## 3b. Polynomial Regression for d=1:10 and k=1:5
# k=1 2 3 4 5 Test
# [84][84][84][84][84] x[86]
MSE.train <- MSE.valid <- matrix(0, 5, 10)
for (deg.poly in 1:10) {
for (k in 1:5) {
Fit01 <- lm( resp ~ poly(lstat, deg.poly, raw=TRUE), data=CV.train[[k]])
summary(Fit01)
#--- CV Training MSE
MSE.train[k, deg.poly] <- mean(Fit01$residuals^2)
#--- CV Validation MSE
Fit01.pred<- predict(Fit01, newdata=CV.valid[[k]])
MSE.valid[k, deg.poly] <- mean((CV.valid[[k]]$resp - Fit01.pred)^2)
}
}
MSE.train
MSE.valid
Av.MSE.train = apply(MSE.train, 2, mean)
Av.MSE.valid = apply(MSE.valid, 2, mean)
# Plot Average MSE
plot(Av.MSE.train, type="o", ylab="MSE")
lines(Av.MSE.valid, type="o", col="red")
legend(2, 40, lty=1, c("Av. Train MSE","Av. Valid MSE"), col=c("black", "red"))
###--- Make decision about best deg.poly to use from the Av MSE plot
###--------------------------
### 4. Best Polynomial Regression Model for entire Training set
# Using Av MSE plot, I choose d=5 is the best
deg.poly = 5
# Train Test
# [ 420 ] [ 86 ]
Fit05 <- lm( resp ~ poly(lstat, deg.poly, raw=TRUE), data=Train.set)
summary(Fit05)
#- CV Training MSE
MSE.train <- mean(Fit05$residuals^2)
#- CV Validation MSE
pred<- predict(Fit05, newdata=Test.set)
MSE.test <- mean((Test.set$resp - pred)^2)
c(MSE.train, MSE.test) # Final Fit Result
c(Av.MSE.train[5], Av.MSE.valid[5]) # Recall from CV section
# Plot the fit
plot(Train.set$lstat, Train.set$resp, xlab="lstat", ylab="resp", main="Final Model")
lines(Test.set$lstat, Test.set$resp, col="red", type="p", pch=19)
ix = sort(Train.set$lstat, index.return=TRUE)$ix
lines(Train.set$lstat[ix], Fit05$fitted[ix], lwd=2, col="blue" )
text(30,49, paste("d=",deg.poly,": MSE.train=",round(MSE.train,2)))
text(30,46, paste(" MSE.test=",round(MSE.test,2)), col="red")