1. Load Boston data from package MASS

As we did in Ch2-Lab.

library(MASS)  # install.packages("MASS")

data(Boston)  # load data

head(Boston, 10) # see first 10 lines of data
##       crim   zn indus chas   nox    rm   age    dis rad
## 1  0.00632 18.0  2.31    0 0.538 6.575  65.2 4.0900   1
## 2  0.02731  0.0  7.07    0 0.469 6.421  78.9 4.9671   2
## 3  0.02729  0.0  7.07    0 0.469 7.185  61.1 4.9671   2
## 4  0.03237  0.0  2.18    0 0.458 6.998  45.8 6.0622   3
## 5  0.06905  0.0  2.18    0 0.458 7.147  54.2 6.0622   3
## 6  0.02985  0.0  2.18    0 0.458 6.430  58.7 6.0622   3
## 7  0.08829 12.5  7.87    0 0.524 6.012  66.6 5.5605   5
## 8  0.14455 12.5  7.87    0 0.524 6.172  96.1 5.9505   5
## 9  0.21124 12.5  7.87    0 0.524 5.631 100.0 6.0821   5
## 10 0.17004 12.5  7.87    0 0.524 6.004  85.9 6.5921   5
##    tax ptratio  black lstat medv
## 1  296    15.3 396.90  4.98 24.0
## 2  242    17.8 396.90  9.14 21.6
## 3  242    17.8 392.83  4.03 34.7
## 4  222    18.7 394.63  2.94 33.4
## 5  222    18.7 396.90  5.33 36.2
## 6  222    18.7 394.12  5.21 28.7
## 7  311    15.2 395.60 12.43 22.9
## 8  311    15.2 396.90 19.15 27.1
## 9  311    15.2 386.63 29.93 16.5
## 10 311    15.2 386.71 17.10 18.9
class(Boston)   # it is a data frame
## [1] "data.frame"
dim(Boston)     # (num of row, num of col)
## [1] 506  14
names(Boston)   # column names
##  [1] "crim"    "zn"      "indus"   "chas"    "nox"    
##  [6] "rm"      "age"     "dis"     "rad"     "tax"    
## [11] "ptratio" "black"   "lstat"   "medv"
?Boston         # see explanation for variables


#  1  crim     per capita crime rate by town.
#  2  zn       proportion of residential land zoned for lots over 25,000 sq.ft.
#  3  indus    proportion of non-retail business acres per town.
#  4  chas     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
#  5  nox      nitrogen oxides concentration (parts per 10 million).
#  6  rm       average number of rooms per dwelling.
#  7  age      proportion of owner-occupied units built prior to 1940.
#  8  dis      weighted mean of distances to five Boston employment centres.
#  9  rad      index of accessibility to radial highways.
#  10 tax      full-value property-tax rate per $10,000.
#  11 ptratio  pupil-teacher ratio by town.
#  12 black    1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town.
#  13 lstat    lower status of the population (percent).

#  14 medv (response Y) median value of owner-occupied homes in $1000s.



2. Create Cross Validation Data Set

I’ve written a function and posed on my website. You can load the function using single line with sourse() command.

The function is called CreateCV(). The dataframe you feed into the funciton must contain the “resp” column.

# Turn Boston into Tibble
library(tidyverse)     # install.packages(tidyverse")
## Warning: package 'ggplot2' was built under R version
## 4.4.1
## ── Attaching core tidyverse packages ───────────────────
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Boston <- as_tibble(Boston)
Boston
## # A tibble: 506 × 14
##       crim    zn indus  chas   nox    rm   age   dis
##      <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl>
##  1 0.00632  18    2.31     0 0.538  6.58  65.2  4.09
##  2 0.0273    0    7.07     0 0.469  6.42  78.9  4.97
##  3 0.0273    0    7.07     0 0.469  7.18  61.1  4.97
##  4 0.0324    0    2.18     0 0.458  7.00  45.8  6.06
##  5 0.0690    0    2.18     0 0.458  7.15  54.2  6.06
##  6 0.0298    0    2.18     0 0.458  6.43  58.7  6.06
##  7 0.0883   12.5  7.87     0 0.524  6.01  66.6  5.56
##  8 0.145    12.5  7.87     0 0.524  6.17  96.1  5.95
##  9 0.211    12.5  7.87     0 0.524  5.63 100    6.08
## 10 0.170    12.5  7.87     0 0.524  6.00  85.9  6.59
## # ℹ 496 more rows
## # ℹ 6 more variables: rad <int>, tax <dbl>,
## #   ptratio <dbl>, black <dbl>, lstat <dbl>, medv <dbl>
# Rename "medv" column as "resp" and move "resp" columnm to 1st
Boston = Boston %>% rename(resp=medv) %>% relocate(resp)
Boston
## # A tibble: 506 × 14
##     resp    crim    zn indus  chas   nox    rm   age
##    <dbl>   <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1  24   0.00632  18    2.31     0 0.538  6.58  65.2
##  2  21.6 0.0273    0    7.07     0 0.469  6.42  78.9
##  3  34.7 0.0273    0    7.07     0 0.469  7.18  61.1
##  4  33.4 0.0324    0    2.18     0 0.458  7.00  45.8
##  5  36.2 0.0690    0    2.18     0 0.458  7.15  54.2
##  6  28.7 0.0298    0    2.18     0 0.458  6.43  58.7
##  7  22.9 0.0883   12.5  7.87     0 0.524  6.01  66.6
##  8  27.1 0.145    12.5  7.87     0 0.524  6.17  96.1
##  9  16.5 0.211    12.5  7.87     0 0.524  5.63 100  
## 10  18.9 0.170    12.5  7.87     0 0.524  6.00  85.9
## # ℹ 496 more rows
## # ℹ 6 more variables: dis <dbl>, rad <int>, tax <dbl>,
## #   ptratio <dbl>, black <dbl>, lstat <dbl>
# Check if there's NA in data
sum(is.na(Boston))
## [1] 0
Boston = Boston %>% na.omit()  # remove rows with NA (if applicable)
Boston
## # A tibble: 506 × 14
##     resp    crim    zn indus  chas   nox    rm   age
##    <dbl>   <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1  24   0.00632  18    2.31     0 0.538  6.58  65.2
##  2  21.6 0.0273    0    7.07     0 0.469  6.42  78.9
##  3  34.7 0.0273    0    7.07     0 0.469  7.18  61.1
##  4  33.4 0.0324    0    2.18     0 0.458  7.00  45.8
##  5  36.2 0.0690    0    2.18     0 0.458  7.15  54.2
##  6  28.7 0.0298    0    2.18     0 0.458  6.43  58.7
##  7  22.9 0.0883   12.5  7.87     0 0.524  6.01  66.6
##  8  27.1 0.145    12.5  7.87     0 0.524  6.17  96.1
##  9  16.5 0.211    12.5  7.87     0 0.524  5.63 100  
## 10  18.9 0.170    12.5  7.87     0 0.524  6.00  85.9
## # ℹ 496 more rows
## # ℹ 6 more variables: dis <dbl>, rad <int>, tax <dbl>,
## #   ptratio <dbl>, black <dbl>, lstat <dbl>
# Load the file from my website
source("https://nmimoto.github.io/R/ML-00.txt")   # load CreateCV()


# Use this function to create CV dataset
CreateCV(Boston, numFolds=5, seed=5480)
## 
##     number of folds:  5 
##     number of obs:  506 
##     training data:  84  x  5  =  420 
##     size of test data:  86 
## 
##     Global variables created:  
##       Train.set       Train.resp           
##       Test.set        Test.resp            
##       CV.train[[k]]   CV.train.resp[[k]]   
##       CV.valid[[k]]   CV.valid.resp[[k]]
# Output (all data.frame):
#   Train.set      Train.resp
#   Test.set       Test.resp
#   CV.train[[[[k]]]]  CV.train.resp[[[[k]]]]
#   CV.valid[[[[k]]]]  CV.valid.resp[[[[k]]]]

Train.set
## # A tibble: 420 × 14
##     resp    crim    zn indus  chas   nox    rm   age
##    <dbl>   <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1  24   0.00632  18    2.31     0 0.538  6.58  65.2
##  2  21.6 0.0273    0    7.07     0 0.469  6.42  78.9
##  3  34.7 0.0273    0    7.07     0 0.469  7.18  61.1
##  4  33.4 0.0324    0    2.18     0 0.458  7.00  45.8
##  5  28.7 0.0298    0    2.18     0 0.458  6.43  58.7
##  6  22.9 0.0883   12.5  7.87     0 0.524  6.01  66.6
##  7  27.1 0.145    12.5  7.87     0 0.524  6.17  96.1
##  8  16.5 0.211    12.5  7.87     0 0.524  5.63 100  
##  9  18.9 0.170    12.5  7.87     0 0.524  6.00  85.9
## 10  21.7 0.0938   12.5  7.87     0 0.524  5.89  39  
## # ℹ 410 more rows
## # ℹ 6 more variables: dis <dbl>, rad <int>, tax <dbl>,
## #   ptratio <dbl>, black <dbl>, lstat <dbl>
Test.set
## # A tibble: 86 × 14
##     resp   crim    zn indus  chas   nox    rm   age
##    <dbl>  <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1  36.2 0.0690   0    2.18     0 0.458  7.15  54.2
##  2  15   0.225   12.5  7.87     0 0.524  6.38  94.3
##  3  18.9 0.117   12.5  7.87     0 0.524  6.01  82.9
##  4  20.4 0.630    0    8.14     0 0.538  5.95  61.8
##  5  19.6 0.852    0    8.14     0 0.538  5.96  89.2
##  6  18.9 0.0642   0    5.96     0 0.499  5.93  68.2
##  7  34.9 0.0336  75    2.95     0 0.428  7.02  15.8
##  8  26.6 0.127    0    6.91     0 0.448  6.77   2.9
##  9  25.3 0.142    0    6.91     0 0.448  6.17   6.6
## 10  19.3 0.171    0    6.91     0 0.448  5.68  33.8
## # ℹ 76 more rows
## # ℹ 6 more variables: dis <dbl>, rad <int>, tax <dbl>,
## #   ptratio <dbl>, black <dbl>, lstat <dbl>
CV.train[[1]]
## # A tibble: 336 × 14
##     resp    crim    zn indus  chas   nox    rm   age
##    <dbl>   <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1  24   0.00632  18    2.31     0 0.538  6.58  65.2
##  2  33.4 0.0324    0    2.18     0 0.458  7.00  45.8
##  3  28.7 0.0298    0    2.18     0 0.458  6.43  58.7
##  4  22.9 0.0883   12.5  7.87     0 0.524  6.01  66.6
##  5  27.1 0.145    12.5  7.87     0 0.524  6.17  96.1
##  6  21.7 0.0938   12.5  7.87     0 0.524  5.89  39  
##  7  18.2 0.638     0    8.14     0 0.538  6.10  84.5
##  8  19.9 0.627     0    8.14     0 0.538  5.83  56.5
##  9  23.1 1.05      0    8.14     0 0.538  5.94  29.3
## 10  17.5 0.784     0    8.14     0 0.538  5.99  81.7
## # ℹ 326 more rows
## # ℹ 6 more variables: dis <dbl>, rad <int>, tax <dbl>,
## #   ptratio <dbl>, black <dbl>, lstat <dbl>
CV.valid[[1]]
## # A tibble: 84 × 14
##     resp   crim    zn indus  chas   nox    rm   age
##    <dbl>  <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
##  1  21.6 0.0273   0    7.07     0 0.469  6.42  78.9
##  2  34.7 0.0273   0    7.07     0 0.469  7.18  61.1
##  3  16.5 0.211   12.5  7.87     0 0.524  5.63 100  
##  4  18.9 0.170   12.5  7.87     0 0.524  6.00  85.9
##  5  13.9 0.841    0    8.14     0 0.538  5.60  85.7
##  6  12.7 1.13     0    8.14     0 0.538  5.71  94.1
##  7  13.1 1.15     0    8.14     0 0.538  5.70  95  
##  8  21   0.0801   0    5.96     0 0.499  5.85  41.5
##  9  16.6 0.229    0    6.91     0 0.448  6.03  85.5
## 10  23.3 0.154   25    5.13     0 0.453  6.14  29.2
## # ℹ 74 more rows
## # ℹ 6 more variables: dis <dbl>, rad <int>, tax <dbl>,
## #   ptratio <dbl>, black <dbl>, lstat <dbl>



3. Polynomial Regression using 5-fold CV

#- If all data is used
deg.poly = 3
Fit00 <- lm( resp ~ poly(lstat, deg.poly, raw=TRUE), data=Boston)
summary(Fit00)
## 
## Call:
## lm(formula = resp ~ poly(lstat, deg.poly, raw = TRUE), data = Boston)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.5441  -3.7122  -0.5145   2.4846  26.4153 
## 
## Coefficients:
##                                      Estimate
## (Intercept)                        48.6496253
## poly(lstat, deg.poly, raw = TRUE)1 -3.8655928
## poly(lstat, deg.poly, raw = TRUE)2  0.1487385
## poly(lstat, deg.poly, raw = TRUE)3 -0.0020039
##                                    Std. Error t value
## (Intercept)                         1.4347240  33.909
## poly(lstat, deg.poly, raw = TRUE)1  0.3287861 -11.757
## poly(lstat, deg.poly, raw = TRUE)2  0.0212987   6.983
## poly(lstat, deg.poly, raw = TRUE)3  0.0003997  -5.013
##                                    Pr(>|t|)    
## (Intercept)                         < 2e-16 ***
## poly(lstat, deg.poly, raw = TRUE)1  < 2e-16 ***
## poly(lstat, deg.poly, raw = TRUE)2 9.18e-12 ***
## poly(lstat, deg.poly, raw = TRUE)3 7.43e-07 ***
## ---
## Signif. codes:  
## 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.396 on 502 degrees of freedom
## Multiple R-squared:  0.6578, Adjusted R-squared:  0.6558 
## F-statistic: 321.7 on 3 and 502 DF,  p-value: < 2.2e-16
plot(Boston$lstat,  Boston$resp, xlab="X", ylab="Y")
ix = sort(Boston$lstat, index.return=TRUE)$ix
lines(Boston$lstat[ix], Fit00$fitted[ix], lwd=2, col="black" )


3a. Polynomial Regression for d=3 and k=5

# k=1   2   3   4   5     Teat
# [84][84][84][84][84]   x[86]

deg.poly = 3
k=5

Fit01 <- lm( resp ~ poly(lstat, deg.poly, raw=TRUE), data=CV.train[[k]] )
  summary(Fit01)
## 
## Call:
## lm(formula = resp ~ poly(lstat, deg.poly, raw = TRUE), data = CV.train[[k]])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.0501  -3.4933  -0.4797   2.6795  22.5574 
## 
## Coefficients:
##                                      Estimate
## (Intercept)                        51.7159242
## poly(lstat, deg.poly, raw = TRUE)1 -4.4918795
## poly(lstat, deg.poly, raw = TRUE)2  0.1847999
## poly(lstat, deg.poly, raw = TRUE)3 -0.0026298
##                                    Std. Error t value
## (Intercept)                         1.6942811  30.524
## poly(lstat, deg.poly, raw = TRUE)1  0.4040236 -11.118
## poly(lstat, deg.poly, raw = TRUE)2  0.0270514   6.831
## poly(lstat, deg.poly, raw = TRUE)3  0.0005222  -5.036
##                                    Pr(>|t|)    
## (Intercept)                         < 2e-16 ***
## poly(lstat, deg.poly, raw = TRUE)1  < 2e-16 ***
## poly(lstat, deg.poly, raw = TRUE)2 4.03e-11 ***
## poly(lstat, deg.poly, raw = TRUE)3 7.81e-07 ***
## ---
## Signif. codes:  
## 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.08 on 332 degrees of freedom
## Multiple R-squared:  0.7156, Adjusted R-squared:  0.713 
## F-statistic: 278.5 on 3 and 332 DF,  p-value: < 2.2e-16
  #- CV Training MSE
  MSE.train <- mean(Fit01$residuals^2)

  #- CV Validation MSE
  Fit01.pred<- predict(Fit01, newdata=CV.valid[[k]])
  MSE.valid <- mean((CV.valid[[k]]$resp - Fit01.pred)^2)

  c(MSE.train, MSE.valid)
## [1] 25.49821 28.58537
# Plot the fit
plot(CV.train[[k]]$lstat,  CV.train[[k]]$resp, xlab="X", ylab="Y")
lines(CV.valid[[k]]$lstat, CV.valid[[k]]$resp, col="red", type="p", pch=19)
ix = sort(CV.train[[k]]$lstat, index.return=TRUE)$ix
lines(CV.train[[k]]$lstat[ix], Fit01$fitted[ix], lwd=2, col="black" )
text(30,49, paste("d=",deg.poly,": MSE.tr=",round(MSE.train,2)))
text(30,46, paste("k=",k,       ": MSE.va=",round(MSE.valid,2)), col="red")

    # run this section for different d, and different k.

    # layout( matrix(1:5, 1,5) )
    # layout(1)


3b. Polynomial Regression for d=1:10 and k=1:5

# k=1   2   3   4   5     Test
# [84][84][84][84][84]   x[86]

MSE.train <- MSE.valid <- matrix(0, 5, 10)
for (deg.poly in 1:10) {
    for (k in 1:5) {

        Fit01 <- lm( resp ~ poly(lstat, deg.poly, raw=TRUE), data=CV.train[[k]])
        summary(Fit01)

        #--- CV Training MSE
        MSE.train[k, deg.poly] <- mean(Fit01$residuals^2)

        #--- CV Validation MSE
        Fit01.pred<- predict(Fit01, newdata=CV.valid[[k]])
        MSE.valid[k, deg.poly] <- mean((CV.valid[[k]]$resp - Fit01.pred)^2)
    }
}
MSE.train
##          [,1]     [,2]     [,3]     [,4]     [,5]
## [1,] 36.24899 27.08953 25.84182 24.21842 23.55096
## [2,] 34.56513 26.53025 24.65327 23.08652 22.44040
## [3,] 38.52559 28.92240 26.71601 24.79355 24.29306
## [4,] 36.78962 28.76672 27.05489 25.50835 24.91792
## [5,] 37.76581 27.44607 25.49821 23.50494 23.02862
##          [,6]     [,7]     [,8]     [,9]    [,10]
## [1,] 23.43695 23.43695 23.43062 23.21270 23.11963
## [2,] 22.36687 22.23037 22.20459 21.91391 21.91374
## [3,] 24.06215 24.02984 24.02645 23.77189 23.70269
## [4,] 24.70608 24.64202 24.60266 24.35297 24.35027
## [5,] 22.86241 22.74223 22.69809 22.57818 22.54991
MSE.valid
##          [,1]     [,2]     [,3]     [,4]     [,5]
## [1,] 39.08241 30.64301 27.09155 24.81139 24.75677
## [2,] 46.13266 33.26572 32.01973 29.49029 29.35147
## [3,] 30.00595 23.35932 23.61148 22.78120 22.05074
## [4,] 36.99215 24.09156 22.12401 19.49744 18.99647
## [5,] 33.47810 29.78657 28.58537 27.88259 26.87975
##          [,6]     [,7]     [,8]     [,9]    [,10]
## [1,] 24.44796 24.45193 24.52577 24.36412 24.63450
## [2,] 28.87407 29.12594 29.55401 31.35284 31.46868
## [3,] 22.35141 22.16244 22.14949 22.14141 22.24910
## [4,] 19.12167 19.06789 19.33934 19.70460 19.62039
## [5,] 26.76655 26.98511 27.27634 26.78139 26.70438
Av.MSE.train = apply(MSE.train, 2, mean)
Av.MSE.valid = apply(MSE.valid, 2, mean)

# Plot Average MSE
plot(Av.MSE.train, type="o", ylab="MSE")
lines(Av.MSE.valid, type="o", col="red")
legend(2, 40, lty=1, c("Av. Train MSE","Av. Valid MSE"), col=c("black", "red"))

###--- Make decision about best deg.poly to use from the Av MSE plot



4. Best Polynomial Regression Model for entire Training set

# Using Av MSE plot, I choose d=5 is the best
deg.poly = 5

# Train    Test
# [ 420 ]  [ 86 ]


Fit05 <- lm( resp ~ poly(lstat, deg.poly, raw=TRUE), data=Train.set)
summary(Fit05)
## 
## Call:
## lm(formula = resp ~ poly(lstat, deg.poly, raw = TRUE), data = Train.set)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.0360  -2.9462  -0.7869   2.1442  24.4587 
## 
## Coefficients:
##                                      Estimate
## (Intercept)                         7.174e+01
## poly(lstat, deg.poly, raw = TRUE)1 -1.351e+01
## poly(lstat, deg.poly, raw = TRUE)2  1.475e+00
## poly(lstat, deg.poly, raw = TRUE)3 -8.068e-02
## poly(lstat, deg.poly, raw = TRUE)4  2.082e-03
## poly(lstat, deg.poly, raw = TRUE)5 -2.015e-05
##                                    Std. Error t value
## (Intercept)                         4.088e+00  17.550
## poly(lstat, deg.poly, raw = TRUE)1  1.805e+00  -7.488
## poly(lstat, deg.poly, raw = TRUE)2  2.763e-01   5.340
## poly(lstat, deg.poly, raw = TRUE)3  1.869e-02  -4.317
## poly(lstat, deg.poly, raw = TRUE)4  5.703e-04   3.650
## poly(lstat, deg.poly, raw = TRUE)5  6.397e-06  -3.150
##                                    Pr(>|t|)    
## (Intercept)                         < 2e-16 ***
## poly(lstat, deg.poly, raw = TRUE)1 4.26e-13 ***
## poly(lstat, deg.poly, raw = TRUE)2 1.54e-07 ***
## poly(lstat, deg.poly, raw = TRUE)3 1.98e-05 ***
## poly(lstat, deg.poly, raw = TRUE)4 0.000295 ***
## poly(lstat, deg.poly, raw = TRUE)5 0.001750 ** 
## ---
## Signif. codes:  
## 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.905 on 414 degrees of freedom
## Multiple R-squared:  0.7186, Adjusted R-squared:  0.7152 
## F-statistic: 211.5 on 5 and 414 DF,  p-value: < 2.2e-16
#- CV Training MSE
MSE.train <- mean(Fit05$residuals^2)

#- CV Validation MSE
pred<- predict(Fit05, newdata=Test.set)
MSE.test <- mean((Test.set$resp - pred)^2)


c(MSE.train, MSE.test)  # Final Fit Result
## [1] 23.72002 43.09228
c(Av.MSE.train[5], Av.MSE.valid[5])  # Recall from CV section
## [1] 23.64619 24.40704
    # Plot the fit
    plot(Train.set$lstat, Train.set$resp, xlab="lstat", ylab="resp", main="Final Model")
    lines(Test.set$lstat, Test.set$resp, col="red", type="p", pch=19)
    ix = sort(Train.set$lstat, index.return=TRUE)$ix
    lines(Train.set$lstat[ix], Fit05$fitted[ix], lwd=2, col="blue" )
    text(30,49, paste("d=",deg.poly,": MSE.train=",round(MSE.train,2)))
    text(30,46, paste("          MSE.test=",round(MSE.test,2)), col="red")



Code Only

###--------------------------
### 1. Load Boston data from package MASS
# As we did in Ch2-Lab.

library(MASS)  # install.packages("MASS")

data(Boston)   # load data

head(Boston, 10) # see first 10 lines of data

class(Boston)   # it is a data frame

dim(Boston)     # (num of row, num of col)
names(Boston)   # column names

?Boston         # see explanation for variables

#  1  crim     per capita crime rate by town.
#  2  zn       proportion of residential land zoned for lots over 25,000 sq.ft.
#  3  indus    proportion of non-retail business acres per town.
#  4  chas     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
#  5  nox      nitrogen oxides concentration (parts per 10 million).
#  6  rm       average number of rooms per dwelling.
#  7  age      proportion of owner-occupied units built prior to 1940.
#  8  dis      weighted mean of distances to five Boston employment centres.
#  9  rad      index of accessibility to radial highways.
#  10 tax      full-value property-tax rate per $10,000.
#  11 ptratio  pupil-teacher ratio by town.
#  12 black    1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town.
#  13 lstat    lower status of the population (percent).

#  14 medv (response Y) median value of owner-occupied homes in $1000s.



###--------------------------
### 2. Create Cross Validation Data Set
# I've written a function and posed on my website.
# You can load the function using single line with sourse() command.
#
# The function is called CreateCV().
# The dataframe you feed into the funciton must contain the "resp" column.


# Turn Boston into Tibble
library(tidyverse)     # install.packages(tidyverse")
Boston <- as_tibble(Boston)
Boston


# Rename "medv" column as "resp" and move "resp" columnm to 1st
Boston = Boston %>% rename(resp=medv) %>% relocate(resp)
Boston


# Check if there's NA in data
sum(is.na(Boston))
Boston = Boston %>% na.omit()  # remove rows with NA (if applicable)
Boston


# Load the file from my website
source("https://nmimoto.github.io/R/ML-00.txt")   # load CreateCV()


# Use this function to create CV dataset
CreateCV(Boston, numFolds=5, seed=5480)

# Output (all data.frame):
#   Train.set      Train.resp
#   Test.set       Test.resp
#   CV.train[[[[k]]]]  CV.train.resp[[[[k]]]]
#   CV.valid[[[[k]]]]  CV.valid.resp[[[[k]]]]

Train.set

Test.set

CV.train[[1]]

CV.valid[[1]]


###--------------------------
### 3. Polynomial Regression using 5-fold CV

#- If all data is used
deg.poly = 3
Fit00 <- lm( resp ~ poly(lstat, deg.poly, raw=TRUE), data=Boston)
summary(Fit00)

plot(Boston$lstat,  Boston$resp, xlab="X", ylab="Y")
ix = sort(Boston$lstat, index.return=TRUE)$ix
lines(Boston$lstat[ix], Fit00$fitted[ix], lwd=2, col="black" )



##--------------------------
## 3a. Polynomial Regression for d=3 and k=5

# k=1   2   3   4   5     Teat
# [84][84][84][84][84]   x[86]

deg.poly = 3
k=5

Fit01 <- lm( resp ~ poly(lstat, deg.poly, raw=TRUE), data=CV.train[[k]] )
  summary(Fit01)

  #- CV Training MSE
  MSE.train <- mean(Fit01$residuals^2)

  #- CV Validation MSE
  Fit01.pred<- predict(Fit01, newdata=CV.valid[[k]])
  MSE.valid <- mean((CV.valid[[k]]$resp - Fit01.pred)^2)

  c(MSE.train, MSE.valid)


# Plot the fit
plot(CV.train[[k]]$lstat,  CV.train[[k]]$resp, xlab="X", ylab="Y")
lines(CV.valid[[k]]$lstat, CV.valid[[k]]$resp, col="red", type="p", pch=19)
ix = sort(CV.train[[k]]$lstat, index.return=TRUE)$ix
lines(CV.train[[k]]$lstat[ix], Fit01$fitted[ix], lwd=2, col="black" )
text(30,49, paste("d=",deg.poly,": MSE.tr=",round(MSE.train,2)))
text(30,46, paste("k=",k,       ": MSE.va=",round(MSE.valid,2)), col="red")

    # run this section for different d, and different k.

    # layout( matrix(1:5, 1,5) )
    # layout(1)


##--------------------------
## 3b. Polynomial Regression for d=1:10 and k=1:5

# k=1   2   3   4   5     Test
# [84][84][84][84][84]   x[86]

MSE.train <- MSE.valid <- matrix(0, 5, 10)
for (deg.poly in 1:10) {
    for (k in 1:5) {

        Fit01 <- lm( resp ~ poly(lstat, deg.poly, raw=TRUE), data=CV.train[[k]])
        summary(Fit01)

        #--- CV Training MSE
        MSE.train[k, deg.poly] <- mean(Fit01$residuals^2)

        #--- CV Validation MSE
        Fit01.pred<- predict(Fit01, newdata=CV.valid[[k]])
        MSE.valid[k, deg.poly] <- mean((CV.valid[[k]]$resp - Fit01.pred)^2)
    }
}
MSE.train
MSE.valid

Av.MSE.train = apply(MSE.train, 2, mean)
Av.MSE.valid = apply(MSE.valid, 2, mean)

# Plot Average MSE
plot(Av.MSE.train, type="o", ylab="MSE")
lines(Av.MSE.valid, type="o", col="red")
legend(2, 40, lty=1, c("Av. Train MSE","Av. Valid MSE"), col=c("black", "red"))

   ###--- Make decision about best deg.poly to use from the Av MSE plot


###--------------------------
### 4. Best Polynomial Regression Model for entire Training set

# Using Av MSE plot, I choose d=5 is the best
deg.poly = 5

# Train    Test
# [ 420 ]  [ 86 ]


Fit05 <- lm( resp ~ poly(lstat, deg.poly, raw=TRUE), data=Train.set)
summary(Fit05)

#- CV Training MSE
MSE.train <- mean(Fit05$residuals^2)

#- CV Validation MSE
pred<- predict(Fit05, newdata=Test.set)
MSE.test <- mean((Test.set$resp - pred)^2)


c(MSE.train, MSE.test)  # Final Fit Result

c(Av.MSE.train[5], Av.MSE.valid[5])  # Recall from CV section

    # Plot the fit
    plot(Train.set$lstat, Train.set$resp, xlab="lstat", ylab="resp", main="Final Model")
    lines(Test.set$lstat, Test.set$resp, col="red", type="p", pch=19)
    ix = sort(Train.set$lstat, index.return=TRUE)$ix
    lines(Train.set$lstat[ix], Fit05$fitted[ix], lwd=2, col="blue" )
    text(30,49, paste("d=",deg.poly,": MSE.train=",round(MSE.train,2)))
    text(30,46, paste("          MSE.test=",round(MSE.test,2)), col="red")