Ch3-Lab Cross Validation

1. Load Boston data from package MASS

As we did in Ch2-Lab.

library(MASS)  # install.packages("MASS")

data(Boston)  # load data

head(Boston, 10) # see first 10 lines of data

##       crim   zn indus chas   nox    rm   age    dis rad tax ptratio  black
## 1  0.00632 18.0  2.31    0 0.538 6.575  65.2 4.0900   1 296    15.3 396.90
## 2  0.02731  0.0  7.07    0 0.469 6.421  78.9 4.9671   2 242    17.8 396.90
## 3  0.02729  0.0  7.07    0 0.469 7.185  61.1 4.9671   2 242    17.8 392.83
## 4  0.03237  0.0  2.18    0 0.458 6.998  45.8 6.0622   3 222    18.7 394.63
## 5  0.06905  0.0  2.18    0 0.458 7.147  54.2 6.0622   3 222    18.7 396.90
## 6  0.02985  0.0  2.18    0 0.458 6.430  58.7 6.0622   3 222    18.7 394.12
## 7  0.08829 12.5  7.87    0 0.524 6.012  66.6 5.5605   5 311    15.2 395.60
## 8  0.14455 12.5  7.87    0 0.524 6.172  96.1 5.9505   5 311    15.2 396.90
## 9  0.21124 12.5  7.87    0 0.524 5.631 100.0 6.0821   5 311    15.2 386.63
## 10 0.17004 12.5  7.87    0 0.524 6.004  85.9 6.5921   5 311    15.2 386.71
##    lstat medv
## 1   4.98 24.0
## 2   9.14 21.6
## 3   4.03 34.7
## 4   2.94 33.4
## 5   5.33 36.2
## 6   5.21 28.7
## 7  12.43 22.9
## 8  19.15 27.1
## 9  29.93 16.5
## 10 17.10 18.9

class(Boston)   # it is a data frame

## [1] "data.frame"

dim(Boston)     # (num of row, num of col)

## [1] 506  14

names(Boston)   # column names

##  [1] "crim"    "zn"      "indus"   "chas"    "nox"     "rm"      "age"    
##  [8] "dis"     "rad"     "tax"     "ptratio" "black"   "lstat"   "medv"

?Boston         # see explanation for variables


#  1  crim     per capita crime rate by town.
#  2  zn       proportion of residential land zoned for lots over 25,000 sq.ft.
#  3  indus    proportion of non-retail business acres per town.
#  4  chas     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
#  5  nox      nitrogen oxides concentration (parts per 10 million).
#  6  rm       average number of rooms per dwelling.
#  7  age      proportion of owner-occupied units built prior to 1940.
#  8  dis      weighted mean of distances to five Boston employment centres.
#  9  rad      index of accessibility to radial highways.
#  10 tax      full-value property-tax rate per $10,000.
#  11 ptratio  pupil-teacher ratio by town.
#  12 black    1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town.
#  13 lstat    lower status of the population (percent).

#  14 medv (response Y) median value of owner-occupied homes in $1000s.

2. Create Cross Validation Data Set

I’ve written a function and posed on my website. You can load the function using single line with sourse() command.

The function is called CreateCV(). The dataframe you feed into the funciton must contain the “resp” column.

# Turn Boston into Tibble
library(tidyverse)     # install.packages(tidyverse")
Boston <- as_tibble(Boston)
Boston

## # A tibble: 506 × 14
##       crim    zn indus  chas   nox    rm   age   dis   rad   tax ptratio
##      <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int> <dbl>   <dbl>
##  1 0.00632  18    2.31     0 0.538  6.58  65.2  4.09     1   296    15.3
##  2 0.0273    0    7.07     0 0.469  6.42  78.9  4.97     2   242    17.8
##  3 0.0273    0    7.07     0 0.469  7.18  61.1  4.97     2   242    17.8
##  4 0.0324    0    2.18     0 0.458  7.00  45.8  6.06     3   222    18.7
##  5 0.0690    0    2.18     0 0.458  7.15  54.2  6.06     3   222    18.7
##  6 0.0298    0    2.18     0 0.458  6.43  58.7  6.06     3   222    18.7
##  7 0.0883   12.5  7.87     0 0.524  6.01  66.6  5.56     5   311    15.2
##  8 0.145    12.5  7.87     0 0.524  6.17  96.1  5.95     5   311    15.2
##  9 0.211    12.5  7.87     0 0.524  5.63 100    6.08     5   311    15.2
## 10 0.170    12.5  7.87     0 0.524  6.00  85.9  6.59     5   311    15.2
## # ℹ 496 more rows
## # ℹ 3 more variables: black <dbl>, lstat <dbl>, medv <dbl>

# Rename "medv" column as "resp" and move "resp" columnm to 1st
Boston = Boston %>% rename(resp=medv) %>% relocate(resp)
Boston

## # A tibble: 506 × 14
##     resp    crim    zn indus  chas   nox    rm   age   dis   rad   tax
##    <dbl>   <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int> <dbl>
##  1  24   0.00632  18    2.31     0 0.538  6.58  65.2  4.09     1   296
##  2  21.6 0.0273    0    7.07     0 0.469  6.42  78.9  4.97     2   242
##  3  34.7 0.0273    0    7.07     0 0.469  7.18  61.1  4.97     2   242
##  4  33.4 0.0324    0    2.18     0 0.458  7.00  45.8  6.06     3   222
##  5  36.2 0.0690    0    2.18     0 0.458  7.15  54.2  6.06     3   222
##  6  28.7 0.0298    0    2.18     0 0.458  6.43  58.7  6.06     3   222
##  7  22.9 0.0883   12.5  7.87     0 0.524  6.01  66.6  5.56     5   311
##  8  27.1 0.145    12.5  7.87     0 0.524  6.17  96.1  5.95     5   311
##  9  16.5 0.211    12.5  7.87     0 0.524  5.63 100    6.08     5   311
## 10  18.9 0.170    12.5  7.87     0 0.524  6.00  85.9  6.59     5   311
## # ℹ 496 more rows
## # ℹ 3 more variables: ptratio <dbl>, black <dbl>, lstat <dbl>

# Check if there's NA in data
sum(is.na(Boston))

## [1] 0

Boston = Boston %>% na.omit()  # remove rows with NA (if applicable)
Boston

## # A tibble: 506 × 14
##     resp    crim    zn indus  chas   nox    rm   age   dis   rad   tax
##    <dbl>   <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int> <dbl>
##  1  24   0.00632  18    2.31     0 0.538  6.58  65.2  4.09     1   296
##  2  21.6 0.0273    0    7.07     0 0.469  6.42  78.9  4.97     2   242
##  3  34.7 0.0273    0    7.07     0 0.469  7.18  61.1  4.97     2   242
##  4  33.4 0.0324    0    2.18     0 0.458  7.00  45.8  6.06     3   222
##  5  36.2 0.0690    0    2.18     0 0.458  7.15  54.2  6.06     3   222
##  6  28.7 0.0298    0    2.18     0 0.458  6.43  58.7  6.06     3   222
##  7  22.9 0.0883   12.5  7.87     0 0.524  6.01  66.6  5.56     5   311
##  8  27.1 0.145    12.5  7.87     0 0.524  6.17  96.1  5.95     5   311
##  9  16.5 0.211    12.5  7.87     0 0.524  5.63 100    6.08     5   311
## 10  18.9 0.170    12.5  7.87     0 0.524  6.00  85.9  6.59     5   311
## # ℹ 496 more rows
## # ℹ 3 more variables: ptratio <dbl>, black <dbl>, lstat <dbl>

# Load the file from my website
source("https://nmimoto.github.io/R/ML-00.txt")   # load CreateCV()


# Use this function to create CV dataset
CreateCV(Boston, numFolds=5, seed=5480)

## 
##     number of folds:  5 
##     number of obs:  506 
##     size of training data:  420 
##     size of test data:  86 
##     size of each fold :  84 
## 
##     Global variables created:  
##       Train.set       Train.resp           
##       Test.set        Test.resp            
##       CV.train[[k]]   CV.train.resp[[k]]   
##       CV.valid[[k]]   CV.valid.resp[[k]]

# Output (all data.frame):
#   Train.set      Train.resp
#   Test.set       Test.resp
#   CV.train[[[[k]]]]  CV.train.resp[[[[k]]]]
#   CV.valid[[[[k]]]]  CV.valid.resp[[[[k]]]]

Train.set

## # A tibble: 422 × 14
##     resp    crim    zn indus  chas   nox    rm   age   dis   rad   tax
##    <dbl>   <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int> <dbl>
##  1  24   0.00632  18    2.31     0 0.538  6.58  65.2  4.09     1   296
##  2  21.6 0.0273    0    7.07     0 0.469  6.42  78.9  4.97     2   242
##  3  34.7 0.0273    0    7.07     0 0.469  7.18  61.1  4.97     2   242
##  4  33.4 0.0324    0    2.18     0 0.458  7.00  45.8  6.06     3   222
##  5  36.2 0.0690    0    2.18     0 0.458  7.15  54.2  6.06     3   222
##  6  28.7 0.0298    0    2.18     0 0.458  6.43  58.7  6.06     3   222
##  7  22.9 0.0883   12.5  7.87     0 0.524  6.01  66.6  5.56     5   311
##  8  27.1 0.145    12.5  7.87     0 0.524  6.17  96.1  5.95     5   311
##  9  18.9 0.170    12.5  7.87     0 0.524  6.00  85.9  6.59     5   311
## 10  18.9 0.117    12.5  7.87     0 0.524  6.01  82.9  6.23     5   311
## # ℹ 412 more rows
## # ℹ 3 more variables: ptratio <dbl>, black <dbl>, lstat <dbl>

Test.set

## # A tibble: 84 × 14
##     resp   crim    zn indus  chas   nox    rm   age   dis   rad   tax
##    <dbl>  <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int> <dbl>
##  1  16.5 0.211   12.5  7.87     0 0.524  5.63 100    6.08     5   311
##  2  15   0.225   12.5  7.87     0 0.524  6.38  94.3  6.35     5   311
##  3  18.2 0.638    0    8.14     0 0.538  6.10  84.5  4.46     4   307
##  4  13.6 1.25     0    8.14     0 0.538  5.57  98.1  3.80     4   307
##  5  19.6 0.852    0    8.14     0 0.538  5.96  89.2  4.01     4   307
##  6  14.5 0.988    0    8.14     0 0.538  5.81 100    4.10     4   307
##  7  13.5 1.61     0    8.14     0 0.538  6.10  96.9  3.76     4   307
##  8  35.4 0.0131  90    1.22     0 0.403  7.25  21.9  8.70     5   226
##  9  16   0.172   25    5.13     0 0.453  5.97  93.4  6.82     8   284
## 10  19.4 0.0438  80    3.37     0 0.398  5.79  31.1  6.61     4   337
## # ℹ 74 more rows
## # ℹ 3 more variables: ptratio <dbl>, black <dbl>, lstat <dbl>

CV.train[[1]]

## # A tibble: 337 × 14
##     resp    crim    zn indus  chas   nox    rm   age   dis   rad   tax
##    <dbl>   <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int> <dbl>
##  1  24   0.00632  18    2.31     0 0.538  6.58  65.2  4.09     1   296
##  2  21.6 0.0273    0    7.07     0 0.469  6.42  78.9  4.97     2   242
##  3  34.7 0.0273    0    7.07     0 0.469  7.18  61.1  4.97     2   242
##  4  33.4 0.0324    0    2.18     0 0.458  7.00  45.8  6.06     3   222
##  5  28.7 0.0298    0    2.18     0 0.458  6.43  58.7  6.06     3   222
##  6  22.9 0.0883   12.5  7.87     0 0.524  6.01  66.6  5.56     5   311
##  7  27.1 0.145    12.5  7.87     0 0.524  6.17  96.1  5.95     5   311
##  8  18.9 0.170    12.5  7.87     0 0.524  6.00  85.9  6.59     5   311
##  9  18.9 0.117    12.5  7.87     0 0.524  6.01  82.9  6.23     5   311
## 10  21.7 0.0938   12.5  7.87     0 0.524  5.89  39    5.45     5   311
## # ℹ 327 more rows
## # ℹ 3 more variables: ptratio <dbl>, black <dbl>, lstat <dbl>

CV.valid[[1]]

## # A tibble: 85 × 14
##     resp   crim    zn indus  chas   nox    rm   age   dis   rad   tax
##    <dbl>  <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int> <dbl>
##  1  36.2 0.0690     0  2.18     0 0.458  7.15  54.2  6.06     3   222
##  2  20.4 0.630      0  8.14     0 0.538  5.95  61.8  4.71     4   307
##  3  15.2 1.23       0  8.14     0 0.538  6.14  91.7  3.98     4   307
##  4  30.8 0.0276    75  2.95     0 0.428  6.60  21.8  5.40     3   252
##  5  19.3 0.171      0  6.91     0 0.448  5.68  33.8  5.10     3   233
##  6  19.7 0.0887    21  5.64     0 0.439  5.96  45.7  6.81     4   243
##  7  19.6 0.103     25  5.13     0 0.453  5.93  47.2  6.93     8   284
##  8  18.7 0.149     25  5.13     0 0.453  5.74  66.2  7.23     8   284
##  9  24.2 0.0883     0 10.8      0 0.413  6.42   6.6  5.29     4   305
## 10  22.8 0.0916     0 10.8      0 0.413  6.06   7.8  5.29     4   305
## # ℹ 75 more rows
## # ℹ 3 more variables: ptratio <dbl>, black <dbl>, lstat <dbl>

3. Polynomial Regression using 5-fold CV

#- If all data is used
deg.poly = 3
Fit00 <- lm( resp ~ poly(lstat, deg.poly), data=Boston)
summary(Fit00)

## 
## Call:
## lm(formula = resp ~ poly(lstat, deg.poly), data = Boston)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.5441  -3.7122  -0.5145   2.4846  26.4153 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              22.5328     0.2399  93.937  < 2e-16 ***
## poly(lstat, deg.poly)1 -152.4595     5.3958 -28.255  < 2e-16 ***
## poly(lstat, deg.poly)2   64.2272     5.3958  11.903  < 2e-16 ***
## poly(lstat, deg.poly)3  -27.0511     5.3958  -5.013 7.43e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.396 on 502 degrees of freedom
## Multiple R-squared:  0.6578, Adjusted R-squared:  0.6558 
## F-statistic: 321.7 on 3 and 502 DF,  p-value: < 2.2e-16

plot(Boston$lstat,  Boston$resp, xlab="X", ylab="Y")
ix = sort(Boston$lstat, index.return=TRUE)$ix
lines(Boston$lstat[ix], Fit00$fitted[ix], lwd=2, col="black" )

3a. Polynomial Regression for d=3 and k=5

# k=1   2   3   4   5     Teat
# [84][84][84][84][84]   x[86]

deg.poly = 3
k=5

Fit01 <- lm( resp ~ poly(lstat, deg.poly), data=CV.train[[k]] )
  summary(Fit01)

## 
## Call:
## lm(formula = resp ~ poly(lstat, deg.poly), data = CV.train[[k]])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.3992  -3.7879  -0.8421   2.7662  26.5361 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              22.7068     0.3031  74.903  < 2e-16 ***
## poly(lstat, deg.poly)1 -121.4322     5.5733 -21.788  < 2e-16 ***
## poly(lstat, deg.poly)2   54.1723     5.5733   9.720  < 2e-16 ***
## poly(lstat, deg.poly)3  -23.6973     5.5733  -4.252 2.75e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.573 on 334 degrees of freedom
## Multiple R-squared:  0.6375, Adjusted R-squared:  0.6342 
## F-statistic: 195.8 on 3 and 334 DF,  p-value: < 2.2e-16

  #- CV Training MSE
  MSE.train <- mean(Fit01$residuals^2)

  #- CV Validation MSE
  Fit01.pred<- predict(Fit01, newdata=CV.valid[[k]])
  MSE.valid <- mean((CV.valid[[k]]$resp - Fit01.pred)^2)

  c(MSE.train, MSE.valid)

## [1] 30.69423 36.14407

# Plot the fit
plot(CV.train[[k]]$lstat,  CV.train[[k]]$resp, xlab="X", ylab="Y")
lines(CV.valid[[k]]$lstat, CV.valid[[k]]$resp, col="red", type="p", pch=19)
ix = sort(CV.train[[k]]$lstat, index.return=TRUE)$ix
lines(CV.train[[k]]$lstat[ix], Fit01$fitted[ix], lwd=2, col="black" )
text(30,49, paste("d=",deg.poly,": MSE.tr=",round(MSE.train,2)))
text(30,46, paste("k=",k,       ": MSE.va=",round(MSE.valid,2)), col="red")

    # run this section for different d, and different k.

    # layout( matrix(1:5, 1,5) )
    # layout(1)

3b. Polynomial Regression for d=1:10 and k=1:5

# k=1   2   3   4   5     Test
# [84][84][84][84][84]   x[86]

MSE.train <- MSE.valid <- matrix(0, 5, 10)
for (deg.poly in 1:10) {
    for (k in 1:5) {

        Fit01 <- lm( resp ~ poly(lstat, deg.poly), data=CV.train[[k]])
        summary(Fit01)

        #--- CV Training MSE
        MSE.train[k, deg.poly] <- mean(Fit01$residuals^2)

        #--- CV Validation MSE
        Fit01.pred<- predict(Fit01, newdata=CV.valid[[k]])
        MSE.valid[k, deg.poly] <- mean((CV.valid[[k]]$resp - Fit01.pred)^2)
    }
}
MSE.train

##          [,1]     [,2]     [,3]     [,4]     [,5]     [,6]     [,7]
## [1,] 43.05375 34.74909 33.01309 32.02085 31.23260 31.22268 31.22062
## [2,] 40.05560 32.19746 30.92455 29.83802 28.92146 28.81751 28.78919
## [3,] 43.13570 32.58281 30.40547 28.45294 27.40293 27.21876 27.21876
## [4,] 43.14692 35.09344 33.33417 31.68624 30.64266 30.56833 30.48808
## [5,] 41.03800 32.35565 30.69423 28.99434 27.66631 27.51818 27.48698
##          [,8]     [,9]    [,10]
## [1,] 31.05750 30.90124 30.56510
## [2,] 28.76656 28.37541 28.18858
## [3,] 27.15422 27.06685 26.54419
## [4,] 30.39161 30.03558 29.80547
## [5,] 27.39787 27.03285 26.72318

MSE.valid

##          [,1]     [,2]     [,3]     [,4]     [,5]     [,6]     [,7]
## [1,] 39.10088 28.60760 27.19558 23.77802 22.76167 22.31615 22.55017
## [2,] 50.55547 38.73248 35.51312 32.62050 31.12400 31.01640 31.02691
## [3,] 38.31102 37.62793 38.22876 39.29035 38.76847 39.37066 39.36591
## [4,] 38.14640 26.99644 25.55631 24.90907 23.96566 23.79805 24.09205
## [5,] 46.99887 38.01775 36.14407 35.70672 36.10602 36.25276 36.25668
##          [,8]     [,9]    [,10]
## [1,] 25.61753 20.99249 52.83200
## [2,] 30.83004 30.85357 30.60280
## [3,] 38.97614 37.36199 44.64711
## [4,] 24.09848 23.87373 23.79805
## [5,] 36.24200 36.11202 36.39063

Av.MSE.train = apply(MSE.train, 2, mean)
Av.MSE.valid = apply(MSE.valid, 2, mean)

# Plot Average MSE
plot(Av.MSE.train, type="o", ylab="MSE")
lines(Av.MSE.valid, type="o", col="red")
legend(2, 40, lty=1, c("Av. Train MSE","Av. Valid MSE"), col=c("black", "red"))

###--- Make decision about best deg.poly to use from the Av MSE plot

4. Best Polynomial Regression Model for entire Training set

# Using Av MSE plot, I choose d=5 is the best
deg.poly = 5

# Train    Test
# [ 420 ]  [ 86 ]


Fit05 <- lm( resp ~ poly(lstat, deg.poly), data=Train.set)
summary(Fit05)

## 
## Call:
## lm(formula = resp ~ poly(lstat, deg.poly), data = Train.set)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.3808  -3.1680  -0.8885   2.0396  27.1750 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              22.7723     0.2654  85.809  < 2e-16 ***
## poly(lstat, deg.poly)1 -137.9509     5.4517 -25.304  < 2e-16 ***
## poly(lstat, deg.poly)2   60.5427     5.4517  11.105  < 2e-16 ***
## poly(lstat, deg.poly)3  -26.7740     5.4517  -4.911 1.30e-06 ***
## poly(lstat, deg.poly)4   24.7932     5.4517   4.548 7.12e-06 ***
## poly(lstat, deg.poly)5  -20.5613     5.4517  -3.772 0.000186 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.452 on 416 degrees of freedom
## Multiple R-squared:  0.6642, Adjusted R-squared:  0.6601 
## F-statistic: 164.5 on 5 and 416 DF,  p-value: < 2.2e-16

#- CV Training MSE
MSE.train <- mean(Fit05$residuals^2)

#- CV Validation MSE
pred<- predict(Fit05, newdata=Test.set)
MSE.test <- mean((Test.set$resp - pred)^2)


c(MSE.train, MSE.test)  # Final Fit Result

## [1] 29.29834 14.89711

c(Av.MSE.train[5], Av.MSE.valid[5])  # Recall from CV section

## [1] 29.17319 30.54516

    # Plot the fit
    plot(Train.set$lstat, Train.set$resp, xlab="lstat", ylab="resp", main="Final Model")
    lines(Test.set$lstat, Test.set$resp, col="red", type="p", pch=19)
    ix = sort(Train.set$lstat, index.return=TRUE)$ix
    lines(Train.set$lstat[ix], Fit05$fitted[ix], lwd=2, col="blue" )
    text(30,49, paste("d=",deg.poly,": MSE.train=",round(MSE.train,2)))
    text(30,46, paste("          MSE.test=",round(MSE.test,2)), col="red")

Code Only

###--------------------------
### 1. Load Boston data from package MASS
# As we did in Ch2-Lab.

library(MASS)  # install.packages("MASS")

data(Boston)   # load data

head(Boston, 10) # see first 10 lines of data

class(Boston)   # it is a data frame

dim(Boston)     # (num of row, num of col)
names(Boston)   # column names

?Boston         # see explanation for variables

#  1  crim     per capita crime rate by town.
#  2  zn       proportion of residential land zoned for lots over 25,000 sq.ft.
#  3  indus    proportion of non-retail business acres per town.
#  4  chas     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
#  5  nox      nitrogen oxides concentration (parts per 10 million).
#  6  rm       average number of rooms per dwelling.
#  7  age      proportion of owner-occupied units built prior to 1940.
#  8  dis      weighted mean of distances to five Boston employment centres.
#  9  rad      index of accessibility to radial highways.
#  10 tax      full-value property-tax rate per $10,000.
#  11 ptratio  pupil-teacher ratio by town.
#  12 black    1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town.
#  13 lstat    lower status of the population (percent).

#  14 medv (response Y) median value of owner-occupied homes in $1000s.



###--------------------------
### 2. Create Cross Validation Data Set
# I've written a function and posed on my website.
# You can load the function using single line with sourse() command.
#
# The function is called CreateCV().
# The dataframe you feed into the funciton must contain the "resp" column.


# Turn Boston into Tibble
library(tidyverse)     # install.packages(tidyverse")
Boston <- as_tibble(Boston)
Boston


# Rename "medv" column as "resp" and move "resp" columnm to 1st
Boston = Boston %>% rename(resp=medv) %>% relocate(resp)
Boston


# Check if there's NA in data
sum(is.na(Boston))
Boston = Boston %>% na.omit()  # remove rows with NA (if applicable)
Boston


# Load the file from my website
source("https://nmimoto.github.io/R/ML-00.txt")   # load CreateCV()


# Use this function to create CV dataset
CreateCV(Boston, numFolds=5, seed=5480)

# Output (all data.frame):
#   Train.set      Train.resp
#   Test.set       Test.resp
#   CV.train[[[[k]]]]  CV.train.resp[[[[k]]]]
#   CV.valid[[[[k]]]]  CV.valid.resp[[[[k]]]]

Train.set

Test.set

CV.train[[1]]

CV.valid[[1]]


###--------------------------
### 3. Polynomial Regression using 5-fold CV

#- If all data is used
deg.poly = 3
Fit00 <- lm( resp ~ poly(lstat, deg.poly), data=Boston)
summary(Fit00)

plot(Boston$lstat,  Boston$resp, xlab="X", ylab="Y")
ix = sort(Boston$lstat, index.return=TRUE)$ix
lines(Boston$lstat[ix], Fit00$fitted[ix], lwd=2, col="black" )



##--------------------------
## 3a. Polynomial Regression for d=3 and k=5

# k=1   2   3   4   5     Teat
# [84][84][84][84][84]   x[86]

deg.poly = 3
k=5

Fit01 <- lm( resp ~ poly(lstat, deg.poly), data=CV.train[[k]] )
  summary(Fit01)

  #- CV Training MSE
  MSE.train <- mean(Fit01$residuals^2)

  #- CV Validation MSE
  Fit01.pred<- predict(Fit01, newdata=CV.valid[[k]])
  MSE.valid <- mean((CV.valid[[k]]$resp - Fit01.pred)^2)

  c(MSE.train, MSE.valid)


# Plot the fit
plot(CV.train[[k]]$lstat,  CV.train[[k]]$resp, xlab="X", ylab="Y")
lines(CV.valid[[k]]$lstat, CV.valid[[k]]$resp, col="red", type="p", pch=19)
ix = sort(CV.train[[k]]$lstat, index.return=TRUE)$ix
lines(CV.train[[k]]$lstat[ix], Fit01$fitted[ix], lwd=2, col="black" )
text(30,49, paste("d=",deg.poly,": MSE.tr=",round(MSE.train,2)))
text(30,46, paste("k=",k,       ": MSE.va=",round(MSE.valid,2)), col="red")

    # run this section for different d, and different k.

    # layout( matrix(1:5, 1,5) )
    # layout(1)


##--------------------------
## 3b. Polynomial Regression for d=1:10 and k=1:5

# k=1   2   3   4   5     Test
# [84][84][84][84][84]   x[86]

MSE.train <- MSE.valid <- matrix(0, 5, 10)
for (deg.poly in 1:10) {
    for (k in 1:5) {

        Fit01 <- lm( resp ~ poly(lstat, deg.poly), data=CV.train[[k]])
        summary(Fit01)

        #--- CV Training MSE
        MSE.train[k, deg.poly] <- mean(Fit01$residuals^2)

        #--- CV Validation MSE
        Fit01.pred<- predict(Fit01, newdata=CV.valid[[k]])
        MSE.valid[k, deg.poly] <- mean((CV.valid[[k]]$resp - Fit01.pred)^2)
    }
}
MSE.train
MSE.valid

Av.MSE.train = apply(MSE.train, 2, mean)
Av.MSE.valid = apply(MSE.valid, 2, mean)

# Plot Average MSE
plot(Av.MSE.train, type="o", ylab="MSE")
lines(Av.MSE.valid, type="o", col="red")
legend(2, 40, lty=1, c("Av. Train MSE","Av. Valid MSE"), col=c("black", "red"))

   ###--- Make decision about best deg.poly to use from the Av MSE plot


###--------------------------
### 4. Best Polynomial Regression Model for entire Training set

# Using Av MSE plot, I choose d=5 is the best
deg.poly = 5

# Train    Test
# [ 420 ]  [ 86 ]


Fit05 <- lm( resp ~ poly(lstat, deg.poly), data=Train.set)
summary(Fit05)

#- CV Training MSE
MSE.train <- mean(Fit05$residuals^2)

#- CV Validation MSE
pred<- predict(Fit05, newdata=Test.set)
MSE.test <- mean((Test.set$resp - pred)^2)


c(MSE.train, MSE.test)  # Final Fit Result

c(Av.MSE.train[5], Av.MSE.valid[5])  # Recall from CV section

    # Plot the fit
    plot(Train.set$lstat, Train.set$resp, xlab="lstat", ylab="resp", main="Final Model")
    lines(Test.set$lstat, Test.set$resp, col="red", type="p", pch=19)
    ix = sort(Train.set$lstat, index.return=TRUE)$ix
    lines(Train.set$lstat[ix], Fit05$fitted[ix], lwd=2, col="blue" )
    text(30,49, paste("d=",deg.poly,": MSE.train=",round(MSE.train,2)))
    text(30,46, paste("          MSE.test=",round(MSE.test,2)), col="red")