1 Load Boston data from package MASS

library(MASS)  # install.packages("MASS")

data(Boston)  # load data

head(Boston, 10) # see first 10 lines of data
##       crim   zn indus chas   nox    rm   age    dis rad tax ptratio  black
## 1  0.00632 18.0  2.31    0 0.538 6.575  65.2 4.0900   1 296    15.3 396.90
## 2  0.02731  0.0  7.07    0 0.469 6.421  78.9 4.9671   2 242    17.8 396.90
## 3  0.02729  0.0  7.07    0 0.469 7.185  61.1 4.9671   2 242    17.8 392.83
## 4  0.03237  0.0  2.18    0 0.458 6.998  45.8 6.0622   3 222    18.7 394.63
## 5  0.06905  0.0  2.18    0 0.458 7.147  54.2 6.0622   3 222    18.7 396.90
## 6  0.02985  0.0  2.18    0 0.458 6.430  58.7 6.0622   3 222    18.7 394.12
## 7  0.08829 12.5  7.87    0 0.524 6.012  66.6 5.5605   5 311    15.2 395.60
## 8  0.14455 12.5  7.87    0 0.524 6.172  96.1 5.9505   5 311    15.2 396.90
## 9  0.21124 12.5  7.87    0 0.524 5.631 100.0 6.0821   5 311    15.2 386.63
## 10 0.17004 12.5  7.87    0 0.524 6.004  85.9 6.5921   5 311    15.2 386.71
##    lstat medv
## 1   4.98 24.0
## 2   9.14 21.6
## 3   4.03 34.7
## 4   2.94 33.4
## 5   5.33 36.2
## 6   5.21 28.7
## 7  12.43 22.9
## 8  19.15 27.1
## 9  29.93 16.5
## 10 17.10 18.9
class(Boston)   # it is a data frame
## [1] "data.frame"
dim(Boston)     # (num of row, num of col)
## [1] 506  14
names(Boston)   # column names
##  [1] "crim"    "zn"      "indus"   "chas"    "nox"     "rm"      "age"    
##  [8] "dis"     "rad"     "tax"     "ptratio" "black"   "lstat"   "medv"
?Boston         # see explanation for variables


#1  crim     per capita crime rate by town.
#2  zn       proportion of residential land zoned for lots over 25,000 sq.ft.
#3  indus    proportion of non-retail business acres per town.
#4  chas     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
#5  nox      nitrogen oxides concentration (parts per 10 million).
#6  rm       average number of rooms per dwelling.
#7  age      proportion of owner-occupied units built prior to 1940.
#8  dis      weighted mean of distances to five Boston employment centres.
#9  rad      index of accessibility to radial highways.
#10 tax      full-value property-tax rate per $10,000.
#11 ptratio  pupil-teacher ratio by town.
#12 black    1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town.
#13 lstat    lower status of the population (percent).

#14 medv (response Y) median value of owner-occupied homes in $1000s.

# 13 lstat = lower status of the population (percent)
#      Proportion of population that is lower status
#      = 1/2 (proportion of adults without,
#      some high school education and proportion of male
#      workers classified as laborers). The logarithmic
#      specification implies that socioeconomic status
#      distinctions mean more in the upper brackets of
#      society than in the lower classes. Source: 1970 U. S. Census



2. Attach() columns

plot(Boston$age, Boston$medv)   # plot age vs medv

attach(Boston)  # now you can use their colname as a variable
## The following objects are masked from Boston (pos = 3):
## 
##     age, black, chas, crim, dis, indus, lstat, medv, nox, ptratio,
##     rad, rm, tax, zn
## The following objects are masked from Boston (pos = 5):
## 
##     age, black, chas, crim, dis, indus, lstat, medv, nox, ptratio,
##     rad, rm, tax, zn
plot(age, medv)



3. Correlaiton Plots

# Correlaiton Plots
pairs(Boston)   # shows scatterplot matrix (large)

cor(Boston)     # raw correlation matrix
##                crim          zn       indus         chas         nox
## crim     1.00000000 -0.20046922  0.40658341 -0.055891582  0.42097171
## zn      -0.20046922  1.00000000 -0.53382819 -0.042696719 -0.51660371
## indus    0.40658341 -0.53382819  1.00000000  0.062938027  0.76365145
## chas    -0.05589158 -0.04269672  0.06293803  1.000000000  0.09120281
## nox      0.42097171 -0.51660371  0.76365145  0.091202807  1.00000000
## rm      -0.21924670  0.31199059 -0.39167585  0.091251225 -0.30218819
## age      0.35273425 -0.56953734  0.64477851  0.086517774  0.73147010
## dis     -0.37967009  0.66440822 -0.70802699 -0.099175780 -0.76923011
## rad      0.62550515 -0.31194783  0.59512927 -0.007368241  0.61144056
## tax      0.58276431 -0.31456332  0.72076018 -0.035586518  0.66802320
## ptratio  0.28994558 -0.39167855  0.38324756 -0.121515174  0.18893268
## black   -0.38506394  0.17552032 -0.35697654  0.048788485 -0.38005064
## lstat    0.45562148 -0.41299457  0.60379972 -0.053929298  0.59087892
## medv    -0.38830461  0.36044534 -0.48372516  0.175260177 -0.42732077
##                  rm         age         dis          rad         tax
## crim    -0.21924670  0.35273425 -0.37967009  0.625505145  0.58276431
## zn       0.31199059 -0.56953734  0.66440822 -0.311947826 -0.31456332
## indus   -0.39167585  0.64477851 -0.70802699  0.595129275  0.72076018
## chas     0.09125123  0.08651777 -0.09917578 -0.007368241 -0.03558652
## nox     -0.30218819  0.73147010 -0.76923011  0.611440563  0.66802320
## rm       1.00000000 -0.24026493  0.20524621 -0.209846668 -0.29204783
## age     -0.24026493  1.00000000 -0.74788054  0.456022452  0.50645559
## dis      0.20524621 -0.74788054  1.00000000 -0.494587930 -0.53443158
## rad     -0.20984667  0.45602245 -0.49458793  1.000000000  0.91022819
## tax     -0.29204783  0.50645559 -0.53443158  0.910228189  1.00000000
## ptratio -0.35550149  0.26151501 -0.23247054  0.464741179  0.46085304
## black    0.12806864 -0.27353398  0.29151167 -0.444412816 -0.44180801
## lstat   -0.61380827  0.60233853 -0.49699583  0.488676335  0.54399341
## medv     0.69535995 -0.37695457  0.24992873 -0.381626231 -0.46853593
##            ptratio       black      lstat       medv
## crim     0.2899456 -0.38506394  0.4556215 -0.3883046
## zn      -0.3916785  0.17552032 -0.4129946  0.3604453
## indus    0.3832476 -0.35697654  0.6037997 -0.4837252
## chas    -0.1215152  0.04878848 -0.0539293  0.1752602
## nox      0.1889327 -0.38005064  0.5908789 -0.4273208
## rm      -0.3555015  0.12806864 -0.6138083  0.6953599
## age      0.2615150 -0.27353398  0.6023385 -0.3769546
## dis     -0.2324705  0.29151167 -0.4969958  0.2499287
## rad      0.4647412 -0.44441282  0.4886763 -0.3816262
## tax      0.4608530 -0.44180801  0.5439934 -0.4685359
## ptratio  1.0000000 -0.17738330  0.3740443 -0.5077867
## black   -0.1773833  1.00000000 -0.3660869  0.3334608
## lstat    0.3740443 -0.36608690  1.0000000 -0.7376627
## medv    -0.5077867  0.33346082 -0.7376627  1.0000000
# better graphics
library(corrplot)   # install.packages("corrplot")
corrplot(cor(Boston), method="number")  # cor matrix with number and color scheme

corrplot(cor(Boston))  # cor matrix with color scheme



4. Scatter Plots and Histogram

#- Scatter plots
layout( matrix(1:6, 2, 3) )
plot(crim, medv)
plot(zn, medv)
plot(indus, medv)
plot(nox, medv)
plot(rm, medv)
plot(age, medv)

layout( matrix(1:6, 2, 3) )
plot(dis, medv)
plot(rad, medv)
plot(tax, medv)
plot(ptratio, medv)
plot(black, medv)
plot(lstat, medv)

layout(1)


#- Histogram of the respoonse variable
hist(medv)



Model 1: Regression with all variables

Reg01 <- lm(medv ~ . , data=Boston)
summary(Reg01)
## 
## Call:
## lm(formula = medv ~ ., data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.595  -2.730  -0.518   1.777  26.199 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.646e+01  5.103e+00   7.144 3.28e-12 ***
## crim        -1.080e-01  3.286e-02  -3.287 0.001087 ** 
## zn           4.642e-02  1.373e-02   3.382 0.000778 ***
## indus        2.056e-02  6.150e-02   0.334 0.738288    
## chas         2.687e+00  8.616e-01   3.118 0.001925 ** 
## nox         -1.777e+01  3.820e+00  -4.651 4.25e-06 ***
## rm           3.810e+00  4.179e-01   9.116  < 2e-16 ***
## age          6.922e-04  1.321e-02   0.052 0.958229    
## dis         -1.476e+00  1.995e-01  -7.398 6.01e-13 ***
## rad          3.060e-01  6.635e-02   4.613 5.07e-06 ***
## tax         -1.233e-02  3.760e-03  -3.280 0.001112 ** 
## ptratio     -9.527e-01  1.308e-01  -7.283 1.31e-12 ***
## black        9.312e-03  2.686e-03   3.467 0.000573 ***
## lstat       -5.248e-01  5.072e-02 -10.347  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.745 on 492 degrees of freedom
## Multiple R-squared:  0.7406, Adjusted R-squared:  0.7338 
## F-statistic: 108.1 on 13 and 492 DF,  p-value: < 2.2e-16
plot(Reg01)



Model 2: Backward stepwise regression

From Model 1, remove age

#- Backward stepwise regression
Reg02 <- lm(medv ~ . -age, data=Boston)
summary(Reg02)
## 
## Call:
## lm(formula = medv ~ . - age, data = Boston)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.6054  -2.7313  -0.5188   1.7601  26.2243 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  36.436927   5.080119   7.172 2.72e-12 ***
## crim         -0.108006   0.032832  -3.290 0.001075 ** 
## zn            0.046334   0.013613   3.404 0.000719 ***
## indus         0.020562   0.061433   0.335 0.737989    
## chas          2.689026   0.859598   3.128 0.001863 ** 
## nox         -17.713540   3.679308  -4.814 1.97e-06 ***
## rm            3.814394   0.408480   9.338  < 2e-16 ***
## dis          -1.478612   0.190611  -7.757 5.03e-14 ***
## rad           0.305786   0.066089   4.627 4.75e-06 ***
## tax          -0.012329   0.003755  -3.283 0.001099 ** 
## ptratio      -0.952211   0.130294  -7.308 1.10e-12 ***
## black         0.009321   0.002678   3.481 0.000544 ***
## lstat        -0.523852   0.047625 -10.999  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.74 on 493 degrees of freedom
## Multiple R-squared:  0.7406, Adjusted R-squared:  0.7343 
## F-statistic: 117.3 on 12 and 493 DF,  p-value: < 2.2e-16

Remove indus

Reg03 <- lm(medv ~ . -age-indus, data=Boston)
summary(Reg03)
## 
## Call:
## lm(formula = medv ~ . - age - indus, data = Boston)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.5984  -2.7386  -0.5046   1.7273  26.2373 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  36.341145   5.067492   7.171 2.73e-12 ***
## crim         -0.108413   0.032779  -3.307 0.001010 ** 
## zn            0.045845   0.013523   3.390 0.000754 ***
## chas          2.718716   0.854240   3.183 0.001551 ** 
## nox         -17.376023   3.535243  -4.915 1.21e-06 ***
## rm            3.801579   0.406316   9.356  < 2e-16 ***
## dis          -1.492711   0.185731  -8.037 6.84e-15 ***
## rad           0.299608   0.063402   4.726 3.00e-06 ***
## tax          -0.011778   0.003372  -3.493 0.000521 ***
## ptratio      -0.946525   0.129066  -7.334 9.24e-13 ***
## black         0.009291   0.002674   3.475 0.000557 ***
## lstat        -0.522553   0.047424 -11.019  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.736 on 494 degrees of freedom
## Multiple R-squared:  0.7406, Adjusted R-squared:  0.7348 
## F-statistic: 128.2 on 11 and 494 DF,  p-value: < 2.2e-16
plot(Reg03)

# Looking at the result
str(Reg03)  # click on environment tab in Rstudio
## List of 12
##  $ coefficients : Named num [1:12] 36.3411 -0.1084 0.0458 2.7187 -17.376 ...
##   ..- attr(*, "names")= chr [1:12] "(Intercept)" "crim" "zn" "chas" ...
##  $ residuals    : Named num [1:506] -6.12 -3.4 4.17 4.75 8.22 ...
##   ..- attr(*, "names")= chr [1:506] "1" "2" "3" "4" ...
##  $ effects      : Named num [1:506] -506.9 -80.3 59.6 35.1 -39.9 ...
##   ..- attr(*, "names")= chr [1:506] "(Intercept)" "crim" "zn" "chas" ...
##  $ rank         : int 12
##  $ fitted.values: Named num [1:506] 30.1 25 30.5 28.6 28 ...
##   ..- attr(*, "names")= chr [1:506] "1" "2" "3" "4" ...
##  $ assign       : int [1:12] 0 1 2 3 4 5 6 7 8 9 ...
##  $ qr           :List of 5
##   ..$ qr   : num [1:506, 1:12] -22.4944 0.0445 0.0445 0.0445 0.0445 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:506] "1" "2" "3" "4" ...
##   .. .. ..$ : chr [1:12] "(Intercept)" "crim" "zn" "chas" ...
##   .. ..- attr(*, "assign")= int [1:12] 0 1 2 3 4 5 6 7 8 9 ...
##   ..$ qraux: num [1:12] 1.04 1.02 1.03 1.01 1.05 ...
##   ..$ pivot: int [1:12] 1 2 3 4 5 6 7 8 9 10 ...
##   ..$ tol  : num 1e-07
##   ..$ rank : int 12
##   ..- attr(*, "class")= chr "qr"
##  $ df.residual  : int 494
##  $ xlevels      : Named list()
##  $ call         : language lm(formula = medv ~ . - age - indus, data = Boston)
##  $ terms        :Classes 'terms', 'formula'  language medv ~ (crim + zn + indus + chas + nox + rm + age + dis + rad + tax +      ptratio + black + lstat) - age - indus
##   .. ..- attr(*, "variables")= language list(medv, crim, zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio,      black, lstat)
##   .. ..- attr(*, "factors")= int [1:14, 1:11] 0 1 0 0 0 0 0 0 0 0 ...
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:14] "medv" "crim" "zn" "indus" ...
##   .. .. .. ..$ : chr [1:11] "crim" "zn" "chas" "nox" ...
##   .. ..- attr(*, "term.labels")= chr [1:11] "crim" "zn" "chas" "nox" ...
##   .. ..- attr(*, "order")= int [1:11] 1 1 1 1 1 1 1 1 1 1 ...
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(medv, crim, zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio,      black, lstat)
##   .. ..- attr(*, "dataClasses")= Named chr [1:14] "numeric" "numeric" "numeric" "numeric" ...
##   .. .. ..- attr(*, "names")= chr [1:14] "medv" "crim" "zn" "indus" ...
##  $ model        :'data.frame':   506 obs. of  14 variables:
##   ..$ medv   : num [1:506] 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
##   ..$ crim   : num [1:506] 0.00632 0.02731 0.02729 0.03237 0.06905 ...
##   ..$ zn     : num [1:506] 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##   ..$ indus  : num [1:506] 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##   ..$ chas   : int [1:506] 0 0 0 0 0 0 0 0 0 0 ...
##   ..$ nox    : num [1:506] 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##   ..$ rm     : num [1:506] 6.58 6.42 7.18 7 7.15 ...
##   ..$ age    : num [1:506] 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##   ..$ dis    : num [1:506] 4.09 4.97 4.97 6.06 6.06 ...
##   ..$ rad    : int [1:506] 1 2 2 3 3 3 5 5 5 5 ...
##   ..$ tax    : num [1:506] 296 242 242 222 222 222 311 311 311 311 ...
##   ..$ ptratio: num [1:506] 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##   ..$ black  : num [1:506] 397 397 393 395 397 ...
##   ..$ lstat  : num [1:506] 4.98 9.14 4.03 2.94 5.33 ...
##   ..- attr(*, "terms")=Classes 'terms', 'formula'  language medv ~ (crim + zn + indus + chas + nox + rm + age + dis + rad + tax +      ptratio + black + lstat) - age - indus
##   .. .. ..- attr(*, "variables")= language list(medv, crim, zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio,      black, lstat)
##   .. .. ..- attr(*, "factors")= int [1:14, 1:11] 0 1 0 0 0 0 0 0 0 0 ...
##   .. .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. .. ..$ : chr [1:14] "medv" "crim" "zn" "indus" ...
##   .. .. .. .. ..$ : chr [1:11] "crim" "zn" "chas" "nox" ...
##   .. .. ..- attr(*, "term.labels")= chr [1:11] "crim" "zn" "chas" "nox" ...
##   .. .. ..- attr(*, "order")= int [1:11] 1 1 1 1 1 1 1 1 1 1 ...
##   .. .. ..- attr(*, "intercept")= int 1
##   .. .. ..- attr(*, "response")= int 1
##   .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. .. ..- attr(*, "predvars")= language list(medv, crim, zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio,      black, lstat)
##   .. .. ..- attr(*, "dataClasses")= Named chr [1:14] "numeric" "numeric" "numeric" "numeric" ...
##   .. .. .. ..- attr(*, "names")= chr [1:14] "medv" "crim" "zn" "indus" ...
##  - attr(*, "class")= chr "lm"
plot(Reg03$fitted, Reg03$residuals)



Model 3: Forward Selection

Reg04 <- lm(medv ~ crim + zn + indus, data=Boston)
summary(Reg04)
## 
## Call:
## lm(formula = medv ~ crim + zn + indus, data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -13.070  -4.733  -1.585   2.648  32.423 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 27.39465    0.86484  31.676  < 2e-16 ***
## crim        -0.24863    0.04391  -5.662 2.52e-08 ***
## zn           0.05850    0.01750   3.344 0.000889 ***
## indus       -0.41558    0.06378  -6.515 1.77e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.752 on 502 degrees of freedom
## Multiple R-squared:  0.2937, Adjusted R-squared:  0.2895 
## F-statistic: 69.59 on 3 and 502 DF,  p-value: < 2.2e-16



Code Only

### 1 Load Boston data from package MASS
library(MASS)  # install.packages("MASS")

data(Boston)  # load data

head(Boston, 10) # see first 10 lines of data

class(Boston)   # it is a data frame

dim(Boston)     # (num of row, num of col)
names(Boston)   # column names

?Boston         # see explanation for variables


#1  crim     per capita crime rate by town.
#2  zn       proportion of residential land zoned for lots over 25,000 sq.ft.
#3  indus    proportion of non-retail business acres per town.
#4  chas     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
#5  nox      nitrogen oxides concentration (parts per 10 million).
#6  rm       average number of rooms per dwelling.
#7  age      proportion of owner-occupied units built prior to 1940.
#8  dis      weighted mean of distances to five Boston employment centres.
#9  rad      index of accessibility to radial highways.
#10 tax      full-value property-tax rate per $10,000.
#11 ptratio  pupil-teacher ratio by town.
#12 black    1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town.
#13 lstat    lower status of the population (percent).

#14 medv (response Y) median value of owner-occupied homes in $1000s.

# 13 lstat = lower status of the population (percent)
#      Proportion of population that is lower status
#      = 1/2 (proportion of adults without,
#      some high school education and proportion of male
#      workers classified as laborers). The logarithmic
#      specification implies that socioeconomic status
#      distinctions mean more in the upper brackets of
#      society than in the lower classes. Source: 1970 U. S. Census


### 2. Attach() columns
plot(Boston$age, Boston$medv)   # plot age vs medv

attach(Boston)  # now you can use their colname as a variable

plot(age, medv)


### 3. Correlaiton Plots
pairs(Boston)   # shows scatterplot matrix (large)

cor(Boston)     # raw correlation matrix

# better graphics
library(corrplot)   # install.packages("corrplot")
corrplot(cor(Boston), method="number")  # cor matrix with number and color scheme
corrplot(cor(Boston))  # cor matrix with color scheme



### 4. Scatter Plots and Histogram
# Scatter plots
layout( matrix(1:6, 2, 3) )
plot(crim, medv)
plot(zn, medv)
plot(indus, medv)
plot(nox, medv)
plot(rm, medv)
plot(age, medv)

layout( matrix(1:6, 2, 3) )
plot(dis, medv)
plot(rad, medv)
plot(tax, medv)
plot(ptratio, medv)
plot(black, medv)
plot(lstat, medv)
layout(1)

# Histogram of the respoonse variable
hist(medv)


### Model 1: Regression with all variables
Reg01 <- lm(medv ~ . , data=Boston)
summary(Reg01)

plot(Reg01)

### Model 2: Backward stepwise regression
# From Model 1, remove age
Reg02 <- lm(medv ~ . -age, data=Boston)
summary(Reg02)

# Remove indus
Reg03 <- lm(medv ~ . -age-indus, data=Boston)
summary(Reg03)

plot(Reg03)

# Looking at the result
str(Reg03)  # click on environment tab in Rstudio

plot(Reg03$fitted, Reg03$residuals)

### Model 3: Forward Selection
Reg04 <- lm(medv ~ crim + zn + indus, data=Boston)
summary(Reg04)