library(MASS) # install.packages("MASS")
data(Boston) # load data
head(Boston, 10) # see first 10 lines of data
## crim zn indus chas nox rm age dis rad tax ptratio black
## 1 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90
## 2 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90
## 3 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83
## 4 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63
## 5 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90
## 6 0.02985 0.0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12
## 7 0.08829 12.5 7.87 0 0.524 6.012 66.6 5.5605 5 311 15.2 395.60
## 8 0.14455 12.5 7.87 0 0.524 6.172 96.1 5.9505 5 311 15.2 396.90
## 9 0.21124 12.5 7.87 0 0.524 5.631 100.0 6.0821 5 311 15.2 386.63
## 10 0.17004 12.5 7.87 0 0.524 6.004 85.9 6.5921 5 311 15.2 386.71
## lstat medv
## 1 4.98 24.0
## 2 9.14 21.6
## 3 4.03 34.7
## 4 2.94 33.4
## 5 5.33 36.2
## 6 5.21 28.7
## 7 12.43 22.9
## 8 19.15 27.1
## 9 29.93 16.5
## 10 17.10 18.9
## [1] "data.frame"
## [1] 506 14
## [1] "crim" "zn" "indus" "chas" "nox" "rm" "age"
## [8] "dis" "rad" "tax" "ptratio" "black" "lstat" "medv"
?Boston # see explanation for variables
#1 crim per capita crime rate by town.
#2 zn proportion of residential land zoned for lots over 25,000 sq.ft.
#3 indus proportion of non-retail business acres per town.
#4 chas Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
#5 nox nitrogen oxides concentration (parts per 10 million).
#6 rm average number of rooms per dwelling.
#7 age proportion of owner-occupied units built prior to 1940.
#8 dis weighted mean of distances to five Boston employment centres.
#9 rad index of accessibility to radial highways.
#10 tax full-value property-tax rate per $10,000.
#11 ptratio pupil-teacher ratio by town.
#12 black 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town.
#13 lstat lower status of the population (percent).
#14 medv (response Y) median value of owner-occupied homes in $1000s.
# 13 lstat = lower status of the population (percent)
# Proportion of population that is lower status
# = 1/2 (proportion of adults without,
# some high school education and proportion of male
# workers classified as laborers). The logarithmic
# specification implies that socioeconomic status
# distinctions mean more in the upper brackets of
# society than in the lower classes. Source: 1970 U. S. Census
## The following objects are masked from Boston (pos = 3):
##
## age, black, chas, crim, dis, indus, lstat, medv, nox, ptratio,
## rad, rm, tax, zn
## The following objects are masked from Boston (pos = 5):
##
## age, black, chas, crim, dis, indus, lstat, medv, nox, ptratio,
## rad, rm, tax, zn
## crim zn indus chas nox
## crim 1.00000000 -0.20046922 0.40658341 -0.055891582 0.42097171
## zn -0.20046922 1.00000000 -0.53382819 -0.042696719 -0.51660371
## indus 0.40658341 -0.53382819 1.00000000 0.062938027 0.76365145
## chas -0.05589158 -0.04269672 0.06293803 1.000000000 0.09120281
## nox 0.42097171 -0.51660371 0.76365145 0.091202807 1.00000000
## rm -0.21924670 0.31199059 -0.39167585 0.091251225 -0.30218819
## age 0.35273425 -0.56953734 0.64477851 0.086517774 0.73147010
## dis -0.37967009 0.66440822 -0.70802699 -0.099175780 -0.76923011
## rad 0.62550515 -0.31194783 0.59512927 -0.007368241 0.61144056
## tax 0.58276431 -0.31456332 0.72076018 -0.035586518 0.66802320
## ptratio 0.28994558 -0.39167855 0.38324756 -0.121515174 0.18893268
## black -0.38506394 0.17552032 -0.35697654 0.048788485 -0.38005064
## lstat 0.45562148 -0.41299457 0.60379972 -0.053929298 0.59087892
## medv -0.38830461 0.36044534 -0.48372516 0.175260177 -0.42732077
## rm age dis rad tax
## crim -0.21924670 0.35273425 -0.37967009 0.625505145 0.58276431
## zn 0.31199059 -0.56953734 0.66440822 -0.311947826 -0.31456332
## indus -0.39167585 0.64477851 -0.70802699 0.595129275 0.72076018
## chas 0.09125123 0.08651777 -0.09917578 -0.007368241 -0.03558652
## nox -0.30218819 0.73147010 -0.76923011 0.611440563 0.66802320
## rm 1.00000000 -0.24026493 0.20524621 -0.209846668 -0.29204783
## age -0.24026493 1.00000000 -0.74788054 0.456022452 0.50645559
## dis 0.20524621 -0.74788054 1.00000000 -0.494587930 -0.53443158
## rad -0.20984667 0.45602245 -0.49458793 1.000000000 0.91022819
## tax -0.29204783 0.50645559 -0.53443158 0.910228189 1.00000000
## ptratio -0.35550149 0.26151501 -0.23247054 0.464741179 0.46085304
## black 0.12806864 -0.27353398 0.29151167 -0.444412816 -0.44180801
## lstat -0.61380827 0.60233853 -0.49699583 0.488676335 0.54399341
## medv 0.69535995 -0.37695457 0.24992873 -0.381626231 -0.46853593
## ptratio black lstat medv
## crim 0.2899456 -0.38506394 0.4556215 -0.3883046
## zn -0.3916785 0.17552032 -0.4129946 0.3604453
## indus 0.3832476 -0.35697654 0.6037997 -0.4837252
## chas -0.1215152 0.04878848 -0.0539293 0.1752602
## nox 0.1889327 -0.38005064 0.5908789 -0.4273208
## rm -0.3555015 0.12806864 -0.6138083 0.6953599
## age 0.2615150 -0.27353398 0.6023385 -0.3769546
## dis -0.2324705 0.29151167 -0.4969958 0.2499287
## rad 0.4647412 -0.44441282 0.4886763 -0.3816262
## tax 0.4608530 -0.44180801 0.5439934 -0.4685359
## ptratio 1.0000000 -0.17738330 0.3740443 -0.5077867
## black -0.1773833 1.00000000 -0.3660869 0.3334608
## lstat 0.3740443 -0.36608690 1.0000000 -0.7376627
## medv -0.5077867 0.33346082 -0.7376627 1.0000000
# better graphics
library(corrplot) # install.packages("corrplot")
corrplot(cor(Boston), method="number") # cor matrix with number and color scheme
#- Scatter plots
layout( matrix(1:6, 2, 3) )
plot(crim, medv)
plot(zn, medv)
plot(indus, medv)
plot(nox, medv)
plot(rm, medv)
plot(age, medv)
layout( matrix(1:6, 2, 3) )
plot(dis, medv)
plot(rad, medv)
plot(tax, medv)
plot(ptratio, medv)
plot(black, medv)
plot(lstat, medv)
##
## Call:
## lm(formula = medv ~ ., data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.595 -2.730 -0.518 1.777 26.199
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.646e+01 5.103e+00 7.144 3.28e-12 ***
## crim -1.080e-01 3.286e-02 -3.287 0.001087 **
## zn 4.642e-02 1.373e-02 3.382 0.000778 ***
## indus 2.056e-02 6.150e-02 0.334 0.738288
## chas 2.687e+00 8.616e-01 3.118 0.001925 **
## nox -1.777e+01 3.820e+00 -4.651 4.25e-06 ***
## rm 3.810e+00 4.179e-01 9.116 < 2e-16 ***
## age 6.922e-04 1.321e-02 0.052 0.958229
## dis -1.476e+00 1.995e-01 -7.398 6.01e-13 ***
## rad 3.060e-01 6.635e-02 4.613 5.07e-06 ***
## tax -1.233e-02 3.760e-03 -3.280 0.001112 **
## ptratio -9.527e-01 1.308e-01 -7.283 1.31e-12 ***
## black 9.312e-03 2.686e-03 3.467 0.000573 ***
## lstat -5.248e-01 5.072e-02 -10.347 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.745 on 492 degrees of freedom
## Multiple R-squared: 0.7406, Adjusted R-squared: 0.7338
## F-statistic: 108.1 on 13 and 492 DF, p-value: < 2.2e-16
From Model 1, remove age
##
## Call:
## lm(formula = medv ~ . - age, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.6054 -2.7313 -0.5188 1.7601 26.2243
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.436927 5.080119 7.172 2.72e-12 ***
## crim -0.108006 0.032832 -3.290 0.001075 **
## zn 0.046334 0.013613 3.404 0.000719 ***
## indus 0.020562 0.061433 0.335 0.737989
## chas 2.689026 0.859598 3.128 0.001863 **
## nox -17.713540 3.679308 -4.814 1.97e-06 ***
## rm 3.814394 0.408480 9.338 < 2e-16 ***
## dis -1.478612 0.190611 -7.757 5.03e-14 ***
## rad 0.305786 0.066089 4.627 4.75e-06 ***
## tax -0.012329 0.003755 -3.283 0.001099 **
## ptratio -0.952211 0.130294 -7.308 1.10e-12 ***
## black 0.009321 0.002678 3.481 0.000544 ***
## lstat -0.523852 0.047625 -10.999 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.74 on 493 degrees of freedom
## Multiple R-squared: 0.7406, Adjusted R-squared: 0.7343
## F-statistic: 117.3 on 12 and 493 DF, p-value: < 2.2e-16
Remove indus
##
## Call:
## lm(formula = medv ~ . - age - indus, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.5984 -2.7386 -0.5046 1.7273 26.2373
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.341145 5.067492 7.171 2.73e-12 ***
## crim -0.108413 0.032779 -3.307 0.001010 **
## zn 0.045845 0.013523 3.390 0.000754 ***
## chas 2.718716 0.854240 3.183 0.001551 **
## nox -17.376023 3.535243 -4.915 1.21e-06 ***
## rm 3.801579 0.406316 9.356 < 2e-16 ***
## dis -1.492711 0.185731 -8.037 6.84e-15 ***
## rad 0.299608 0.063402 4.726 3.00e-06 ***
## tax -0.011778 0.003372 -3.493 0.000521 ***
## ptratio -0.946525 0.129066 -7.334 9.24e-13 ***
## black 0.009291 0.002674 3.475 0.000557 ***
## lstat -0.522553 0.047424 -11.019 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.736 on 494 degrees of freedom
## Multiple R-squared: 0.7406, Adjusted R-squared: 0.7348
## F-statistic: 128.2 on 11 and 494 DF, p-value: < 2.2e-16
## List of 12
## $ coefficients : Named num [1:12] 36.3411 -0.1084 0.0458 2.7187 -17.376 ...
## ..- attr(*, "names")= chr [1:12] "(Intercept)" "crim" "zn" "chas" ...
## $ residuals : Named num [1:506] -6.12 -3.4 4.17 4.75 8.22 ...
## ..- attr(*, "names")= chr [1:506] "1" "2" "3" "4" ...
## $ effects : Named num [1:506] -506.9 -80.3 59.6 35.1 -39.9 ...
## ..- attr(*, "names")= chr [1:506] "(Intercept)" "crim" "zn" "chas" ...
## $ rank : int 12
## $ fitted.values: Named num [1:506] 30.1 25 30.5 28.6 28 ...
## ..- attr(*, "names")= chr [1:506] "1" "2" "3" "4" ...
## $ assign : int [1:12] 0 1 2 3 4 5 6 7 8 9 ...
## $ qr :List of 5
## ..$ qr : num [1:506, 1:12] -22.4944 0.0445 0.0445 0.0445 0.0445 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:506] "1" "2" "3" "4" ...
## .. .. ..$ : chr [1:12] "(Intercept)" "crim" "zn" "chas" ...
## .. ..- attr(*, "assign")= int [1:12] 0 1 2 3 4 5 6 7 8 9 ...
## ..$ qraux: num [1:12] 1.04 1.02 1.03 1.01 1.05 ...
## ..$ pivot: int [1:12] 1 2 3 4 5 6 7 8 9 10 ...
## ..$ tol : num 1e-07
## ..$ rank : int 12
## ..- attr(*, "class")= chr "qr"
## $ df.residual : int 494
## $ xlevels : Named list()
## $ call : language lm(formula = medv ~ . - age - indus, data = Boston)
## $ terms :Classes 'terms', 'formula' language medv ~ (crim + zn + indus + chas + nox + rm + age + dis + rad + tax + ptratio + black + lstat) - age - indus
## .. ..- attr(*, "variables")= language list(medv, crim, zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, black, lstat)
## .. ..- attr(*, "factors")= int [1:14, 1:11] 0 1 0 0 0 0 0 0 0 0 ...
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:14] "medv" "crim" "zn" "indus" ...
## .. .. .. ..$ : chr [1:11] "crim" "zn" "chas" "nox" ...
## .. ..- attr(*, "term.labels")= chr [1:11] "crim" "zn" "chas" "nox" ...
## .. ..- attr(*, "order")= int [1:11] 1 1 1 1 1 1 1 1 1 1 ...
## .. ..- attr(*, "intercept")= int 1
## .. ..- attr(*, "response")= int 1
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. ..- attr(*, "predvars")= language list(medv, crim, zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, black, lstat)
## .. ..- attr(*, "dataClasses")= Named chr [1:14] "numeric" "numeric" "numeric" "numeric" ...
## .. .. ..- attr(*, "names")= chr [1:14] "medv" "crim" "zn" "indus" ...
## $ model :'data.frame': 506 obs. of 14 variables:
## ..$ medv : num [1:506] 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
## ..$ crim : num [1:506] 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## ..$ zn : num [1:506] 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## ..$ indus : num [1:506] 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## ..$ chas : int [1:506] 0 0 0 0 0 0 0 0 0 0 ...
## ..$ nox : num [1:506] 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## ..$ rm : num [1:506] 6.58 6.42 7.18 7 7.15 ...
## ..$ age : num [1:506] 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## ..$ dis : num [1:506] 4.09 4.97 4.97 6.06 6.06 ...
## ..$ rad : int [1:506] 1 2 2 3 3 3 5 5 5 5 ...
## ..$ tax : num [1:506] 296 242 242 222 222 222 311 311 311 311 ...
## ..$ ptratio: num [1:506] 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## ..$ black : num [1:506] 397 397 393 395 397 ...
## ..$ lstat : num [1:506] 4.98 9.14 4.03 2.94 5.33 ...
## ..- attr(*, "terms")=Classes 'terms', 'formula' language medv ~ (crim + zn + indus + chas + nox + rm + age + dis + rad + tax + ptratio + black + lstat) - age - indus
## .. .. ..- attr(*, "variables")= language list(medv, crim, zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, black, lstat)
## .. .. ..- attr(*, "factors")= int [1:14, 1:11] 0 1 0 0 0 0 0 0 0 0 ...
## .. .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. .. ..$ : chr [1:14] "medv" "crim" "zn" "indus" ...
## .. .. .. .. ..$ : chr [1:11] "crim" "zn" "chas" "nox" ...
## .. .. ..- attr(*, "term.labels")= chr [1:11] "crim" "zn" "chas" "nox" ...
## .. .. ..- attr(*, "order")= int [1:11] 1 1 1 1 1 1 1 1 1 1 ...
## .. .. ..- attr(*, "intercept")= int 1
## .. .. ..- attr(*, "response")= int 1
## .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. .. ..- attr(*, "predvars")= language list(medv, crim, zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, black, lstat)
## .. .. ..- attr(*, "dataClasses")= Named chr [1:14] "numeric" "numeric" "numeric" "numeric" ...
## .. .. .. ..- attr(*, "names")= chr [1:14] "medv" "crim" "zn" "indus" ...
## - attr(*, "class")= chr "lm"
##
## Call:
## lm(formula = medv ~ crim + zn + indus, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.070 -4.733 -1.585 2.648 32.423
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.39465 0.86484 31.676 < 2e-16 ***
## crim -0.24863 0.04391 -5.662 2.52e-08 ***
## zn 0.05850 0.01750 3.344 0.000889 ***
## indus -0.41558 0.06378 -6.515 1.77e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.752 on 502 degrees of freedom
## Multiple R-squared: 0.2937, Adjusted R-squared: 0.2895
## F-statistic: 69.59 on 3 and 502 DF, p-value: < 2.2e-16
### 1 Load Boston data from package MASS
library(MASS) # install.packages("MASS")
data(Boston) # load data
head(Boston, 10) # see first 10 lines of data
class(Boston) # it is a data frame
dim(Boston) # (num of row, num of col)
names(Boston) # column names
?Boston # see explanation for variables
#1 crim per capita crime rate by town.
#2 zn proportion of residential land zoned for lots over 25,000 sq.ft.
#3 indus proportion of non-retail business acres per town.
#4 chas Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).
#5 nox nitrogen oxides concentration (parts per 10 million).
#6 rm average number of rooms per dwelling.
#7 age proportion of owner-occupied units built prior to 1940.
#8 dis weighted mean of distances to five Boston employment centres.
#9 rad index of accessibility to radial highways.
#10 tax full-value property-tax rate per $10,000.
#11 ptratio pupil-teacher ratio by town.
#12 black 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town.
#13 lstat lower status of the population (percent).
#14 medv (response Y) median value of owner-occupied homes in $1000s.
# 13 lstat = lower status of the population (percent)
# Proportion of population that is lower status
# = 1/2 (proportion of adults without,
# some high school education and proportion of male
# workers classified as laborers). The logarithmic
# specification implies that socioeconomic status
# distinctions mean more in the upper brackets of
# society than in the lower classes. Source: 1970 U. S. Census
### 2. Attach() columns
plot(Boston$age, Boston$medv) # plot age vs medv
attach(Boston) # now you can use their colname as a variable
plot(age, medv)
### 3. Correlaiton Plots
pairs(Boston) # shows scatterplot matrix (large)
cor(Boston) # raw correlation matrix
# better graphics
library(corrplot) # install.packages("corrplot")
corrplot(cor(Boston), method="number") # cor matrix with number and color scheme
corrplot(cor(Boston)) # cor matrix with color scheme
### 4. Scatter Plots and Histogram
# Scatter plots
layout( matrix(1:6, 2, 3) )
plot(crim, medv)
plot(zn, medv)
plot(indus, medv)
plot(nox, medv)
plot(rm, medv)
plot(age, medv)
layout( matrix(1:6, 2, 3) )
plot(dis, medv)
plot(rad, medv)
plot(tax, medv)
plot(ptratio, medv)
plot(black, medv)
plot(lstat, medv)
layout(1)
# Histogram of the respoonse variable
hist(medv)
### Model 1: Regression with all variables
Reg01 <- lm(medv ~ . , data=Boston)
summary(Reg01)
plot(Reg01)
### Model 2: Backward stepwise regression
# From Model 1, remove age
Reg02 <- lm(medv ~ . -age, data=Boston)
summary(Reg02)
# Remove indus
Reg03 <- lm(medv ~ . -age-indus, data=Boston)
summary(Reg03)
plot(Reg03)
# Looking at the result
str(Reg03) # click on environment tab in Rstudio
plot(Reg03$fitted, Reg03$residuals)
### Model 3: Forward Selection
Reg04 <- lm(medv ~ crim + zn + indus, data=Boston)
summary(Reg04)