###
###
###  Regression - Boston Data
###
###
#################################
#
# Walk-through of multiple linear regression on the Boston housing data
# shipped with the MASS package: inspect the data, look at correlations
# and scatter plots against the response `medv`, then fit and compare
# several lm() models.

library(MASS)   # install.packages("MASS")
data(Boston)

Boston
class(Boston)
dim(Boston)
head(Boston)
head(Boston, 100)
names(Boston)

?Boston   # see explanation for variables

#  1 crim     per capita crime rate by town.
#  2 zn       proportion of residential land zoned for lots over
#             25,000 sq.ft.
#  3 indus    proportion of non-retail business acres per town.
#  4 chas     Charles River dummy variable (= 1 if tract bounds river;
#             0 otherwise).
#  5 nox      nitrogen oxides concentration (parts per 10 million).
#  6 rm       average number of rooms per dwelling.
#  7 age      proportion of owner-occupied units built prior to 1940.
#  8 dis      weighted mean of distances to five Boston employment centres.
#  9 rad      index of accessibility to radial highways.
# 10 tax      full-value property-tax rate per $10,000.
# 11 ptratio  pupil-teacher ratio by town.
# 12 black    1000(Bk - 0.63)^2 where Bk is the proportion of blacks
#             by town.
# 13 lstat    lower status of the population (percent).
# 14 medv     median value of owner-occupied homes in $1000s.

# 13 lstat = lower status of the population (percent)
#   Proportion of population that is lower status
#   = 1/2 (proportion of adults without some high school education
#          and proportion of male workers classified as laborers).
#   The logarithmic specification implies that socioeconomic status
#   distinctions mean more in the upper brackets of society than in
#   the lower classes.
#   Source: 1970 U. S. Census

library(tidyverse)   # install.packages("tidyverse")

Boston <- as_tibble(Boston)
class(Boston)
is_tibble(Boston)
is.data.frame(Boston)

Boston
print(Boston, n = 100)

# Browse the data in a spreadsheet-style window. View() is used rather
# than fix() because fix() assigns the (possibly edited) data back to
# `Boston`, so a stray edit would silently change the data being modelled.
# The viewer needs an interactive session, hence the guard.
if (interactive()) {
  View(Boston)
}

Boston[, 1]
Boston[, "zn"]
Boston$age

# attach() puts the columns of Boston on the search path, so they can be
# referenced by their column names alone; the plot() and hist() calls
# below rely on this.
# NOTE(review): attach() is generally discouraged (attached names can be
# masked or go stale); kept here because code further down the script may
# depend on the bare column names.
attach(Boston)
age
indus


#- Correlation plots
pairs(Boston)   # shows scatterplot matrix (large)

cor(Boston)

library(corrplot)   # install.packages("corrplot")
corrplot(cor(Boston), method = "number")
corrplot(cor(Boston))


#- Scatter plots
# 13 predictors are plotted against medv, so the grid needs at least 13
# panels; a 3 x 5 layout keeps them all on one page (a 3 x 4 grid would
# push the last plot onto a new page).
layout(matrix(1:15, nrow = 3, ncol = 5))
plot(crim, medv)
plot(zn, medv)
plot(indus, medv)
plot(nox, medv)
plot(rm, medv)
plot(age, medv)
plot(dis, medv)
plot(rad, medv)
plot(tax, medv)
plot(ptratio, medv)
plot(black, medv)
plot(lstat, medv)
plot(chas, medv)
layout(1)   # back to a single panel per page


#- Histogram of the response variable
hist(medv)

#  1 crim
#  2 zn
#  3 indus
#  4 chas
#  5 nox
#  6 rm
#  7 age
#  8 dis
#  9 rad
# 10 tax
# 11 ptratio
# 12 black
# 13 lstat
# 14 medv   <- response (Y)


#- Regression with all variables
Reg01 <- lm(medv ~ ., data = Boston)
summary(Reg01)
plot(Reg01)   # diagnostic plots for the fitted model


#- Backward stepwise regression
# Done by hand: drop `age` first, then `age` and `indus`.
Reg02 <- lm(medv ~ . - age, data = Boston)
summary(Reg02)

Reg03 <- lm(medv ~ . - age - indus, data = Boston)
summary(Reg03)
plot(Reg03)

# A small model using only crim, zn and indus.
Reg04 <- lm(medv ~ crim + zn + indus, data = Boston)
summary(Reg04)


#--- Looking at the result
str(Reg03)   # or click on the Environment tab in RStudio

Reg03$residuals
plot(Reg03$residuals)