# Setting seed for reproducibility
set.seed(4765)
Split the data set into a training set and a test set.
# Working Directory
# setwd("/")
# Clear Environment
# rm(list=ls())
# Load Environment from the given .RData file
load("Residen.RData")
# NA visualization - No NAs found
visdat::vis_dat(Residen)
# Random train/test split at a 0.8 ratio
row.number <- sample(1:nrow(Residen), 0.8 * nrow(Residen))
train <- Residen[row.number, ]
test <- Residen[-row.number, ]
# Dimensions for Train/Test Data
dim(train)
## [1] 297 109
dim(test)
## [1] 75 109
Fit a linear regression model on the training set to explain the "actual sales price" (V104) in terms of the other variables, excluding the variable "actual construction costs" (V105).
# Create a linear regression model with the training data, excluding variable V105
lm_model <- lm(V104 ~ . - V105, data = train, trace = FALSE)
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'trace' will be disregarded
summary(lm_model)
##
## Call:
## lm(formula = V104 ~ . - V105, data = train, trace = FALSE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -771.57 -44.11 -1.96 43.07 609.42
##
## Coefficients: (34 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.650e+04 9.427e+04 0.281 0.778849
## `START YEAR` -6.724e+02 1.490e+03 -0.451 0.652265
## `START QUARTER` 2.396e+03 3.809e+03 0.629 0.529977
## `COMPLETION YEAR` 1.841e+02 1.853e+01 9.937 < 2e-16 ***
## `COMPLETION QUARTER` 6.177e+01 9.978e+00 6.191 2.84e-09 ***
## V1 -2.958e+00 2.601e+00 -1.137 0.256656
## V2 9.140e-02 2.698e-02 3.387 0.000835 ***
## V3 -2.871e-01 7.693e-02 -3.732 0.000241 ***
## V4 -5.338e-02 4.008e-02 -1.332 0.184258
## V5 3.127e-01 3.786e-01 0.826 0.409643
## V6 -1.672e-02 6.615e-02 -0.253 0.800650
## V7 NA NA NA NA
## V8 1.186e+00 1.897e-02 62.491 < 2e-16 ***
## V9 -4.007e-01 3.321e-01 -1.207 0.228895
## V10 5.817e+02 8.406e+02 0.692 0.489701
## V11 1.414e+02 2.891e+02 0.489 0.625218
## V12 -1.056e+02 1.398e+02 -0.756 0.450589
## V13 -2.322e-02 1.682e-02 -1.381 0.168711
## V14 -3.036e-01 7.375e-01 -0.412 0.680976
## V15 -1.634e+01 5.228e+01 -0.313 0.754893
## V16 4.597e+00 8.029e+00 0.573 0.567494
## V17 1.035e-01 8.125e-02 1.274 0.203890
## V18 -3.833e+02 3.840e+02 -0.998 0.319214
## V19 4.210e+00 3.701e+00 1.137 0.256569
## V20 -1.195e+00 1.964e+00 -0.609 0.543254
## V21 2.670e-01 2.933e-01 0.910 0.363693
## V22 -1.537e+00 1.458e+00 -1.054 0.293188
## V23 2.620e+02 2.174e+02 1.205 0.229391
## V24 -8.067e+02 8.678e+02 -0.930 0.353591
## V25 -3.900e-01 6.348e-01 -0.614 0.539567
## V26 2.158e-01 3.823e-01 0.564 0.572990
## V27 4.468e-03 3.771e-03 1.185 0.237317
## V28 2.393e-01 1.495e-01 1.600 0.110927
## V29 9.651e+01 3.583e+02 0.269 0.787922
## V30 7.407e+01 6.010e+01 1.232 0.219081
## V31 -4.847e+02 3.788e+02 -1.279 0.202066
## V32 8.735e-02 8.382e-02 1.042 0.298513
## V33 5.575e-01 8.843e-01 0.630 0.529088
## V34 -1.359e+02 1.477e+02 -0.920 0.358492
## V35 -2.145e-02 4.029e+00 -0.005 0.995758
## V36 -2.374e-01 2.906e-01 -0.817 0.414756
## V37 6.568e+02 4.550e+02 1.443 0.150314
## V38 5.177e+00 3.935e+00 1.316 0.189590
## V39 -3.132e+00 3.527e+00 -0.888 0.375499
## V40 -8.322e-01 9.094e-01 -0.915 0.361121
## V41 -7.209e-01 9.953e-01 -0.724 0.469632
## V42 6.733e+01 2.279e+02 0.295 0.767902
## V43 7.739e+02 7.874e+02 0.983 0.326775
## V44 -2.605e-01 4.986e-01 -0.522 0.601868
## V45 3.361e-01 5.078e-01 0.662 0.508758
## V46 5.905e-03 6.011e-03 0.982 0.327002
## V47 -3.114e-02 3.361e-01 -0.093 0.926269
## V48 -1.075e+03 9.981e+02 -1.077 0.282495
## V49 -8.348e+01 7.521e+01 -1.110 0.268195
## V50 1.192e+03 1.760e+03 0.677 0.498812
## V51 -1.970e-02 6.288e-02 -0.313 0.754315
## V52 5.028e-01 4.148e-01 1.212 0.226695
## V53 8.447e+01 5.892e+01 1.433 0.153123
## V54 1.291e+01 1.176e+01 1.098 0.273582
## V55 -1.663e-01 2.653e-01 -0.627 0.531355
## V56 -1.107e+03 1.273e+03 -0.870 0.385289
## V57 4.766e+00 4.227e+00 1.128 0.260677
## V58 -4.529e+00 3.881e+00 -1.167 0.244486
## V59 -4.639e-01 3.026e-01 -1.533 0.126600
## V60 -1.156e-04 2.034e-01 -0.001 0.999547
## V61 2.342e+01 1.960e+02 0.120 0.904974
## V62 1.897e+01 3.830e+02 0.050 0.960532
## V63 1.965e-02 1.709e-01 0.115 0.908573
## V64 1.572e-01 3.227e-01 0.487 0.626623
## V65 -5.320e-03 3.429e-03 -1.552 0.122192
## V66 -7.414e-01 7.363e-01 -1.007 0.315061
## V67 2.182e+02 2.282e+02 0.956 0.339999
## V68 1.643e+02 1.454e+02 1.130 0.259564
## V69 -6.309e+02 9.486e+02 -0.665 0.506653
## V70 -8.058e-02 7.074e-02 -1.139 0.255848
## V71 NA NA NA NA
## V72 NA NA NA NA
## V73 NA NA NA NA
## V74 NA NA NA NA
## V75 NA NA NA NA
## V76 NA NA NA NA
## V77 NA NA NA NA
## V78 NA NA NA NA
## V79 NA NA NA NA
## V80 NA NA NA NA
## V81 NA NA NA NA
## V82 NA NA NA NA
## V83 NA NA NA NA
## V84 NA NA NA NA
## V85 NA NA NA NA
## V86 NA NA NA NA
## V87 NA NA NA NA
## V88 NA NA NA NA
## V89 NA NA NA NA
## V90 NA NA NA NA
## V91 NA NA NA NA
## V92 NA NA NA NA
## V93 NA NA NA NA
## V94 NA NA NA NA
## V95 NA NA NA NA
## V96 NA NA NA NA
## V97 NA NA NA NA
## V98 NA NA NA NA
## V99 NA NA NA NA
## V100 NA NA NA NA
## V101 NA NA NA NA
## V102 NA NA NA NA
## V103 NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 142.6 on 223 degrees of freedom
## Multiple R-squared: 0.9884, Adjusted R-squared: 0.9846
## F-statistic: 261 on 73 and 223 DF, p-value: < 2.2e-16
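The 34 coefficients reported as NA above come from a rank-deficient design matrix: some predictors are exact linear combinations of others. As a small sketch (not part of the original output), the aliased terms can be listed directly from the fitted model:
# Predictors whose coefficients could not be estimated (aliased terms)
aliased <- names(coef(lm_model))[is.na(coef(lm_model))]
length(aliased)  # 34, matching the summary above
aliased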
par(mfrow=c(2,2))
plot(lm_model)
## Warning: not plotting observations with leverage one:
## 20, 23, 35, 46, 53, 84, 156, 267, 272
# Predict the actual sales price on the test set using the linear regression model
lm_Predicted <- predict(lm_model, test)
## Warning in predict.lm(lm_model, test): prediction from a rank-deficient fit may
## be misleading
# Calculating the test RMSE of the linear regression model
lm_rmse <- sqrt(mean((test$V104 - lm_Predicted)^2))
c("Linear Regression Model RMSE" = lm_rmse)
## Linear Regression Model RMSE
## 988.064
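The same RMSE formula is reused for every model below; a small helper function (a sketch, not in the original code) keeps those computations consistent:
# Root mean squared error between observed and predicted values
rmse <- function(actual, predicted) {
  sqrt(mean((actual - predicted)^2))
}
# e.g. rmse(test$V104, lm_Predicted) reproduces lm_rmse above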
Fit a linear regression model using stepwise selection on the training set.
library(MASS)
# Null model (intercept only)
null_model <- lm(V104 ~ 1, data = train)
# summary(null_model)
# Upper model (all predictors)
upper_model <- lm(V104 ~ ., data = train)
# summary(upper_model)
# Linear regression with stepwise selection
lm_stepwise <- stepAIC(null_model, direction = "both", trace = FALSE,
                       scope = list(upper = upper_model, lower = null_model))
summary(lm_stepwise)
##
## Call:
## lm(formula = V104 ~ V8 + V7 + V55 + V105 + V5 + V72 + V99 + V19 +
## V84 + V65 + V86 + `COMPLETION QUARTER` + V1 + V23 + V35 +
## V41 + V68, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -822.99 -47.88 -0.47 41.95 644.01
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.719e+01 4.945e+01 -1.561 0.119674
## V8 1.185e+00 1.522e-02 77.838 < 2e-16 ***
## V7 1.801e+01 5.687e+00 3.166 0.001717 **
## V55 2.372e-03 9.168e-04 2.587 0.010191 *
## V105 1.742e+00 2.859e-01 6.093 3.67e-09 ***
## V5 -2.107e+00 4.284e-01 -4.919 1.49e-06 ***
## V72 -1.307e+01 1.151e+00 -11.355 < 2e-16 ***
## V99 -1.177e+01 4.735e+00 -2.485 0.013557 *
## V19 -4.041e-01 9.710e-02 -4.161 4.22e-05 ***
## V84 5.106e-04 1.371e-04 3.725 0.000236 ***
## V65 -5.665e-04 1.811e-04 -3.128 0.001944 **
## V86 9.761e+00 2.997e+00 3.257 0.001265 **
## `COMPLETION QUARTER` 1.249e+01 6.910e+00 1.807 0.071767 .
## V1 -3.531e+00 1.983e+00 -1.781 0.076074 .
## V23 2.815e+01 4.293e+00 6.558 2.64e-10 ***
## V35 -1.822e-01 9.929e-02 -1.835 0.067594 .
## V41 -3.688e-02 1.035e-02 -3.564 0.000430 ***
## V68 -4.913e+00 2.847e+00 -1.726 0.085498 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 129.6 on 279 degrees of freedom
## Multiple R-squared: 0.988, Adjusted R-squared: 0.9873
## F-statistic: 1355 on 17 and 279 DF, p-value: < 2.2e-16
# Prediction on the test set
lm_stepwise_pred <- predict(lm_stepwise, test)

# Calculating the test RMSE of the stepwise linear regression model
lm_step_rmse <- sqrt(mean((test$V104 - lm_stepwise_pred)^2))
c(Step_RMSE = lm_step_rmse)
## Step_RMSE
## 179.9235
Fit a linear regression model using ridge regression on the training set, with λ chosen by cross-validation.
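For reference, ridge regression minimises the penalised least-squares criterion below; λ controls how strongly the coefficients are shrunk towards zero and is chosen here by cross-validation.

$$
\hat{\beta}^{\text{ridge}} = \arg\min_{\beta}\; \sum_{i=1}^{n}\Big(y_i - \beta_0 - \sum_{j=1}^{p}\beta_j x_{ij}\Big)^2 + \lambda \sum_{j=1}^{p}\beta_j^2
$$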
# Ridge Regression
library(dplyr)
library(glmnet)
# Create model matrices and response vectors for the regression equation
x <- model.matrix(V104 ~ . - V105, Residen)[, -1]
y <- Residen %>%
  dplyr::select(V104) %>%
  unlist() %>%
  as.numeric()
x_train <- model.matrix(V104 ~ . - V105, train)[, -1]
y_train <- train %>%
  dplyr::select(V104) %>%
  unlist() %>%
  as.numeric()
x_test <- model.matrix(V104 ~ . - V105, test)[, -1]
y_test <- test %>%
  dplyr::select(V104) %>%
  unlist() %>%
  as.numeric()
# Grid of candidate lambda values
grid <- 10^seq(10, -2, length = 100)
plot(grid)
# Fit ridge regression (alpha = 0) over the lambda grid
ridge_mod <- glmnet(x_train, y_train, alpha = 0, lambda = grid, thresh = 1e-12, trace = FALSE)
## Warning: from glmnet Fortran code (error code -78); Convergence for 78th lambda
## value not reached after maxit=100000 iterations; solutions for larger lambdas
## returned
dim(coef(ridge_mod))
## [1] 108 77
plot(ridge_mod)
cv.out <- cv.glmnet(x_train, y_train, alpha = 0, trace = FALSE) # Cross-validate the ridge regression model on the training data
bestlam <- cv.out$lambda.min # Select the lambda that minimises the cross-validated MSE
c("Best Lambda for Ridge Regression" = bestlam)
## Best Lambda for Ridge Regression
## 111.7434
plot(cv.out) # Draw plot of cross-validated MSE as a function of lambda
log(bestlam)
## [1] 4.716205
ridge_pred <- predict(ridge_mod, s = bestlam, newx = x_test) # Use best lambda to predict on the test data
rr_rmse <- sqrt(mean((y_test - ridge_pred)^2))
c(ridge_RMSE = rr_rmse)
## ridge_RMSE
## 275.9933
Fit a linear regression model using the lasso on the training set, with λ chosen by cross-validation, and report the test RMSE along with the number of non-zero coefficient estimates.
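The lasso uses the same penalised criterion as ridge regression but with an ℓ1 penalty, which can shrink coefficients exactly to zero and therefore also performs variable selection.

$$
\hat{\beta}^{\text{lasso}} = \arg\min_{\beta}\; \sum_{i=1}^{n}\Big(y_i - \beta_0 - \sum_{j=1}^{p}\beta_j x_{ij}\Big)^2 + \lambda \sum_{j=1}^{p}\lvert\beta_j\rvert
$$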
# Lasso
lasso_mod <- glmnet(x_train, y_train, alpha = 1, lambda = grid, trace = FALSE) # Fit lasso model on training data
plot(lasso_mod) # Draw plot of the coefficient paths
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
cv.out <- cv.glmnet(x_train, y_train, alpha = 1, trace = FALSE) # Cross-validate the lasso model on the training data
plot(cv.out)
bestlam <- cv.out$lambda.min # Select the lambda that minimises the cross-validated MSE
c("Best Lambda for lasso" = bestlam)
## Best Lambda for lasso
## 2.64216
lasso_pred <- predict(lasso_mod, s = bestlam, newx = x_test) # Use best lambda to predict on the test data
lo_rmse <- sqrt(mean((y_test - lasso_pred)^2))
c(lasso_RMSE = lo_rmse)
## lasso_RMSE
## 170.499
out <- glmnet(x, y, alpha = 1, lambda = grid, trace = FALSE) # Fit lasso model on the full dataset
lasso_coeff <- predict(out, type = "coefficients", s = bestlam)[1:108, ] # Coefficients at the lambda chosen by CV
# lasso_coeff
lasso_coeff[lasso_coeff != 0] # Display only the non-zero coefficients
## (Intercept) `START QUARTER` `COMPLETION QUARTER`
## -3.740964e+02 -9.003399e-02 1.519723e+01
## V1 V2 V3
## -4.888822e+00 1.807806e-02 -5.880665e-02
## V4 V5 V6
## 2.527186e-02 -1.058666e-01 5.577437e-05
## V7 V8 V9
## 3.603639e+01 1.174640e+00 7.407918e-03
## V16 V17 V21
## 9.596553e-02 3.315491e-03 3.354576e-03
## V27 V31 V35
## 2.651942e-06 1.127268e+00 1.375261e-01
## V36 V37 V44
## 1.757573e-03 4.965824e-01 2.241692e-02
## V47 V52 V54
## 2.621094e-03 -4.431157e-03 1.912921e-02
## V55 V66 V69
## 4.909184e-03 4.885161e-03 -1.412788e-02
## V71 V72 V74
## -7.024252e-03 -5.882857e+00 5.038468e-03
## V78 V83 V85
## 2.162572e-05 -2.047193e-05 5.780373e-03
## V92 V94 V97
## 1.047882e-03 1.515982e+01 4.927192e-03
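The number of non-zero coefficient estimates asked for above can be counted directly; a one-line sketch (added here, intercept excluded):
# Number of non-zero lasso coefficient estimates, excluding the intercept
sum(lasso_coeff != 0) - 1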
Test root mean squared error obtained by the different models:
rmse_t <- matrix(c(lm_rmse, lm_step_rmse, rr_rmse, lo_rmse))
rownames(rmse_t) <- c("Linear Regression", "Linear Regression (Stepwise)", "Ridge Regression", "LASSO")
colnames(rmse_t) <- c("Test RMSE")
as.table(rmse_t)
## Test RMSE
## Linear Regression 988.0640
## Linear Regression (Stepwise) 179.9235
## Ridge Regression 275.9933
## LASSO 170.4990
Based on the test metrics, the LASSO model gives the best performance, with a test RMSE of 170.4990. Stepwise linear regression also performs well, with a test RMSE of 179.9235.
# R-squared of the LASSO model on the test set
rss <- sum((lasso_pred - y_test)^2)    # residual sum of squares
tss <- sum((y_test - mean(y_test))^2)  # total sum of squares
rsq <- 1 - rss/tss
c("R-squared for test set in LASSO model" = rsq)
## R-squared for test set in LASSO model
## 0.9852549
This means that about 98.53% of the variation in V104 on the test set can be explained by the predictor variables. The plain linear regression performed poorly while the other three models performed much better; the most likely reason is the multicollinearity present among the predictor variables, which also explains the rank-deficient fit warning seen earlier.
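A quick way to see this multicollinearity directly (a sketch, assuming all predictors are numeric, as they are in this data set) is to look for near-perfectly correlated predictor pairs:
# Correlation matrix of the predictors (V104 and V105 excluded, as in the model formula)
pred_cor <- cor(train[, setdiff(names(train), c("V104", "V105"))])
# Count predictor pairs with |correlation| above 0.99
high_cor <- which(abs(pred_cor) > 0.99 & upper.tri(pred_cor), arr.ind = TRUE)
nrow(high_cor)
head(high_cor)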