# Setting seed for reproducibility
set.seed(4765)

Split the data set into a training set and a test set:

# Working Directory
# setwd("/")

# Clear Environment
# rm(list=ls()) 

# Load Environment from the given .RData file
load("Residen.RData")

# NA visualization - No NAs found
visdat::vis_dat(Residen)

# Random train/test split at an 80/20 ratio
row.number <- sample(1:nrow(Residen), 0.8 * nrow(Residen))
train <- Residen[row.number, ]
test <- Residen[-row.number, ]

# Dimensions for Train/Test Data
dim(train)
## [1] 297 109
dim(test)
## [1]  75 109

Linear regression model on the training set to explain the "actual sales price" (V104) in terms of the other variables, excluding the "actual construction costs" (V105):

# Fit a linear regression model on the training data, excluding V105
lm_model <- lm(V104 ~ . - V105, data = train)
summary(lm_model)
## 
## Call:
## lm(formula = V104 ~ . - V105, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -771.57  -44.11   -1.96   43.07  609.42 
## 
## Coefficients: (34 not defined because of singularities)
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           2.650e+04  9.427e+04   0.281 0.778849    
## `START YEAR`         -6.724e+02  1.490e+03  -0.451 0.652265    
## `START QUARTER`       2.396e+03  3.809e+03   0.629 0.529977    
## `COMPLETION YEAR`     1.841e+02  1.853e+01   9.937  < 2e-16 ***
## `COMPLETION QUARTER`  6.177e+01  9.978e+00   6.191 2.84e-09 ***
## V1                   -2.958e+00  2.601e+00  -1.137 0.256656    
## V2                    9.140e-02  2.698e-02   3.387 0.000835 ***
## V3                   -2.871e-01  7.693e-02  -3.732 0.000241 ***
## V4                   -5.338e-02  4.008e-02  -1.332 0.184258    
## V5                    3.127e-01  3.786e-01   0.826 0.409643    
## V6                   -1.672e-02  6.615e-02  -0.253 0.800650    
## V7                           NA         NA      NA       NA    
## V8                    1.186e+00  1.897e-02  62.491  < 2e-16 ***
## V9                   -4.007e-01  3.321e-01  -1.207 0.228895    
## V10                   5.817e+02  8.406e+02   0.692 0.489701    
## V11                   1.414e+02  2.891e+02   0.489 0.625218    
## V12                  -1.056e+02  1.398e+02  -0.756 0.450589    
## V13                  -2.322e-02  1.682e-02  -1.381 0.168711    
## V14                  -3.036e-01  7.375e-01  -0.412 0.680976    
## V15                  -1.634e+01  5.228e+01  -0.313 0.754893    
## V16                   4.597e+00  8.029e+00   0.573 0.567494    
## V17                   1.035e-01  8.125e-02   1.274 0.203890    
## V18                  -3.833e+02  3.840e+02  -0.998 0.319214    
## V19                   4.210e+00  3.701e+00   1.137 0.256569    
## V20                  -1.195e+00  1.964e+00  -0.609 0.543254    
## V21                   2.670e-01  2.933e-01   0.910 0.363693    
## V22                  -1.537e+00  1.458e+00  -1.054 0.293188    
## V23                   2.620e+02  2.174e+02   1.205 0.229391    
## V24                  -8.067e+02  8.678e+02  -0.930 0.353591    
## V25                  -3.900e-01  6.348e-01  -0.614 0.539567    
## V26                   2.158e-01  3.823e-01   0.564 0.572990    
## V27                   4.468e-03  3.771e-03   1.185 0.237317    
## V28                   2.393e-01  1.495e-01   1.600 0.110927    
## V29                   9.651e+01  3.583e+02   0.269 0.787922    
## V30                   7.407e+01  6.010e+01   1.232 0.219081    
## V31                  -4.847e+02  3.788e+02  -1.279 0.202066    
## V32                   8.735e-02  8.382e-02   1.042 0.298513    
## V33                   5.575e-01  8.843e-01   0.630 0.529088    
## V34                  -1.359e+02  1.477e+02  -0.920 0.358492    
## V35                  -2.145e-02  4.029e+00  -0.005 0.995758    
## V36                  -2.374e-01  2.906e-01  -0.817 0.414756    
## V37                   6.568e+02  4.550e+02   1.443 0.150314    
## V38                   5.177e+00  3.935e+00   1.316 0.189590    
## V39                  -3.132e+00  3.527e+00  -0.888 0.375499    
## V40                  -8.322e-01  9.094e-01  -0.915 0.361121    
## V41                  -7.209e-01  9.953e-01  -0.724 0.469632    
## V42                   6.733e+01  2.279e+02   0.295 0.767902    
## V43                   7.739e+02  7.874e+02   0.983 0.326775    
## V44                  -2.605e-01  4.986e-01  -0.522 0.601868    
## V45                   3.361e-01  5.078e-01   0.662 0.508758    
## V46                   5.905e-03  6.011e-03   0.982 0.327002    
## V47                  -3.114e-02  3.361e-01  -0.093 0.926269    
## V48                  -1.075e+03  9.981e+02  -1.077 0.282495    
## V49                  -8.348e+01  7.521e+01  -1.110 0.268195    
## V50                   1.192e+03  1.760e+03   0.677 0.498812    
## V51                  -1.970e-02  6.288e-02  -0.313 0.754315    
## V52                   5.028e-01  4.148e-01   1.212 0.226695    
## V53                   8.447e+01  5.892e+01   1.433 0.153123    
## V54                   1.291e+01  1.176e+01   1.098 0.273582    
## V55                  -1.663e-01  2.653e-01  -0.627 0.531355    
## V56                  -1.107e+03  1.273e+03  -0.870 0.385289    
## V57                   4.766e+00  4.227e+00   1.128 0.260677    
## V58                  -4.529e+00  3.881e+00  -1.167 0.244486    
## V59                  -4.639e-01  3.026e-01  -1.533 0.126600    
## V60                  -1.156e-04  2.034e-01  -0.001 0.999547    
## V61                   2.342e+01  1.960e+02   0.120 0.904974    
## V62                   1.897e+01  3.830e+02   0.050 0.960532    
## V63                   1.965e-02  1.709e-01   0.115 0.908573    
## V64                   1.572e-01  3.227e-01   0.487 0.626623    
## V65                  -5.320e-03  3.429e-03  -1.552 0.122192    
## V66                  -7.414e-01  7.363e-01  -1.007 0.315061    
## V67                   2.182e+02  2.282e+02   0.956 0.339999    
## V68                   1.643e+02  1.454e+02   1.130 0.259564    
## V69                  -6.309e+02  9.486e+02  -0.665 0.506653    
## V70                  -8.058e-02  7.074e-02  -1.139 0.255848    
## V71                          NA         NA      NA       NA    
## V72                          NA         NA      NA       NA    
## V73                          NA         NA      NA       NA    
## V74                          NA         NA      NA       NA    
## V75                          NA         NA      NA       NA    
## V76                          NA         NA      NA       NA    
## V77                          NA         NA      NA       NA    
## V78                          NA         NA      NA       NA    
## V79                          NA         NA      NA       NA    
## V80                          NA         NA      NA       NA    
## V81                          NA         NA      NA       NA    
## V82                          NA         NA      NA       NA    
## V83                          NA         NA      NA       NA    
## V84                          NA         NA      NA       NA    
## V85                          NA         NA      NA       NA    
## V86                          NA         NA      NA       NA    
## V87                          NA         NA      NA       NA    
## V88                          NA         NA      NA       NA    
## V89                          NA         NA      NA       NA    
## V90                          NA         NA      NA       NA    
## V91                          NA         NA      NA       NA    
## V92                          NA         NA      NA       NA    
## V93                          NA         NA      NA       NA    
## V94                          NA         NA      NA       NA    
## V95                          NA         NA      NA       NA    
## V96                          NA         NA      NA       NA    
## V97                          NA         NA      NA       NA    
## V98                          NA         NA      NA       NA    
## V99                          NA         NA      NA       NA    
## V100                         NA         NA      NA       NA    
## V101                         NA         NA      NA       NA    
## V102                         NA         NA      NA       NA    
## V103                         NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 142.6 on 223 degrees of freedom
## Multiple R-squared:  0.9884, Adjusted R-squared:  0.9846 
## F-statistic:   261 on 73 and 223 DF,  p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(lm_model)
## Warning: not plotting observations with leverage one:
##   20, 23, 35, 46, 53, 84, 156, 267, 272

# Predict actual sales prices on the test set with the linear regression model
lm_Predicted <- predict(lm_model, test)
## Warning in predict.lm(lm_model, test): prediction from a rank-deficient fit may
## be misleading
# Calculate the test RMSE of the linear regression model
lm_rmse <- sqrt(mean((test$V104 - lm_Predicted)^2))
c("Linear Regression Model RMSE" = lm_rmse)
## Linear Regression Model RMSE 
##                      988.064
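The 34 undefined coefficients and the rank-deficiency warning above both point to perfectly collinear predictors. A minimal sketch of how to confirm this (illustrative check, output not shown):

# Inspect the rank deficiency behind the NA coefficients and the
# "rank-deficient fit" warning (illustrative check, output not shown)
lm_model$rank               # Number of estimable coefficients
sum(is.na(coef(lm_model)))  # Number of aliased (NA) coefficients
# alias(lm_model)           # Uncomment to list the exact linear dependencies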

Linear regression model using stepwise selection on the training set:

library(MASS)

# Null model (intercept only)
null_model <- lm(V104 ~ 1, data = train)
# summary(null_model)

# Upper model (all predictors)
upper_model <- lm(V104 ~ ., data = train)
# summary(upper_model)

# Linear regression with stepwise selection in both directions
lm_stepwise <- stepAIC(null_model, direction = "both", trace = FALSE,
                       scope = list(upper = upper_model, lower = null_model))
summary(lm_stepwise)
## 
## Call:
## lm(formula = V104 ~ V8 + V7 + V55 + V105 + V5 + V72 + V99 + V19 + 
##     V84 + V65 + V86 + `COMPLETION QUARTER` + V1 + V23 + V35 + 
##     V41 + V68, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -822.99  -47.88   -0.47   41.95  644.01 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -7.719e+01  4.945e+01  -1.561 0.119674    
## V8                    1.185e+00  1.522e-02  77.838  < 2e-16 ***
## V7                    1.801e+01  5.687e+00   3.166 0.001717 ** 
## V55                   2.372e-03  9.168e-04   2.587 0.010191 *  
## V105                  1.742e+00  2.859e-01   6.093 3.67e-09 ***
## V5                   -2.107e+00  4.284e-01  -4.919 1.49e-06 ***
## V72                  -1.307e+01  1.151e+00 -11.355  < 2e-16 ***
## V99                  -1.177e+01  4.735e+00  -2.485 0.013557 *  
## V19                  -4.041e-01  9.710e-02  -4.161 4.22e-05 ***
## V84                   5.106e-04  1.371e-04   3.725 0.000236 ***
## V65                  -5.665e-04  1.811e-04  -3.128 0.001944 ** 
## V86                   9.761e+00  2.997e+00   3.257 0.001265 ** 
## `COMPLETION QUARTER`  1.249e+01  6.910e+00   1.807 0.071767 .  
## V1                   -3.531e+00  1.983e+00  -1.781 0.076074 .  
## V23                   2.815e+01  4.293e+00   6.558 2.64e-10 ***
## V35                  -1.822e-01  9.929e-02  -1.835 0.067594 .  
## V41                  -3.688e-02  1.035e-02  -3.564 0.000430 ***
## V68                  -4.913e+00  2.847e+00  -1.726 0.085498 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 129.6 on 279 degrees of freedom
## Multiple R-squared:  0.988,  Adjusted R-squared:  0.9873 
## F-statistic:  1355 on 17 and 279 DF,  p-value: < 2.2e-16
# Prediction on the test set
lm_stepwise_pred <- predict(lm_stepwise, test)

# Calculating the test RMSE of Linear regression model
lm_step_rmse <- sqrt(mean((test$V104 - lm_stepwise_pred)^2))
c(Step_RMSE = lm_step_rmse)
## Step_RMSE 
##  179.9235
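stepAIC() adds or drops one term at a time, keeping the change that most reduces the AIC, until no single change improves it. A quick comparison of the full and stepwise models on that criterion (illustrative, output not shown):

# Compare the full and stepwise models by AIC; lower is better
# (illustrative check, output not shown)
AIC(lm_model, lm_stepwise)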

Linear regression model using ridge regression on the training set, with λ chosen by cross-validation:

# Ridge Regression
library(dplyr)
library(glmnet)
# Create model matrices and numeric response vectors (intercept column dropped)
x <- model.matrix(V104 ~ . - V105, Residen)[, -1]
y <- Residen %>%
  dplyr::select(V104) %>%
  unlist() %>%
  as.numeric()

x_train <- model.matrix(V104 ~ . - V105, train)[, -1]
y_train <- train %>%
  dplyr::select(V104) %>%
  unlist() %>%
  as.numeric()

x_test <- model.matrix(V104 ~ . - V105, test)[, -1]
y_test <- test %>%
  dplyr::select(V104) %>%
  unlist() %>%
  as.numeric()

# Grid of 100 lambda values from 10^10 down to 10^-2
grid <- 10^seq(10, -2, length = 100)
plot(grid)

ridge_mod <- glmnet(x_train, y_train, alpha = 0, lambda = grid, thresh = 1e-12)
## Warning: from glmnet Fortran code (error code -78); Convergence for 78th lambda
## value not reached after maxit=100000 iterations; solutions for larger lambdas
## returned
dim(coef(ridge_mod))
## [1] 108  77
plot(ridge_mod) 

cv.out <- cv.glmnet(x_train, y_train, alpha = 0) # Cross-validate ridge regression on the training data
bestlam <- cv.out$lambda.min                     # Select lambda that minimises the cross-validated MSE
c("Best Lambda for Ridge Regression" = bestlam)
## Best Lambda for Ridge Regression 
##                         111.7434
plot(cv.out) # Plot cross-validated MSE as a function of lambda

log(bestlam)
## [1] 4.716205
ridge_pred <- predict(ridge_mod, s = bestlam, newx = x_test) # Use best lambda to predict test data
rr_rmse <- sqrt(mean((y_test - ridge_pred)^2))
c(ridge_RMSE = rr_rmse)
## ridge_RMSE 
##   275.9933
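As an aside (not part of the original analysis), cv.glmnet() also reports lambda.1se, the largest λ whose cross-validated error is within one standard error of the minimum; it gives a more strongly shrunk model that can be evaluated the same way:

# Predict with the more conservative lambda.1se instead of lambda.min
# (illustrative; output not shown)
ridge_pred_1se <- predict(ridge_mod, s = cv.out$lambda.1se, newx = x_test)
sqrt(mean((y_test - ridge_pred_1se)^2))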

Linear regression model using lasso on the training set, with λ chosen by cross-validation; the test RMSE is reported along with the number of non-zero coefficient estimates:

# Lasso

lasso_mod <- glmnet(x_train, y_train, alpha = 1, lambda = grid) # Fit lasso model on the training data
plot(lasso_mod)                                                 # Plot the coefficient paths
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values

cv.out <- cv.glmnet(x_train, y_train, alpha = 1)                # Cross-validate the lasso on the training data
plot(cv.out)

bestlam <- cv.out$lambda.min  # Select lambda that minimises the cross-validated MSE
c("Best Lambda for lasso" = bestlam)
## Best Lambda for lasso 
##               2.64216
lasso_pred <- predict(lasso_mod, s = bestlam, newx = x_test) # Use best lambda to predict the test data
lo_rmse <- sqrt(mean((y_test - lasso_pred)^2))
c(lasso_RMSE = lo_rmse)
## lasso_RMSE 
##    170.499
out <- glmnet(x, y, alpha = 1, lambda = grid) # Fit lasso model on the full data set
lasso_coeff <- predict(out, type = "coefficients", s = bestlam)[1:108, ] # Coefficients at the CV-chosen lambda
# lasso_coeff
lasso_coeff[lasso_coeff != 0] # Display only the non-zero coefficients
##          (Intercept)      `START QUARTER` `COMPLETION QUARTER` 
##        -3.740964e+02        -9.003399e-02         1.519723e+01 
##                   V1                   V2                   V3 
##        -4.888822e+00         1.807806e-02        -5.880665e-02 
##                   V4                   V5                   V6 
##         2.527186e-02        -1.058666e-01         5.577437e-05 
##                   V7                   V8                   V9 
##         3.603639e+01         1.174640e+00         7.407918e-03 
##                  V16                  V17                  V21 
##         9.596553e-02         3.315491e-03         3.354576e-03 
##                  V27                  V31                  V35 
##         2.651942e-06         1.127268e+00         1.375261e-01 
##                  V36                  V37                  V44 
##         1.757573e-03         4.965824e-01         2.241692e-02 
##                  V47                  V52                  V54 
##         2.621094e-03        -4.431157e-03         1.912921e-02 
##                  V55                  V66                  V69 
##         4.909184e-03         4.885161e-03        -1.412788e-02 
##                  V71                  V72                  V74 
##        -7.024252e-03        -5.882857e+00         5.038468e-03 
##                  V78                  V83                  V85 
##         2.162572e-05        -2.047193e-05         5.780373e-03 
##                  V92                  V94                  V97 
##         1.047882e-03         1.515982e+01         4.927192e-03
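Counting the entries above gives 36 non-zero estimates, i.e. 35 non-zero coefficient estimates excluding the intercept. A direct count:

# Number of non-zero coefficient estimates, excluding the intercept
sum(lasso_coeff != 0) - 1
## [1] 35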

Test root mean square error (RMSE) for the four models:

rmse_t <- matrix(c(lm_rmse, lm_step_rmse, rr_rmse, lo_rmse))
rownames(rmse_t) <- c("Linear Regression", "Linear Regression (Stepwise)", "Ridge Regression", "LASSO")
colnames(rmse_t) <- c("Test RMSE")
as.table(rmse_t)
##                              Test RMSE
## Linear Regression             988.0640
## Linear Regression (Stepwise)  179.9235
## Ridge Regression              275.9933
## LASSO                         170.4990

Based on the test metrics, the LASSO model achieved the best performance with a test RMSE of 170.4990. Stepwise linear regression also performed well, with a test RMSE of 179.9235.

# R-squared of the LASSO model on the test set
rss <- sum((lasso_pred - y_test)^2)   # Residual sum of squares
tss <- sum((y_test - mean(y_test))^2) # Total sum of squares
rsq <- 1 - rss / tss
c("R-squared for test set in LASSO model" = rsq)
## R-squared for test set in LASSO model 
##                             0.9852549

This means that 98.53% of the variation in V104 in the test set can be explained by the predictor variables. The full linear regression performed poorly while the other three models performed much better; the most likely reason is the multicollinearity present among the predictor variables.
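
A minimal sketch of how that multicollinearity claim could be checked (illustrative, not part of the original analysis; output not shown): the condition number of the training design matrix becomes very large when columns are nearly linearly dependent.

# Condition number of the training design matrix; very large values
# signal severe multicollinearity (illustrative check, output not shown)
kappa(x_train)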