# Setting seed for reproducibility
set.seed(4765)

Split the data set into a training set and a test set.

# Working Directory
# setwd("/")

# Clear Environment
# rm(list=ls()) 

# Load Environment from the given .RData file
load("Residen.RData")

# NA visualization - No NAs found
visdat::vis_dat(Residen)

# Random train/test split at an 80/20 ratio
row.number <- sample(1:nrow(Residen), 0.8 * nrow(Residen))
train <- Residen[row.number, ]
test <- Residen[-row.number, ]

# Dimensions for Train/Test Data
dim(train)
## [1] 297 109
dim(test)
## [1]  75 109

Fit a linear regression model on the training set to explain the "actual sales price" (V104) in terms of the other variables, excluding the variable "actual construction costs" (V105).

# Fit a linear regression model on the training data, excluding variable V105
lm_model <- lm(V104 ~ . - V105, data = train)
summary(lm_model)
## 
## Call:
## lm(formula = V104 ~ . - V105, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -771.57  -44.11   -1.96   43.07  609.42 
## 
## Coefficients: (34 not defined because of singularities)
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           2.650e+04  9.427e+04   0.281 0.778849    
## `START YEAR`         -6.724e+02  1.490e+03  -0.451 0.652265    
## `START QUARTER`       2.396e+03  3.809e+03   0.629 0.529977    
## `COMPLETION YEAR`     1.841e+02  1.853e+01   9.937  < 2e-16 ***
## `COMPLETION QUARTER`  6.177e+01  9.978e+00   6.191 2.84e-09 ***
## V1                   -2.958e+00  2.601e+00  -1.137 0.256656    
## V2                    9.140e-02  2.698e-02   3.387 0.000835 ***
## V3                   -2.871e-01  7.693e-02  -3.732 0.000241 ***
## V4                   -5.338e-02  4.008e-02  -1.332 0.184258    
## V5                    3.127e-01  3.786e-01   0.826 0.409643    
## V6                   -1.672e-02  6.615e-02  -0.253 0.800650    
## V7                           NA         NA      NA       NA    
## V8                    1.186e+00  1.897e-02  62.491  < 2e-16 ***
## V9                   -4.007e-01  3.321e-01  -1.207 0.228895    
## V10                   5.817e+02  8.406e+02   0.692 0.489701    
## V11                   1.414e+02  2.891e+02   0.489 0.625218    
## V12                  -1.056e+02  1.398e+02  -0.756 0.450589    
## V13                  -2.322e-02  1.682e-02  -1.381 0.168711    
## V14                  -3.036e-01  7.375e-01  -0.412 0.680976    
## V15                  -1.634e+01  5.228e+01  -0.313 0.754893    
## V16                   4.597e+00  8.029e+00   0.573 0.567494    
## V17                   1.035e-01  8.125e-02   1.274 0.203890    
## V18                  -3.833e+02  3.840e+02  -0.998 0.319214    
## V19                   4.210e+00  3.701e+00   1.137 0.256569    
## V20                  -1.195e+00  1.964e+00  -0.609 0.543254    
## V21                   2.670e-01  2.933e-01   0.910 0.363693    
## V22                  -1.537e+00  1.458e+00  -1.054 0.293188    
## V23                   2.620e+02  2.174e+02   1.205 0.229391    
## V24                  -8.067e+02  8.678e+02  -0.930 0.353591    
## V25                  -3.900e-01  6.348e-01  -0.614 0.539567    
## V26                   2.158e-01  3.823e-01   0.564 0.572990    
## V27                   4.468e-03  3.771e-03   1.185 0.237317    
## V28                   2.393e-01  1.495e-01   1.600 0.110927    
## V29                   9.651e+01  3.583e+02   0.269 0.787922    
## V30                   7.407e+01  6.010e+01   1.232 0.219081    
## V31                  -4.847e+02  3.788e+02  -1.279 0.202066    
## V32                   8.735e-02  8.382e-02   1.042 0.298513    
## V33                   5.575e-01  8.843e-01   0.630 0.529088    
## V34                  -1.359e+02  1.477e+02  -0.920 0.358492    
## V35                  -2.145e-02  4.029e+00  -0.005 0.995758    
## V36                  -2.374e-01  2.906e-01  -0.817 0.414756    
## V37                   6.568e+02  4.550e+02   1.443 0.150314    
## V38                   5.177e+00  3.935e+00   1.316 0.189590    
## V39                  -3.132e+00  3.527e+00  -0.888 0.375499    
## V40                  -8.322e-01  9.094e-01  -0.915 0.361121    
## V41                  -7.209e-01  9.953e-01  -0.724 0.469632    
## V42                   6.733e+01  2.279e+02   0.295 0.767902    
## V43                   7.739e+02  7.874e+02   0.983 0.326775    
## V44                  -2.605e-01  4.986e-01  -0.522 0.601868    
## V45                   3.361e-01  5.078e-01   0.662 0.508758    
## V46                   5.905e-03  6.011e-03   0.982 0.327002    
## V47                  -3.114e-02  3.361e-01  -0.093 0.926269    
## V48                  -1.075e+03  9.981e+02  -1.077 0.282495    
## V49                  -8.348e+01  7.521e+01  -1.110 0.268195    
## V50                   1.192e+03  1.760e+03   0.677 0.498812    
## V51                  -1.970e-02  6.288e-02  -0.313 0.754315    
## V52                   5.028e-01  4.148e-01   1.212 0.226695    
## V53                   8.447e+01  5.892e+01   1.433 0.153123    
## V54                   1.291e+01  1.176e+01   1.098 0.273582    
## V55                  -1.663e-01  2.653e-01  -0.627 0.531355    
## V56                  -1.107e+03  1.273e+03  -0.870 0.385289    
## V57                   4.766e+00  4.227e+00   1.128 0.260677    
## V58                  -4.529e+00  3.881e+00  -1.167 0.244486    
## V59                  -4.639e-01  3.026e-01  -1.533 0.126600    
## V60                  -1.156e-04  2.034e-01  -0.001 0.999547    
## V61                   2.342e+01  1.960e+02   0.120 0.904974    
## V62                   1.897e+01  3.830e+02   0.050 0.960532    
## V63                   1.965e-02  1.709e-01   0.115 0.908573    
## V64                   1.572e-01  3.227e-01   0.487 0.626623    
## V65                  -5.320e-03  3.429e-03  -1.552 0.122192    
## V66                  -7.414e-01  7.363e-01  -1.007 0.315061    
## V67                   2.182e+02  2.282e+02   0.956 0.339999    
## V68                   1.643e+02  1.454e+02   1.130 0.259564    
## V69                  -6.309e+02  9.486e+02  -0.665 0.506653    
## V70                  -8.058e-02  7.074e-02  -1.139 0.255848    
## V71                          NA         NA      NA       NA    
## V72                          NA         NA      NA       NA    
## V73                          NA         NA      NA       NA    
## V74                          NA         NA      NA       NA    
## V75                          NA         NA      NA       NA    
## V76                          NA         NA      NA       NA    
## V77                          NA         NA      NA       NA    
## V78                          NA         NA      NA       NA    
## V79                          NA         NA      NA       NA    
## V80                          NA         NA      NA       NA    
## V81                          NA         NA      NA       NA    
## V82                          NA         NA      NA       NA    
## V83                          NA         NA      NA       NA    
## V84                          NA         NA      NA       NA    
## V85                          NA         NA      NA       NA    
## V86                          NA         NA      NA       NA    
## V87                          NA         NA      NA       NA    
## V88                          NA         NA      NA       NA    
## V89                          NA         NA      NA       NA    
## V90                          NA         NA      NA       NA    
## V91                          NA         NA      NA       NA    
## V92                          NA         NA      NA       NA    
## V93                          NA         NA      NA       NA    
## V94                          NA         NA      NA       NA    
## V95                          NA         NA      NA       NA    
## V96                          NA         NA      NA       NA    
## V97                          NA         NA      NA       NA    
## V98                          NA         NA      NA       NA    
## V99                          NA         NA      NA       NA    
## V100                         NA         NA      NA       NA    
## V101                         NA         NA      NA       NA    
## V102                         NA         NA      NA       NA    
## V103                         NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 142.6 on 223 degrees of freedom
## Multiple R-squared:  0.9884, Adjusted R-squared:  0.9846 
## F-statistic:   261 on 73 and 223 DF,  p-value: < 2.2e-16
# Diagnostic plots for the fitted model
par(mfrow = c(2, 2))
plot(lm_model)
## Warning: not plotting observations with leverage one:
##   20, 23, 35, 46, 53, 84, 156, 267, 272
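
Observations with leverage exactly one are fitted perfectly and carry no residual information, which is why plot.lm skips them; they can be listed directly (a minimal sketch using the fitted model):

# Indices of observations whose hat value is numerically one
which(hatvalues(lm_model) > 1 - 1e-8)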

# Predict actual sales prices on the test set with the fitted linear model
lm_Predicted <- predict(lm_model, test)
## Warning in predict.lm(lm_model, test): prediction from a rank-deficient fit may
## be misleading
# Calculating the test RMSE of the linear regression model
lm_rmse <- sqrt(mean((test$V104 - lm_Predicted)^2))
c("Linear Regression Model RMSE" = lm_rmse)
## Linear Regression Model RMSE 
##                      988.064
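
The rank-deficiency warning reflects the 34 coefficients reported as NA in the summary: those predictors are exact linear combinations of others in the training data. A quick check (a sketch using the fitted model):

# Names of the predictors dropped because of singularities
names(coef(lm_model))[is.na(coef(lm_model))]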

Fit a linear regression model using stepwise selection on the training set.

library(MASS)  # provides stepAIC()

# Null model (intercept only)
null_model <- lm(V104 ~ 1, data = train)
# summary(null_model)

# Upper model (all predictors, including V105)
upper_model <- lm(V104 ~ ., data = train)
# summary(upper_model)

# Linear regression with stepwise selection in both directions
lm_stepwise <- stepAIC(null_model, direction = "both", trace = FALSE,
                       scope = list(upper = upper_model, lower = null_model))
summary(lm_stepwise)
## 
## Call:
## lm(formula = V104 ~ V8 + V7 + V55 + V105 + V5 + V72 + V99 + V19 + 
##     V84 + V65 + V86 + `COMPLETION QUARTER` + V1 + V23 + V35 + 
##     V41 + V68, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -822.99  -47.88   -0.47   41.95  644.01 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -7.719e+01  4.945e+01  -1.561 0.119674    
## V8                    1.185e+00  1.522e-02  77.838  < 2e-16 ***
## V7                    1.801e+01  5.687e+00   3.166 0.001717 ** 
## V55                   2.372e-03  9.168e-04   2.587 0.010191 *  
## V105                  1.742e+00  2.859e-01   6.093 3.67e-09 ***
## V5                   -2.107e+00  4.284e-01  -4.919 1.49e-06 ***
## V72                  -1.307e+01  1.151e+00 -11.355  < 2e-16 ***
## V99                  -1.177e+01  4.735e+00  -2.485 0.013557 *  
## V19                  -4.041e-01  9.710e-02  -4.161 4.22e-05 ***
## V84                   5.106e-04  1.371e-04   3.725 0.000236 ***
## V65                  -5.665e-04  1.811e-04  -3.128 0.001944 ** 
## V86                   9.761e+00  2.997e+00   3.257 0.001265 ** 
## `COMPLETION QUARTER`  1.249e+01  6.910e+00   1.807 0.071767 .  
## V1                   -3.531e+00  1.983e+00  -1.781 0.076074 .  
## V23                   2.815e+01  4.293e+00   6.558 2.64e-10 ***
## V35                  -1.822e-01  9.929e-02  -1.835 0.067594 .  
## V41                  -3.688e-02  1.035e-02  -3.564 0.000430 ***
## V68                  -4.913e+00  2.847e+00  -1.726 0.085498 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 129.6 on 279 degrees of freedom
## Multiple R-squared:  0.988,  Adjusted R-squared:  0.9873 
## F-statistic:  1355 on 17 and 279 DF,  p-value: < 2.2e-16
# Prediction on the test set
lm_stepwise_pred <- predict(lm_stepwise, test)

# Calculating the test RMSE of the stepwise model
lm_step_rmse <- sqrt(mean((test$V104 - lm_stepwise_pred)^2))
c(Step_RMSE = lm_step_rmse)
## Step_RMSE 
##  179.9235
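
As a sanity check, the stepwise fit can be compared with the full model on in-sample AIC, the criterion stepAIC minimizes (a minimal sketch):

# Lower AIC indicates the preferred model under this criterion
AIC(lm_model, lm_stepwise)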

Fit a ridge regression model on the training set, with λ chosen by cross-validation.

# Ridge Regression
library(dplyr)   # for select() and the pipe
library(glmnet)  # for ridge and lasso regression
# Create model matrices and numeric response vectors for glmnet
x = model.matrix(V104~. -V105,Residen)[,-1]
y = Residen %>%
  dplyr::select(V104) %>%
  unlist() %>%
  as.numeric()

x_train = model.matrix(V104~. -V105,train)[,-1]
y_train = train %>%
  dplyr::select(V104) %>%
  unlist() %>%
  as.numeric()

x_test = model.matrix(V104~. -V105,test)[,-1]
y_test = test %>%
  dplyr::select(V104) %>%
  unlist() %>%
  as.numeric()

# Grid of 100 lambda values from 10^10 down to 10^-2
grid <- 10^seq(10, -2, length = 100)
plot(grid)  # visual check of the log-spaced grid

# Fit ridge regression (alpha = 0) over the full lambda grid
ridge_mod <- glmnet(x_train, y_train, alpha = 0, lambda = grid, thresh = 1e-12)
## Warning: from glmnet Fortran code (error code -78); Convergence for 78th lambda
## value not reached after maxit=100000 iterations; solutions for larger lambdas
## returned
dim(coef(ridge_mod))  # 108 coefficients x 77 lambdas (the fit stopped at the 78th; see warning)
## [1] 108  77
plot(ridge_mod)  # coefficient paths across the lambda grid

# Cross-validate ridge regression on the training data
cv.out <- cv.glmnet(x_train, y_train, alpha = 0)
bestlam <- cv.out$lambda.min  # select the lambda that minimizes the cross-validated MSE
c("Best Lambda for Ridge Regression" = bestlam)
## Best Lambda for Ridge Regression 
##                         111.7434
plot(cv.out)  # CV error as a function of log(lambda)

log(bestlam)
## [1] 4.716205
ridge_pred <- predict(ridge_mod, s = bestlam, newx = x_test) # Use best lambda to predict test data
rr_rmse <- sqrt(mean((y_test - ridge_pred)^2))
c(ridge_RMSE = rr_rmse)
## ridge_RMSE 
##   275.9933
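
Unlike the lasso fitted next, ridge shrinks coefficients toward zero without setting any of them exactly to zero; this can be verified from the fitted object (a sketch reusing bestlam from above):

# Count the non-zero ridge coefficients at the CV-chosen lambda;
# all 108 (intercept plus 107 predictors) should be retained
ridge_coeff <- predict(ridge_mod, type = "coefficients", s = bestlam)
sum(ridge_coeff != 0)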

Fit a lasso model on the training set, with λ chosen by cross-validation, and report the test RMSE along with the number of non-zero coefficient estimates.

# Lasso

# Fit the lasso (alpha = 1) on the training data over the lambda grid
lasso_mod <- glmnet(x_train, y_train, alpha = 1, lambda = grid)
plot(lasso_mod)  # coefficient paths
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values

# Cross-validate the lasso on the training data
cv.out <- cv.glmnet(x_train, y_train, alpha = 1)
plot(cv.out)  # CV error as a function of log(lambda)

bestlam <- cv.out$lambda.min  # select the lambda that minimizes the cross-validated MSE
c("Best Lambda for lasso" = bestlam)
## Best Lambda for lasso 
##               2.64216
lasso_pred <- predict(lasso_mod, s = bestlam, newx = x_test)  # use the best lambda to predict test data
lo_rmse <- sqrt(mean((y_test - lasso_pred)^2))
c(lasso_RMSE = lo_rmse)
## lasso_RMSE 
##    170.499
out <- glmnet(x, y, alpha = 1, lambda = grid)  # refit the lasso on the full dataset
lasso_coeff <- predict(out, type = "coefficients", s = bestlam)[1:108, ]  # coefficients at the CV-chosen lambda
# lasso_coeff
lasso_coeff[lasso_coeff != 0]  # display only the non-zero coefficients
##          (Intercept)      `START QUARTER` `COMPLETION QUARTER` 
##        -3.740964e+02        -9.003399e-02         1.519723e+01 
##                   V1                   V2                   V3 
##        -4.888822e+00         1.807806e-02        -5.880665e-02 
##                   V4                   V5                   V6 
##         2.527186e-02        -1.058666e-01         5.577437e-05 
##                   V7                   V8                   V9 
##         3.603639e+01         1.174640e+00         7.407918e-03 
##                  V16                  V17                  V21 
##         9.596553e-02         3.315491e-03         3.354576e-03 
##                  V27                  V31                  V35 
##         2.651942e-06         1.127268e+00         1.375261e-01 
##                  V36                  V37                  V44 
##         1.757573e-03         4.965824e-01         2.241692e-02 
##                  V47                  V52                  V54 
##         2.621094e-03        -4.431157e-03         1.912921e-02 
##                  V55                  V66                  V69 
##         4.909184e-03         4.885161e-03        -1.412788e-02 
##                  V71                  V72                  V74 
##        -7.024252e-03        -5.882857e+00         5.038468e-03 
##                  V78                  V83                  V85 
##         2.162572e-05        -2.047193e-05         5.780373e-03 
##                  V92                  V94                  V97 
##         1.047882e-03         1.515982e+01         4.927192e-03
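
The number of non-zero coefficient estimates can be counted directly (a minimal sketch); from the list above, the lasso retains 35 predictors plus the intercept.

# Number of non-zero coefficient estimates, excluding the intercept
sum(lasso_coeff != 0) - 1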

Compare the test root mean square error (RMSE) across the four models.

rmse_t <- matrix(c(lm_rmse, lm_step_rmse, rr_rmse, lo_rmse))
rownames(rmse_t) <- c("Linear Regression", "Linear Regression (Stepwise)", "Ridge Regression", "LASSO")
colnames(rmse_t) <- c("Test RMSE")
as.table(rmse_t)
##                              Test RMSE
## Linear Regression             988.0640
## Linear Regression (Stepwise)  179.9235
## Ridge Regression              275.9933
## LASSO                         170.4990

Comparing the test metrics, LASSO performed best with a test RMSE of 170.4990, followed closely by stepwise linear regression at 179.9235.

# R-squared of the LASSO model on the test set
rss <- sum((lasso_pred - y_test)^2)    # residual sum of squares
tss <- sum((y_test - mean(y_test))^2)  # total sum of squares
rsq <- 1 - rss / tss
c("R-squared for test set in LASSO model" = rsq)
## R-squared for test set in LASSO model 
##                             0.9852549
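
The same decomposition extends to the other models; a minimal sketch using a small helper (r2, a name introduced here) and the test-set predictions computed above:

# Test-set R-squared = 1 - RSS/TSS for each model's predictions
r2 <- function(pred) 1 - sum((y_test - pred)^2) / sum((y_test - mean(y_test))^2)
c("Linear Regression" = r2(lm_Predicted),
  "Stepwise"          = r2(lm_stepwise_pred),
  "Ridge"             = r2(as.numeric(ridge_pred)),
  "LASSO"             = r2(as.numeric(lasso_pred)))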