Sei sulla pagina 1di 10

Advanced Statistics Project – Problem 2 -

Leslie Salt Data Set


Q1. What is the nature of each of the variables? Which variable is dependent variable and what are
the independent variables in the model? - Price is the dependent variable and all other variables
are independent.

Q2. Check whether the variables require any transformation individually - The independent
variables Flood and County are stored as 0/1 numbers but are categorical, so they should be
factor variables rather than numeric. Both were converted to factors in the source code below.

Q3. Set up a regression equation, run the model and discuss your results –

Price of the property if sold in next 3 months:

1st month - $17688.22/acre

2nd month - $17819.05/acre

3rd month - $17949.89/acre

Source Code:

## Leslie Salt Data Set

## Load the dataset and analyse its structure

library(readxl)

# The workbook path is relative to the current working directory.
LSdata <- read_excel("Dataset_LeslieSalt.xlsx")

str(LSdata)

summary(LSdata)

# Nature of variables - Price is the dependent variable and all other
# variables are independent.

# Transformation - the independent variables Flood and County should be
# factor variables and not integer.

# Recode the 0/1 indicator columns County and Flood as labelled factors

LSdata[["County"]] <- factor(
  LSdata[["County"]],
  levels = c("0", "1"),
  labels = c("San Mateo", "Santa Clara")
)

LSdata[["Flood"]] <- factor(
  LSdata[["Flood"]],
  levels = c("0", "1"),
  labels = c("No", "Yes")
)

# Re-inspect the data now that both columns are factors
str(LSdata)

summary(LSdata)

# Count the missing (NA) values in every column
vapply(LSdata, function(column) sum(is.na(column)), integer(1))

# Draw a boxplot of Price to look for outliers

boxplot(LSdata[["Price"]])

# Drop observation 26, the row treated as the Price outlier

LSdata <- LSdata[-26, ]

# Redraw the boxplot on the remaining observations

boxplot(LSdata[["Price"]])

# Check the correlation between the numeric variables

# corrplot has to be attached before corrplot() is called; loading it after
# the call fails with 'could not find function "corrplot"'.
library(corrplot)

# cor() needs numeric input, so keep only the numeric columns (this drops
# factor columns such as County and Flood) and convert them to a matrix.
LSmatrix <- as.matrix(LSdata[vapply(LSdata, is.numeric, logical(1))])

corrplot(
  cor(LSmatrix),
  method = "circle",
  type = "full",
  order = "hclust",
  tl.col = "black"
)

# Correlation observations:

## Price has a positive correlation with Elevation and Date.

## Price has a negative correlation with Sewer.

## Price has negligible correlation with Size and Distance.


# First model with all independent variables

LSdatamodel1 <- lm(Price ~ ., data = LSdata)

summary(LSdatamodel1)

## As the overall p-value is very small, this model is a valid one.

## When analysing the coefficient p-values, it is observed that the variables
## County, Size, Sewer and Distance have a high p-value. Therefore, we will
## ignore these variables in the next regression model.

# Second model without the County, Size, Sewer and Distance variables

LSdatamodel2 <- lm(Price ~ . - County - Size - Sewer - Distance, data = LSdata)

summary(LSdatamodel2)

## As the overall p-value is very small, this model is a valid one.

## However, the R-squared value has decreased compared to the previous model,
## therefore this model is rejected.
## NOTE(review): multiple R-squared cannot increase when predictors are
## dropped, so adjusted R-squared is the fairer basis for this comparison.

## Distance and Size variables will be removed from the model as they are
## correlated as per the corrplot, and this creates a problem of
## multicollinearity.

# Third model without the Size and Distance variables

LSdatamodel3 <- lm(Price ~ . - Distance - Size, data = LSdata)

summary(LSdatamodel3)

## As the overall p-value is very small, this model is a valid one. Also, the
## R-squared value is higher than in the second model.

## Hence we will go with this model.

# Predicting the price of the Leslie Salt property

## Inputs: County = Santa Clara, Size = 246.8, Elevation = 0 (property at sea
## level), Sewer = 0 (no data provided), Flood = "No" (property diked),
## Distance = 0 (as distance is relative to the Leslie Salt property).
## Date = 1, 2, 3 gives one row per month, assuming the property is sold
## within the next 3 months.
## Price = 0 is only a placeholder; predict() does not read the response.

LeslieProperty <- data.frame(
  Price = 0,
  County = "Santa Clara",
  Size = 246.8,
  Elevation = 0,
  Sewer = 0,
  Date = c(1, 2, 3),
  Flood = "No",
  Distance = 0
)

## Scale the predictions by 1000 - presumably Price is recorded in thousands
## of dollars per acre, giving dollars per acre; confirm against the data.
LeslieProperty$PredictedPrice <- predict(LSdatamodel3, LeslieProperty) * 1000

LeslieProperty$PredictedPrice

=================================Output:

> getwd()
[1] "E:/Analytics/R/Advance Statistics Project"
> ##Leslie Salt Data Set
> ##Load and analyse the structure of the dataset
> LSdata <- read_excel("Dataset_LeslieSalt.xlsx")
Error in read_excel("Dataset_LeslieSalt.xlsx") :
could not find function "read_excel"
> ##Leslie Salt Data Set
> ##Load and analyse the structure of the dataset
> library(readxl)
> LSdata <- read_excel("Dataset_LeslieSalt.xlsx")
> str(LSdata)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 31 obs. of 8 variables:
$ Price : num 4.5 10.6 1.7 5 5 3.3 5.7 6.2 19.4 3.2 ...
$ County : num 1 1 0 0 0 1 1 1 1 1 ...
$ Size : num 138.4 52 16.1 1695.2 845 ...
$ Elevation: num 10 4 0 1 1 2 4 4 20 0 ...
$ Sewer : num 3000 0 2640 3500 1000 10000 0 0 1300 6000 ...
$ Date : num -103 -103 -98 -93 -92 -86 -68 -64 -63 -62 ...
$ Flood : num 0 0 1 0 1 0 0 0 0 0 ...
$ Distance : num 0.3 2.5 10.3 14 14 0 0 0 1.2 0 ...
> summary(LSdata)
Price County Size Elevation Sewer Date Flood
Min. : 1.70 Min. :0.0000 Min. : 6.90 Min. : 0.000 Min. : 0 Min. :-103.00 Min.
:0.0000
1st Qu.: 5.35 1st Qu.:0.0000 1st Qu.: 20.35 1st Qu.: 2.000 1st Qu.: 0 1st Qu.: -63.50 1st
Qu.:0.0000
Median :11.70 Median :1.0000 Median : 51.40 Median : 4.000 Median : 900 Median : -
59.00 Median :0.0000
Mean :11.95 Mean :0.6129 Mean : 139.97 Mean : 4.645 Mean : 1981 Mean : -
58.65 Mean :0.1613
3rd Qu.:16.05 3rd Qu.:1.0000 3rd Qu.: 104.10 3rd Qu.: 7.000 3rd Qu.: 3450 3rd Qu.: -
51.00 3rd Qu.:0.0000
Max. :37.20 Max. :1.0000 Max. :1695.20 Max. :20.000 Max. :10000 Max. : -4.00
Max. :1.0000
Distance
Min. : 0.000
1st Qu.: 0.850
Median : 4.900
Mean : 5.132
3rd Qu.: 5.500
Max. :16.500
> # Verify the data for Null Values
> sapply(LSdata,function(x){sum(is.na(x))})
Price County Size Elevation Sewer Date Flood Distance
0 0 0 0 0 0 0 0
> #Nature of Variables - Price is the dependent variable and all other variables are independent.
> #Transformation - The independent variables Flood and County should be factor variables and
not integer.
> LSdata$County <- factor(LSdata$County,
+ levels=c("0","1"),
+ labels=c("San Mateo", "Santa Clara"))
> LSdata$Flood <- factor(LSdata$Flood,
+ levels=c("0","1"),
+ labels=c("No", "Yes" ))
> str(LSdata)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 31 obs. of 8 variables:
$ Price : num 4.5 10.6 1.7 5 5 3.3 5.7 6.2 19.4 3.2 ...
$ County : Factor w/ 2 levels "San Mateo","Santa Clara": 2 2 1 1 1 2 2 2 2 2 ...
$ Size : num 138.4 52 16.1 1695.2 845 ...
$ Elevation: num 10 4 0 1 1 2 4 4 20 0 ...
$ Sewer : num 3000 0 2640 3500 1000 10000 0 0 1300 6000 ...
$ Date : num -103 -103 -98 -93 -92 -86 -68 -64 -63 -62 ...
$ Flood : Factor w/ 2 levels "No","Yes": 1 1 2 1 2 1 1 1 1 1 ...
$ Distance : num 0.3 2.5 10.3 14 14 0 0 0 1.2 0 ...
> summary(LSdata)
Price County Size Elevation Sewer Date Flood
Min. : 1.70 San Mateo :12 Min. : 6.90 Min. : 0.000 Min. : 0 Min. :-103.00 No :26
1st Qu.: 5.35 Santa Clara:19 1st Qu.: 20.35 1st Qu.: 2.000 1st Qu.: 0 1st Qu.: -63.50
Yes: 5
Median :11.70 Median : 51.40 Median : 4.000 Median : 900 Median : -59.00
Mean :11.95 Mean : 139.97 Mean : 4.645 Mean : 1981 Mean : -58.65
3rd Qu.:16.05 3rd Qu.: 104.10 3rd Qu.: 7.000 3rd Qu.: 3450 3rd Qu.: -51.00
Max. :37.20 Max. :1695.20 Max. :20.000 Max. :10000 Max. : -4.00
Distance
Min. : 0.000
1st Qu.: 0.850
Median : 4.900
Mean : 5.132
3rd Qu.: 5.500
Max. :16.500
> #Analyse Price using plots for identifying outliers and correlations
> boxplot(LeslieSaltData$Price)
Error in boxplot(LeslieSaltData$Price) :
object 'LeslieSaltData' not found
> #Analyse Price using plots for identifying outliers and correlations
> boxplot(LSdata$Price)
> LSData <- LSData[-26,]
Error: object 'LSData' not found
> LSdata = LSdata[-26,]
> boxplot(LSdata$Price)
> #Checking the corelation
> LSmatrix <- as.matrix(dplyr::select_if(LSdata, is.numeric))
> corrplot(cor(LSmatrix), method = "circle",
+ type="full",
+ order = "hclust",
+ tl.col = "black")
Error in corrplot(cor(LSmatrix), method = "circle", type = "full", order = "hclust", :
could not find function "corrplot"
> library(corrplot)
corrplot 0.84 loaded
> corrplot(cor(LSmatrix), method = "circle",
+ type="full",
+ order = "hclust",
+ tl.col = "black")
> #Corelation observations:
> ##Price has a positive correlation with Elevation and Date.
> ##Price has a negative correlation with Sewer.
> ##Price has negligible correlation with Size and Distance.
> #First Model with all independent variables
> LSdatamodel1 = lm(Price ~., data = LSdata)
> summary(LSdatamodel1)

Call:
lm(formula = Price ~ ., data = LSdata)

Residuals:
Min 1Q Median 3Q Max
-3.7059 -2.6043 -0.3876 2.2315 4.7774

Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 18.6267827 2.9067195 6.408 1.9e-06 ***
CountySanta Clara -2.6365930 2.8842949 -0.914 0.37056
Size -0.0034320 0.0025420 -1.350 0.19070
Elevation 0.5407713 0.1693998 3.192 0.00421 **
Sewer -0.0005078 0.0003100 -1.638 0.11563
Date 0.1279277 0.0356334 3.590 0.00163 **
FloodYes -7.8400025 2.2885764 -3.426 0.00242 **
Distance 0.4097406 0.2453188 1.670 0.10904
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 3.145 on 22 degrees of freedom


Multiple R-squared: 0.8069, Adjusted R-squared: 0.7454
F-statistic: 13.13 on 7 and 22 DF, p-value: 1.493e-06

> ##As p-value is very less, this model is a valid one.


> ##When analyzing the p-values, it is observed that the variables County, Size, Sewer and
Distance
> ##have a high p-value. Therefore, we will ignore these variables in next regression model.
> #second Model without County, Size, Sewer and Distance variables
> LSdatamodel2 = lm(Price ~.-County-Size-Sewer-Distance, data = LSdata)
> summary(LSdatamodel2)

Call:
lm(formula = Price ~ . - County - Size - Sewer - Distance, data = LSdata)

Residuals:
Min 1Q Median 3Q Max
-5.5172 -2.8233 -0.2048 2.6765 6.6460

Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 19.2331 2.0181 9.530 5.72e-10 ***
Elevation 0.5477 0.1698 3.226 0.00338 **
Date 0.1696 0.0283 5.994 2.50e-06 ***
FloodYes -3.6172 1.9813 -1.826 0.07941 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.752 on 26 degrees of freedom
Multiple R-squared: 0.6751, Adjusted R-squared: 0.6376
F-statistic: 18.01 on 3 and 26 DF, p-value: 1.57e-06

> ##As p-value is very less, this model is a valid one.


> ##However, R-squared value has decreased compared to the previous model, therefore this
model is rejected.
> ##Distance and Size variables will be removed from the model as they are correlated as per
the corrplot
> ##and this creates a problem of Multicollinearity.
> #Third Model without Size and Distance variables.
> LSdatamodel3 = lm(Price ~.-Distance -Size, data = LSdata)
> summary(LSdatamodel3)

Call:
lm(formula = Price ~ . - Distance - Size, data = LSdata)

Residuals:
Min 1Q Median 3Q Max
-5.0186 -2.2651 -0.3114 2.1549 5.1596

Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 22.0187525 1.9634490 11.214 5.01e-11 ***
CountySanta Clara -4.4613706 1.8189990 -2.453 0.02183 *
Elevation 0.5086667 0.1726287 2.947 0.00704 **
Sewer -0.0006846 0.0002789 -2.455 0.02173 *
Date 0.1308357 0.0276699 4.728 8.28e-05 ***
FloodYes -7.6795702 2.1524916 -3.568 0.00156 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 3.252 on 24 degrees of freedom


Multiple R-squared: 0.7747, Adjusted R-squared: 0.7278
F-statistic: 16.51 on 5 and 24 DF, p-value: 4.372e-07

> ##As p-value is very less, this model is a valid one. Also the R square value is higher than
Secound Model.
> ##Hence we will go with this model.
> #Predicting the price of the Leslie Salt property
> ##County = Santa Clara, Size = 246.8, Elevation = 0 (property at sea level),Sewer = 0(no data
provided),
> ##Date = 6 (assuming property will be sold in next 6 months), Flood = 0 (property diked),
Distance = 0
> ##(as distance is relative to Leslie Salt property)
> leslie_salt = data.frame(0,"Santa Clara",246.8,0,0,6,"No",0)
> colnames(leslie_salt) = c("Price", "County", "Size", "Elevation", "Sewer", "Date", "Flood",
"Distance")
> data=rbind(data,leslie_salt)
Error in rep(xi, length.out = nvar) :
attempt to replicate an object of type 'closure'
> leslie_salt_price = predict(LSdatamodel3, newdata = data[32,])
Error in data[32, ] : object of type 'closure' is not subsettable
> leslie_salt_price
Error: object 'leslie_salt_price' not found
> data = rbind(data,leslie_salt)
Error in rep(xi, length.out = nvar) :
attempt to replicate an object of type 'closure'
> LSdata = rbind(LSdata,leslie_salt)
> leslie_salt_price = predict(LSdatamodel3, newdata = data[32,])
Error in data[32, ] : object of type 'closure' is not subsettable
> leslie_salt_price = predict(LSdatamodel3, newdata = LSdata[32,])
> leslie_salt_price
1
NA
> leslie_salt_price
1
NA
> ##As p-value is very less, this model is a valid one. Also the R square value is higher than
Secound Model.
> ##Hence we will go with this model.
> #Predicting the price of the Leslie Salt property
> ##County = Santa Clara, Size = 246.8, Elevation = 0 (property at sea level),Sewer = 0(no data
provided),
> ##Date = 6 (assuming property will be sold in next 6 months), Flood = 0 (property diked),
Distance = 0
> ##(as distance is relative to Leslie Salt property)
> LeslieProperty = data.frame(Price = 0, County = "Santa Clara", Size = 246.8, Elevation = 0,
Sewer = 0, Date = c(1,2,3), Flood = "No" , Distance = 0)
> LeslieProperty$PredictedPrice <- predict(LSdatamodel3,LeslieProperty) * 1000
> LeslieProperty$PredictedPrice
[1] 17688.22 17819.05 17949.89

Potrebbero piacerti anche