-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHousingPrice Linear Regression.R
216 lines (158 loc) · 5.91 KB
/
HousingPrice Linear Regression.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
#Boston Pricing case study
setwd("D:\\")
##Loading Data
prices<-read.csv("boston_prices.csv",header=TRUE,stringsAsFactors=FALSE)
##Checking Data Characteristics
dim(prices)
str(prices)
head(prices)
names(prices)
#summary statistics
summary(prices)
#Missing values treatment
colSums(is.na(prices)) #MEDV has a lot of missing values
summary((prices$MEDV))
prices$MEDV[is.na(prices$MEDV)]<-mean(prices$MEDV,na.rm=TRUE)
#Outlier plots
par(mfrow=c(2,7)) #This allows you to plot 14 charts on a single page; It is optional.
list<-names(prices) #Store the names of the dataset in a list format
list<-list[-4]
for(i in 1:length(list)) #Plot the boxplots of all variables and shortlist which ones need outlier treatment.
{
boxplot(prices[,list[i]],main=list[i])
}
#Restore the par parameters to normal
dev.off()
#In this solution, We have replaced the outlier values by the median values
#You can decide to replace by max or mean values based on business objectives
#Outlier treatment
for(i in 1:length(list)) ##For loop to replace all the outlier values with the mean value ; if you want you can replace with median value as well.
{
x<-boxplot(prices[,list[i]])
out<-x$out
index<-which(prices[,list[i]] %in% x$out)
prices[index,list[i]]<-mean(prices[,list[i]])
rm(x)
rm(out)
}
#Exploratory analysis
library(ggplot2)
#Study the histogram of the DV and the transformed histogram
hist(prices$MEDV)
#hist(prices$log_MEDV) #Once you create the transformations;look down
#You can look at the correlation between each IDV and the DV
#An eg :
ggplot(prices,aes(x=MEDV,y=LSTAT)) +geom_point()
ggplot(prices,aes(x=MEDV,y=DIS)) +geom_point()
ggplot(prices,aes(x=MEDV,y=AGE)) +geom_point()
#Inorder to quicken the process, lets write a function :
#Below is a function that gives you the correlation values between all IDV's and the DV
#Simply taking a look at the output of this function, you can quickly shortlist
#Which all IDV's are correlated to the DV
#Function to get the list of correlations between : DV and the IDV's
list1<-list[-13]
for(i in 1:length(list1))
{
x<-cor(prices$MEDV,prices[list[i]])
print(x)
}
#Significant variables are : B LSTAT AGE X.rooms.dwelling nitric.oxides.concentration INDUS
#You can also try to use data transformations
#Log transformations
#Create the log transformation for all variables
prices$log_CRIM<-log(prices$CRIM)
prices$log_ZN<-log(prices$ZN)
prices$log_NOX<-log(prices$nitric.oxides.concentration)
prices$log_RM<-log(prices$X.rooms.dwelling)
prices$log_AGE<-log(prices$AGE)
prices$log_DIS<-log(prices$DIS)
prices$log_RAD<-log(prices$RAD)
prices$log_TAX<-log(prices$TAX)
prices$log_PTRATIO<-log(prices$PTRATIO)
prices$log_B<-log(prices$B)
prices$log_LSTAT<-log(prices$LSTAT)
prices$log_MEDV<-log(prices$MEDV) #DV
prices$log_INDUS<-log(prices$INDUS)
#Refer to the profiling excel sheet to see all the correlations documented
#Function to get the list of correlations between : log_DV and log of IDV's
list_log<-names(prices)[c(15:25,27)]
for(i in 1:length(list_log))
{
xlog<-cor(prices$log_MEDV,prices[list_log[i]])
print(xlog)
}
#Function to get the list of correlations between : log_DV and IDV's
list_log_DV<-names(prices)[1:13]
list_log_DV<-list_log_DV[-4]
for(i in 1:length(list_log_DV))
{
xlogdv<-cor(prices$log_MEDV,prices[list_log_DV[i]])
print(xlogdv)
}
sampling<-sort(sample(nrow(prices), nrow(prices)*.7))
#Select training sample
train<-prices[sampling,]
test<-prices[-sampling,]
##Building SimpLe Linear Regression Model
#Metrics :
#Rsquare
#Coefficients
#P values : Significance levels of the IDV's
#Residuals distribution
#Factor variables as IDV's
#All good modelssummm
Reg<-lm(log_MEDV~CRIM+INDUS+RAD+TAX+B+
Charles.River.dummy.variable+
DIS+ZN+PTRATIO+LSTAT+AGE+X.rooms.dwelling+nitric.oxides.concentration,data=train)
summary(Reg)
#Getting the formula
formula(Reg)
#Getting the formula
formula(Reg)
#Remove insignificant variables :
Reg1<-lm(log_MEDV~
Charles.River.dummy.variable+
DIS+PTRATIO+LSTAT+AGE+X.rooms.dwelling+nitric.oxides.concentration,data=train)
summary(Reg1)
#Reg2 : remove insignificant values
Reg2 <- lm(log_MEDV ~CRIM+INDUS+RAD+TAX+B+
Charles.River.dummy.variable+
DIS+ZN+PTRATIO+LSTAT+X.rooms.dwelling+nitric.oxides.concentration, data=train)
summary(Reg2)
#Reg3 _ remove insignificant values
Reg3 <- lm(log_MEDV ~CRIM+RAD+
Charles.River.dummy.variable+
DIS+ZN+PTRATIO+LSTAT+nitric.oxides.concentration, data=train)
summary(Reg3)
#Some other combination
Reg4<-lm(log_MEDV~INDUS +ZN + X.rooms.dwelling + LSTAT+CRIM + Charles.River.dummy.variable,data=train)
summary(Reg4)
#The best model happens to be : Reg3
##Getting predicted values
predicted<-predict(Reg3)
plot(predicted)
length(predicted)
##Finding Residuals
residuals<-resid(Reg3)
plot(residuals)
length(residuals)
##Plotting Residuals vs Predicted Values
##Checking Heteroskedastcity
##There should be no trend between predicted values and residual values
plot(predicted,residuals,abline(0,0))
#You can notice that there seems to be an inverse pattern for some points
#So this model may not be the preferred model.
#atttching predicted values to test data
predicted<-predict(Reg3,newdata=test)
length(predicted)
test$p<-predicted
#Calculating error in the test dataset - (Actual- predicted)/predicted values
test$error<-(test$log_MEDV-test$p)/test$log_MEDV
mean(test$error)*100 #you get to know the average error in the given dataset
##Plotting actual vs predicted values
plot(test$p,col="blue",type="l")
lines(test$log_MEDV,col="red",type="l")
#checking for Correlation between variables
library(car)
vif(Reg3)
#You can drop variables if they have a vif>10 ; means high correlation between variables