-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMLFinal.R
393 lines (372 loc) · 16.6 KB
/
MLFinal.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
# read data in from an Excel spreadsheet
library(readxl)
imdb.data <- read_excel("C:/Users/maura/Downloads/IMDb_Data.xlsx")
####################################################################
# look at summaries of eleven variables
# 1. IMDbRating
# look at numerical summary
summary(imdb.data$IMDbRating)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 7.500 7.700 7.900 7.957 8.100 9.300
# make a histogram to look at the spread (Figure 1)
hist(imdb.data$IMDbRating, freq=F, xlab="IMDb Rating",
main="Histogram of Top 1000 Films' Ratings")
# 2. Director
# sort the directors by number of films they have in the dataset
sort(table(imdb.data$Director))
# 3. TopFourStars
# get all actors' names into a list
four.thousand.actors <- NULL
actor1s <- rep(NA,1000); actor2s <- rep(NA,1000)
actor3s <- rep(NA,1000); actor4s <- rep(NA,1000)
for(i in 1:1000){
four.actors <- strsplit(imdb.data$TopFourStars[i], split=", ")
actor1s[i] <- four.actors[[1]][1] # store first actors
actor2s[i] <- four.actors[[1]][2] # store second actors
actor3s[i] <- four.actors[[1]][3] # store third actors
actor4s[i] <- four.actors[[1]][4] # store fourth actors
}
imdb.data$Actor1 <- actor1s # add 1st actors to dataframe
imdb.data$Actor2 <- actor2s # add 2nd actors to dataframe
imdb.data$Actor3 <- actor3s # add 3rd actors to dataframe
imdb.data$Actor4 <- actor4s # add 4th actors to dataframe
four.thousand.actors <- c(actor1s, actor2s, actor3s, actor4s)
# determine which actors are in more than 10 movies
big.actors <- table(four.thousand.actors)>10 # T if in more than 10
big.actors.names <- NULL # store names of those in more than 10 here
for(i in 1:2689){ # for each of the 2689 unique actors
if(big.actors[i]==T){ # if the actor has been in more than 10
big.actors.names <- c(big.actors.names, big.actors[i]) # store
}
}
big.actors.names # look at the actors that were in more than 10
sum(four.thousand.actors=="AlPacino") # Al Pacino in 13
sum(four.thousand.actors=="BradPitt") # Brad Pitt in 11
sum(four.thousand.actors=="ChristianBale") # Christian Bale in 11
sum(four.thousand.actors=="JamesStewart") # James Stewart in 11
sum(four.thousand.actors=="LeonardoDiCaprio") # Leo DiCaprio in 11
sum(four.thousand.actors=="MattDamon") # Matt Damon in 11
sum(four.thousand.actors=="RobertDeNiro") # Rob De Niro in 16
sum(four.thousand.actors=="TomHanks") # Tom Hanks in 14
# 4. Genre
# put all genres into one list
all.genres <- NULL
genre1s <- rep(NA, 1000); genre2s <- rep(NA, 1000)
genre3s <- rep(NA, 1000) # store up to 3 genres for each film
for(i in 1:1000){
genres <- strsplit(imdb.data$Genre[i],split=",")
genre1s[i] <- genres[[1]][1] # first listed genre
genre2s[i] <- genres[[1]][2] # second listed genre
genre3s[i] <- genres[[1]][3] # third listed genre
}
all.genres <- c(genre1s, genre2s, genre3s) # make list of all genres
all.genres <- na.omit(all.genres) # get rid of NAs
length(all.genres)/1000 # 2539/1000 = average of 2.5 genres/film
# look at how many films were in each genre (Table 4)
table(all.genres)
# change genre to factor of Action, Comedy, Crime, Drama, or Misc.
genre1 <- rep("Misc.", 1000)
for(i in 1:1000){
if (genre1s[i]=="Action"){genre1[i] <- "Action"}
else if(genre1s[i]=="Comedy"){genre1[i] <- "Comedy"}
else if(genre1s[i]=="Crime"){genre1[i] <- "Crime"}
else if(genre1s[i]=="Drama"){genre1[i] <- "Drama"}
}
imdb.data$Genre1 <- as.factor(genre1) # add 1st genre to dataframe
# look at table of first-listed genres (Table 5)
table(imdb.data$Genre1)
# make boxplot to look at IMDbRating spread for each genre (Figure 2)
plot(imdb.data$IMDbRating ~ imdb.data$Genre1,
xlab = "First Genre", ylab = "IMDb Rating",
main="Top 1000 Films' IMDb Ratings vs. Most Popular 1st Genres")
# 5. Duration
# look at numerical summary
summary(imdb.data$Duration)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 45.0 103.0 120.0 123.7 138.0 321.0
# make a histogram to look at the spread (Figure 3)
hist(imdb.data$Duration, freq=F, xlab="Minutes",
main="Histogram of Top 1000 Films' Durations")
# scatterplot to look at relationship between rating and runtime
# with slightly jittered points (Figure 4)
set.seed(12)
dur.jit <- runif(1000, -1, 1) # for Duration
set.seed(12)
rat.jit <- runif(1000, -.05, .05) # for IMDbRating
plot((imdb.data$Duration+dur.jit), (imdb.data$IMDbRating+rat.jit),
xlab="Duration (mins)", ylab = "IMDb Rating", pch=20,
main = "Top 1000 Films' Ratings vs. Durations", cex=.8)
# add a lowess line to visualize the trend
lines(lowess(imdb.data$Duration, imdb.data$IMDbRating), col="blue")
# look at correlation between rating and runtime
cor(imdb.data$Duration, imdb.data$IMDbRating) # 0.26
# scale Duration to standardize it
imdb.data$Duration <- scale(imdb.data$Duration)
# 6. AgeRating
# see how many films were in each rating category (Table 6)
table(imdb.data$AgeRating)
# G NC-17 NR Passed PG PG-13 R
# 39 5 189 78 151 171 367
# look at IMDb rating spread for each category (Figure 5)
boxplot(imdb.data$IMDbRating ~ imdb.data$AgeRating,
xlab="Age Rating", ylab="IMDb Rating",
main = "Top 1000 Films' IMDb Ratings vs. Age Ratings")
# convert the variable to a factor in the dataframe
imdb.data$AgeRating <- as.factor(imdb.data$AgeRating)
# 7. ReleaseYear
# look at numerical summary
summary(imdb.data$ReleaseYear)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 1920 1975 1999 1991 2010 2023
# make histogram to look at the spread (Figure 6)
hist(imdb.data$ReleaseYear, freq=F, xlab="Year",
main = "Histogram of Top 1000 Films' Release Years")
# make scatterplot to look at rating and year, w/ jitter (Figure 7)
set.seed(12)
year.jit <- runif(1000,-1,1)
plot((imdb.data$ReleaseYear+year.jit),(imdb.data$IMDbRating+rat.jit),
xlab="Year", ylab = "IMDb Rating", pch=20,
main = "Top 1000 Films' Ratings vs. Release Years")
# add a lowess line to the plot to visualize the trend
lines(lowess(imdb.data$ReleaseYear,imdb.data$IMDbRating), col="blue")
# look at the correlation between rating and release year
cor(imdb.data$ReleaseYear, imdb.data$IMDbRating) # -0.10
# use ReleaseYear to make ReleaseDecade and add to dataframe
decade <- rep(NA, 1000)
for(i in 1:1000){
if (imdb.data$ReleaseYear[i]<1930){decade[i]<-"1920s"}
else if(imdb.data$ReleaseYear[i]<1940){decade[i]<-"1930s"}
else if(imdb.data$ReleaseYear[i]<1950){decade[i]<-"1940s"}
else if(imdb.data$ReleaseYear[i]<1960){decade[i]<-"1950s"}
else if(imdb.data$ReleaseYear[i]<1970){decade[i]<-"1960s"}
else if(imdb.data$ReleaseYear[i]<1980){decade[i]<-"1970s"}
else if(imdb.data$ReleaseYear[i]<1990){decade[i]<-"1980s"}
else if(imdb.data$ReleaseYear[i]<2000){decade[i]<-"1990s"}
else if(imdb.data$ReleaseYear[i]<2010){decade[i]<-"2000s"}
else if(imdb.data$ReleaseYear[i]<2020){decade[i]<-"2010s"}
else if(imdb.data$ReleaseYear[i]<2024){decade[i]<-"2020s"}
}
imdb.data$ReleaseDecade <- as.factor(decade)
table(decade) # look at summary of decades (Table 7)
# make boxplot of IMDbRating vs. ReleaseDecade (Figure 8)
boxplot(imdb.data$IMDbRating~imdb.data$ReleaseDecade,
xlab = "Decade of Release", ylab = "IMDb Rating",
main = "Top 1000 Films' IMDb Ratings vs. Release Decade")
# 8. Language
# see how many films were in each language (Table 8)
table(imdb.data$Language)
# see spread of rating for languages with > 15 films (Figure 9)
pop.langs <- ifelse( imdb.data$Language=="English" |
imdb.data$Language=="French" |
imdb.data$Language=="German" |
imdb.data$Language=="Hindi" |
imdb.data$Language=="Japanese" |
imdb.data$Language=="Spanish", 1, 0)
boxplot(imdb.data$IMDbRating[pop.langs==1] ~
imdb.data$Language[pop.langs==1],
xlab="Language", ylab="IMDb Rating",
main="Boxplot of IMDb Rating vs. Language")
# modify Language to have English, Hindi, Japanese, French, or Misc.
language <- rep("Misc.", 1000)
for(i in 1:1000){
if (imdb.data$Language[i]=="English"){language[i]<-"English"}
else if(imdb.data$Language[i]=="Hindi"){language[i]<-"Hindi"}
else if(imdb.data$Language[i]=="Japanese"){language[i]<-"Japanese"}
else if(imdb.data$Language[i]=="French"){language[i]<-"French"}
}
# store languages as a factor in the dataframe
imdb.data$Language <- as.factor(language)
# 9. Metacritic Score
# store Metacritic Score as an int, not a char, in dataframe
imdb.data$MetacriticScore <- as.integer(imdb.data$MetacriticScore)
# look at numerical summary
summary(imdb.data$MetacriticScore)
# Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
# 28.00 71.00 80.00 78.79 88.00 100.00 149
# make separate dataset with films that have MC scores
MC.score.films <- na.omit(imdb.data)
# make histogram to look at spread (Figure 10)
hist(MC.score.films$MetacriticScore,
xlab = "Metacritic Score", freq=F,
main = "Histogram of Top 1000 Films' Scores on Metacritic")
# make scatterplot for IMDb score and Metacritic score (Figure 11)
set.seed(12) # jitter Metacritic Scores
mc.jit <- runif(851, -1, 1)
set.seed(12) # jitter IMDb Ratings
rat.jit <- runif(851, -.05, .05)
plot((MC.score.films$MetacriticScore+mc.jit),
(MC.score.films$IMDbRating+rat.jit), pch=20,
xlab = "Metacritic Score", ylab = "IMDb Rating",
main = "Top 1000 Films' Ratings vs. Metacritic Scores")
# add lowess line to plot to visualize trend
lines(lowess(MC.score.films$MetacriticScore,
MC.score.films$IMDbRating), col="blue")
# correlation between critics’ ratings and audiences’ ratings
cor(MC.score.films$MetacriticScore, MC.score.films$IMDbRating)#0.25
# scale MetacriticScore to standardize it
imdb.data$MetacriticScore <- scale(imdb.data$MetacriticScore)
# 10. Lead Gender
# see how many films had a lead actor of different genders
table(imdb.data$LeadGender)
# F M
# 169 831
# look at spread of ratings for different genders (Figure 12)
boxplot(imdb.data$IMDbRating ~ imdb.data$LeadGender,
xlab="Lead Actor's Gender", names=c("Female","Male"),
ylab = "IMDb Rating",
main="Boxplot of IMDb Rating vs. Lead Gender")
# store variable as a factor in dataframe
imdb.data$LeadGender <- as.factor(imdb.data$LeadGender)
# 11. Lead Race
# see how many films had a lead actor of different races
table(imdb.data$LeadRace)
# Asian Black Latino Pacific Islander White
# 170 31 27 2 770
# look at spread of ratings for different races (Figure 13)
boxplot(imdb.data$IMDbRating ~ imdb.data$LeadRace,
xlab="Lead Actor's Race",
names=c("Asian","Black","Latino","Pac. Isl.","White"),
main="Boxplot of IMDb Rating vs. Lead Race")
# change LeadRace to White, Asian, Black, or Latino/Pac.Isl.
race <- rep(NA, 1000)
for(i in 1:1000){
if (imdb.data$LeadRace[i]=="White"){race[i]<-"White"}
else if(imdb.data$LeadRace[i]=="Asian"){race[i]<-"Asian"}
else if(imdb.data$LeadRace[i]=="Black"){race[i]<-"Black"}
else if(imdb.data$LeadRace[i]=="Latino" |
imdb.data$LeadRace[i]=="Pacific Islander")
{race[i]<-"Latino/Pac.Isl."}
}
# store variable as a factor in dataframe
imdb.data$LeadRace <- as.factor(race)
#####################################################################
# fit models to predict IMDbRating on training data
# start at seed 12, then use seed 24, then seed 42, then seed 22
set.seed(12)
set.seed(24)
set.seed(42)
set.seed(22)
train.rows <- sample(1:1000, 750)
train.data <- imdb.data[train.rows,]
test.data <- imdb.data[-train.rows,]
# 1. Linear Regression
# fit linear regression model
lm.mod <- lm(IMDbRating ~ Genre1 + Duration + AgeRating + Language +
ReleaseDecade + MetacriticScore + LeadGender + LeadRace,
data=train.data)
summary(lm.mod) # look at parameter estimates and p-values
lm.mod.preds <- predict(lm.mod,na.omit(test.data)) # predict for test
mean((lm.mod.preds - na.omit(test.data)$IMDbRating)^2) # test MSE
# 2. Ridge Regression
library(glmnet)
# set up x and y matrices for training data
train.x <- model.matrix(IMDbRating ~ Genre1 + Duration + AgeRating +
ReleaseDecade + Language + MetacriticScore +
LeadGender + LeadRace, data=train.data)
train.y <- na.omit(train.data)$IMDbRating
set.seed(12)
set.seed(24)
set.seed(42)
set.seed(22)
grid <- 10^seq(1,-5,length=500) # grid of possible lambda values
# determine best value of lambda for this dataset
ridge.cv.out <- cv.glmnet(train.x, train.y, alpha=0, lambda=grid)
ridge.bestlam <- ridge.cv.out$lambda.min
# run ridge model on training set with best lambda
ridge.mod <- glmnet(train.x, train.y, alpha=0, lambda=ridge.bestlam,
thresh = 1e-15)
# set up x and y matrices for test data
test.x <- model.matrix(IMDbRating ~ Genre1 + Duration + AgeRating +
ReleaseDecade + Language + MetacriticScore +
LeadGender + LeadRace, data=test.data)
test.y <- na.omit(test.data)$IMDbRating
# predict IMDbRating and calculate MSE for test data
ridge.pred <- predict(ridge.mod, s=ridge.bestlam, newx=test.x)
mean((ridge.pred - test.y)^2)
ridge.mod$beta[order(ridge.mod$beta),] # parameter estimates
# 3. Lasso Regression
set.seed(12)
set.seed(24)
set.seed(42)
set.seed(22)
# determine best value of lambda for this dataset
lasso.cv.out <- cv.glmnet(train.x, train.y, alpha=1, lambda=grid)
lasso.bestlam <- lasso.cv.out$lambda.min
# run lasso model on training data
lasso.mod <- glmnet(train.x, train.y, alpha=1, lambda=lasso.bestlam,
thresh = 1e-15)
# predict IMDbRating and calculate MSE for test data
lasso.pred <- predict(lasso.mod, s=lasso.bestlam, newx=test.x)
mean((lasso.pred - test.y)^2)
lasso.mod$beta[order(lasso.mod$beta),] # parameter estimates
# 4. Best Subset Selection
library(leaps)
set.seed(12)
set.seed(24)
set.seed(42)
set.seed(22)
# see which 10 variables are determined to be most important
bss.mod <- regsubsets(IMDbRating ~ Genre1 + Duration + AgeRating +
ReleaseDecade + Language + MetacriticScore +
LeadGender + LeadRace, data=train.data,
nvmax = 10)
summary(bss.mod)
# fit the best 3 variable model and make predictions for test data
bss.lm.mod <- lm(IMDbRating ~ MetacriticScore + Duration +
ReleaseDecade, data=train.data)
summary(bss.lm.mod)
bss.lm.mod.preds <- predict(bss.lm.mod, na.omit(test.data))
mean((bss.lm.mod.preds - na.omit(test.data)$IMDbRating)^2) #test MSE
# 5. Regression Trees and Random Forest
library(rpart)
library(rpart.plot)
library(randomForest)
set.seed(12)
set.seed(24)
set.seed(42)
set.seed(22)
# make a tree from training data, prune it, and plot it (Figure 14)
tree <- rpart(IMDbRating ~ Genre1 + Duration + AgeRating +
ReleaseDecade + Language + MetacriticScore + LeadGender
+ LeadRace, data=train.data, method='anova',
control=rpart.control(minsplit=50,cp=.01,xval=10))
tree.pruned <- prune(tree, cp=.01)
prp(tree.pruned, faclen=0, extra=1, roundint=F, digits=5)
# predict IMDbRating for test data and find test MSE
tree.preds <- predict(tree.pruned, test.data)
mean((tree.preds - test.data$IMDbRating)^2)
set.seed(12)
set.seed(24)
set.seed(42)
set.seed(22)
# run randomForest, trying 5 variables at each split
tree.RF <- randomForest(IMDbRating ~ Genre1 + Duration + AgeRating +
ReleaseDecade + Language + MetacriticScore +
LeadGender + LeadRace, mtry = 5,
data=na.omit(train.data), importance=TRUE)
# see which variables were deemed most important (Figure 15)
importance(tree.RF); varImpPlot(tree.RF)
# predict IMDbRating for test data and find test MSE
tree.RF.preds <- predict(tree.RF, na.omit(test.data))
mean((tree.RF.preds - na.omit(test.data)$IMDbRating)^2)
##################################################################
# try to fit models to new test data
train.data <- imdb.data
test.data<-read_excel("C:/Users/maura/Documents/IMDb_Data_New.xlsx")
test.data$AgeRating <- as.factor(test.data$AgeRating)
test.data$ReleaseDecade <- as.factor(test.data$ReleaseDecade)
test.data$Genre1 <- as.factor(test.data$Genre1)
test.data$Language <- as.factor(test.data$Language)
test.data$LeadGender <- as.factor(test.data$LeadGender)
test.data$LeadRace <- as.factor(test.data$LeadRace)
test.data$Duration <- scale(test.data$Duration)
test.data$MetacriticScore <- scale(test.data$MetacriticScore)
# re-run linear regression, best subset selection, and regression
# trees using the above code
# get errors when trying to re-run ridge, lasso, and random forest
# using the above code
# visualize standard linear reg. model’s predictions (Figure 16)
plot(test.data$IMDbRating, lm.mod.preds, xlab="Actual Test Ratings",
ylab = "Predicted Test Ratings", pch=20,
main = "Predicted vs. Actual IMDbRatings for Test Data")
abline(0,1) # add the line y=x to the plot