-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathimdb.py
74 lines (63 loc) · 2.97 KB
/
imdb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
from bs4 import BeautifulSoup
from urllib.request import urlopen
import urllib
import csv
from halo import Halo
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlencode
import pandas as pd
from pandas import ExcelWriter
# Titles to look up on IMDb, in the order they will be searched.
movie_title_list = [
    "Yomeddine",
    "Shoplifters",
    "Capernaum",
    "Cold War",
    "Dogman",
    "Aga",
    "El Angel",
    "A Twelve-Year Night",
    "In the Aisles",
    "Tel Aviv on Fire",
    "Birds of passage",
    "The Third Wife",
    "Pity",
    "Volcano",
    "Rona, Azim’s Mother",
    "Tale of the Sea",
    "A Family Tour",
    "Donbass",
    "Dovlatov",
    "Foxtrot",
    "Jumpman",
    "One Step Behind the Seraphim",
    "Rojo",
    "Suleiman Mountain",
    "The House that Jack Built",
    "The Interpreter",
    "The Man who bought the moon",
    "The Sisters Brothers",
    "The Reports on Sarah and Saleem",
    "Woman at War",
    "Yuli",
    "The Heiresses",
    "Our Time",
    "Namme",
    "Everybody Knows",
    "MUHAMMAD: THE MESSENGER OF GOD",
    "Border",
    "The Ballad of Buster Scruggs",
]
# For a quick single-title run, override the list, e.g.:
# movie_title_list = ['Rona, Azim’s Mother']

# Prefix for the relative links found on the search page.
base_url = "https://www.imdb.com"

# Accumulators filled by the scraping loop:
#   pd_data                -> [title, rating, url] rows for the DataFrame
#   failed_movie_requests  -> (search_url, title) pairs that gave no result
#   multiple_search_result -> (search_url, title) pairs with more than one hit
pd_data = []
failed_movie_requests = []
multiple_search_result = []
# Seconds to wait for IMDb before giving up on a request; without a timeout a
# stalled connection would hang the whole run.
request_timeout = 30

# For every title: run IMDb's exact-title search, follow the first hit to its
# title page and read the rating shown there. Titles that cannot be resolved
# are recorded in failed_movie_requests instead of aborting the run.
for movie_title in movie_title_list:
    encoded = urlencode(dict(q=movie_title, s='tt', exact='true', ref_='fn_al_tt_ex'))
    full_url = f"https://www.imdb.com/find?{encoded}"

    try:
        search_html = requests.get(full_url, timeout=request_timeout).text
    except requests.RequestException:
        failed_movie_requests.append((full_url, movie_title))
        continue

    search_list_soup = BeautifulSoup(search_html, "html.parser")
    # The first <table> on the page holds the result rows; it is absent when
    # the search found nothing, so guard instead of indexing blindly.
    search_list_table = search_list_soup.find('table')
    rows = search_list_table.find_all('tr') if search_list_table is not None else []
    if not rows:
        failed_movie_requests.append((full_url, movie_title))
        continue

    # Take the link from the first *result row* (not the first <tr> anywhere
    # in the document) so it matches the rows that were counted above.
    row_tr_a = rows[0].find('a')
    movie_url = row_tr_a.get('href') if row_tr_a is not None else None
    if not movie_url:
        failed_movie_requests.append((full_url, movie_title))
        continue

    movie_fullurl = base_url + movie_url
    try:
        movie_text = requests.get(movie_fullurl, timeout=request_timeout).text
    except requests.RequestException:
        failed_movie_requests.append((full_url, movie_title))
        continue

    movie_soup = BeautifulSoup(movie_text, "html.parser")
    rating_div = movie_soup.find('div', class_='ratingValue')
    rating_span = rating_div.span if rating_div is not None else None
    # '0' marks a title page on which no rating could be found.
    rating = rating_span.text if rating_span is not None else '0'

    # More than one row means the first hit may not be the intended film;
    # remember the search so it can be checked by hand.
    if len(rows) > 1:
        multiple_search_result.append((full_url, movie_title))

    pd_data.append([movie_title, rating, movie_fullurl])
# Debug aids (uncomment to inspect what the loop collected):
# print('Movies failed to get rating ---------------------')
# print(failed_movie_requests)
# print('Multiple search result ---------------------')
# print(multiple_search_result)

# Build the result table, rank it by rating (best first), show it and save it
# to Imdb.xlsx on sheet "Sheet5".
df = pd.DataFrame(pd_data, columns=['Title', 'Rating', 'Url'])
# Ratings are scraped as text; convert them to numbers so the sort is numeric
# rather than lexicographic (as strings, "9.0" would rank above "10.0").
# Anything unparsable is treated like a missing rating, i.e. 0.
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce').fillna(0.0)
sorted_df = df.sort_values(by='Rating', ascending=False)
print(sorted_df)
# The context manager writes and closes the workbook; ExcelWriter.save() no
# longer exists in pandas 2.x.
with ExcelWriter('Imdb.xlsx') as writer:
    sorted_df.to_excel(writer, sheet_name='Sheet5')