# This file was used to scrape anime entries from MyAnimeList by visiting their web pages.
# CONFIG
size = 290  # number of pages to scrape per batch before uploading the results to the MongoDB database
wait_time = 65  # number of seconds to wait between batches (to avoid triggering anti-DoS protection)
# END CONFIG
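# Dependencies (assumed install commands, inferred from the imports below):
#   pip install pymongo requests-html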
from pymongo import MongoClient
from requests_html import AsyncHTMLSession
import time
# connect to the MongoDB database
# note: tlsAllowInvalidCertificates=True disables TLS certificate verification
client = MongoClient("INSERT MONGODB ADDRESS HERE", tlsAllowInvalidCertificates=True)
database = client['MyAnimeList']
# ANSI terminal color codes: r, y, g, b, p, v, plus B(old), U(nderline), and E(nd/reset)
class c:
    r = '\033[91m'
    y = '\033[93m'
    g = '\033[92m'
    b = '\033[96m'
    p = '\033[94m'
    v = '\033[95m'
    B = '\033[1m'
    U = '\033[4m'
    E = '\033[0m'
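# usage example: print(f'{c.r}{c.B}bold red text{c.E}')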
# get the id of the last anime we scraped (stored in the 'Misc' collection)
last_entry = database['Misc'].find_one({}, {'_id': 0})['last_entry']
print(f'{c.v}Last Anime ID scraped: {c.B}{c.p}{last_entry}{c.E}')
# an async session lets us scrape multiple pages at a time
asession = AsyncHTMLSession()
# documents scraped in the current batch, pending upload
batch = []
# set to True when MyAnimeList's anti-DoS protection is triggered
wait = False
async def scrape(anime_id):
    # visit the anime's page
    site = await asession.get(f'https://myanimelist.net/anime/{anime_id}')
    # get the title, and check whether the page is a 404
    try:
        title = site.html.find('#contentWrapper')[0].find('h1')[0].text
        if title == '404 Not Found':
            return
    except Exception as e:
        # anti-DoS protection was triggered
        global wait
        if not wait:
            print(f'{c.r}Anti-DoS triggered! \U0001f612 Anime #{anime_id} Error: {e}{c.E}')
            wait = True
        return
    data = {}
    data['_id'] = anime_id
    data['title'] = title
    data['rating'] = site.html.find('.score-label')[1].text
    # get the anime's tags from the information sidebar
    tags = site.html.find('#content .leftside .spaceit_pad')
    for div in tags:
        # div is a <div class='spaceit_pad'> element; its first <span> holds the tag name followed by ':'
        tag = div.find('span')[0].text[:-1]
        if tag in ['Type', 'Status', 'Rating']:
            data[tag] = div.text[len(tag) + 2:]
        # some tags are plural and hold a list of links
        elif tag in ['Genres', 'Themes', 'Producers', 'Studios', 'Demographics']:
            data[tag] = [x.text for x in div.find('a')]
        # singular tags hold a single link; store it under the plural key as a
        # one-element list so the field type is consistent across documents
        elif tag in ['Genre', 'Theme', 'Producer', 'Studio', 'Demographic']:
            data[tag + 's'] = [div.find('a')[0].text]
    # get the plot summary, stripping newlines and the MAL Rewrite credit
    data['plot'] = site.html.find('p[itemprop="description"]')[0].text.replace('\n', '').replace('[Written by MAL Rewrite]', '')
    # add the scraped document to the batch
    batch.append(data)
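# A scraped document ends up shaped roughly like this (illustrative sketch;
# the values below are hypothetical, not real scraped output):
# {
#     '_id': 1,
#     'title': 'Cowboy Bebop',
#     'rating': '8.75',
#     'Type': 'TV',
#     'Status': 'Finished Airing',
#     'Rating': 'R - 17+ (violence & profanity)',
#     'Genres': ['Action', 'Sci-Fi'],
#     'Studios': ['Sunrise'],
#     'plot': '...'
# }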
# time when the whole scrape started
start_time = time.time()
while last_entry < 52000:
    print(f'{c.b}Scraping from: {c.B}{c.p}{last_entry}{c.E} \U0001f916')
    # time how long this batch takes
    batch_time = time.time()
    # scrape the next `size` pages concurrently
    asession.run(*[lambda x=y: scrape(x) for y in range(last_entry, last_entry + size)])
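    # note: the x=y default argument above freezes each id at lambda-creation
    # time; without it, every lambda would close over the final value of y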
    # anti-DoS was triggered: discard the batch and wait 5 minutes
    if wait:
        batch.clear()
        wait = False
        for minutes_left in range(5, 0, -1):
            print(f'{c.y}Anti-DoS triggered, restarting in {minutes_left} min...{c.E}')
            time.sleep(60)
        continue
    # check whether we scraped any animes this batch
    if len(batch):
        print(f'{c.g}Batch finished \U00002705 {c.b}Average speed: {c.p}{c.B}{round(size / (time.time() - batch_time))}{c.E}{c.b} pages per second. {c.g}{c.B}+{len(batch)}{c.E}{c.b} animes{c.E}')
        # insert the batch into the database
        database['Animes'].insert_many(batch)
        batch.clear()
    else:
        # if no animes were scraped, move on to the next batch
        batch.clear()
        print(f'{c.y}No animes were scraped for this batch.{c.E}')
    # persist last_entry so the scrape can resume from here if it is restarted
    last_entry += size
    database['Misc'].update_one({}, {"$set": {"last_entry": last_entry}})
    # wait wait_time seconds so we don't trigger the anti-DoS protection
    print(f'{c.b}Last entry is now: {c.B}{c.p}{last_entry}{c.E}{c.b} Waiting {c.B}{c.p}{wait_time}{c.E}{c.b} seconds...{c.E}')
    time.sleep(wait_time)
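# To run (assumed setup, inferred from the code above): fill in the MongoDB
# address, make sure the 'Misc' collection contains a document with a
# 'last_entry' field, then run `python anime_scraper.py`.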