"""
Web crawlers for Maimai
"""
import random
import time
import requests
from pymongo import MongoClient
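
# NOTE: the search endpoint is queried with these cookies on every request,
# so the crawler is assumed to need a logged-in Maimai session. Paste the
# raw Cookie header value copied from your browser's developer tools here.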
COOKIES_STRING = ''


def conn_to_mongo(collection_name):
    """
    Connect to MongoDB and return the requested collection
    under the `maimai` database.
    :param collection_name: 'zhiyan' or 'zhiyancomments'
    :return: the pymongo collection object
    """
    client = MongoClient()
    if collection_name == 'zhiyan':
        db = client.maimai.zhiyan
    elif collection_name == 'zhiyancomments':
        db = client.maimai.zhiyancomments
    else:
        # Fail fast instead of returning an unbound variable.
        raise ValueError('unknown collection: {}'.format(collection_name))
    return db


def insert_into_mongodb(db, collection_name, obj):
    """
    Insert a single document into MongoDB.
    :param db: collection returned by conn_to_mongo, or None to connect here
    :param collection_name: collection to open when db is None
    :param obj: document (dict) to insert
    :return:
    """
    if db is None:
        db = conn_to_mongo(collection_name)
    db.insert_one(obj)


def crawl_maimai_zhiyan(company_name):
    """
    Crawl Maimai Zhiyan (anonymous workplace gossip) posts for a company,
    paging through the search results and storing each post in MongoDB.
    :param company_name: company name to search for
    :return:
    """
    url = 'https://maimai.cn/search/gossips'
    offset = 0
    continue_crawl = True
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome'
                      '/73.0.3683.75 Safari/537.36',
        # requests' cookies= expects individual name/value pairs, so the raw
        # pasted cookie string is sent verbatim in the Cookie header instead.
        'cookie': COOKIES_STRING,
    }
    print('Start crawling Zhiyan for {}.'.format(company_name))
    while continue_crawl:
        payload = {
            'query': company_name,
            'limit': '20',
            'offset': offset,
            'searchTokens': [],
            'highlight': 'true',
            'sortby': 'time',
            'jsononly': 1,
        }
        response = requests.get(url=url, params=payload, headers=headers)
        if response.status_code == 200:
            gossips = response.json()['data']['gossips']
            if len(gossips) == 0:
                # An empty page means we have paged past the last result.
                continue_crawl = False
            else:
                print('Fetched {} gossips at offset {}.'.format(len(gossips), offset))
                collection_name = 'zhiyan'
                db = conn_to_mongo(collection_name)
                for gossip in gossips:
                    gossip['company_name'] = company_name
                    insert_into_mongodb(db, collection_name, gossip)
                offset += 20
        else:
            # Stop instead of hammering the server on auth/rate-limit errors.
            print('Request failed with status {}; stopping.'.format(response.status_code))
            continue_crawl = False
        time.sleep(random.randint(2, 5))  # wait 2-5 s between requests


if __name__ == '__main__':
    # crawl_maimai_zhiyan('京东')
    db = conn_to_mongo('zhiyan')
    gossips = db.find({"company_name": "京东"})
    gossip_texts = []
    for gossip in gossips:
        gossip_texts.append(gossip['gossip']['text'])
    print(''.join(gossip_texts))