forked from abinnz/xiaofendui
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathzk_monitor_wx.py
255 lines (243 loc) · 8.97 KB
/
zk_monitor_wx.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
# -*- coding: utf-8 -*-
import json
import os
import re
import time
import logging
import itchat
import random
import requests
from urllib.parse import unquote
from pyquery import PyQuery as py
from datetime import datetime
# 日志格式设定
logging.basicConfig(level=logging.INFO, format='\n%(asctime)s - %(levelname)s: %(message)s')
# 存储目录
BASE_DIR = '/tmp'
# 临时文件
ZK_TMP_FILE = BASE_DIR + '/zk_monitor.json'
# 请求头
REQUEST_HEADERS = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
}
# 爬虫链接
ZK_BASE_URL = 'http://www.zuanke8.com/forum.php?mod=forumdisplay&fid=15&filter=author&orderby=dateline'
ZK_POST_URL = 'http://www.zuanke8.com/thread-%s-1-1.html'
TUAN_BASE_URL = 'http://www.0818tuan.com/list-1-0.html'
# 任务间隔时间
ZK_TASK_INTERVAL = 30
# 请求超时设置,默认5秒
REQUEST_TIMEOUT = 5
# 编码
UTF8_ENCODING = 'utf-8'
GBK_ENCODING = 'gbk'
GB2312_ENCODING = 'gb2312'
# 监控关键字
KEYWORD = {
'include': r'密令|红包|洪水|大水|有水|速度|神券|京豆|好价|bug|\d+元|\d+券|\d+减\d+|\d+-\d+',
'exclude': r'权限|水.?贴|什么|怎么|怎样|不是|不能|不行|没有|反撸|求|啥|问|哪|吗|么|?'
}
# 匹配群聊名称
MATCH_ROOMS = ['冲鸭,羊毛小分队']
# 全局存储数据
result = dict()
last_result = dict()
# 推送名单
user_names = list()
def main_handler():
global result, last_result
try:
logging.info('临时文件:' + ZK_TMP_FILE)
if not os.path.exists(ZK_TMP_FILE):
logging.info('临时文件不存在')
else:
with open(ZK_TMP_FILE, 'r', encoding=UTF8_ENCODING) as f:
result = json.load(f)
logging.info('当前存储数据量:' + str(len(result.keys())))
# zk首页热门内容
d = py(ZK_BASE_URL, headers=REQUEST_HEADERS, encoding=GBK_ENCODING, timeout=REQUEST_TIMEOUT)
d('#threadlisttableid tbody').each(deal_post)
# 0818tuan
d = py(TUAN_BASE_URL, headers=REQUEST_HEADERS, encoding=GB2312_ENCODING, timeout=REQUEST_TIMEOUT)
d('.list-group > .list-group-item').each(deal_post_tuan)
except Exception as ex:
logging.exception('主任务运行异常:' + str(ex))
raise ex
finally:
# 创建存储目录
if not os.path.exists(BASE_DIR):
os.makedirs(BASE_DIR)
# 存储结果
with open(ZK_TMP_FILE, 'w', encoding=UTF8_ENCODING) as f:
# 超过数量清空
if len(result.keys()) > 500:
last_result.clear()
last_result = result
result = dict()
logging.info('保存数据结果')
json.dump(result, f)
# 每个帖子
def deal_post(i, e):
global result
match = re.match(r'normalthread_(\d+)', str(py(e).attr('id')))
if match is None:
return
# 帖子主键
post_id = match.group(1)
# 已存在
if is_result_include(post_id):
return
# 帖子标题
title = py(e).find('th').text()
if not is_keyword_valid(title):
return
# 帖子时间
time = py(e).find('td:eq(1)').find('em').text()
info = get_post_info(post_id, title=title, time=time)
result[post_id] = info
# 内容关键字判断
if not is_keyword_valid(info['content'], 'content'):
logging.info('内容关键字过滤:' + str(info))
return
logging.info('准备推送信息:' + str(info))
send_msg(info)
def deal_post_tuan(i, e):
global result
info = dict()
info['title'] = py(e).attr('title')
url = py(e).attr('href')
info['time'] = py(e).find('.badge-success').text()
info['images'] = list()
# 排除置顶
if info['title'] is None or info['time'] == '':
return
# 排除非权限贴以及标题非关键字
if len(re.findall(r'\[权\d*\]', info['title'])) == 0 \
or not is_keyword_valid(info['title']) \
or is_result_include(url):
return
info['url'] = os.path.dirname(TUAN_BASE_URL) + url
logging.info('爬取链接:' + info['url'])
d = py(info['url'], headers=REQUEST_HEADERS, encoding=GB2312_ENCODING, timeout=REQUEST_TIMEOUT)
# 区块元素
ele = d('.post-content>p:first').clone()
for img in ele.find('img'):
src = py(img).attr('src')
if len(re.findall('.jpg|.jpeg|.png', src, re.I)) == 0:
continue
info['images'].append(src)
info['content'] = py(re.sub(r'(<br/?>\s*\n?)+', '@#@', ele.remove('img').html())).text().replace('@#@', '\n')
mapping = get_url_mapping(ele)
for href in mapping:
info['content'] = info['content'].replace(href, mapping[href])
result[url] = info
# 内容非关键字
if not is_keyword_valid(info['content'], 'content'):
logging.info('内容关键字过滤:' + str(info))
return
logging.info('准备推送信息:' + str(info))
send_msg(info)
# 判读结果是否包含
def is_result_include(key):
global result, last_result
# 已存在
if result.get(key) is not None or last_result.get(key) is not None:
return True
else:
return False
# 判断是否符合关键字
def is_keyword_valid(text, check_type='title'):
if check_type == 'title':
if len(re.findall(KEYWORD['include'], text, re.I)) == 0 \
or len(re.findall(KEYWORD['exclude'], text, re.I)) > 0:
return False
else:
return True
else:
return False if len(re.findall(KEYWORD['exclude'], text, re.I)) > 0 else True
# 发送消息
def send_msg(info):
# 推送用户名单
global user_names
files = list()
content = '%s\n\n%s\n\n电脑版:%s' % (info['title'], info['content'], info['url'])
for name in user_names:
itchat.send_msg(content, toUserName=name)
# 发送图片
for url in info['images']:
path = os.path.join(BASE_DIR, os.path.basename(url))
files.append(path)
f = open(path, 'wb')
f.write(requests.get(url).content)
f.close()
itchat.send_image(path, toUserName=name)
time.sleep(random.randint(2, 5))
# 删除临时文件
for path in files:
if os.path.exists(path):
os.remove(path)
# 获取帖子链接内容
def get_post_info(post_id, title=None, time=None):
info = dict()
info['url'] = ZK_POST_URL % post_id
logging.info('爬取链接:' + info['url'])
d = py(info['url'], headers=REQUEST_HEADERS, encoding=GBK_ENCODING, timeout=REQUEST_TIMEOUT)
# 帖子标题
post_title = d('#thread_subject').attr('title')
# 帖子时间
post_time = d('.pti:first>.authi:first').find('em:first').text().replace('发表于 ', '')
# 帖子图片
info['images'] = list()
if post_title is None:
info['title'] = None if title is None else title
info['time'] = None if time is None else time
info['content'] = d('#messagetext>p:first').text()
else:
info['title'] = post_title
info['time'] = post_time
ele = d('#postlist>div:first').find('tr:first').find('.t_f').clone()
info['content'] = ele.remove('ignore_js_op').text()
mapping = get_url_mapping(ele)
for href in mapping:
info['content'] = info['content'].replace(href, mapping[href])
for e in d('.t_fsz:first').find('ignore_js_op').find('img'):
e = py(e)
if e.attr('aid') is None:
continue
src = e.attr('file')
if len(re.findall('.jpg|.jpeg|.png', src, re.I)) == 0:
continue
info['images'].append(src)
return info
# 获取要替换的链接
def get_url_mapping(ele):
url_mapping = dict()
for i in ele.find('a'):
href = py(i).attr['href']
url = py(i).text()
if re.match(r'https?.+\.\..+|.*链接.*', url) is None \
or href is None:
continue
if '0818tuan' in href and '?u=' in href:
url_mapping[url] = unquote(href.split('?u=')[1])
else:
url_mapping[url] = unquote(href)
return url_mapping
if __name__ == '__main__':
logging.info('请扫描二维码登录')
itchat.auto_login(hotReload=True, enableCmdQR=True)
logging.info('登录成功')
chatrooms = itchat.get_chatrooms(contactOnly=True)
for room in chatrooms:
logging.info('获取群聊通讯:' + room['NickName'] + '|' + room['UserName'])
if room['NickName'] in MATCH_ROOMS:
user_names.append(room['UserName'])
# 定时循环
while True:
try:
main_handler()
except Exception as ex:
logging.exception('主线程异常:' + str(ex))
time.sleep(ZK_TASK_INTERVAL)
# 退出登录
itchat.logout()