-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathwechat_history.py
178 lines (133 loc) · 5.28 KB
/
wechat_history.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
"""
Actions in this script:
1. Read the JSON file to get the account parameters.
2. Use the webdriver to open the account history home page; the session and
cookies are then saved.
3. Open the account posts by making requests to getmsg.
4. Loop over the getmsg pages until there are no more messages to load.
5. Save the messages to a CSV file in the same directory as the JSON file.
"""
import csv
import json
import os
from datetime import datetime, timezone

from selenium import webdriver
# Shared Chrome webdriver used for every page load in this script.
# NOTE: it is created at import time and the chromedriver path is hard-coded.
_CHROME = webdriver.Chrome(r'c:\temp\chromedriver.exe')
# Common prefix of the profile_ext URLs; the URL builders below append the
# query string to it.
_ROOT = 'https://mp.weixin.qq.com/mp/profile_ext?'
def get_account_info(info_json):
    """Load the account parameters from a JSON file and validate them.

    Args:
        info_json: path to a UTF-8 encoded JSON file holding the account
            parameters.

    Returns:
        The decoded JSON object.

    Raises:
        KeyError: if ``__biz``, ``uin``, ``key`` or ``pass_ticket`` is
            missing from the file.
        ValueError: if one of those entries is ``None`` or an empty string.
    """
    with open(info_json, 'r', encoding='utf-8') as infile:
        account = json.load(infile)
    # Explicit checks rather than ``assert``: assertions are stripped when
    # Python runs with -O, which would let empty parameters slip through.
    for name in ('__biz', 'uin', 'key', 'pass_ticket'):
        if account[name] in (None, ''):
            raise ValueError(
                'account parameter {!r} is empty in {}'.format(name, info_json))
    return account
def read_page_content(url):
    """Load ``url`` in the shared browser and return the text of its body.

    Args:
        url: address of the page to open.

    Returns:
        The ``text`` of the page's ``body`` element.
    """
    # Local import so this function does not rely on a new module-level name.
    from selenium.webdriver.common.by import By

    _CHROME.get(url)
    # ``find_element_by_xpath`` is deprecated in Selenium 4 and removed in
    # later 4.x releases; ``find_element(By.XPATH, ...)`` works on old and
    # new releases.
    return _CHROME.find_element(By.XPATH, './/body').text
def json_loads_recursive(json_str):
    """Decode a JSON object string whose string values may hold JSON objects.

    Only strings that decode to a JSON object (dict) are expanded. Anything
    else -- a non-string input, text that is not valid JSON, or a string
    that decodes to a scalar or a list (e.g. ``"123"``) -- is returned
    unchanged, so ordinary text values keep their original form.

    Values inside a decoded object that are already dicts or lists are not
    strings and are therefore returned as they are, without descending
    into them.

    Args:
        json_str: the value to decode; usually a string.

    Returns:
        A dict when ``json_str`` is a JSON object string, otherwise
        ``json_str`` itself.
    """
    # Non-string values (numbers, dicts, lists, None) are already decoded.
    if not isinstance(json_str, str):
        return json_str
    try:
        # No ``encoding`` argument: it was ignored since Python 3.1 and
        # removed in 3.9, where passing it raises TypeError.
        decoded = json.loads(json_str)
    except json.JSONDecodeError:
        # Plain text that is not JSON at all.
        return json_str
    # Strings such as "123" or "null" are valid JSON but not objects; keep
    # the original text instead of failing on ``.items()``.
    if not isinstance(decoded, dict):
        return json_str
    return {key: json_loads_recursive(value) for key, value in decoded.items()}
def construct_home_url(account_info):
    """Build the ``action=home`` URL for the account described by ``account_info``.

    The ``__biz``, ``uin``, ``key`` and ``pass_ticket`` entries of
    ``account_info`` are inserted as query parameters, followed by a fixed
    set of client parameters.
    """
    credentials = [
        '{}={}'.format(field, account_info[field])
        for field in ('__biz', 'uin', 'key', 'pass_ticket')
    ]
    fixed_tail = ['scene=124', 'devicetype=Windows+10', 'version=6204014f',
                  'lang=en', 'a8scene=7', 'winzoom=1']
    query = '&'.join(['action=home'] + credentials + fixed_tail)
    return _ROOT + query
def construct_message_url(account_info, new=False, next_offset=0):
    """Build the ``action=getmsg`` URL for one page of messages.

    Only ``__biz`` is taken from ``account_info``. When ``new`` is true the
    offset is forced to 0; otherwise ``next_offset`` is used.
    """
    if new:
        start = 0
    else:
        start = next_offset
    query = 'action=getmsg&__biz={}&offset={}&f=json'.format(
        account_info['__biz'], start)
    return _ROOT + query
class Chunk(object):
    """
    One page of results returned by a request to getmsg.

    The page at ``url`` is fetched and decoded once, on construction; the
    properties are read-only views into the decoded response.

    Attributes:
    - contents: the decoded response
    - return_code: return code (``ret``)
    - errmsg: error message; callers in this file treat 'ok' as success
    - message_count: seems it will always be 10, even when requesting more
      than 10
    - can_msg_continue: 1 or 0, if 0, no more message to load
    - message_list: list of messages, empty when the response carries none
    - next_offset: offset number of next chunk
    """

    def __init__(self, url):
        self.contents = json_loads_recursive(read_page_content(url))

    @property
    def return_code(self):
        return self.contents.get('ret')

    @property
    def errmsg(self):
        return self.contents.get('errmsg')

    @property
    def message_count(self):
        return self.contents.get('msg_count')

    @property
    def can_msg_continue(self):
        return self.contents.get('can_msg_continue')

    @property
    def message_list(self):
        # Responses without a decoded ``general_msg_list`` object (e.g. an
        # error reply) yield an empty list instead of raising
        # AttributeError or returning None.
        general = self.contents.get('general_msg_list')
        if not isinstance(general, dict):
            return []
        return general.get('list') or []

    @property
    def next_offset(self):
        return self.contents.get('next_offset')
def get_all_messages(account_info):
    """Collect the messages of every getmsg page for the account.

    Pages are requested starting at offset 0. Fetching stops when a page
    reports that there is nothing more to load, or when a page's error
    message is not 'ok'.

    Args:
        account_info: account parameters; passed to ``construct_message_url``.

    Returns:
        A list with the messages gathered so far. It is always a list, even
        when a request fails part-way (previously that case returned None
        and dropped what had already been collected).
    """
    chunk = Chunk(construct_message_url(account_info, new=True))
    messages = []
    while chunk.errmsg == 'ok':
        # Result is good, keep its messages (tolerate a missing list).
        messages.extend(chunk.message_list or [])
        if not chunk.can_msg_continue:
            break
        next_url = construct_message_url(
            account_info, next_offset=chunk.next_offset)
        chunk = Chunk(next_url)
    return messages
def save_messages_to_csv(messages, account_json_path):
    """Write the publish time, title and URL of each message to a CSV file.

    The file is created next to ``account_json_path`` and named after the
    part of its file name before the first dot, with a ``.csv`` extension.
    Rows are written with the ``csv`` module, so titles containing commas,
    quotes or line breaks are quoted instead of corrupting the row.

    Messages lacking ``comm_msg_info``/``datetime`` or ``app_msg_ext_info``,
    or carrying an unusable timestamp, are skipped.

    Args:
        messages: iterable of message dicts.
        account_json_path: path of the account JSON file; only used to
            derive the output location.
    """
    path, name = os.path.split(os.path.abspath(account_json_path))
    output_csv = os.path.join(path, '{}.csv'.format(name.split('.')[0]))
    # newline='' leaves line-ending handling to the csv writer.
    with open(output_csv, 'w', encoding='utf-8', newline='') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(['publish time', 'title', 'url'])
        for msg in messages:
            try:
                pub_timestamp = msg['comm_msg_info']['datetime']
                article = msg['app_msg_ext_info']
                title = article.get('title')
                url = article.get('content_url')
                # Timezone-aware replacement for the deprecated
                # ``datetime.utcfromtimestamp``.
                pub_dt = datetime.fromtimestamp(pub_timestamp, tz=timezone.utc)
            except (KeyError, TypeError, AttributeError, ValueError,
                    OverflowError, OSError):
                # Missing fields or a bad timestamp: skip this message.
                continue
            # Drop tzinfo so the text keeps the 'YYYY-MM-DD HH:MM:SS' form.
            writer.writerow([str(pub_dt.replace(tzinfo=None)), title, url])
def main(account_json):
    """Collect an account's messages and save them to CSV.

    Args:
        account_json: path to the JSON file with the account parameters;
            the CSV file is written next to it.
    """
    account_info = get_account_info(account_json)
    home_url = construct_home_url(account_info)
    # The home page is opened before any getmsg request is made.
    _CHROME.get(home_url)
    # Treat a missing result as "no messages" so the report below cannot
    # fail on len(None).
    messages = get_all_messages(account_info) or []
    print('{} messages collected'.format(len(messages)))
    # Only show a sample when there is one; messages[0] would raise
    # IndexError on an empty result.
    if messages:
        print('This is the most recent one:')
        print(messages[0])
    save_messages_to_csv(messages, account_json)
if __name__ == '__main__':
    import sys

    # Expect exactly one argument: the path of the account JSON file.
    # Exit with a usage message instead of an IndexError traceback.
    if len(sys.argv) < 2:
        sys.exit('usage: {} <account_info.json>'.format(sys.argv[0]))
    main(sys.argv[1])