# -*-coding:UTF-8-*-
__author__ = 'Moore'
import nltk
import jieba
import string
import scipy as sp
import numpy as np
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import linear_model
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report
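# Data layout (inferred from the field indices used below, not documented here): train lines are
# tab-separated with the forward/comment/like counts in fields 3-5 and the weibo text in field 6;
# predict lines keep uid/mid in fields 0-1 and the text in field 3.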
f_train = open(".//data//weibo_train_data//weibo_train_data.txt")  # returns a file object
f_test = open(".//data//weibo_predict_data//weibo_predict_data.txt")
# i = 0
# line = f_train.readline().decode("utf-8")  # call the file's readline() method
# # lines = f.readlines()
# train_lines_num = len(f_train.readlines())
# print "total lines = ", train_lines_num
# f_train.seek(0, 0)
stop_words_file_c = open(".//dic//stopwords_c.txt")
stop_words_file_e = open(".//dic//stopwords_e.txt")
stopword_c = stop_words_file_c.readline().decode("utf-8")
stop_words_c = []
while stopword_c:
stop_words_c.append(stopword_c.strip('\n'))
stopword_c = stop_words_file_c.readline().decode("utf-8")
# for word in stop_words_c:
# print word
# stopword_e = stop_words_file_e.readline().decode("utf-8")
# stop_words_e = []
# while stopword_e:
# stop_words_e.append(stopword_e)
# stopword_e = stop_words_file_e.readline()
# for word in stop_words_e:
# print word
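# preprocess(): removes what is presumably a fixed-length (19-character) short link from a weibo,
# starting at the first occurrence of "http"; texts without a link are returned unchanged.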
def preprocess(weibo_str):
    # str.find() returns -1 when "http" is absent (and 0 when it is the first character),
    # so the presence check has to compare against -1 explicitly
    if weibo_str.find("http") != -1:
        start = weibo_str.find("http")
        remove_str = weibo_str[start:start + 19]
        return weibo_str.replace(remove_str, "")
    else:
        return weibo_str
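# get_words(): segments a weibo with jieba and returns a {word: count} dict, skipping Chinese
# stop words and tokens made up entirely of ASCII letters.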
def get_words(weibo_str):
    item_words_dic = {}
    seg_list = jieba.cut(weibo_str)
    for orginal_word in seg_list:
        if orginal_word not in stop_words_c:
            if filter(lambda c: c not in string.letters, orginal_word):
                if orginal_word in item_words_dic:
                    # increment the existing count (counting the key list always gave 1)
                    item_words_dic[orginal_word] += 1
                else:
                    item_words_dic[orginal_word] = 1
    return item_words_dic
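# file_to_arr(): turns raw tab-separated lines into a space-joined word corpus for the sklearn
# vectorizers; in 'train' mode it also collects the forward/comment/like counts as labels. Rows
# that segment to nothing get the placeholder token 'no_features' so the corpus stays aligned
# with the label lists.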
def file_to_arr(lines, text_index, train_or_test):
    train_like = []
    train_comment = []
    train_forward = []
    corpus = []
    for index in range(len(lines)):
        single_line = lines[index].decode("utf-8").split('\t')
        str_without_http = preprocess(single_line[text_index].replace("\n", "").replace(" ", ""))
        if str_without_http is not None:
            seg_list = jieba.cut(str_without_http)
            seg_result = []
            for orginal_word in seg_list:
                if orginal_word not in stop_words_c and filter(lambda c: c not in string.letters, orginal_word):
                    seg_result.append(orginal_word)
            if len(seg_result) != 0:
                corpus.append(' '.join(seg_result))
            else:
                corpus.append('no_features')
        else:
            corpus.append('no_features')
        # the labels are collected once per line, whichever branch produced the corpus entry
        if train_or_test == 'train':
            train_forward.append(single_line[3])
            train_comment.append(single_line[4])
            train_like.append(single_line[5])
    if train_or_test == 'train':
        return corpus, train_forward, train_comment, train_like
    else:
        return corpus
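# feature_tfidf(): reads (up to) the requested number of lines from each file, drops training rows
# whose comment count is zero (the "debug" block; forward_train and like_train stay unfiltered),
# and builds tf-idf matrices for train and test. Note that readlines(1000000) only pulls roughly
# the first 1 MB of each file, so read_size == -1 does not read everything; also, the test
# vectorizer reuses the training vocabulary but is re-fit on the test corpus, so its idf statistics
# come from the test data (tv.transform(test_text_arr) would keep them shared with training).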
def feature_tfidf(train_file, test_file, train_text_index, test_text_index, train_read_size, test_read_size):
total_train_lines = train_file.readlines(1000000)
total_test_lines = test_file.readlines(1000000)
if train_read_size == -1:
train_lines = total_train_lines
else:
train_lines = total_train_lines[:train_read_size]
train_size = len(train_lines)
if test_read_size == -1:
test_lines = total_test_lines
else:
test_lines = total_test_lines[:test_read_size]
test_size = len(test_lines)
train_text_arr, forward_train, comment_train, like_train = file_to_arr(train_lines, train_text_index, 'train')
test_text_arr = file_to_arr(test_lines, test_text_index, 'test')
# debug start
train_text_arr_nozero = []
comment_train_nozero = []
for i in range(len(comment_train)):
if int(comment_train[i]) != 0:
train_text_arr_nozero.append(train_text_arr[i])
comment_train_nozero.append(comment_train[i])
train_text_arr = train_text_arr_nozero
comment_train = comment_train_nozero
# debug end
tv = TfidfVectorizer(sublinear_tf=True, max_df=0.5)
tfidf_train = tv.fit_transform(train_text_arr)
tv2 = TfidfVectorizer(vocabulary=tv.vocabulary_)
tfidf_test = tv2.fit_transform(test_text_arr)
return tfidf_train, tfidf_test, forward_train, comment_train, like_train
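# mnbclf_compute(): fits a multinomial naive Bayes classifier on the tf-idf features, using the
# comment counts directly as class labels; prints accuracy on a 20% held-out split and returns
# predictions for the real test set.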
def mnbclf_compute(train_file, test_file, train_text_index, test_text_index, train_read_size, test_read_size):
    tfidf_train, tfidf_test, forward_train, comment_train, like_train = feature_tfidf(train_file, test_file,
                                                                                      train_text_index, test_text_index,
                                                                                      train_read_size, test_read_size)
    x_train, x_test, y_train, y_test = train_test_split(tfidf_train, comment_train, test_size=0.2)
    clf = MultinomialNB().fit(x_train, y_train)
    # clf_forward = MultinomialNB().fit(tfidf_train, forward_train)
    # clf_comment = MultinomialNB().fit(tfidf_train, comment_train)
    # clf_like = MultinomialNB().fit(tfidf_train, like_train)
    # accuracy has to be measured on the held-out split, not against the unlabelled test set
    print(np.mean(clf.predict(x_test) == y_test))
    doc_class_predicted = clf.predict(tfidf_test)
    # print(doc_class_predicted)
    return doc_class_predicted
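# lassoclf_compute(): treats comment-count prediction as regression; fits a Lasso model
# (alpha=0.3) on the tf-idf features, writes the non-zero coefficients to ./coef.txt, prints the
# R^2 score on the held-out split, and returns the held-out predictions cast to int.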
def lassoclf_compute(train_file, test_file, train_text_index, test_text_index, train_read_size, test_read_size):
tfidf_train, tfidf_test, forward_train, comment_train, like_train = feature_tfidf(train_file, test_file,
train_text_index, test_text_index,
train_read_size, test_read_size)
x_train, x_test, y_train, y_test = train_test_split(tfidf_train, comment_train, test_size=0.2)
y_train_int = [int(element) for element in y_train]
y_test_int = [int(element) for element in y_test]
regr = linear_model.Lasso(alpha=0.3)
regr.fit(x_train, y_train_int)
coef_file = open('./coef.txt', 'w')
for e in regr.coef_:
if e!=0:
coef_file.write(str(e) + '\n')
coef_file.close()
doc_class_predicted = [int(element) for element in regr.predict(x_test)]
print regr.score(x_test, y_test_int)
return doc_class_predicted
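# tfidf_compute(): older nltk-oriented pipeline. It fits one tf-idf model over the train and test
# texts together and materialises a dense {word: weight} dict per document, which is very
# memory-hungry for large corpora; kept for the commented-out NaiveBayesClassifier code at the
# bottom of the file.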
def tfidf_compute(corpus_file, test_file, weiboindex, readsize, testreadsize):
total_train_lines = corpus_file.readlines()
total_test_lines = test_file.readlines()
if readsize == -1:
lines = total_train_lines
else:
lines = total_train_lines[:readsize]
train_size = len(lines)
if testreadsize == -1:
test_lines = total_test_lines
else:
test_lines = total_test_lines[:testreadsize]
test_size = len(test_lines)
lines.extend(test_lines)
corpus = []
for index in range(len(lines)):
single_line = lines[index].decode("utf-8").split('\t')
if index < train_size:
str_without_http = preprocess(single_line[weiboindex].replace("\n", "").replace(" ", ""))
else:
str_without_http = preprocess(single_line[3].replace("\n", "").replace(" ", ""))
if str_without_http is not None:
seg_list = jieba.cut(str_without_http)
seg_result = []
for orginal_word in seg_list:
if orginal_word not in stop_words_c and filter(lambda c: c not in string.letters, orginal_word):
seg_result.append(orginal_word)
corpus.append(' '.join(seg_result))
    vectorizer = CountVectorizer()  # converts the texts into a term-frequency matrix; element a[i][j] is the frequency of word j in document i
    transformer = TfidfTransformer()  # computes the tf-idf weight of every word
    # the first (outer) fit_transform computes tf-idf, the second (inner) one converts the texts into the term-frequency matrix
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    print tfidf
    word = vectorizer.get_feature_names()  # all the words in the bag-of-words vocabulary
    weight = tfidf.toarray()  # dense tf-idf matrix; element a[i][j] is the tf-idf weight of word j in document i
trainset_like_para = []
trainset_forward_para = []
trainset_comment_para = []
testset_para = []
test_textid_userid_para = []
    for i in range(len(weight)):  # the outer loop walks over every text, the inner loop over the tf-idf weight of each word in that text
single_line = lines[i].decode("utf-8").split('\t')
if i < train_size:
like = single_line[5]
forward = single_line[3]
comment = single_line[4]
features = {}
print u"第", i, u"个训练文本的词语tf-idf权重计算完毕"
for j in range(len(word)):
features[word[j]] = weight[i][j]
# print word[j], weight[i][j]
trainset_like_para.append((features, like))
trainset_forward_para.append((features, forward))
trainset_comment_para.append((features, comment))
else:
features = {}
textid = single_line[0]
userid = single_line[1]
print u"第", (i - train_size), u"个测试文本的词语tf-idf权重计算完毕"
for j in range(len(word)):
features[word[j]] = weight[i][j]
testset_para.append(features)
test_textid_userid_para.append(textid + '\t' + userid + '\t')
return trainset_like_para, trainset_forward_para, trainset_comment_para, testset_para, test_textid_userid_para
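# tfidf_compute_test(): single-file variant of tfidf_compute(); returns (features, label) pairs
# for the requested category ("like", "forward" or "comment").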
def tfidf_compute_test(test_file, category, weiboindex, readsize):
lines = test_file.readlines(readsize)
corpus = []
featuresets = []
for single_line in lines:
single_line = single_line.decode("utf-8").split('\t')
str_without_http = preprocess(single_line[weiboindex].replace("\n", "").replace(" ", ""))
if str_without_http is not None:
seg_list = jieba.cut(str_without_http)
seg_result = []
for orginal_word in seg_list:
if orginal_word not in stop_words_c and filter(lambda c: c not in string.letters, orginal_word):
seg_result.append(orginal_word)
corpus.append(' '.join(seg_result))
    vectorizer = CountVectorizer()  # converts the texts into a term-frequency matrix; element a[i][j] is the frequency of word j in document i
    transformer = TfidfTransformer()  # computes the tf-idf weight of every word
    tfidf = transformer.fit_transform(
        vectorizer.fit_transform(corpus))  # the first (outer) fit_transform computes tf-idf, the second (inner) one builds the term-frequency matrix
    word = vectorizer.get_feature_names()  # all the words in the bag-of-words vocabulary
    weight = tfidf.toarray()  # dense tf-idf matrix; element a[i][j] is the tf-idf weight of word j in document i
    for i in range(len(weight)):  # the outer loop walks over every text, the inner loop over each word's tf-idf weight
single_line = lines[i].decode("utf-8").split('\t')
like = single_line[5]
forward = single_line[3]
comment = single_line[4]
features = {}
print u"-------这里输出第", i, u"个文本的词语tf-idf权重------"
for j in range(len(word)):
features[word[j]] = weight[i][j]
# print word[j], weight[i][j]
if category == "like":
featuresets.append((features, like))
elif category == "forward":
featuresets.append((features, forward))
elif category == "comment":
featuresets.append((features, comment))
return featuresets
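# Script entry point: read up to 10000 training lines and 1000 prediction lines, run the Lasso
# pipeline on the comment counts, and write its held-out-split predictions to ./result.txt as
# "index value" lines.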
corpussize = 10000
testsize = 1000
result = lassoclf_compute(f_train, f_test, 6, 3, corpussize, testsize)
print len(result)
outfile = open('./result.txt', 'w')
for idx in range(len(result)):
outfile.write(str(idx) + ' ' + str(result[idx]) + '\n')
outfile.close()
# trainset_like, trainset_forward, trainset_comment, testset, test_textid_userid = tfidf_compute(f_train, f_test, 6,
# corpussize, testsize)
# classifier_like = nltk.NaiveBayesClassifier.train(trainset_like)
# classifier_forward = nltk.NaiveBayesClassifier.train(trainset_forward)
# classifier_comment = nltk.NaiveBayesClassifier.train(trainset_comment)
#
# test_like_result = []
# test_forward_result = []
# test_comment_result = []
#
# for text in testset:
# like_predict = classifier_like.classify(text)
# forward_predict = classifier_forward.classify(text)
# comment_predict = classifier_comment.classify(text)
# test_like_result.append(like_predict)
# test_forward_result.append(forward_predict)
# test_comment_result.append(comment_predict)
#
# for i in range(len(test_like_result)):
# print i, test_textid_userid[i] + str(test_like_result[i]) + ',' + str(test_forward_result[i]) + ',' + str(
# test_comment_result[i])
# print type(classifier.classify(testset))
# print 'accuracy =', nltk.classify.accuracy(classifier, testset)
# single_line = lines[77].decode("utf-8").split('\t')
# str_without_http = preprocess(single_line[6].replace("\n", "").replace(" ", ""))
# print single_line
# print str_without_http
# word_elements = get_words(str_without_http)
# for word in word_elements.keys():
# print i, word, word_elements[word]
# while i <= 100:
# # print line
# line = f.readline().decode("utf-8")
# t = line.split('\t')
# str_without_http = preprocess(t[6].replace("\n", "").replace(" ", ""))
# if str_without_http is not None:
# word_elements = get_words(str_without_http)
# for word in word_elements.keys():
# print i, word, word_elements[word]
# i += 1
f_train.close()
f_test.close()
stop_words_file_c.close()
stop_words_file_e.close()