TP_retain_releaser_divid.py
# -*- coding: utf-8 -*-
# author: Scandium
# work_location: CSM Peking
# project : TC
# time: 2020/01/02 21:05
import pkuseg
import csv
# import jieba.posseg as jp
import os
def file_to_list(file_name):
    """Read a CSV or plain-text file into a list, trying UTF-8 first, then GB18030."""
    if file_name.endswith('csv'):
        try:
            with open(file_name, "r", encoding='utf-8') as csv_file:
                # Strip NUL bytes that would otherwise break the csv reader.
                csv_r = csv.reader(line.replace('\0', '') for line in csv_file)
                return [row for row in csv_r]
        except (UnicodeDecodeError, csv.Error):
            # Fall back to GB18030 for locally exported CSV files.
            with open(file_name, "r", encoding='gb18030') as csv_file:
                csv_r = csv.reader(line.replace('\0', '') for line in csv_file)
                return [row for row in csv_r]
    else:
        try:
            with open(file_name, "r", encoding='utf-8') as file1:
                return file1.readlines()
        except UnicodeDecodeError:
            try:
                # Fall back to GB18030, mirroring the CSV branch.
                with open(file_name, "r", encoding='gb18030') as file1:
                    return file1.readlines()
            except (UnicodeDecodeError, OSError):
                print('Can not open', file_name)
                return []
def ld_to_csv(input_dic, csv_directory, csv_name):
    """Write a dict (or list of rows) out as a GB18030-encoded CSV file."""
    out_path = os.path.join(csv_directory, '{name}.csv'.format(name=csv_name))
    with open(out_path, 'w', newline='', encoding='gb18030') as csv_w:
        file = csv.writer(csv_w)
        if isinstance(input_dic, dict):
            for key in input_dic:
                value = input_dic[key]
                if isinstance(value, list):
                    # A list value becomes a row of its own elements.
                    list_write = list(value)
                else:
                    # A scalar value is written as a key/value pair.
                    list_write = [key, value]
                file.writerow(list_write)
        elif isinstance(input_dic, list):
            for row in input_dic:
                file.writerow(list(row))
def del_lastN(input_list):
    """Strip the trailing newline from every entry in a list of lines."""
    return [item.strip('\n') for item in input_list]
def stop_word_build():
    """Build the stop-word list: base list plus additions, minus recovered words."""
    stop_words = del_lastN(file_to_list('stop_words'))
    stop_words_add = del_lastN(file_to_list('stop_word_add'))
    stop_words_recover = del_lastN(file_to_list('stop_word_recover'))
    stop_word = set(stop_words + stop_words_add)
    return [word for word in stop_word if word not in stop_words_recover]
def dic_order_by_value(input_dic):
    """Return a copy of the dict sorted by value in descending order."""
    list_tuple = sorted(input_dic.items(), key=lambda item: item[1], reverse=True)
    return dict(list_tuple)
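# A minimal usage sketch of the helpers above (hypothetical names, not part of
# the original pipeline): count tokens into a dict, order it with
# dic_order_by_value, and write it out with ld_to_csv. Wrapped in a function so
# importing this module does not run it.
def _example_word_count(tokens, out_dir='.', out_name='word_count'):
    counts = {}
    for token in tokens:
        counts[token] = counts.get(token, 0) + 1
    ordered = dic_order_by_value(counts)    # highest count first
    ld_to_csv(ordered, out_dir, out_name)   # writes <out_dir>/word_count.csv
    return ordered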
# Everything below this line is test code.
stop_words = stop_word_build()
seg = pkuseg.pkuseg(postag=False)
es_dir = r'F:\TC\retrain_2_topic'
for topic_file_name in os.listdir(es_dir):
    file_open = file_to_list(os.path.join(es_dir, topic_file_name))
    print(topic_file_name)
    for line in file_open:
        releaser = line[0]
        # Segment the releaser name into tokens with pkuseg.
        releaser_div = seg.cut(releaser)
        new_line = releaser_div + line[1:]
        # Drop stop words before writing the row out.
        write_line = [word for word in new_line if word not in stop_words]
        with open(os.path.join(r'F:\TC\div_releaser', topic_file_name),
                  'a+', newline='', encoding='gb18030') as csv_file:
            # The with-statement closes the file, so no explicit close() is needed.
            csv_w = csv.writer(csv_file)
            csv_w.writerow(write_line)
# End of test code.