sample1.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os

from gensim import corpora, models, similarities


def getFileList(dir):
    return [dir + x for x in os.listdir(dir)]


dictLists = getFileList('./dict/')
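# Assumed on-disk layout (inferred from the loader classes below, not stated in the original):
# for every base name X there is a text-format dictionary at ./dict/X.dict and a matching
# Matrix Market corpus at ./corpus/X.mm.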
class LoadDictionary(object):
    def __init__(self, dictionary):
        self.dictionary = dictionary

    def __iter__(self):
        for dictFile in dictLists:
            sFileRaw, sFilePostfix = os.path.splitext(dictFile)
            sFileDir, sFileName = os.path.split(sFileRaw)
            (dictFile, corpusFile) = ('./dict/' + sFileName + '.dict', './corpus/' + sFileName + '.mm')
            yield self.dictionary.load_from_text(dictFile)

class LoadCorpus(object):
    def __iter__(self):
        for dictFile in dictLists:
            sFileRaw, sFilePostfix = os.path.splitext(dictFile)
            sFileDir, sFileName = os.path.split(sFileRaw)
            (dictFile, corpusFile) = ('./dict/' + sFileName + '.dict', './corpus/' + sFileName + '.mm')
            yield corpora.MmCorpus(corpusFile)
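
# A minimal sketch (not part of this script) of how the ./dict/*.dict and ./corpus/*.mm pairs
# consumed above could have been produced with gensim; `tokenized_texts` and the base name
# 'novel1' are hypothetical placeholders:
#
#   d = corpora.Dictionary(tokenized_texts)                 # build a dictionary from tokenized documents
#   d.save_as_text('./dict/novel1.dict')                    # text format read back by Dictionary.load_from_text()
#   bow = [d.doc2bow(text) for text in tokenized_texts]     # bag-of-words vectors
#   corpora.MmCorpus.serialize('./corpus/novel1.mm', bow)   # Matrix Market file read back by corpora.MmCorpus()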
"""
预处理(easy_install nltk)
"""
# 简化的 中文+英文 预处理
def pre_process_cn(inputs, low_freq_filter=True):
"""
1.去掉停用词
2.去掉标点符号
3.处理为词干
4.去掉低频词
"""
import nltk
import jieba.analyse
from nltk.tokenize import word_tokenize
texts_tokenized = []
for document in inputs:
texts_tokenized_tmp = []
for word in word_tokenize(document):
texts_tokenized_tmp += jieba.analyse.extract_tags(word, 10)
texts_tokenized.append(texts_tokenized_tmp)
texts_filtered_stopwords = texts_tokenized
# 去除标点符号
english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%']
texts_filtered = [[word for word in document if not word in english_punctuations] for document in
texts_filtered_stopwords]
# 词干化
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
texts_stemmed = [[st.stem(word) for word in docment] for docment in texts_filtered]
# 去除过低频词
if low_freq_filter:
all_stems = sum(texts_stemmed, [])
stems_once = set(stem for stem in set(all_stems) if all_stems.count(stem) == 1)
texts = [[stem for stem in text if stem not in stems_once] for text in texts_stemmed]
else:
texts = texts_stemmed
return texts
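
# Quick usage sketch (assumes the nltk 'punkt' tokenizer data and jieba are installed);
# the sample sentence is hypothetical:
#
#   texts = pre_process_cn([u'Machine learning 是一门让计算机从数据中学习的学科。'], low_freq_filter=False)
#   # -> a list containing one list of extracted, stemmed keyword tokens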

dictionary = corpora.dictionary.Dictionary()
dictionary_memory_friendly = LoadDictionary(dictionary)
for vector in dictionary_memory_friendly:
    dictionary = vector  # keep the most recently loaded dictionary

corpus = []
corpus_memory_friendly = LoadCorpus()
for vector in corpus_memory_friendly:
    corpus.append(vector[0])  # take the first document from each corpus file

if 0 < len(corpus):
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

# When id2word=dictionary is not supplied, LsiModel rebuilds a dictionary from the corpus internally
model = models.LsiModel(corpus_tfidf, id2word=None, num_topics=20, chunksize=2000000)
index = similarities.Similarity('./novel_', model[corpus], num_features=len(corpus))
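
# Note (assumption, not from the original): gensim's Similarity expects num_features to match the
# dimensionality of the indexed vectors; since model[corpus] yields LSI vectors, num_topics (20 here)
# is the usual choice, e.g.:
#
#   index = similarities.Similarity('./novel_', model[corpus], num_features=model.num_topics)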

# The target document: a passage taken at random from a novel
target_courses = ['男人们的脸上沉重而冷凝,蒙着面纱的女人们则是发出断断续续的哭泣声,他们无比专注地看着前方,见证一场生与死的拉锯战。']
target_text = pre_process_cn(target_courses, low_freq_filter=False)
"""
对具体对象相似度匹配
"""
# 选择一个基准数据
ml_course = target_text[0]
# 词袋处理
ml_bow = dictionary.doc2bow(ml_course)
# 在上面选择的模型数据 lsi model 中,计算其他数据与其的相似度
ml_lsi = model[ml_bow] # ml_lsi 形式如 (topic_id, topic_value)
sims = index[ml_lsi] # sims 是最终结果了, index[xxx] 调用内置方法 __getitem__() 来计算ml_lsi
# 排序,为输出方便
sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])
# 查看结果
print sort_sims[0:10]
print len(dictLists)
print dictLists[sort_sims[1][0]]
print dictLists[sort_sims[2][0]]
print dictLists[sort_sims[3][0]]