-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathtopicModel.py
115 lines (86 loc) · 3.49 KB
/
topicModel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import os
import pickle

import numpy as np
from sklearn.decomposition import NMF

from preprocess import preprocess_text
def get_topics(model, feature_names, no_top_words):
    """
    Summarises every topic of a fitted model by its highest-weighted terms.

    Parameters
    ----------
    model : object
        Fitted decomposition model exposing ``components_``; each row is
        read as the per-feature weights of one topic.
    feature_names : sequence of str
        Feature (term) names, indexed by column of ``model.components_``.
    no_top_words : int
        Number of highest-weighted terms to keep for each topic.

    Returns
    -------
    dict
        Maps each topic index to a space-separated string of its top
        terms, ordered from highest to lowest weight.
    """
    def top_terms(weights):
        # argsort is ascending, so flip it before taking the leading entries.
        ranked = weights.argsort()[::-1][:no_top_words]
        return " ".join(feature_names[i] for i in ranked)

    return {
        idx: top_terms(weights)
        for idx, weights in enumerate(model.components_)
    }
def possibleTopics(data_dir, tfidf_model, vectSum, no_components=10, num_top_words=10):
    """
    Fits an NMF topic model to the TF-IDF summary vectors and saves it.

    The fitted model is pickled to "nmf_model.pk" and a dictionary holding
    the top words of every topic is pickled to "topic_dict.pk", both
    inside data_dir.

    Parameters
    ----------
    data_dir : string
        Path to directory to store data.
    tfidf_model : sklearn.feature_extraction.text.TfidfVectorizer
        The model which is used to extract the tfidf vectors from a
        given processed corpus. Only its feature names are read here.
    vectSum : scipy.sparse.csr.csr_matrix
        The matrix with tfidf vectors of all the papers.
    no_components : int, optional
        Number of topics (NMF components) to extract. Defaults to 10.
    num_top_words : int, optional
        Number of top words stored per topic in the topic dictionary.
        Defaults to 10.

    Returns
    -------
    nmf_model : sklearn.decomposition.NMF
        An NMF model fitted to the given TF-IDF vectors.
    """
    # get_feature_names() was dropped from newer scikit-learn releases in
    # favour of get_feature_names_out(); use the new name when it exists
    # and fall back for older installations.
    if hasattr(tfidf_model, "get_feature_names_out"):
        tf_feature_names = tfidf_model.get_feature_names_out()
    else:
        tf_feature_names = tfidf_model.get_feature_names()

    # NOTE(review): `alpha` appears to have been deprecated in scikit-learn
    # 1.0 and removed in 1.2 in favour of `alpha_W`/`alpha_H`, which scale
    # the penalty differently. It is left untouched here because switching
    # would change the fitted model -- confirm the targeted version.
    nmf_model = NMF(n_components=no_components, random_state=1,
                    alpha=.1, l1_ratio=.5, init='nndsvd').fit(vectSum)

    # os.path.join keeps the path correct whether or not data_dir ends
    # with a path separator.
    with open(os.path.join(data_dir, "nmf_model.pk"), 'wb') as fp:
        pickle.dump(nmf_model, fp)
    print("NMF model saved!")

    topic_dict = get_topics(
        nmf_model, tf_feature_names, num_top_words)
    with open(os.path.join(data_dir, "topic_dict.pk"), 'wb') as fp:
        pickle.dump(topic_dict, fp)
    print("Saved topic dictionary!")

    return nmf_model
def get_topic_class(vect, model=None):
    """
    Calculates the most probable topic for a given vector.

    Parameters
    ----------
    vect : numpy.array or sparse matrix row
        A TF-IDF vector. A 1-D vector is treated as a single sample and
        reshaped to one row before being transformed.
    model : sklearn.decomposition.NMF, optional
        Fitted model used to score the vector. When omitted, the
        module-level ``nmf_model`` is used, which only exists when this
        file is run as a script.

    Returns
    -------
    prob_topic : int
        The most probable topic the TF-IDF vector belongs to.

    Raises
    ------
    NameError
        If no model is passed and no module-level ``nmf_model`` exists.
    """
    if model is None:
        # Backward compatibility: older callers relied on the global that
        # the script entry point creates.
        model = globals().get("nmf_model")
        if model is None:
            raise NameError(
                "get_topic_class() needs a fitted model: pass `model` or "
                "define a module-level `nmf_model`")

    # Iterating a dense 2-D array yields 1-D rows; present each one as a
    # single-row matrix so it is scored as one sample.
    if getattr(vect, "ndim", 2) == 1:
        vect = vect.reshape(1, -1)

    topic_probability_scores = model.transform(vect)
    # Scores are summed over rows so a multi-row input still yields one topic.
    prob_topic = np.argmax(np.sum(topic_probability_scores, axis=0))
    return prob_topic


def labelTopics(data_dir, nmf_model, vectSum):
    """
    Takes all the summary vectors and assigns a topic label to each one.
    Saves the labels to data_dir with name "topic_labels.pk".

    Parameters
    ----------
    data_dir : string
        Path to directory to store data.
    nmf_model : sklearn.decomposition.NMF
        NMF model used to get topics from a TF-IDF vector.
    vectSum : np.ndarray or sparse matrix
        TF-IDF vectors for the summary of all the research papers, one
        row per paper.
    """
    # Pass the model explicitly so the labels come from the caller's model
    # rather than from whatever global happens to be defined.
    topic_labels = np.array(
        [get_topic_class(row, nmf_model) for row in vectSum])

    # os.path.join keeps the path correct whether or not data_dir ends
    # with a path separator.
    with open(os.path.join(data_dir, "topic_labels.pk"), 'wb') as fp:
        pickle.dump(topic_labels, fp)
    print("Saved topic labels!")
if __name__ == "__main__":
    # Every pickle this script reads or writes lives in this directory.
    data_dir = "./data/"

    # Previously fitted TF-IDF vectorizer.
    vectorizer_path = data_dir + 'vectorizer.pk'
    with open(vectorizer_path, 'rb') as vectorizer_file:
        vectorizer = pickle.load(vectorizer_file)

    # TF-IDF vectors of the paper summaries.
    vectors_path = data_dir + "tfidf-vectors-200.pk"
    with open(vectors_path, "rb") as vectors_file:
        vectSum = pickle.load(vectors_file)

    # Reuse a saved NMF model when one exists; otherwise fit a new one
    # (possibleTopics also saves it). `nmf_model` stays a module-level
    # name because get_topic_class may look it up as a global.
    model_path = data_dir + "nmf_model.pk"
    try:
        with open(model_path, 'rb') as model_file:
            nmf_model = pickle.load(model_file)
    except FileNotFoundError:
        nmf_model = possibleTopics(data_dir, vectorizer, vectSum)

    labelTopics(data_dir, nmf_model, vectSum)