#!/usr/bin/python
from tokenizedocuments import TokenizeDocuments
from shelve import DbfilenameShelf as shelve
import mathutils
from numpy import zeros
# TODO: move the mathematical functions to another module
class Indexer(object):
    """
    Class to handle the indexing of the documents stored in the
    database. Indexer uses the tokenizedocuments module to retrieve
    the tokens (terms) of each document.
    Note that the tokens are not sorted alphabetically but by their
    integer ids. The basis vector of our document space is derived
    from the unique terms across all documents. Its dimension may be
    very large, i.e. of the order of thousands or tens of thousands,
    so we may choose to filter out low-frequency terms to reduce it.
    We also precompute all the document vectors and store them in the
    database, which makes later similarity computations faster.
    """
    def __init__(self, database):
        self.db = database
        self.tokenizer = TokenizeDocuments(database)
        self.vec_length = 0
        # set_collection_length also restores self.vec_length as a side effect
        self.collection_length = self.set_collection_length(database)
    def get_document_keywords(self):
        """
        Return the keywords and their integer ids from the documents
        stored in the database.
        """
        t = self.tokenizer
        # get all tokens from the database
        all_terms = t.get_all_terms()
        # determine the frequencies of the terms
        term_freq = t.terms_counter(all_terms)
        # filter out tokens with a frequency of 1
        t.filter_terms(term_freq, [1])
        # get the list of unique tokens
        list_of_terms = term_freq.keys()
        # ids is a dictionary of the tokens keyed by their integer ids;
        # it is also stored in another database for possible future use
        # or reference
        ids = t.assign_id_to_terms(list_of_terms)
        # the dimension of a vector in the document space is basically
        # the length of list_of_terms (equivalently, of ids)
        self.vec_length = len(ids)
        return 0
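    # For illustration (hypothetical data): if term_freq were
    # {'cat': 5, 'dog': 1}, filter_terms(term_freq, [1]) is assumed to
    # drop 'dog', so list_of_terms becomes ['cat'] and vec_length 1.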
    def set_collection_length(self, database):
        """
        Determines the length of the document collection for the
        self.collection_length attribute of the class. As a side
        effect, it also restores self.vec_length from the term-id
        database.
        """
        db = shelve(database, 'r')
        temp_file = shelve('temp/terms_integerid', 'r')
        self.vec_length = len(temp_file['id2term'])
        collection_length = len(db.keys())
        temp_file.close()
        db.close()
        return collection_length
    def yield_documents(self):
        """
        Generator to yield the documents from the database one by one.
        """
        db = shelve(self.db, 'w')
        for document in db.itervalues():
            yield document
    def yield_keyword(self):
        """
        Generator to yield the keywords from the term-id database one
        by one.
        """
        temp_file = shelve('temp/terms_integerid', 'w')
        for value in temp_file['id2term'].itervalues():
            yield value
    def calculate_document_frequency(self):
        """
        Calculate the document frequencies of the tokens. Document
        frequency is the number of documents in which a keyword
        appears. It will later be used to calculate the inverse
        document frequencies.
        """
        self.get_document_keywords()
        # 'c' opens the shelf for writing, creating it if necessary
        doc_freq = shelve('doc_frequencies', 'c')
        keywords = self.yield_keyword()
        for i in xrange(0, self.vec_length):
            count = 0
            kw = keywords.next()
            # scan the whole collection and count the documents that
            # contain the current keyword
            doc_generator = self.yield_documents()
            flag = True
            while flag:
                try:
                    if kw in doc_generator.next().all_terms:
                        count += 1
                except StopIteration:
                    flag = False
            doc_freq[kw] = count
            print kw, count
        doc_freq.close()
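    # For reference, the term weight that mathutils is assumed to
    # compute from these counts is the usual tf-idf (a sketch, not
    # necessarily the exact formula used here):
    #     idf(t) = log(N / df(t))
    #     weight(t, d) = tf(t, d) * idf(t)
    # where N is the collection length and df(t) the document frequency
    # stored in the doc_frequencies shelf above.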
    def set_doc_vector(self):
        """
        Iterates through the documents and sets their document vectors.
        Requires calculate_document_frequency to have been run first.
        """
        doc_freq = shelve('doc_frequencies', 'w')
        # 'c' opens the shelf for writing, creating it if necessary
        dv = shelve('documentVectors', 'c')
        keyword_database = shelve('temp/terms_to_integer', 'r')
        # keywords maps each term to its integer id, i.e. its index in
        # the document vector
        keywords = keyword_database['term2id']
        db = shelve(self.db, 'w')
        for document in db.itervalues():
            key = document.key
            doc_terms = document.unique_terms_freq
            doc_vector = zeros(self.vec_length)
            for kw in keywords:
                if kw in doc_terms:
                    tf = doc_terms[kw]
                    # the inverse document frequency is computed from
                    # the size of the collection, not of the vocabulary
                    term_weight = mathutils.calculate_term_weight(tf, doc_freq[kw], self.collection_length)
                    doc_vector[keywords[kw]] = term_weight
            doc_vector = mathutils.normalise_vector(doc_vector)
            print doc_vector
            dv[key] = doc_vector
        db.close()
        keyword_database.close()
        dv.close()
        doc_freq.close()
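    # The normalisation step above is assumed to divide the vector by
    # its Euclidean norm, so that the cosine similarity between two
    # document vectors reduces to a plain dot product; e.g. a raw
    # vector [3.0, 4.0] would be stored as [0.6, 0.8].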
if __name__ == '__main__':
    i = Indexer('database1')
    i.calculate_document_frequency()
    i.set_doc_vector()