-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathsearch.py
71 lines (64 loc) · 2.18 KB
/
search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#!/usr/bin/python
"""
This module implements the functions for handling user queries: parsing a
query into a term vector and ranking the stored documents against it.
temp/terms_to_integer stores the basis vector as a dictionary that maps
terms to integer ids.
"""
from shelve import DbfilenameShelf as shelve
from Models.document import Document
from porter import PorterStemmer
import mathutils
from counter import Counter
from numpy import zeros
from Models.document import Document
from numpy import dot
# Shelf opened with flag 'r'; its 'term2id' entry maps each known term to
# its integer index in the basis vector.
keyword_database = shelve('temp/terms_to_integer', 'r')
# The terms of the basis vector; query terms are filtered against these.
keywords = keyword_database['term2id'].keys()
# Length of every query vector: one slot per known term.
vec_length = len(keywords)
# Stemmer applied to each query word before it is looked up in `keywords`.
porter_stemmer = PorterStemmer()
# Shelf of document vectors, keyed by document id (opened with flag 'r').
dv = shelve('documentVectors', 'r')
# Shelf of document records, keyed by the same document ids; each record
# exposes a `.url` attribute (presumably a Document instance -- confirm).
doc_database = shelve('database1', 'r')
def query_parser(query):
    """
    Convert a raw query string into a term-frequency vector.

    The query is split on whitespace and every word is stemmed with the
    module-level Porter stemmer. Stems that are not part of the basis
    vector are dropped; each remaining stem contributes the number of times
    it occurs in the query, stored at the index given by the ``term2id``
    mapping.

    Args:
        query: The search string typed by the user.

    Returns:
        A numpy array of length ``vec_length``. When at least one query
        term is known the array is passed through
        ``mathutils.normalise_vector``; otherwise the all-zero vector is
        returned unchanged.
    """
    # Reading a key from a shelf unpickles the stored value on every
    # access, so fetch the term -> id mapping once rather than per term.
    term_to_id = keyword_database['term2id']

    stems = [porter_stemmer.stem(word) for word in query.split()]
    known_stems = [stem for stem in stems if stem in term_to_id]
    term_counts = dict(Counter(known_stems))

    query_vec = zeros(vec_length)
    for term, count in term_counts.items():
        query_vec[term_to_id[term]] = count

    if not term_counts:
        # NOTE(review): normalise_vector is defined elsewhere; it presumably
        # divides by the vector's length, which is zero here. Returning the
        # zero vector makes every document score 0 downstream.
        return query_vec
    return mathutils.normalise_vector(query_vec)
def search_database(query_vec, no_of_res=10):
    """
    Rank the stored documents against a query and print the best matches.

    Every document vector in ``dv`` is scored by its dot product with
    ``query_vec``. Documents are ordered by descending score, and the URL
    of each of the first ``no_of_res`` documents is printed if its score is
    non-zero. If none of them has a non-zero score, "No results found!!!"
    is printed instead.

    Args:
        query_vec: Query vector as produced by ``query_parser``.
        no_of_res: Maximum number of results to print (default 10).

    Returns:
        None. All output goes to standard output.
    """
    scored = [(doc_id, dot(dv[doc_id], query_vec))
              for doc_id in dv.iterkeys()]
    # list.sort is stable, so documents with equal scores keep shelf order.
    scored.sort(key=lambda pair: pair[1], reverse=True)

    found = 0
    # Slice rather than index 0..no_of_res-1: indexing raised IndexError
    # whenever the shelf held fewer than no_of_res documents. max(..., 0)
    # keeps a negative limit meaning "print nothing", as xrange did.
    for doc_id, score in scored[:max(no_of_res, 0)]:
        if score != 0:
            found += 1
            # Single-argument print(...) emits the same text as the
            # print statement.
            print(doc_database[doc_id].url)
    if found == 0:
        print("No results found!!!")
def search():
    """
    Run the interactive search prompt.

    Repeatedly reads a query from standard input, converts it with
    ``query_parser`` and prints the matches via ``search_database``. The
    loop ends when input is exhausted (EOF) or the user presses Ctrl-C at
    the prompt.
    """
    while True:
        try:
            query = raw_input("Search For : ")
        except (EOFError, KeyboardInterrupt):
            # Leave the prompt quietly instead of dumping a traceback;
            # the empty print finishes the unfinished prompt line.
            print("")
            return
        search_database(query_parser(query))
# Start the interactive prompt as soon as this file is executed.
# NOTE(review): the call is unguarded, so importing this module also enters
# the prompt loop; consider wrapping it in `if __name__ == "__main__":`.
search()