-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdamerau_levenshtein.py
82 lines (62 loc) · 2.3 KB
/
damerau_levenshtein.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import sys
import time
NodeCount = 0
class TrieNode:
def __init__(self):
self.word = None
self.children = {}
global NodeCount
NodeCount += 1
def insert(self, word):
node = self
for letter in word:
if letter not in node.children:
node.children[letter] = TrieNode()
node = node.children[letter]
node.word = word
trie = TrieNode()
def search(word, maxcost, ratio_calc = False):
currentrow = range(len(word)+1)
results = []
for letter in trie.children:
search_again(trie.children[letter], letter, None, word, currentrow, None, results, maxcost, ratio_calc)
return results
def search_again(node, letter, prev_letter, word, prevRow, pre_previous_row, results, maxcost, ratio_calc):
columns = len(word) + 1
currentRow = [prevRow[0] + 1]
for column in range(1,columns):
insertcost = currentRow[column - 1] + 1
deletecost = prevRow[column] + 1
if word[column - 1] != letter:
if ratio_calc:
replacecost = prevRow[column - 1] + 2
else:
replacecost = prevRow[column - 1] + 1
else:
replacecost = prevRow[column - 1]
currentRow.append(min( insertcost, deletecost, replacecost))
#print(pre_previous_row,"----",currentRow[column], pre_previous_row[column - 2])
if prev_letter and column -1 > 0 and letter == word[column - 2] and prev_letter == word[column - 1] and word[column - 1] != letter:
currentRow[column] = min(currentRow[column], pre_previous_row[column - 2] + 1)
if(currentRow[-1] <= maxcost and node.word != None):
results.append((node.word, currentRow[-1]))
if min(currentRow) <= maxcost:
prev_letter = letter
for letter in node.children:
search_again(node.children[letter], letter, prev_letter, word, currentRow, prevRow, results, maxcost, ratio_calc)
#node = node.children[letter]
#prevRow = currentRow
if __name__ == '__main__':
target = sys.argv[1]
max_cost = int(sys.argv[2])
suppress = int(sys.argv[3])
data = open("./data/google-10000-english.txt", "r")
for line in data.readlines():
words = line.split(" ")
for i in range(len(words)):
trie.insert(words[i])
start = time.time()
results = search(target, max_cost, True)
if not suppress:
for res in results:
print(res)