-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanagram.py
214 lines (188 loc) · 6.93 KB
/
anagram.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
# -*- coding: utf-8 -*-
"""Module for finding anagrams"""
import itertools
import sys
class AnagramFinder(object):
def __init__(self, filename="words2.txt"):
self.read_dict(filename)
def get_anagrams(self, word):
'''Main method.'''
two_word_anagram = self.get_two_word_anagram(word)
most_word_anagram = self.get_most_word_anagram(word)
print two_word_anagram
print most_word_anagram
def get_two_word_anagram(self, word):
'''Gets the first key from the generator and looks up a corresponding
word.
'''
try:
keys = self.gen_two_word_anagram_keys(word).next()
except StopIteration:
return ''
return (self.valid_words[len(keys[0])][keys[0]][0],
self.valid_words[len(keys[1])][keys[1]][0])
def gen_two_word_anagram_keys(self, word):
'''Build a generator of tuples of sorted words that represent
valid anagrams.
'''
sorted_word = sort_chars(word)
for p in bifurcate_int(len(word)):
small_words = self.gen_subwords(sorted_word, p[0])
big_words = self.gen_subwords(sorted_word, p[1])
if p[0] == p[1]:
# small_words and big_words are the same, so we can use
# combinations instead of a direct product for a smaller
# search space. This way we can avoid searching both
# ('ab', 'ad') and ('ad', 'ab').
possible_pairs = itertools.combinations_with_replacement(
small_words, 2)
else:
possible_pairs = itertools.product(small_words, big_words)
for pair in possible_pairs:
if merge_words(pair) == sorted_word:
yield pair
def get_most_word_anagram(self, word):
'''Gets the first key from the generator and looks up a corresponding
word.
'''
try:
keys = self.gen_longest_anagram_keys(word).next()
except StopIteration:
return ''
return tuple([self.valid_words[len(keys[i])][keys[i]][0]
for i in range(len(keys))])
def gen_longest_anagram_keys(self, word):
'''Build a generator of tuples of sorted words that represent
valid anagrams.
'''
max_words = len(word)/2
sorted_word = sort_chars(word)
for i in xrange(max_words, 1, -1):
for partition in partition_int(len(word), i):
words = []
for length in partition:
words.append(self.gen_subwords(sorted_word, length))
possible_pairs = itertools.product(*words)
for pair in possible_pairs:
if merge_words(pair) == sorted_word:
yield pair
def gen_subwords(self, sorted_word, length):
for subword in self.valid_words[length].keys():
if is_subword(subword, sorted_word):
yield subword
def read_dict(self, filename):
'''Reads a text file of words into memory.
Splits each length of word into its own dict.
Each dict has sorted strings as keys,
and sets of unsorted strings as values.
Returns valid_words, a list of dicts.
valid_words[n] is the dict of all words of length n.
valid_words[0] and valid_words[1] are empty dicts.
'''
with open(filename) as wordfile:
max_length = 10
# add 1 so that valid_words[i] is words of length i
valid_words = [{} for _ in range(max_length + 1)]
old_word = ''
for word in wordfile:
# remove trailing newline and fix case
word = word.strip().lower()
# remove some duplicates that appear next to each
# other in the alphabetical word list.
if old_word == word:
continue
old_word = word
l = len(word)
if l < 2:
continue
if l > max_length:
for _ in range(max_length, l):
valid_words.append({})
max_length = l
key = sort_chars(word)
valid_words[l].setdefault(key, []).append(word)
self.valid_words = valid_words
def sort_chars(word):
word_array = [c for c in word]
word_array.sort()
return ''.join(word_array)
def merge_two_words(word1, word2):
'''Merge two sorted strings into one sorted string.
This is just the "merge" part of mergesort,
converting to a string at the end.
'''
i, j = 0, 0
out = []
l1, l2 = len(word1), len(word2)
while i < l1 and j < l2:
if word1[i] < word2[j]:
out.append(word1[i])
i = i + 1
else:
out.append(word2[j])
j = j + 1
while i < l1:
out.append(word1[i])
i = i + 1
while j < l2:
out.append(word2[j])
j = j + 1
return ''.join(out)
def merge_words(words):
'''Split an iterable into singletons and merge them.
This could be used in place of sort_chars.
'''
if len(words) == 1:
return words[0]
if len(words) == 2:
return merge_two_words(words[0], words[1])
words1 = words[:len(words)/2]
words2 = words[len(words)/2:]
return merge_two_words(merge_words(words1), merge_words(words2))
def is_subword(subword, word):
'''Checks if a sorted subword is a subset of a sorted word.'''
i, j = 0, 0
while j < len(word) and i < len(subword):
if subword[i] == word[j]:
i = i + 1
j = j + 1
else:
j = j + 1
if i == len(subword):
return True
else:
return False
def bifurcate_int(i):
'''Returns a generator of all partitions of i into 2 smaller integers
that sum to i. Returns evenly divided partitions first, lopsided ones last
'''
for j in xrange(i/2, 1, -1):
yield (j, i - j)
def partition_int(i, num_parts=3):
'''Returns a generator of all partitions of i into several smaller
integers that sum to i.
'''
partition = [2] * num_parts
partition[0] = i - 2 * (num_parts - 1)
if partition[0] < 2:
return
yield partition[:]
while next_partition(partition):
yield partition[:]
def next_partition(partition):
'''Modifies partition to the next valid sorted partition.
If successful, return True, otherwise False.
'''
for i in range(len(partition) - 1, 0, -1):
if partition[i-1] - partition[i] > 1:
partition[i] = partition[i] + 1
partition[i-1] = partition[i-1] - 1
return True
return False
if __name__ == '__main__':
if len(sys.argv) < 2:
print "Please specify a word to anagram"
else:
af = AnagramFinder()
word = ''.join(sys.argv[1:])
af.get_anagrams(word)