Skip to content

Commit

Permalink
faster hamming and input validation
Browse files Browse the repository at this point in the history
  • Loading branch information
dawnandrew100 committed Jul 11, 2024
1 parent eadebcd commit 0ee6713
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 38 deletions.
41 changes: 19 additions & 22 deletions limestone/editdistance.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,33 +128,28 @@ def align(self, querySequence: str, subjectSequence: str)->str:
return f"{queryAlign}\n{subjectAlign}"

class Hamming(_GLOBALBASE):
def __int_or_str(self, querySequence: str|int, subjectSequence: str|int) -> tuple[str,str]:
"""
Hamming distance between integers is measured between the binary representaiton of the integers.
If a string of letters is passed into this hamming function, then it remains unchanged.
If a number string or number literal is passed into this function then it is converted to binary.
"""
querySequence, subjectSequence = str(querySequence), str(subjectSequence)
if str(querySequence).isdigit() and str(subjectSequence).isdigit():
qs, ss = int(querySequence), int(subjectSequence)
big = f'{qs:08b}' if qs > ss else f'{ss:08b}'
small = f'{ss:0{len(list(big))}b}' if ss < qs else f'{qs:0{len(list(big))}b}'
querySequence = big if f'{qs:08b}' == big else small
subjectSequence = big if f'{ss:08b}' == big else small
elif not str(querySequence).isalpha() or not str(subjectSequence).isalpha():
raise ValueError("Both sequences must be either all letters or all numbers")
return querySequence, subjectSequence

def __int_pair(self, querySequence: str|int, subjectSequence: str|int) -> bool:
querySequence, subjectSequence = str(querySequence), str(subjectSequence)
if querySequence.isalpha() and subjectSequence.isalpha():
return False
if querySequence.isdigit() and subjectSequence.isdigit():
return True
raise ValueError("Both sequences must be either all letters or all numbers")

def align(self, querySequence: str|int, subjectSequence: str|int)->str:
    """
    Return the two sequences on separate lines, query first.

    A numeric pair (integers or digit strings) is rendered with ``bin``, so
    each line carries the ``0b`` prefix and is not padded to a common width.
    An alphabetic pair is returned as given.

    Raises:
        ValueError: propagated from the input check when the pair is neither
            all letters nor all numbers.
    """
    # Validate and classify the raw inputs directly. Converting them to
    # binary strings first would make the int() calls below re-read those
    # strings as decimal numbers.
    if self.__int_pair(querySequence, subjectSequence):
        qs, ss = int(querySequence), int(subjectSequence)
        return f"{bin(qs)}\n{bin(ss)}"
    return f"{querySequence}\n{subjectSequence}"

def matrix(self, qs: str, ss: str) -> None:
    """Always return None; no matrix is produced for the given sequences."""
    return

def __call__(self, querySequence: str|int, subjectSequence: str|int)->tuple[int,list[int]]:
querySequence, subjectSequence = self.__int_or_str(querySequence, subjectSequence)
qs,ss = map(lambda x: x.upper(), [querySequence,subjectSequence])
if self.__int_pair(querySequence, subjectSequence):
qs, ss = bin(querySequence), bin(subjectSequence)
else:
qs,ss = map(lambda x: x.upper(), [querySequence,subjectSequence])

if len(qs) == 1 and len(ss) == 1:
dist = 1 if qs != ss else 0
Expand All @@ -181,12 +176,14 @@ def __call__(self, querySequence: str|int, subjectSequence: str|int)->tuple[int,
return dist, dist_array

def distance(self, querySequence: str|int, subjectSequence: str|int)->int:
    """
    Return the Hamming distance between the two inputs.

    For a numeric pair (integers or digit strings) the distance is the number
    of differing bits, computed as the population count of the XOR of the two
    values. For an alphabetic pair it is the number of positions at which the
    characters differ, plus the length difference when the sequences are of
    unequal length. Letter comparison is case-sensitive here.

    Raises:
        ValueError: propagated from the input check when the pair is neither
            all letters nor all numbers.
    """
    # Classify the raw inputs. Converting numbers to binary strings first
    # would make int() below re-read those strings as decimal values.
    if self.__int_pair(querySequence, subjectSequence):
        qs, ss = int(querySequence), int(subjectSequence)
        return bin(qs ^ ss).count("1")
    # Mismatches over the shared prefix plus every unpaired trailing position.
    mismatches = sum(q != s for q, s in zip(querySequence, subjectSequence))
    return mismatches + abs(len(querySequence) - len(subjectSequence))

def binary_distance_array(self, querySequence: str, subjectSequence: str)->list[int]:
_, distarray = self(querySequence, subjectSequence)
Expand Down
32 changes: 16 additions & 16 deletions tests/test_Hamming.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,16 +48,16 @@ def test_norm_distance3(self):
self.assertEqual(dist, 0.75)

def test_norm_similarity1(self):
    # "ACTG" and "AATG" are expected to score a normalized similarity of 0.75.
    sim = hamming.normalized_similarity("ACTG", "AATG")
    self.assertEqual(sim, 0.75)

def test_norm_similarity2(self):
    # "ACTG" and "AAAG" are expected to score a normalized similarity of 0.5.
    sim = hamming.normalized_similarity("ACTG", "AAAG")
    self.assertEqual(sim, 0.5)

def test_norm_similarity3(self):
    # "ACTG" and "AAAA" are expected to score a normalized similarity of 0.25.
    sim = hamming.normalized_similarity("ACTG", "AAAA")
    self.assertEqual(sim, 0.25)

def test_diff_len(self):
dist = hamming.distance("ACTG", "AATGA")
Expand All @@ -68,14 +68,14 @@ def test_diff_len2(self):
self.assertEqual(dist, 2.0)

def test_binary_diff(self):
    # Per-position array expected from binary_distance_array for ACTG vs AATG.
    distarray = hamming.binary_distance_array("ACTG", "AATG")
    ans = [1,0,1,1]
    self.assertEqual(distarray, ans)

def test_binary_sim(self):
    # Per-position array expected from binary_similarity_array for ACTG vs AATG.
    simarray = hamming.binary_similarity_array("ACTG", "AATG")
    ans = [0,1,0,0]
    self.assertEqual(simarray, ans)

def test_align1(self):
align = hamming.align("ACTG", "ATGA")
Expand All @@ -89,21 +89,21 @@ def test_align2(self):

def test_align_num1(self):
    # Integer inputs are rendered with the "0b" prefix, one value per line.
    align = hamming.align(12, 13)
    ans ="0b1100\n0b1101"
    self.assertEqual(align, ans)

def test_align_num2(self):
    # 12 (0b1100) and 13 (0b1101) differ in exactly one bit.
    numdist = hamming.distance(12, 13)
    self.assertEqual(numdist, 1)

def test_align_num1_string(self):
    # Digit strings are expected to align exactly like the equivalent integers.
    align = hamming.align("12", "13")
    ans ="0b1100\n0b1101"
    self.assertEqual(align, ans)

def test_align_num2_string(self):
    # Digit strings are expected to give the same bit distance as the integers.
    numdist = hamming.distance("12", "13")
    self.assertEqual(numdist, 1)

# Run the test suite only when this file is executed as a script.
if "__main__" == __name__:
    unittest.main()

0 comments on commit 0ee6713

Please sign in to comment.