From 0ee6713a0e027715561f08662faef28498a2eb4b Mon Sep 17 00:00:00 2001 From: Andrew Hennis Date: Wed, 10 Jul 2024 20:52:35 -0400 Subject: [PATCH] faster hamming and input validation --- limestone/editdistance.py | 41 ++++++++++++++++++--------------------- tests/test_Hamming.py | 32 +++++++++++++++--------------- 2 files changed, 35 insertions(+), 38 deletions(-) diff --git a/limestone/editdistance.py b/limestone/editdistance.py index 9f8d055..47fab97 100644 --- a/limestone/editdistance.py +++ b/limestone/editdistance.py @@ -128,33 +128,28 @@ def align(self, querySequence: str, subjectSequence: str)->str: return f"{queryAlign}\n{subjectAlign}" class Hamming(_GLOBALBASE): - def __int_or_str(self, querySequence: str|int, subjectSequence: str|int) -> tuple[str,str]: - """ - Hamming distance between integers is measured between the binary representaiton of the integers. - If a string of letters is passed into this hamming function, then it remains unchanged. - If a number string or number literal is passed into this function then it is converted to binary. - """ - querySequence, subjectSequence = str(querySequence), str(subjectSequence) - if str(querySequence).isdigit() and str(subjectSequence).isdigit(): - qs, ss = int(querySequence), int(subjectSequence) - big = f'{qs:08b}' if qs > ss else f'{ss:08b}' - small = f'{ss:0{len(list(big))}b}' if ss < qs else f'{qs:0{len(list(big))}b}' - querySequence = big if f'{qs:08b}' == big else small - subjectSequence = big if f'{ss:08b}' == big else small - elif not str(querySequence).isalpha() or not str(subjectSequence).isalpha(): - raise ValueError("Both sequences must be either all letters or all numbers") - return querySequence, subjectSequence - + def __int_pair(self, querySequence: str|int, subjectSequence: str|int) -> bool: + querySequence, subjectSequence = str(querySequence), str(subjectSequence) + if querySequence.isalpha() and subjectSequence.isalpha(): + return False + if querySequence.isdigit() and subjectSequence.isdigit(): + return True + raise ValueError("Both sequences must be either all letters or all numbers") + def align(self, querySequence: str|int, subjectSequence: str|int)->str: - querySequence, subjectSequence = self.__int_or_str(querySequence, subjectSequence) + if self.__int_pair(querySequence, subjectSequence): + qs, ss = int(querySequence), int(subjectSequence) + return f"{bin(qs)}\n{bin(ss)}" return f"{querySequence}\n{subjectSequence}" def matrix(self, qs: str, ss: str) -> None: return None def __call__(self, querySequence: str|int, subjectSequence: str|int)->tuple[int,list[int]]: - querySequence, subjectSequence = self.__int_or_str(querySequence, subjectSequence) - qs,ss = map(lambda x: x.upper(), [querySequence,subjectSequence]) + if self.__int_pair(querySequence, subjectSequence): + qs, ss = bin(querySequence), bin(subjectSequence) + else: + qs,ss = map(lambda x: x.upper(), [querySequence,subjectSequence]) if len(qs) == 1 and len(ss) == 1: dist = 1 if qs != ss else 0 @@ -181,12 +176,14 @@ def __call__(self, querySequence: str|int, subjectSequence: str|int)->tuple[int, return dist, dist_array def distance(self, querySequence: str|int, subjectSequence: str|int)->int: - querySequence, subjectSequence = self.__int_or_str(querySequence, subjectSequence) + if self.__int_pair(querySequence, subjectSequence): + qs, ss = int(querySequence), int(subjectSequence) + return bin(qs ^ ss).count("1") query = set([(x, y) for (x, y) in enumerate(querySequence)]) subject = set([(x, y) for (x, y) in enumerate(subjectSequence)]) qs,sq = query-subject, subject-query dist = max(map(len,[qs,sq])) - return dist + return dist def binary_distance_array(self, querySequence: str, subjectSequence: str)->list[int]: _, distarray = self(querySequence, subjectSequence) diff --git a/tests/test_Hamming.py b/tests/test_Hamming.py index 8775e1c..0ab9e47 100644 --- a/tests/test_Hamming.py +++ b/tests/test_Hamming.py @@ -48,16 +48,16 @@ def test_norm_distance3(self): self.assertEqual(dist, 0.75) def test_norm_similarity1(self): - dist = hamming.normalized_similarity("ACTG", "AATG") - self.assertEqual(dist, 0.75) + sim = hamming.normalized_similarity("ACTG", "AATG") + self.assertEqual(sim, 0.75) def test_norm_similarity2(self): - dist = hamming.normalized_similarity("ACTG", "AAAG") - self.assertEqual(dist, 0.5) + sim = hamming.normalized_similarity("ACTG", "AAAG") + self.assertEqual(sim, 0.5) def test_norm_similarity3(self): - dist = hamming.normalized_similarity("ACTG", "AAAA") - self.assertEqual(dist, 0.25) + sim = hamming.normalized_similarity("ACTG", "AAAA") + self.assertEqual(sim, 0.25) def test_diff_len(self): dist = hamming.distance("ACTG", "AATGA") @@ -68,14 +68,14 @@ def test_diff_len2(self): self.assertEqual(dist, 2.0) def test_binary_diff(self): - dist = hamming.binary_distance_array("ACTG", "AATG") + distarray = hamming.binary_distance_array("ACTG", "AATG") ans = [1,0,1,1] - self.assertEqual(dist, ans) + self.assertEqual(distarray, ans) def test_binary_sim(self): - dist = hamming.binary_similarity_array("ACTG", "AATG") + simarray = hamming.binary_similarity_array("ACTG", "AATG") ans = [0,1,0,0] - self.assertEqual(dist, ans) + self.assertEqual(simarray, ans) def test_align1(self): align = hamming.align("ACTG", "ATGA") @@ -89,21 +89,21 @@ def test_align2(self): def test_align_num1(self): align = hamming.align(12, 13) - ans ="00001100\n00001101" + ans ="0b1100\n0b1101" self.assertEqual(align, ans) def test_align_num2(self): - align = hamming.distance(12, 13) - self.assertEqual(align, 1) + numdist = hamming.distance(12, 13) + self.assertEqual(numdist, 1) def test_align_num1_string(self): align = hamming.align("12", "13") - ans ="00001100\n00001101" + ans ="0b1100\n0b1101" self.assertEqual(align, ans) def test_align_num2_string(self): - align = hamming.distance("12", "13") - self.assertEqual(align, 1) + numdist = hamming.distance("12", "13") + self.assertEqual(numdist, 1) if __name__ == '__main__': unittest.main()