class Jaro():
    """Jaro similarity and distance between two strings.

    Comparison is case-insensitive: both inputs are upper-cased before any
    characters are matched, and all lengths used in the score refer to the
    upper-cased strings.
    """

    def __init__(self):
        self.match_score = 1
        # Plain Jaro never applies the Winkler common-prefix bonus.
        self.winkler = False

    def __call__(self, querySequence: str, subjectSequence: str) -> tuple[int, int]:
        """Count matching characters and transpositions between two strings.

        Args:
            querySequence: First string.
            subjectSequence: Second string.

        Returns:
            ``(matches, transpositions)``. Two special values are used:
            ``(-1, 0)`` when the upper-cased inputs are identical and
            ``(0, 0)`` when no character matches inside the search window.
        """
        qs, ss = querySequence.upper(), subjectSequence.upper()
        if qs == ss:
            return -1, 0
        # Lengths come from the upper-cased strings because those are the
        # ones indexed below (upper-casing can change a string's length).
        len1, len2 = len(qs), len(ss)
        # Clamp at zero so very short inputs never get a negative window.
        max_dist = max(0, max(len1, len2) // 2 - 1)

        matches = 0
        matched_qs = [False] * len1
        matched_ss = [False] * len2
        for i, query_char in enumerate(qs):
            start = max(0, i - max_dist)
            end = min(len2, i + max_dist + 1)
            for j in range(start, end):
                # Each subject character may be claimed by one match only.
                if not matched_ss[j] and query_char == ss[j]:
                    matched_qs[i] = matched_ss[j] = True
                    matches += 1
                    break
        if matches == 0:
            return 0, 0

        # Walk the matched characters of both strings in order; every
        # position where they disagree is half of a transposition.
        query_hits = [char for char, hit in zip(qs, matched_qs) if hit]
        subject_hits = [char for char, hit in zip(ss, matched_ss) if hit]
        out_of_order = sum(a != b for a, b in zip(query_hits, subject_hits))
        return matches, out_of_order // 2

    def similarity(self, querySequence: str, subjectSequence: str) -> float:
        """Return the Jaro similarity, with the Winkler bonus if enabled.

        Args:
            querySequence: First string.
            subjectSequence: Second string.

        Returns:
            ``1.0`` for inputs that are equal ignoring case, ``0.0`` when no
            characters match, otherwise the Jaro score. When ``self.winkler``
            is true the score is raised by
            ``prefix * self.scaling_factor * (1 - jaro)``, where ``prefix``
            is the length of the common prefix, capped at 4 characters.
        """
        matches, t = self(querySequence, subjectSequence)
        if matches == 0:
            return 0.0
        if matches == -1:
            return 1.0
        qs, ss = querySequence.upper(), subjectSequence.upper()
        jaro_sim = (
            matches / len(qs) + matches / len(ss) + (matches - t) / matches
        ) / 3
        if not self.winkler:
            return jaro_sim
        # zip() stops at the shorter string, so inputs under 4 characters
        # cannot run off the end; the prefix is compared on the same
        # upper-cased text that was used for matching.
        prefix_matches = 0
        for query_char, subject_char in zip(qs[:4], ss[:4]):
            if query_char != subject_char:
                break
            prefix_matches += 1
        return jaro_sim + prefix_matches * self.scaling_factor * (1 - jaro_sim)

    def normalized_similarity(self, querySequence: str, subjectSequence: str) -> float:
        """Return ``similarity`` rounded to two decimal places."""
        return round(self.similarity(querySequence, subjectSequence), 2)

    def distance(self, querySequence: str, subjectSequence: str) -> float:
        """Return ``1 - similarity`` for the two strings."""
        return 1 - self.similarity(querySequence, subjectSequence)

    def normalized_distance(self, querySequence: str, subjectSequence: str) -> float:
        """Return ``distance`` rounded to two decimal places."""
        return round(self.distance(querySequence, subjectSequence), 2)

    def matrix(self, querySequence: str, subjectSequence: str) -> NDArray[float64]:
        """Build a dynamic-programming style matrix of cumulative matches.

        The matrix has one extra leading row and column that stay zero. The
        result is also stored on ``self.alignment_score``.

        Args:
            querySequence: String indexing the rows.
            subjectSequence: String indexing the columns.

        Returns:
            Array of shape ``(len(querySequence) + 1, len(subjectSequence) + 1)``.
        """
        # Leading empty entry keeps row 0 / column 0 as an all-zero border.
        qs, ss = [""], [""]
        qs.extend(x.upper() for x in querySequence)
        ss.extend(x.upper() for x in subjectSequence)
        max_match_dist = max(0, (max(len(ss) - 1, len(qs) - 1) // 2) - 1)

        self.alignment_score = numpy.zeros((len(qs), len(ss)))
        for i in range(1, len(qs)):
            query_char = qs[i]
            # The window of subject characters depends on the row only,
            # so it is built once per row.
            start = max(1, i - max_match_dist)
            trans_match = ss[start:start + (2 * max_match_dist)]
            in_window = query_char in trans_match
            for j in range(1, len(ss)):
                dmatch = self.alignment_score[i - 1][j - 1]
                if query_char == ss[j] or in_window:
                    dmatch += 1
                self.alignment_score[i][j] = dmatch
        return self.alignment_score


class JaroWinkler(Jaro):
    """Jaro similarity with the Winkler bonus for a shared prefix."""

    def __init__(self, scaling_factor=0.1):
        """Create a Jaro-Winkler scorer.

        Args:
            scaling_factor: Weight given to each matching prefix character.
                It should not exceed 0.25, otherwise the similarity can
                become larger than 1.
        """
        super().__init__()
        self.winkler = True
        self.scaling_factor = scaling_factor
import unittest

# Ready-made scorer instances shared by the test cases below.
jaro = Jaro()
jaro_winkler = JaroWinkler()


class TestJaro(unittest.TestCase):
    """Jaro scores for disjoint, identical and partially matching inputs."""

    def test_distance_diff(self):
        self.assertEqual(jaro.distance("ACTG", "FHYU"), 1.0)

    def test_similarity_diff(self):
        self.assertEqual(jaro.similarity("ACTG", "FHYU"), 0.0)

    def test_norm_distance_diff(self):
        self.assertEqual(jaro.normalized_distance("ACTG", "FHYU"), 1.0)

    def test_norm_similarity_diff(self):
        self.assertEqual(jaro.normalized_similarity("ACTG", "FHYU"), 0.0)

    def test_distance_sim(self):
        self.assertEqual(jaro.distance("ACTG", "ACTG"), 0.0)

    def test_similarity_sim(self):
        self.assertEqual(jaro.similarity("ACTG", "ACTG"), 1.0)

    def test_norm_distance_sim(self):
        self.assertEqual(jaro.normalized_distance("ACTG", "ACTG"), 0.0)

    def test_norm_similarity_sim(self):
        self.assertEqual(jaro.normalized_similarity("ACTG", "ACTG"), 1.0)

    def test_norm_distance1(self):
        self.assertEqual(jaro.normalized_distance("ACTG", "AATG"), 0.17)

    def test_norm_distance2(self):
        self.assertEqual(jaro.normalized_distance("ACTG", "AAAG"), 0.33)

    def test_norm_distance3(self):
        self.assertEqual(jaro.normalized_distance("ACTG", "AAAA"), 0.5)

    def test_norm_similarity1(self):
        self.assertEqual(jaro.normalized_similarity("ACTG", "AATG"), 0.83)

    def test_norm_similarity2(self):
        self.assertEqual(jaro.normalized_similarity("ACTG", "AAAG"), 0.67)

    def test_norm_similarity3(self):
        self.assertEqual(jaro.normalized_similarity("ACTG", "AAAA"), 0.5)


class TestJaroWinkler(unittest.TestCase):
    """Jaro-Winkler scores for the same inputs, including the prefix bonus."""

    def test_distance_diff(self):
        self.assertEqual(jaro_winkler.distance("ACTG", "FHYU"), 1.0)

    def test_similarity_diff(self):
        self.assertEqual(jaro_winkler.similarity("ACTG", "FHYU"), 0.0)

    def test_norm_distance_diff(self):
        self.assertEqual(jaro_winkler.normalized_distance("ACTG", "FHYU"), 1.0)

    def test_norm_similarity_diff(self):
        self.assertEqual(jaro_winkler.normalized_similarity("ACTG", "FHYU"), 0.0)

    def test_distance_sim(self):
        self.assertEqual(jaro_winkler.distance("ACTG", "ACTG"), 0.0)

    def test_similarity_sim(self):
        self.assertEqual(jaro_winkler.similarity("ACTG", "ACTG"), 1.0)

    def test_norm_distance_sim(self):
        self.assertEqual(jaro_winkler.normalized_distance("ACTG", "ACTG"), 0.0)

    def test_norm_similarity_sim(self):
        self.assertEqual(jaro_winkler.normalized_similarity("ACTG", "ACTG"), 1.0)

    def test_norm_distance1(self):
        self.assertEqual(jaro_winkler.normalized_distance("ACTG", "AATG"), 0.15)

    def test_norm_distance2(self):
        self.assertEqual(jaro_winkler.normalized_distance("ACTG", "AAAG"), 0.3)

    def test_norm_distance3(self):
        self.assertEqual(jaro_winkler.normalized_distance("ACTG", "AAAA"), 0.45)

    def test_norm_similarity1(self):
        self.assertEqual(jaro_winkler.normalized_similarity("ACTG", "AATG"), 0.85)

    def test_norm_similarity2(self):
        self.assertEqual(jaro_winkler.normalized_similarity("ACTG", "AAAG"), 0.7)

    def test_norm_similarity3(self):
        self.assertEqual(jaro_winkler.normalized_similarity("ACTG", "AAAA"), 0.55)


if __name__ == '__main__':
    unittest.main()