diff --git a/README.md b/README.md index 25aa0ba..f9f3bbb 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,7 @@ This project contains several sequence alignment algorithms that can also produc | ------------------ | --------- | -------- | ------------------------------ | |Needleman-Wunsch | [x] | [x] | [x] | |Gotoh (Global) | [x] | [x] | [x] | +|Gotoh (Local) | [x] | [x] | [x] | |Smith-Waterman | [x] | [x] | [x] | |Waterman-Smith-Beyer | [x] | [x] | [x] | |Wagner-Fischer | [x] | [x] | [x] | @@ -25,7 +26,9 @@ This project contains several sequence alignment algorithms that can also produc ## Algorithms Explained [Needleman-Wunsch](https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm) -[Gotoh -Global](https://helios2.mi.parisdescartes.fr/~lomn/Cours/BI/Material/gap-penalty-gotoh.pdf) +[Gotoh (Global)](https://helios2.mi.parisdescartes.fr/~lomn/Cours/BI/Material/gap-penalty-gotoh.pdf) + +[Gotoh (Local)](http://rna.informatik.uni-freiburg.de/Teaching/index.jsp?toolName=Gotoh%20(Local)) [Smith-Waterman ](https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm) @@ -97,7 +100,6 @@ print(needleman_wunsch.matrix("AFTG","ACTG")) # Work In Progress -Jaro and Jaro-Winkler algorithms. Importing and parsing FASTA, FASTQ, and PDB files. # Caveats diff --git a/limestone/__init__.py b/limestone/__init__.py index 2aa1319..ebea40e 100644 --- a/limestone/__init__.py +++ b/limestone/__init__.py @@ -12,3 +12,4 @@ from limestone.algorithms.editdistance import longest_common_subsequence from limestone.algorithms.editdistance import shortest_common_supersequence from limestone.algorithms.editdistance import gotoh +from limestone.algorithms.editdistance import gotoh_local diff --git a/limestone/algorithms/editdistance.py b/limestone/algorithms/editdistance.py index 48cf17c..3c231a6 100644 --- a/limestone/algorithms/editdistance.py +++ b/limestone/algorithms/editdistance.py @@ -13,14 +13,11 @@ raise ImportError("Please pip install all dependencies from requirements.txt!") def main(): - qqs = "GCATGCCAT" - sss = "CATGCATCGAC" - - print(gotoh.align(qqs, sss)) - print(gotoh.distance(qqs, sss)) - print(gotoh.normalized_distance(qqs, sss)) - print(gotoh.similarity(qqs, sss)) + qqs = "WHORUNSTHEWORLDGIRLS" + sss = "DOGSSTHEWORLDEXUBERENT" + print(gotoh_local(qqs, sss)) + print(gotoh_local.align(qqs, sss)) class Wagner_Fischer(__GLOBALBASE): #Levenshtein Distance def __init__(self)->None: @@ -312,7 +309,6 @@ def __call__(self, querySequence: str, subjectSequence: str)->tuple[NDArray[floa qs.extend([x.upper() for x in querySequence]) ss.extend([x.upper() for x in subjectSequence]) - #LETS GIVE POSITIVES A TRY FOR THE TESTS #matrix initialisation self.D = numpy.full((len(qs),len(ss)), numpy.inf) self.P = numpy.full((len(qs), len(ss)), numpy.inf) @@ -379,6 +375,70 @@ def align(self, querySequence: str, subjectSequence: str)->str: return f"{queryAlign}\n{subjectAlign}" +class Gotoh_Local(__LOCALBASE): + def __init__(self, match_score:int = 1, mismatch_penalty:int = 1, new_gap_penalty:int = 2, continue_gap_penalty: int = 1)->None: + self.match_score = match_score + self.mismatch_penalty = mismatch_penalty + self.new_gap_penalty = new_gap_penalty + self.continue_gap_penalty = continue_gap_penalty + + def __call__(self, querySequence: str, subjectSequence: str)->tuple[NDArray[float64],NDArray[float64],NDArray[float64]]: + qs,ss = [""], [""] + qs.extend([x.upper() for x in querySequence]) + ss.extend([x.upper() for x in subjectSequence]) + + #matrix initialisation + self.D = numpy.full((len(qs),len(ss)), -numpy.inf) + self.P = numpy.full((len(qs), len(ss)), -numpy.inf) + self.P[:,0] = 0 + self.Q = numpy.full((len(qs), len(ss)), -numpy.inf) + self.Q[0,:] = 0 + #initialisation of starter values for first column and first row + self.D[:,0] = 0 + self.D[0,:] = 0 + + for i in range(1, len(qs)): + for j in range(1, len(ss)): + match = self.D[i - 1, j - 1] + (self.match_score if qs[i] == ss[j] else -self.mismatch_penalty) + self.P[i, j] = max(self.D[i - 1, j] - self.new_gap_penalty - self.continue_gap_penalty, self.P[i - 1, j] - self.continue_gap_penalty) + self.Q[i, j] = max(self.D[i, j - 1] - self.new_gap_penalty - self.continue_gap_penalty, self.Q[i, j - 1] - self.continue_gap_penalty) + self.D[i, j] = max(match, self.P[i, j], self.Q[i, j], 0) + return self.D, self.P, self.Q + + def matrix(self, querySequence: str, subjectSequence: str)->tuple[NDArray[float64], NDArray[float64], NDArray[float64]]: + D, P, Q = self(querySequence, subjectSequence) + return D, P, Q + + def align(self, querySequence: str, subjectSequence: str)->str: + matrix, _, _ = self(querySequence, subjectSequence) + + qs, ss = [x.upper() for x in querySequence], [x.upper() for x in subjectSequence] + if matrix.max() == 0: + return "There is no local alignment!" + + #finds the largest value closest to bottom right of matrix + i, j = list(numpy.where(matrix == matrix.max())) + i, j = i[-1], j[-1] + + subjectAlign = [] + queryAlign= [] + score = matrix.max() + while score > 0: + score = matrix[i][j] + if score == 0: + break + queryAlign.append(qs[i-1]) + subjectAlign.append(ss[j-1]) + i -= 1 + j -= 1 + queryAlign = "".join(queryAlign[::-1]) + subjectAlign = "".join(subjectAlign[::-1]) + return f"{queryAlign}\n{subjectAlign}" + + def similarity(self, querySequence: str, subjectSequence: str)->float: + matrix, _, _ = self(querySequence, subjectSequence) + return matrix.max() + class Hirschberg(): def __init__(self, match_score: int = 1, mismatch_penalty: int = -1, gap_penalty: int = -2)->None: self.match_score = match_score @@ -737,6 +797,7 @@ def distance(self, querySequence: str, subjectSequence: str)->float: longest_common_subsequence = Longest_Common_Subsequence() shortest_common_supersequence = Shortest_Common_Supersequence() gotoh = Gotoh() +gotoh_local = Gotoh_Local() if __name__ == "__main__": main() diff --git a/limestone/benchmarks.py b/limestone/benchmarks.py index ad03081..25bf371 100644 --- a/limestone/benchmarks.py +++ b/limestone/benchmarks.py @@ -42,6 +42,12 @@ def test_LCS(): def test_SCS(): [ls.shortest_common_supersequence.distance(a, b) for a, b in zip(test_strings1, test_strings2)] +def test_G(): + [ls.gotoh.distance(a, b) for a, b in zip(test_strings1, test_strings2)] + +def test_GL(): + [ls.gotoh_local.distance(a, b) for a, b in zip(test_strings1, test_strings2)] + def main(): print("Each of the following tests creates a list comprehension of 100 sequences that are 50 to 100 charachters each") print(f"Needleman Wunsch test: Time = {timeit('test_NW()', globals = globals(), number=1):0.4f}") @@ -54,6 +60,8 @@ def main(): print(f"Jaro Winkler test: Time = {timeit('test_JW()', globals = globals(), number=1):0.4f}") print(f"Longest Common Subsequence test: Time = {timeit('test_LCS()', globals = globals(), number=1):0.4f}") print(f"Shortest Common Supersequence test: Time = {timeit('test_SCS()', globals = globals(), number=1):0.4f}") + print(f"Gotoh (Global) test: Time = {timeit('test_G()', globals = globals(), number=1):0.4f}") + print(f"Gotoh (Local) test: Time = {timeit('test_GL()', globals = globals(), number=1):0.4f}") if __name__ == "__main__": main() diff --git a/pyproject.toml b/pyproject.toml index ffa8940..b3aa1c5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "limestone" -version = "0.12.0" +version = "0.13.0" authors = [ { name ="Andrew Hennis", email="andrew.mr.hennis@gmail.com" }, ] diff --git a/requirements.txt b/requirements.txt index f0d1480..88a3f96 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1 @@ -numpy>=1.26.4 +numpy>=2.0.0 diff --git a/setup.py b/setup.py index 572b1c5..e06b6ad 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ setup( name='limestone', - version='0.12.0', + version='0.13.0', packages=['limestone'], package_data={'': ['*.py']}, diff --git a/tests/test_GotohLocal.py b/tests/test_GotohLocal.py new file mode 100644 index 0000000..494364a --- /dev/null +++ b/tests/test_GotohLocal.py @@ -0,0 +1,72 @@ + +from __future__ import annotations +import unittest +from limestone import gotoh_local + +class TestGotohLocal(unittest.TestCase): + def test_distance_diff(self): + dist = gotoh_local.distance("ACTG", "FHYU") + self.assertEqual(dist, 4.0) + + def test_similarity_diff(self): + sim = gotoh_local.similarity("ACTG", "FHYU") + self.assertEqual(sim, 0.0) + + def test_norm_distance_diff(self): + dist = gotoh_local.normalized_distance("ACTG", "FHYU") + self.assertEqual(dist, 1.0) + + def test_norm_similarity_diff(self): + sim = gotoh_local.normalized_similarity("ACTG", "FHYU") + self.assertEqual(sim, 0.0) + + def test_distance_sim(self): + dist = gotoh_local.distance("ACTG", "ACTG") + self.assertEqual(dist, 0.0) + + def test_similarity_sim(self): + sim = gotoh_local.similarity("ACTG", "ACTG") + self.assertEqual(sim, 4.0) + + def test_norm_distance_sim(self): + dist = gotoh_local.normalized_distance("ACTG", "ACTG") + self.assertEqual(dist, 0.0) + + def test_norm_distance1(self): + dist = gotoh_local.normalized_distance("ACTG", "ACTF") + self.assertEqual(dist, 0.25) + + def test_norm_distance2(self): + dist = gotoh_local.normalized_distance("ACTG", "ACFF") + self.assertEqual(dist, 0.5) + + def test_norm_distance3(self): + dist = gotoh_local.normalized_distance("ACTG", "AFFF") + self.assertEqual(dist, 0.75) + + def test_norm_similarity_sim(self): + sim = gotoh_local.normalized_similarity("ACTG", "ACTG") + self.assertEqual(sim, 1.0) + + def test_norm_similarity1(self): + dist = gotoh_local.normalized_similarity("ACTG", "ACTF") + self.assertEqual(dist, 0.75) + + def test_norm_similarity2(self): + dist = gotoh_local.normalized_similarity("ACTG", "ACFF") + self.assertEqual(dist, 0.5) + + def test_norm_similarity3(self): + dist = gotoh_local.normalized_similarity("ACTG", "AFFF") + self.assertEqual(dist, 0.25) + + def test_align(self): + alignment = gotoh_local.align("BA", "ABA") + self.assertEqual(alignment, "BA\nBA") + + def test_align2(self): + alignment = gotoh_local.align("AGTACGCA", "TATGC") + self.assertEqual(alignment, "TACGC\nTATGC") + +if __name__ == '__main__': + unittest.main()