Skip to content

Commit

Permalink
added Jaro and Jaro Winkler algs
Browse files Browse the repository at this point in the history
  • Loading branch information
dawnandrew100 committed Aug 22, 2024
1 parent ee5fb0c commit 13d466a
Show file tree
Hide file tree
Showing 4 changed files with 222 additions and 0 deletions.
2 changes: 2 additions & 0 deletions limestone/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from limestone.algorithms.base import GLOBALBASE
from limestone.algorithms.base import LOCALBASE
from limestone.algorithms.editdistance import hamming
from limestone.algorithms.editdistance import jaro
from limestone.algorithms.editdistance import jaro_winkler
from limestone.algorithms.editdistance import hirschberg
from limestone.algorithms.editdistance import lowrance_wagner
from limestone.algorithms.editdistance import needleman_wunsch
Expand Down
94 changes: 94 additions & 0 deletions limestone/algorithms/editdistance.py
Original file line number Diff line number Diff line change
Expand Up @@ -390,6 +390,98 @@ def align(self, qs: str, ss: str) -> str:
queryAlign, subjectAlign = self(qs, ss)
return f"{queryAlign}\n{subjectAlign}"

class Jaro():
    """Case-insensitive Jaro similarity/distance between two strings.

    Both inputs are upper-cased before comparison. When ``self.winkler`` is
    true (set by subclasses), ``similarity`` additionally applies the Winkler
    common-prefix bonus using ``self.scaling_factor``.
    """

    def __init__(self) -> None:
        self.match_score = 1
        self.winkler = False

    def __call__(self, querySequence: str, subjectSequence: str) -> tuple[int, int]:
        """Count Jaro matches and transpositions.

        Returns:
            ``(matches, transpositions)``. Two sentinels are used:
            ``(-1, 0)`` when the upper-cased strings are identical and
            ``(0, 0)`` when no characters match.
        """
        qs, ss = querySequence.upper(), subjectSequence.upper()
        if qs == ss:
            return -1, 0
        # Measure the upper-cased strings: upper() can change the length
        # (e.g. "ß" -> "SS"), and these are the strings that get indexed.
        len1, len2 = len(qs), len(ss)
        # Clamp at zero so very short inputs never yield a negative window.
        max_dist = max(0, max(len1, len2)//2 - 1)

        matches = 0
        matched_qs = [False] * len1
        matched_ss = [False] * len2
        for i, query_char in enumerate(qs):
            start = max(0, i - max_dist)
            end = min(len2, i + max_dist + 1)
            for j in range(start, end):
                # Each subject character may be claimed by one match only.
                if not matched_ss[j] and ss[j] == query_char:
                    matched_qs[i] = matched_ss[j] = True
                    matches += 1
                    break
        if matches == 0:
            return 0, 0

        # Walk the matched characters of both strings in order; every
        # position where they disagree is half of a transposition.
        query_matched = [char for char, hit in zip(qs, matched_qs) if hit]
        subject_matched = [char for char, hit in zip(ss, matched_ss) if hit]
        out_of_order = sum(a != b for a, b in zip(query_matched, subject_matched))
        return matches, out_of_order//2

    def similarity(self, querySequence: str, subjectSequence: str) -> float:
        """Return the Jaro (or Jaro-Winkler, if enabled) similarity.

        The result is 1.0 for strings that are equal ignoring case and 0.0
        when no characters match.
        """
        matches, t = self(querySequence, subjectSequence)
        if matches == 0:
            return 0.0
        if matches == -1:
            return 1.0
        qs, ss = querySequence.upper(), subjectSequence.upper()
        jaro_sim = (1/3)*((matches/len(qs))+(matches/len(ss))+((matches-t)/matches))
        if not self.winkler:
            return jaro_sim
        # Winkler bonus: length of the common prefix, capped at 4 characters.
        # zip() stops at the shorter string, so short inputs cannot raise
        # IndexError, and the upper-cased copies keep this case-insensitive
        # like the matching step.
        prefix_matches = 0
        for query_char, subject_char in zip(qs[:4], ss[:4]):
            if query_char != subject_char:
                break
            prefix_matches += 1
        return jaro_sim + prefix_matches*self.scaling_factor*(1-jaro_sim)

    def normalized_similarity(self, querySequence: str, subjectSequence: str) -> float:
        """Return ``similarity`` rounded to two decimal places."""
        return round(self.similarity(querySequence, subjectSequence), 2)

    def distance(self, querySequence: str, subjectSequence: str) -> float:
        """Return ``1 - similarity``."""
        return 1 - self.similarity(querySequence, subjectSequence)

    def normalized_distance(self, querySequence: str, subjectSequence: str) -> float:
        """Return ``distance`` rounded to two decimal places."""
        return round(self.distance(querySequence, subjectSequence), 2)

    def matrix(self, querySequence: str, subjectSequence: str) -> NDArray[float64]:
        """Build a dynamic-programming style matrix of cumulative matches.

        The matrix has one extra leading row and column (left at zero). Each
        other cell is the value of its upper-left diagonal neighbour, plus one
        when the row and column characters are equal or the row character
        occurs in a window of the subject. The result is also stored on
        ``self.alignment_score``.
        """
        #dynamic programming variant to show all matches
        qs,ss = [""], [""]
        qs.extend([x.upper() for x in querySequence])
        ss.extend([x.upper() for x in subjectSequence])
        max_match_dist = max(0, (max(len(ss)-1, len(qs)-1)//2)-1)

        #matrix initialisation
        self.alignment_score = numpy.zeros((len(qs),len(ss)))
        for i, query_char in enumerate(qs):
            for j, subject_char in enumerate(ss):
                if i == 0 or j == 0:
                    #keeps first row and column consistent throughout all calculations
                    continue
                dmatch = self.alignment_score[i-1][j-1]
                # The window depends on the row index only.
                start = max(1, i-max_match_dist)
                trans_match = ss[start:start+(2*max_match_dist)]
                if query_char == subject_char or query_char in trans_match:
                    dmatch += 1

                self.alignment_score[i][j] = dmatch
        return self.alignment_score

class JaroWinkler(Jaro):
    """Jaro similarity with the Winkler common-prefix bonus enabled.

    Args:
        scaling_factor: Weight applied per matching prefix character when
            computing the similarity.
    """

    def __init__(self, scaling_factor: float = 0.1) -> None:
        # Let the parent set up its own state instead of duplicating it here.
        super().__init__()
        self.winkler = True
        #p should not exceed 0.25 else similarity could be larger than 1
        self.scaling_factor = scaling_factor

class Smith_Waterman(__LOCALBASE):
def __init__(self, match_score:int = 1, mismatch_penalty:int = 1, gap_penalty:int = 2)->None:
self.match_score = match_score
Expand Down Expand Up @@ -560,6 +652,8 @@ def distance(self, querySequence: str, subjectSequence: str)->float:
# Module-level instances of the algorithm classes, each built with its
# default constructor arguments and bound to a lowercase name.
waterman_smith_beyer = Waterman_Smith_Beyer()
smith_waterman = Smith_Waterman()
hirschberg = Hirschberg()
jaro = Jaro()
jaro_winkler = JaroWinkler()
lowrance_wagner = Lowrance_Wagner()
longest_common_subsequence = Longest_Common_Subsequence()
shortest_common_supersequence = Shortest_Common_Supersequence()
Expand Down
63 changes: 63 additions & 0 deletions tests/test_Jaro.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
from __future__ import annotations
import unittest
from limestone import jaro

class TestJaro(unittest.TestCase):
    # Exercises the shared `jaro` instance on fixed four-letter sequences.

    # --- sequences with no characters in common ---
    def test_distance_diff(self):
        self.assertEqual(jaro.distance("ACTG", "FHYU"), 1.0)

    def test_similarity_diff(self):
        self.assertEqual(jaro.similarity("ACTG", "FHYU"), 0.0)

    def test_norm_distance_diff(self):
        self.assertEqual(jaro.normalized_distance("ACTG", "FHYU"), 1.0)

    def test_norm_similarity_diff(self):
        self.assertEqual(jaro.normalized_similarity("ACTG", "FHYU"), 0.0)

    # --- identical sequences ---
    def test_distance_sim(self):
        self.assertEqual(jaro.distance("ACTG", "ACTG"), 0.0)

    def test_similarity_sim(self):
        self.assertEqual(jaro.similarity("ACTG", "ACTG"), 1.0)

    def test_norm_distance_sim(self):
        self.assertEqual(jaro.normalized_distance("ACTG", "ACTG"), 0.0)

    def test_norm_similarity_sim(self):
        self.assertEqual(jaro.normalized_similarity("ACTG", "ACTG"), 1.0)

    # --- partially matching sequences: rounded distance ---
    def test_norm_distance1(self):
        self.assertEqual(jaro.normalized_distance("ACTG", "AATG"), 0.17)

    def test_norm_distance2(self):
        self.assertEqual(jaro.normalized_distance("ACTG", "AAAG"), 0.33)

    def test_norm_distance3(self):
        self.assertEqual(jaro.normalized_distance("ACTG", "AAAA"), 0.5)

    # --- partially matching sequences: rounded similarity ---
    def test_norm_similarity1(self):
        self.assertEqual(jaro.normalized_similarity("ACTG", "AATG"), 0.83)

    def test_norm_similarity2(self):
        self.assertEqual(jaro.normalized_similarity("ACTG", "AAAG"), 0.67)

    def test_norm_similarity3(self):
        self.assertEqual(jaro.normalized_similarity("ACTG", "AAAA"), 0.5)

# Run the test cases when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()
63 changes: 63 additions & 0 deletions tests/test_JaroWinkler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
from __future__ import annotations
import unittest
from limestone import jaro_winkler

class TestJaroWinkler(unittest.TestCase):
    # Exercises the shared `jaro_winkler` instance on fixed four-letter sequences.

    # --- sequences with no characters in common ---
    def test_distance_diff(self):
        self.assertEqual(jaro_winkler.distance("ACTG", "FHYU"), 1.0)

    def test_similarity_diff(self):
        self.assertEqual(jaro_winkler.similarity("ACTG", "FHYU"), 0.0)

    def test_norm_distance_diff(self):
        self.assertEqual(jaro_winkler.normalized_distance("ACTG", "FHYU"), 1.0)

    def test_norm_similarity_diff(self):
        self.assertEqual(jaro_winkler.normalized_similarity("ACTG", "FHYU"), 0.0)

    # --- identical sequences ---
    def test_distance_sim(self):
        self.assertEqual(jaro_winkler.distance("ACTG", "ACTG"), 0.0)

    def test_similarity_sim(self):
        self.assertEqual(jaro_winkler.similarity("ACTG", "ACTG"), 1.0)

    def test_norm_distance_sim(self):
        self.assertEqual(jaro_winkler.normalized_distance("ACTG", "ACTG"), 0.0)

    def test_norm_similarity_sim(self):
        self.assertEqual(jaro_winkler.normalized_similarity("ACTG", "ACTG"), 1.0)

    # --- partially matching sequences: rounded distance ---
    def test_norm_distance1(self):
        self.assertEqual(jaro_winkler.normalized_distance("ACTG", "AATG"), 0.15)

    def test_norm_distance2(self):
        self.assertEqual(jaro_winkler.normalized_distance("ACTG", "AAAG"), 0.3)

    def test_norm_distance3(self):
        self.assertEqual(jaro_winkler.normalized_distance("ACTG", "AAAA"), 0.45)

    # --- partially matching sequences: rounded similarity ---
    def test_norm_similarity1(self):
        self.assertEqual(jaro_winkler.normalized_similarity("ACTG", "AATG"), 0.85)

    def test_norm_similarity2(self):
        self.assertEqual(jaro_winkler.normalized_similarity("ACTG", "AAAG"), 0.7)

    def test_norm_similarity3(self):
        self.assertEqual(jaro_winkler.normalized_similarity("ACTG", "AAAA"), 0.55)

# Run the test cases when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()

0 comments on commit 13d466a

Please sign in to comment.