Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue with deletion in long words #8

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions src/StateSetIndex.php
Original file line number Diff line number Diff line change
Expand Up @@ -84,13 +84,18 @@ public function findMatchingStates(string $string, int $editDistance, int $trans
$lastSubstitutions = [];
$lastMappedChar = null;

$this->loopOverEveryCharacter($string, function (int $mappedChar) use (&$states, &$lastSubstitutions, &$lastMappedChar, $editDistance, $transpositionCost) {
$this->loopOverEveryCharacter($string, function (int $mappedChar, int $index) use (&$states, &$lastSubstitutions, &$lastMappedChar, $editDistance, $transpositionCost) {
$statesStar = new CostAnnotatedStateSet(); // This is S∗ in the paper
$substitutionStates = [];

foreach ($states->all() as $state => $cost) {
$statesStarC = new CostAnnotatedStateSet(); // This is S∗c in the paper

// Match for non-indexed cut-off characters
if ($index >= $this->config->getIndexLength() - $editDistance) {
$statesStarC->add($state, $cost);
}

// Deletion
if ($cost + 1 <= $editDistance) {
$statesStarC->add($state, $cost + 1);
Expand Down Expand Up @@ -258,16 +263,16 @@ private function hasNextState(int $state): bool
}

/**
* @param \Closure(int) $closure
* @param \Closure(int, int) $closure
*/
private function loopOverEveryCharacter(string $string, \Closure $closure): void
{
$indexedSubstringLength = min($this->config->getIndexLength(), mb_strlen($string));
$indexedSubstring = mb_substr($string, 0, $indexedSubstringLength);

foreach (mb_str_split($indexedSubstring) as $char) {
foreach (mb_str_split($indexedSubstring) as $index => $char) {
$mappedChar = $this->alphabet->map($char, $this->config->getAlphabetSize());
$closure($mappedChar);
$closure($mappedChar, $index);
}
}
}
14 changes: 14 additions & 0 deletions tests/StateSetIndexTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ public function testResultsMatchResearchPaper(): void
$this->assertSame([104, 419, 467, 1677, 1811], $stateSetIndex->findMatchingStates('Mustre', 2, 2));
$this->assertSame([1811 => ['Mueller'], 1677 => ['Muster', 'Mustermann']], $stateSetIndex->findAcceptedStrings('Mustre', 2, 2));
$this->assertSame(['Muster'], $stateSetIndex->find('Mustre', 2, 2));

// Match word cut off by index length
$this->assertSame(['Mueller'], $stateSetIndex->find('Mueler', 1));
}

public function testWithUtf8Alphabet(): void
Expand All @@ -50,6 +53,17 @@ public function testWithUtf8Alphabet(): void
$this->assertSame(['Muster'], $stateSetIndex->find('Mustre', 2));
}

/**
 * Covers a query that is missing one character of a long indexed word:
 * 'Mustermann' is indexed and 'Mutermann' (the 's' is dropped) is looked up
 * with an edit distance of 1. The lookup is checked at every stage:
 * matching states, accepted strings per state, and the final result list.
 */
public function testWithLongWordSingleDeletion(): void
{
// NOTE(review): Config(6, 4) is presumably index length 6 and alphabet size 4;
// the six expected states below are consistent with that, but confirm the
// argument order against the Config constructor.
$stateSetIndex = new StateSetIndex(new Config(6, 4), new Utf8Alphabet(), new InMemoryStateSet(), new InMemoryDataStore());
$stateSetIndex->index(['Mustermann']);

// Indexing the 10-character word yields exactly six states.
$this->assertSame([2, 10, 44, 177, 710, 2843], $stateSetIndex->getStateSet()->all());
// Second and third arguments: edit distance 1 and (presumably) transposition cost 1.
// Only the last of the six states is expected to match.
$this->assertSame([2843], $stateSetIndex->findMatchingStates('Mutermann', 1, 1));
// The matching state maps back to the indexed word.
$this->assertSame([2843 => ['Mustermann']], $stateSetIndex->findAcceptedStrings('Mutermann', 1, 1));
// find() is called without a third argument here, unlike the two calls above.
$this->assertSame(['Mustermann'], $stateSetIndex->find('Mutermann', 1));
}

/**
* This use case occurred while testing 2.0.0, which is why this is added as additional test case.
*/
Expand Down
Loading