-
Notifications
You must be signed in to change notification settings - Fork 612
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #32385 from vespa-engine/bratseth/symmetric-normalizing
Bratseth/symmetric normalizing
- Loading branch information
Showing
19 changed files
with
134 additions
and
25 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -129,4 +129,5 @@ public int hashCode() { | |
result = 31 * result + id.hashCode(); | ||
return result; | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
21 changes: 21 additions & 0 deletions
21
opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpNormalizerTestCase.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
package com.yahoo.language.opennlp; | ||
|
||
import org.junit.Test; | ||
|
||
import static org.junit.Assert.assertEquals; | ||
|
||
/** | ||
* Exercises the normalizer returned by {@code OpenNlpLinguisticsTester.normalizer()}. | ||
* Only the unaccented case is asserted at present; the accented cases are | ||
* disabled pending the TODO inside the test. | ||
* | ||
* @author bratseth | ||
*/ | ||
public class OpenNlpNormalizerTestCase { | ||
|
||
@Test | ||
public void testNormalizing() { | ||
var normalizer = new OpenNlpLinguisticsTester().normalizer(); | ||
// Input without accents is expected to come back unchanged. | ||
assertEquals("cafe", normalizer.normalize("cafe")); | ||
// TODO: Accent normalize | ||
// The assertions below describe the intended accent folding and are kept disabled until it is implemented. | ||
// assertEquals("cafe", normalizer.normalize("café")); | ||
// assertEquals("cafe", normalizer.normalize("cafè")); | ||
} | ||
|
||
} |
47 changes: 47 additions & 0 deletions
47
opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpProcessingSymmetryTestCase.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
package com.yahoo.language.opennlp; | ||
|
||
import ai.vespa.opennlp.OpenNlpConfig; | ||
import com.yahoo.language.Language; | ||
import org.junit.Test; | ||
|
||
import static org.junit.Assert.assertEquals; | ||
|
||
// Checks that OpenNlpLinguisticsTester.tokenizeToString and stemAndNormalize return | ||
// the same string when given the same input and language. | ||
// NOTE(review): the variable names suggest these model index-time and query-time | ||
// processing respectively - confirm against OpenNlpLinguisticsTester. | ||
public class OpenNlpProcessingSymmetryTestCase { | ||
|
||
// Baseline: unaccented input, English, default tester configuration. | ||
@Test | ||
public void testSymmetricTransformation() { | ||
var tester = new OpenNlpLinguisticsTester(); | ||
var input = "conges"; | ||
String indexed = tester.tokenizeToString(input, Language.ENGLISH); | ||
String queried = tester.stemAndNormalize(input, Language.ENGLISH); | ||
assertEquals("Expected that the actual query token equals the indexed", indexed, queried); | ||
} | ||
|
||
// Accented input, English, default tester configuration. | ||
// NOTE(review): the method name implies the default English stemmer is KStem - not visible here, confirm. | ||
@Test | ||
public void testSymmetricTransformationWithAccentsEnglishKStem() { | ||
var tester = new OpenNlpLinguisticsTester(); | ||
var input = "congés"; | ||
String indexed = tester.tokenizeToString(input, Language.ENGLISH); | ||
String queried = tester.stemAndNormalize(input, Language.ENGLISH); | ||
assertEquals("Expected that the actual query token equals the indexed", indexed, queried); | ||
} | ||
|
||
// Accented input, English, with snowballStemmingForEnglish(true) set in the OpenNlpConfig. | ||
@Test | ||
public void testSymmetricTransformationWithAccentsEnglishSnowball() { | ||
var tester = new OpenNlpLinguisticsTester(new OpenNlpConfig.Builder().snowballStemmingForEnglish(true).build()); | ||
var input = "congés"; | ||
String indexed = tester.tokenizeToString(input, Language.ENGLISH); | ||
String queried = tester.stemAndNormalize(input, Language.ENGLISH); | ||
assertEquals("Expected that the actual query token equals the indexed", indexed, queried); | ||
} | ||
|
||
// Accented input, Spanish, default tester configuration. | ||
@Test | ||
public void testSymmetricTransformationWithAccentsSpanish() { | ||
var tester = new OpenNlpLinguisticsTester(); | ||
var input = "congés"; | ||
String indexed = tester.tokenizeToString(input, Language.SPANISH); | ||
String queried = tester.stemAndNormalize(input, Language.SPANISH); | ||
assertEquals("Expected that the actual query token equals the indexed", indexed, queried); | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters