diff --git a/container-core/src/main/java/com/yahoo/component/chain/Chain.java b/container-core/src/main/java/com/yahoo/component/chain/Chain.java index de9aa0b67cf5..6f69bdbdfe89 100644 --- a/container-core/src/main/java/com/yahoo/component/chain/Chain.java +++ b/container-core/src/main/java/com/yahoo/component/chain/Chain.java @@ -129,4 +129,5 @@ public int hashCode() { result = 31 * result + id.hashCode(); return result; } + } diff --git a/container-search/src/main/java/com/yahoo/prelude/querytransform/StemmingSearcher.java b/container-search/src/main/java/com/yahoo/prelude/querytransform/StemmingSearcher.java index 22fa667f7551..e6e0adeacd81 100644 --- a/container-search/src/main/java/com/yahoo/prelude/querytransform/StemmingSearcher.java +++ b/container-search/src/main/java/com/yahoo/prelude/querytransform/StemmingSearcher.java @@ -198,7 +198,7 @@ private Substring getOffsets(BlockItem b) { private Item stem(BlockItem current, StemContext context, Index index) { Item blockAsItem = (Item)current; CompositeItem composite; - List segments = linguistics.getStemmer().stem(current.stringValue(), index.getStemMode(), context.language); + List segments = linguistics.getStemmer().stem(current.stringValue(), context.language, index.getStemMode(), index.getNormalize()); if (segments.isEmpty()) return blockAsItem; String indexName = current.getIndexName(); diff --git a/container-search/src/main/java/com/yahoo/prelude/semantics/engine/RuleBaseLinguistics.java b/container-search/src/main/java/com/yahoo/prelude/semantics/engine/RuleBaseLinguistics.java index 9483ae6ef13e..c60f1a7af574 100644 --- a/container-search/src/main/java/com/yahoo/prelude/semantics/engine/RuleBaseLinguistics.java +++ b/container-search/src/main/java/com/yahoo/prelude/semantics/engine/RuleBaseLinguistics.java @@ -45,7 +45,7 @@ public RuleBaseLinguistics withLanguage(Language language) { /** Processes this term according to the linguistics of this rule base */ public String process(String term) { if (stemMode == StemMode.NONE) return term; - List stems = linguistics.getStemmer().stem(term, StemMode.BEST, language); + List stems = linguistics.getStemmer().stem(term, language, StemMode.SHORTEST, true); if (stems.isEmpty()) return term; if (stems.get(0).isEmpty()) return term; return stems.get(0).get(0); diff --git a/container-search/src/test/java/com/yahoo/search/searchers/OpportunisticWeakAndSearcherTestCase.java b/container-search/src/test/java/com/yahoo/search/searchers/OpportunisticWeakAndSearcherTestCase.java index cdfbdd627656..49b36f43f6e5 100644 --- a/container-search/src/test/java/com/yahoo/search/searchers/OpportunisticWeakAndSearcherTestCase.java +++ b/container-search/src/test/java/com/yahoo/search/searchers/OpportunisticWeakAndSearcherTestCase.java @@ -18,8 +18,11 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; - +/** + * @author baldersheim + */ public class OpportunisticWeakAndSearcherTestCase { + private static Item buildQueryItem(CompositeItem root, CompositeItem injectAtLevel2) { root.addItem(new WordItem("text")); injectAtLevel2.addItem(new WordItem("a")); @@ -47,9 +50,9 @@ public void requireThatWeakAndIsDetected() { @Test public void requireThatWeakAndIsReplacedWithAnd() { assertEquals(buildQueryItem(new OrItem(), new AndItem()), - weakAnd2AndRecurse(buildQueryItem(new OrItem(), new WeakAndItem()))); + weakAnd2AndRecurse(buildQueryItem(new OrItem(), new WeakAndItem()))); assertEquals(buildQueryItem(new AndItem(), new AndItem()), - weakAnd2AndRecurse(buildQueryItem(new AndItem(), new WeakAndItem()))); + weakAnd2AndRecurse(buildQueryItem(new AndItem(), new WeakAndItem()))); } private static WeakAndItem try2Adjust(WeakAndItem item, int hits) { @@ -71,6 +74,7 @@ public void requireThatDefaultWeakAndHeapIsAdjustedUpToHits() { assertEquals(1000, try2Adjust(new WeakAndItem(), new OrItem(), 1000).getN()); assertFalse(try2Adjust(new WeakAndItem(), new OrItem(), 10).nIsExplicit()); } + @Test public void requireThatNonDefaultWeakAndHeapIsNotAdjustedUpToHits() { assertEquals(33, try2Adjust(new WeakAndItem(33), 1000).getN()); diff --git a/linguistics/abi-spec.json b/linguistics/abi-spec.json index d743c9284a6a..ba5bc3c79702 100644 --- a/linguistics/abi-spec.json +++ b/linguistics/abi-spec.json @@ -605,6 +605,7 @@ "abstract" ], "methods" : [ + "public java.util.List stem(java.lang.String, com.yahoo.language.Language, com.yahoo.language.process.StemMode, boolean)", "public abstract java.util.List stem(java.lang.String, com.yahoo.language.process.StemMode, com.yahoo.language.Language)" ], "fields" : [ ] @@ -619,6 +620,7 @@ ], "methods" : [ "public void (com.yahoo.language.process.Tokenizer)", + "public java.util.List stem(java.lang.String, com.yahoo.language.Language, com.yahoo.language.process.StemMode, boolean)", "public java.util.List stem(java.lang.String, com.yahoo.language.process.StemMode, com.yahoo.language.Language)" ], "fields" : [ ] diff --git a/linguistics/src/main/java/com/yahoo/language/process/Stemmer.java b/linguistics/src/main/java/com/yahoo/language/process/Stemmer.java index d53f4178b39b..6565a01cd416 100644 --- a/linguistics/src/main/java/com/yahoo/language/process/Stemmer.java +++ b/linguistics/src/main/java/com/yahoo/language/process/Stemmer.java @@ -12,6 +12,21 @@ */ public interface Stemmer { + /** + * Stem input according to specified stemming mode. + * This default implementation invokes stem(input, mode, language) and so ignores the removeAccents argument. + * + * @param input the string to stem. + * @param language the language to use for stemming + * @param mode the stemming mode + * @param removeAccents whether to normalize accents and similar + * @return a list of possible stems. Empty if none. + * @throws ProcessingException thrown if there is an exception stemming this input + */ + default List stem(String input, Language language, StemMode mode, boolean removeAccents) { + return stem(input, mode, language); + } + /** * Stem input according to specified stemming mode. * diff --git a/linguistics/src/main/java/com/yahoo/language/process/StemmerImpl.java b/linguistics/src/main/java/com/yahoo/language/process/StemmerImpl.java index 0ca479e1893b..0d40cbc83ae0 100644 --- a/linguistics/src/main/java/com/yahoo/language/process/StemmerImpl.java +++ b/linguistics/src/main/java/com/yahoo/language/process/StemmerImpl.java @@ -18,14 +18,19 @@ public StemmerImpl(Tokenizer tokenizer) { } @Override - public List stem(String input, StemMode stemMode, Language language) { + public List stem(String input, Language language, StemMode stemMode, boolean removeAccents) { List stems = new ArrayList<>(); - for (Token token : tokenizer.tokenize(input, language, stemMode, false)) { + for (Token token : tokenizer.tokenize(input, language, stemMode, removeAccents)) { findStems(token, stems); } return stems; } + @Override + public List stem(String input, StemMode stemMode, Language language) { + return stem(input, language, stemMode, false); + } + private void findStems(Token token, List out) { int len; if (token.isSpecialToken() || (len = token.getNumComponents()) == 0) { diff --git a/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java b/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java index f544110eeca8..92593d76d391 100644 --- a/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java +++ b/linguistics/src/main/java/com/yahoo/language/process/Tokenizer.java @@ -16,8 +16,8 @@ public interface Tokenizer { * @param input the string to tokenize. May be arbitrarily large. * @param language the language of the input string. * @param stemMode the stem mode applied on the returned tokens - * @param removeAccents if true accents and similar are removed from the returned tokens - * @return the tokens of the input String. + * @param removeAccents whether to normalize accents and similar + * @return the tokens of the input String * @throws ProcessingException If the underlying library throws an Exception. */ Iterable tokenize(String input, Language language, StemMode stemMode, boolean removeAccents); diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java index c7b8798a6072..3561d681a581 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java @@ -149,7 +149,7 @@ public int hashCode() { @Override public String toString() { - return "token '" + original + "'"; + return "token '" + tokenString + "'" + ( ! tokenString.equals(original) ? " (original: " + original + ")" : ""); } public String toDetailString() { diff --git a/linguistics/src/test/java/com/yahoo/language/process/StemmerImplTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/StemmerImplTestCase.java index 785225a50961..7016b7256b22 100644 --- a/linguistics/src/test/java/com/yahoo/language/process/StemmerImplTestCase.java +++ b/linguistics/src/test/java/com/yahoo/language/process/StemmerImplTestCase.java @@ -20,13 +20,13 @@ public class StemmerImplTestCase { @Test public void requireThatStemIsNormalizedAndLowerCased() { - assertStem("FOO", List.of("foo")); - assertStem("a\u030A", List.of("\u00E5")); + assertStem("FOO", List.of("foo"), true); + assertStem("a\u030A", List.of("\u00E5"), false); } @Test public void requireThatOnlyIndexableTokensAreReturned() { - assertStem("foo. (bar)!", List.of("foo", "bar")); + assertStem("foo. (bar)!", List.of("foo", "bar"), true); } @Test @@ -49,17 +49,17 @@ public void requireThatSpecialTokensAreNotDecompounded() { assertEquals(List.of(new StemList("c"), new StemList("p"), new StemList("p")), - stemmer.stem("c++", StemMode.SHORTEST, Language.ENGLISH)); + stemmer.stem("c++", Language.ENGLISH,StemMode.SHORTEST, true)); token.setSpecialToken(true); assertEquals(List.of(new StemList("c++")), - stemmer.stem("c++", StemMode.SHORTEST, Language.ENGLISH)); + stemmer.stem("c++", Language.ENGLISH, StemMode.SHORTEST, true)); } - private static void assertStem(String input, List expectedStems) { + private static void assertStem(String input, List expectedStems, boolean removeAccents) { Stemmer stemmer = new StemmerImpl(new SimpleTokenizer(new SimpleNormalizer())); List got = new ArrayList<>(); - for (StemList word : stemmer.stem(input, StemMode.ALL, Language.ENGLISH)) { + for (StemList word : stemmer.stem(input, Language.ENGLISH, StemMode.ALL, removeAccents)) { got.add(word.get(0)); } assertEquals(expectedStems, got); diff --git a/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java b/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java index 44bed2d4a756..1d6ff94a38d2 100644 --- a/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java +++ b/lucene-linguistics/src/test/java/com/yahoo/language/lucene/LuceneTokenizerTest.java @@ -43,7 +43,7 @@ public void testLithuanianTokenizer() { @Test public void testStemming() { String text = "mūšio"; - List tokens = luceneLinguistics().getStemmer().stem(text, StemMode.ALL, Language.LITHUANIAN); + List tokens = luceneLinguistics().getStemmer().stem(text, Language.LITHUANIAN, StemMode.ALL, true); assertEquals(1, tokens.size()); assertEquals("mūš", tokens.get(0).get(0)); } diff --git a/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java b/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java index 2c5c0659ad03..5c356b48c5be 100644 --- a/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java +++ b/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java @@ -50,7 +50,9 @@ public Stemmer getStemmer() { } @Override - public Segmenter getSegmenter() { return new SegmenterImpl(forQuerying(getTokenizer())); } + public Segmenter getSegmenter() { + return new SegmenterImpl(forQuerying(getTokenizer())); + } @Override public Detector getDetector() { return detector; } diff --git a/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java b/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java index 5af0765e3e03..49f5d5ee1cc5 100644 --- a/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java +++ b/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java @@ -122,8 +122,7 @@ private Iterable segmentChinese(String input) { return tokens; } - private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents, - Stemmer stemmer) { + private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents, Stemmer stemmer) { token = normalizer.normalize(token); token = LinguisticsCase.toLowerCase(token); if (removeAccents) diff --git a/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/ChineseSegmentationTest.java b/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/ChineseSegmentationTest.java index d5ebf606622e..1a68f10dcd5e 100644 --- a/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/ChineseSegmentationTest.java +++ b/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/ChineseSegmentationTest.java @@ -35,7 +35,7 @@ public void testChineseSegmentation() { assertEquals(7, tokens.size()); assertEquals("[是, 一个, 展示, 雅, 目前, 在, 测试阶段]", tokens.toString()); - var stems = tester.stemmer().stem(text, StemMode.ALL, Language.CHINESE_SIMPLIFIED); + var stems = tester.stemmer().stem(text, Language.CHINESE_SIMPLIFIED, StemMode.ALL, true); assertEquals(7, tokens.size()); assertEquals("[是, 一个, 展示, 雅, 目前, 在, 测试阶段]", tokens.toString()); } @@ -57,7 +57,7 @@ public void testChineseSegmentationWithoutGrams() { assertEquals(7, tokens.size()); assertEquals("[是, 一个, 展示, 雅, 目前, 在, 测试阶段]", tokens.toString()); - var stems = tester.stemmer().stem(text, StemMode.ALL, Language.CHINESE_SIMPLIFIED); + var stems = tester.stemmer().stem(text, Language.CHINESE_SIMPLIFIED, StemMode.ALL, true); assertEquals(7, tokens.size()); assertEquals("[是, 一个, 展示, 雅, 目前, 在, 测试阶段]", tokens.toString()); } diff --git a/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpLinguisticsSubclassingTest.java b/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpLinguisticsSubclassingTest.java index d95b676cdfbc..a7144dd63a2f 100644 --- a/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpLinguisticsSubclassingTest.java +++ b/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpLinguisticsSubclassingTest.java @@ -23,7 +23,7 @@ public void testOpenNlpLinguisticsSubclassing() { var subclass = new OpenNlpLinguisticsSubclass(); assertEquals("the only token", subclass.getTokenizer().tokenize("whatever", Language.ENGLISH, StemMode.ALL, false).iterator().next().getTokenString()); assertEquals("the only token", subclass.getSegmenter().segment("whatever", Language.ENGLISH).iterator().next()); - assertEquals("the only token", subclass.getStemmer().stem("whatever", StemMode.ALL, Language.ENGLISH).get(0).get(0)); + assertEquals("the only token", subclass.getStemmer().stem("whatever", Language.ENGLISH, StemMode.ALL, true).get(0).get(0)); } static class OpenNlpLinguisticsSubclass extends OpenNlpLinguistics { diff --git a/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpLinguisticsTester.java b/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpLinguisticsTester.java index 74339c49b587..b57adcbf0015 100644 --- a/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpLinguisticsTester.java +++ b/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpLinguisticsTester.java @@ -3,6 +3,7 @@ import ai.vespa.opennlp.OpenNlpConfig; import com.yahoo.language.Language; +import com.yahoo.language.process.Normalizer; import com.yahoo.language.process.Segmenter; import com.yahoo.language.process.StemMode; import com.yahoo.language.process.Stemmer; @@ -23,6 +24,7 @@ public class OpenNlpLinguisticsTester { private final Tokenizer tokenizer; private final Segmenter segmenter; private final Stemmer stemmer; + private final Normalizer normalizer; public OpenNlpLinguisticsTester() { this(new OpenNlpLinguistics()); @@ -36,16 +38,27 @@ public OpenNlpLinguisticsTester(OpenNlpLinguistics linguistics) { this.tokenizer = linguistics.getTokenizer(); this.segmenter = linguistics.getSegmenter(); this.stemmer = linguistics.getStemmer(); + this.normalizer = linguistics.getNormalizer(); } Tokenizer tokenizer() { return tokenizer; } Segmenter segmenter() { return segmenter; } Stemmer stemmer() { return stemmer; } + Normalizer normalizer() { return normalizer; } Iterable tokenize(String input, Language language) { return tokenizer.tokenize(input, language, StemMode.SHORTEST, true); } + String tokenizeToString(String input, Language language) { + return tokenize(input, language).iterator().next().getTokenString(); + } + + String stemAndNormalize(String input, Language language) { + var stemListList = stemmer.stem(input, language, StemMode.SHORTEST, true); + return normalizer.normalize(stemListList.get(0).get(0)); + } + void recurseDecompose(Token t) { assertTrue(t.getOffset() >= 0); assertTrue(t.getOrig().length() >= 0); diff --git a/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpNormalizerTestCase.java b/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpNormalizerTestCase.java new file mode 100644 index 000000000000..609590c4faaf --- /dev/null +++ b/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpNormalizerTestCase.java @@ -0,0 +1,21 @@ +package com.yahoo.language.opennlp; + +import org.junit.Test; + +import static org.junit.Assert.assertEquals; + +/** + * @author bratseth + */ +public class OpenNlpNormalizerTestCase { + + @Test + public void testNormalizing() { + var normalizer = new OpenNlpLinguisticsTester().normalizer(); + assertEquals("cafe", normalizer.normalize("cafe")); + // TODO: Accent normalize + // assertEquals("cafe", normalizer.normalize("café")); + // assertEquals("cafe", normalizer.normalize("cafè")); + } + +} diff --git a/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpProcessingSymmetryTestCase.java b/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpProcessingSymmetryTestCase.java new file mode 100644 index 000000000000..db08a2aa93c0 --- /dev/null +++ b/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpProcessingSymmetryTestCase.java @@ -0,0 +1,47 @@ +package com.yahoo.language.opennlp; + +import ai.vespa.opennlp.OpenNlpConfig; +import com.yahoo.language.Language; +import org.junit.Test; + +import static org.junit.Assert.assertEquals; + +public class OpenNlpProcessingSymmetryTestCase { + + @Test + public void testSymmetricTransformation() { + var tester = new OpenNlpLinguisticsTester(); + var input = "conges"; + String indexed = tester.tokenizeToString(input, Language.ENGLISH); + String queried = tester.stemAndNormalize(input, Language.ENGLISH); + assertEquals("Expected that the actual query token equals the indexed", indexed, queried); + } + + @Test + public void testSymmetricTransformationWithAccentsEnglishKStem() { + var tester = new OpenNlpLinguisticsTester(); + var input = "congés"; + String indexed = tester.tokenizeToString(input, Language.ENGLISH); + String queried = tester.stemAndNormalize(input, Language.ENGLISH); + assertEquals("Expected that the actual query token equals the indexed", indexed, queried); + } + + @Test + public void testSymmetricTransformationWithAccentsEnglishSnowball() { + var tester = new OpenNlpLinguisticsTester(new OpenNlpConfig.Builder().snowballStemmingForEnglish(true).build()); + var input = "congés"; + String indexed = tester.tokenizeToString(input, Language.ENGLISH); + String queried = tester.stemAndNormalize(input, Language.ENGLISH); + assertEquals("Expected that the actual query token equals the indexed", indexed, queried); + } + + @Test + public void testSymmetricTransformationWithAccentsSpanish() { + var tester = new OpenNlpLinguisticsTester(); + var input = "congés"; + String indexed = tester.tokenizeToString(input, Language.SPANISH); + String queried = tester.stemAndNormalize(input, Language.SPANISH); + assertEquals("Expected that the actual query token equals the indexed", indexed, queried); + } + +} diff --git a/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java b/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java index 9ba4beeaffcc..7715eb4172b4 100644 --- a/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java +++ b/opennlp-linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java @@ -204,7 +204,7 @@ public void testTokenizeEmojis() { public void testStemEmojis() { var stemmer = new OpenNlpLinguistics().getStemmer(); String emoji = "\uD83D\uDD2A"; // 🔪 - List stems = stemmer.stem(emoji, StemMode.ALL, Language.ENGLISH); + List stems = stemmer.stem(emoji, Language.ENGLISH, StemMode.ALL, true); assertEquals(1, stems.size()); var stemList = stems.get(0); assertEquals(1, stemList.size());