Skip to content

Commit

Permalink
Merge pull request #32385 from vespa-engine/bratseth/symmetric-normal…
Browse files Browse the repository at this point in the history
…izing

Bratseth/symmetric normalizing
  • Loading branch information
bratseth authored Sep 11, 2024
2 parents b55c3be + b4a1072 commit ed23942
Show file tree
Hide file tree
Showing 19 changed files with 134 additions and 25 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -129,4 +129,5 @@ public int hashCode() {
result = 31 * result + id.hashCode();
return result;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ private Substring getOffsets(BlockItem b) {
private Item stem(BlockItem current, StemContext context, Index index) {
Item blockAsItem = (Item)current;
CompositeItem composite;
List<StemList> segments = linguistics.getStemmer().stem(current.stringValue(), index.getStemMode(), context.language);
List<StemList> segments = linguistics.getStemmer().stem(current.stringValue(), context.language, index.getStemMode(), index.getNormalize());
if (segments.isEmpty()) return blockAsItem;

String indexName = current.getIndexName();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ public RuleBaseLinguistics withLanguage(Language language) {
/** Processes this term according to the linguistics of this rule base */
public String process(String term) {
if (stemMode == StemMode.NONE) return term;
List<StemList> stems = linguistics.getStemmer().stem(term, StemMode.BEST, language);
List<StemList> stems = linguistics.getStemmer().stem(term, language, StemMode.SHORTEST, true);
if (stems.isEmpty()) return term;
if (stems.get(0).isEmpty()) return term;
return stems.get(0).get(0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,11 @@
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;


/**
* @author baldersheim
*/
public class OpportunisticWeakAndSearcherTestCase {

private static Item buildQueryItem(CompositeItem root, CompositeItem injectAtLevel2) {
root.addItem(new WordItem("text"));
injectAtLevel2.addItem(new WordItem("a"));
Expand Down Expand Up @@ -47,9 +50,9 @@ public void requireThatWeakAndIsDetected() {
/**
 * Checks that weakAnd2AndRecurse turns a query built with a WeakAndItem at the second level
 * into a query equal to the one built with an AndItem in the same position,
 * both under an OrItem root and under an AndItem root.
 */
@Test
public void requireThatWeakAndIsReplacedWithAnd() {
assertEquals(buildQueryItem(new OrItem(), new AndItem()),
weakAnd2AndRecurse(buildQueryItem(new OrItem(), new WeakAndItem())));
weakAnd2AndRecurse(buildQueryItem(new OrItem(), new WeakAndItem())));
assertEquals(buildQueryItem(new AndItem(), new AndItem()),
weakAnd2AndRecurse(buildQueryItem(new AndItem(), new WeakAndItem())));
weakAnd2AndRecurse(buildQueryItem(new AndItem(), new WeakAndItem())));
}

private static WeakAndItem try2Adjust(WeakAndItem item, int hits) {
Expand All @@ -71,6 +74,7 @@ public void requireThatDefaultWeakAndHeapIsAdjustedUpToHits() {
assertEquals(1000, try2Adjust(new WeakAndItem(), new OrItem(), 1000).getN());
assertFalse(try2Adjust(new WeakAndItem(), new OrItem(), 10).nIsExplicit());
}

@Test
public void requireThatNonDefaultWeakAndHeapIsNotAdjustedUpToHits() {
assertEquals(33, try2Adjust(new WeakAndItem(33), 1000).getN());
Expand Down
2 changes: 2 additions & 0 deletions linguistics/abi-spec.json
Original file line number Diff line number Diff line change
Expand Up @@ -605,6 +605,7 @@
"abstract"
],
"methods" : [
"public java.util.List stem(java.lang.String, com.yahoo.language.Language, com.yahoo.language.process.StemMode, boolean)",
"public abstract java.util.List stem(java.lang.String, com.yahoo.language.process.StemMode, com.yahoo.language.Language)"
],
"fields" : [ ]
Expand All @@ -619,6 +620,7 @@
],
"methods" : [
"public void <init>(com.yahoo.language.process.Tokenizer)",
"public java.util.List stem(java.lang.String, com.yahoo.language.Language, com.yahoo.language.process.StemMode, boolean)",
"public java.util.List stem(java.lang.String, com.yahoo.language.process.StemMode, com.yahoo.language.Language)"
],
"fields" : [ ]
Expand Down
15 changes: 15 additions & 0 deletions linguistics/src/main/java/com/yahoo/language/process/Stemmer.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,21 @@
*/
public interface Stemmer {

/**
 * Stems the given input in the given language and stemming mode.
 * <p>
 * Unless overridden, this delegates to {@code stem(input, mode, language)},
 * which means the {@code removeAccents} argument has no effect in this default implementation.
 *
 * @param input the text to stem
 * @param language the language to stem the input in
 * @param mode the stemming mode to apply
 * @param removeAccents whether accents and similar should be normalized; not used by this default implementation
 * @return the possible stems, or an empty list if there are none
 * @throws ProcessingException if an exception occurs while stemming the input
 */
default List<StemList> stem(String input, Language language, StemMode mode, boolean removeAccents) {
    // removeAccents is dropped here: the three-argument method has no such parameter
    return stem(input, mode, language);
}

/**
* Stem input according to specified stemming mode.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,19 @@ public StemmerImpl(Tokenizer tokenizer) {
}

/**
 * Stems the input by tokenizing it and passing every token, together with the result list, to findStems.
 * The language and stem mode are forwarded to the tokenizer; on the updated lines the removeAccents
 * flag is forwarded as well, where the previous lines always passed false.
 *
 * @return the list of stems that was handed to findStems for each token
 */
@Override
public List<StemList> stem(String input, StemMode stemMode, Language language) {
public List<StemList> stem(String input, Language language, StemMode stemMode, boolean removeAccents) {
List<StemList> stems = new ArrayList<>();
for (Token token : tokenizer.tokenize(input, language, stemMode, false)) {
for (Token token : tokenizer.tokenize(input, language, stemMode, removeAccents)) {
findStems(token, stems);
}
return stems;
}

/**
 * Stems the input without asking for accent removal.
 * Delegates to the four-argument stem method with removeAccents set to false.
 */
@Override
public List<StemList> stem(String input, StemMode stemMode, Language language) {
    final boolean removeAccents = false;
    return stem(input, language, stemMode, removeAccents);
}

private void findStems(Token token, List<StemList> out) {
int len;
if (token.isSpecialToken() || (len = token.getNumComponents()) == 0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ public interface Tokenizer {
* @param input the string to tokenize. May be arbitrarily large.
* @param language the language of the input string.
* @param stemMode the stem mode applied on the returned tokens
* @param removeAccents if true accents and similar are removed from the returned tokens
* @return the tokens of the input String.
* @param removeAccents whether to normalize accents and similar
* @return the tokens of the input String
* @throws ProcessingException If the underlying library throws an Exception.
*/
Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ public int hashCode() {

/**
 * Returns a short description of this token.
 * The updated line prints the token string, and appends the original text in parentheses
 * only when the original differs from the token string; the previous line printed the original only.
 */
@Override
public String toString() {
return "token '" + original + "'";
return "token '" + tokenString + "'" + ( ! tokenString.equals(original) ? " (original: " + original + ")" : "");
}

public String toDetailString() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,13 @@ public class StemmerImplTestCase {

/**
 * Checks that "FOO" stems to "foo", and that "a" followed by the combining character U+030A
 * stems to the single character U+00E5.
 * On the updated lines the first case is run with removeAccents true and the second with false.
 */
@Test
public void requireThatStemIsNormalizedAndLowerCased() {
assertStem("FOO", List.of("foo"));
assertStem("a\u030A", List.of("\u00E5"));
assertStem("FOO", List.of("foo"), true);
assertStem("a\u030A", List.of("\u00E5"), false);
}

/** Checks that stemming "foo. (bar)!" yields only the stems "foo" and "bar", i.e. no punctuation. */
@Test
public void requireThatOnlyIndexableTokensAreReturned() {
assertStem("foo. (bar)!", List.of("foo", "bar"));
assertStem("foo. (bar)!", List.of("foo", "bar"), true);
}

@Test
Expand All @@ -49,17 +49,17 @@ public void requireThatSpecialTokensAreNotDecompounded() {
assertEquals(List.of(new StemList("c"),
new StemList("p"),
new StemList("p")),
stemmer.stem("c++", StemMode.SHORTEST, Language.ENGLISH));
stemmer.stem("c++", Language.ENGLISH,StemMode.SHORTEST, true));

token.setSpecialToken(true);
assertEquals(List.of(new StemList("c++")),
stemmer.stem("c++", StemMode.SHORTEST, Language.ENGLISH));
stemmer.stem("c++", Language.ENGLISH, StemMode.SHORTEST, true));
}

/**
 * Stems the input as English with StemMode.ALL, using a StemmerImpl over a SimpleTokenizer
 * with a SimpleNormalizer, and asserts that the first stem of each returned StemList,
 * in order, equals expectedStems. The updated signature also takes the removeAccents
 * flag, which is passed on to the stemmer.
 */
private static void assertStem(String input, List<String> expectedStems) {
private static void assertStem(String input, List<String> expectedStems, boolean removeAccents) {
Stemmer stemmer = new StemmerImpl(new SimpleTokenizer(new SimpleNormalizer()));
List<String> got = new ArrayList<>();
for (StemList word : stemmer.stem(input, StemMode.ALL, Language.ENGLISH)) {
for (StemList word : stemmer.stem(input, Language.ENGLISH, StemMode.ALL, removeAccents)) {
// Only the first stem of each word is compared
got.add(word.get(0));
}
assertEquals(expectedStems, got);
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ public void testLithuanianTokenizer() {
/** Checks that stemming the Lithuanian word "mūšio" yields exactly one stem list whose first stem is "mūš". */
@Test
public void testStemming() {
String text = "mūšio";
List<StemList> tokens = luceneLinguistics().getStemmer().stem(text, StemMode.ALL, Language.LITHUANIAN);
List<StemList> tokens = luceneLinguistics().getStemmer().stem(text, Language.LITHUANIAN, StemMode.ALL, true);
assertEquals(1, tokens.size());
assertEquals("mūš", tokens.get(0).get(0));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,9 @@ public Stemmer getStemmer() {
}

/** Returns a new SegmenterImpl built on the result of passing this instance's tokenizer to forQuerying. */
@Override
public Segmenter getSegmenter() { return new SegmenterImpl(forQuerying(getTokenizer())); }
public Segmenter getSegmenter() {
return new SegmenterImpl(forQuerying(getTokenizer()));
}

@Override
public Detector getDetector() { return detector; }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -122,8 +122,7 @@ private Iterable<Token> segmentChinese(String input) {
return tokens;
}

private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents,
Stemmer stemmer) {
private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents, Stemmer stemmer) {
token = normalizer.normalize(token);
token = LinguisticsCase.toLowerCase(token);
if (removeAccents)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ public void testChineseSegmentation() {
assertEquals(7, tokens.size());
assertEquals("[是, 一个, 展示, 雅, 目前, 在, 测试阶段]", tokens.toString());

var stems = tester.stemmer().stem(text, StemMode.ALL, Language.CHINESE_SIMPLIFIED);
var stems = tester.stemmer().stem(text, Language.CHINESE_SIMPLIFIED, StemMode.ALL, true);
assertEquals(7, tokens.size());
assertEquals("[是, 一个, 展示, 雅, 目前, 在, 测试阶段]", tokens.toString());
}
Expand All @@ -57,7 +57,7 @@ public void testChineseSegmentationWithoutGrams() {
assertEquals(7, tokens.size());
assertEquals("[是, 一个, 展示, 雅, 目前, 在, 测试阶段]", tokens.toString());

var stems = tester.stemmer().stem(text, StemMode.ALL, Language.CHINESE_SIMPLIFIED);
var stems = tester.stemmer().stem(text, Language.CHINESE_SIMPLIFIED, StemMode.ALL, true);
assertEquals(7, tokens.size());
assertEquals("[是, 一个, 展示, 雅, 目前, 在, 测试阶段]", tokens.toString());
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ public void testOpenNlpLinguisticsSubclassing() {
var subclass = new OpenNlpLinguisticsSubclass();
assertEquals("the only token", subclass.getTokenizer().tokenize("whatever", Language.ENGLISH, StemMode.ALL, false).iterator().next().getTokenString());
assertEquals("the only token", subclass.getSegmenter().segment("whatever", Language.ENGLISH).iterator().next());
assertEquals("the only token", subclass.getStemmer().stem("whatever", StemMode.ALL, Language.ENGLISH).get(0).get(0));
assertEquals("the only token", subclass.getStemmer().stem("whatever", Language.ENGLISH, StemMode.ALL, true).get(0).get(0));
}

static class OpenNlpLinguisticsSubclass extends OpenNlpLinguistics {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import ai.vespa.opennlp.OpenNlpConfig;
import com.yahoo.language.Language;
import com.yahoo.language.process.Normalizer;
import com.yahoo.language.process.Segmenter;
import com.yahoo.language.process.StemMode;
import com.yahoo.language.process.Stemmer;
Expand All @@ -23,6 +24,7 @@ public class OpenNlpLinguisticsTester {
private final Tokenizer tokenizer;
private final Segmenter segmenter;
private final Stemmer stemmer;
private final Normalizer normalizer;

public OpenNlpLinguisticsTester() {
this(new OpenNlpLinguistics());
Expand All @@ -36,16 +38,27 @@ public OpenNlpLinguisticsTester(OpenNlpLinguistics linguistics) {
this.tokenizer = linguistics.getTokenizer();
this.segmenter = linguistics.getSegmenter();
this.stemmer = linguistics.getStemmer();
this.normalizer = linguistics.getNormalizer();
}

// Package-private accessors for the linguistics components held by this tester
Tokenizer tokenizer() { return tokenizer; }
Segmenter segmenter() { return segmenter; }
Stemmer stemmer() { return stemmer; }
Normalizer normalizer() { return normalizer; }

/** Tokenizes the input in the given language using StemMode.SHORTEST, with accent removal requested. */
Iterable<Token> tokenize(String input, Language language) {
    final boolean removeAccents = true;
    return tokenizer.tokenize(input, language, StemMode.SHORTEST, removeAccents);
}

/** Returns the token string of the first token produced by tokenizing the input in the given language. */
String tokenizeToString(String input, Language language) {
    Token firstToken = tokenize(input, language).iterator().next();
    return firstToken.getTokenString();
}

/**
 * Stems the input using StemMode.SHORTEST with accent removal requested,
 * and returns the first stem of the first stem list after passing it through the normalizer.
 */
String stemAndNormalize(String input, Language language) {
    var stems = stemmer.stem(input, language, StemMode.SHORTEST, true);
    String firstStem = stems.get(0).get(0);
    return normalizer.normalize(firstStem);
}

void recurseDecompose(Token t) {
assertTrue(t.getOffset() >= 0);
assertTrue(t.getOrig().length() >= 0);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
package com.yahoo.language.opennlp;

import org.junit.Test;

import static org.junit.Assert.assertEquals;

/**
 * Tests the normalizer exposed by a default OpenNlpLinguisticsTester.
 *
 * @author bratseth
 */
public class OpenNlpNormalizerTestCase {

    /** Unaccented input must come back unchanged; the accented cases are not asserted yet (see TODO). */
    @Test
    public void testNormalizing() {
        var tester = new OpenNlpLinguisticsTester();
        var normalizer = tester.normalizer();
        String normalized = normalizer.normalize("cafe");
        assertEquals("cafe", normalized);
        // TODO: Accent normalize
        // assertEquals("cafe", normalizer.normalize("café"));
        // assertEquals("cafe", normalizer.normalize("cafè"));
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
package com.yahoo.language.opennlp;

import ai.vespa.opennlp.OpenNlpConfig;
import com.yahoo.language.Language;
import org.junit.Test;

import static org.junit.Assert.assertEquals;

/**
 * Verifies that the token string produced by the tester's tokenizer (named "indexed" below)
 * equals the result of stemming and normalizing the same input (named "queried" below).
 */
public class OpenNlpProcessingSymmetryTestCase {

    @Test
    public void testSymmetricTransformation() {
        assertSymmetric(new OpenNlpLinguisticsTester(), "conges", Language.ENGLISH);
    }

    @Test
    public void testSymmetricTransformationWithAccentsEnglishKStem() {
        assertSymmetric(new OpenNlpLinguisticsTester(), "congés", Language.ENGLISH);
    }

    @Test
    public void testSymmetricTransformationWithAccentsEnglishSnowball() {
        var snowballConfig = new OpenNlpConfig.Builder().snowballStemmingForEnglish(true).build();
        assertSymmetric(new OpenNlpLinguisticsTester(snowballConfig), "congés", Language.ENGLISH);
    }

    @Test
    public void testSymmetricTransformationWithAccentsSpanish() {
        assertSymmetric(new OpenNlpLinguisticsTester(), "congés", Language.SPANISH);
    }

    /** Asserts that tokenizing the input and stem-and-normalizing it yield the same string. */
    private static void assertSymmetric(OpenNlpLinguisticsTester tester, String input, Language language) {
        String indexed = tester.tokenizeToString(input, language);
        String queried = tester.stemAndNormalize(input, language);
        assertEquals("Expected that the actual query token equals the indexed", indexed, queried);
    }

}
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ public void testTokenizeEmojis() {
public void testStemEmojis() {
var stemmer = new OpenNlpLinguistics().getStemmer();
String emoji = "\uD83D\uDD2A"; // 🔪
List<StemList> stems = stemmer.stem(emoji, StemMode.ALL, Language.ENGLISH);
List<StemList> stems = stemmer.stem(emoji, Language.ENGLISH, StemMode.ALL, true);
assertEquals(1, stems.size());
var stemList = stems.get(0);
assertEquals(1, stemList.size());
Expand Down

0 comments on commit ed23942

Please sign in to comment.