From c8e8a1f0526343c50cc9aafe2d866fe5e12b3a4a Mon Sep 17 00:00:00 2001 From: Drew Farris Date: Wed, 20 Nov 2024 14:09:32 -0500 Subject: [PATCH] Improves Lucene query tokenization to properly handle Analyzers that produce alternate tokens at the same position (#2629) * General code cleanup/prep * Introduced the ability to track variants in tokens, all unit tests still pass * Completed implementation based on position increment * Update unit test and apply formatting * Removed unused CachingTokenFilter in CustomAnalyzerQueryNodeProcessor * Fixed formatting in some unit tests --------- Co-authored-by: austin007008 <143425397+austin007008@users.noreply.github.com> Co-authored-by: hgklohr --- .../CustomAnalyzerQueryNodeProcessor.java | 296 ++++++++++++------ .../jexl/TestLuceneToJexlQueryParser.java | 20 +- .../TestLuceneToJexlQueryParserVariants.java | 136 ++++++++ 3 files changed, 344 insertions(+), 108 deletions(-) create mode 100644 warehouse/query-core/src/test/java/datawave/query/language/parser/jexl/TestLuceneToJexlQueryParserVariants.java diff --git a/warehouse/query-core/src/main/java/datawave/query/language/processor/lucene/CustomAnalyzerQueryNodeProcessor.java b/warehouse/query-core/src/main/java/datawave/query/language/processor/lucene/CustomAnalyzerQueryNodeProcessor.java index 82a30c5be9..948107b960 100644 --- a/warehouse/query-core/src/main/java/datawave/query/language/processor/lucene/CustomAnalyzerQueryNodeProcessor.java +++ b/warehouse/query-core/src/main/java/datawave/query/language/processor/lucene/CustomAnalyzerQueryNodeProcessor.java @@ -21,8 +21,10 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; +import java.util.LinkedList; import java.util.List; import java.util.Set; +import java.util.TreeSet; import org.apache.log4j.Logger; import org.apache.lucene.analysis.Analyzer; @@ -53,16 +55,17 @@ *

* Applies tokenization to {@link TextableQueryNode} objects using a configured Lucene {@link Analyzer}. *

- * * Uses the {@link Analyzer} specified in the the {@link ConfigurationKeys#ANALYZER} attribute of the {@link QueryConfigHandler} to process non-wildcard * {@link FieldQueryNode}s for fields listed in tokenizedFields. - * + *

* (Nodes that are {@link WildcardQueryNode}, {@link FuzzyQueryNode} or {@link RegexpQueryNode} or are part of a {@link TermRangeQueryNode} are NOT processed by * this processor.) - * + *

+ *

* The text of each {@link TextableQueryNode} is processed using the {@link Analyzer} to generate tokens. If the analyzer returns one or more terms that are not * identical to the input, the processor generates an {@link OrQueryNode} containing the original query node and a new {@link QuotedFieldQueryNode} or * {@link SlopQueryNode} depending on the nature of the original query node and whether useSlopForTokenizedTerms is false. + *

*

* There are three primary cases where tokenization will be applied to input query terms - single terms (e.g: wi-fi), phrases (e.g: "portable wi-fi"), and * phrases with slop (e.g: "portable wi-fi"~3). In the case of single term input, tokenization will produce a phrase with slop equals to the number of positions @@ -250,9 +253,7 @@ protected List setChildrenOrder(List children) throws Quer return children; /* no-op */ } - private QueryNode tokenizeNode(QueryNode node, final String text, final String field) throws QueryNodeException { - CachingTokenFilter buffer = null; - + private QueryNode tokenizeNode(final QueryNode node, final String text, final String field) throws QueryNodeException { if (analyzer == null) { if (logger.isDebugEnabled()) { logger.debug("Skipping tokenization of node: '" + node + "'; no analyzer is set"); @@ -266,125 +267,154 @@ private QueryNode tokenizeNode(QueryNode node, final String text, final String f if (logger.isDebugEnabled()) { logger.debug("Skipping processed query node: " + node.toString()); } - return node; - } else { - // mark the original node processed. - node.setTag(NODE_PROCESSED, Boolean.TRUE); } - try { - // Take a pass over the tokens and buffer them in the caching token filter. - TokenStream source = this.analyzer.tokenStream(field, new StringReader(text)); - source.reset(); - - buffer = new CachingTokenFilter(source); + node.setTag(NODE_PROCESSED, Boolean.TRUE); // mark this node as processed, so we don't process it again. - PositionIncrementAttribute posIncrAtt = null; - int numTokens = 0; + try (TokenStream buffer = this.analyzer.tokenStream(field, new StringReader(text))) { - if (buffer.hasAttribute(PositionIncrementAttribute.class)) { - posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class); - } - - while (buffer.incrementToken()) { - numTokens++; - } - - // rewind the buffer stream + // prepare the source for reading. 
buffer.reset(); - // close original stream - all tokens buffered - source.close(); - if (!buffer.hasAttribute(CharTermAttribute.class) || numTokens == 0) { - // no terms found, return unmodified node. - return node; + if (!buffer.hasAttribute(CharTermAttribute.class)) { + return node; // tokenizer can't produce terms, return unmodified query node. } final CharTermAttribute termAtt = buffer.getAttribute(CharTermAttribute.class); + final PositionIncrementAttribute posIncrAtt = buffer.hasAttribute(PositionIncrementAttribute.class) + ? buffer.getAttribute(PositionIncrementAttribute.class) + : null; + + // the variant builder will maintain multiple versions of the tokenized query as we find tokens + // that have multiple variants in the same position - e.g., stems, roots or lemmas. + final VariantBuilder b = new VariantBuilder(); - StringBuilder b = new StringBuilder(); - int slopRange = 0; + // build the new query strings from the tokenizer output while tracking cases where we've dropped words + // and will need to adjust the phrase slop for the query as a result. + int positionCount = 0; - String term; while (buffer.incrementToken()) { - term = termAtt.toString(); - b.append(term).append(" "); - - // increment the slop range for the tokenized text based on the - // positionIncrement attribute if available, otherwise one position - // per token. - if (posIncrAtt != null && this.positionIncrementsEnabled) { - slopRange += posIncrAtt.getPositionIncrement(); - } else { - slopRange++; - } + String token = termAtt.toString(); + final int positionIncrement = posIncrAtt != null ? posIncrAtt.getPositionIncrement() : 1; + positionCount += positionIncrementsEnabled ? 
positionIncrement : 1; + b.append(token, positionIncrement == 0); } - b.setLength(b.length() - 1); // trim trailing whitespace - - if (b.length() > 0) { - final String tokenizedText = b.toString(); + if (b.hasNoVariants()) { + return node; // If we didn't produce anything from the tokenizer, return unmodified query node. + } - // Check to see that the tokenizer produced output that was different from the original query node. - // If so avoid creating an OR clause. We compare the 'escaped' string of the original query so that we - // do not mistreat things like spaces. - if (TextableQueryNode.class.isAssignableFrom(node.getClass())) { - final CharSequence c = ((TextableQueryNode) node).getText(); - final String cmp = UnescapedCharSequence.class.isAssignableFrom(c.getClass()) ? toStringEscaped((UnescapedCharSequence) c) : c.toString(); - if (tokenizedText.equalsIgnoreCase(cmp)) { - return node; - } + // calculate the amount of slop we need to add based on the original query and the number of positions observed + int slopNeeded = calculateSlopNeeded(node, text, positionCount); + + // process each of the 'variants' to ensure they are different from the base query, if so, potentially + // create a new query node and add it to the set of OR clauses. Variants are guaranteed unique, so no + // need to deduplicate there. + final String baseQueryText = getEscapedBaseQueryText(node); + final LinkedList clauses = new LinkedList<>(); + for (String tokenizedText : b.getVariants()) { + if (tokenizedText.equalsIgnoreCase(baseQueryText)) { + continue; // skip this variant - it adds nothing new over the base query. } + QueryNode newQueryNode = createNewQueryNode(field, tokenizedText, slopNeeded); + clauses.add(newQueryNode); + } - QueryNode n = new QuotedFieldQueryNode(field, new UnescapedCharSequence(tokenizedText), -1, -1); - // mark the derived node processed so we don't process it again later. 
- n.setTag(NODE_PROCESSED, Boolean.TRUE); - - // Adjust the slop based on the difference between the original - // slop minus the original token count (based on whitespace) - int originalSlop = 0; - if (node.getTag(ORIGINAL_SLOP) != null) { - originalSlop = (Integer) node.getTag(ORIGINAL_SLOP); - final int delta = originalSlop - text.split("\\s+").length; - slopRange += delta; - } + if (clauses.isEmpty()) { + return node; + } - // Only add slop if the original had slop, or the original was not a phrase and slop is enabled. - // Using slop for non-quoted terms is a workaround until the phrase function will accept multiple - // terms in the same position as a valid match. - boolean originalWasQuoted = QuotedFieldQueryNode.class.isAssignableFrom(node.getClass()); - if ((useSlopForTokenizedTerms && !originalWasQuoted) || originalSlop > 0) { - n = new SlopQueryNode(n, slopRange); - } + // If we made it here, the tokenizer produced output that was different from the original query node, and + // we want to build an 'OR' clause that will match either query string. + clauses.addFirst(possiblyWrapOriginalQueryNode(node)); + return new GroupQueryNode(new OrQueryNode(clauses)); + } catch (IOException e) { + throw new QueryNodeException(e); + } + } - // The tokenizer produced output that was different from the original query node, wrap the original - // node and the tokenizer produced node in a OR query. To do this properly, we need to wrap the - // original node in a slop query node if it was originally in a slop query node. - if (originalSlop > 0) { - // restore the original slop wrapper to the base node if it was present originally. - node = new SlopQueryNode(node, originalSlop); - } + /** + * Create a new query node for the specified field and tokenize text, optionally wrapping it in a SlopQueryNode if we've determined that slop is needed + * (either due to tokens being removed or there being slop on the original query we need to account for. 
+ * + * @param field + * the field for the query node + * @param tokenizedText + * the text for the query node + * @param slopNeeded + * the amount of slop to apply; no slop is added unless this is greater than zero. + * @return a new QuotedFieldQueryNode or possibly a SlopQueryNode containing the new clause. Both of these nodes will be marked as 'PROCESSED'. + */ + public QueryNode createNewQueryNode(String field, String tokenizedText, int slopNeeded) { + QueryNode newQueryNode = new QuotedFieldQueryNode(field, new UnescapedCharSequence(tokenizedText), -1, -1); + newQueryNode.setTag(NODE_PROCESSED, Boolean.TRUE); // don't process this node again. + if (slopNeeded > 0) { + newQueryNode = new SlopQueryNode(newQueryNode, slopNeeded); + newQueryNode.setTag(NODE_PROCESSED, Boolean.TRUE); // don't process this node again. + } + return newQueryNode; + } - final List clauses = new ArrayList<>(); - clauses.add(node); - clauses.add(n); + /** + * Calculate the amount of slop we need to add to a new query node for tokenized text. This is based on the number of positions observed in the + * tokenized text and the difference between the slop in the original query and the original token count. + * + * @param node + * the original query node from which the tokenized text originated. + * @param text + * the text of the original query. + * @param positionsObserved + * the number of positions observed in the tokenized text. + * @return the amount of slop we need to add to our new query clauses. 
+ */ + private int calculateSlopNeeded(QueryNode node, String text, int positionsObserved) { + int slopNeeded = positionsObserved; - node = new GroupQueryNode(new OrQueryNode(clauses)); - } - } catch (IOException e) { - throw new QueryNodeException(e); - } finally { - if (buffer != null) { - try { - buffer.close(); - } catch (IOException ex) { - logger.warn("Exception closing caching token filter: ", ex); - } + final boolean originalWasQuoted = QuotedFieldQueryNode.class.isAssignableFrom(node.getClass()); + final int originalSlop = node.getTag(ORIGINAL_SLOP) != null ? (Integer) node.getTag(ORIGINAL_SLOP) : 0; + + if ((useSlopForTokenizedTerms && !originalWasQuoted) || originalSlop > 0) { + // Adjust the slop needed based on the slop in the original query. + final int delta = originalSlop - text.split("\\s+").length; + if (delta > 0) { + slopNeeded += delta; } + } else { + slopNeeded = 0; } + return slopNeeded; + } - return node; + /** + * If the original query node was nested in a SlopQueryNode, that fact has been stored in the ORIGINAL_SLOP tag, and we'll need to re-create that slop node. + * Otherwise, return the original node unchanged. + * + * @param node + * the node to process. + * @return the node wrapped in a SlopQueryNode, if the input node originally had slop. + */ + private static QueryNode possiblyWrapOriginalQueryNode(QueryNode node) { + final int originalSlop = node.getTag(ORIGINAL_SLOP) != null ? (Integer) node.getTag(ORIGINAL_SLOP) : 0; + final QueryNode originalQueryNode = originalSlop > 0 ? new SlopQueryNode(node, originalSlop) : node; + originalQueryNode.setTag(NODE_PROCESSED, Boolean.TRUE); + return originalQueryNode; + } + + /** + * If a query node was something that has text, get the text. If the query node was already unescaped, convert it to its escaped version. This way it can + * be compared to other nodes with escapes in place. 
+ * + * @param node + * the node to extract text from + * @return the escaped version of the text from the node, null if the node had no text. + */ + private static String getEscapedBaseQueryText(QueryNode node) { + if (TextableQueryNode.class.isAssignableFrom(node.getClass())) { + final CharSequence c = ((TextableQueryNode) node).getText(); + return UnescapedCharSequence.class.isAssignableFrom(c.getClass()) ? toStringEscaped((UnescapedCharSequence) c) : c.toString(); + } + return null; } /** @@ -394,7 +424,7 @@ private QueryNode tokenizeNode(QueryNode node, final String text, final String f * string value * @return unescaped string */ - private String toStringEscaped(UnescapedCharSequence unescaped) { + private static String toStringEscaped(UnescapedCharSequence unescaped) { // non efficient implementation final StringBuilder result = new StringBuilder(); final int len = unescaped.length(); @@ -408,4 +438,68 @@ private String toStringEscaped(UnescapedCharSequence unescaped) { } return result.toString(); } + + /** + * Maintains one or more buffers for tokenized queries. During standard operation, works like a StringBuilder. If the tokenizer encounters a variant (e.g., + * zero position offset, same start and end as the previous token) appendVariant will start building a second buffer containing that variant. + */ + public static class VariantBuilder { + List> variants = new ArrayList<>(); + + public VariantBuilder append(String input, boolean appendVariant) { + return appendVariant ? 
appendVariant(input) : append(input); + } + + public VariantBuilder append(String input) { + if (variants.isEmpty()) { + variants.add(new ArrayList<>()); + } + + for (List b : variants) { + b.add(input); + } + + return this; + } + + public VariantBuilder appendVariant(String input) { + if (variants.isEmpty()) { + append(input); + } else { + + List> newVariants = new ArrayList<>(); + + for (List b : variants) { + // create a new variant of all the existing strings, replacing the last token with the input + List newVariant = new ArrayList<>(b); + newVariant.set(newVariant.size() - 1, input); + newVariants.add(newVariant); + } + + variants.addAll(newVariants); + } + + return this; + } + + public boolean hasNoVariants() { + boolean hasNoVariants = true; + for (List b : variants) { + if (!b.isEmpty()) { + // at least one of the variant buffers has something. + hasNoVariants = false; + break; + } + } + return hasNoVariants; + } + + public Set getVariants() { + Set result = new TreeSet<>(); + for (List b : variants) { + result.add(String.join(" ", b)); + } + return result; + } + } } diff --git a/warehouse/query-core/src/test/java/datawave/query/language/parser/jexl/TestLuceneToJexlQueryParser.java b/warehouse/query-core/src/test/java/datawave/query/language/parser/jexl/TestLuceneToJexlQueryParser.java index 49fe61074e..4779d42123 100644 --- a/warehouse/query-core/src/test/java/datawave/query/language/parser/jexl/TestLuceneToJexlQueryParser.java +++ b/warehouse/query-core/src/test/java/datawave/query/language/parser/jexl/TestLuceneToJexlQueryParser.java @@ -642,12 +642,18 @@ public void testSynonymTokenization() throws ParseException { TokenSearch searchUtil = TokenSearch.Factory.newInstance(); Analyzer analyzer = new StandardAnalyzer(searchUtil); parser.setAnalyzer(analyzer); - // this isn't the most realistic test, but it does verify that we don't lose the rest of the token stream - // when the first token emitted is the same as the input token. 
- Assert.assertEquals( - "(TOKFIELD == '/home/datawave/README.md' || " - + "content:phrase(TOKFIELD, termOffsetMap, '/home/datawave/readme.md', 'home/datawave/readme.md', " - + "'home', 'datawave/readme.md', 'datawave', 'readme.md', 'readme', 'md'))", - parseQuery("TOKFIELD:\"/home/datawave/README.md\"")); + // @formatter:off + String expected = "(" + + "TOKFIELD == '/home/datawave/README.md' || " + + "TOKFIELD == 'datawave' || " + + "TOKFIELD == 'datawave/readme.md' || " + + "TOKFIELD == 'home' || " + + "TOKFIELD == 'home/datawave/readme.md' || " + + "TOKFIELD == 'md' || " + + "TOKFIELD == 'readme' || " + + "TOKFIELD == 'readme.md'" + + ")"; + // @formatter:on + Assert.assertEquals(expected, parseQuery("TOKFIELD:\"/home/datawave/README.md\"")); } } diff --git a/warehouse/query-core/src/test/java/datawave/query/language/parser/jexl/TestLuceneToJexlQueryParserVariants.java b/warehouse/query-core/src/test/java/datawave/query/language/parser/jexl/TestLuceneToJexlQueryParserVariants.java new file mode 100644 index 0000000000..53d416ec7b --- /dev/null +++ b/warehouse/query-core/src/test/java/datawave/query/language/parser/jexl/TestLuceneToJexlQueryParserVariants.java @@ -0,0 +1,136 @@ +package datawave.query.language.parser.jexl; + +import static org.junit.Assert.assertEquals; + +import org.apache.lucene.analysis.CharArraySet; +import org.junit.Before; +import org.junit.Test; + +import com.google.common.collect.Sets; + +import datawave.ingest.data.tokenize.DefaultTokenSearch; +import datawave.ingest.data.tokenize.StandardAnalyzer; +import datawave.ingest.data.tokenize.TokenSearch; +import datawave.query.language.parser.ParseException; +import datawave.query.language.tree.QueryNode; +import datawave.query.language.tree.ServerHeadNode; + +public class TestLuceneToJexlQueryParserVariants { + private LuceneToJexlQueryParser parser; + + @Before + public void setUp() { + CharArraySet stopwords = new CharArraySet(1, true); + stopwords.add("STOP"); + + // TokenSearch is used 
for ingesting variants, and generally should never be used at query time + // but is good for simulating the case where we want variants at query time. + TokenSearch tokenSearch = TokenSearch.Factory.newInstance(DefaultTokenSearch.class.getName(), stopwords); + StandardAnalyzer analyzer = new StandardAnalyzer(tokenSearch); + parser = new LuceneToJexlQueryParser(); + parser.setSkipTokenizeUnfieldedFields(Sets.newHashSet("noToken")); + parser.setTokenizedFields(Sets.newHashSet("tokField")); + parser.setAnalyzer(analyzer); + } + + @Test + public void testVariantSingleTerm() throws ParseException { + assertEquals("(TOKFIELD == 'foo@bar.com' || TOKFIELD == '@bar.com' || TOKFIELD == 'foo')", parseQuery("TOKFIELD:foo@bar.com")); + } + + @Test + public void testVariantStopword() throws ParseException { + // @formatter:off + String expected = "(" + + "content:phrase(TOKFIELD, termOffsetMap, 'email', 'STOP', 'foo@bar.com', 'baz') || " + + "content:phrase(TOKFIELD, termOffsetMap, 'email', '@bar.com', 'baz') || " + + "content:phrase(TOKFIELD, termOffsetMap, 'email', 'foo', 'baz') || " + + "content:phrase(TOKFIELD, termOffsetMap, 'email', 'foo@bar.com', 'baz')" + + ")"; + // @formatter:on + assertEquals(expected, parseQuery("TOKFIELD:\"email STOP foo@bar.com baz\"")); + } + + @Test + public void testVariantSlopStopword() throws ParseException { + // the split term `wi-fi` increases the slop + // @formatter:off + String expected = "(" + + "content:within(TOKFIELD, 6, termOffsetMap, 'email', 'STOP', 'foo@bar.com', 'wi-fi') || " + + "content:within(TOKFIELD, 7, termOffsetMap, 'email', '@bar.com', 'wi', 'fi') || " + + "content:within(TOKFIELD, 7, termOffsetMap, 'email', 'foo', 'wi', 'fi') || " + + "content:within(TOKFIELD, 7, termOffsetMap, 'email', 'foo@bar.com', 'wi', 'fi')" + + ")"; + // @formatter:on + + assertEquals(expected, parseQuery("TOKFIELD:\"email STOP foo@bar.com wi-fi\"~6")); + } + + @Test + public void testVariantsEnd() throws ParseException { + // @formatter:off + 
String expected = "(" + + "content:phrase(TOKFIELD, termOffsetMap, 'email', 'to', 'address', 'foo@bar.com') || " + + "content:phrase(TOKFIELD, termOffsetMap, 'email', 'to', 'address', '@bar.com') || " + + "content:phrase(TOKFIELD, termOffsetMap, 'email', 'to', 'address', 'foo')" + + ")"; + // @formatter:on + assertEquals(expected, parseQuery("TOKFIELD:\"email to address foo@bar.com\"")); + } + + @Test + public void testVariantsBegin() throws ParseException { + // @formatter:off + String expected = "(" + + "content:phrase(TOKFIELD, termOffsetMap, 'foo@bar.com', 'email', 'from', 'address') || " + + "content:phrase(TOKFIELD, termOffsetMap, '@bar.com', 'email', 'from', 'address') || " + + "content:phrase(TOKFIELD, termOffsetMap, 'foo', 'email', 'from', 'address')" + + ")"; + // @formatter:on + + assertEquals(expected, parseQuery("TOKFIELD:\"foo@bar.com email from address\"")); + } + + @Test + public void testVariantsMiddle() throws ParseException { + // @formatter:off + String expected = "(" + + "content:phrase(TOKFIELD, termOffsetMap, 'email', 'from', 'foo@bar.com', 'address') || " + + "content:phrase(TOKFIELD, termOffsetMap, 'email', 'from', '@bar.com', 'address') || " + + "content:phrase(TOKFIELD, termOffsetMap, 'email', 'from', 'foo', 'address')" + + ")"; + // @formatter:on + assertEquals(expected, parseQuery("TOKFIELD:\"email from foo@bar.com address\"")); + } + + @Test + public void testVariantsMultiple() throws ParseException { + // @formatter:off + String expected = "(" + + "content:phrase(TOKFIELD, termOffsetMap, 'from', 'foo@bar.com', 'to', 'bar@foo.com', 'address') || " + + "content:phrase(TOKFIELD, termOffsetMap, 'from', '@bar.com', 'to', '@foo.com', 'address') || " + + "content:phrase(TOKFIELD, termOffsetMap, 'from', '@bar.com', 'to', 'bar', 'address') || " + + "content:phrase(TOKFIELD, termOffsetMap, 'from', '@bar.com', 'to', 'bar@foo.com', 'address') || " + + "content:phrase(TOKFIELD, termOffsetMap, 'from', 'foo', 'to', '@foo.com', 'address') || " + + 
"content:phrase(TOKFIELD, termOffsetMap, 'from', 'foo', 'to', 'bar', 'address') || " + + "content:phrase(TOKFIELD, termOffsetMap, 'from', 'foo', 'to', 'bar@foo.com', 'address') || " + + "content:phrase(TOKFIELD, termOffsetMap, 'from', 'foo@bar.com', 'to', '@foo.com', 'address') || " + + "content:phrase(TOKFIELD, termOffsetMap, 'from', 'foo@bar.com', 'to', 'bar', 'address')" + + ")"; + // @formatter:on + assertEquals(expected, parseQuery("TOKFIELD:\"from foo@bar.com to bar@foo.com address\"")); + } + + private String parseQuery(String query) throws ParseException { + String parsedQuery = null; + try { + QueryNode node = parser.parse(query); + if (node instanceof ServerHeadNode) { + parsedQuery = node.getOriginalQuery(); + } + } catch (RuntimeException e) { + throw new ParseException(e); + } + return parsedQuery; + } +}