From 55e22cf7c422cf3c4c1ace44d98b2ea1ecda5f15 Mon Sep 17 00:00:00 2001 From: Moriarty <22225248+apmoriarty@users.noreply.github.com> Date: Wed, 2 Oct 2024 12:10:12 +0000 Subject: [PATCH 1/3] Added visitor that rewrites regex terms without affecting the overall executability of the query. Added visitor that detects if a term or subtree is executable, or an 'anchor'. --- .../jexl/visitors/RegexRewritePattern.java | 51 ++ .../jexl/visitors/RewriteRegexVisitor.java | 225 +++++++++ .../pushdown/AnchorDetectionVisitor.java | 208 +++++++++ .../query/planner/DefaultQueryPlanner.java | 63 ++- .../test/java/datawave/query/ShapesTest.java | 23 + .../visitors/RewriteRegexVisitorTest.java | 442 ++++++++++++++++++ .../pushdown/AnchorDetectionVisitorTest.java | 274 +++++++++++ .../datawave/query/QueryLogicFactory.xml | 19 + 8 files changed, 1303 insertions(+), 2 deletions(-) create mode 100644 warehouse/query-core/src/main/java/datawave/query/jexl/visitors/RegexRewritePattern.java create mode 100644 warehouse/query-core/src/main/java/datawave/query/jexl/visitors/RewriteRegexVisitor.java create mode 100644 warehouse/query-core/src/main/java/datawave/query/jexl/visitors/pushdown/AnchorDetectionVisitor.java create mode 100644 warehouse/query-core/src/test/java/datawave/query/jexl/visitors/RewriteRegexVisitorTest.java create mode 100644 warehouse/query-core/src/test/java/datawave/query/jexl/visitors/pushdown/AnchorDetectionVisitorTest.java diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/RegexRewritePattern.java b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/RegexRewritePattern.java new file mode 100644 index 00000000000..e106aed3886 --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/RegexRewritePattern.java @@ -0,0 +1,51 @@ +package datawave.query.jexl.visitors; + +import org.apache.commons.lang3.builder.EqualsBuilder; +import org.apache.commons.lang3.builder.HashCodeBuilder; + +/** + * There may exist 
certain field-pattern combinations that you always want to rewrite + */ +public class RegexRewritePattern { + private String field; + private String literal; + + public RegexRewritePattern(String field, String literal) { + this.field = field; + this.literal = literal; + } + + public boolean matches(String field, String literal) { + return this.field.equals(field) && this.literal.equals(literal); + } + + public String getField() { + return field; + } + + public void setField(String field) { + this.field = field; + } + + public String getLiteral() { + return literal; + } + + public void setLiteral(String literal) { + this.literal = literal; + } + + @Override + public boolean equals(Object o) { + if (o instanceof RegexRewritePattern) { + RegexRewritePattern other = (RegexRewritePattern) o; + return new EqualsBuilder().append(field, other.field).append(literal, other.literal).isEquals(); + } + return false; + } + + @Override + public int hashCode() { + return new HashCodeBuilder().append(field).append(literal).hashCode(); + } +} diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/RewriteRegexVisitor.java b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/RewriteRegexVisitor.java new file mode 100644 index 00000000000..aaefde6d346 --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/RewriteRegexVisitor.java @@ -0,0 +1,225 @@ +package datawave.query.jexl.visitors; + +import java.util.Collections; +import java.util.LinkedList; +import java.util.List; +import java.util.Set; + +import org.apache.commons.jexl3.parser.ASTAndNode; +import org.apache.commons.jexl3.parser.ASTERNode; +import org.apache.commons.jexl3.parser.JexlNode; +import org.apache.commons.jexl3.parser.JexlNodes; + +import datawave.query.Constants; +import datawave.query.jexl.JexlASTHelper; +import datawave.query.jexl.JexlNodeFactory; +import datawave.query.jexl.NodeTypeCount; +import datawave.query.jexl.nodes.QueryPropertyMarker; +import 
datawave.query.jexl.visitors.pushdown.AnchorDetectionVisitor; + +/** + * Rewrites regex terms as filter functions provided an anchor exists. + *

+ * An anchor is an executable term or subtree. + *

+ * This visitor supports several configuration options + *

+ * IncludeFields + *

+ * Limit rewrite operations to the specified fields + *

+ *

+ * ExcludeFields + *

+ * Rewrite operations will not be applied to the specified fields. This option overrides any 'include fields' but can be superseded by + * {@link RegexRewritePattern} + *

+ *

+ * RegexRewritePattern + *

+ * In very specific cases one may want to always attempt a regex rewrite, regardless of any previously specified include or exclude fields + *

+ */ +public class RewriteRegexVisitor extends ShortCircuitBaseVisitor { + + private final Set indexedFields; + private final Set indexOnlyFields; + + private final Set includeFields; + private final Set excludeFields; + + private final Set patterns; + + private final AnchorDetectionVisitor anchorDetectionVisitor; + + /** + * Constructor with minimal args + * + * @param indexedFields + * the set of indexed fields + * @param indexOnlyFields + * the set of index only fields + */ + public RewriteRegexVisitor(Set indexedFields, Set indexOnlyFields) { + this(indexedFields, indexOnlyFields, Collections.emptySet(), Collections.emptySet(), Collections.emptySet()); + } + + /** + * Constructor that also accepts include fields, exclude fields and rewrite patterns + * + * @param indexedFields + * the set of indexed fields + * @param indexOnlyFields + * the set of index only fields + */ + public RewriteRegexVisitor(Set indexedFields, Set indexOnlyFields, Set includeFields, Set excludeFields, + Set patterns) { + this.indexedFields = indexedFields; + this.indexOnlyFields = indexOnlyFields; + this.includeFields = includeFields; + this.excludeFields = excludeFields; + this.patterns = patterns; + + this.anchorDetectionVisitor = new AnchorDetectionVisitor(indexedFields, indexOnlyFields); + } + + /** + * Static entry point + * + * @param node + * the query or subtree + * @param indexedFields + * the set of indexed fields + * @param indexOnlyFields + * the set of index only fields + * @return the modified tree + */ + public static JexlNode rewrite(JexlNode node, Set indexedFields, Set indexOnlyFields) { + return rewrite(node, indexedFields, indexOnlyFields, Collections.emptySet(), Collections.emptySet(), Collections.emptySet()); + } + + public static JexlNode rewrite(JexlNode node, Set indexedFields, Set indexOnlyFields, Set includeFields, Set excludeFields, + Set patterns) { + RewriteRegexVisitor visitor = new RewriteRegexVisitor(indexedFields, indexOnlyFields, includeFields, excludeFields, patterns); + node.jjtAccept(visitor, null); + 
return node; + } + + // union is not overridden here + + @Override + public Object visit(ASTAndNode node, Object data) { + + if (data instanceof Boolean) { + return data; // short circuit repeated post-traversals + } + + if (QueryPropertyMarker.findInstance(node).isAnyType()) { + return data; // do not descend into markers + } + + // enforce a post-order traversal for maximum rewrite + node.childrenAccept(this, data); + + List anchorCandidates = new LinkedList<>(); + List anchorNonCandidates = new LinkedList<>(); + List otherCandidates = new LinkedList<>(); + + for (int i = 0; i < node.jjtGetNumChildren(); i++) { + JexlNode child = node.jjtGetChild(i); + + // this seems expensive, a visitor that returned raw counts, depth, and complexity would be nice to have + NodeTypeCount counts = NodeTypeCountVisitor.countNodes(child, ASTERNode.class); + + if (anchorDetectionVisitor.isAnchor(child)) { + if (counts.getTotal(ASTERNode.class) > 0) { + anchorCandidates.add(child); + } else { + anchorNonCandidates.add(child); + } + } else if (counts.getTotal(ASTERNode.class) > 0) { + otherCandidates.add(child); + } + } + + if (!anchorCandidates.isEmpty() || !anchorNonCandidates.isEmpty()) { + + if (!anchorNonCandidates.isEmpty()) { + // rewrite all anchor candidates + for (JexlNode candidate : anchorCandidates) { + candidate.jjtAccept(this, true); + } + } else { + // rewrite all anchor candidates except the last one, to preserve executability + for (int i = 0; i < anchorCandidates.size() - 1; i++) { + anchorCandidates.get(i).jjtAccept(this, true); + } + } + + // if any anchor exists, rewrite other candidates + for (JexlNode otherCandidate : otherCandidates) { + otherCandidate.jjtAccept(this, true); + } + } + + return data; + } + + @Override + public Object visit(ASTERNode node, Object data) { + String field = JexlASTHelper.getIdentifier(node); + + if (isLegalRewrite(field, data)) { + + // once legality of rewrite is established make sure it's not filtered + String literal = (String) 
JexlASTHelper.getLiteralValue(node); + + if (isNodeRewritableFromRules(field, literal)) { + JexlNode rewrite = JexlNodeFactory.buildFunctionNode("filter", "includeRegex", field, literal); + JexlNodes.replaceChild(node.jjtGetParent(), node, rewrite); + } + } + + return data; + } + + private boolean isLegalRewrite(String field, Object data) { + // never rewrite ANY_FIELD or index-only fields + if (field.equals(Constants.ANY_FIELD) || indexOnlyFields.contains(field)) { + return false; + } + + // 1. anchor exists elsewhere + // 2. field is not indexed + return data instanceof Boolean || !indexedFields.contains(field); + } + + /** + * Determine if the node can be rewritten given any configured rules (include fields, exclude fields, patterns) + * + * @param field + * the field + * @param literal + * the literal + * @return true if the node can be rewritten + */ + private boolean isNodeRewritableFromRules(String field, String literal) { + // check patterns first because they supersede include/exclude rules + for (RegexRewritePattern pattern : patterns) { + if (pattern.matches(field, literal)) { + return true; + } + } + + // exclude fields beat include fields + if (!excludeFields.isEmpty() && excludeFields.contains(field)) { + return false; + } + + if (!includeFields.isEmpty()) { + return includeFields.contains(field); + } + + return true; + } +} diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/pushdown/AnchorDetectionVisitor.java b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/pushdown/AnchorDetectionVisitor.java new file mode 100644 index 00000000000..018658b1eb2 --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/pushdown/AnchorDetectionVisitor.java @@ -0,0 +1,208 @@ +package datawave.query.jexl.visitors.pushdown; + +import java.util.Set; + +import org.apache.commons.jexl3.parser.ASTAndNode; +import org.apache.commons.jexl3.parser.ASTAssignment; +import 
org.apache.commons.jexl3.parser.ASTEQNode; +import org.apache.commons.jexl3.parser.ASTERNode; +import org.apache.commons.jexl3.parser.ASTFunctionNode; +import org.apache.commons.jexl3.parser.ASTGENode; +import org.apache.commons.jexl3.parser.ASTGTNode; +import org.apache.commons.jexl3.parser.ASTJexlScript; +import org.apache.commons.jexl3.parser.ASTLENode; +import org.apache.commons.jexl3.parser.ASTLTNode; +import org.apache.commons.jexl3.parser.ASTNENode; +import org.apache.commons.jexl3.parser.ASTNRNode; +import org.apache.commons.jexl3.parser.ASTNotNode; +import org.apache.commons.jexl3.parser.ASTOrNode; +import org.apache.commons.jexl3.parser.ASTReference; +import org.apache.commons.jexl3.parser.ASTReferenceExpression; +import org.apache.commons.jexl3.parser.JexlNode; + +import datawave.query.jexl.JexlASTHelper; +import datawave.query.jexl.nodes.QueryPropertyMarker; +import datawave.query.jexl.visitors.ShortCircuitBaseVisitor; + +/** + * Determines if a subtree is an anchor for a given query + *

+ * An anchor is defined as an executable leaf or subtree. + */ +public class AnchorDetectionVisitor extends ShortCircuitBaseVisitor { + + private final Set indexedFields; + private final Set indexOnlyFields; + + /** + * Default constructor + * + * @param indexedFields + * the set of indexed query fields + * @param indexOnlyFields + * the set of index only query fields + */ + public AnchorDetectionVisitor(Set indexedFields, Set indexOnlyFields) { + this.indexedFields = indexedFields; + this.indexOnlyFields = indexOnlyFields; + } + + public boolean isAnchor(JexlNode node) { + return (boolean) node.jjtAccept(this, null); + } + + // pass through nodes + + @Override + public Object visit(ASTJexlScript node, Object data) { + return node.jjtGetChild(0).jjtAccept(this, data); + } + + @Override + public Object visit(ASTReference node, Object data) { + return node.jjtGetChild(0).jjtAccept(this, data); + } + + @Override + public Object visit(ASTReferenceExpression node, Object data) { + return node.jjtGetChild(0).jjtAccept(this, data); + } + + @Override + public Object visit(ASTAssignment node, Object data) { + return false; + } + + @Override + public Object visit(ASTNotNode node, Object data) { + return false; + } + + // junction nodes + + /** + * An OrNode is considered an anchor if and only if all children are anchor nodes + * + * @param node + * a JexlNode + * @param data + * an Object + * @return True if this node is an anchor + */ + @Override + public Object visit(ASTOrNode node, Object data) { + for (int i = 0; i < node.jjtGetNumChildren(); i++) { + boolean childIsAnchor = (boolean) node.jjtGetChild(i).jjtAccept(this, data); + if (!childIsAnchor) { + return false; + } + } + return true; + } + + /** + * An AndNode is considered an anchor if at least one child node is an anchor + * + * @param node + * a JexlNode + * @param data + * an Object + * @return True if this node is an anchor + */ + @Override + public Object visit(ASTAndNode node, Object data) { + 
QueryPropertyMarker.Instance instance = QueryPropertyMarker.findInstance(node); + if (instance.isAnyType()) { + return visitMarker(instance); + } + + for (int i = 0; i < node.jjtGetNumChildren(); i++) { + boolean isChildAnchor = (boolean) node.jjtGetChild(i).jjtAccept(this, data); + if (isChildAnchor) { + return true; + } + } + return false; + } + + // leaf nodes + + @Override + public Object visit(ASTEQNode node, Object data) { + return visitLeaf(node); + } + + @Override + public Object visit(ASTNENode node, Object data) { + return visitLeaf(node); + } + + @Override + public Object visit(ASTLTNode node, Object data) { + return visitLeaf(node); + } + + @Override + public Object visit(ASTGTNode node, Object data) { + return visitLeaf(node); + } + + @Override + public Object visit(ASTLENode node, Object data) { + return visitLeaf(node); + } + + @Override + public Object visit(ASTGENode node, Object data) { + return visitLeaf(node); + } + + @Override + public Object visit(ASTERNode node, Object data) { + return visitLeaf(node); + } + + @Override + public Object visit(ASTNRNode node, Object data) { + return visitLeaf(node); + } + + @Override + public Object visit(ASTFunctionNode node, Object data) { + return false; + } + + private boolean visitLeaf(JexlNode node) { + String field = JexlASTHelper.getIdentifier(node, true); + if (indexedFields.contains(field) || indexOnlyFields.contains(field)) { + if (node instanceof ASTEQNode || node instanceof ASTNENode) { + Object value = JexlASTHelper.getLiteralValue(node); + return value != null; + } + return true; + } + return false; + } + + private Object visitMarker(QueryPropertyMarker.Instance instance) { + + if (instance == null || instance.getType() == null) { + return false; + } + + // might need to handle double markers, such as delayed bounded ranges + + switch (instance.getType()) { + case BOUNDED_RANGE: + case DELAYED: + case EVALUATION_ONLY: + case EXCEEDED_OR: + case EXCEEDED_TERM: + case EXCEEDED_VALUE: + return true; 
+ default: + return false; + } + } + +} diff --git a/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java b/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java index 6df09d7646c..aaf34561c9f 100644 --- a/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java +++ b/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java @@ -142,8 +142,10 @@ import datawave.query.jexl.visitors.RebuildingVisitor; import datawave.query.jexl.visitors.RegexFunctionVisitor; import datawave.query.jexl.visitors.RegexIndexExpansionVisitor; +import datawave.query.jexl.visitors.RegexRewritePattern; import datawave.query.jexl.visitors.RewriteNegationsVisitor; import datawave.query.jexl.visitors.RewriteNullFunctionsVisitor; +import datawave.query.jexl.visitors.RewriteRegexVisitor; import datawave.query.jexl.visitors.SetMembershipVisitor; import datawave.query.jexl.visitors.SortedUIDsRequiredVisitor; import datawave.query.jexl.visitors.TermCountingVisitor; @@ -304,6 +306,14 @@ public class DefaultQueryPlanner extends QueryPlanner implements Cloneable { */ protected boolean showReducedQueryPrune = true; + /** + * Controls optimistic rewriting of regex terms as filter functions, preserving overall query executability + */ + protected boolean rewriteRegexTerms = false; + protected Set regexIncludeFields; + protected Set regexExcludeFields; + protected Set regexRewritePatterns; + // handles boilerplate operations that surround a visitor's execution (e.g., timers, logging, validating) private TimedVisitorManager visitorManager = new TimedVisitorManager(); @@ -778,7 +788,10 @@ protected ASTJexlScript updateQueryTree(ScannerFactory scannerFactory, MetadataH // | Post Query Model Expansion Clean Up | // +-------------------------------------+ - Set indexOnlyFields = loadIndexedFields(config); + Set indexOnlyFields = loadIndexOnlyFields(config); + + Set indexedFields = 
loadIndexedFields(config); + config.setIndexedFields(indexedFields); if (!indexOnlyFields.isEmpty()) { // filter:includeRegex and filter:excludeRegex functions cannot be run against index-only fields, clean that up @@ -809,6 +822,11 @@ protected ASTJexlScript updateQueryTree(ScannerFactory scannerFactory, MetadataH config.setQueryTree(timedEnforceUniqueDisjunctionsWithinExpressions(timers, config.getQueryTree())); } + // rewrite regex nodes, optimistically + if (rewriteRegexTerms) { + RewriteRegexVisitor.rewrite(config.getQueryTree(), indexedFields, indexOnlyFields, regexIncludeFields, regexExcludeFields, regexRewritePatterns); + } + if (disableBoundedLookup) { // protection mechanism. If we disable bounded ranges and have a // LT,GT or ER node, we should expand it @@ -1219,7 +1237,7 @@ protected QueryModel loadQueryModel(ShardQueryConfiguration config) { */ - protected Set loadIndexedFields(ShardQueryConfiguration config) { + protected Set loadIndexOnlyFields(ShardQueryConfiguration config) { try { return metadataHelper.getIndexOnlyFields(config.getDatatypeFilter()); } catch (TableNotFoundException e) { @@ -1228,6 +1246,15 @@ protected Set loadIndexedFields(ShardQueryConfiguration config) { } } + protected Set loadIndexedFields(ShardQueryConfiguration config) { + try { + return metadataHelper.getIndexedFields(config.getDatatypeFilter()); + } catch (TableNotFoundException e) { + QueryException qe = new QueryException(DatawaveErrorCode.INDEX_ONLY_FIELDS_RETRIEVAL_ERROR, e); + throw new DatawaveFatalQueryException(qe); + } + } + /** * Loads expansion fields filtered by datatype. 
If an error occurs that error is rethrown as a {@link DatawaveFatalQueryException} * @@ -3215,6 +3242,38 @@ public static void setMaxTermsToPrint(int maxTermsToPrint) { DefaultQueryPlanner.maxTermsToPrint = maxTermsToPrint; } + public boolean isRewriteRegexTerms() { + return rewriteRegexTerms; + } + + public void setRewriteRegexTerms(boolean rewriteRegexTerms) { + this.rewriteRegexTerms = rewriteRegexTerms; + } + + public Set getRegexIncludeFields() { + return regexIncludeFields; + } + + public void setRegexIncludeFields(Set regexIncludeFields) { + this.regexIncludeFields = regexIncludeFields; + } + + public Set getRegexExcludeFields() { + return regexExcludeFields; + } + + public void setRegexExcludeFields(Set regexExcludeFields) { + this.regexExcludeFields = regexExcludeFields; + } + + public Set getRegexRewritePatterns() { + return regexRewritePatterns; + } + + public void setRegexRewritePatterns(Set regexRewritePatterns) { + this.regexRewritePatterns = regexRewritePatterns; + } + /** * Given a date, truncate it to year, month, date and increment the day by one to determine the following day. 
* diff --git a/warehouse/query-core/src/test/java/datawave/query/ShapesTest.java b/warehouse/query-core/src/test/java/datawave/query/ShapesTest.java index 8149117800d..c51fe8154a6 100644 --- a/warehouse/query-core/src/test/java/datawave/query/ShapesTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/ShapesTest.java @@ -888,4 +888,27 @@ public void testSortQueryBeforeGlobalIndex() throws Exception { } } + @Test + public void testRewriteRegexFromIncludes() throws Exception { + withQuery("ONLY_HEX == 'hexa' && TYPE =~ 'reg.*'"); + withExpected(Sets.newHashSet(ShapesIngest.hexagonUid)); + planAndExecuteQuery(); + assertPlannedQuery("ONLY_HEX == 'hexa' && filter:includeRegex(TYPE, 'reg.*')"); + } + + @Test + public void testDoNotRewriteRegexWithExcludedField() throws Exception { + withQuery("ONLY_HEX == 'hexa' && SHAPE =~ 'hex.*'"); + withExpected(Sets.newHashSet(ShapesIngest.hexagonUid)); + planAndExecuteQuery(); + assertPlannedQuery("ONLY_HEX == 'hexa' && ((_Delayed_ = true) && (SHAPE =~ 'hex.*'))"); + } + + @Test + public void testRewriteRegexWithExcludedFieldBecauseOfPatternMatch() throws Exception { + withQuery("ONLY_HEX == 'hexa' && SHAPE =~ 'hexag.*'"); + withExpected(Sets.newHashSet(ShapesIngest.hexagonUid)); + planAndExecuteQuery(); + assertPlannedQuery("ONLY_HEX == 'hexa' && filter:includeRegex(SHAPE, 'hexag.*')"); + } } diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/RewriteRegexVisitorTest.java b/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/RewriteRegexVisitorTest.java new file mode 100644 index 00000000000..fd6add4b908 --- /dev/null +++ b/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/RewriteRegexVisitorTest.java @@ -0,0 +1,442 @@ +package datawave.query.jexl.visitors; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.fail; + +import java.util.HashSet; +import java.util.Set; + +import 
org.apache.commons.jexl3.parser.ASTJexlScript; +import org.apache.commons.jexl3.parser.ParseException; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import datawave.query.jexl.JexlASTHelper; + +public class RewriteRegexVisitorTest { + + private final Set indexedFields = Set.of("F", "F2", "IO", "IO2"); + private final Set indexOnlyFields = Set.of("IO", "IO2"); + + private final Set includeFields = new HashSet<>(); + private final Set excludeFields = new HashSet<>(); + + private final Set patterns = new HashSet<>(); + + @BeforeEach + public void beforeEach() { + includeFields.clear(); + excludeFields.clear(); + patterns.clear(); + } + + // A and regex + @Test + public void testSingleTermAndRegex() { + // term and indexed regex + test("F == 'a' && F =~ 'ba.*'", "F == 'a' && filter:includeRegex(F, 'ba.*')"); + test("IO == 'a' && F =~ 'ba.*'", "IO == 'a' && filter:includeRegex(F, 'ba.*')"); + test("NA == 'a' && F =~ 'ba.*'"); + + // term and index only regex is never rewritten + test("F == 'a' && IO =~ 'ba.*'"); + test("IO == 'a' && IO =~ 'ba.*'"); + test("NA == 'a' && IO =~ 'ba.*'"); + + // term and non-indexed regex is always rewritten + test("F == 'a' && NA =~ 'ba.*'", "F == 'a' && filter:includeRegex(NA, 'ba.*')"); + test("IO == 'a' && NA =~ 'ba.*'", "IO == 'a' && filter:includeRegex(NA, 'ba.*')"); + test("NA == 'a' && NA =~ 'ba.*'", "NA == 'a' && filter:includeRegex(NA, 'ba.*')"); + } + + // A or regex + @Test + public void testSingleTermOrRegex() { + // term or indexed regex is never rewritten + test("F == 'a' || F =~ 'ba.*'"); + test("IO == 'a' || F =~ 'ba.*'"); + test("NA == 'a' || F =~ 'ba.*'"); + + // term or index only regex is never rewritten + test("F == 'a' || IO =~ 'ba.*'"); + test("IO == 'a' || IO =~ 'ba.*'"); + test("NA == 'a' || IO =~ 'ba.*'"); + + // top level union with non-indexed regex is a full table scan either way, the non-indexed regex is still rewritten + test("F == 'a' || NA =~ 'ba.*'", "F == 'a' || filter:includeRegex(NA, 'ba.*')"); + test("IO 
== 'a' || NA =~ 'ba.*'", "IO == 'a' || filter:includeRegex(NA, 'ba.*')"); + test("NA == 'a' || NA =~ 'ba.*'", "NA == 'a' || filter:includeRegex(NA, 'ba.*')"); + } + + // (A and B) or regex + @Test + public void testNestedIntersectionOrRegex() { + // all combinations of nested intersection and indexed regex + test("(F == 'a' && F == 'b') || F =~ 'ba.*'"); + test("(F == 'a' && IO == 'b') || F =~ 'ba.*'"); + test("(F == 'a' && NA == 'b') || F =~ 'ba.*'"); + test("(IO == 'a' && IO == 'b') || F =~ 'ba.*'"); + test("(IO == 'a' && NA == 'b') || F =~ 'ba.*'"); + test("(NA == 'a' && NA == 'b') || F =~ 'ba.*'"); + + // all combinations of nested intersection and index only regex + test("(F == 'a' && F == 'b') || IO =~ 'ba.*'"); + test("(F == 'a' && IO == 'b') || IO =~ 'ba.*'"); + test("(F == 'a' && NA == 'b') || IO =~ 'ba.*'"); + test("(IO == 'a' && IO == 'b') || IO =~ 'ba.*'"); + test("(IO == 'a' && NA == 'b') || IO =~ 'ba.*'"); + test("(NA == 'a' && NA == 'b') || IO =~ 'ba.*'"); + + // the input queries are non-executable, non-indexed field still gets rewritten + // all combinations of nested intersection and non-indexed regex + test("(F == 'a' && F == 'b') || NA =~ 'ba.*'", "(F == 'a' && F == 'b') || filter:includeRegex(NA, 'ba.*')"); + test("(F == 'a' && IO == 'b') || NA =~ 'ba.*'", "(F == 'a' && IO == 'b') || filter:includeRegex(NA, 'ba.*')"); + test("(F == 'a' && NA == 'b') || NA =~ 'ba.*'", "(F == 'a' && NA == 'b') || filter:includeRegex(NA, 'ba.*')"); + test("(IO == 'a' && IO == 'b') || NA =~ 'ba.*'", "(IO == 'a' && IO == 'b') || filter:includeRegex(NA, 'ba.*')"); + test("(IO == 'a' && NA == 'b') || NA =~ 'ba.*'", "(IO == 'a' && NA == 'b') || filter:includeRegex(NA, 'ba.*')"); + test("(NA == 'a' && NA == 'b') || Na =~ 'ba.*'", "(NA == 'a' && NA == 'b') || filter:includeRegex(Na, 'ba.*')"); + } + + // (A or B) and regex + @Test + public void testNestedUnionAndRegex() { + // all combinations of nested union and indexed regex + test("(F == 'a' || F == 'b') && F 
=~ 'ba.*'", "(F == 'a' || F == 'b') && filter:includeRegex(F, 'ba.*')"); + test("(F == 'a' || IO == 'b') && F =~ 'ba.*'", "(F == 'a' || IO == 'b') && filter:includeRegex(F, 'ba.*')"); + test("(F == 'a' || NA == 'b') && F =~ 'ba.*'"); + test("(IO == 'a' || IO == 'b') && F =~ 'ba.*'", "(IO == 'a' || IO == 'b') && filter:includeRegex(F, 'ba.*')"); + test("(IO == 'a' || NA == 'b') && F =~ 'ba.*'"); + test("(NA == 'a' || NA == 'b') && F =~ 'ba.*'"); + + // all combinations of nested union and index only regex + test("(F == 'a' || F == 'b') && IO =~ 'ba.*'"); + test("(F == 'a' || IO == 'b') && IO =~ 'ba.*'"); + test("(F == 'a' || NA == 'b') && IO =~ 'ba.*'"); + test("(IO == 'a' || IO == 'b') && IO =~ 'ba.*'"); + test("(IO == 'a' || NA == 'b') && IO =~ 'ba.*'"); + test("(NA == 'a' || NA == 'b') && IO =~ 'ba.*'"); + + // all combinations of nested union and non-indexed regex + test("(F == 'a' || F == 'b') && NA =~ 'ba.*'", "(F == 'a' || F == 'b') && filter:includeRegex(NA, 'ba.*')"); + test("(F == 'a' || IO == 'b') && NA =~ 'ba.*'", "(F == 'a' || IO == 'b') && filter:includeRegex(NA, 'ba.*')"); + test("(F == 'a' || NA == 'b') && NA =~ 'ba.*'", "(F == 'a' || NA == 'b') && filter:includeRegex(NA, 'ba.*')"); + test("(IO == 'a' || IO == 'b') && NA =~ 'ba.*'", "(IO == 'a' || IO == 'b') && filter:includeRegex(NA, 'ba.*')"); + test("(IO == 'a' || NA == 'b') && NA =~ 'ba.*'", "(IO == 'a' || NA == 'b') && filter:includeRegex(NA, 'ba.*')"); + test("(NA == 'a' || NA == 'b') && Na =~ 'ba.*'", "(NA == 'a' || NA == 'b') && filter:includeRegex(Na, 'ba.*')"); + } + + // A and (B or regex) + @Test + public void testIntersectionWithNestedUnionWithSingleRegex() { + // top level indexed term, variable indexed state for nested term, indexed regex + test("F == 'a' && (F == 'b' || F =~ 'ba.*')", "F == 'a' && (F == 'b' || filter:includeRegex(F, 'ba.*'))"); + test("F == 'a' && (IO == 'b' || F =~ 'ba.*')", "F == 'a' && (IO == 'b' || filter:includeRegex(F, 'ba.*'))"); + test("F == 'a' 
&& (NA == 'b' || F =~ 'ba.*')", "F == 'a' && (NA == 'b' || filter:includeRegex(F, 'ba.*'))"); + + // top level indexed term, variable indexed state for nested term, index only regex + test("F == 'a' && (F == 'b' || IO =~ 'ba.*')"); + test("F == 'a' && (IO == 'b' || IO =~ 'ba.*')"); + test("F == 'a' && (NA == 'b' || IO =~ 'ba.*')"); + + // top level indexed term, variable indexed state for nested term, non-indexed regex + test("F == 'a' && (F == 'b' || NA =~ 'ba.*')", "F == 'a' && (F == 'b' || filter:includeRegex(NA, 'ba.*'))"); + test("F == 'a' && (IO == 'b' || NA =~ 'ba.*')", "F == 'a' && (IO == 'b' || filter:includeRegex(NA, 'ba.*'))"); + test("F == 'a' && (NA == 'b' || NA =~ 'ba.*')", "F == 'a' && (NA == 'b' || filter:includeRegex(NA, 'ba.*'))"); + + // top level index only term, variable indexed state for nested term, indexed regex + test("IO == 'a' && (F == 'b' || F =~ 'ba.*')", "IO == 'a' && (F == 'b' || filter:includeRegex(F, 'ba.*'))"); + test("IO == 'a' && (IO == 'b' || F =~ 'ba.*')", "IO == 'a' && (IO == 'b' || filter:includeRegex(F, 'ba.*'))"); + test("IO == 'a' && (NA == 'b' || F =~ 'ba.*')", "IO == 'a' && (NA == 'b' || filter:includeRegex(F, 'ba.*'))"); + + // top level index only term, variable indexed state for nested term, index only regex + test("IO == 'a' && (F == 'b' || IO =~ 'ba.*')"); + test("IO == 'a' && (IO == 'b' || IO =~ 'ba.*')"); + test("IO == 'a' && (NA == 'b' || IO =~ 'ba.*')"); + + // top level index only term, variable indexed state for nested term, non-indexed regex + test("IO == 'a' && (F == 'b' || NA =~ 'ba.*')", "IO == 'a' && (F == 'b' || filter:includeRegex(NA, 'ba.*'))"); + test("IO == 'a' && (IO == 'b' || NA =~ 'ba.*')", "IO == 'a' && (IO == 'b' || filter:includeRegex(NA, 'ba.*'))"); + test("IO == 'a' && (NA == 'b' || NA =~ 'ba.*')", "IO == 'a' && (NA == 'b' || filter:includeRegex(NA, 'ba.*'))"); + + // top level non-indexed term, variable indexed state for nested term, indexed regex + test("NA == 'a' && (F == 'b' || F =~ 
'ba.*')"); + test("NA == 'a' && (IO == 'b' || F =~ 'ba.*')"); + test("NA == 'a' && (NA == 'b' || F =~ 'ba.*')"); + + // top level non-indexed term, variable indexed state for nested term, index only regex + test("NA == 'a' && (F == 'b' || IO =~ 'ba.*')"); + test("NA == 'a' && (IO == 'b' || IO =~ 'ba.*')"); + test("NA == 'a' && (NA == 'b' || IO =~ 'ba.*')"); + + // top level non-indexed term, variable indexed state for nested term, non-indexed regex + test("NA == 'a' && (F == 'b' || NA =~ 'ba.*')", "NA == 'a' && (F == 'b' || filter:includeRegex(NA, 'ba.*'))"); + test("NA == 'a' && (IO == 'b' || NA =~ 'ba.*')", "NA == 'a' && (IO == 'b' || filter:includeRegex(NA, 'ba.*'))"); + test("NA == 'a' && (NA == 'b' || NA =~ 'ba.*')", "NA == 'a' && (NA == 'b' || filter:includeRegex(NA, 'ba.*'))"); + } + + // A or (B and regex) + @Test + public void testUnionWithNestedIntersectionWithSingleRegex() { + // top level indexed, variable index state of nested term, indexed regex + test("F == 'a' || (F == 'b' && F == 'ab.*')"); + test("F == 'a' || (IO == 'b' && F == 'ab.*')"); + test("F == 'a' || (NA == 'b' && F == 'ab.*')"); + + // top level indexed, variable index state of nested term, index only regex + test("F == 'a' || (F == 'b' && IO == 'ab.*')"); + test("F == 'a' || (IO == 'b' && IO == 'ab.*')"); + test("F == 'a' || (NA == 'b' && IO == 'ab.*')"); + + // top level indexed, variable index state of nested term, non-indexed regex + test("F == 'a' || (F == 'b' && NA == 'ab.*')"); + test("F == 'a' || (IO == 'b' && NA == 'ab.*')"); + test("F == 'a' || (NA == 'b' && NA == 'ab.*')"); + + // top level index only, variable index state of nested term, indexed regex + test("IO == 'a' || (F == 'b' && F == 'ab.*')"); + test("IO == 'a' || (IO == 'b' && F == 'ab.*')"); + test("IO == 'a' || (NA == 'b' && F == 'ab.*')"); + + // top level index only, variable index state of nested term, index only regex + test("IO == 'a' || (F == 'b' && IO == 'ab.*')"); + test("IO == 'a' || (IO == 'b' && IO == 
'ab.*')"); + test("IO == 'a' || (NA == 'b' && IO == 'ab.*')"); + + // top level index only, variable index state of nested term, non-indexed regex + test("IO == 'a' || (F == 'b' && NA == 'ab.*')"); + test("IO == 'a' || (IO == 'b' && NA == 'ab.*')"); + test("IO == 'a' || (NA == 'b' && NA == 'ab.*')"); + + // top level non-indexed, variable index state of nested term, indexed regex + test("NA == 'a' || (F == 'b' && F == 'ab.*')"); + test("NA == 'a' || (IO == 'b' && F == 'ab.*')"); + test("NA == 'a' || (NA == 'b' && F == 'ab.*')"); + + // top level non-indexed, variable index state of nested term, index only regex + test("NA == 'a' || (F == 'b' && IO == 'ab.*')"); + test("NA == 'a' || (IO == 'b' && IO == 'ab.*')"); + test("NA == 'a' || (NA == 'b' && IO == 'ab.*')"); + + // top level non-indexed, variable index state of nested term, non-indexed regex + test("NA == 'a' || (F == 'b' && NA == 'ab.*')"); + test("NA == 'a' || (IO == 'b' && NA == 'ab.*')"); + test("NA == 'a' || (NA == 'b' && NA == 'ab.*')"); + } + + // A and (regex or regex) + @Test + public void testIntersectionWithNestedUnionOfRegexes() { + // indexed term and union of regexes with all possible index states + test("F == 'a' && (F =~ 'ab.*' || F =~ 'ac.*')", "F == 'a' && (filter:includeRegex(F, 'ab.*') || filter:includeRegex(F, 'ac.*'))"); + test("F == 'a' && (F =~ 'ab.*' || IO =~ 'ac.*')", "F == 'a' && (filter:includeRegex(F, 'ab.*') || IO =~ 'ac.*')"); + test("F == 'a' && (F =~ 'ab.*' || NA =~ 'ac.*')", "F == 'a' && (filter:includeRegex(F, 'ab.*') || filter:includeRegex(NA, 'ac.*'))"); + test("F == 'a' && (IO =~ 'ab.*' || IO =~ 'ac.*')"); + test("F == 'a' && (IO =~ 'ab.*' || NA =~ 'ac.*')", "F == 'a' && (IO =~ 'ab.*' || filter:includeRegex(NA, 'ac.*'))"); + test("F == 'a' && (NA =~ 'ab.*' || NA =~ 'ac.*')", "F == 'a' && (filter:includeRegex(NA, 'ab.*') || filter:includeRegex(NA, 'ac.*'))"); + + // index only term and union of regexes with all possible index states + test("IO == 'a' && (F =~ 'ab.*' || F =~ 
'ac.*')", "IO == 'a' && (filter:includeRegex(F, 'ab.*') || filter:includeRegex(F, 'ac.*'))"); + test("IO == 'a' && (F =~ 'ab.*' || IO =~ 'ac.*')", "IO == 'a' && (filter:includeRegex(F, 'ab.*') || IO =~ 'ac.*')"); + test("IO == 'a' && (F =~ 'ab.*' || NA =~ 'ac.*')", "IO == 'a' && (filter:includeRegex(F, 'ab.*') || filter:includeRegex(NA, 'ac.*'))"); + test("IO == 'a' && (IO =~ 'ab.*' || IO =~ 'ac.*')"); + test("IO == 'a' && (IO =~ 'ab.*' || NA =~ 'ac.*')", "IO == 'a' && (IO =~ 'ab.*' || filter:includeRegex(NA, 'ac.*'))"); + test("IO == 'a' && (NA =~ 'ab.*' || NA =~ 'ac.*')", "IO == 'a' && (filter:includeRegex(NA, 'ab.*') || filter:includeRegex(NA, 'ac.*'))"); + + // non-indexed tem and union of regexes with all possible index states + test("NA == 'a' && (F =~ 'ab.*' || F =~ 'ac.*')"); + test("NA == 'a' && (F =~ 'ab.*' || IO =~ 'ac.*')"); + test("NA == 'a' && (F =~ 'ab.*' || NA =~ 'ac.*')", "NA == 'a' && (F =~ 'ab.*' || filter:includeRegex(NA, 'ac.*'))"); + test("NA == 'a' && (IO =~ 'ab.*' || IO =~ 'ac.*')"); + test("NA == 'a' && (IO =~ 'ab.*' || NA =~ 'ac.*')", "NA == 'a' && (IO =~ 'ab.*' || filter:includeRegex(NA, 'ac.*'))"); + test("NA == 'a' && (NA =~ 'ab.*' || NA =~ 'ac.*')", "NA == 'a' && (filter:includeRegex(NA, 'ab.*') || filter:includeRegex(NA, 'ac.*'))"); + } + + // A or (regex and regex) + @Test + public void testUnionWithNestedIntersectionOfRegexes() { + // indexed term or intersection of regexes with all possible index states + test("F == 'a' || (F =~ 'ab.*' && F =~ 'ac.*')", "F == 'a' || (filter:includeRegex(F, 'ab.*') && F =~ 'ac.*')"); + test("F == 'a' || (F =~ 'ab.*' && IO =~ 'ac.*')", "F == 'a' || (filter:includeRegex(F, 'ab.*') && IO =~ 'ac.*')"); + test("F == 'a' || (F =~ 'ab.*' && NA =~ 'ac.*')", "F == 'a' || (F =~ 'ab.*' && filter:includeRegex(NA, 'ac.*'))"); + test("F == 'a' || (IO =~ 'ab.*' && IO =~ 'ac.*')"); + test("F == 'a' || (IO =~ 'ab.*' && NA =~ 'ac.*')", "F == 'a' || (IO =~ 'ab.*' && filter:includeRegex(NA, 'ac.*'))"); + test("F == 'a' 
|| (NA =~ 'ab.*' && NA =~ 'ac.*')", "F == 'a' || (filter:includeRegex(NA, 'ab.*') && filter:includeRegex(NA, 'ac.*'))"); + + // index only term or intersection of regexes with all possible index states + test("IO == 'a' || (F =~ 'ab.*' && F =~ 'ac.*')", "IO == 'a' || (filter:includeRegex(F, 'ab.*') && F =~ 'ac.*')"); + test("IO == 'a' || (F =~ 'ab.*' && IO =~ 'ac.*')", "IO == 'a' || (filter:includeRegex(F, 'ab.*') && IO =~ 'ac.*')"); + test("IO == 'a' || (F =~ 'ab.*' && NA =~ 'ac.*')", "IO == 'a' || (F =~ 'ab.*' && filter:includeRegex(NA, 'ac.*'))"); + test("IO == 'a' || (IO =~ 'ab.*' && IO =~ 'ac.*')"); + test("IO == 'a' || (IO =~ 'ab.*' && NA =~ 'ac.*')", "IO == 'a' || (IO =~ 'ab.*' && filter:includeRegex(NA, 'ac.*'))"); + test("IO == 'a' || (NA =~ 'ab.*' && NA =~ 'ac.*')", "IO == 'a' || (filter:includeRegex(NA, 'ab.*') && filter:includeRegex(NA, 'ac.*'))"); + + // non-indexed tem or intersection of regexes with all possible index states + test("NA == 'a' || (F =~ 'ab.*' && F =~ 'ac.*')", "NA == 'a' || (filter:includeRegex(F, 'ab.*') && F =~ 'ac.*')"); + test("NA == 'a' || (F =~ 'ab.*' && IO =~ 'ac.*')", "NA == 'a' || (filter:includeRegex(F, 'ab.*') && IO =~ 'ac.*')"); + test("NA == 'a' || (F =~ 'ab.*' && NA =~ 'ac.*')", "NA == 'a' || (F =~ 'ab.*' && filter:includeRegex(NA, 'ac.*'))"); + test("NA == 'a' || (IO =~ 'ab.*' && IO =~ 'ac.*')"); + test("NA == 'a' || (IO =~ 'ab.*' && NA =~ 'ac.*')", "NA == 'a' || (IO =~ 'ab.*' && filter:includeRegex(NA, 'ac.*'))"); + test("NA == 'a' || (NA =~ 'ab.*' && NA =~ 'ac.*')", "NA == 'a' || (filter:includeRegex(NA, 'ab.*') && filter:includeRegex(NA, 'ac.*'))"); + } + + // (A or regex) and (B or regex) + @Test + public void testNestedUnionsWithDistributedRegexes() { + String query = "(F == 'a' || F =~ 'ab.*') && (F == 'b' || F =~ 'ac.*')"; + String expected = "(F == 'a' || filter:includeRegex(F, 'ab.*')) && (F == 'b' || F =~ 'ac.*')"; + test(query, expected); + + query = "(F == 'a' || NA =~ 'ab.*') && (F == 'b' || F =~ 'ac.*')"; 
+ expected = "(F == 'a' || filter:includeRegex(NA, 'ab.*')) && (F == 'b' || F =~ 'ac.*')"; + test(query, expected); + } + + // (A and regex) or (B and regex) + @Test + public void testNestedIntersectionsWithDistributedRegexes() { + String query = "(F == 'a' && F =~ 'ab.*') || (F == 'b' && F =~ 'ac.*')"; + String expected = "(F == 'a' && filter:includeRegex(F, 'ab.*')) || (F == 'b' && filter:includeRegex(F, 'ac.*'))"; + test(query, expected); + } + + // (A or B) and (regex or regex) + @Test + public void testPartialAnchorAndNestedUnionRegex() { + String query = "(F == 'a' || F == 'b') && (F =~ 'ab.*' || F =~ 'ac.*')"; + String expected = "(F == 'a' || F == 'b') && (filter:includeRegex(F, 'ab.*') || filter:includeRegex(F, 'ac.*'))"; + test(query, expected); + } + + // A and (B or (C and regex) + @Test + public void testLeftAnchorAndDeeplyNestedRegex() { + String query = "F == 'a' && (F == 'b' || (F == 'c' && F =~ 'ab.*'))"; + String expected = "F == 'a' && (F == 'b' || (F == 'c' && filter:includeRegex(F, 'ab.*')))"; + test(query, expected); + } + + // ((regex and C) or B) and A + @Test + public void testRightAnchorAndDeeplyNestedRegex() { + String query = "((F =~ 'ab.*' && F == 'c') || F == 'b') && F == 'a'"; + String expected = "((filter:includeRegex(F, 'ab.*') && F == 'c') || F == 'b') && F == 'a'"; + test(query, expected); + } + + @Test + public void testUnionOfTwoLegalRewrites() { + String query = "(F == 'a' && F =~ 'ab.*') || (F == 'b' && F =~ 'ac.*')"; + String expected = "(F == 'a' && filter:includeRegex(F, 'ab.*')) || (F == 'b' && filter:includeRegex(F, 'ac.*'))"; + test(query, expected); + } + + // (NA and regex) or (NA and regex) + @Test + public void testUnionOfTwoIllegalRewrites() { + String query = "(NA == 'a' && F =~ 'ab.*') || (NA == 'b' && F =~ 'ac.*')"; + test(query); + } + + @Test + public void testIncludeFieldsPreventNoRewrites() { + withIncludeFields(Set.of("F", "F2")); + test("IO == 'a' && F =~ 'ab.*' && F2 =~ 'ac.*'", "IO == 'a' && 
filter:includeRegex(F, 'ab.*') && filter:includeRegex(F2, 'ac.*')"); + } + + @Test + public void testIncludeFieldsPreventSomeLegalRewrites() { + withIncludeFields(Set.of("F2")); + test("IO == 'a' && F =~ 'ab.*' && F2 =~ 'ac.*'", "IO == 'a' && F =~ 'ab.*' && filter:includeRegex(F2, 'ac.*')"); + } + + @Test + public void testExcludeFieldsPreventAllLegalRewrites() { + withExcludeFields(Set.of("F", "F2")); + test("IO == 'a' && F =~ 'ab.*' && F2 =~ 'ac.*'"); + } + + @Test + public void testExcludeFieldsPreventSomeLegalRewrites() { + withExcludeFields(Set.of("F2")); + test("IO == 'a' && F =~ 'ab.*' && F2 =~ 'ac.*'", "IO == 'a' && filter:includeRegex(F, 'ab.*') && F2 =~ 'ac.*'"); + } + + @Test + public void testFullyInclusiveIncludeAndExcludeFields() { + withIncludeFields(Set.of("F")); + withExcludeFields(Set.of("F")); + // exclude fields beats include fields + test("IO == 'a' && F =~ 'ab.*'"); + } + + @Test + public void testPatternBeatsExcludeFields() { + withPattern("F", "zz.*"); + withExcludeFields(Set.of("F")); + // pattern beats exclude fields + test("IO == 'a' && F =~ 'zz.*'", "IO == 'a' && filter:includeRegex(F, 'zz.*')"); + } + + @Test + public void testPatternBeatsIncludeFields() { + withPattern("F", "zz.*"); + withIncludeFields(Set.of("F2")); + // pattern beats include fields + test("IO == 'a' && F =~ 'zz.*'", "IO == 'a' && filter:includeRegex(F, 'zz.*')"); + } + + @Test + public void testPatternBeatsIncludeAndExcludeFields() { + withPattern("F", "zz.*"); + withIncludeFields(Set.of("F2")); + withExcludeFields(Set.of("F")); + // pattern beats include fields + test("IO == 'a' && F =~ 'zz.*'", "IO == 'a' && filter:includeRegex(F, 'zz.*')"); + } + + /** + * Assert that the provided query does not change + * + * @param query + * the query + */ + private void test(String query) { + test(query, query); + } + + /** + * Assert that the provided query matches the expected query after the {@link RewriteRegexVisitor} is applied + * + * @param query + * the query + * @param 
expected + * the expected result + */ + private void test(String query, String expected) { + ASTJexlScript script = parse(query); + RewriteRegexVisitor.rewrite(script, indexedFields, indexOnlyFields, includeFields, excludeFields, patterns); + String result = JexlStringBuildingVisitor.buildQuery(script); + assertEquals(expected, result); + } + + private ASTJexlScript parse(String query) { + try { + return JexlASTHelper.parseAndFlattenJexlQuery(query); + } catch (ParseException e) { + fail("Failed to parse query: " + query, e); + throw new RuntimeException(e); + } + } + + private void withIncludeFields(Set includeFields) { + this.includeFields.addAll(includeFields); + } + + private void withExcludeFields(Set excludeFields) { + this.excludeFields.addAll(excludeFields); + } + + private void withPattern(String field, String literal) { + patterns.add(new RegexRewritePattern(field, literal)); + } +} diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/pushdown/AnchorDetectionVisitorTest.java b/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/pushdown/AnchorDetectionVisitorTest.java new file mode 100644 index 00000000000..6b63061f7a3 --- /dev/null +++ b/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/pushdown/AnchorDetectionVisitorTest.java @@ -0,0 +1,274 @@ +package datawave.query.jexl.visitors.pushdown; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.fail; + +import java.util.Collections; +import java.util.Set; + +import org.apache.commons.jexl3.parser.ASTJexlScript; +import org.apache.commons.jexl3.parser.JexlNode; +import org.junit.jupiter.api.Test; + +import datawave.query.jexl.JexlASTHelper; + +class AnchorDetectionVisitorTest { + + private final Set indexOnlyFields = Collections.singleton("IO"); + private final Set indexedFields = Collections.singleton("F"); + private AnchorDetectionVisitor visitor; + + @Test + void testIndexedLeaves() { + // 
@formatter:off + String[] queries = new String[]{ + "F == '1'", + "F != '1'", + "F < '2'", + "F > '2'", + "F <= '2'", + "F >= '2'", + "F =~ 'ba.*'", + "F !~ 'ba.*'", + }; + // @formatter:on + + test(queries, true); + } + + @Test + void testIndexOnlyLeaves() { + // @formatter:off + String[] queries = new String[]{ + "IO == '1'", + "IO != '1'", + "IO < '2'", + "IO > '2'", + "IO <= '2'", + "IO >= '2'", + "IO =~ 'ba.*'", + "IO !~ 'ba.*'", + }; + // @formatter:on + + test(queries, true); + } + + @Test + void testNonIndexedLeaves() { + // @formatter:off + String[] queries = new String[]{ + "FIELD == '1'", + "FIELD != '1'", + "FIELD < '2'", + "FIELD > '2'", + "FIELD <= '2'", + "FIELD >= '2'", + "FIELD =~ 'ba.*'", + "FIELD !~ 'ba.*'", + }; + // @formatter:on + + test(queries, false); + } + + @Test + void testNullLiterals() { + test("F == null", false); + test("F != null", false); + test("IO == null", false); + test("IO != null", false); + test("FIELD == null", false); + test("FIELD != null", false); + } + + @Test + void testFilterFunctions() { + // @formatter:off + String[] queries = new String[]{ + // index only include/exclude are rewritten to regex nodes + "filter:include(F, 'ba.*')", + "filter:exclude(F, 'ba.*')", + "filter:include(FIELD, 'ba.*')", + "filter:exclude(FIELD, 'ba.*')", + // isNull functions should be rewritten to 'F == null' + "filter:isNull(F)", + "filter:isNull(F)", + "filter:isNull(FIELD)", + "filter:isNull(FIELD)", + // isNotNull functions should be rewritten to !(F == null) + "filter:isNotNull(F)", + "filter:isNotNull(F)", + "filter:isNotNull(FIELD)", + "filter:isNotNull(FIELD)", + "filter:compare(F,'==','any',F)", + "filter:compare(IO,'==','any',IO)", + "filter:compare(FIELD,'==','any',FIELD)", + }; + // @formatter:on + + test(queries, false); + } + + @Test + void testMarkers() { + // @formatter:off + String[] anchorMarkers = new String[] { + "((_Bounded_ = true) && (F > '2' && F < '5'))", + "((_Delayed_ = true) && (F == '1'))", + "((_Eval_ = true) 
&& (F == '1'))", + "((_List_ = true) && ((id = 'id') && (field = 'F') && (params = '{\"ranges\":[[\"[r1\",\"r2]\"],[\"[r3\",\"f4]\"]]}')))", + "((_Value_ = true) && (F =~ 'ba.*'))", + "((_Term_ = true) && (_ANYFIELD_ =~ 'ba.*'))" + }; + // @formatter:on + + test(anchorMarkers, true); + + // @formatter:off + String[] nonAnchorMarkers = new String[]{ + "((_Hole_ = true) && (F == '1'))", + "((_Drop_ = true) && (F == '1'))", + "((_Lenient_ = true) && (F == '1'))", + "((_Strict_ = true) && (F == '1'))" + }; + // @formatter:on + + test(nonAnchorMarkers, false); + } + + @Test + void testUnions() { + // @formatter:off + String[] anchorUnions = new String[] { + "F == '1' || F == '2'", + "F == '1' || IO == '1'", + "IO == '1' || IO == '2'"}; + // @formatter:on + + test(anchorUnions, true); + + // @formatter:off + String[] nonAnchorUnions = new String[] { + "FIELD == '1' || F == '2'", + "F == '1' || IO == '1' || FIELD == '3'", + "FIELD == '1' || FIELD == '2'"}; + // @formatter:on + + test(nonAnchorUnions, false); + } + + @Test + void testIntersections() { + // @formatter:off + String[] anchorIntersections = new String[] { + "F == '1' && F == '2'", + "F == '1' && IO == '1'", + "IO == '1' && IO == '2'", + "F == '1' && IO == null", + "IO == '1' && IO == null", + // intersection needs just one anchor to be executable + "X == '1' && F == '2'", "X == '1' && IO == '2'" + }; + // @formatter:on + + test(anchorIntersections, true); + + // @formatter:off + String[] nonAnchorQueries = new String[] { + "X == '1' && Y == '2' && Z == '3'", + "F == null && IO == null", + }; + // @formatter:on + + test(nonAnchorQueries, false); + } + + @Test + void testNestedUnions() { + // @formatter:off + String[] anchorNestedUnions = new String[]{ + "(F == '1' || F == '2') && (F == '3' || F == '4')", + "(F == '1' || F == '2') && (IO == '3' || IO == '4')", + "(IO == '1' || IO == '2') && (F == '3' || F == '4')", + "(F == '1' || IO == '2') && (F == '3' || IO == '4')", + "(IO == '1' || F == '2') && (IO == '3' 
|| F == '4')", + }; + // @formatter:on + + test(anchorNestedUnions, true); + } + + @Test + void testNestedIntersections() { + // @formatter:off + String[] anchorNestedIntersections = new String[]{ + "(F == '1' && F == '2') || (F == '3' && F == '4')", + "(F == '1' && F == '2') || (IO == '3' && IO == '4')", + "(IO == '1' && IO == '2') || (F == '3' && F == '4')", + "(F == '1' && IO == '2') || (F == '3' && IO == '4')", + "(IO == '1' && F == '2') || (IO == '3' && F == '4')", + }; + // @formatter:on + + test(anchorNestedIntersections, true); + } + + @Test + void testFullContentPhraseFunction() { + String query = "content:phrase(F, termOffsetMap, 'foo', 'bar') && F == 'foo' && F == 'bar'"; + test(query, true); + } + + @Test + void testArithmeticAndSizeMethods() { + // @formatter:off + String[] queries = new String[]{ + // filter + "filter:getMinTime(F) == 1892160000000", + "filter:getMinTime(F) != 1892160000000", + "filter:getMinTime(F) > 1892160000000", + "filter:getMinTime(F) < 1892160000000", + "filter:getMinTime(F) >= 1892160000000", + "filter:getMinTime(F) <= 1892160000000", + // method + "F.size() == 1", + "F.size() != 1", + "F.size() > 1", + "F.size() < 1", + "F.size() >= 1", + "F.size() <= 1", + }; + // @formatter:on + + test(queries, false); + } + + private void test(String[] queries, boolean expected) { + for (String query : queries) { + test(query, expected); + } + } + + private void test(String query, boolean expected) { + JexlNode node = parseQuery(query); + assertEquals(expected, getVisitor().isAnchor(node)); + } + + private JexlNode parseQuery(String query) { + try { + ASTJexlScript script = JexlASTHelper.parseAndFlattenJexlQuery(query); + return script.jjtGetChild(0); + } catch (Exception e) { + fail("Could not parse query: " + query); + throw new IllegalStateException(e); + } + } + + private AnchorDetectionVisitor getVisitor() { + if (visitor == null) { + visitor = new AnchorDetectionVisitor(indexedFields, indexOnlyFields); + } + return visitor; + } +} 
diff --git a/warehouse/query-core/src/test/resources/datawave/query/QueryLogicFactory.xml b/warehouse/query-core/src/test/resources/datawave/query/QueryLogicFactory.xml index 5bc61292091..42a76e6f14a 100644 --- a/warehouse/query-core/src/test/resources/datawave/query/QueryLogicFactory.xml +++ b/warehouse/query-core/src/test/resources/datawave/query/QueryLogicFactory.xml @@ -361,8 +361,27 @@ + + + + + + TYPE + + + + SHAPE + + + + + + + + + From c5bb43a61d5eb72cfac3025269f3ca2ddd0346c2 Mon Sep 17 00:00:00 2001 From: Moriarty <22225248+apmoriarty@users.noreply.github.com> Date: Thu, 17 Oct 2024 12:37:35 +0000 Subject: [PATCH 2/3] Wrap regex terms in eval only marker instead of rewriting into filter function --- .../jexl/visitors/RewriteRegexVisitor.java | 8 +- .../pushdown/AnchorDetectionVisitor.java | 4 +- .../test/java/datawave/query/ShapesTest.java | 4 +- .../visitors/RewriteRegexVisitorTest.java | 158 +++++++++--------- .../pushdown/AnchorDetectionVisitorTest.java | 4 +- 5 files changed, 89 insertions(+), 89 deletions(-) diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/RewriteRegexVisitor.java b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/RewriteRegexVisitor.java index aaefde6d346..2a1195e71b8 100644 --- a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/RewriteRegexVisitor.java +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/RewriteRegexVisitor.java @@ -12,13 +12,13 @@ import datawave.query.Constants; import datawave.query.jexl.JexlASTHelper; -import datawave.query.jexl.JexlNodeFactory; import datawave.query.jexl.NodeTypeCount; import datawave.query.jexl.nodes.QueryPropertyMarker; +import datawave.query.jexl.nodes.QueryPropertyMarker.MarkerType; import datawave.query.jexl.visitors.pushdown.AnchorDetectionVisitor; /** - * Rewrites regex terms as filter functions provided an anchor exists. + * Rewrites regex terms provided an anchor exists. 
Regex terms are wrapped in an EvalOnly marker. *

* An anchor is an executable term or subtree. *

@@ -175,8 +175,8 @@ public Object visit(ASTERNode node, Object data) { String literal = (String) JexlASTHelper.getLiteralValue(node); if (isNodeRewritableFromRules(field, literal)) { - JexlNode rewrite = JexlNodeFactory.buildFunctionNode("filter", "includeRegex", field, literal); - JexlNodes.replaceChild(node.jjtGetParent(), node, rewrite); + JexlNode marker = QueryPropertyMarker.create(node, MarkerType.EVALUATION_ONLY); + JexlNodes.replaceChild(node.jjtGetParent(), node, marker); } } diff --git a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/pushdown/AnchorDetectionVisitor.java b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/pushdown/AnchorDetectionVisitor.java index 018658b1eb2..4740b298948 100644 --- a/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/pushdown/AnchorDetectionVisitor.java +++ b/warehouse/query-core/src/main/java/datawave/query/jexl/visitors/pushdown/AnchorDetectionVisitor.java @@ -194,12 +194,12 @@ private Object visitMarker(QueryPropertyMarker.Instance instance) { switch (instance.getType()) { case BOUNDED_RANGE: - case DELAYED: - case EVALUATION_ONLY: case EXCEEDED_OR: case EXCEEDED_TERM: case EXCEEDED_VALUE: return true; + case DELAYED: + case EVALUATION_ONLY: default: return false; } diff --git a/warehouse/query-core/src/test/java/datawave/query/ShapesTest.java b/warehouse/query-core/src/test/java/datawave/query/ShapesTest.java index c51fe8154a6..d8661daa4c6 100644 --- a/warehouse/query-core/src/test/java/datawave/query/ShapesTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/ShapesTest.java @@ -893,7 +893,7 @@ public void testRewriteRegexFromIncludes() throws Exception { withQuery("ONLY_HEX == 'hexa' && TYPE =~ 'reg.*'"); withExpected(Sets.newHashSet(ShapesIngest.hexagonUid)); planAndExecuteQuery(); - assertPlannedQuery("ONLY_HEX == 'hexa' && filter:includeRegex(TYPE, 'reg.*')"); + assertPlannedQuery("ONLY_HEX == 'hexa' && ((_Eval_ = true) && (TYPE =~ 'reg.*'))"); } @Test @@ 
-909,6 +909,6 @@ public void testRewriteRegexWithExcludedFieldBecauseOfPatternMatch() throws Exce withQuery("ONLY_HEX == 'hexa' && SHAPE =~ 'hexag.*'"); withExpected(Sets.newHashSet(ShapesIngest.hexagonUid)); planAndExecuteQuery(); - assertPlannedQuery("ONLY_HEX == 'hexa' && filter:includeRegex(SHAPE, 'hexag.*')"); + assertPlannedQuery("ONLY_HEX == 'hexa' && ((_Eval_ = true) && (SHAPE =~ 'hexag.*'))"); } } diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/RewriteRegexVisitorTest.java b/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/RewriteRegexVisitorTest.java index fd6add4b908..e77c8d6fc53 100644 --- a/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/RewriteRegexVisitorTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/RewriteRegexVisitorTest.java @@ -34,8 +34,8 @@ public void beforeEach() { @Test public void testSingleTermAndRegex() { // term and indexed regex - test("F == 'a' && F =~ 'ba.*'", "F == 'a' && filter:includeRegex(F, 'ba.*')"); - test("IO == 'a' && F =~ 'ba.*'", "IO == 'a' && filter:includeRegex(F, 'ba.*')"); + test("F == 'a' && F =~ 'ba.*'", "F == 'a' && ((_Eval_ = true) && (F =~ 'ba.*'))"); + test("IO == 'a' && F =~ 'ba.*'", "IO == 'a' && ((_Eval_ = true) && (F =~ 'ba.*'))"); test("NA == 'a' && F =~ 'ba.*'"); // term and index only regex is never rewritten @@ -44,9 +44,9 @@ public void testSingleTermAndRegex() { test("NA == 'a' && IO =~ 'ba.*'"); // term and non-indexed regex is always rewritten - test("F == 'a' && NA =~ 'ba.*'", "F == 'a' && filter:includeRegex(NA, 'ba.*')"); - test("IO == 'a' && NA =~ 'ba.*'", "IO == 'a' && filter:includeRegex(NA, 'ba.*')"); - test("NA == 'a' && NA =~ 'ba.*'", "NA == 'a' && filter:includeRegex(NA, 'ba.*')"); + test("F == 'a' && NA =~ 'ba.*'", "F == 'a' && ((_Eval_ = true) && (NA =~ 'ba.*'))"); + test("IO == 'a' && NA =~ 'ba.*'", "IO == 'a' && ((_Eval_ = true) && (NA =~ 'ba.*'))"); + test("NA == 'a' && NA =~ 'ba.*'", "NA == 'a' 
&& ((_Eval_ = true) && (NA =~ 'ba.*'))"); } // A or regex @@ -63,9 +63,9 @@ public void testSingleTermOrRegex() { test("NA == 'a' || IO =~ 'ba.*'"); // top level union with non-indexed regex is a full table scan, do not rewrite - test("F == 'a' || NA =~ 'ba.*'", "F == 'a' || filter:includeRegex(NA, 'ba.*')"); - test("IO == 'a' || NA =~ 'ba.*'", "IO == 'a' || filter:includeRegex(NA, 'ba.*')"); - test("NA == 'a' || NA =~ 'ba.*'", "NA == 'a' || filter:includeRegex(NA, 'ba.*')"); + test("F == 'a' || NA =~ 'ba.*'", "F == 'a' || ((_Eval_ = true) && (NA =~ 'ba.*'))"); + test("IO == 'a' || NA =~ 'ba.*'", "IO == 'a' || ((_Eval_ = true) && (NA =~ 'ba.*'))"); + test("NA == 'a' || NA =~ 'ba.*'", "NA == 'a' || ((_Eval_ = true) && (NA =~ 'ba.*'))"); } // (A and B) or regex @@ -89,22 +89,22 @@ public void testNestedIntersectionOrRegex() { // the input queries are non-executable, non-indexed field still gets rewritten // all combinations of nested intersection and non-indexed regex - test("(F == 'a' && F == 'b') || NA =~ 'ba.*'", "(F == 'a' && F == 'b') || filter:includeRegex(NA, 'ba.*')"); - test("(F == 'a' && IO == 'b') || NA =~ 'ba.*'", "(F == 'a' && IO == 'b') || filter:includeRegex(NA, 'ba.*')"); - test("(F == 'a' && NA == 'b') || NA =~ 'ba.*'", "(F == 'a' && NA == 'b') || filter:includeRegex(NA, 'ba.*')"); - test("(IO == 'a' && IO == 'b') || NA =~ 'ba.*'", "(IO == 'a' && IO == 'b') || filter:includeRegex(NA, 'ba.*')"); - test("(IO == 'a' && NA == 'b') || NA =~ 'ba.*'", "(IO == 'a' && NA == 'b') || filter:includeRegex(NA, 'ba.*')"); - test("(NA == 'a' && NA == 'b') || Na =~ 'ba.*'", "(NA == 'a' && NA == 'b') || filter:includeRegex(Na, 'ba.*')"); + test("(F == 'a' && F == 'b') || NA =~ 'ba.*'", "(F == 'a' && F == 'b') || ((_Eval_ = true) && (NA =~ 'ba.*'))"); + test("(F == 'a' && IO == 'b') || NA =~ 'ba.*'", "(F == 'a' && IO == 'b') || ((_Eval_ = true) && (NA =~ 'ba.*'))"); + test("(F == 'a' && NA == 'b') || NA =~ 'ba.*'", "(F == 'a' && NA == 'b') || ((_Eval_ = true) && (NA =~ 
'ba.*'))"); + test("(IO == 'a' && IO == 'b') || NA =~ 'ba.*'", "(IO == 'a' && IO == 'b') || ((_Eval_ = true) && (NA =~ 'ba.*'))"); + test("(IO == 'a' && NA == 'b') || NA =~ 'ba.*'", "(IO == 'a' && NA == 'b') || ((_Eval_ = true) && (NA =~ 'ba.*'))"); + test("(NA == 'a' && NA == 'b') || NA =~ 'ba.*'", "(NA == 'a' && NA == 'b') || ((_Eval_ = true) && (NA =~ 'ba.*'))"); } // (A or B) and regex @Test public void testNestedUnionAndRegex() { // all combinations of nested intersection and indexed regex - test("(F == 'a' || F == 'b') && F =~ 'ba.*'", "(F == 'a' || F == 'b') && filter:includeRegex(F, 'ba.*')"); - test("(F == 'a' || IO == 'b') && F =~ 'ba.*'", "(F == 'a' || IO == 'b') && filter:includeRegex(F, 'ba.*')"); + test("(F == 'a' || F == 'b') && F =~ 'ba.*'", "(F == 'a' || F == 'b') && ((_Eval_ = true) && (F =~ 'ba.*'))"); + test("(F == 'a' || IO == 'b') && F =~ 'ba.*'", "(F == 'a' || IO == 'b') && ((_Eval_ = true) && (F =~ 'ba.*'))"); test("(F == 'a' || NA == 'b') && F =~ 'ba.*'"); - test("(IO == 'a' || IO == 'b') && F =~ 'ba.*'", "(IO == 'a' || IO == 'b') && filter:includeRegex(F, 'ba.*')"); + test("(IO == 'a' || IO == 'b') && F =~ 'ba.*'", "(IO == 'a' || IO == 'b') && ((_Eval_ = true) && (F =~ 'ba.*'))"); test("(IO == 'a' || NA == 'b') && F =~ 'ba.*'"); test("(NA == 'a' || NA == 'b') && F =~ 'ba.*'"); @@ -117,21 +117,21 @@ public void testNestedUnionAndRegex() { test("(NA == 'a' || NA == 'b') && IO =~ 'ba.*'"); // all combinations of nested intersection and non-indexed regex - test("(F == 'a' || F == 'b') && NA =~ 'ba.*'", "(F == 'a' || F == 'b') && filter:includeRegex(NA, 'ba.*')"); - test("(F == 'a' || IO == 'b') && NA =~ 'ba.*'", "(F == 'a' || IO == 'b') && filter:includeRegex(NA, 'ba.*')"); - test("(F == 'a' || NA == 'b') && NA =~ 'ba.*'", "(F == 'a' || NA == 'b') && filter:includeRegex(NA, 'ba.*')"); - test("(IO == 'a' || IO == 'b') && NA =~ 'ba.*'", "(IO == 'a' || IO == 'b') && filter:includeRegex(NA, 'ba.*')"); - test("(IO == 'a' || NA == 'b') && NA =~ 
'ba.*'", "(IO == 'a' || NA == 'b') && filter:includeRegex(NA, 'ba.*')"); - test("(NA == 'a' || NA == 'b') && Na =~ 'ba.*'", "(NA == 'a' || NA == 'b') && filter:includeRegex(Na, 'ba.*')"); + test("(F == 'a' || F == 'b') && NA =~ 'ba.*'", "(F == 'a' || F == 'b') && ((_Eval_ = true) && (NA =~ 'ba.*'))"); + test("(F == 'a' || IO == 'b') && NA =~ 'ba.*'", "(F == 'a' || IO == 'b') && ((_Eval_ = true) && (NA =~ 'ba.*'))"); + test("(F == 'a' || NA == 'b') && NA =~ 'ba.*'", "(F == 'a' || NA == 'b') && ((_Eval_ = true) && (NA =~ 'ba.*'))"); + test("(IO == 'a' || IO == 'b') && NA =~ 'ba.*'", "(IO == 'a' || IO == 'b') && ((_Eval_ = true) && (NA =~ 'ba.*'))"); + test("(IO == 'a' || NA == 'b') && NA =~ 'ba.*'", "(IO == 'a' || NA == 'b') && ((_Eval_ = true) && (NA =~ 'ba.*'))"); + test("(NA == 'a' || NA == 'b') && NA =~ 'ba.*'", "(NA == 'a' || NA == 'b') && ((_Eval_ = true) && (NA =~ 'ba.*'))"); } // A and (B or regex) @Test public void testIntersectionWithNestedUnionWithSingleRegex() { // top level indexed term, variable indexed state for nested term, indexed regex - test("F == 'a' && (F == 'b' || F =~ 'ba.*')", "F == 'a' && (F == 'b' || filter:includeRegex(F, 'ba.*'))"); - test("F == 'a' && (IO == 'b' || F =~ 'ba.*')", "F == 'a' && (IO == 'b' || filter:includeRegex(F, 'ba.*'))"); - test("F == 'a' && (NA == 'b' || F =~ 'ba.*')", "F == 'a' && (NA == 'b' || filter:includeRegex(F, 'ba.*'))"); + test("F == 'a' && (F == 'b' || F =~ 'ba.*')", "F == 'a' && (F == 'b' || ((_Eval_ = true) && (F =~ 'ba.*')))"); + test("F == 'a' && (IO == 'b' || F =~ 'ba.*')", "F == 'a' && (IO == 'b' || ((_Eval_ = true) && (F =~ 'ba.*')))"); + test("F == 'a' && (NA == 'b' || F =~ 'ba.*')", "F == 'a' && (NA == 'b' || ((_Eval_ = true) && (F =~ 'ba.*')))"); // top level indexed term, variable indexed state for nested term, index only regex test("F == 'a' && (F == 'b' || IO =~ 'ba.*')"); @@ -139,14 +139,14 @@ public void testIntersectionWithNestedUnionWithSingleRegex() { test("F == 'a' && (NA == 'b' || IO =~ 
'ba.*')"); // top level indexed term, variable indexed state for nested term, non-indexed regex - test("F == 'a' && (F == 'b' || NA =~ 'ba.*')", "F == 'a' && (F == 'b' || filter:includeRegex(NA, 'ba.*'))"); - test("F == 'a' && (IO == 'b' || NA =~ 'ba.*')", "F == 'a' && (IO == 'b' || filter:includeRegex(NA, 'ba.*'))"); - test("F == 'a' && (NA == 'b' || NA =~ 'ba.*')", "F == 'a' && (NA == 'b' || filter:includeRegex(NA, 'ba.*'))"); + test("F == 'a' && (F == 'b' || NA =~ 'ba.*')", "F == 'a' && (F == 'b' || ((_Eval_ = true) && (NA =~ 'ba.*')))"); + test("F == 'a' && (IO == 'b' || NA =~ 'ba.*')", "F == 'a' && (IO == 'b' || ((_Eval_ = true) && (NA =~ 'ba.*')))"); + test("F == 'a' && (NA == 'b' || NA =~ 'ba.*')", "F == 'a' && (NA == 'b' || ((_Eval_ = true) && (NA =~ 'ba.*')))"); // top level index only term, variable indexed state for nested term, indexed regex - test("IO == 'a' && (F == 'b' || F =~ 'ba.*')", "IO == 'a' && (F == 'b' || filter:includeRegex(F, 'ba.*'))"); - test("IO == 'a' && (IO == 'b' || F =~ 'ba.*')", "IO == 'a' && (IO == 'b' || filter:includeRegex(F, 'ba.*'))"); - test("IO == 'a' && (NA == 'b' || F =~ 'ba.*')", "IO == 'a' && (NA == 'b' || filter:includeRegex(F, 'ba.*'))"); + test("IO == 'a' && (F == 'b' || F =~ 'ba.*')", "IO == 'a' && (F == 'b' || ((_Eval_ = true) && (F =~ 'ba.*')))"); + test("IO == 'a' && (IO == 'b' || F =~ 'ba.*')", "IO == 'a' && (IO == 'b' || ((_Eval_ = true) && (F =~ 'ba.*')))"); + test("IO == 'a' && (NA == 'b' || F =~ 'ba.*')", "IO == 'a' && (NA == 'b' || ((_Eval_ = true) && (F =~ 'ba.*')))"); // top level index only term, variable indexed state for nested term, index only regex test("IO == 'a' && (F == 'b' || IO =~ 'ba.*')"); @@ -154,9 +154,9 @@ public void testIntersectionWithNestedUnionWithSingleRegex() { test("IO == 'a' && (NA == 'b' || IO =~ 'ba.*')"); // top level index only term, variable indexed state for nested term, non-indexed regex - test("IO == 'a' && (F == 'b' || NA =~ 'ba.*')", "IO == 'a' && (F == 'b' || 
filter:includeRegex(NA, 'ba.*'))"); - test("IO == 'a' && (IO == 'b' || NA =~ 'ba.*')", "IO == 'a' && (IO == 'b' || filter:includeRegex(NA, 'ba.*'))"); - test("IO == 'a' && (NA == 'b' || NA =~ 'ba.*')", "IO == 'a' && (NA == 'b' || filter:includeRegex(NA, 'ba.*'))"); + test("IO == 'a' && (F == 'b' || NA =~ 'ba.*')", "IO == 'a' && (F == 'b' || ((_Eval_ = true) && (NA =~ 'ba.*')))"); + test("IO == 'a' && (IO == 'b' || NA =~ 'ba.*')", "IO == 'a' && (IO == 'b' || ((_Eval_ = true) && (NA =~ 'ba.*')))"); + test("IO == 'a' && (NA == 'b' || NA =~ 'ba.*')", "IO == 'a' && (NA == 'b' || ((_Eval_ = true) && (NA =~ 'ba.*')))"); // top level non-indexed term, variable indexed state for nested term, indexed regex test("NA == 'a' && (F == 'b' || F =~ 'ba.*')"); @@ -169,9 +169,9 @@ public void testIntersectionWithNestedUnionWithSingleRegex() { test("NA == 'a' && (NA == 'b' || IO =~ 'ba.*')"); // top level non-indexed term, variable indexed state for nested term, non-indexed regex - test("NA == 'a' && (F == 'b' || NA =~ 'ba.*')", "NA == 'a' && (F == 'b' || filter:includeRegex(NA, 'ba.*'))"); - test("NA == 'a' && (IO == 'b' || NA =~ 'ba.*')", "NA == 'a' && (IO == 'b' || filter:includeRegex(NA, 'ba.*'))"); - test("NA == 'a' && (NA == 'b' || NA =~ 'ba.*')", "NA == 'a' && (NA == 'b' || filter:includeRegex(NA, 'ba.*'))"); + test("NA == 'a' && (F == 'b' || NA =~ 'ba.*')", "NA == 'a' && (F == 'b' || ((_Eval_ = true) && (NA =~ 'ba.*')))"); + test("NA == 'a' && (IO == 'b' || NA =~ 'ba.*')", "NA == 'a' && (IO == 'b' || ((_Eval_ = true) && (NA =~ 'ba.*')))"); + test("NA == 'a' && (NA == 'b' || NA =~ 'ba.*')", "NA == 'a' && (NA == 'b' || ((_Eval_ = true) && (NA =~ 'ba.*')))"); } // A or (B and regex) @@ -227,67 +227,67 @@ public void testUnionWithNestedIntersectionWithSingleRegex() { @Test public void testIntersectionWithNestedUnionOfRegexes() { // indexed term and union of regexes with all possible index states - test("F == 'a' && (F =~ 'ab.*' || F =~ 'ac.*')", "F == 'a' && 
(filter:includeRegex(F, 'ab.*') || filter:includeRegex(F, 'ac.*'))"); - test("F == 'a' && (F =~ 'ab.*' || IO =~ 'ac.*')", "F == 'a' && (filter:includeRegex(F, 'ab.*') || IO =~ 'ac.*')"); - test("F == 'a' && (F =~ 'ab.*' || NA =~ 'ac.*')", "F == 'a' && (filter:includeRegex(F, 'ab.*') || filter:includeRegex(NA, 'ac.*'))"); + test("F == 'a' && (F =~ 'ab.*' || F =~ 'ac.*')", "F == 'a' && (((_Eval_ = true) && (F =~ 'ab.*')) || ((_Eval_ = true) && (F =~ 'ac.*')))"); + test("F == 'a' && (F =~ 'ab.*' || IO =~ 'ac.*')", "F == 'a' && (((_Eval_ = true) && (F =~ 'ab.*')) || IO =~ 'ac.*')"); + test("F == 'a' && (F =~ 'ab.*' || NA =~ 'ac.*')", "F == 'a' && (((_Eval_ = true) && (F =~ 'ab.*')) || ((_Eval_ = true) && (NA =~ 'ac.*')))"); test("F == 'a' && (IO =~ 'ab.*' || IO =~ 'ac.*')"); - test("F == 'a' && (IO =~ 'ab.*' || NA =~ 'ac.*')", "F == 'a' && (IO =~ 'ab.*' || filter:includeRegex(NA, 'ac.*'))"); - test("F == 'a' && (NA =~ 'ab.*' || NA =~ 'ac.*')", "F == 'a' && (filter:includeRegex(NA, 'ab.*') || filter:includeRegex(NA, 'ac.*'))"); + test("F == 'a' && (IO =~ 'ab.*' || NA =~ 'ac.*')", "F == 'a' && (IO =~ 'ab.*' || ((_Eval_ = true) && (NA =~ 'ac.*')))"); + test("F == 'a' && (NA =~ 'ab.*' || NA =~ 'ac.*')", "F == 'a' && (((_Eval_ = true) && (NA =~ 'ab.*')) || ((_Eval_ = true) && (NA =~ 'ac.*')))"); // index only term and union of regexes with all possible index states - test("IO == 'a' && (F =~ 'ab.*' || F =~ 'ac.*')", "IO == 'a' && (filter:includeRegex(F, 'ab.*') || filter:includeRegex(F, 'ac.*'))"); - test("IO == 'a' && (F =~ 'ab.*' || IO =~ 'ac.*')", "IO == 'a' && (filter:includeRegex(F, 'ab.*') || IO =~ 'ac.*')"); - test("IO == 'a' && (F =~ 'ab.*' || NA =~ 'ac.*')", "IO == 'a' && (filter:includeRegex(F, 'ab.*') || filter:includeRegex(NA, 'ac.*'))"); + test("IO == 'a' && (F =~ 'ab.*' || F =~ 'ac.*')", "IO == 'a' && (((_Eval_ = true) && (F =~ 'ab.*')) || ((_Eval_ = true) && (F =~ 'ac.*')))"); + test("IO == 'a' && (F =~ 'ab.*' || IO =~ 'ac.*')", "IO == 'a' && (((_Eval_ = 
true) && (F =~ 'ab.*')) || IO =~ 'ac.*')"); + test("IO == 'a' && (F =~ 'ab.*' || NA =~ 'ac.*')", "IO == 'a' && (((_Eval_ = true) && (F =~ 'ab.*')) || ((_Eval_ = true) && (NA =~ 'ac.*')))"); test("IO == 'a' && (IO =~ 'ab.*' || IO =~ 'ac.*')"); - test("IO == 'a' && (IO =~ 'ab.*' || NA =~ 'ac.*')", "IO == 'a' && (IO =~ 'ab.*' || filter:includeRegex(NA, 'ac.*'))"); - test("IO == 'a' && (NA =~ 'ab.*' || NA =~ 'ac.*')", "IO == 'a' && (filter:includeRegex(NA, 'ab.*') || filter:includeRegex(NA, 'ac.*'))"); + test("IO == 'a' && (IO =~ 'ab.*' || NA =~ 'ac.*')", "IO == 'a' && (IO =~ 'ab.*' || ((_Eval_ = true) && (NA =~ 'ac.*')))"); + test("IO == 'a' && (NA =~ 'ab.*' || NA =~ 'ac.*')", "IO == 'a' && (((_Eval_ = true) && (NA =~ 'ab.*')) || ((_Eval_ = true) && (NA =~ 'ac.*')))"); // non-indexed tem and union of regexes with all possible index states test("NA == 'a' && (F =~ 'ab.*' || F =~ 'ac.*')"); test("NA == 'a' && (F =~ 'ab.*' || IO =~ 'ac.*')"); - test("NA == 'a' && (F =~ 'ab.*' || NA =~ 'ac.*')", "NA == 'a' && (F =~ 'ab.*' || filter:includeRegex(NA, 'ac.*'))"); + test("NA == 'a' && (F =~ 'ab.*' || NA =~ 'ac.*')", "NA == 'a' && (F =~ 'ab.*' || ((_Eval_ = true) && (NA =~ 'ac.*')))"); test("NA == 'a' && (IO =~ 'ab.*' || IO =~ 'ac.*')"); - test("NA == 'a' && (IO =~ 'ab.*' || NA =~ 'ac.*')", "NA == 'a' && (IO =~ 'ab.*' || filter:includeRegex(NA, 'ac.*'))"); - test("NA == 'a' && (NA =~ 'ab.*' || NA =~ 'ac.*')", "NA == 'a' && (filter:includeRegex(NA, 'ab.*') || filter:includeRegex(NA, 'ac.*'))"); + test("NA == 'a' && (IO =~ 'ab.*' || NA =~ 'ac.*')", "NA == 'a' && (IO =~ 'ab.*' || ((_Eval_ = true) && (NA =~ 'ac.*')))"); + test("NA == 'a' && (NA =~ 'ab.*' || NA =~ 'ac.*')", "NA == 'a' && (((_Eval_ = true) && (NA =~ 'ab.*')) || ((_Eval_ = true) && (NA =~ 'ac.*')))"); } // A or (regex and regex) @Test public void testUnionWithNestedIntersectionOfRegexes() { // indexed term or intersection of regexes with all possible index states - test("F == 'a' || (F =~ 'ab.*' && F =~ 'ac.*')", "F 
== 'a' || (filter:includeRegex(F, 'ab.*') && F =~ 'ac.*')"); - test("F == 'a' || (F =~ 'ab.*' && IO =~ 'ac.*')", "F == 'a' || (filter:includeRegex(F, 'ab.*') && IO =~ 'ac.*')"); - test("F == 'a' || (F =~ 'ab.*' && NA =~ 'ac.*')", "F == 'a' || (F =~ 'ab.*' && filter:includeRegex(NA, 'ac.*'))"); + test("F == 'a' || (F =~ 'ab.*' && F =~ 'ac.*')", "F == 'a' || (((_Eval_ = true) && (F =~ 'ab.*')) && F =~ 'ac.*')"); + test("F == 'a' || (F =~ 'ab.*' && IO =~ 'ac.*')", "F == 'a' || (((_Eval_ = true) && (F =~ 'ab.*')) && IO =~ 'ac.*')"); + test("F == 'a' || (F =~ 'ab.*' && NA =~ 'ac.*')", "F == 'a' || (F =~ 'ab.*' && ((_Eval_ = true) && (NA =~ 'ac.*')))"); test("F == 'a' || (IO =~ 'ab.*' && IO =~ 'ac.*')"); - test("F == 'a' || (IO =~ 'ab.*' && NA =~ 'ac.*')", "F == 'a' || (IO =~ 'ab.*' && filter:includeRegex(NA, 'ac.*'))"); - test("F == 'a' || (NA =~ 'ab.*' && NA =~ 'ac.*')", "F == 'a' || (filter:includeRegex(NA, 'ab.*') && filter:includeRegex(NA, 'ac.*'))"); + test("F == 'a' || (IO =~ 'ab.*' && NA =~ 'ac.*')", "F == 'a' || (IO =~ 'ab.*' && ((_Eval_ = true) && (NA =~ 'ac.*')))"); + test("F == 'a' || (NA =~ 'ab.*' && NA =~ 'ac.*')", "F == 'a' || (((_Eval_ = true) && (NA =~ 'ab.*')) && ((_Eval_ = true) && (NA =~ 'ac.*')))"); // index only term or intersection of regexes with all possible index states - test("IO == 'a' || (F =~ 'ab.*' && F =~ 'ac.*')", "IO == 'a' || (filter:includeRegex(F, 'ab.*') && F =~ 'ac.*')"); - test("IO == 'a' || (F =~ 'ab.*' && IO =~ 'ac.*')", "IO == 'a' || (filter:includeRegex(F, 'ab.*') && IO =~ 'ac.*')"); - test("IO == 'a' || (F =~ 'ab.*' && NA =~ 'ac.*')", "IO == 'a' || (F =~ 'ab.*' && filter:includeRegex(NA, 'ac.*'))"); + test("IO == 'a' || (F =~ 'ab.*' && F =~ 'ac.*')", "IO == 'a' || (((_Eval_ = true) && (F =~ 'ab.*')) && F =~ 'ac.*')"); + test("IO == 'a' || (F =~ 'ab.*' && IO =~ 'ac.*')", "IO == 'a' || (((_Eval_ = true) && (F =~ 'ab.*')) && IO =~ 'ac.*')"); + test("IO == 'a' || (F =~ 'ab.*' && NA =~ 'ac.*')", "IO == 'a' || (F =~ 'ab.*' && 
((_Eval_ = true) && (NA =~ 'ac.*')))"); test("IO == 'a' || (IO =~ 'ab.*' && IO =~ 'ac.*')"); - test("IO == 'a' || (IO =~ 'ab.*' && NA =~ 'ac.*')", "IO == 'a' || (IO =~ 'ab.*' && filter:includeRegex(NA, 'ac.*'))"); - test("IO == 'a' || (NA =~ 'ab.*' && NA =~ 'ac.*')", "IO == 'a' || (filter:includeRegex(NA, 'ab.*') && filter:includeRegex(NA, 'ac.*'))"); + test("IO == 'a' || (IO =~ 'ab.*' && NA =~ 'ac.*')", "IO == 'a' || (IO =~ 'ab.*' && ((_Eval_ = true) && (NA =~ 'ac.*')))"); + test("IO == 'a' || (NA =~ 'ab.*' && NA =~ 'ac.*')", "IO == 'a' || (((_Eval_ = true) && (NA =~ 'ab.*')) && ((_Eval_ = true) && (NA =~ 'ac.*')))"); // non-indexed tem or intersection of regexes with all possible index states - test("NA == 'a' || (F =~ 'ab.*' && F =~ 'ac.*')", "NA == 'a' || (filter:includeRegex(F, 'ab.*') && F =~ 'ac.*')"); - test("NA == 'a' || (F =~ 'ab.*' && IO =~ 'ac.*')", "NA == 'a' || (filter:includeRegex(F, 'ab.*') && IO =~ 'ac.*')"); - test("NA == 'a' || (F =~ 'ab.*' && NA =~ 'ac.*')", "NA == 'a' || (F =~ 'ab.*' && filter:includeRegex(NA, 'ac.*'))"); + test("NA == 'a' || (F =~ 'ab.*' && F =~ 'ac.*')", "NA == 'a' || (((_Eval_ = true) && (F =~ 'ab.*')) && F =~ 'ac.*')"); + test("NA == 'a' || (F =~ 'ab.*' && IO =~ 'ac.*')", "NA == 'a' || (((_Eval_ = true) && (F =~ 'ab.*')) && IO =~ 'ac.*')"); + test("NA == 'a' || (F =~ 'ab.*' && NA =~ 'ac.*')", "NA == 'a' || (F =~ 'ab.*' && ((_Eval_ = true) && (NA =~ 'ac.*')))"); test("NA == 'a' || (IO =~ 'ab.*' && IO =~ 'ac.*')"); - test("NA == 'a' || (IO =~ 'ab.*' && NA =~ 'ac.*')", "NA == 'a' || (IO =~ 'ab.*' && filter:includeRegex(NA, 'ac.*'))"); - test("NA == 'a' || (NA =~ 'ab.*' && NA =~ 'ac.*')", "NA == 'a' || (filter:includeRegex(NA, 'ab.*') && filter:includeRegex(NA, 'ac.*'))"); + test("NA == 'a' || (IO =~ 'ab.*' && NA =~ 'ac.*')", "NA == 'a' || (IO =~ 'ab.*' && ((_Eval_ = true) && (NA =~ 'ac.*')))"); + test("NA == 'a' || (NA =~ 'ab.*' && NA =~ 'ac.*')", "NA == 'a' || (((_Eval_ = true) && (NA =~ 'ab.*')) && ((_Eval_ = true) && (NA =~ 
'ac.*')))"); } // (A or regex) and (B or regex) @Test public void testNestedUnionsWithDistributedRegexes() { String query = "(F == 'a' || F =~ 'ab.*') && (F == 'b' || F =~ 'ac.*')"; - String expected = "(F == 'a' || filter:includeRegex(F, 'ab.*')) && (F == 'b' || F =~ 'ac.*')"; + String expected = "(F == 'a' || ((_Eval_ = true) && (F =~ 'ab.*'))) && (F == 'b' || F =~ 'ac.*')"; test(query, expected); query = "(F == 'a' || NA =~ 'ab.*') && (F == 'b' || F =~ 'ac.*')"; - expected = "(F == 'a' || filter:includeRegex(NA, 'ab.*')) && (F == 'b' || F =~ 'ac.*')"; + expected = "(F == 'a' || ((_Eval_ = true) && (NA =~ 'ab.*'))) && (F == 'b' || F =~ 'ac.*')"; test(query, expected); } @@ -295,7 +295,7 @@ public void testNestedUnionsWithDistributedRegexes() { @Test public void testNestedIntersectionsWithDistributedRegexes() { String query = "(F == 'a' && F =~ 'ab.*') || (F == 'b' && F =~ 'ac.*')"; - String expected = "(F == 'a' && filter:includeRegex(F, 'ab.*')) || (F == 'b' && filter:includeRegex(F, 'ac.*'))"; + String expected = "(F == 'a' && ((_Eval_ = true) && (F =~ 'ab.*'))) || (F == 'b' && ((_Eval_ = true) && (F =~ 'ac.*')))"; test(query, expected); } @@ -303,7 +303,7 @@ public void testNestedIntersectionsWithDistributedRegexes() { @Test public void testPartialAnchorAndNestedUnionRegex() { String query = "(F == 'a' || F == 'b') && (F =~ 'ab.*' || F =~ 'ac.*')"; - String expected = "(F == 'a' || F == 'b') && (filter:includeRegex(F, 'ab.*') || filter:includeRegex(F, 'ac.*'))"; + String expected = "(F == 'a' || F == 'b') && (((_Eval_ = true) && (F =~ 'ab.*')) || ((_Eval_ = true) && (F =~ 'ac.*')))"; test(query, expected); } @@ -311,7 +311,7 @@ public void testPartialAnchorAndNestedUnionRegex() { @Test public void testLeftAnchorAndDeeplyNestedRegex() { String query = "F == 'a' && (F == 'b' || (F == 'c' && F =~ 'ab.*'))"; - String expected = "F == 'a' && (F == 'b' || (F == 'c' && filter:includeRegex(F, 'ab.*')))"; + String expected = "F == 'a' && (F == 'b' || (F == 'c' && 
((_Eval_ = true) && (F =~ 'ab.*'))))"; test(query, expected); } @@ -319,14 +319,14 @@ public void testLeftAnchorAndDeeplyNestedRegex() { @Test public void testRightAnchorAndDeeplyNestedRegex() { String query = "((F =~ 'ab.*' && F == 'c') || F == 'b') && F == 'a'"; - String expected = "((filter:includeRegex(F, 'ab.*') && F == 'c') || F == 'b') && F == 'a'"; + String expected = "((((_Eval_ = true) && (F =~ 'ab.*')) && F == 'c') || F == 'b') && F == 'a'"; test(query, expected); } @Test public void testUnionOfTwoLegalRewrites() { String query = "(F == 'a' && F =~ 'ab.*') || (F == 'b' && F =~ 'ac.*')"; - String expected = "(F == 'a' && filter:includeRegex(F, 'ab.*')) || (F == 'b' && filter:includeRegex(F, 'ac.*'))"; + String expected = "(F == 'a' && ((_Eval_ = true) && (F =~ 'ab.*'))) || (F == 'b' && ((_Eval_ = true) && (F =~ 'ac.*')))"; test(query, expected); } @@ -340,13 +340,13 @@ public void testUnionOfTwoIllegalRewrites() { @Test public void testIncludeFieldsPreventNoRewrites() { withIncludeFields(Set.of("F", "F2")); - test("IO == 'a' && F =~ 'ab.*' && F2 =~ 'ac.*'", "IO == 'a' && filter:includeRegex(F, 'ab.*') && filter:includeRegex(F2, 'ac.*')"); + test("IO == 'a' && F =~ 'ab.*' && F2 =~ 'ac.*'", "IO == 'a' && ((_Eval_ = true) && (F =~ 'ab.*')) && ((_Eval_ = true) && (F2 =~ 'ac.*'))"); } @Test public void testIncludeFieldsPreventSomeLegalRewrites() { withIncludeFields(Set.of("F2")); - test("IO == 'a' && F =~ 'ab.*' && F2 =~ 'ac.*'", "IO == 'a' && F =~ 'ab.*' && filter:includeRegex(F2, 'ac.*')"); + test("IO == 'a' && F =~ 'ab.*' && F2 =~ 'ac.*'", "IO == 'a' && F =~ 'ab.*' && ((_Eval_ = true) && (F2 =~ 'ac.*'))"); } @Test @@ -358,7 +358,7 @@ public void testExcludeFieldsPreventAllLegalRewrites() { @Test public void testExcludeFieldsPreventSomeLegalRewrites() { withExcludeFields(Set.of("F2")); - test("IO == 'a' && F =~ 'ab.*' && F2 =~ 'ac.*'", "IO == 'a' && filter:includeRegex(F, 'ab.*') && F2 =~ 'ac.*'"); + test("IO == 'a' && F =~ 'ab.*' && F2 =~ 'ac.*'", "IO == 
'a' && ((_Eval_ = true) && (F =~ 'ab.*')) && F2 =~ 'ac.*'"); } @Test @@ -374,7 +374,7 @@ public void testPatternBeatsExcludeFields() { withPattern("F", "zz.*"); withExcludeFields(Set.of("F")); // pattern beats exclude fields - test("IO == 'a' && F =~ 'zz.*'", "IO == 'a' && filter:includeRegex(F, 'zz.*')"); + test("IO == 'a' && F =~ 'zz.*'", "IO == 'a' && ((_Eval_ = true) && (F =~ 'zz.*'))"); } @Test @@ -382,7 +382,7 @@ public void testPatternBeatsIncludeFields() { withPattern("F", "zz.*"); withIncludeFields(Set.of("F2")); // pattern beats include fields - test("IO == 'a' && F =~ 'zz.*'", "IO == 'a' && filter:includeRegex(F, 'zz.*')"); + test("IO == 'a' && F =~ 'zz.*'", "IO == 'a' && ((_Eval_ = true) && (F =~ 'zz.*'))"); } @Test @@ -391,7 +391,7 @@ public void testPatternBeatsIncludeAndExcludeFields() { withIncludeFields(Set.of("F2")); withExcludeFields(Set.of("F")); // pattern beats include fields - test("IO == 'a' && F =~ 'zz.*'", "IO == 'a' && filter:includeRegex(F, 'zz.*')"); + test("IO == 'a' && F =~ 'zz.*'", "IO == 'a' && ((_Eval_ = true) && (F =~ 'zz.*'))"); } /** diff --git a/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/pushdown/AnchorDetectionVisitorTest.java b/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/pushdown/AnchorDetectionVisitorTest.java index 6b63061f7a3..a9be296a03c 100644 --- a/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/pushdown/AnchorDetectionVisitorTest.java +++ b/warehouse/query-core/src/test/java/datawave/query/jexl/visitors/pushdown/AnchorDetectionVisitorTest.java @@ -115,8 +115,6 @@ void testMarkers() { // @formatter:off String[] anchorMarkers = new String[] { "((_Bounded_ = true) && (F > '2' && F < '5'))", - "((_Delayed_ = true) && (F == '1'))", - "((_Eval_ = true) && (F == '1'))", "((_List_ = true) && ((id = 'id') && (field = 'F') && (params = '{\"ranges\":[[\"[r1\",\"r2]\"],[\"[r3\",\"f4]\"]]}')))", "((_Value_ = true) && (F =~ 'ba.*'))", "((_Term_ = true) && (_ANYFIELD_ =~ 
'ba.*'))" @@ -127,6 +125,8 @@ void testMarkers() { // @formatter:off String[] nonAnchorMarkers = new String[]{ + "((_Delayed_ = true) && (F == '1'))", + "((_Eval_ = true) && (F == '1'))", "((_Hole_ = true) && (F == '1'))", "((_Drop_ = true) && (F == '1'))", "((_Lenient_ = true) && (F == '1'))", From 0edacf738abb3ea720bff2e73a9c1f48655e4824 Mon Sep 17 00:00:00 2001 From: Moriarty <22225248+apmoriarty@users.noreply.github.com> Date: Tue, 5 Nov 2024 16:55:54 +0000 Subject: [PATCH 3/3] Extract regex options into class that supports pre index expansion and post index expansion operations --- .../query/planner/DefaultQueryPlanner.java | 65 ++++++-------- .../query/planner/RegexRewriteOptions.java | 87 +++++++++++++++++++ .../datawave/query/QueryLogicFactory.xml | 17 +++- 3 files changed, 127 insertions(+), 42 deletions(-) create mode 100644 warehouse/query-core/src/main/java/datawave/query/planner/RegexRewriteOptions.java diff --git a/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java b/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java index 1caf556bb7d..cf455b2a68b 100644 --- a/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java +++ b/warehouse/query-core/src/main/java/datawave/query/planner/DefaultQueryPlanner.java @@ -309,10 +309,7 @@ public class DefaultQueryPlanner extends QueryPlanner implements Cloneable { /** * Controls optimistic rewriting of regex terms as filter functions, preserving overall query executability */ - protected boolean rewriteRegexTerms = false; - protected Set regexIncludeFields; - protected Set regexExcludeFields; - protected Set regexRewritePatterns; + private RegexRewriteOptions regexRewriteOptions; // handles boilerplate operations that surround a visitor's execution (e.g., timers, logging, validating) private TimedVisitorManager visitorManager = new TimedVisitorManager(); @@ -349,6 +346,7 @@ protected DefaultQueryPlanner(DefaultQueryPlanner 
other) { rangeStreamClass = other.rangeStreamClass; setSourceLimit(other.sourceLimit); setPushdownThreshold(other.getPushdownThreshold()); + setRegexRewriteOptions(other.getRegexRewriteOptions()); setVisitorManager(other.getVisitorManager()); setTransformRules(other.getTransformRules() == null ? null : new ArrayList<>(other.transformRules)); } @@ -823,8 +821,13 @@ protected ASTJexlScript updateQueryTree(ScannerFactory scannerFactory, MetadataH } // rewrite regex nodes, optimistically - if (rewriteRegexTerms) { - RewriteRegexVisitor.rewrite(config.getQueryTree(), indexedFields, indexOnlyFields, regexIncludeFields, regexExcludeFields, regexRewritePatterns); + if (regexRewriteOptions != null && regexRewriteOptions.isPreExpansionEnabled()) { + // @formatter:off + RewriteRegexVisitor.rewrite(config.getQueryTree(), indexedFields, indexOnlyFields, + regexRewriteOptions.getPreExpansionIncludeFields(), + regexRewriteOptions.getPreExpansionExcludeFields(), + regexRewriteOptions.getPreExpansionPatterns()); + // @formatter:on } if (disableBoundedLookup) { @@ -986,6 +989,16 @@ protected ASTJexlScript processTree(final ASTJexlScript originalQueryTree, Shard config.setQueryTree(timedPushFunctions(timers, config.getQueryTree(), config, metadataHelper)); } + // rewrite regex nodes, optimistically + if (regexRewriteOptions != null && regexRewriteOptions.isPostExpansionEnabled()) { + // @formatter:off + RewriteRegexVisitor.rewrite(config.getQueryTree(), indexedFields, indexOnlyFields, + regexRewriteOptions.getPostExpansionIncludeFields(), + regexRewriteOptions.getPostExpansionExcludeFields(), + regexRewriteOptions.getPostExpansionPatterns()); + // @formatter:on + } + if (executableExpansion) { config.setQueryTree(timedExecutableExpansion(timers, config.getQueryTree(), config, metadataHelper)); } @@ -3254,38 +3267,6 @@ public static void setMaxTermsToPrint(int maxTermsToPrint) { DefaultQueryPlanner.maxTermsToPrint = maxTermsToPrint; } - public boolean isRewriteRegexTerms() { - return 
rewriteRegexTerms; - } - - public void setRewriteRegexTerms(boolean rewriteRegexTerms) { - this.rewriteRegexTerms = rewriteRegexTerms; - } - - public Set getRegexIncludeFields() { - return regexIncludeFields; - } - - public void setRegexIncludeFields(Set regexIncludeFields) { - this.regexIncludeFields = regexIncludeFields; - } - - public Set getRegexExcludeFields() { - return regexExcludeFields; - } - - public void setRegexExcludeFields(Set regexExcludeFields) { - this.regexExcludeFields = regexExcludeFields; - } - - public Set getRegexRewritePatterns() { - return regexRewritePatterns; - } - - public void setRegexRewritePatterns(Set regexRewritePatterns) { - this.regexRewritePatterns = regexRewritePatterns; - } - /** * Given a date, truncate it to year, month, date and increment the day by one to determine the following day. * @@ -3304,4 +3285,12 @@ public void finalize() { builderThread.shutdown(); } } + + public RegexRewriteOptions getRegexRewriteOptions() { + return regexRewriteOptions; + } + + public void setRegexRewriteOptions(RegexRewriteOptions regexRewriteOptions) { + this.regexRewriteOptions = regexRewriteOptions; + } } diff --git a/warehouse/query-core/src/main/java/datawave/query/planner/RegexRewriteOptions.java b/warehouse/query-core/src/main/java/datawave/query/planner/RegexRewriteOptions.java new file mode 100644 index 00000000000..966a957ec87 --- /dev/null +++ b/warehouse/query-core/src/main/java/datawave/query/planner/RegexRewriteOptions.java @@ -0,0 +1,87 @@ +package datawave.query.planner; + +import java.util.Collections; +import java.util.Set; + +import datawave.query.jexl.visitors.RegexRewritePattern; +import datawave.query.jexl.visitors.RewriteRegexVisitor; + +/** + * Provides fine-grain control over how the {@link RewriteRegexVisitor} operates pre and post index expansion + */ +public class RegexRewriteOptions { + + private boolean preExpansionEnabled = false; + private Set preExpansionIncludeFields = Collections.emptySet(); + private Set 
preExpansionExcludeFields = Collections.emptySet(); + private Set preExpansionPatterns = Collections.emptySet(); + + private boolean postExpansionEnabled = false; + private Set postExpansionIncludeFields = Collections.emptySet(); + private Set postExpansionExcludeFields = Collections.emptySet(); + private Set postExpansionPatterns = Collections.emptySet(); + + public boolean isPreExpansionEnabled() { + return preExpansionEnabled; + } + + public void setPreExpansionEnabled(boolean preExpansionEnabled) { + this.preExpansionEnabled = preExpansionEnabled; + } + + public Set getPreExpansionIncludeFields() { + return preExpansionIncludeFields; + } + + public void setPreExpansionIncludeFields(Set preExpansionIncludeFields) { + this.preExpansionIncludeFields = preExpansionIncludeFields; + } + + public Set getPreExpansionExcludeFields() { + return preExpansionExcludeFields; + } + + public void setPreExpansionExcludeFields(Set preExpansionExcludeFields) { + this.preExpansionExcludeFields = preExpansionExcludeFields; + } + + public Set getPreExpansionPatterns() { + return preExpansionPatterns; + } + + public void setPreExpansionPatterns(Set preExpansionPatterns) { + this.preExpansionPatterns = preExpansionPatterns; + } + + public boolean isPostExpansionEnabled() { + return postExpansionEnabled; + } + + public void setPostExpansionEnabled(boolean postExpansionEnabled) { + this.postExpansionEnabled = postExpansionEnabled; + } + + public Set getPostExpansionIncludeFields() { + return postExpansionIncludeFields; + } + + public void setPostExpansionIncludeFields(Set postExpansionIncludeFields) { + this.postExpansionIncludeFields = postExpansionIncludeFields; + } + + public Set getPostExpansionExcludeFields() { + return postExpansionExcludeFields; + } + + public void setPostExpansionExcludeFields(Set postExpansionExcludeFields) { + this.postExpansionExcludeFields = postExpansionExcludeFields; + } + + public Set getPostExpansionPatterns() { + return postExpansionPatterns; + } + + 
public void setPostExpansionPatterns(Set postExpansionPatterns) { + this.postExpansionPatterns = postExpansionPatterns; + } +} diff --git a/warehouse/query-core/src/test/resources/datawave/query/QueryLogicFactory.xml b/warehouse/query-core/src/test/resources/datawave/query/QueryLogicFactory.xml index 29401d03c43..4c4e5f4fd3b 100644 --- a/warehouse/query-core/src/test/resources/datawave/query/QueryLogicFactory.xml +++ b/warehouse/query-core/src/test/resources/datawave/query/QueryLogicFactory.xml @@ -368,12 +368,21 @@ + - - - - + + + + + + + + + + + +