CERT-Polska · msm-cert · Oct 1, 2024 · Oct 1, 2024
diff --git a/libursa/QueryOptimizer.cpp b/libursa/QueryOptimizer.cpp
@@ -144,6 +144,61 @@ Query propagate_degenerate_queries(Query &&q, bool *changed) {
     return std::move(q);
 }
 
+// Heuristic sort key for a subquery: lower values should be evaluated first.
+// Ideally it would measure the chance that the query returns zero results (or
+// how many files we expect to get), weighted by the query cost: 100 queries
+// for a 10% chance of an empty result is worse than 2 queries for a 15% chance.
+// The current implementation is naive: it only looks at the query type and,
+// for primitives, at the index type and ngram value.
+// Returns (rank << 24), plus the ngram value for primitive queries.
+// Throws std::runtime_error if the query type or index type is not handled.
+uint32_t query_heuristic_cost(const Query &q) {
+    switch (q.get_type()) {  // Empirically, type order matters little.
+        case QueryType::PRIMITIVE:
+            // Sort by ngram type, then by ngram value, alphabetically first.
+            // This is (un)surprisingly important for two reasons:
+            // 1. we read sequentially as many ngrams as possible.
+            // 2. consecutive ngrams are independent: (abc, bcd) vs (abc, def).
+            // Use smaller indexes first, because they're faster to read.
+            switch (q.as_ngram().itype) {
+                case IndexType::WIDE8:
+                    return (0 << 24) + q.as_ngram().trigram;
+                case IndexType::TEXT4:
+                    return (1 << 24) + q.as_ngram().trigram;
+                case IndexType::HASH4:
+                    return (2 << 24) + q.as_ngram().trigram;
+                case IndexType::GRAM3:
+                    return (3 << 24) + q.as_ngram().trigram;
+            }
+            break;  // Unknown index type: throw below, don't fall into AND.
+        case QueryType::AND:
+            return 4 << 24;
+        case QueryType::MIN_OF:
+            return 5 << 24;
+        case QueryType::OR:
+            // OR is the worst operation, since it always needs to scan
+            // all of its arguments (no chance of early exit).
+            return 6 << 24;
+    }
+    throw std::runtime_error("Unexpected query/index type.");
+}
+
+// Sort predicate: true when `left` has a lower heuristic cost than `right`.
+bool query_heuristic_comparer(const Query &left, const Query &right) {
+    return query_heuristic_cost(right) > query_heuristic_cost(left);
+}
+
+// Stable-sort an AND query's children by heuristic cost (cheapest first) to
+// favour early exit. Run once after all other optimizations; no loop needed.
+Query reorder_subqueries(Query &&q) {
+    if (q.get_type() != QueryType::AND) {
+        return std::move(q);  // Currently only support AND operators.
+    }
+    auto &&subs = q.as_queries();
+    std::stable_sort(subs.begin(), subs.end(), query_heuristic_comparer);
+    return std::move(q);
+}
+
 Query q_optimize(Query &&q) {
     if (q.get_type() == QueryType::PRIMITIVE) {
         // Nothing to improve here.
@@ -160,6 +215,6 @@ Query q_optimize(Query &&q) {
         q = simplify_minof(std::move(q), &changed);
         q = propagate_degenerate_queries(std::move(q), &changed);
     }
-
+    q = reorder_subqueries(std::move(q));
     return std::move(q);
 }
diff --git a/libursa/Version.h.in b/libursa/Version.h.in
@@ -9,5 +9,5 @@ constexpr std::string_view ursadb_format_version = "1.5.0";
 // Project version.
 // Consider updating the version tag when doing PRs.
 // clang-format off
-constexpr std::string_view ursadb_version_string = "@PROJECT_VERSION@+opt5";
+constexpr std::string_view ursadb_version_string = "@PROJECT_VERSION@+opt6";
 // clang-format on