CERT-Polska · msm-cert · Oct 6, 2024 · Oct 4, 2024 · Oct 6, 2024 · Oct 6, 2024
diff --git a/libursa/OnDiskDataset.cpp b/libursa/OnDiskDataset.cpp
@@ -108,7 +108,7 @@ void OnDiskDataset::execute(const Query &query, ResultWriter *out,
         files_index->for_each_filename(
             [&out](const std::string &fname) { out->push_back(fname); });
     } else {
-        for (const auto &fid : result.vector()) {
+        for (const auto &fid : result.vector().decompressed()) {
             out->push_back(get_file_name(fid));
         }
     }

diff --git a/libursa/OnDiskIndex.cpp b/libursa/OnDiskIndex.cpp
@@ -74,8 +74,7 @@ std::pair<uint64_t, uint64_t> OnDiskIndex::get_run_offsets(
     return std::make_pair(ptrs[0], ptrs[1]);
 }
 
-std::vector<FileId> OnDiskIndex::get_run(uint64_t ptr,
-                                         uint64_t next_ptr) const {
+SortedRun OnDiskIndex::get_run(uint64_t ptr, uint64_t next_ptr) const {
     uint64_t run_length = next_ptr - ptr;
 
     if (ptr > next_ptr || next_ptr > index_size) {
@@ -86,12 +85,11 @@ std::vector<FileId> OnDiskIndex::get_run(uint64_t ptr,
 
     std::vector<uint8_t> run_bytes(run_length);
     ndxfile.pread(run_bytes.data(), run_length, ptr);
-    return read_compressed_run(run_bytes.data(),
-                               run_bytes.data() + run_bytes.size());
+    return SortedRun(std::move(run_bytes));
 }
 
-std::vector<FileId> OnDiskIndex::query_primitive(TriGram trigram,
-                                                 QueryCounter *counter) const {
+SortedRun OnDiskIndex::query_primitive(TriGram trigram,
+                                       QueryCounter *counter) const {
     auto op = QueryOperation(counter);
     std::pair<uint64_t, uint64_t> offsets = get_run_offsets(trigram);
     return get_run(offsets.first, offsets.second);

diff --git a/libursa/OnDiskIndex.h b/libursa/OnDiskIndex.h
@@ -21,9 +21,8 @@ class OnDiskIndex {
     IndexType ntype;
 
     static constexpr uint32_t VERSION = 6;
-    std::vector<FileId> get_run(uint64_t ptr, uint64_t next_ptr) const;
-    std::vector<FileId> query_primitive(TriGram trigram,
-                                        QueryCounter *counter) const;
+    SortedRun get_run(uint64_t ptr, uint64_t next_ptr) const;
+    SortedRun query_primitive(TriGram trigram, QueryCounter *counter) const;
     std::pair<uint64_t, uint64_t> get_run_offsets(TriGram trigram) const;
 
     static void on_disk_merge_core(const std::vector<IndexMergeHelper> &indexes,

diff --git a/libursa/Query.cpp b/libursa/Query.cpp
@@ -226,7 +226,6 @@ void Query::prefetch(int from_index, int howmany, bool only_last,
             if (only_last && (i + 1 != howmany)) {
                 continue;
             }
-            spdlog::debug("prefetching {}", ndx);
             prefetcher(queries[ndx].ngram);
         }
     }
@@ -260,7 +259,7 @@ QueryResult Query::run(const QueryPrimitive &primitive,
     // Case: or. Short circuits when result is already everything.
     if (type == QueryType::OR) {
         auto result = QueryResult::empty();
-        for (const auto &query : queries) {
+        for (auto &query : queries) {
             result.do_or(query.run(primitive, prefetcher, counters),
                          &counters->ors());
             if (result.is_everything()) {
@@ -276,7 +275,7 @@ QueryResult Query::run(const QueryPrimitive &primitive,
     // There is some logic duplication here and in QueryResult::do_min_of_real.
     if (type == QueryType::MIN_OF) {
         std::vector<QueryResult> results;
-        std::vector<const QueryResult *> results_ptrs;
+        std::vector<QueryResult *> results_ptrs;
         results.reserve(queries.size());
         results_ptrs.reserve(queries.size());
         int cutoff = count;

diff --git a/libursa/QueryOptimizer.cpp b/libursa/QueryOptimizer.cpp
@@ -32,7 +32,7 @@ Query simplify_subqueries(Query &&q) {
     return std::move(Query(q.get_type(), std::move(newqueries)));
 }
 
-// This optimization simplifies trivial (one operant) operations:
+// This optimization simplifies trivial (one operand) operations:
 // AND(x) --> x
 // OR(x)  --> x
 Query flatten_trivial_operations(Query &&q, bool *changed) {

diff --git a/libursa/QueryResult.cpp b/libursa/QueryResult.cpp
@@ -2,31 +2,31 @@
 
 #include <algorithm>
 
-void QueryResult::do_or(const QueryResult &other, QueryCounter *counter) {
+void QueryResult::do_or(QueryResult &&other, QueryCounter *counter) {
     auto op = QueryOperation(counter);
     if (this->is_everything() || other.is_everything()) {
         has_everything = true;
         results = SortedRun();
     } else {
         results.do_or(other.results);
     }
 }
 
-void QueryResult::do_and(const QueryResult &other, QueryCounter *counter) {
+void QueryResult::do_and(QueryResult &&other, QueryCounter *counter) {
     auto op = QueryOperation(counter);
     if (other.is_everything()) {
     } else if (this->is_everything()) {
-        results = other.results;
+        results = std::move(other.results);  // other is an rvalue: hand it over
         has_everything = other.has_everything;
     } else {
         results.do_and(other.results);
     }
 }
 
-QueryResult QueryResult::do_min_of_real(
-    int cutoff, const std::vector<const QueryResult *> &sources) {
-    std::vector<const SortedRun *> nontrivial_sources;
-    for (const auto *source : sources) {
+QueryResult QueryResult::do_min_of_real(int cutoff,
+                                        std::vector<QueryResult *> &sources) {
+    std::vector<SortedRun *> nontrivial_sources;
+    for (QueryResult *source : sources) {
         if (source->is_everything()) {
             cutoff -= 1;
         } else if (!source->is_empty()) {
@@ -66,9 +66,9 @@ QueryResult QueryResult::do_min_of_real(
     return QueryResult(SortedRun::pick_common(cutoff, nontrivial_sources));
 }
 
-QueryResult QueryResult::do_min_of(
-    int cutoff, const std::vector<const QueryResult *> &sources,
-    QueryCounter *counter) {
+QueryResult QueryResult::do_min_of(int cutoff,
+                                   std::vector<QueryResult *> &sources,
+                                   QueryCounter *counter) {
     // TODO: sources can be mutable here, to save us some copies later.
     QueryOperation op(counter);
     QueryResult out{do_min_of_real(cutoff, sources)};

diff --git a/libursa/QueryResult.h b/libursa/QueryResult.h
@@ -15,8 +15,8 @@ class QueryResult {
 
     QueryResult() : results{}, has_everything{true} {}
 
-    static QueryResult do_min_of_real(
-        int cutoff, const std::vector<const QueryResult *> &sources);
+    static QueryResult do_min_of_real(int cutoff,
+                                      std::vector<QueryResult *> &sources);
 
    public:
     QueryResult(QueryResult &&other) = default;
@@ -28,12 +28,12 @@ class QueryResult {
 
     static QueryResult everything() { return QueryResult(); }
 
-    void do_or(const QueryResult &other, QueryCounter *counter);
-    void do_and(const QueryResult &other, QueryCounter *counter);
+    void do_or(QueryResult &&other, QueryCounter *counter);
+    void do_and(QueryResult &&other, QueryCounter *counter);
 
-    static QueryResult do_min_of(
-        int cutoff, const std::vector<const QueryResult *> &sources,
-        QueryCounter *counter);
+    static QueryResult do_min_of(int cutoff,
+                                 std::vector<QueryResult *> &sources,
+                                 QueryCounter *counter);
 
     // If true, means that QueryResults represents special "uninitialized"
     // value, "set of all FileIds in DataSet".
@@ -44,4 +44,5 @@ class QueryResult {
     bool is_empty() const { return !has_everything && results.empty(); }
 
     const SortedRun &vector() const { return results; }
+    SortedRun &vector() { return results; }
 };
diff --git a/libursa/SortedRun.cpp b/libursa/SortedRun.cpp
@@ -1,23 +1,100 @@
 #include "SortedRun.h"
 
 #include <algorithm>
+#include <stdexcept>
 
-void SortedRun::do_or(const SortedRun &other) {
+#include "Utils.h"
+
+// Decodes (without advancing) the varint delta at pos_; yields prev_+delta+1.
+uint32_t RunIterator::current() const {
+    uint64_t acc = 0;
+    uint32_t shift = 0;
+    for (const uint8_t *it = pos_;; ++it, shift += 7U) {
+        // Widen first: a 32-bit shift would be UB once shift reaches 32.
+        acc += static_cast<uint64_t>(*it & 0x7FU) << shift;
+        if ((*it & 0x80U) == 0) {
+            return prev_ + acc + 1;
+        }
+    }
+}
+
+// Returns the address one past the last byte of the varint starting at pos_.
+uint8_t *RunIterator::nextpos() {
+    uint8_t *last = pos_;
+    // Skip continuation bytes (high bit set) to find the terminating byte.
+    while ((*last & 0x80) != 0) ++last;
+    return last + 1;
+}
+
+// Throws if a non-empty run is not in the `expected` compression state.
+void SortedRun::validate_compression(bool expected) {
+    if (empty() || is_compressed() == expected) return;
+    throw std::runtime_error("Run was in invalid compression state");
+}
+
+std::vector<uint32_t>::iterator SortedRun::begin() {
+    validate_compression(false);  // throws if non-empty and still compressed
+    return sequence_.begin();
+}
+
+std::vector<uint32_t>::iterator SortedRun::end() {
+    validate_compression(false);  // throws if non-empty and still compressed
+    return sequence_.end();
+}
+
+RunIterator SortedRun::comp_begin() {
+    validate_compression(true);  // throws if non-empty and not compressed
+    return RunIterator(run_.data());  // first byte of the compressed run
+}
+
+RunIterator SortedRun::comp_end() {
+    validate_compression(true);  // throws if non-empty and not compressed
+    return RunIterator(run_.data() + run_.size());  // one past last byte
+}
+
+void SortedRun::do_or(SortedRun &other) {
+    // sequence_ feeds the union below; this is normally a no-op.
+    decompress();
     std::vector<FileId> new_results;
-    std::set_union(other.begin(), other.end(), sequence_.begin(),
-                   sequence_.end(), std::back_inserter(new_results));
+    if (!other.is_compressed()) {
+        std::set_union(other.begin(), other.end(), begin(), end(),
+                       std::back_inserter(new_results));
+    } else {
+        // Rare path: merge directly from the other run's compressed bytes.
+        std::set_union(other.comp_begin(), other.comp_end(), begin(), end(),
+                       std::back_inserter(new_results));
+    }
     std::swap(new_results, sequence_);
 }
 
-void SortedRun::do_and(const SortedRun &other) {
-    auto new_end =
-        std::set_intersection(other.begin(), other.end(), sequence_.begin(),
-                              sequence_.end(), sequence_.begin());
+void SortedRun::do_and(SortedRun &other) {
+    // Only `other` is consumed in compressed form: benchmarks showed that
+    // also handling a compressed `this` was slower (likely memory effects).
+    decompress();
+    std::vector<uint32_t>::iterator new_end;
+    if (!other.is_compressed()) {
+        new_end = std::set_intersection(other.begin(), other.end(), begin(),
+                                        end(), begin());
+    } else {
+        new_end = std::set_intersection(other.comp_begin(), other.comp_end(),
+                                        begin(), end(), begin());
+    }
     sequence_.erase(new_end, sequence_.end());
 }
 
-SortedRun SortedRun::pick_common(
-    int cutoff, const std::vector<const SortedRun *> &sources) {
+// Expands run_ into sequence_, then releases the compressed buffer.
+void SortedRun::decompress() {
+    if (run_.empty()) {
+        return;
+    }
+    uint8_t *const first = run_.data();
+    sequence_ = read_compressed_run(first, first + run_.size());
+    // Swap with a temporary so the buffer's capacity is released as well.
+    std::vector<uint8_t>().swap(run_);
+}
+
+SortedRun SortedRun::pick_common(int cutoff,
+                                 std::vector<SortedRun *> &sources) {
     // returns all FileIds which appear at least `cutoff` times among provided
     // `sources`
     using FileIdRange = std::pair<std::vector<FileId>::const_iterator,
@@ -27,9 +104,9 @@ SortedRun SortedRun::pick_common(
     heads.reserve(sources.size());
 
     for (auto source : sources) {
+        source->decompress();
         if (!source->empty()) {
-            heads.emplace_back(
-                std::make_pair(source->cbegin(), source->cend()));
+            heads.emplace_back(std::make_pair(source->begin(), source->end()));
         }
     }
 
@@ -70,3 +147,8 @@ SortedRun SortedRun::pick_common(
 
     return SortedRun(std::move(result));
 }
+
+const std::vector<uint32_t> &SortedRun::decompressed() {
+    decompress();  // no-op when the run is already decompressed
+    return sequence_;
+}