Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

opt8: keep runs in compressed form #227

Merged
merged 4 commits into from
Oct 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion libursa/OnDiskDataset.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ void OnDiskDataset::execute(const Query &query, ResultWriter *out,
files_index->for_each_filename(
[&out](const std::string &fname) { out->push_back(fname); });
} else {
for (const auto &fid : result.vector()) {
for (const auto &fid : result.vector().decompressed()) {
out->push_back(get_file_name(fid));
}
}
Expand Down
10 changes: 4 additions & 6 deletions libursa/OnDiskIndex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,7 @@ std::pair<uint64_t, uint64_t> OnDiskIndex::get_run_offsets(
return std::make_pair(ptrs[0], ptrs[1]);
}

std::vector<FileId> OnDiskIndex::get_run(uint64_t ptr,
uint64_t next_ptr) const {
SortedRun OnDiskIndex::get_run(uint64_t ptr, uint64_t next_ptr) const {
uint64_t run_length = next_ptr - ptr;

if (ptr > next_ptr || next_ptr > index_size) {
Expand All @@ -86,12 +85,11 @@ std::vector<FileId> OnDiskIndex::get_run(uint64_t ptr,

std::vector<uint8_t> run_bytes(run_length);
ndxfile.pread(run_bytes.data(), run_length, ptr);
return read_compressed_run(run_bytes.data(),
run_bytes.data() + run_bytes.size());
return SortedRun(std::move(run_bytes));
}

std::vector<FileId> OnDiskIndex::query_primitive(TriGram trigram,
QueryCounter *counter) const {
SortedRun OnDiskIndex::query_primitive(TriGram trigram,
QueryCounter *counter) const {
auto op = QueryOperation(counter);
std::pair<uint64_t, uint64_t> offsets = get_run_offsets(trigram);
return get_run(offsets.first, offsets.second);
Expand Down
5 changes: 2 additions & 3 deletions libursa/OnDiskIndex.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,8 @@ class OnDiskIndex {
IndexType ntype;

static constexpr uint32_t VERSION = 6;
std::vector<FileId> get_run(uint64_t ptr, uint64_t next_ptr) const;
std::vector<FileId> query_primitive(TriGram trigram,
QueryCounter *counter) const;
SortedRun get_run(uint64_t ptr, uint64_t next_ptr) const;
SortedRun query_primitive(TriGram trigram, QueryCounter *counter) const;
std::pair<uint64_t, uint64_t> get_run_offsets(TriGram trigram) const;

static void on_disk_merge_core(const std::vector<IndexMergeHelper> &indexes,
Expand Down
5 changes: 2 additions & 3 deletions libursa/Query.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,6 @@ void Query::prefetch(int from_index, int howmany, bool only_last,
if (only_last && (i + 1 != howmany)) {
continue;
}
spdlog::debug("prefetching {}", ndx);
prefetcher(queries[ndx].ngram);
}
}
Expand Down Expand Up @@ -260,7 +259,7 @@ QueryResult Query::run(const QueryPrimitive &primitive,
// Case: or. Short circuits when result is already everything.
if (type == QueryType::OR) {
auto result = QueryResult::empty();
for (const auto &query : queries) {
for (auto &query : queries) {
result.do_or(query.run(primitive, prefetcher, counters),
&counters->ors());
if (result.is_everything()) {
Expand All @@ -276,7 +275,7 @@ QueryResult Query::run(const QueryPrimitive &primitive,
// There is some logic duplication here and in QueryResult::do_min_of_real.
if (type == QueryType::MIN_OF) {
std::vector<QueryResult> results;
std::vector<const QueryResult *> results_ptrs;
std::vector<QueryResult *> results_ptrs;
results.reserve(queries.size());
results_ptrs.reserve(queries.size());
int cutoff = count;
Expand Down
2 changes: 1 addition & 1 deletion libursa/QueryOptimizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ Query simplify_subqueries(Query &&q) {
return std::move(Query(q.get_type(), std::move(newqueries)));
}

// This optimization simplifies trivial (one operant) operations:
// This optimization simplifies trivial (one operand) operations:
// AND(x) --> x
// OR(x) --> x
Query flatten_trivial_operations(Query &&q, bool *changed) {
Expand Down
22 changes: 11 additions & 11 deletions libursa/QueryResult.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,31 +2,31 @@

#include <algorithm>

void QueryResult::do_or(const QueryResult &other, QueryCounter *counter) {
// Merges `other` into this result as a set union.
//
// `other` is taken by rvalue reference; its `results` are handed to
// SortedRun::do_or, which takes a mutable reference. `counter` is passed to a
// QueryOperation that lives for the duration of the call.
void QueryResult::do_or(QueryResult &&other, QueryCounter *counter) {
    auto op = QueryOperation(counter);
    if (this->is_everything() || other.is_everything()) {
        // "Everything" absorbs any union: set the flag and drop the explicit
        // run. A temporary is already an rvalue, so no std::move is needed.
        has_everything = true;
        results = SortedRun();
    } else {
        results.do_or(other.results);
    }
}

void QueryResult::do_and(const QueryResult &other, QueryCounter *counter) {
// Intersects this result with `other`.
//
// `other` is taken by rvalue reference so its run can be moved from rather
// than copied. `counter` is passed to a QueryOperation that lives for the
// duration of the call.
void QueryResult::do_and(QueryResult &&other, QueryCounter *counter) {
    auto op = QueryOperation(counter);

    // Intersecting with "everything" leaves this result untouched.
    if (other.is_everything()) {
        return;
    }

    if (this->is_everything()) {
        // This side imposes no constraint, so the answer is exactly `other`:
        // take over its run (a single move, no intermediate copy).
        results = std::move(other.results);
        has_everything = other.has_everything;
    } else {
        results.do_and(other.results);
    }
}

QueryResult QueryResult::do_min_of_real(
int cutoff, const std::vector<const QueryResult *> &sources) {
std::vector<const SortedRun *> nontrivial_sources;
for (const auto *source : sources) {
QueryResult QueryResult::do_min_of_real(int cutoff,
std::vector<QueryResult *> &sources) {
std::vector<SortedRun *> nontrivial_sources;
for (QueryResult *source : sources) {
if (source->is_everything()) {
cutoff -= 1;
} else if (!source->is_empty()) {
Expand Down Expand Up @@ -66,9 +66,9 @@ QueryResult QueryResult::do_min_of_real(
return QueryResult(SortedRun::pick_common(cutoff, nontrivial_sources));
}

QueryResult QueryResult::do_min_of(
int cutoff, const std::vector<const QueryResult *> &sources,
QueryCounter *counter) {
QueryResult QueryResult::do_min_of(int cutoff,
std::vector<QueryResult *> &sources,
QueryCounter *counter) {
// TODO: sources can be mutable here, to save us some copies later.
QueryOperation op(counter);
QueryResult out{do_min_of_real(cutoff, sources)};
Expand Down
15 changes: 8 additions & 7 deletions libursa/QueryResult.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ class QueryResult {

QueryResult() : results{}, has_everything{true} {}

static QueryResult do_min_of_real(
int cutoff, const std::vector<const QueryResult *> &sources);
static QueryResult do_min_of_real(int cutoff,
std::vector<QueryResult *> &sources);

public:
QueryResult(QueryResult &&other) = default;
Expand All @@ -28,12 +28,12 @@ class QueryResult {

static QueryResult everything() { return QueryResult(); }

void do_or(const QueryResult &other, QueryCounter *counter);
void do_and(const QueryResult &other, QueryCounter *counter);
void do_or(QueryResult &&other, QueryCounter *counter);
void do_and(QueryResult &&other, QueryCounter *counter);

static QueryResult do_min_of(
int cutoff, const std::vector<const QueryResult *> &sources,
QueryCounter *counter);
static QueryResult do_min_of(int cutoff,
std::vector<QueryResult *> &sources,
QueryCounter *counter);

// If true, means that QueryResults represents special "uninitialized"
// value, "set of all FileIds in DataSet".
Expand All @@ -44,4 +44,5 @@ class QueryResult {
bool is_empty() const { return !has_everything && results.empty(); }

const SortedRun &vector() const { return results; }
SortedRun &vector() { return results; }
};
104 changes: 93 additions & 11 deletions libursa/SortedRun.cpp
Original file line number Diff line number Diff line change
@@ -1,23 +1,100 @@
#include "SortedRun.h"

#include <algorithm>
#include <stdexcept>

void SortedRun::do_or(const SortedRun &other) {
#include "Utils.h"

// Decodes the variable-length integer that starts at `pos_` and returns
// `prev_ + decoded + 1`.
//
// Encoding as read here: each byte carries 7 payload bits, least-significant
// group first; a set high bit (0x80) means another byte follows.
// NOTE(review): there is no end-of-buffer check, so this assumes `pos_` points
// at a well-formed, terminated value — confirm against the callers.
uint32_t RunIterator::current() const {
    uint64_t acc = 0;
    uint32_t shift = 0;
    for (const uint8_t *it = pos_;; ++it) {
        const uint32_t next = *it;
        // Widen before shifting: a 32-bit shift by 32 or more is undefined,
        // and the accumulator is 64 bits wide anyway.
        acc += static_cast<uint64_t>(next & 0x7FU) << shift;
        shift += 7U;
        if ((next & 0x80U) == 0) {
            return prev_ + acc + 1;
        }
    }
}

// Returns a pointer to the byte just past the variable-length value that
// starts at `pos_`, i.e. one past the first byte whose high bit (0x80) is
// clear. Does not modify `pos_`.
uint8_t *RunIterator::nextpos() {
    uint8_t *cursor = pos_;
    // Skip every continuation byte; stop on the terminating one.
    while ((*cursor & 0x80) != 0) {
        ++cursor;
    }
    return cursor + 1;
}

// Throws std::runtime_error when a non-empty run's compression state differs
// from `expected`. Empty runs always pass.
void SortedRun::validate_compression(bool expected) {
    if (empty()) {
        return;
    }
    if (is_compressed() == expected) {
        return;
    }
    throw std::runtime_error("Run was in invalid compression state");
}

std::vector<uint32_t>::iterator SortedRun::begin() {
validate_compression(false);
return sequence_.begin();
}

std::vector<uint32_t>::iterator SortedRun::end() {
validate_compression(false);
return sequence_.end();
}

// Iterator positioned at the first byte of the compressed buffer.
// Throws (via validate_compression) if the run is non-empty and not
// compressed.
RunIterator SortedRun::comp_begin() {
    this->validate_compression(true);
    uint8_t *first_byte = run_.data();
    return RunIterator(first_byte);
}

// Iterator positioned one past the last byte of the compressed buffer.
// Throws (via validate_compression) if the run is non-empty and not
// compressed.
RunIterator SortedRun::comp_end() {
    this->validate_compression(true);
    uint8_t *past_last_byte = run_.data() + run_.size();
    return RunIterator(past_last_byte);
}

// Replaces this run with the sorted union of itself and `other`.
//
// This run is decompressed first. `other` is read through whichever iterator
// pair matches its current state, so it is not decompressed here. Exactly one
// union is performed into a fresh vector, which then becomes this run's
// sequence.
void SortedRun::do_or(SortedRun &other) {
    // In almost every case this is already decompressed.
    decompress();

    std::vector<FileId> new_results;
    if (other.is_compressed()) {
        // Unlikely case, in most cases both runs are already decompressed.
        std::set_union(other.comp_begin(), other.comp_end(), begin(), end(),
                       std::back_inserter(new_results));
    } else {
        std::set_union(other.begin(), other.end(), begin(), end(),
                       std::back_inserter(new_results));
    }
    std::swap(new_results, sequence_);
}

void SortedRun::do_and(const SortedRun &other) {
auto new_end =
std::set_intersection(other.begin(), other.end(), sequence_.begin(),
sequence_.end(), sequence_.begin());
// Shrinks this run to the elements it shares with `other`.
//
// This run is always decompressed first; per the original author's
// benchmarks, handling a compressed `this` made the code slower (assumed to
// be a memory-efficiency effect). `other` is read in whichever form it is
// currently in.
//
// NOTE(review): the intersection is written in place over this run's own
// storage, while the standard asks for a non-overlapping output range —
// confirm this is acceptable on the supported standard libraries.
void SortedRun::do_and(SortedRun &other) {
    decompress();

    const std::vector<uint32_t>::iterator kept_end =
        other.is_compressed()
            ? std::set_intersection(other.comp_begin(), other.comp_end(),
                                    begin(), end(), begin())
            : std::set_intersection(other.begin(), other.end(), begin(),
                                    end(), begin());

    // Drop everything after the last surviving element.
    sequence_.erase(kept_end, sequence_.end());
}

SortedRun SortedRun::pick_common(
int cutoff, const std::vector<const SortedRun *> &sources) {
// Expands the compressed byte buffer into `sequence_` and releases the
// buffer. Does nothing when the buffer is empty, which is treated as
// "already decompressed".
void SortedRun::decompress() {
    if (!run_.empty()) {
        auto *first_byte = run_.data();
        auto *past_last_byte = run_.data() + run_.size();
        sequence_ = read_compressed_run(first_byte, past_last_byte);

        // Swap with a temporary so the buffer's storage is given up, not
        // merely cleared.
        std::vector<uint8_t>().swap(run_);
    }
}

SortedRun SortedRun::pick_common(int cutoff,
std::vector<SortedRun *> &sources) {
// returns all FileIds which appear at least `cutoff` times among provided
// `sources`
using FileIdRange = std::pair<std::vector<FileId>::const_iterator,
Expand All @@ -27,9 +104,9 @@ SortedRun SortedRun::pick_common(
heads.reserve(sources.size());

for (auto source : sources) {
source->decompress();
if (!source->empty()) {
heads.emplace_back(
std::make_pair(source->cbegin(), source->cend()));
heads.emplace_back(std::make_pair(source->begin(), source->end()));
}
}

Expand Down Expand Up @@ -70,3 +147,8 @@ SortedRun SortedRun::pick_common(

return SortedRun(std::move(result));
}

const std::vector<uint32_t> &SortedRun::decompressed() {
decompress();
return sequence_;
}
Loading
Loading