Skip to content

Commit

Permalink
Merge pull request ClickHouse#38667 from ClibMouse/ftsearch
Browse files Browse the repository at this point in the history
Inverted Indices Implementation
  • Loading branch information
rschu1ze authored Jan 20, 2023
2 parents 6aa6341 + 52ae33d commit 5ec6d89
Show file tree
Hide file tree
Showing 28 changed files with 3,288 additions and 13 deletions.
480 changes: 480 additions & 0 deletions src/Common/FST.cpp

Large diffs are not rendered by default.

182 changes: 182 additions & 0 deletions src/Common/FST.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
#pragma once
#include <array>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include <Core/Types.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteBuffer.h>
#include <base/types.h>

namespace DB
{
/// Finite State Transducer is an efficient way to represent term dictionary.
/// It can be viewed as a map of <term, output> where output is an integer.
/// Detailed explanation can be found in the following paper
/// [Direct Construction of Minimal Acyclic Subsequential Transducers] by Stoyan Mihov and Denis Maurel, University of Tours, France
namespace FST
{
using Output = UInt64;

class State;
using StatePtr = std::shared_ptr<State>;

/// Arc represents a transition from one state to another.
/// It includes the target state to which the arc points and the arc's output.
struct Arc
{
/// Creates an arc with no output (0) and a null target.
Arc() = default;

/// Creates an arc that carries `output_` and points to `target_`.
explicit Arc(Output output_, const StatePtr & target_) : output{output_}, target{target_} { }

/// 0 means the arc has no output
Output output = 0;

/// State this arc leads to (shared ownership, see StatePtr).
StatePtr target;

/// Writes the arc to `write_buffer`.
/// Returns a UInt64 - presumably the number of bytes written; confirm against the definition in FST.cpp.
UInt64 serialize(WriteBuffer & write_buffer) const;
};

/// Equality comparison of two arcs. Only declared here; the comparison criteria are in the out-of-line definition.
bool operator==(const Arc & arc1, const Arc & arc2);

/// LabelsAsBitmap implements a 256-bit bitmap for all labels of a state. Each bit represents
/// a label's presence and the index value of the bit represents the corresponding label.
class LabelsAsBitmap
{
public:
/// Marks `label` as present in the bitmap.
void addLabel(char label);

/// Tells whether `label` is present in the bitmap.
bool hasLabel(char label) const;

/// Computes the rank of `label`.
/// NOTE(review): presumably the count of set bits up to the label's bit - confirm the exact
/// (inclusive/exclusive) definition in FST.cpp.
UInt64 getIndex(char label) const;

/// Writes the bitmap to `write_buffer`.
/// Returns a UInt64 - presumably the number of bytes written; confirm in FST.cpp.
UInt64 serialize(WriteBuffer& write_buffer);
private:
/// State and FiniteStateTransducer are granted direct access to `data`.
friend class State;
friend class FiniteStateTransducer;
/// data holds a 256-bit bitmap for all labels of a state. Its 256 bits correspond to 256
/// possible label values. Starts out with no bit set.
UInt256 data{ 0 };
};

/// State implements the State in Finite State Transducer.
/// Each state contains all its arcs and a flag indicating if it is final state.
class State
{
public:
/// NOTE(review): by its name, the arc-count limit related to the Sequential encoding;
/// how it is applied is decided in FST.cpp - confirm there.
static constexpr size_t MAX_ARCS_IN_SEQUENTIAL_METHOD = 32;

/// How the arcs of a state are laid out when the state is serialized.
enum class EncodingMethod
{
/// Serialize arcs sequentially
Sequential = 0,

/// Serialize arcs by using bitmap
/// Note this is NOT enabled for now since it is experimental
Bitmap,
};
State() = default;

/// Member-wise copy. The `arcs` map is copied, but each Arc holds a StatePtr,
/// so target states are shared with the source rather than duplicated.
State(const State & state) = default;

/// Hash of the state.
/// NOTE(review): presumably the key used by FSTBuilder::minimized_states - confirm in FST.cpp.
UInt64 hash() const;

/// Returns a pointer to the arc for `label`.
/// NOTE(review): presumably nullptr when no such arc exists - confirm in FST.cpp.
/// Note that a mutable Arc pointer is handed out from a const method.
Arc * getArc(char label) const;

/// Adds an arc for `label` that carries `output` and leads to `target`.
void addArc(char label, Output output, StatePtr target);

/// Resets the state. Which members are reset is defined in FST.cpp.
void clear();

/// Writes the state to `write_buffer`.
/// Returns a UInt64 - presumably the number of bytes written; confirm in FST.cpp.
UInt64 serialize(WriteBuffer & write_buffer);

/// True when the is_final bit of the flag is set.
bool isFinal() const
{
return flag_values.is_final == 1;
}
/// Stores `value` in the is_final bit of the flag.
void setFinal(bool value)
{
flag_values.is_final = value;
}
/// Returns the encoding method stored in the flag.
EncodingMethod getEncodingMethod() const
{
return flag_values.encoding_method;
}
/// Reads the serialized flag from `read_buffer` straight into `flag` (accessed as a char).
/// Because `flag` shares storage with `flag_values`, this also determines what
/// isFinal() and getEncodingMethod() report afterwards.
void readFlag(ReadBuffer & read_buffer)
{
read_buffer.readStrict(reinterpret_cast<char&>(flag));
}

/// Transient ID of the state which is used for building FST. It won't be serialized
UInt64 id = 0;

/// State index which indicates location of state in FST
UInt64 state_index = 0;

/// Arcs which are started from state, the 'char' is the label on the arc
std::unordered_map<char, Arc> arcs;
private:
/// Bit-field view of the flag: 1 bit for "is final", 3 bits for the encoding method.
struct FlagValues
{
unsigned int is_final : 1;
EncodingMethod encoding_method : 3;
};

/// `flag_values` and `flag` occupy the same storage; `flag` is the raw value that
/// readFlag() fills in, `flag_values` is its structured view. Initialized to 0,
/// i.e. not final and EncodingMethod::Sequential.
/// NOTE(review): writing one union member and reading the other relies on compiler-supported
/// type punning, and the mapping of bit-fields onto the byte is implementation-defined -
/// verify the serialized format on every supported target (e.g. big-endian).
union
{
FlagValues flag_values;
uint8_t flag = 0;
};
};

/// Equality comparison of two states. Only declared here; the comparison criteria are in the out-of-line definition.
bool operator==(const State & state1, const State & state2);

/// Maximum length (in chars) of a term that can be stored in the FST.
/// FSTBuilder sizes its temp_states array as MAX_TERM_LENGTH + 1.
inline constexpr size_t MAX_TERM_LENGTH = 256;

/// FSTBuilder is used to build Finite State Transducer by adding words incrementally.
/// Note that all the words have to be added in sorted order in order to achieve minimized result.
/// In the end, the caller should call build() to serialize minimized FST to WriteBuffer.
class FSTBuilder
{
public:
/// The builder keeps a reference to `write_buffer_`, so the buffer must outlive the builder.
explicit FSTBuilder(WriteBuffer & write_buffer_);

/// Adds `word` with its associated `output`. Words must be added in sorted order (see class comment).
/// The unit test expects a DB::Exception for a word longer than MAX_TERM_LENGTH.
void add(const std::string & word, Output output);

/// Finishes construction and serializes the minimized FST to the write buffer.
/// Returns a UInt64 - NOTE(review): meaning not visible in this header; confirm in FST.cpp.
UInt64 build();
private:
/// Looks for an already minimized state equivalent to `s`; `found` is an out-parameter.
/// NOTE(review): what is returned when nothing is found is defined in FST.cpp - confirm.
StatePtr findMinimized(const State & s, bool & found);

/// Minimizes states belonging to the suffix of the previously added word.
/// NOTE(review): `down_to` presumably is the prefix length at which minimization stops - confirm.
void minimizePreviousWordSuffix(Int64 down_to);

/// Length of the common prefix of `word1` and `word2`.
static size_t getCommonPrefixLength(const String & word1, const String & word2);

/// Working states used while building; MAX_TERM_LENGTH + 1 slots.
/// NOTE(review): presumably one slot per prefix length 0..MAX_TERM_LENGTH - confirm.
std::array<StatePtr, MAX_TERM_LENGTH + 1> temp_states;

/// The word passed to the previous add() call.
String previous_word;

/// Initial (root) state of the FST.
StatePtr initial_state;

/// map of (state_hash, StatePtr)
std::unordered_map<UInt64, StatePtr> minimized_states;

/// Next available ID of state
UInt64 next_id = 1;

/// Destination of the serialized FST; not owned.
WriteBuffer & write_buffer;

/// Bookkeeping for serialization; exact usage is in FST.cpp.
UInt64 previous_written_bytes = 0;
UInt64 previous_state_index = 0;
};

/// FiniteStateTransducer is constructed from a minimized FST blob (which is loaded from index storage).
/// It is used to retrieve the output for a given term.
class FiniteStateTransducer
{
public:
/// Creates a transducer with empty data.
FiniteStateTransducer() = default;

/// Creates a transducer over the serialized FST blob `data_` (taken by value).
explicit FiniteStateTransducer(std::vector<UInt8> data_);

/// Looks up `term` and returns {output, found}.
/// The unit test expects `found == true` with the stored output for indexed terms and
/// `found == false` for terms that were not added; the value of `output` in the latter
/// case is not specified in this header.
std::pair<UInt64, bool> getOutput(const String & term);

/// Clears the transducer. NOTE(review): presumably empties `data` - confirm in FST.cpp.
void clear();

/// Gives mutable access to the underlying serialized blob.
std::vector<UInt8> & getData() { return data; }

private:
/// Serialized FST blob.
std::vector<UInt8> data;
};
}
}
94 changes: 94 additions & 0 deletions src/Common/tests/gtest_fst.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#include <string>
#include <vector>

#include <IO/WriteBufferFromVector.h>
#include <Common/FST.h>
#include <gtest/gtest.h>

/// Builds an FST from a small sorted dictionary and checks that every indexed term maps to its
/// output, while prefixes and extensions of indexed terms are reported as not found.
TEST(FST, SimpleTest)
{
    /// Terms to index together with the output each lookup must return.
    /// FSTBuilder requires words to be added in sorted order, so keep this list sorted.
    const std::vector<std::pair<std::string, DB::FST::Output>> indexed_data
    {
        {"mop", 100},
        {"moth", 91},
        {"pop", 72},
        {"star", 83},
        {"stop", 54},
        {"top", 55},
    };

    /// Proper prefixes and extensions of indexed terms; none of them may be found.
    /// Only the term matters for a negative lookup, so no outputs are stored here.
    const std::vector<std::string> not_indexed_terms
    {
        "mo",
        "moth1",
        "po",
        "star2",
        "sto",
        "top33",
    };

    std::vector<UInt8> buffer;
    DB::WriteBufferFromVector<std::vector<UInt8>> wbuf(buffer);
    DB::FST::FSTBuilder builder(wbuf);

    for (const auto & [term, output] : indexed_data)
        builder.add(term, output);

    builder.build();
    /// finalize() must run before `buffer` is handed to the transducer below.
    wbuf.finalize();

    DB::FST::FiniteStateTransducer fst(buffer);
    for (const auto & [term, output] : indexed_data)
    {
        const auto [result, found] = fst.getOutput(term);
        /// Name the offending term so a failure can be diagnosed without a debugger.
        ASSERT_TRUE(found) << "indexed term not found: " << term;
        ASSERT_EQ(result, output) << "wrong output for term: " << term;
    }

    for (const auto & term : not_indexed_terms)
    {
        const bool found = fst.getOutput(term).second;
        ASSERT_FALSE(found) << "non-indexed term unexpectedly found: " << term;
    }
}

/// Checks the term length limit: terms of length MAX_TERM_LENGTH - 1 and MAX_TERM_LENGTH are
/// indexed and retrievable, while a term of length MAX_TERM_LENGTH + 1 is rejected with an exception.
TEST(FST, TestForLongTerms)
{
    /// Test long terms within limitation
    const std::string below_limit_term(DB::FST::MAX_TERM_LENGTH - 1, 'A');
    const std::string at_limit_term(DB::FST::MAX_TERM_LENGTH, 'B');

    const DB::FST::Output below_limit_output = 100;
    const DB::FST::Output at_limit_output = 200;

    std::vector<UInt8> serialized;
    DB::WriteBufferFromVector<std::vector<UInt8>> out(serialized);
    DB::FST::FSTBuilder fst_builder(out);

    fst_builder.add(below_limit_term, below_limit_output);
    fst_builder.add(at_limit_term, at_limit_output);

    fst_builder.build();
    out.finalize();

    DB::FST::FiniteStateTransducer transducer(serialized);

    /// getOutput() yields {output, found}.
    const auto below_limit_lookup = transducer.getOutput(below_limit_term);
    ASSERT_EQ(below_limit_lookup.second, true);
    ASSERT_EQ(below_limit_lookup.first, below_limit_output);

    const auto at_limit_lookup = transducer.getOutput(at_limit_term);
    ASSERT_EQ(at_limit_lookup.second, true);
    ASSERT_EQ(at_limit_lookup.first, at_limit_output);

    /// Test exception case when term length exceeds limitation
    const std::string over_limit_term(DB::FST::MAX_TERM_LENGTH + 1, 'C');
    const DB::FST::Output over_limit_output = 300;

    std::vector<UInt8> rejected_serialized;
    DB::WriteBufferFromVector<std::vector<UInt8>> rejected_out(rejected_serialized);
    DB::FST::FSTBuilder rejecting_builder(rejected_out);

    EXPECT_THROW(rejecting_builder.add(over_limit_term, over_limit_output), DB::Exception);
}
1 change: 1 addition & 0 deletions src/Core/Settings.h
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
M(UInt64, merge_tree_max_rows_to_use_cache, (128 * 8192), "The maximum number of rows per request, to use the cache of uncompressed data. If the request is large, the cache is not used. (For large queries not to flush out the cache.)", 0) \
M(UInt64, merge_tree_max_bytes_to_use_cache, (192 * 10 * 1024 * 1024), "The maximum number of bytes per request, to use the cache of uncompressed data. If the request is large, the cache is not used. (For large queries not to flush out the cache.)", 0) \
M(Bool, do_not_merge_across_partitions_select_final, false, "Merge parts only in one partition in select final", 0) \
M(Bool, allow_experimental_inverted_index, false, "If it is set to true, allow to use experimental inverted index.", 0) \
\
M(UInt64, mysql_max_rows_to_insert, 65536, "The maximum number of rows in MySQL batch insertion of the MySQL storage engine", 0) \
\
Expand Down
Loading

0 comments on commit 5ec6d89

Please sign in to comment.