forked from ClickHouse/ClickHouse
-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request ClickHouse#38667 from ClibMouse/ftsearch
Inverted Indices Implementation
- Loading branch information
Showing
28 changed files
with
3,288 additions
and
13 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,182 @@ | ||
#pragma once | ||
#include <array> | ||
#include <map> | ||
#include <memory> | ||
#include <string> | ||
#include <unordered_map> | ||
#include <unordered_set> | ||
#include <vector> | ||
#include <Core/Types.h> | ||
#include <IO/ReadHelpers.h> | ||
#include <IO/WriteBuffer.h> | ||
#include <base/types.h> | ||
|
||
namespace DB | ||
{ | ||
/// Finite State Transducer is an efficient way to represent term dictionary. | ||
/// It can be viewed as a map of <term, output> where output is an integer. | ||
/// Detailed explanation can be found in the following paper | ||
/// [Direct Construction of Minimal Acyclic Subsequential Transducers] by Stoyan Mihov and Denis Maurel, University of Tours, France | ||
namespace FST | ||
{ | ||
/// Integer value attached to a term (and to individual arcs) in the FST. | ||
using Output = UInt64; | ||
|
||
/// Forward declaration: Arc refers to State through StatePtr before State is defined. | ||
class State; | ||
/// Shared-ownership handle to a State. | ||
using StatePtr = std::shared_ptr<State>; | ||
|
||
/// Arc represents a transition from one state to another. | ||
/// It holds the target state to which the arc points and the arc's output. | ||
struct Arc | ||
{ | ||
/// Creates an arc with no output (0) and a null target. | ||
Arc() = default; | ||
|
||
/// Creates an arc that carries output_ and points to target_. | ||
explicit Arc(Output output_, const StatePtr & target_) : output{output_}, target{target_} { } | ||
|
||
/// 0 means the arc has no output | ||
Output output = 0; | ||
|
||
/// State this arc leads to; stays null when the arc is default-constructed. | ||
StatePtr target; | ||
|
||
/// Writes this arc to write_buffer. | ||
/// NOTE(review): the UInt64 result is presumably the number of bytes written - confirm against the definition. | ||
UInt64 serialize(WriteBuffer & write_buffer) const; | ||
}; | ||
|
||
/// Equality of two arcs. | ||
/// NOTE(review): presumably compares both output and target; the definition is not in this header - confirm. | ||
bool operator==(const Arc & arc1, const Arc & arc2); | ||
|
||
/// LabelsAsBitmap implements a 256-bit bitmap for all labels of a state. Each bit represents | ||
/// a label's presence and the index value of the bit represents the corresponding label | ||
class LabelsAsBitmap | ||
{ | ||
public: | ||
/// Records label in the bitmap. | ||
void addLabel(char label); | ||
/// Tells whether label has been recorded in the bitmap. | ||
bool hasLabel(char label) const; | ||
|
||
/// computes the rank | ||
/// NOTE(review): presumably the count of set bits up to label's bit, i.e. the arc's position | ||
/// among the state's arcs - confirm against the definition. | ||
UInt64 getIndex(char label) const; | ||
|
||
/// Writes the bitmap to write_buffer. | ||
/// NOTE(review): the UInt64 result is presumably the number of bytes written - confirm. | ||
UInt64 serialize(WriteBuffer& write_buffer); | ||
private: | ||
/// State and FiniteStateTransducer are friends, so they can access data directly. | ||
friend class State; | ||
friend class FiniteStateTransducer; | ||
/// data holds a 256-bit bitmap for all labels of a state. Its 256 bits correspond to 256 | ||
/// possible label values. | ||
UInt256 data{ 0 }; | ||
}; | ||
|
||
/// State implements the State in Finite State Transducer | ||
/// Each state contains all its arcs and a flag indicating if it is final state | ||
class State | ||
{ | ||
public: | ||
/// NOTE(review): presumably the largest arc count for which the Sequential encoding is chosen - confirm. | ||
static constexpr size_t MAX_ARCS_IN_SEQUENTIAL_METHOD = 32; | ||
/// How the arcs of a state are laid out when the state is serialized. | ||
enum class EncodingMethod | ||
{ | ||
/// Serialize arcs sequentially | ||
Sequential = 0, | ||
|
||
/// Serialize arcs by using bitmap | ||
/// Note this is NOT enabled for now since it is experimental | ||
Bitmap, | ||
}; | ||
/// Creates a state with id 0, state_index 0, no arcs and a zero flag byte (not final, Sequential). | ||
State() = default; | ||
|
||
/// Member-wise copy: the arcs map is copied, and the copied arcs share their target states. | ||
State(const State & state) = default; | ||
|
||
/// Hash of the state. | ||
/// NOTE(review): presumably the key used in FSTBuilder::minimized_states - confirm. | ||
UInt64 hash() const; | ||
|
||
/// Looks up the arc for label. | ||
/// NOTE(review): presumably returns nullptr when the state has no such arc - confirm. | ||
Arc * getArc(char label) const; | ||
|
||
/// Adds an arc labelled label that carries output and leads to target. | ||
void addArc(char label, Output output, StatePtr target); | ||
|
||
/// Resets the state. | ||
/// NOTE(review): which members are reset is defined out of line - confirm. | ||
void clear(); | ||
|
||
/// Writes the state to write_buffer. | ||
/// NOTE(review): the UInt64 result is presumably the number of bytes written - confirm. | ||
UInt64 serialize(WriteBuffer & write_buffer); | ||
|
||
/// True when the is_final bit of the flag is set. | ||
bool isFinal() const | ||
{ | ||
return flag_values.is_final == 1; | ||
} | ||
/// Stores value in the 1-bit is_final field of the flag. | ||
void setFinal(bool value) | ||
{ | ||
flag_values.is_final = value; | ||
} | ||
/// Returns the encoding method held in the 3-bit encoding_method field of the flag. | ||
EncodingMethod getEncodingMethod() const | ||
{ | ||
return flag_values.encoding_method; | ||
} | ||
/// Reads the flag from read_buffer into the raw flag byte; because of the union below this | ||
/// also determines what isFinal() and getEncodingMethod() report. | ||
void readFlag(ReadBuffer & read_buffer) | ||
{ | ||
read_buffer.readStrict(reinterpret_cast<char&>(flag)); | ||
} | ||
|
||
/// Transient ID of the state which is used for building FST. It won't be serialized | ||
UInt64 id = 0; | ||
|
||
/// State index which indicates location of state in FST | ||
UInt64 state_index = 0; | ||
|
||
/// Arcs which are started from state, the 'char' is the label on the arc | ||
std::unordered_map<char, Arc> arcs; | ||
private: | ||
/// Bit-field view of the flag: 1 bit for finality, 3 bits for the encoding method. | ||
struct FlagValues | ||
{ | ||
unsigned int is_final : 1; | ||
EncodingMethod encoding_method : 3; | ||
}; | ||
|
||
/// flag is the raw byte form of the flag, flag_values is the structured view of the same storage. | ||
/// NOTE(review): writing one union member and reading the other, and the placement of bit-fields | ||
/// inside FlagValues, depend on compiler/ABI behaviour - confirm for all supported targets. | ||
union | ||
{ | ||
FlagValues flag_values; | ||
uint8_t flag = 0; | ||
}; | ||
}; | ||
|
||
/// Equality of two states. | ||
/// NOTE(review): presumably used to tell apart states whose hash() values collide - confirm. | ||
bool operator==(const State & state1, const State & state2); | ||
|
||
/// Upper bound on term length; FSTBuilder sizes temp_states as MAX_TERM_LENGTH + 1. | ||
/// NOTE(review): the accompanying unit test expects FSTBuilder::add to throw DB::Exception for longer terms. | ||
inline constexpr size_t MAX_TERM_LENGTH = 256; | ||
|
||
/// FSTBuilder is used to build Finite State Transducer by adding words incrementally. | ||
/// Note that all the words have to be added in sorted order in order to achieve minimized result. | ||
/// In the end, the caller should call build() to serialize minimized FST to WriteBuffer | ||
class FSTBuilder | ||
{ | ||
public: | ||
/// The builder keeps a reference to a WriteBuffer (see write_buffer), so that buffer must outlive the builder. | ||
explicit FSTBuilder(WriteBuffer & write_buffer_); | ||
|
||
/// Adds word together with the output that a later lookup of word should return. | ||
/// Words must arrive in sorted order (see the class comment). | ||
/// NOTE(review): the accompanying unit test expects DB::Exception when word is longer than MAX_TERM_LENGTH. | ||
void add(const std::string & word, Output output); | ||
/// Finishes construction and serializes the minimized FST to the WriteBuffer. | ||
/// NOTE(review): the meaning of the UInt64 result is defined out of line - confirm. | ||
UInt64 build(); | ||
private: | ||
/// Looks for a state equivalent to s among the already minimized states; found is an output flag. | ||
/// NOTE(review): the returned pointer when found is false is defined out of line - confirm. | ||
StatePtr findMinimized(const State & s, bool & found); | ||
/// NOTE(review): presumably minimizes the states of previous_word from its end down to position down_to - confirm. | ||
void minimizePreviousWordSuffix(Int64 down_to); | ||
/// Length of the common prefix of word1 and word2. | ||
static size_t getCommonPrefixLength(const String & word1, const String & word2); | ||
|
||
/// One slot per character position of a term, plus one. | ||
/// NOTE(review): presumably the not-yet-minimized states along the path of the word being added - confirm. | ||
std::array<StatePtr, MAX_TERM_LENGTH + 1> temp_states; | ||
/// NOTE(review): presumably the word passed to the most recent add() call - confirm. | ||
String previous_word; | ||
/// NOTE(review): presumably the start state of the FST under construction - confirm. | ||
StatePtr initial_state; | ||
|
||
/// map of (state_hash, StatePtr) | ||
std::unordered_map<UInt64, StatePtr> minimized_states; | ||
|
||
/// Next available ID of state | ||
UInt64 next_id = 1; | ||
|
||
/// Destination of the serialized FST; held by reference, not owned. | ||
WriteBuffer & write_buffer; | ||
/// NOTE(review): serialization bookkeeping; exact meaning of the two counters below is defined out of line - confirm. | ||
UInt64 previous_written_bytes = 0; | ||
UInt64 previous_state_index = 0; | ||
}; | ||
|
||
/// FiniteStateTransducer is constructed from a minimized FST blob (which is loaded from index storage). | ||
/// It is used to retrieve the output for a given term. | ||
class FiniteStateTransducer | ||
{ | ||
public: | ||
/// Creates a transducer with an empty blob. | ||
FiniteStateTransducer() = default; | ||
/// Creates a transducer from a serialized FST blob, taken by value. | ||
explicit FiniteStateTransducer(std::vector<UInt8> data_); | ||
/// Looks up term. In the returned pair, second tells whether the term was found and first is its output | ||
/// (the accompanying unit test reads the pair as [result, found]). | ||
/// NOTE(review): the value of first when the term is not found is defined out of line - confirm. | ||
std::pair<UInt64, bool> getOutput(const String & term); | ||
/// NOTE(review): presumably empties the blob - confirm against the definition. | ||
void clear(); | ||
/// Mutable access to the underlying blob. | ||
std::vector<UInt8> & getData() { return data; } | ||
|
||
private: | ||
/// Serialized FST blob that lookups run against. | ||
std::vector<UInt8> data; | ||
}; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
#include <string> | ||
#include <vector> | ||
|
||
#include <IO/WriteBufferFromVector.h> | ||
#include <Common/FST.h> | ||
#include <gtest/gtest.h> | ||
|
||
/// Builds an FST from a handful of short terms and checks that every stored term is found with | ||
/// its output, while near-miss terms (prefixes and extensions of stored terms) are not found. | ||
TEST(FST, SimpleTest) | ||
{ | ||
using TermOutputs = std::vector<std::pair<std::string, DB::FST::Output>>; | ||
|
||
/// Terms put into the FST, listed in sorted order, each with the output expected on lookup. | ||
TermOutputs stored_terms | ||
{ | ||
{"mop", 100}, | ||
{"moth", 91}, | ||
{"pop", 72}, | ||
{"star", 83}, | ||
{"stop", 54}, | ||
{"top", 55}, | ||
}; | ||
|
||
/// Terms that are never added; only their keys matter for the lookups below. | ||
TermOutputs missing_terms | ||
{ | ||
{"mo", 100}, | ||
{"moth1", 91}, | ||
{"po", 72}, | ||
{"star2", 83}, | ||
{"sto", 54}, | ||
{"top33", 55}, | ||
}; | ||
|
||
std::vector<UInt8> fst_blob; | ||
DB::WriteBufferFromVector<std::vector<UInt8>> blob_writer(fst_blob); | ||
DB::FST::FSTBuilder fst_builder(blob_writer); | ||
|
||
/// Feed all stored terms, then serialize the FST into fst_blob. | ||
for (auto & entry : stored_terms) | ||
{ | ||
fst_builder.add(entry.first, entry.second); | ||
} | ||
fst_builder.build(); | ||
blob_writer.finalize(); | ||
|
||
DB::FST::FiniteStateTransducer transducer(fst_blob); | ||
|
||
/// Every stored term must be found and must yield its own output. | ||
for (auto & entry : stored_terms) | ||
{ | ||
auto lookup = transducer.getOutput(entry.first); | ||
ASSERT_EQ(lookup.second, true); | ||
ASSERT_EQ(lookup.first, entry.second); | ||
} | ||
|
||
/// None of the terms that were left out may be found. | ||
for (auto & entry : missing_terms) | ||
{ | ||
auto lookup = transducer.getOutput(entry.first); | ||
ASSERT_EQ(lookup.second, false); | ||
} | ||
} | ||
|
||
/// Checks terms around MAX_TERM_LENGTH: one char below the limit and exactly at the limit must | ||
/// round-trip through the FST, one char above the limit must make add() throw. | ||
TEST(FST, TestForLongTerms) | ||
{ | ||
/// Terms that are still within the limit | ||
std::string below_limit_term(DB::FST::MAX_TERM_LENGTH - 1, 'A'); | ||
std::string at_limit_term(DB::FST::MAX_TERM_LENGTH, 'B'); | ||
|
||
DB::FST::Output below_limit_output = 100; | ||
DB::FST::Output at_limit_output = 200; | ||
|
||
std::vector<UInt8> fst_blob; | ||
DB::WriteBufferFromVector<std::vector<UInt8>> blob_writer(fst_blob); | ||
DB::FST::FSTBuilder fst_builder(blob_writer); | ||
|
||
fst_builder.add(below_limit_term, below_limit_output); | ||
fst_builder.add(at_limit_term, at_limit_output); | ||
|
||
fst_builder.build(); | ||
blob_writer.finalize(); | ||
|
||
DB::FST::FiniteStateTransducer transducer(fst_blob); | ||
|
||
auto below_limit_lookup = transducer.getOutput(below_limit_term); | ||
ASSERT_EQ(below_limit_lookup.second, true); | ||
ASSERT_EQ(below_limit_lookup.first, below_limit_output); | ||
|
||
auto at_limit_lookup = transducer.getOutput(at_limit_term); | ||
ASSERT_EQ(at_limit_lookup.second, true); | ||
ASSERT_EQ(at_limit_lookup.first, at_limit_output); | ||
|
||
/// A term one char over the limit, added to a fresh builder, must be rejected with DB::Exception | ||
std::string over_limit_term(DB::FST::MAX_TERM_LENGTH + 1, 'C'); | ||
DB::FST::Output over_limit_output = 300; | ||
|
||
std::vector<UInt8> rejected_blob; | ||
DB::WriteBufferFromVector<std::vector<UInt8>> rejected_writer(rejected_blob); | ||
DB::FST::FSTBuilder rejecting_builder(rejected_writer); | ||
|
||
EXPECT_THROW(rejecting_builder.add(over_limit_term, over_limit_output), DB::Exception); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.