Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add Reserve for column. Optimize large block insertion #341

Merged
merged 3 commits into from
Nov 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions clickhouse/columns/array.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,11 @@ ColumnRef ColumnArray::CloneEmpty() const {
return std::make_shared<ColumnArray>(data_->CloneEmpty());
}

/// Reserves capacity ahead of a large block insertion.
/// The requested capacity is forwarded unchanged to both the nested data
/// column and the offsets column.
void ColumnArray::Reserve(size_t new_cap) {
    data_->Reserve(new_cap);
    offsets_->Reserve(new_cap);
}

void ColumnArray::Append(ColumnRef column) {
if (auto col = column->As<ColumnArray>()) {
for (size_t i = 0; i < col->Size(); ++i) {
Expand Down
3 changes: 3 additions & 0 deletions clickhouse/columns/array.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ class ColumnArray : public Column {
}

public:
/// Increase the capacity of the column for large block insertion.
void Reserve(size_t new_cap) override;

/// Appends content of given column to the end of current one.
void Append(ColumnRef column) override;

Expand Down
3 changes: 3 additions & 0 deletions clickhouse/columns/column.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@ class Column : public std::enable_shared_from_this<Column> {
/// Appends content of given column to the end of current one.
virtual void Append(ColumnRef column) = 0;

/// Increase the capacity of the column for large block insertion.
virtual void Reserve(size_t new_cap) = 0;
1261385937 marked this conversation as resolved.
Show resolved Hide resolved

/// Template method to load column data from input stream. It'll call LoadPrefix and LoadBody.
/// Should be called only once from the client. Derived classes should not call it.
bool Load(InputStream* input, size_t rows);
Expand Down
5 changes: 5 additions & 0 deletions clickhouse/columns/date.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,11 @@ std::string ColumnDateTime64::Timezone() const {
return type_->As<DateTime64Type>()->Timezone();
}

/// Reserves capacity ahead of a large block insertion by delegating to the
/// wrapped data column.
void ColumnDateTime64::Reserve(size_t new_cap) {
    data_->Reserve(new_cap);
}

void ColumnDateTime64::Append(ColumnRef column) {
if (auto col = column->As<ColumnDateTime64>()) {
data_->Append(col->data_);
Expand Down
26 changes: 15 additions & 11 deletions clickhouse/columns/date.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@ class ColumnDate : public Column {
/// Get Raw Vector Contents
std::vector<uint16_t>& GetWritableData();

/// Increase the capacity of the column
void Reserve(size_t new_cap);
/// Increase the capacity of the column for large block insertion.
void Reserve(size_t new_cap) override;

/// Returns the capacity of the column
size_t Capacity() const;
Expand Down Expand Up @@ -79,9 +79,6 @@ class ColumnDate32 : public Column {
/// The implementation is fundamentally wrong, ignores timezones, leap years and daylight saving.
std::time_t At(size_t n) const;

/// Appends content of given column to the end of current one.
void Append(ColumnRef column) override;

inline std::time_t operator [] (size_t n) const { return At(n); }

/// Do append data as is -- number of day in Unix epoch (32bit signed), no conversions performed.
Expand All @@ -91,12 +88,16 @@ class ColumnDate32 : public Column {
/// Get Raw Vector Contents
std::vector<int32_t>& GetWritableData();

/// Increase the capacity of the column
void Reserve(size_t new_cap);

/// Returns the capacity of the column
size_t Capacity() const;

public:
/// Increase the capacity of the column for large block insertion.
void Reserve(size_t new_cap) override;

/// Appends content of given column to the end of current one.
void Append(ColumnRef column) override;

/// Loads column data from input stream.
bool LoadBody(InputStream* input, size_t rows) override;

Expand Down Expand Up @@ -148,13 +149,13 @@ class ColumnDateTime : public Column {
/// Get Raw Vector Contents
std::vector<uint32_t>& GetWritableData();

/// Increase the capacity of the column
void Reserve(size_t new_cap);

/// Returns the capacity of the column
size_t Capacity() const;

public:
/// Increase the capacity of the column for large block insertion.
void Reserve(size_t new_cap) override;

/// Appends content of given column to the end of current one.
void Append(ColumnRef column) override;

Expand Down Expand Up @@ -205,6 +206,9 @@ class ColumnDateTime64 : public Column {
std::string Timezone() const;

public:
/// Increase the capacity of the column for large block insertion.
void Reserve(size_t new_cap) override;

/// Appends content of given column to the end of current one.
void Append(ColumnRef column) override;

Expand Down
4 changes: 4 additions & 0 deletions clickhouse/columns/decimal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,10 @@ Int128 ColumnDecimal::At(size_t i) const {
}
}

/// Reserves capacity ahead of a large block insertion by delegating to the
/// wrapped data column.
void ColumnDecimal::Reserve(size_t new_cap) {
    data_->Reserve(new_cap);
}

void ColumnDecimal::Append(ColumnRef column) {
if (auto col = column->As<ColumnDecimal>()) {
data_->Append(col->data_);
Expand Down
2 changes: 2 additions & 0 deletions clickhouse/columns/decimal.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ class ColumnDecimal : public Column {
inline auto operator[](size_t i) const { return At(i); }

public:
/// Increase the capacity of the column for large block insertion.
void Reserve(size_t new_cap) override;
void Append(ColumnRef column) override;
bool LoadBody(InputStream* input, size_t rows) override;
void SaveBody(OutputStream* output) override;
Expand Down
5 changes: 5 additions & 0 deletions clickhouse/columns/enum.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,11 @@ void ColumnEnum<T>::SetNameAt(size_t n, const std::string& name) {
data_.at(n) = static_cast<T>(type_->As<EnumType>()->GetEnumValue(name));
}

/// Reserves capacity for at least `new_cap` enum values in the underlying
/// value storage, ahead of a large block insertion.
template <typename T>
void ColumnEnum<T>::Reserve(size_t new_cap) {
    data_.reserve(new_cap);
}

template <typename T>
void ColumnEnum<T>::Append(ColumnRef column) {
if (auto col = column->As<ColumnEnum<T>>()) {
Expand Down
3 changes: 3 additions & 0 deletions clickhouse/columns/enum.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ class ColumnEnum : public Column {
void SetNameAt(size_t n, const std::string& name);

public:
/// Increase the capacity of the column for large block insertion.
void Reserve(size_t new_cap) override;

/// Appends content of given column to the end of current one.
void Append(ColumnRef column) override;

Expand Down
5 changes: 5 additions & 0 deletions clickhouse/columns/geo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,11 @@ const typename ColumnGeo<NestedColumnType, type_code>::ValueType ColumnGeo<Neste
return data_->At(n);
}

/// Reserves capacity ahead of a large block insertion by delegating to the
/// nested column that holds the actual values.
template <typename NestedColumnType, Type::Code type_code>
void ColumnGeo<NestedColumnType, type_code>::Reserve(size_t new_cap) {
    data_->Reserve(new_cap);
}

template <typename NestedColumnType, Type::Code type_code>
void ColumnGeo<NestedColumnType, type_code>::Append(ColumnRef column) {
if (auto col = column->template As<ColumnGeo>()) {
Expand Down
3 changes: 3 additions & 0 deletions clickhouse/columns/geo.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ class ColumnGeo : public Column {
inline const ValueType operator[](size_t n) const { return At(n); }

public:
/// Increase the capacity of the column for large block insertion.
void Reserve(size_t new_cap) override;

/// Appends content of given column to the end of current one.
void Append(ColumnRef column) override;

Expand Down
4 changes: 4 additions & 0 deletions clickhouse/columns/ip4.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,10 @@ std::string ColumnIPv4::AsString(size_t n) const {
return ip_str;
}

/// Reserves capacity ahead of a large block insertion by delegating to the
/// wrapped data column.
void ColumnIPv4::Reserve(size_t new_cap) {
    data_->Reserve(new_cap);
}

void ColumnIPv4::Append(ColumnRef column) {
if (auto col = column->As<ColumnIPv4>()) {
data_->Append(col->data_);
Expand Down
3 changes: 3 additions & 0 deletions clickhouse/columns/ip4.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ class ColumnIPv4 : public Column {
std::string AsString(size_t n) const;

public:
/// Increase the capacity of the column for large block insertion.
void Reserve(size_t new_cap) override;

/// Appends content of given column to the end of current one.
void Append(ColumnRef column) override;

Expand Down
4 changes: 4 additions & 0 deletions clickhouse/columns/ip6.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,10 @@ in6_addr ColumnIPv6::operator [] (size_t n) const {
return *reinterpret_cast<const in6_addr*>(data_->At(n).data());
}

/// Reserves capacity ahead of a large block insertion by delegating to the
/// wrapped data column.
void ColumnIPv6::Reserve(size_t new_cap) {
    data_->Reserve(new_cap);
}

void ColumnIPv6::Append(ColumnRef column) {
if (auto col = column->As<ColumnIPv6>()) {
data_->Append(col->data_);
Expand Down
3 changes: 3 additions & 0 deletions clickhouse/columns/ip6.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@ class ColumnIPv6 : public Column {
std::string AsString(size_t n) const;

public:
/// Increase the capacity of the column for large block insertion.
void Reserve(size_t new_cap) override;

/// Appends content of given column to the end of current one.
void Append(ColumnRef column) override;

Expand Down
5 changes: 5 additions & 0 deletions clickhouse/columns/lowcardinality.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,11 @@ ColumnLowCardinality::ColumnLowCardinality(std::shared_ptr<ColumnNullable> dicti
/// Destructor is defined out of line (it is only declared in the header).
/// Defaulting it here keeps the definition in this translation unit while
/// letting the compiler generate the body.
ColumnLowCardinality::~ColumnLowCardinality() = default;

void ColumnLowCardinality::Reserve(size_t new_cap) {
dictionary_column_->Reserve(new_cap);
Copy link
Collaborator

@Enmk Enmk Oct 31, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here we assume that ALL of the new items are unique, which is quite an uncommon and quite sub-optimal case for LowCardinality.

Maybe estimate dict size as ln(new_cap) * sqrt(new_cap) + new_cap / 5, this way we'll get:

new_cap dict's new_cap estimated % of unique values
10 10 100.0
20 18 90.0
40 32 80.0
60 44 73.33
80 56 70.0
100 67 67.0
200 115 57.5
400 200 50.0
600 277 46.17
800 350 43.75
1000 419 41.9
2000 740 37.0
4000 1325 33.12
6000 1874 31.23
8000 2404 30.05
10000 2922 29.22
20000 5401 27.01
40000 10120 25.3
60000 14695 24.49
80000 19194 23.99
100000 23641 23.64
1000000 213816 21.38
10000000 2050970 20.51

So it converges to 20% of unique items (i.e. items in the dictionary column) for huge columns, but tolerates a high number of unique values for small columns.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, I do not understand the formula, so could you implement ColumnLowCardinality::Reserve? 🌹

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, will do

index_column_->Reserve(new_cap);
}

void ColumnLowCardinality::Setup(ColumnRef dictionary_column) {
AppendDefaultItem();

Expand Down
3 changes: 3 additions & 0 deletions clickhouse/columns/lowcardinality.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,9 @@ class ColumnLowCardinality : public Column {

~ColumnLowCardinality();

/// Increase the capacity of the column for large block insertion.
void Reserve(size_t new_cap) override;

/// Appends another LowCardinality column to the end of this one, updating dictionary.
void Append(ColumnRef /*column*/) override;

Expand Down
4 changes: 4 additions & 0 deletions clickhouse/columns/map.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,10 @@ ColumnMap::ColumnMap(ColumnRef data)
: Column(GetMapType(data->GetType())), data_(data->As<ColumnArray>()) {
}

/// Reserves capacity ahead of a large block insertion by delegating to the
/// underlying array column.
void ColumnMap::Reserve(size_t new_cap) {
    data_->Reserve(new_cap);
}

/// Clears the map column by clearing the underlying array column.
void ColumnMap::Clear() {
    data_->Clear();
}
Expand Down
3 changes: 3 additions & 0 deletions clickhouse/columns/map.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ class ColumnMap : public Column {
*/
explicit ColumnMap(ColumnRef data);

/// Increase the capacity of the column for large block insertion.
void Reserve(size_t new_cap) override;

/// Appends content of given column to the end of current one.
void Append(ColumnRef column) override;

Expand Down
3 changes: 3 additions & 0 deletions clickhouse/columns/nothing.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ class ColumnNothing : public Column {
{
}

/// Capacity hint for large block insertion. Intentionally a no-op for this
/// column: Append() only increments size_, so there is nothing to pre-allocate.
void Reserve(size_t) override {}

/// Appends one element to the column.
/// The argument is ignored; only size_ is incremented.
void Append(std::unique_ptr<void*>) {
    ++size_;
}

Expand Down
5 changes: 5 additions & 0 deletions clickhouse/columns/nullable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ ColumnRef ColumnNullable::Nulls() const
return nulls_;
}

/// Reserves capacity ahead of a large block insertion.
/// The same capacity is requested from the nested value column and from the
/// nulls column.
void ColumnNullable::Reserve(size_t new_cap) {
    nested_->Reserve(new_cap);
    nulls_->Reserve(new_cap);
}

void ColumnNullable::Append(ColumnRef column) {
if (auto col = column->As<ColumnNullable>()) {
if (!col->nested_->Type()->IsEqual(nested_->Type())) {
Expand Down
3 changes: 3 additions & 0 deletions clickhouse/columns/nullable.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ class ColumnNullable : public Column {
ColumnRef Nulls() const;

public:
/// Increase the capacity of the column for large block insertion.
void Reserve(size_t new_cap) override;

/// Appends content of given column to the end of current one.
void Append(ColumnRef column) override;

Expand Down
6 changes: 3 additions & 3 deletions clickhouse/columns/numeric.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ class ColumnVector : public Column {
explicit ColumnVector(const std::vector<T>& data);
explicit ColumnVector(std::vector<T> && data);

/// Increase the capacity of the column for large block insertion.
void Reserve(size_t new_cap) override;

/// Appends one element to the end of column.
void Append(const T& value);

Expand All @@ -33,9 +36,6 @@ class ColumnVector : public Column {
/// Get Raw Vector Contents
std::vector<T>& GetWritableData();

/// Increase the capacity of the column
void Reserve(size_t new_cap);

/// Returns the capacity of the column
size_t Capacity() const;

Expand Down
20 changes: 16 additions & 4 deletions clickhouse/columns/string.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@ ColumnFixedString::ColumnFixedString(size_t n)
{
}

/// Reserves storage for `new_cap` fixed-size strings ahead of a large block
/// insertion.
void ColumnFixedString::Reserve(size_t new_cap) {
    // The buffer capacity is requested in bytes: string_size_ bytes per element.
    const size_t bytes_needed = string_size_ * new_cap;
    data_.reserve(bytes_needed);
}

void ColumnFixedString::Append(std::string_view str) {
if (str.size() > string_size_) {
throw ValidationError("Expected string of length not greater than "
Expand All @@ -45,8 +49,10 @@ void ColumnFixedString::Append(std::string_view str) {

data_.insert(data_.size(), str);
// Pad up to string_size_ with zeroes.
const auto padding_size = string_size_ - str.size();
data_.resize(data_.size() + padding_size, char(0));
if (str.size() < string_size_) {
const auto padding_size = string_size_ - str.size();
data_.resize(data_.size() + padding_size, char(0));
}
}

void ColumnFixedString::Clear() {
Expand Down Expand Up @@ -160,8 +166,8 @@ ColumnString::ColumnString(size_t element_count)
: Column(Type::CreateString())
{
items_.reserve(element_count);
// 100 is arbitrary number, assumption that string values are about ~40 bytes long.
blocks_.reserve(std::max<size_t>(1, element_count / 100));
// 16 is arbitrary number, assumption that string values are about ~256 bytes long.
blocks_.reserve(std::max<size_t>(1, element_count / 16));
}

ColumnString::ColumnString(const std::vector<std::string>& data)
Expand Down Expand Up @@ -190,6 +196,12 @@ ColumnString::ColumnString(std::vector<std::string>&& data)
/// Destructor is defined out of line; defaulting it here keeps the definition
/// in this translation unit while letting the compiler generate the body.
ColumnString::~ColumnString() = default;

/// Reserves capacity ahead of a large block insertion: room for `new_cap`
/// entries in items_, plus an estimated number of entries in blocks_.
void ColumnString::Reserve(size_t new_cap) {
    // 16 is an arbitrary number, based on the assumption that string values are about ~256 bytes long.
    const size_t estimated_block_count = std::max<size_t>(1, new_cap / 16);

    items_.reserve(new_cap);
    blocks_.reserve(estimated_block_count);
}

void ColumnString::Append(std::string_view str) {
if (blocks_.size() == 0 || blocks_.back().GetAvailable() < str.length()) {
blocks_.emplace_back(std::max(DEFAULT_BLOCK_SIZE, str.size()));
Expand Down
6 changes: 6 additions & 0 deletions clickhouse/columns/string.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ class ColumnFixedString : public Column {
Append(v);
}

/// Increase the capacity of the column for large block insertion.
void Reserve(size_t) override;

/// Appends one element to the column.
void Append(std::string_view str);

Expand Down Expand Up @@ -84,6 +87,9 @@ class ColumnString : public Column {
ColumnString& operator=(const ColumnString&) = delete;
ColumnString(const ColumnString&) = delete;

/// Increase the capacity of the column for large block insertion.
void Reserve(size_t new_cap) override;

/// Appends one element to the column.
void Append(std::string_view str);

Expand Down
6 changes: 6 additions & 0 deletions clickhouse/columns/tuple.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@ size_t ColumnTuple::TupleSize() const {
return columns_.size();
}

void ColumnTuple::Reserve(size_t new_cap) {
for (auto& column : columns_) {
column->Reserve(new_cap);
}
}

void ColumnTuple::Append(ColumnRef column) {
if (!this->Type()->IsEqual(column->Type())) {
throw ValidationError(
Expand Down
Loading
Loading