Skip to content

Commit

Permalink
[Feature] Support Binary Data Type (StarRocks#21075) (StarRocks#33044)
Browse files Browse the repository at this point in the history
Signed-off-by: shuming.li <[email protected]>
  • Loading branch information
LiShuMing authored Oct 19, 2023
1 parent 4d0aef9 commit 77821ab
Show file tree
Hide file tree
Showing 19 changed files with 492 additions and 36 deletions.
3 changes: 2 additions & 1 deletion be/src/formats/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ set(LIBRARY_OUTPUT_PATH "${BUILD_DIR}/src/formats")
add_library(Formats STATIC
csv/array_converter.cpp
csv/array_reader.cpp
csv/binary_converter.cpp
csv/boolean_converter.cpp
csv/converter.cpp
csv/csv_reader.cpp
Expand All @@ -30,6 +29,8 @@ add_library(Formats STATIC
csv/numeric_converter.cpp
csv/nullable_converter.cpp
csv/default_value_converter.cpp
csv/string_converter.cpp
csv/varbinary_converter.cpp
json/nullable_column.cpp
json/numeric_column.cpp
json/binary_column.cpp
Expand Down
7 changes: 5 additions & 2 deletions be/src/formats/csv/converter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
#include "formats/csv/converter.h"

#include "formats/csv/array_converter.h"
#include "formats/csv/binary_converter.h"
#include "formats/csv/boolean_converter.h"
#include "formats/csv/date_converter.h"
#include "formats/csv/datetime_converter.h"
Expand All @@ -26,6 +25,8 @@
#include "formats/csv/json_converter.h"
#include "formats/csv/nullable_converter.h"
#include "formats/csv/numeric_converter.h"
#include "formats/csv/string_converter.h"
#include "formats/csv/varbinary_converter.h"
#include "runtime/types.h"

namespace starrocks::csv {
Expand All @@ -52,7 +53,7 @@ static std::unique_ptr<Converter> create_converter(const TypeDescriptor& t, cons
return std::make_unique<DecimalV2Converter>();
case TYPE_CHAR:
case TYPE_VARCHAR:
return std::make_unique<BinaryConverter>();
return std::make_unique<StringConverter>();
case TYPE_DATE:
return std::make_unique<DateConverter>();
case TYPE_DATETIME:
Expand All @@ -75,6 +76,8 @@ static std::unique_ptr<Converter> create_converter(const TypeDescriptor& t, cons
return std::make_unique<DecimalV3Converter<int128_t>>(t.precision, t.scale);
case TYPE_JSON:
return std::make_unique<JsonConverter>();
case TYPE_VARBINARY:
return std::make_unique<VarBinaryConverter>();
default:
break;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "formats/csv/binary_converter.h"
#include "formats/csv/string_converter.h"

#include "column/binary_column.h"
#include "gutil/strings/substitute.h"
Expand All @@ -21,7 +21,7 @@

namespace starrocks::csv {

Status BinaryConverter::write_string(OutputStream* os, const Column& column, size_t row_num,
Status StringConverter::write_string(OutputStream* os, const Column& column, size_t row_num,
const Options& options) const {
auto* binary = down_cast<const BinaryColumn*>(&column);
auto& bytes = binary->get_bytes();
Expand All @@ -32,7 +32,7 @@ Status BinaryConverter::write_string(OutputStream* os, const Column& column, siz
return os->write(s);
}

Status BinaryConverter::write_quoted_string(OutputStream* os, const Column& column, size_t row_num,
Status StringConverter::write_quoted_string(OutputStream* os, const Column& column, size_t row_num,
const Options& options) const {
auto* binary = down_cast<const BinaryColumn*>(&column);
auto& bytes = binary->get_bytes();
Expand All @@ -53,7 +53,7 @@ Status BinaryConverter::write_quoted_string(OutputStream* os, const Column& colu
return os->write('"');
}

bool BinaryConverter::read_string(Column* column, const Slice& s, const Options& options) const {
bool StringConverter::read_string(Column* column, const Slice& s, const Options& options) const {
int max_size = 0;
if (options.type_desc != nullptr) {
max_size = options.type_desc->len;
Expand All @@ -68,7 +68,7 @@ bool BinaryConverter::read_string(Column* column, const Slice& s, const Options&
return true;
}

bool BinaryConverter::read_quoted_string(Column* column, const Slice& tmp_s, const Options& options) const {
bool StringConverter::read_quoted_string(Column* column, const Slice& tmp_s, const Options& options) const {
Slice s = tmp_s;
if (!remove_enclosing_quotes<'"'>(&s)) {
return false;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

namespace starrocks::csv {

class BinaryConverter final : public Converter {
class StringConverter final : public Converter {
public:
Status write_string(OutputStream* os, const Column& column, size_t row_num, const Options& options) const override;
Status write_quoted_string(OutputStream* os, const Column& column, size_t row_num,
Expand Down
105 changes: 105 additions & 0 deletions be/src/formats/csv/varbinary_converter.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "formats/csv/varbinary_converter.h"

#include <iomanip>
#include <iostream>
#include <sstream>
#include <string>

#include "column/binary_column.h"
#include "gutil/strings/escaping.h"
#include "runtime/descriptors.h"
#include "runtime/types.h"
#include "util/string_parser.hpp"

namespace starrocks::csv {

// Writes the value stored at `row_num` as uppercase hexadecimal text: two
// characters per byte, no prefix and no separators (bytes {0xAB, 0x01} are
// written as "AB01"). An empty value produces an empty write.
//
// @param os       stream that receives the hex text
// @param column   source column; it is down_cast to BinaryColumn
// @param row_num  index of the row to serialize
// @param options  currently unused (see the TODO below)
// @return the Status returned by the single os->write() call
Status VarBinaryConverter::write_string(OutputStream* os, const Column& column, size_t row_num,
                                        const Options& options) const {
    // Nibble -> uppercase hex digit. A table lookup avoids constructing a
    // std::stringstream and running iostream manipulators for every cell.
    static constexpr char kHexDigits[] = "0123456789ABCDEF";

    auto* binary = down_cast<const BinaryColumn*>(&column);
    auto& bytes = binary->get_bytes();
    auto& offsets = binary->get_offset();
    // TODO: support binary type config later

    // The row's payload is the byte range [offsets[row_num], offsets[row_num + 1]).
    Slice str(&bytes[offsets[row_num]], offsets[row_num + 1] - offsets[row_num]);

    std::string hex;
    hex.reserve(str.size * 2);
    for (size_t i = 0; i < str.size; ++i) {
        // Go through unsigned char so bytes >= 0x80 are not sign-extended.
        const auto byte = static_cast<unsigned char>(str.data[i]);
        hex.push_back(kHexDigits[byte >> 4]);
        hex.push_back(kHexDigits[byte & 0x0F]);
    }
    // from binary to hex
    return os->write(Slice(hex));
}

// Writes the value at `row_num` as hex text wrapped in double quotes.
//
// The payload is produced by write_string(); no escaping is applied between
// the quotes (write_string() emits only the characters 0-9 and A-F).
//
// @param os       stream that receives the quoted text
// @param column   source column, forwarded to write_string()
// @param row_num  index of the row to serialize
// @param options  forwarded to write_string()
// @return the first non-OK Status from any of the three writes, otherwise the
//         Status of the closing-quote write
Status VarBinaryConverter::write_quoted_string(OutputStream* os, const Column& column, size_t row_num,
const Options& options) const {
// Opening quote.
RETURN_IF_ERROR(os->write('"'));
// Hex-encoded payload.
RETURN_IF_ERROR(write_string(os, column, row_num, options));
// Closing quote; its Status is the function's result.
return os->write('"');
}

// Decodes a hex-encoded field and appends the resulting bytes to `column`.
//
// Leading whitespace is skipped; the remainder must consist of an even number
// of hex digits (upper or lower case). On any validation failure a warning is
// logged, nothing is appended, and false is returned.
//
// @param column   destination column; it is down_cast to BinaryColumn
// @param s        raw field text
// @param options  if options.type_desc is set, a positive `len` caps the
//                 decoded size in bytes
// @return true when a value was appended, false when the input was rejected
bool VarBinaryConverter::read_string(Column* column, const Slice& s, const Options& options) const {
    // A non-positive limit disables the per-column size check below.
    const int max_size = (options.type_desc != nullptr) ? options.type_desc->len : 0;

    char* const text = static_cast<char*>(s.data);
    const int first = StringParser::skip_leading_whitespace(text, s.size);
    const int hex_chars = s.size - first;

    // Two hex characters encode one byte, so an odd count can never be valid.
    if (hex_chars % 2 != 0) {
        LOG(WARNING) << "Column [" << column->get_name() << "]'s length invalid:" << s.to_string();
        return false;
    }

    // The decoded size must respect both the global cap and the column's own cap.
    const int decoded_len = hex_chars / 2;
    const bool over_global_cap = decoded_len > TypeDescriptor::MAX_VARCHAR_LENGTH;
    const bool over_column_cap = max_size > 0 && decoded_len > max_size;
    if (UNLIKELY(over_global_cap || over_column_cap)) {
        LOG(WARNING) << "Column [" << column->get_name() << "]'s length exceed max varbinary length.";
        return false;
    }

    // Reject the field on the first character that is not a hex digit.
    const auto is_hex_digit = [](char c) {
        return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f');
    };
    const char* const end = text + s.size;
    for (const char* cur = text + first; cur != end; ++cur) {
        if (UNLIKELY(!is_hex_digit(*cur))) {
            LOG(WARNING) << "Invalid input's not a legal hex-encoded value:" << s.to_string();
            return false;
        }
    }

    // Decode into a scratch buffer, then copy the bytes into the column.
    std::unique_ptr<char[]> decoded(new char[decoded_len]);
    strings::a2b_hex(text + first, decoded.get(), decoded_len);
    down_cast<BinaryColumn*>(column)->append(Slice(decoded.get(), decoded_len));

    return true;
}

// Reads a double-quoted, hex-encoded field: removes the enclosing quotes and
// hands the remainder to read_string().
//
// @param column   destination column, forwarded to read_string()
// @param tmp_s    raw field text including the quotes; it is not modified
// @param options  forwarded to read_string()
// @return false if remove_enclosing_quotes() fails, otherwise the result of
//         read_string()
bool VarBinaryConverter::read_quoted_string(Column* column, const Slice& tmp_s, const Options& options) const {
    // Operate on a local copy because remove_enclosing_quotes() receives the
    // slice by pointer while the argument is const.
    Slice unquoted = tmp_s;
    // TODO: need write quote for binary?
    const bool quotes_removed = remove_enclosing_quotes<'"'>(&unquoted);
    return quotes_removed && read_string(column, unquoted, options);
}

} // namespace starrocks::csv
30 changes: 30 additions & 0 deletions be/src/formats/csv/varbinary_converter.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "formats/csv/converter.h"

namespace starrocks::csv {

// CSV converter for VARBINARY values. Declares the four Converter overrides
// used to write a column value as text and to read text back into a column;
// the definitions live in varbinary_converter.cpp.
class VarBinaryConverter final : public Converter {
public:
// Writes the value at `row_num` of `column` to `os` as text.
Status write_string(OutputStream* os, const Column& column, size_t row_num, const Options& options) const override;
// Quoted counterpart of write_string().
Status write_quoted_string(OutputStream* os, const Column& column, size_t row_num,
const Options& options) const override;
// Parses `s` and appends the value to `column`; returns false if `s` is rejected.
bool read_string(Column* column, const Slice& s, const Options& options) const override;
// Quoted counterpart of read_string().
bool read_quoted_string(Column* column, const Slice& s, const Options& options) const override;
};

} // namespace starrocks::csv
2 changes: 2 additions & 0 deletions be/src/formats/json/nullable_column.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,8 @@ static Status add_adpative_nullable_column(Column* column, const TypeDescriptor&
return add_adaptive_nullable_numeric_column<float>(column, type_desc, name, value);
case TYPE_JSON:
return add_adpative_nullable_native_json_column(column, type_desc, name, value);
case TYPE_VARBINARY:
return add_adpative_nullable_native_json_column(column, type_desc, name, value);
case TYPE_ARRAY: {
try {
if (value->type() == simdjson::ondemand::json_type::array) {
Expand Down
7 changes: 7 additions & 0 deletions be/src/runtime/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,13 @@ struct TypeDescriptor {
return ret;
}

// Builds a TypeDescriptor whose type is TYPE_VARBINARY.
// `len` is stored unchanged in the descriptor's `len` field; all other fields
// keep whatever the default constructor gives them.
static TypeDescriptor create_varbinary_type(int len) {
    TypeDescriptor desc;
    desc.len = len;
    desc.type = TYPE_VARBINARY;
    return desc;
}

static TypeDescriptor create_json_type() {
TypeDescriptor res;
res.type = TYPE_JSON;
Expand Down
3 changes: 2 additions & 1 deletion be/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,6 @@ set(EXEC_FILES
./exprs/runtime_filter_test.cpp
./exprs/subfield_expr_test.cpp
./formats/csv/array_converter_test.cpp
./formats/csv/binary_converter_test.cpp
./formats/csv/boolean_converter_test.cpp
./formats/csv/date_converter_test.cpp
./formats/csv/datetime_converter_test.cpp
Expand All @@ -133,6 +132,8 @@ set(EXEC_FILES
./formats/csv/nullable_converter_test.cpp
./formats/csv/numeric_converter_test.cpp
./formats/csv/default_value_converter_test.cpp
./formats/csv/string_converter_test.cpp
./formats/csv/varbinary_converter_test.cpp
./formats/json/binary_column_test.cpp
./formats/json/numeric_column_test.cpp
./formats/json/nullable_column_test.cpp
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@

namespace starrocks::csv {

class BinaryConverterTest : public ::testing::Test {
class StringConverterTest : public ::testing::Test {
public:
BinaryConverterTest() {
StringConverterTest() {
_type.type = TYPE_VARCHAR;
_type.len = 6000;
}
Expand All @@ -33,7 +33,7 @@ class BinaryConverterTest : public ::testing::Test {
};

// NOLINTNEXTLINE
TEST_F(BinaryConverterTest, test_read_string) {
TEST_F(StringConverterTest, test_read_string) {
auto conv = csv::get_converter(_type, false);
auto col = ColumnHelper::create_column(_type, false);

Expand All @@ -50,7 +50,7 @@ TEST_F(BinaryConverterTest, test_read_string) {
}

// NOLINTNEXTLINE
TEST_F(BinaryConverterTest, test_read_large_string01) {
TEST_F(StringConverterTest, test_read_large_string01) {
auto conv = csv::get_converter(_type, false);
auto col = ColumnHelper::create_column(_type, false);

Expand All @@ -59,7 +59,7 @@ TEST_F(BinaryConverterTest, test_read_large_string01) {
}

// NOLINTNEXTLINE
TEST_F(BinaryConverterTest, test_read_large_string02) {
TEST_F(StringConverterTest, test_read_large_string02) {
TypeDescriptor varchar_type;
varchar_type.type = TYPE_VARCHAR;
varchar_type.len = 10;
Expand All @@ -74,7 +74,7 @@ TEST_F(BinaryConverterTest, test_read_large_string02) {
}

// NOLINTNEXTLINE
TEST_F(BinaryConverterTest, test_read_quoted_string) {
TEST_F(StringConverterTest, test_read_quoted_string) {
auto conv = csv::get_converter(_type, false);
auto col = ColumnHelper::create_column(_type, false);

Expand All @@ -92,7 +92,7 @@ TEST_F(BinaryConverterTest, test_read_quoted_string) {
}

// NOLINTNEXTLINE
TEST_F(BinaryConverterTest, test_read_large_quoted_string01) {
TEST_F(StringConverterTest, test_read_large_quoted_string01) {
auto conv = csv::get_converter(_type, false);
auto col = ColumnHelper::create_column(_type, false);

Expand All @@ -106,7 +106,7 @@ TEST_F(BinaryConverterTest, test_read_large_quoted_string01) {
}

// NOLINTNEXTLINE
TEST_F(BinaryConverterTest, test_read_large_quoted_string02) {
TEST_F(StringConverterTest, test_read_large_quoted_string02) {
auto conv = csv::get_converter(_type, false);
auto col = ColumnHelper::create_column(_type, false);

Expand All @@ -120,7 +120,7 @@ TEST_F(BinaryConverterTest, test_read_large_quoted_string02) {
}

// NOLINTNEXTLINE
TEST_F(BinaryConverterTest, test_read_large_quoted_string03) {
TEST_F(StringConverterTest, test_read_large_quoted_string03) {
auto conv = csv::get_converter(_type, false);
auto col = ColumnHelper::create_column(_type, false);

Expand All @@ -138,7 +138,7 @@ TEST_F(BinaryConverterTest, test_read_large_quoted_string03) {
}

// NOLINTNEXTLINE
TEST_F(BinaryConverterTest, test_read_large_quoted_string04) {
TEST_F(StringConverterTest, test_read_large_quoted_string04) {
TypeDescriptor varchar_type;
varchar_type.type = TYPE_VARCHAR;
varchar_type.len = 10;
Expand All @@ -159,7 +159,7 @@ TEST_F(BinaryConverterTest, test_read_large_quoted_string04) {
}

// NOLINTNEXTLINE
TEST_F(BinaryConverterTest, test_write_string) {
TEST_F(StringConverterTest, test_write_string) {
auto conv = csv::get_converter(_type, false);
auto col = ColumnHelper::create_column(_type, false);
(void)col->append_strings({"aaaaaaaaaaaa", "bbbbbbbb", "\"\"", "ccccc"});
Expand Down
Loading

0 comments on commit 77821ab

Please sign in to comment.