From 818f71d085b6f820903afc6b1f1e577d8e45ff47 Mon Sep 17 00:00:00 2001 From: sgilmore10 <74676073+sgilmore10@users.noreply.github.com> Date: Thu, 26 Oct 2023 15:57:48 -0400 Subject: [PATCH 1/6] GH-38418: [MATLAB] Add method for extracting one row of an `arrow.tabular.Table` as a string (#38463) ### Rationale for this change We would like to modify the display of the `arrow.tabular.Table` and `arrow.tabular.RecordBatch` classes to be more "MATLAB-like". In order to do this, we need to add a method to their respective C++ Proxy classes that returns a single row of the Table/RecordBatch as a MATLAB `string` array. ### What changes are included in this PR? Added new function template: ```cpp template arrow::matlab::tabular::print_row(const std::shared_ptr& tabularObject, const int64_t row_index) ``` This function template returns a string representation of the specified row in `tabbularObject`. Added a new proxy method called `getRowString` to both the `Table` and `RecordBatch` C++ proxy classes. These methods invoke `print_row` to return a string representation of one row in the `Table`/`RecordBatch`. Neither MATLAB class `arrow.tabular.Table` nor `arrow.tabular.RecordBatch` expose these methods directly because they will only be used internally for display. Below is an example Output of `getRowString()`: ```matlab >> matlabTable = table([1; 2; 3], ["ABC"; "DE"; "FGH"], datetime(2023, 10, 25) + days(0:2)'); >> arrowTable = arrow.table(matlabTable); >> rowOneAsString = arrowTable.Proxy.getRowString(struct(Index=int64(1))) rowOneAsString = "1 | "ABC" | 2023-10-25 00:00:00.000000" ``` ### Are these changes tested? Yes, added a new test class called `tTabularInternal.m`. Because `getRowString()` is not a method on the MATLAB classes `arrow.tabular.Table` and `arrow.tabular.RecordBatch`, this test class calls `getRowString()` on their `Proxy` properties, which are public but hidden. ### Are there any user-facing changes? No. * Closes: #38418 Lead-authored-by: Sarah Gilmore Co-authored-by: sgilmore10 <74676073+sgilmore10@users.noreply.github.com> Co-authored-by: Kevin Gurney Signed-off-by: Kevin Gurney --- matlab/src/cpp/arrow/matlab/error/error.h | 1 + .../arrow/matlab/tabular/get_row_as_string.h | 77 ++++++++++++ .../matlab/tabular/proxy/record_batch.cc | 18 +++ .../arrow/matlab/tabular/proxy/record_batch.h | 1 + .../cpp/arrow/matlab/tabular/proxy/table.cc | 19 +++ .../cpp/arrow/matlab/tabular/proxy/table.h | 1 + .../+tabular/createAllSupportedArrayTypes.m | 5 +- matlab/test/arrow/tabular/tTabularInternal.m | 110 ++++++++++++++++++ 8 files changed, 230 insertions(+), 2 deletions(-) create mode 100644 matlab/src/cpp/arrow/matlab/tabular/get_row_as_string.h create mode 100644 matlab/test/arrow/tabular/tTabularInternal.m diff --git a/matlab/src/cpp/arrow/matlab/error/error.h b/matlab/src/cpp/arrow/matlab/error/error.h index 2d8f5c432c96e..33e80bca8cc5a 100644 --- a/matlab/src/cpp/arrow/matlab/error/error.h +++ b/matlab/src/cpp/arrow/matlab/error/error.h @@ -202,4 +202,5 @@ namespace arrow::matlab::error { static const char* INDEX_OUT_OF_RANGE = "arrow:index:OutOfRange"; static const char* BUFFER_VIEW_OR_COPY_FAILED = "arrow:buffer:ViewOrCopyFailed"; static const char* ARRAY_PRETTY_PRINT_FAILED = "arrow:array:PrettyPrintFailed"; + static const char* TABULAR_GET_ROW_AS_STRING_FAILED = "arrow:tabular:GetRowAsStringFailed"; } diff --git a/matlab/src/cpp/arrow/matlab/tabular/get_row_as_string.h b/matlab/src/cpp/arrow/matlab/tabular/get_row_as_string.h new file mode 100644 index 0000000000000..824b6c19a7109 --- /dev/null +++ b/matlab/src/cpp/arrow/matlab/tabular/get_row_as_string.h @@ -0,0 +1,77 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/pretty_print.h" + +#include + +namespace arrow::matlab::tabular { + + namespace { + arrow::PrettyPrintOptions make_pretty_print_options() { + auto opts = arrow::PrettyPrintOptions::Defaults(); + opts.skip_new_lines = true; + opts.array_delimiters.open = ""; + opts.array_delimiters.close = ""; + opts.chunked_array_delimiters.open = ""; + opts.chunked_array_delimiters.close = ""; + return opts; + } + } + + template + arrow::Result get_row_as_string(const std::shared_ptr& tabular_object, const int64_t matlab_row_index) { + std::stringstream ss; + const int64_t row_index = matlab_row_index - 1; + if (row_index >= tabular_object->num_rows() || row_index < 0) { + ss << "Invalid Row Index: " << matlab_row_index; + return arrow::Status::Invalid(ss.str()); + } + + const auto opts = make_pretty_print_options(); + const auto num_columns = tabular_object->num_columns(); + const auto& columns = tabular_object->columns(); + + for (int32_t i = 0; i < num_columns; ++i) { + const auto& column = columns[i]; + const auto type_id = column->type()->id(); + if (arrow::is_primitive(type_id) || arrow::is_string(type_id)) { + auto slice = column->Slice(row_index, 1); + ARROW_RETURN_NOT_OK(arrow::PrettyPrint(*slice, opts, &ss)); + } else if (type_id == arrow::Type::type::STRUCT) { + // Use as a placeholder since we don't have a good + // way to display StructArray elements horiztonally on screen. + ss << ""; + } else if (type_id == arrow::Type::type::LIST) { + // Use as a placeholder since we don't have a good + // way to display ListArray elements horiztonally on screen. + ss << ""; + } else { + return arrow::Status::NotImplemented("Datatype " + column->type()->ToString() + "is not currently supported for display."); + } + + if (i + 1 < num_columns) { + // Only add the delimiter if there is at least + // one more element to print. + ss << " | "; + } + } + return ss.str(); + } +} \ No newline at end of file diff --git a/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.cc b/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.cc index 679c7382f6532..7d24ad01d7e73 100644 --- a/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.cc +++ b/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.cc @@ -23,6 +23,7 @@ #include "arrow/matlab/error/error.h" #include "arrow/matlab/tabular/proxy/record_batch.h" #include "arrow/matlab/tabular/proxy/schema.h" +#include "arrow/matlab/tabular/get_row_as_string.h" #include "arrow/type.h" #include "arrow/util/utf8.h" @@ -58,6 +59,7 @@ namespace arrow::matlab::tabular::proxy { REGISTER_METHOD(RecordBatch, getColumnByIndex); REGISTER_METHOD(RecordBatch, getColumnByName); REGISTER_METHOD(RecordBatch, getSchema); + REGISTER_METHOD(RecordBatch, getRowAsString); } std::shared_ptr RecordBatch::unwrap() { @@ -218,4 +220,20 @@ namespace arrow::matlab::tabular::proxy { context.outputs[0] = schema_proxy_id_mda; } + void RecordBatch::getRowAsString(libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + using namespace libmexclass::proxy; + mda::ArrayFactory factory; + + mda::StructArray args = context.inputs[0]; + const mda::TypedArray index_mda = args[0]["Index"]; + const auto matlab_row_index = int64_t(index_mda[0]); + + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto row_str_utf8, arrow::matlab::tabular::get_row_as_string(record_batch, matlab_row_index), + context, error::TABULAR_GET_ROW_AS_STRING_FAILED); + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto row_str_utf16, arrow::util::UTF8StringToUTF16(row_str_utf8), + context, error::UNICODE_CONVERSION_ERROR_ID); + context.outputs[0] = factory.createScalar(row_str_utf16); + } + } diff --git a/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.h b/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.h index b136ad1ea5db1..c417d8198f9ad 100644 --- a/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.h +++ b/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.h @@ -41,6 +41,7 @@ namespace arrow::matlab::tabular::proxy { void getColumnByIndex(libmexclass::proxy::method::Context& context); void getColumnByName(libmexclass::proxy::method::Context& context); void getSchema(libmexclass::proxy::method::Context& context); + void getRowAsString(libmexclass::proxy::method::Context& context); std::shared_ptr record_batch; }; diff --git a/matlab/src/cpp/arrow/matlab/tabular/proxy/table.cc b/matlab/src/cpp/arrow/matlab/tabular/proxy/table.cc index 228e28dad9e9c..cf628407b1742 100644 --- a/matlab/src/cpp/arrow/matlab/tabular/proxy/table.cc +++ b/matlab/src/cpp/arrow/matlab/tabular/proxy/table.cc @@ -24,6 +24,8 @@ #include "arrow/matlab/error/error.h" #include "arrow/matlab/tabular/proxy/table.h" #include "arrow/matlab/tabular/proxy/schema.h" +#include "arrow/matlab/tabular/get_row_as_string.h" + #include "arrow/type.h" #include "arrow/util/utf8.h" @@ -57,6 +59,7 @@ namespace arrow::matlab::tabular::proxy { REGISTER_METHOD(Table, getSchema); REGISTER_METHOD(Table, getColumnByIndex); REGISTER_METHOD(Table, getColumnByName); + REGISTER_METHOD(Table, getRowAsString); } std::shared_ptr Table::unwrap() { @@ -212,4 +215,20 @@ namespace arrow::matlab::tabular::proxy { context.outputs[0] = chunked_array_proxy_id_mda; } + void Table::getRowAsString(libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + using namespace libmexclass::proxy; + mda::ArrayFactory factory; + + mda::StructArray args = context.inputs[0]; + const mda::TypedArray index_mda = args[0]["Index"]; + const auto matlab_row_index = int64_t(index_mda[0]); + + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto row_str_utf8, arrow::matlab::tabular::get_row_as_string(table, matlab_row_index), + context, error::TABULAR_GET_ROW_AS_STRING_FAILED); + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto row_str_utf16, arrow::util::UTF8StringToUTF16(row_str_utf8), + context, error::UNICODE_CONVERSION_ERROR_ID); + context.outputs[0] = factory.createScalar(row_str_utf16); + } + } diff --git a/matlab/src/cpp/arrow/matlab/tabular/proxy/table.h b/matlab/src/cpp/arrow/matlab/tabular/proxy/table.h index dae86a180b7a6..bfcea15bbd1c3 100644 --- a/matlab/src/cpp/arrow/matlab/tabular/proxy/table.h +++ b/matlab/src/cpp/arrow/matlab/tabular/proxy/table.h @@ -41,6 +41,7 @@ namespace arrow::matlab::tabular::proxy { void getSchema(libmexclass::proxy::method::Context& context); void getColumnByIndex(libmexclass::proxy::method::Context& context); void getColumnByName(libmexclass::proxy::method::Context& context); + void getRowAsString(libmexclass::proxy::method::Context& context); std::shared_ptr table; }; diff --git a/matlab/src/matlab/+arrow/+internal/+test/+tabular/createAllSupportedArrayTypes.m b/matlab/src/matlab/+arrow/+internal/+test/+tabular/createAllSupportedArrayTypes.m index ad2f026d64e20..a9682d317354b 100644 --- a/matlab/src/matlab/+arrow/+internal/+test/+tabular/createAllSupportedArrayTypes.m +++ b/matlab/src/matlab/+arrow/+internal/+test/+tabular/createAllSupportedArrayTypes.m @@ -24,8 +24,8 @@ end % Seed the random number generator to ensure - % reproducible results in tests. - rng(1); + % reproducible results in tests across MATLAB sessions. + rng(1, "twister"); import arrow.type.ID import arrow.array.* @@ -101,6 +101,7 @@ % Return the class names as a string array classes = string({metaClass.Name}); + classes = sort(classes); end function dict = getNumericArrayToMatlabDictionary() diff --git a/matlab/test/arrow/tabular/tTabularInternal.m b/matlab/test/arrow/tabular/tTabularInternal.m new file mode 100644 index 0000000000000..28075d7763dea --- /dev/null +++ b/matlab/test/arrow/tabular/tTabularInternal.m @@ -0,0 +1,110 @@ +%TTABULARINTERNAL Unit tests for internal functionality of tabular types. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +classdef tTabularInternal < matlab.unittest.TestCase + + properties(TestParameter) + TabularObjectWithAllTypes + + TabularObjectWithOneColumn + + TabularObjectWithThreeRows + end + + methods (TestParameterDefinition, Static) + function TabularObjectWithAllTypes = initializeTabularObjectWithAllTypes() + arrays = arrow.internal.test.tabular.createAllSupportedArrayTypes(NumRows=1); + arrowTable = arrow.tabular.Table.fromArrays(arrays{:}); + arrowRecordBatch = arrow.tabular.Table.fromArrays(arrays{:}); + TabularObjectWithAllTypes = struct(Table=arrowTable, ... + RecordBatch=arrowRecordBatch); + end + + function TabularObjectWithOneColumn = initializeTabularObjectWithOneColumn() + t = table((1:3)'); + arrowTable = arrow.table(t); + arrowRecordBatch = arrow.recordBatch(t); + TabularObjectWithOneColumn = struct(Table=arrowTable, ... + RecordBatch=arrowRecordBatch); + end + + function TabularObjectWithThreeRows = initializeTabularObjectWithThreeRows() + t = table((1:3)', ["A"; "B"; "C"]); + arrowTable = arrow.table(t); + arrowRecordBatch = arrow.recordBatch(t); + TabularObjectWithThreeRows = struct(Table=arrowTable, ... + RecordBatch=arrowRecordBatch); + end + end + + methods (Test) + function RowWithAllTypes(testCase, TabularObjectWithAllTypes) + % Verify getRowAsString successfully returns the expected string + % when called on a Table/RecordBatch that contains all + % supported array types. + proxy = TabularObjectWithAllTypes.Proxy; + columnStrs = ["false", "2024-02-23", "2023-08-24", "78", "38", ... + "24", "48", "89", "102", "", """107""", "", ... + "00:03:44", "00:00:07.000000", "2024-02-10 00:00:00.000000", ... + "107", "143", "36", "51"]; + expectedString = strjoin(columnStrs, " | "); + actualString = proxy.getRowAsString(struct(Index=int64(1))); + testCase.verifyEqual(actualString, expectedString); + end + + function RowWithOneColumn(testCase, TabularObjectWithOneColumn) + % Verify getRowAsString successfully returns the expected string + % when called on a Table/RecordBatch with one column. + proxy = TabularObjectWithOneColumn.Proxy; + expectedString = "1"; + actualString = proxy.getRowAsString(struct(Index=int64(1))); + testCase.verifyEqual(actualString, expectedString); + end + + function RowIndex(testCase, TabularObjectWithThreeRows) + % Verify getRowAsString returns the expected string for + % the provided row index. + proxy = TabularObjectWithThreeRows.Proxy; + + actualString = proxy.getRowAsString(struct(Index=int64(1))); + expectedString = "1 | ""A"""; + testCase.verifyEqual(actualString, expectedString); + + actualString = proxy.getRowAsString(struct(Index=int64(2))); + expectedString = "2 | ""B"""; + testCase.verifyEqual(actualString, expectedString); + + actualString = proxy.getRowAsString(struct(Index=int64(3))); + expectedString = "3 | ""C"""; + testCase.verifyEqual(actualString, expectedString); + end + + function GetRowAsStringFailed(testCase, TabularObjectWithThreeRows) + % Verify getRowAsString throws an error with the ID + % arrow:tabular:GetRowAsStringFailed if provided invalid index + % values. + proxy = TabularObjectWithThreeRows.Proxy; + fcn = @() proxy.getRowAsString(struct(Index=int64(0))); + testCase.verifyError(fcn, "arrow:tabular:GetRowAsStringFailed"); + + fcn = @() proxy.getRowAsString(struct(Index=int64(4))); + testCase.verifyError(fcn, "arrow:tabular:GetRowAsStringFailed"); + end + + end + +end \ No newline at end of file From f62213921b003cc716d6fe50d8604560cea4a3d4 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Thu, 26 Oct 2023 17:28:48 -0400 Subject: [PATCH 2/6] GH-37710: [C++][Integration] Add C++ Utf8View implementation (#37792) ### Rationale for this change After the PR changing the spec and schema ( https://github.com/apache/arrow/pull/37526 ) is accepted, this PR will be undrafted. It adds the minimal addition of a C++ implementation and was extracted from the original C++ Utf8View pr ( https://github.com/apache/arrow/pull/35628 ) for ease of review. ### What changes are included in this PR? - The new types are available with new subclasses of DataType, Array, ArrayBuilder, ... - The values of string view arrays can be visited as `std::string_view` as with StringArray - String view arrays can be round tripped through IPC and integration JSON * Closes: #37710 Relevant mailing list discussions: * https://lists.apache.org/thread/l8t1vj5x1wdf75mdw3wfjvnxrfy5xomy * https://lists.apache.org/thread/3qhkomvvc69v3gkotbwldyko7yk9cs9k Authored-by: Benjamin Kietzman Signed-off-by: Benjamin Kietzman --- cpp/src/arrow/array/array_base.cc | 2 + cpp/src/arrow/array/array_binary.cc | 28 ++ cpp/src/arrow/array/array_binary.h | 60 ++++ cpp/src/arrow/array/array_binary_test.cc | 138 +++++++-- cpp/src/arrow/array/array_test.cc | 8 +- cpp/src/arrow/array/array_view_test.cc | 32 +++ cpp/src/arrow/array/builder_base.cc | 16 +- cpp/src/arrow/array/builder_binary.cc | 69 ++++- cpp/src/arrow/array/builder_binary.h | 261 +++++++++++++++++- cpp/src/arrow/array/builder_dict.cc | 6 + cpp/src/arrow/array/builder_dict.h | 7 + cpp/src/arrow/array/concatenate.cc | 40 +++ cpp/src/arrow/array/concatenate_test.cc | 14 + cpp/src/arrow/array/data.cc | 63 ++++- cpp/src/arrow/array/data.h | 17 +- cpp/src/arrow/array/dict_internal.h | 26 ++ cpp/src/arrow/array/diff.cc | 31 ++- cpp/src/arrow/array/diff_test.cc | 13 + cpp/src/arrow/array/util.cc | 54 +++- cpp/src/arrow/array/validate.cc | 162 ++++++++--- cpp/src/arrow/buffer_builder.h | 7 +- cpp/src/arrow/builder.cc | 4 +- cpp/src/arrow/compare.cc | 28 +- cpp/src/arrow/compute/kernels/vector_hash.cc | 95 ++----- .../engine/substrait/expression_internal.cc | 2 + .../arrow/engine/substrait/type_internal.cc | 2 + cpp/src/arrow/integration/json_internal.cc | 211 ++++++++++++-- cpp/src/arrow/ipc/feather.cc | 4 +- cpp/src/arrow/ipc/feather_test.cc | 6 +- cpp/src/arrow/ipc/json_simple.cc | 4 + cpp/src/arrow/ipc/json_simple_test.cc | 10 +- cpp/src/arrow/ipc/metadata_internal.cc | 41 ++- cpp/src/arrow/ipc/metadata_internal.h | 6 +- cpp/src/arrow/ipc/read_write_test.cc | 91 +++--- cpp/src/arrow/ipc/reader.cc | 35 ++- cpp/src/arrow/ipc/test_common.cc | 53 ++-- cpp/src/arrow/ipc/test_common.h | 2 +- cpp/src/arrow/ipc/writer.cc | 24 +- cpp/src/arrow/ipc/writer.h | 1 + cpp/src/arrow/json/converter.cc | 2 + cpp/src/arrow/json/converter_test.cc | 4 +- cpp/src/arrow/json/parser.cc | 1 + cpp/src/arrow/json/test_common.h | 10 +- cpp/src/arrow/pretty_print.cc | 18 +- cpp/src/arrow/scalar.cc | 19 +- cpp/src/arrow/scalar.h | 43 ++- cpp/src/arrow/testing/gtest_util.h | 7 + cpp/src/arrow/testing/random.cc | 54 +++- cpp/src/arrow/testing/random.h | 26 +- cpp/src/arrow/testing/random_test.cc | 1 + cpp/src/arrow/type.cc | 28 +- cpp/src/arrow/type.h | 124 ++++++++- cpp/src/arrow/type_fwd.h | 21 ++ cpp/src/arrow/type_test.cc | 12 + cpp/src/arrow/type_traits.cc | 2 + cpp/src/arrow/type_traits.h | 45 +++ cpp/src/arrow/util/binary_view_util.h | 95 +++++++ cpp/src/arrow/util/string.cc | 42 ++- cpp/src/arrow/util/string.h | 4 +- cpp/src/arrow/visit_data_inline.h | 47 +++- cpp/src/arrow/visitor.cc | 8 +- cpp/src/arrow/visitor.h | 6 + cpp/src/arrow/visitor_generate.h | 2 + .../parquet/arrow/arrow_reader_writer_test.cc | 3 +- cpp/src/parquet/column_writer.cc | 2 + dev/archery/archery/integration/datagen.py | 5 +- .../src/arrow/python/arrow_to_pandas.cc | 3 +- .../src/arrow/python/python_to_arrow.cc | 12 +- r/src/r_to_arrow.cpp | 7 +- 69 files changed, 1951 insertions(+), 375 deletions(-) create mode 100644 cpp/src/arrow/util/binary_view_util.h diff --git a/cpp/src/arrow/array/array_base.cc b/cpp/src/arrow/array/array_base.cc index f7b8d7954e1cf..eab71de27b11a 100644 --- a/cpp/src/arrow/array/array_base.cc +++ b/cpp/src/arrow/array/array_base.cc @@ -87,6 +87,8 @@ struct ScalarFromArraySlotImpl { return Finish(a.GetString(index_)); } + Status Visit(const BinaryViewArray& a) { return Finish(a.GetString(index_)); } + Status Visit(const FixedSizeBinaryArray& a) { return Finish(a.GetString(index_)); } Status Visit(const DayTimeIntervalArray& a) { return Finish(a.Value(index_)); } diff --git a/cpp/src/arrow/array/array_binary.cc b/cpp/src/arrow/array/array_binary.cc index 9466b5a48f9d7..d83ba0ca8936d 100644 --- a/cpp/src/arrow/array/array_binary.cc +++ b/cpp/src/arrow/array/array_binary.cc @@ -24,6 +24,7 @@ #include "arrow/array/validate.h" #include "arrow/type.h" #include "arrow/type_traits.h" +#include "arrow/util/binary_view_util.h" #include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" @@ -89,6 +90,33 @@ LargeStringArray::LargeStringArray(int64_t length, Status LargeStringArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); } +BinaryViewArray::BinaryViewArray(std::shared_ptr data) { + ARROW_CHECK_EQ(data->type->id(), Type::BINARY_VIEW); + SetData(std::move(data)); +} + +BinaryViewArray::BinaryViewArray(std::shared_ptr type, int64_t length, + std::shared_ptr views, BufferVector buffers, + std::shared_ptr null_bitmap, int64_t null_count, + int64_t offset) { + buffers.insert(buffers.begin(), std::move(views)); + buffers.insert(buffers.begin(), std::move(null_bitmap)); + SetData( + ArrayData::Make(std::move(type), length, std::move(buffers), null_count, offset)); +} + +std::string_view BinaryViewArray::GetView(int64_t i) const { + const std::shared_ptr* data_buffers = data_->buffers.data() + 2; + return util::FromBinaryView(raw_values_[i], data_buffers); +} + +StringViewArray::StringViewArray(std::shared_ptr data) { + ARROW_CHECK_EQ(data->type->id(), Type::STRING_VIEW); + SetData(std::move(data)); +} + +Status StringViewArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); } + FixedSizeBinaryArray::FixedSizeBinaryArray(const std::shared_ptr& data) { SetData(data); } diff --git a/cpp/src/arrow/array/array_binary.h b/cpp/src/arrow/array/array_binary.h index 7e58a96ff841a..fd68a379ddbfb 100644 --- a/cpp/src/arrow/array/array_binary.h +++ b/cpp/src/arrow/array/array_binary.h @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -217,6 +218,65 @@ class ARROW_EXPORT LargeStringArray : public LargeBinaryArray { Status ValidateUTF8() const; }; +// ---------------------------------------------------------------------- +// BinaryView and StringView + +/// Concrete Array class for variable-size binary view data using the +/// BinaryViewType::c_type struct to reference in-line or out-of-line string values +class ARROW_EXPORT BinaryViewArray : public FlatArray { + public: + using TypeClass = BinaryViewType; + using IteratorType = stl::ArrayIterator; + using c_type = BinaryViewType::c_type; + + explicit BinaryViewArray(std::shared_ptr data); + + BinaryViewArray(std::shared_ptr type, int64_t length, + std::shared_ptr views, BufferVector data_buffers, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + // For API compatibility with BinaryArray etc. + std::string_view GetView(int64_t i) const; + std::string GetString(int64_t i) const { return std::string{GetView(i)}; } + + const auto& values() const { return data_->buffers[1]; } + const c_type* raw_values() const { return raw_values_; } + + std::optional operator[](int64_t i) const { + return *IteratorType(*this, i); + } + + IteratorType begin() const { return IteratorType(*this); } + IteratorType end() const { return IteratorType(*this, length()); } + + protected: + using FlatArray::FlatArray; + + void SetData(std::shared_ptr data) { + FlatArray::SetData(std::move(data)); + raw_values_ = data_->GetValuesSafe(1); + } + + const c_type* raw_values_; +}; + +/// Concrete Array class for variable-size string view (utf-8) data using +/// BinaryViewType::c_type to reference in-line or out-of-line string values +class ARROW_EXPORT StringViewArray : public BinaryViewArray { + public: + using TypeClass = StringViewType; + + explicit StringViewArray(std::shared_ptr data); + + using BinaryViewArray::BinaryViewArray; + + /// \brief Validate that this array contains only valid UTF8 entries + /// + /// This check is also implied by ValidateFull() + Status ValidateUTF8() const; +}; + // ---------------------------------------------------------------------- // Fixed width binary diff --git a/cpp/src/arrow/array/array_binary_test.cc b/cpp/src/arrow/array/array_binary_test.cc index 3bc9bb91a022a..04391be0ac789 100644 --- a/cpp/src/arrow/array/array_binary_test.cc +++ b/cpp/src/arrow/array/array_binary_test.cc @@ -27,17 +27,21 @@ #include "arrow/array.h" #include "arrow/array/builder_binary.h" +#include "arrow/array/validate.h" #include "arrow/buffer.h" #include "arrow/memory_pool.h" #include "arrow/status.h" #include "arrow/testing/builder.h" #include "arrow/testing/gtest_util.h" +#include "arrow/testing/matchers.h" #include "arrow/testing/util.h" #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_builders.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/key_value_metadata.h" +#include "arrow/util/logging.h" #include "arrow/visit_data_inline.h" namespace arrow { @@ -365,38 +369,134 @@ TYPED_TEST(TestStringArray, TestValidateOffsets) { this->TestValidateOffsets(); TYPED_TEST(TestStringArray, TestValidateData) { this->TestValidateData(); } +// Produce an Array of index/offset views from a std::vector of index/offset +// BinaryViewType::c_type +Result> MakeBinaryViewArray( + BufferVector data_buffers, const std::vector& views, + bool validate = true) { + auto length = static_cast(views.size()); + auto arr = std::make_shared( + utf8_view(), length, Buffer::FromVector(views), std::move(data_buffers)); + if (validate) { + RETURN_NOT_OK(arr->ValidateFull()); + } + return arr; +} + +TEST(StringViewArray, Validate) { + // Since this is a test of validation, we need to be able to construct invalid arrays. + auto buffer_s = Buffer::FromString("supercalifragilistic(sp?)"); + auto buffer_y = Buffer::FromString("yyyyyyyyyyyyyyyyyyyyyyyyy"); + + // empty array is valid + EXPECT_THAT(MakeBinaryViewArray({}, {}), Ok()); + + // empty array with some data buffers is valid + EXPECT_THAT(MakeBinaryViewArray({buffer_s, buffer_y}, {}), Ok()); + + // inline views need not have a corresponding buffer + EXPECT_THAT(MakeBinaryViewArray({}, + { + util::ToInlineBinaryView("hello"), + util::ToInlineBinaryView("world"), + util::ToInlineBinaryView("inline me"), + }), + Ok()); + + // non-inline views are expected to reference only buffers managed by the array + EXPECT_THAT( + MakeBinaryViewArray( + {buffer_s, buffer_y}, + {util::ToBinaryView("supe", static_cast(buffer_s->size()), 0, 0), + util::ToBinaryView("yyyy", static_cast(buffer_y->size()), 1, 0)}), + Ok()); + + // views may not reference data buffers not present in the array + EXPECT_THAT( + MakeBinaryViewArray( + {}, {util::ToBinaryView("supe", static_cast(buffer_s->size()), 0, 0)}), + Raises(StatusCode::IndexError)); + // ... or ranges which overflow the referenced data buffer + EXPECT_THAT( + MakeBinaryViewArray( + {buffer_s}, {util::ToBinaryView( + "supe", static_cast(buffer_s->size() + 50), 0, 0)}), + Raises(StatusCode::IndexError)); + + // Additionally, the prefixes of non-inline views must match the data buffer + EXPECT_THAT( + MakeBinaryViewArray( + {buffer_s, buffer_y}, + {util::ToBinaryView("SUPE", static_cast(buffer_s->size()), 0, 0), + util::ToBinaryView("yyyy", static_cast(buffer_y->size()), 1, 0)}), + Raises(StatusCode::Invalid)); + + // Invalid string views which are masked by a null bit do not cause validation to fail + auto invalid_but_masked = + MakeBinaryViewArray( + {buffer_s}, + {util::ToBinaryView("SUPE", static_cast(buffer_s->size()), 0, 0), + util::ToBinaryView("yyyy", 50, 40, 30)}, + /*validate=*/false) + .ValueOrDie() + ->data(); + invalid_but_masked->null_count = 2; + invalid_but_masked->buffers[0] = *AllocateEmptyBitmap(2); + EXPECT_THAT(internal::ValidateArrayFull(*invalid_but_masked), Ok()); + + // overlapping views are allowed + EXPECT_THAT( + MakeBinaryViewArray( + {buffer_s}, + { + util::ToBinaryView("supe", static_cast(buffer_s->size()), 0, 0), + util::ToBinaryView("uper", static_cast(buffer_s->size() - 1), 0, + 1), + util::ToBinaryView("perc", static_cast(buffer_s->size() - 2), 0, + 2), + util::ToBinaryView("erca", static_cast(buffer_s->size() - 3), 0, + 3), + }), + Ok()); +} + template class TestUTF8Array : public ::testing::Test { public: using TypeClass = T; - using offset_type = typename TypeClass::offset_type; using ArrayType = typename TypeTraits::ArrayType; - Status ValidateUTF8(int64_t length, std::vector offsets, - std::string_view data, int64_t offset = 0) { - ArrayType arr(length, Buffer::Wrap(offsets), std::make_shared(data), - /*null_bitmap=*/nullptr, /*null_count=*/0, offset); - return arr.ValidateUTF8(); + std::shared_ptr type() const { + if constexpr (is_binary_view_like_type::value) { + return TypeClass::is_utf8 ? utf8_view() : binary_view(); + } else { + return TypeTraits::type_singleton(); + } } - Status ValidateUTF8(const std::string& json) { - auto ty = TypeTraits::type_singleton(); - auto arr = ArrayFromJSON(ty, json); - return checked_cast(*arr).ValidateUTF8(); + Status ValidateUTF8(const Array& arr) { + return checked_cast(arr).ValidateUTF8(); + } + + Status ValidateUTF8(std::vector values) { + std::shared_ptr arr; + ArrayFromVector(type(), values, &arr); + return ValidateUTF8(*arr); } void TestValidateUTF8() { - ASSERT_OK(ValidateUTF8(R"(["Voix", "ambiguë", "d’un", "cœur"])")); - ASSERT_OK(ValidateUTF8(1, {0, 4}, "\xf4\x8f\xbf\xbf")); // \U0010ffff + ASSERT_OK( + ValidateUTF8(*ArrayFromJSON(type(), R"(["Voix", "ambiguë", "d’un", "cœur"])"))); + ASSERT_OK(ValidateUTF8({"\xf4\x8f\xbf\xbf"})); // \U0010ffff - ASSERT_RAISES(Invalid, ValidateUTF8(1, {0, 1}, "\xf4")); + ASSERT_RAISES(Invalid, ValidateUTF8({"\xf4"})); // More tests in TestValidateData() above // (ValidateFull() calls ValidateUTF8() internally) } }; -TYPED_TEST_SUITE(TestUTF8Array, StringArrowTypes); +TYPED_TEST_SUITE(TestUTF8Array, StringOrStringViewArrowTypes); TYPED_TEST(TestUTF8Array, TestValidateUTF8) { this->TestValidateUTF8(); } @@ -883,11 +983,15 @@ class TestBaseBinaryDataVisitor : public ::testing::Test { void SetUp() override { type_ = TypeTraits::type_singleton(); } void TestBasics() { - auto array = ArrayFromJSON(type_, R"(["foo", null, "bar"])"); + auto array = ArrayFromJSON( + type_, + R"(["foo", null, "bar", "inline_me", "allocate_me_aaaaa", "allocate_me_bbbb"])"); BinaryAppender appender; ArraySpanVisitor visitor; ASSERT_OK(visitor.Visit(*array->data(), &appender)); - ASSERT_THAT(appender.data, ::testing::ElementsAreArray({"foo", "(null)", "bar"})); + ASSERT_THAT(appender.data, + ::testing::ElementsAreArray({"foo", "(null)", "bar", "inline_me", + "allocate_me_aaaaa", "allocate_me_bbbb"})); ARROW_UNUSED(visitor); // Workaround weird MSVC warning } @@ -904,7 +1008,7 @@ class TestBaseBinaryDataVisitor : public ::testing::Test { std::shared_ptr type_; }; -TYPED_TEST_SUITE(TestBaseBinaryDataVisitor, BaseBinaryArrowTypes); +TYPED_TEST_SUITE(TestBaseBinaryDataVisitor, BaseBinaryOrBinaryViewLikeArrowTypes); TYPED_TEST(TestBaseBinaryDataVisitor, Basics) { this->TestBasics(); } diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 2bef9d725d37f..46908439ef5f0 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -382,10 +382,12 @@ static std::vector> TestArrayUtilitiesAgainstTheseType float64(), binary(), large_binary(), + binary_view(), fixed_size_binary(3), decimal(16, 4), utf8(), large_utf8(), + utf8_view(), list(utf8()), list(int64()), // NOTE: Regression case for ARROW-9071/MakeArrayOfNull list(large_utf8()), @@ -601,12 +603,15 @@ static ScalarVector GetScalars() { std::make_shared(60, duration(TimeUnit::SECOND)), std::make_shared(hello), std::make_shared(hello), + std::make_shared(hello), std::make_shared( hello, fixed_size_binary(static_cast(hello->size()))), std::make_shared(Decimal128(10), decimal(16, 4)), std::make_shared(Decimal256(10), decimal(76, 38)), std::make_shared(hello), std::make_shared(hello), + std::make_shared(hello), + std::make_shared(Buffer::FromString("long string; not inlined")), std::make_shared(ArrayFromJSON(int8(), "[1, 2, 3]")), ScalarFromJSON(map(int8(), utf8()), R"([[1, "foo"], [2, "bar"]])"), std::make_shared(ArrayFromJSON(int8(), "[1, 1, 2, 2, 3, 3]")), @@ -647,13 +652,14 @@ TEST_F(TestArray, TestMakeArrayFromScalar) { for (int64_t length : {16}) { for (auto scalar : scalars) { + ARROW_SCOPED_TRACE("scalar type: ", scalar->type->ToString()); ASSERT_OK_AND_ASSIGN(auto array, MakeArrayFromScalar(*scalar, length)); ASSERT_OK(array->ValidateFull()); ASSERT_EQ(array->length(), length); ASSERT_EQ(array->null_count(), 0); // test case for ARROW-13321 - for (int64_t i : std::vector{0, length / 2, length - 1}) { + for (int64_t i : {int64_t{0}, length / 2, length - 1}) { ASSERT_OK_AND_ASSIGN(auto s, array->GetScalar(i)); AssertScalarsEqual(*s, *scalar, /*verbose=*/true); } diff --git a/cpp/src/arrow/array/array_view_test.cc b/cpp/src/arrow/array/array_view_test.cc index 07dc3014e4029..97110ea97f3fc 100644 --- a/cpp/src/arrow/array/array_view_test.cc +++ b/cpp/src/arrow/array/array_view_test.cc @@ -126,6 +126,38 @@ TEST(TestArrayView, StringAsBinary) { CheckView(expected, arr); } +TEST(TestArrayView, StringViewAsBinaryView) { + for (auto json : { + R"(["foox", "barz", null])", + R"(["foox", "barz_not_inlined", null])", + }) { + auto arr = ArrayFromJSON(utf8_view(), json); + auto expected = ArrayFromJSON(binary_view(), json); + CheckView(arr, expected); + CheckView(expected, arr); + } +} + +TEST(TestArrayView, StringViewAsBinaryViewInStruct) { + auto padl = ArrayFromJSON(list(int16()), "[[0, -1], [], [42]]"); + auto padr = ArrayFromJSON(utf8(), R"(["foox", "barz", null])"); + + for (auto json : { + R"(["foox", "barz", null])", + R"(["foox", "barz_not_inlined", null])", + }) { + auto arr = + StructArray::Make({padl, ArrayFromJSON(utf8_view(), json), padr}, {"", "", ""}) + .ValueOrDie(); + auto expected = + StructArray::Make({padl, ArrayFromJSON(binary_view(), json), padr}, {"", "", ""}) + .ValueOrDie(); + + CheckView(arr, expected); + CheckView(expected, arr); + } +} + TEST(TestArrayView, PrimitiveWrongSize) { auto arr = ArrayFromJSON(int16(), "[0, -1, 42]"); CheckViewFails(arr, int8()); diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc index 3000aea3e189a..d3502a0ab645a 100644 --- a/cpp/src/arrow/array/builder_base.cc +++ b/cpp/src/arrow/array/builder_base.cc @@ -96,10 +96,7 @@ namespace { template struct AppendScalarImpl { template - enable_if_t::value || is_decimal_type::value || - is_fixed_size_binary_type::value, - Status> - Visit(const T&) { + Status HandleFixedWidth(const T&) { auto builder = checked_cast::BuilderType*>(builder_); RETURN_NOT_OK(builder->Reserve(n_repeats_ * (scalars_end_ - scalars_begin_))); @@ -117,7 +114,16 @@ struct AppendScalarImpl { } template - enable_if_base_binary Visit(const T&) { + enable_if_t::value, Status> Visit(const T& t) { + return HandleFixedWidth(t); + } + + Status Visit(const FixedSizeBinaryType& t) { return HandleFixedWidth(t); } + Status Visit(const Decimal128Type& t) { return HandleFixedWidth(t); } + Status Visit(const Decimal256Type& t) { return HandleFixedWidth(t); } + + template + enable_if_has_string_view Visit(const T&) { int64_t data_size = 0; for (auto it = scalars_begin_; it != scalars_end_; ++it) { const auto& scalar = checked_cast::ScalarType&>(*it); diff --git a/cpp/src/arrow/array/builder_binary.cc b/cpp/src/arrow/array/builder_binary.cc index 571f450aab9c1..3ff22d4a3feeb 100644 --- a/cpp/src/arrow/array/builder_binary.cc +++ b/cpp/src/arrow/array/builder_binary.cc @@ -35,11 +35,70 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" #include "arrow/util/logging.h" +#include "arrow/visit_data_inline.h" namespace arrow { using internal::checked_cast; +// ---------------------------------------------------------------------- +// Binary/StringView +BinaryViewBuilder::BinaryViewBuilder(const std::shared_ptr& type, + MemoryPool* pool) + : BinaryViewBuilder(pool) {} + +Status BinaryViewBuilder::AppendArraySlice(const ArraySpan& array, int64_t offset, + int64_t length) { + auto bitmap = array.GetValues(0, 0); + auto values = array.GetValues(1) + offset; + + int64_t out_of_line_total = 0, i = 0; + VisitNullBitmapInline( + array.buffers[0].data, array.offset, array.length, array.null_count, + [&] { + if (!values[i].is_inline()) { + out_of_line_total += static_cast(values[i].size()); + } + ++i; + }, + [&] { ++i; }); + + RETURN_NOT_OK(Reserve(length)); + RETURN_NOT_OK(ReserveData(out_of_line_total)); + + for (int64_t i = 0; i < length; i++) { + if (bitmap && !bit_util::GetBit(bitmap, array.offset + offset + i)) { + UnsafeAppendNull(); + continue; + } + + UnsafeAppend(util::FromBinaryView(values[i], array.GetVariadicBuffers().data())); + } + return Status::OK(); +} + +Status BinaryViewBuilder::FinishInternal(std::shared_ptr* out) { + ARROW_ASSIGN_OR_RAISE(auto null_bitmap, null_bitmap_builder_.FinishWithLength(length_)); + ARROW_ASSIGN_OR_RAISE(auto data, data_builder_.FinishWithLength(length_)); + BufferVector buffers = {null_bitmap, data}; + for (auto&& buffer : data_heap_builder_.Finish()) { + buffers.push_back(std::move(buffer)); + } + *out = ArrayData::Make(type(), length_, std::move(buffers), null_count_); + Reset(); + return Status::OK(); +} + +Status BinaryViewBuilder::ReserveData(int64_t length) { + return data_heap_builder_.Reserve(length); +} + +void BinaryViewBuilder::Reset() { + ArrayBuilder::Reset(); + data_builder_.Reset(); + data_heap_builder_.Reset(); +} + // ---------------------------------------------------------------------- // Fixed width binary @@ -125,8 +184,8 @@ const uint8_t* FixedSizeBinaryBuilder::GetValue(int64_t i) const { std::string_view FixedSizeBinaryBuilder::GetView(int64_t i) const { const uint8_t* data_ptr = byte_builder_.data(); - return std::string_view(reinterpret_cast(data_ptr + i * byte_width_), - byte_width_); + return {reinterpret_cast(data_ptr + i * byte_width_), + static_cast(byte_width_)}; } // ---------------------------------------------------------------------- @@ -173,10 +232,10 @@ Status ChunkedStringBuilder::Finish(ArrayVector* out) { RETURN_NOT_OK(ChunkedBinaryBuilder::Finish(out)); // Change data type to string/utf8 - for (size_t i = 0; i < out->size(); ++i) { - std::shared_ptr data = (*out)[i]->data(); + for (auto& chunk : *out) { + std::shared_ptr data = chunk->data()->Copy(); data->type = ::arrow::utf8(); - (*out)[i] = std::make_shared(data); + chunk = std::make_shared(std::move(data)); } return Status::OK(); } diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index b0c4fe2fc81fd..3e87cf2403610 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -36,6 +36,7 @@ #include "arrow/buffer_builder.h" #include "arrow/status.h" #include "arrow/type.h" +#include "arrow/util/binary_view_util.h" #include "arrow/util/macros.h" #include "arrow/util/visibility.h" @@ -204,10 +205,10 @@ class BaseBinaryBuilder } } } else { - for (std::size_t i = 0; i < values.size(); ++i) { + for (const auto& value : values) { UnsafeAppendNextOffset(); - value_data_builder_.UnsafeAppend( - reinterpret_cast(values[i].data()), values[i].size()); + value_data_builder_.UnsafeAppend(reinterpret_cast(value.data()), + value.size()); } } @@ -463,6 +464,256 @@ class ARROW_EXPORT LargeStringBuilder : public LargeBinaryBuilder { std::shared_ptr type() const override { return large_utf8(); } }; +// ---------------------------------------------------------------------- +// BinaryViewBuilder, StringViewBuilder +// +// These builders do not support building raw pointer view arrays. + +namespace internal { + +// We allocate medium-sized memory chunks and accumulate data in those, which +// may result in some waste if there are many large-ish strings. If a string +// comes along that does not fit into a block, we allocate a new block and +// write into that. +// +// Later we can implement optimizations to continuing filling underfull blocks +// after encountering a large string that required allocating a new block. +class ARROW_EXPORT StringHeapBuilder { + public: + static constexpr int64_t kDefaultBlocksize = 32 << 10; // 32KB + + StringHeapBuilder(MemoryPool* pool, int64_t alignment) + : pool_(pool), alignment_(alignment) {} + + void SetBlockSize(int64_t blocksize) { blocksize_ = blocksize; } + + using c_type = BinaryViewType::c_type; + + template + std::conditional_t, c_type> Append(const uint8_t* value, + int64_t length) { + if (length <= BinaryViewType::kInlineSize) { + return util::ToInlineBinaryView(value, static_cast(length)); + } + + if constexpr (Safe) { + ARROW_RETURN_NOT_OK(Reserve(length)); + } + + auto v = + util::ToBinaryView(value, static_cast(length), + static_cast(blocks_.size() - 1), current_offset_); + + memcpy(current_out_buffer_, value, static_cast(length)); + current_out_buffer_ += length; + current_remaining_bytes_ -= length; + current_offset_ += static_cast(length); + return v; + } + + static constexpr int64_t ValueSizeLimit() { + return std::numeric_limits::max(); + } + + /// \brief Ensure that the indicated number of bytes can be appended via + /// UnsafeAppend operations without the need to allocate more memory + Status Reserve(int64_t num_bytes) { + if (ARROW_PREDICT_FALSE(num_bytes > ValueSizeLimit())) { + return Status::CapacityError( + "BinaryView or StringView elements cannot reference " + "strings larger than 2GB"); + } + if (num_bytes > current_remaining_bytes_) { + // Ensure the buffer is fully overwritten to avoid leaking uninitialized + // bytes from the allocator + if (current_remaining_bytes_ > 0) { + std::memset(current_out_buffer_, 0, current_remaining_bytes_); + blocks_.back() = SliceBuffer(blocks_.back(), 0, + blocks_.back()->size() - current_remaining_bytes_); + } + current_remaining_bytes_ = num_bytes > blocksize_ ? num_bytes : blocksize_; + ARROW_ASSIGN_OR_RAISE(std::shared_ptr new_block, + AllocateBuffer(current_remaining_bytes_, alignment_, pool_)); + current_offset_ = 0; + current_out_buffer_ = new_block->mutable_data(); + blocks_.emplace_back(std::move(new_block)); + } + return Status::OK(); + } + + void Reset() { + current_offset_ = 0; + current_out_buffer_ = NULLPTR; + current_remaining_bytes_ = 0; + blocks_.clear(); + } + + int64_t current_remaining_bytes() const { return current_remaining_bytes_; } + + std::vector> Finish() { + current_offset_ = 0; + current_out_buffer_ = NULLPTR; + current_remaining_bytes_ = 0; + return std::move(blocks_); + } + + private: + MemoryPool* pool_; + int64_t alignment_; + int64_t blocksize_ = kDefaultBlocksize; + std::vector> blocks_; + + int32_t current_offset_ = 0; + uint8_t* current_out_buffer_ = NULLPTR; + int64_t current_remaining_bytes_ = 0; +}; + +} // namespace internal + +class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder { + public: + using TypeClass = BinaryViewType; + + // this constructor provided for MakeBuilder compatibility + BinaryViewBuilder(const std::shared_ptr&, MemoryPool* pool); + + explicit BinaryViewBuilder(MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool, alignment), + data_builder_(pool, alignment), + data_heap_builder_(pool, alignment) {} + + /// Set the size for future preallocated data buffers. + /// + /// The default size is 32KB, so after each 32KB of string data appended to the builder + /// a new data buffer will be allocated. Adjust this to a larger value to decrease the + /// frequency of allocation, or to a smaller value to lower the overhead of each + /// allocation. + void SetBlockSize(int64_t blocksize) { data_heap_builder_.SetBlockSize(blocksize); } + + /// The number of bytes which can be appended to this builder without allocating another + /// data buffer. + int64_t current_block_bytes_remaining() const { + return data_heap_builder_.current_remaining_bytes(); + } + + Status Append(const uint8_t* value, int64_t length) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(true); + ARROW_ASSIGN_OR_RAISE(auto v, + data_heap_builder_.Append(value, length)); + data_builder_.UnsafeAppend(v); + return Status::OK(); + } + + Status Append(const char* value, int64_t length) { + return Append(reinterpret_cast(value), length); + } + + Status Append(std::string_view value) { + return Append(value.data(), static_cast(value.size())); + } + + /// \brief Append without checking capacity + /// + /// Builder should have been presized using Reserve() and ReserveData(), + /// respectively, and the value must not be larger than 2GB + void UnsafeAppend(const uint8_t* value, int64_t length) { + UnsafeAppendToBitmap(true); + auto v = data_heap_builder_.Append(value, length); + data_builder_.UnsafeAppend(v); + } + + void UnsafeAppend(const char* value, int64_t length) { + UnsafeAppend(reinterpret_cast(value), length); + } + + void UnsafeAppend(const std::string& value) { + UnsafeAppend(value.c_str(), static_cast(value.size())); + } + + void UnsafeAppend(std::string_view value) { + UnsafeAppend(value.data(), static_cast(value.size())); + } + + /// \brief Ensures there is enough allocated available capacity in the + /// out-of-line data heap to append the indicated number of bytes without + /// additional allocations + Status ReserveData(int64_t length); + + Status AppendNulls(int64_t length) final { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(length, BinaryViewType::c_type{}); + UnsafeSetNull(length); + return Status::OK(); + } + + /// \brief Append a single null element + Status AppendNull() final { + ARROW_RETURN_NOT_OK(Reserve(1)); + data_builder_.UnsafeAppend(BinaryViewType::c_type{}); + UnsafeAppendToBitmap(false); + return Status::OK(); + } + + /// \brief Append a empty element (length-0 inline string) + Status AppendEmptyValue() final { + ARROW_RETURN_NOT_OK(Reserve(1)); + data_builder_.UnsafeAppend(BinaryViewType::c_type{}); + UnsafeAppendToBitmap(true); + return Status::OK(); + } + + /// \brief Append several empty elements + Status AppendEmptyValues(int64_t length) final { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(length, BinaryViewType::c_type{}); + UnsafeSetNotNull(length); + return Status::OK(); + } + + void UnsafeAppendNull() { + data_builder_.UnsafeAppend(BinaryViewType::c_type{}); + UnsafeAppendToBitmap(false); + } + + void UnsafeAppendEmptyValue() { + data_builder_.UnsafeAppend(BinaryViewType::c_type{}); + UnsafeAppendToBitmap(true); + } + + /// \brief Append a slice of a BinaryViewArray passed as an ArraySpan. Copies + /// the underlying out-of-line string memory to avoid memory lifetime issues + Status AppendArraySlice(const ArraySpan& array, int64_t offset, + int64_t length) override; + + void Reset() override; + + Status Resize(int64_t capacity) override { + ARROW_RETURN_NOT_OK(CheckCapacity(capacity)); + capacity = std::max(capacity, kMinBuilderCapacity); + ARROW_RETURN_NOT_OK(data_builder_.Resize(capacity)); + return ArrayBuilder::Resize(capacity); + } + + Status FinishInternal(std::shared_ptr* out) override; + + std::shared_ptr type() const override { return binary_view(); } + + protected: + TypedBufferBuilder data_builder_; + + // Accumulates out-of-line data in fixed-size chunks which are then attached + // to the resulting ArrayData + internal::StringHeapBuilder data_heap_builder_; +}; + +class ARROW_EXPORT StringViewBuilder : public BinaryViewBuilder { + public: + using BinaryViewBuilder::BinaryViewBuilder; + std::shared_ptr type() const override { return utf8_view(); } +}; + // ---------------------------------------------------------------------- // FixedSizeBinaryBuilder @@ -498,7 +749,7 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { Status Append(const Buffer& s) { ARROW_RETURN_NOT_OK(Reserve(1)); - UnsafeAppend(std::string_view(s)); + UnsafeAppend(s); return Status::OK(); } @@ -549,7 +800,7 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { UnsafeAppend(reinterpret_cast(value.data())); } - void UnsafeAppend(const Buffer& s) { UnsafeAppend(std::string_view(s)); } + void UnsafeAppend(const Buffer& s) { UnsafeAppend(std::string_view{s}); } void UnsafeAppend(const std::shared_ptr& s) { UnsafeAppend(*s); } diff --git a/cpp/src/arrow/array/builder_dict.cc b/cpp/src/arrow/array/builder_dict.cc index 525b0afbc908a..7a96463ec3c43 100644 --- a/cpp/src/arrow/array/builder_dict.cc +++ b/cpp/src/arrow/array/builder_dict.cc @@ -194,6 +194,12 @@ Status DictionaryMemoTable::GetOrInsert(const BinaryType*, std::string_view valu return impl_->GetOrInsert(value, out); } +Status DictionaryMemoTable::GetOrInsert(const BinaryViewType*, std::string_view value, + int32_t* out) { + // Create BinaryArray dictionary for now + return impl_->GetOrInsert(value, out); +} + Status DictionaryMemoTable::GetOrInsert(const LargeBinaryType*, std::string_view value, int32_t* out) { return impl_->GetOrInsert(value, out); diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h index cb0aaf309915b..3f0d711dc5bb5 100644 --- a/cpp/src/arrow/array/builder_dict.h +++ b/cpp/src/arrow/array/builder_dict.h @@ -60,6 +60,12 @@ struct DictionaryValue> { BinaryType, LargeBinaryType>::type; }; +template +struct DictionaryValue> { + using type = std::string_view; + using PhysicalType = BinaryViewType; +}; + template struct DictionaryValue> { using type = std::string_view; @@ -114,6 +120,7 @@ class ARROW_EXPORT DictionaryMemoTable { Status GetOrInsert(const BinaryType*, std::string_view value, int32_t* out); Status GetOrInsert(const LargeBinaryType*, std::string_view value, int32_t* out); + Status GetOrInsert(const BinaryViewType*, std::string_view value, int32_t* out); class DictionaryMemoTableImpl; std::unique_ptr impl_; diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index f7549fa9d1d1a..37c7271b5b95c 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -43,6 +43,7 @@ #include "arrow/util/int_util_overflow.h" #include "arrow/util/logging.h" #include "arrow/util/ree_util.h" +#include "arrow/visit_data_inline.h" #include "arrow/visit_type_inline.h" namespace arrow { @@ -230,6 +231,45 @@ class ConcatenateImpl { return ConcatenateBuffers(value_buffers, pool_).Value(&out_->buffers[2]); } + Status Visit(const BinaryViewType& type) { + out_->buffers.resize(2); + + for (const auto& in_data : in_) { + for (const auto& buf : util::span(in_data->buffers).subspan(2)) { + out_->buffers.push_back(buf); + } + } + + ARROW_ASSIGN_OR_RAISE(auto view_buffers, Buffers(1, BinaryViewType::kSize)); + ARROW_ASSIGN_OR_RAISE(auto view_buffer, ConcatenateBuffers(view_buffers, pool_)); + + auto* views = view_buffer->mutable_data_as(); + size_t preceding_buffer_count = 0; + + int64_t i = in_[0]->length; + for (size_t in_index = 1; in_index < in_.size(); ++in_index) { + preceding_buffer_count += in_[in_index - 1]->buffers.size() - 2; + + for (int64_t end_i = i + in_[in_index]->length; i < end_i; ++i) { + if (views[i].is_inline()) continue; + views[i].ref.buffer_index = SafeSignedAdd( + views[i].ref.buffer_index, static_cast(preceding_buffer_count)); + } + } + + if (out_->buffers[0] != nullptr) { + i = in_[0]->length; + VisitNullBitmapInline( + out_->buffers[0]->data(), i, out_->length - i, out_->null_count, [&] { ++i; }, + [&] { + views[i++] = {}; // overwrite views under null bits with an empty view + }); + } + + out_->buffers[1] = std::move(view_buffer); + return Status::OK(); + } + Status Visit(const ListType&) { std::vector value_ranges; ARROW_ASSIGN_OR_RAISE(auto index_buffers, Buffers(1, sizeof(int32_t))); diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc index 4c03fab731ffe..0ef1136ea78f8 100644 --- a/cpp/src/arrow/array/concatenate_test.cc +++ b/cpp/src/arrow/array/concatenate_test.cc @@ -92,9 +92,15 @@ class ConcatenateTest : public ::testing::Test { for (auto null_probability : this->null_probabilities_) { std::shared_ptr array; factory(size, null_probability, &array); + ASSERT_OK(array->ValidateFull()); auto expected = array->Slice(offsets.front(), offsets.back() - offsets.front()); + ASSERT_OK(expected->ValidateFull()); auto slices = this->Slices(array, offsets); + for (auto slice : slices) { + ASSERT_OK(slice->ValidateFull()); + } ASSERT_OK_AND_ASSIGN(auto actual, Concatenate(slices)); + ASSERT_OK(actual->ValidateFull()); AssertArraysEqual(*expected, *actual); if (actual->data()->buffers[0]) { CheckTrailingBitsAreZeroed(actual->data()->buffers[0], actual->length()); @@ -155,6 +161,14 @@ TEST_F(ConcatenateTest, StringType) { }); } +TEST_F(ConcatenateTest, StringViewType) { + Check([this](int32_t size, double null_probability, std::shared_ptr* out) { + *out = rng_.StringView(size, /*min_length =*/0, /*max_length =*/40, null_probability, + /*max_buffer_length=*/200); + ASSERT_OK((**out).ValidateFull()); + }); +} + TEST_F(ConcatenateTest, LargeStringType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { *out = diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index 79595ab7c7c31..678513fd4d151 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -31,6 +31,7 @@ #include "arrow/status.h" #include "arrow/type.h" #include "arrow/type_traits.h" +#include "arrow/util/binary_view_util.h" #include "arrow/util/bitmap_ops.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" @@ -92,6 +93,11 @@ bool RunEndEncodedMayHaveLogicalNulls(const ArrayData& data) { return ArraySpan(data).MayHaveLogicalNulls(); } +BufferSpan PackVariadicBuffers(util::span> buffers) { + return {const_cast(reinterpret_cast(buffers.data())), + static_cast(buffers.size() * sizeof(std::shared_ptr))}; +} + } // namespace internal std::shared_ptr ArrayData::Make(std::shared_ptr type, int64_t length, @@ -187,7 +193,7 @@ void ArraySpan::SetMembers(const ArrayData& data) { } this->offset = data.offset; - for (int i = 0; i < static_cast(data.buffers.size()); ++i) { + for (int i = 0; i < std::min(static_cast(data.buffers.size()), 3); ++i) { const std::shared_ptr& buffer = data.buffers[i]; // It is the invoker-of-kernels's responsibility to ensure that // const buffers are not written to accidentally. @@ -200,7 +206,7 @@ void ArraySpan::SetMembers(const ArrayData& data) { Type::type type_id = this->type->id(); if (type_id == Type::EXTENSION) { - const ExtensionType* ext_type = checked_cast(this->type); + auto* ext_type = checked_cast(this->type); type_id = ext_type->storage_type()->id(); } @@ -215,6 +221,11 @@ void ArraySpan::SetMembers(const ArrayData& data) { this->buffers[i] = {}; } + if (type_id == Type::STRING_VIEW || type_id == Type::BINARY_VIEW) { + // store the span of data buffers in the third buffer + this->buffers[2] = internal::PackVariadicBuffers(util::span(data.buffers).subspan(2)); + } + if (type_id == Type::DICTIONARY) { this->child_data.resize(1); this->child_data[0].SetMembers(*data.dictionary); @@ -247,6 +258,8 @@ int GetNumBuffers(const DataType& type) { case Type::LARGE_BINARY: case Type::STRING: case Type::LARGE_STRING: + case Type::STRING_VIEW: + case Type::BINARY_VIEW: case Type::DENSE_UNION: return 3; case Type::EXTENSION: @@ -351,6 +364,19 @@ void ArraySpan::FillFromScalar(const Scalar& value) { } this->buffers[2].data = const_cast(data_buffer); this->buffers[2].size = data_size; + } else if (type_id == Type::BINARY_VIEW || type_id == Type::STRING_VIEW) { + const auto& scalar = checked_cast(value); + + this->buffers[1].size = BinaryViewType::kSize; + this->buffers[1].data = scalar.scratch_space_; + static_assert(sizeof(BinaryViewType::c_type) <= sizeof(scalar.scratch_space_)); + auto* view = new (&scalar.scratch_space_) BinaryViewType::c_type; + if (scalar.is_valid) { + *view = util::ToBinaryView(std::string_view{*scalar.value}, 0, 0); + this->buffers[2] = internal::PackVariadicBuffers({&scalar.value, 1}); + } else { + *view = {}; + } } else if (type_id == Type::FIXED_SIZE_BINARY) { const auto& scalar = checked_cast(value); this->buffers[1].data = const_cast(scalar.value->data()); @@ -513,6 +539,14 @@ std::shared_ptr ArraySpan::ToArrayData() const { type_id = ext_type->storage_type()->id(); } + if (HasVariadicBuffers()) { + DCHECK_EQ(result->buffers.size(), 3); + result->buffers.pop_back(); + for (const auto& data_buffer : GetVariadicBuffers()) { + result->buffers.push_back(data_buffer); + } + } + if (type_id == Type::NA) { result->null_count = this->length; } else if (this->buffers[0].data == nullptr) { @@ -531,6 +565,16 @@ std::shared_ptr ArraySpan::ToArrayData() const { return result; } +util::span> ArraySpan::GetVariadicBuffers() const { + DCHECK(HasVariadicBuffers()); + return {buffers[2].data_as>(), + buffers[2].size / sizeof(std::shared_ptr)}; +} + +bool ArraySpan::HasVariadicBuffers() const { + return type->id() == Type::BINARY_VIEW || type->id() == Type::STRING_VIEW; +} + std::shared_ptr ArraySpan::ToArray() const { return MakeArray(this->ToArrayData()); } @@ -722,7 +766,8 @@ struct ViewDataImpl { } RETURN_NOT_OK(CheckInputAvailable()); - const auto& in_spec = in_layouts[in_layout_idx].buffers[in_buffer_idx]; + const auto& in_layout = in_layouts[in_layout_idx]; + const auto& in_spec = in_layout.buffers[in_buffer_idx]; if (out_spec != in_spec) { return InvalidView("incompatible layouts"); } @@ -733,6 +778,18 @@ struct ViewDataImpl { DCHECK_GT(in_data_item->buffers.size(), in_buffer_idx); out_buffers.push_back(in_data_item->buffers[in_buffer_idx]); ++in_buffer_idx; + + if (in_buffer_idx == in_layout.buffers.size()) { + if (out_layout.variadic_spec != in_layout.variadic_spec) { + return InvalidView("incompatible layouts"); + } + + if (in_layout.variadic_spec) { + for (; in_buffer_idx < in_data_item->buffers.size(); ++in_buffer_idx) { + out_buffers.push_back(in_data_item->buffers[in_buffer_idx]); + } + } + } AdjustInputPointer(); } diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h index 8c6b250b71adf..40a77640cd1e5 100644 --- a/cpp/src/arrow/array/data.h +++ b/cpp/src/arrow/array/data.h @@ -28,6 +28,7 @@ #include "arrow/type.h" #include "arrow/util/bit_util.h" #include "arrow/util/macros.h" +#include "arrow/util/span.h" #include "arrow/util/visibility.h" namespace arrow { @@ -472,10 +473,12 @@ struct ARROW_EXPORT ArraySpan { void SetSlice(int64_t offset, int64_t length) { this->offset = offset; this->length = length; - if (this->type->id() != Type::NA) { + if (this->type->id() == Type::NA) { + this->null_count = this->length; + } else if (this->MayHaveNulls()) { this->null_count = kUnknownNullCount; } else { - this->null_count = this->length; + this->null_count = 0; } } @@ -530,6 +533,16 @@ struct ARROW_EXPORT ArraySpan { /// \see GetNullCount int64_t ComputeLogicalNullCount() const; + /// Some DataTypes (StringView, BinaryView) may have an arbitrary number of variadic + /// buffers. Since ArraySpan only has 3 buffers, we pack the variadic buffers into + /// buffers[2]; IE buffers[2].data points to the first shared_ptr of the + /// variadic set and buffers[2].size is the number of variadic buffers times + /// sizeof(shared_ptr). + /// + /// \see HasVariadicBuffers + util::span> GetVariadicBuffers() const; + bool HasVariadicBuffers() const; + private: ARROW_FRIEND_EXPORT friend bool internal::IsNullRunEndEncoded(const ArrayData& span, int64_t i); diff --git a/cpp/src/arrow/array/dict_internal.h b/cpp/src/arrow/array/dict_internal.h index 3c1c8c453d1e7..63d8583b17324 100644 --- a/cpp/src/arrow/array/dict_internal.h +++ b/cpp/src/arrow/array/dict_internal.h @@ -150,6 +150,32 @@ struct DictionaryTraits> { } }; +template +struct DictionaryTraits> { + using MemoTableType = typename HashTraits::MemoTableType; + + static_assert(std::is_same_v>); + + // Instead of defining a custom memo table for StringView we reuse BinaryType's, + // then convert to views when we copy data out of the memo table. + static Result> GetDictionaryArrayData( + MemoryPool* pool, const std::shared_ptr& type, + const MemoTableType& memo_table, int64_t start_offset) { + DCHECK(type->id() == Type::STRING_VIEW || type->id() == Type::BINARY_VIEW); + + BinaryViewBuilder builder(pool); + RETURN_NOT_OK(builder.Resize(memo_table.size() - start_offset)); + RETURN_NOT_OK(builder.ReserveData(memo_table.values_size())); + memo_table.VisitValues(static_cast(start_offset), + [&](std::string_view s) { builder.UnsafeAppend(s); }); + + std::shared_ptr out; + RETURN_NOT_OK(builder.FinishInternal(&out)); + out->type = type; + return out; + } +}; + template struct DictionaryTraits> { using MemoTableType = typename HashTraits::MemoTableType; diff --git a/cpp/src/arrow/array/diff.cc b/cpp/src/arrow/array/diff.cc index 800f19b752726..f267c52c0d696 100644 --- a/cpp/src/arrow/array/diff.cc +++ b/cpp/src/arrow/array/diff.cc @@ -476,30 +476,31 @@ class MakeFormatterImpl { return Status::OK(); } - // format Binary, LargeBinary and FixedSizeBinary in hexadecimal template - enable_if_binary_like Visit(const T&) { + enable_if_has_string_view Visit(const T&) { using ArrayType = typename TypeTraits::ArrayType; impl_ = [](const Array& array, int64_t index, std::ostream* os) { - *os << HexEncode(checked_cast(array).GetView(index)); + std::string_view view = checked_cast(array).GetView(index); + if constexpr (T::is_utf8) { + // format String and StringView with \"\n\r\t\\ escaped + *os << '"' << Escape(view) << '"'; + } else { + // format Binary, LargeBinary, BinaryView, and FixedSizeBinary in hexadecimal + *os << HexEncode(view); + } }; return Status::OK(); } - // format Strings with \"\n\r\t\\ escaped + // format Decimals with Decimal___Array::FormatValue template - enable_if_string_like Visit(const T&) { - using ArrayType = typename TypeTraits::ArrayType; - impl_ = [](const Array& array, int64_t index, std::ostream* os) { - *os << "\"" << Escape(checked_cast(array).GetView(index)) << "\""; - }; - return Status::OK(); - } - - // format Decimals with Decimal128Array::FormatValue - Status Visit(const Decimal128Type&) { + enable_if_decimal Visit(const T&) { impl_ = [](const Array& array, int64_t index, std::ostream* os) { - *os << checked_cast(array).FormatValue(index); + if constexpr (T::type_id == Type::DECIMAL128) { + *os << checked_cast(array).FormatValue(index); + } else { + *os << checked_cast(array).FormatValue(index); + } }; return Status::OK(); } diff --git a/cpp/src/arrow/array/diff_test.cc b/cpp/src/arrow/array/diff_test.cc index e322006732f9a..4c2f39eddf863 100644 --- a/cpp/src/arrow/array/diff_test.cc +++ b/cpp/src/arrow/array/diff_test.cc @@ -596,6 +596,19 @@ TEST_F(DiffTest, UnifiedDiffFormatter) { @@ -5, +8 @@ +79 +11 +)"); + } + + for (const auto& type : { + decimal128(10, 4), + decimal256(10, 4), + }) { + base_ = ArrayFromJSON(type, R"(["123.4567", "-78.9000"])"); + target_ = ArrayFromJSON(type, R"(["123.4567", "-123.4567"])"); + AssertDiffAndFormat(R"( +@@ -1, +1 @@ +--78.9000 ++-123.4567 )"); } } diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index 98e9d51b5fc75..9ea2fc2b6f0a1 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -43,6 +43,9 @@ #include "arrow/util/decimal.h" #include "arrow/util/endian.h" #include "arrow/util/logging.h" +#include "arrow/util/sort.h" +#include "arrow/util/span.h" +#include "arrow/visit_data_inline.h" #include "arrow/visit_type_inline.h" namespace arrow { @@ -271,6 +274,13 @@ class ArrayDataEndianSwapper { return Status::OK(); } + Status Visit(const BinaryViewType& type) { + // TODO(GH-37879): This requires knowledge of whether the array is being swapped to + // native endian or from it so that we know what size to trust when deciding whether + // something is an inline view. + return Status::NotImplemented("Swapping endianness of ", type); + } + Status Visit(const ListType& type) { RETURN_NOT_OK(SwapOffsets(1)); return Status::OK(); @@ -379,6 +389,10 @@ class NullArrayFactory { return MaxOf(sizeof(typename T::offset_type) * (length_ + 1)); } + Status Visit(const BinaryViewType& type) { + return MaxOf(sizeof(BinaryViewType::c_type) * length_); + } + Status Visit(const FixedSizeListType& type) { return MaxOf(GetBufferLength(type.value_type(), type.list_size() * length_)); } @@ -498,6 +512,11 @@ class NullArrayFactory { return Status::OK(); } + Status Visit(const BinaryViewType&) { + out_->buffers.resize(2, buffer_); + return Status::OK(); + } + template enable_if_var_size_list Visit(const T& type) { out_->buffers.resize(2, buffer_); @@ -600,6 +619,11 @@ class RepeatedArrayFactory { RepeatedArrayFactory(MemoryPool* pool, const Scalar& scalar, int64_t length) : pool_(pool), scalar_(scalar), length_(length) {} + template + const auto& scalar() const { + return checked_cast::ScalarType&>(scalar_); + } + Result> Create() { RETURN_NOT_OK(VisitTypeInline(*scalar_.type, this)); return out_; @@ -621,7 +645,7 @@ class RepeatedArrayFactory { template enable_if_t::value || is_temporal_type::value, Status> Visit( const T&) { - auto value = checked_cast::ScalarType&>(scalar_).value; + auto value = scalar().value; return FinishFixedWidth(&value, sizeof(value)); } @@ -632,8 +656,7 @@ class RepeatedArrayFactory { template enable_if_decimal Visit(const T&) { - using ScalarType = typename TypeTraits::ScalarType; - auto value = checked_cast(scalar_).value.ToBytes(); + auto value = scalar().value.ToBytes(); return FinishFixedWidth(value.data(), value.size()); } @@ -644,29 +667,36 @@ class RepeatedArrayFactory { template enable_if_base_binary Visit(const T&) { - std::shared_ptr value = - checked_cast::ScalarType&>(scalar_).value; + const std::shared_ptr& value = scalar().value; std::shared_ptr values_buffer, offsets_buffer; RETURN_NOT_OK(CreateBufferOf(value->data(), value->size(), &values_buffer)); auto size = static_cast(value->size()); RETURN_NOT_OK(CreateOffsetsBuffer(size, &offsets_buffer)); - out_ = std::make_shared::ArrayType>(length_, offsets_buffer, - values_buffer); + out_ = std::make_shared::ArrayType>( + length_, std::move(offsets_buffer), std::move(values_buffer)); + return Status::OK(); + } + + template + enable_if_binary_view_like Visit(const T& type) { + std::string_view value{*scalar().value}; + auto s = util::ToBinaryView(value, 0, 0); + RETURN_NOT_OK(FinishFixedWidth(&s, sizeof(s))); + if (!s.is_inline()) { + out_->data()->buffers.push_back(scalar().value); + } return Status::OK(); } template enable_if_var_size_list Visit(const T& type) { - using ScalarType = typename TypeTraits::ScalarType; using ArrayType = typename TypeTraits::ArrayType; - auto value = checked_cast(scalar_).value; - - ArrayVector values(length_, value); + ArrayVector values(length_, scalar().value); ARROW_ASSIGN_OR_RAISE(auto value_array, Concatenate(values, pool_)); std::shared_ptr offsets_buffer; - auto size = static_cast(value->length()); + auto size = static_cast(scalar().value->length()); RETURN_NOT_OK(CreateOffsetsBuffer(size, &offsets_buffer)); out_ = diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 19ff8e28b536c..3dde41b1450e8 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -31,41 +31,43 @@ #include "arrow/util/int_util_overflow.h" #include "arrow/util/logging.h" #include "arrow/util/ree_util.h" +#include "arrow/util/sort.h" +#include "arrow/util/string.h" +#include "arrow/util/unreachable.h" #include "arrow/util/utf8.h" #include "arrow/visit_data_inline.h" #include "arrow/visit_type_inline.h" -namespace arrow { -namespace internal { +namespace arrow::internal { namespace { struct UTF8DataValidator { const ArrayData& data; - Status Visit(const DataType&) { - // Default, should be unreachable - return Status::NotImplemented(""); - } + template + Status Visit(const T&) { + if constexpr (std::is_same_v || std::is_same_v || + std::is_same_v) { + util::InitializeUTF8(); - template - enable_if_string Visit(const StringType&) { - util::InitializeUTF8(); - - int64_t i = 0; - return VisitArraySpanInline( - data, - [&](std::string_view v) { - if (ARROW_PREDICT_FALSE(!util::ValidateUTF8(v))) { - return Status::Invalid("Invalid UTF8 sequence at string index ", i); - } - ++i; - return Status::OK(); - }, - [&]() { - ++i; - return Status::OK(); - }); + int64_t i = 0; + return VisitArraySpanInline( + data, + [&](std::string_view v) { + if (ARROW_PREDICT_FALSE(!util::ValidateUTF8(v))) { + return Status::Invalid("Invalid UTF8 sequence at string index ", i); + } + ++i; + return Status::OK(); + }, + [&]() { + ++i; + return Status::OK(); + }); + } else { + Unreachable("utf-8 validation of non string type"); + } } }; @@ -169,6 +171,14 @@ struct ValidateArrayImpl { return Status::OK(); } + Status Visit(const StringViewType& type) { + RETURN_NOT_OK(ValidateBinaryView(type)); + if (full_validation) { + RETURN_NOT_OK(ValidateUTF8(data)); + } + return Status::OK(); + } + Status Visit(const Date64Type& type) { RETURN_NOT_OK(ValidateFixedWidthBuffers()); @@ -248,6 +258,8 @@ struct ValidateArrayImpl { Status Visit(const LargeBinaryType& type) { return ValidateBinaryLike(type); } + Status Visit(const BinaryViewType& type) { return ValidateBinaryView(type); } + Status Visit(const ListType& type) { return ValidateListLike(type); } Status Visit(const LargeListType& type) { return ValidateListLike(type); } @@ -453,11 +465,16 @@ struct ValidateArrayImpl { return Status::Invalid("Array length is negative"); } - if (data.buffers.size() != layout.buffers.size()) { + if (layout.variadic_spec) { + if (data.buffers.size() < layout.buffers.size()) { + return Status::Invalid("Expected at least ", layout.buffers.size(), + " buffers in array of type ", type.ToString(), ", got ", + data.buffers.size()); + } + } else if (data.buffers.size() != layout.buffers.size()) { return Status::Invalid("Expected ", layout.buffers.size(), - " buffers in array " - "of type ", - type.ToString(), ", got ", data.buffers.size()); + " buffers in array of type ", type.ToString(), ", got ", + data.buffers.size()); } // This check is required to avoid addition overflow below @@ -469,7 +486,9 @@ struct ValidateArrayImpl { for (int i = 0; i < static_cast(data.buffers.size()); ++i) { const auto& buffer = data.buffers[i]; - const auto& spec = layout.buffers[i]; + const auto& spec = i < static_cast(layout.buffers.size()) + ? layout.buffers[i] + : *layout.variadic_spec; if (buffer == nullptr) { continue; @@ -595,6 +614,85 @@ struct ValidateArrayImpl { return Status::OK(); } + Status ValidateBinaryView(const BinaryViewType& type) { + int64_t views_byte_size = data.buffers[1]->size(); + int64_t required_view_count = data.length + data.offset; + if (static_cast(views_byte_size / BinaryViewType::kSize) < + required_view_count) { + return Status::Invalid("View buffer size (bytes): ", views_byte_size, + " isn't large enough for length: ", data.length, + " and offset: ", data.offset); + } + + if (!full_validation) return Status::OK(); + + auto CheckPrefix = [&](size_t i, + std::array prefix, + const uint8_t* data) { + if (std::memcmp(data, prefix.data(), BinaryViewType::kPrefixSize) == 0) { + return Status::OK(); + } + return Status::Invalid("View at slot ", i, " has inlined prefix 0x", + HexEncode(prefix.data(), BinaryViewType::kPrefixSize), + " but the out-of-line data begins with 0x", + HexEncode(data, BinaryViewType::kPrefixSize)); + }; + + util::span views(data.GetValues(1), + static_cast(data.length)); + util::span data_buffers(data.buffers.data() + 2, data.buffers.size() - 2); + + for (size_t i = 0; i < static_cast(data.length); ++i) { + if (data.IsNull(i)) continue; + + if (views[i].size() < 0) { + return Status::Invalid("View at slot ", i, " has negative size ", + views[i].size()); + } + + if (views[i].is_inline()) { + auto padding_bytes = util::span(views[i].inlined.data).subspan(views[i].size()); + for (auto padding_byte : padding_bytes) { + if (padding_byte != 0) { + return Status::Invalid("View at slot ", i, " was inline with size ", + views[i].size(), + " but its padding bytes were not all zero: ", + HexEncode(padding_bytes.data(), padding_bytes.size())); + } + } + continue; + } + + auto [size, prefix, buffer_index, offset] = views[i].ref; + + if (buffer_index < 0) { + return Status::Invalid("View at slot ", i, " has negative buffer index ", + buffer_index); + } + + if (offset < 0) { + return Status::Invalid("View at slot ", i, " has negative offset ", offset); + } + + if (static_cast(buffer_index) >= data_buffers.size()) { + return Status::IndexError("View at slot ", i, " references buffer ", buffer_index, + " but there are only ", data_buffers.size(), + " data buffers"); + } + const auto& buffer = data_buffers[buffer_index]; + + if (int64_t end = offset + static_cast(size); end > buffer->size()) { + return Status::IndexError( + "View at slot ", i, " references range ", offset, "-", end, " of buffer ", + buffer_index, " but that buffer is only ", buffer->size(), " bytes long"); + } + + RETURN_NOT_OK(CheckPrefix(i, prefix, buffer->data() + offset)); + } + + return Status::OK(); + } + template Status ValidateListLike(const ListType& type) { const ArrayData& values = *data.child_data[0]; @@ -798,7 +896,8 @@ Status ValidateArrayFull(const Array& array) { return ValidateArrayFull(*array.d ARROW_EXPORT Status ValidateUTF8(const ArrayData& data) { - DCHECK(data.type->id() == Type::STRING || data.type->id() == Type::LARGE_STRING); + DCHECK(data.type->id() == Type::STRING || data.type->id() == Type::STRING_VIEW || + data.type->id() == Type::LARGE_STRING); UTF8DataValidator validator{data}; return VisitTypeInline(*data.type, &validator); } @@ -806,5 +905,4 @@ Status ValidateUTF8(const ArrayData& data) { ARROW_EXPORT Status ValidateUTF8(const Array& array) { return ValidateUTF8(*array.data()); } -} // namespace internal -} // namespace arrow +} // namespace arrow::internal diff --git a/cpp/src/arrow/buffer_builder.h b/cpp/src/arrow/buffer_builder.h index e7eea64043ba8..a84c98b6b2491 100644 --- a/cpp/src/arrow/buffer_builder.h +++ b/cpp/src/arrow/buffer_builder.h @@ -143,7 +143,10 @@ class ARROW_EXPORT BufferBuilder { memcpy(data_ + size_, data, static_cast(length)); size_ += length; } - void UnsafeAppend(std::string_view v) { UnsafeAppend(v.data(), v.size()); } + + void UnsafeAppend(std::string_view v) { + UnsafeAppend(v.data(), static_cast(v.size())); + } void UnsafeAppend(const int64_t num_copies, uint8_t value) { memset(data_ + size_, value, static_cast(num_copies)); @@ -268,7 +271,7 @@ class TypedBufferBuilder< template void UnsafeAppend(Iter values_begin, Iter values_end) { - int64_t num_elements = static_cast(std::distance(values_begin, values_end)); + auto num_elements = static_cast(std::distance(values_begin, values_end)); auto data = mutable_data() + length(); bytes_builder_.UnsafeAdvance(num_elements * sizeof(T)); std::copy(values_begin, values_end, data); diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index caddbf9db5578..c7e6207bfefa4 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -148,6 +148,8 @@ struct DictionaryBuilderCase { Status Visit(const StringType&) { return CreateFor(); } Status Visit(const LargeBinaryType&) { return CreateFor(); } Status Visit(const LargeStringType&) { return CreateFor(); } + Status Visit(const BinaryViewType&) { return CreateFor(); } + Status Visit(const StringViewType&) { return CreateFor(); } Status Visit(const FixedSizeBinaryType&) { return CreateFor(); } Status Visit(const Decimal128Type&) { return CreateFor(); } Status Visit(const Decimal256Type&) { return CreateFor(); } @@ -190,7 +192,7 @@ struct DictionaryBuilderCase { struct MakeBuilderImpl { template - enable_if_not_nested Visit(const T&) { + enable_if_not_nested Visit(const T& t) { out.reset(new typename TypeTraits::BuilderType(type, pool)); return Status::OK(); } diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index df41cd22c9e06..50cfdd05a14bb 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -38,6 +38,7 @@ #include "arrow/tensor.h" #include "arrow/type.h" #include "arrow/type_traits.h" +#include "arrow/util/binary_view_util.h" #include "arrow/util/bit_run_reader.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" @@ -261,6 +262,25 @@ class RangeDataEqualsImpl { // Also matches StringType Status Visit(const BinaryType& type) { return CompareBinary(type); } + // Also matches StringViewType + Status Visit(const BinaryViewType& type) { + auto* left_values = left_.GetValues(1) + left_start_idx_; + auto* right_values = right_.GetValues(1) + right_start_idx_; + + auto* left_buffers = left_.buffers.data() + 2; + auto* right_buffers = right_.buffers.data() + 2; + VisitValidRuns([&](int64_t i, int64_t length) { + for (auto end_i = i + length; i < end_i; ++i) { + if (!util::EqualBinaryView(left_values[i], right_values[i], left_buffers, + right_buffers)) { + return false; + } + } + return true; + }); + return Status::OK(); + } + // Also matches LargeStringType Status Visit(const LargeBinaryType& type) { return CompareBinary(type); } @@ -632,6 +652,11 @@ class TypeEqualsVisitor { return Status::OK(); } + Status Visit(const BinaryViewType&) { + result_ = true; + return Status::OK(); + } + template enable_if_interval Visit(const T& left) { const auto& right = checked_cast(right_); @@ -802,8 +827,7 @@ class ScalarEqualsVisitor { Status Visit(const DoubleScalar& left) { return CompareFloating(left); } template - typename std::enable_if::value, Status>::type - Visit(const T& left) { + enable_if_t::value, Status> Visit(const T& left) { const auto& right = checked_cast(right_); result_ = internal::SharedPtrEquals(left.value, right.value); return Status::OK(); diff --git a/cpp/src/arrow/compute/kernels/vector_hash.cc b/cpp/src/arrow/compute/kernels/vector_hash.cc index d9143b760f32b..5426dc405429c 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash.cc @@ -31,6 +31,7 @@ #include "arrow/compute/kernels/common_internal.h" #include "arrow/result.h" #include "arrow/util/hashing.h" +#include "arrow/util/unreachable.h" namespace arrow { @@ -262,7 +263,7 @@ class HashKernel : public KernelState { // Base class for all "regular" hash kernel implementations // (NullType has a separate implementation) -template class RegularHashKernel : public HashKernel { public: @@ -503,39 +504,13 @@ class DictionaryHashKernel : public HashKernel { }; // ---------------------------------------------------------------------- - -template -struct HashKernelTraits {}; - -template -struct HashKernelTraits> { - using HashKernel = NullHashKernel; -}; - -template -struct HashKernelTraits> { - using HashKernel = RegularHashKernel; -}; - -template -struct HashKernelTraits> { - using HashKernel = RegularHashKernel; -}; - -template -Result> HashInitImpl(KernelContext* ctx, - const KernelInitArgs& args) { - using HashKernelType = typename HashKernelTraits::HashKernel; - auto result = std::make_unique(args.inputs[0].GetSharedPtr(), - args.options, ctx->memory_pool()); - RETURN_NOT_OK(result->Reset()); - return std::move(result); -} - -template +template Result> HashInit(KernelContext* ctx, const KernelInitArgs& args) { - return HashInitImpl(ctx, args); + auto result = std::make_unique(args.inputs[0].GetSharedPtr(), args.options, + ctx->memory_pool()); + RETURN_NOT_OK(result->Reset()); + return std::move(result); } template @@ -544,22 +519,22 @@ KernelInit GetHashInit(Type::type type_id) { // representation switch (type_id) { case Type::NA: - return HashInit; + return HashInit>; case Type::BOOL: - return HashInit; + return HashInit>; case Type::INT8: case Type::UINT8: - return HashInit; + return HashInit>; case Type::INT16: case Type::UINT16: - return HashInit; + return HashInit>; case Type::INT32: case Type::UINT32: case Type::FLOAT: case Type::DATE32: case Type::TIME32: case Type::INTERVAL_MONTHS: - return HashInit; + return HashInit>; case Type::INT64: case Type::UINT64: case Type::DOUBLE: @@ -568,22 +543,24 @@ KernelInit GetHashInit(Type::type type_id) { case Type::TIMESTAMP: case Type::DURATION: case Type::INTERVAL_DAY_TIME: - return HashInit; + return HashInit>; case Type::BINARY: case Type::STRING: - return HashInit; + return HashInit>; case Type::LARGE_BINARY: case Type::LARGE_STRING: - return HashInit; + return HashInit>; + case Type::BINARY_VIEW: + case Type::STRING_VIEW: + return HashInit>; case Type::FIXED_SIZE_BINARY: case Type::DECIMAL128: case Type::DECIMAL256: - return HashInit; + return HashInit>; case Type::INTERVAL_MONTH_DAY_NANO: - return HashInit; + return HashInit>; default: - DCHECK(false); - return nullptr; + Unreachable("non hashable type"); } } @@ -593,31 +570,11 @@ template Result> DictionaryHashInit(KernelContext* ctx, const KernelInitArgs& args) { const auto& dict_type = checked_cast(*args.inputs[0].type); - Result> indices_hasher; - switch (dict_type.index_type()->id()) { - case Type::INT8: - case Type::UINT8: - indices_hasher = HashInitImpl(ctx, args); - break; - case Type::INT16: - case Type::UINT16: - indices_hasher = HashInitImpl(ctx, args); - break; - case Type::INT32: - case Type::UINT32: - indices_hasher = HashInitImpl(ctx, args); - break; - case Type::INT64: - case Type::UINT64: - indices_hasher = HashInitImpl(ctx, args); - break; - default: - DCHECK(false) << "Unsupported dictionary index type"; - break; - } - RETURN_NOT_OK(indices_hasher); - return std::make_unique(std::move(indices_hasher.ValueOrDie()), - dict_type.value_type()); + ARROW_ASSIGN_OR_RAISE(auto indices_hasher, + GetHashInit(dict_type.index_type()->id())(ctx, args)); + return std::make_unique( + checked_pointer_cast(std::move(indices_hasher)), + dict_type.value_type()); } Status HashExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { diff --git a/cpp/src/arrow/engine/substrait/expression_internal.cc b/cpp/src/arrow/engine/substrait/expression_internal.cc index 0df8425609ff1..d395261597696 100644 --- a/cpp/src/arrow/engine/substrait/expression_internal.cc +++ b/cpp/src/arrow/engine/substrait/expression_internal.cc @@ -727,6 +727,8 @@ struct ScalarToProtoImpl { s); } + Status Visit(const BinaryViewScalar& s) { return NotImplemented(s); } + Status Visit(const FixedSizeBinaryScalar& s) { return FromBuffer( [](Lit* lit, std::string&& s) { lit->set_fixed_binary(std::move(s)); }, s); diff --git a/cpp/src/arrow/engine/substrait/type_internal.cc b/cpp/src/arrow/engine/substrait/type_internal.cc index 1f9141f36ba6b..d3fb058137e6a 100644 --- a/cpp/src/arrow/engine/substrait/type_internal.cc +++ b/cpp/src/arrow/engine/substrait/type_internal.cc @@ -263,6 +263,8 @@ struct DataTypeToProtoImpl { return SetWith(&substrait::Type::set_allocated_binary); } + Status Visit(const BinaryViewType& t) { return NotImplemented(t); } + Status Visit(const FixedSizeBinaryType& t) { SetWithThen(&substrait::Type::set_allocated_fixed_binary)->set_length(t.byte_width()); return Status::OK(); diff --git a/cpp/src/arrow/integration/json_internal.cc b/cpp/src/arrow/integration/json_internal.cc index ed7be4b502985..59749c36a958e 100644 --- a/cpp/src/arrow/integration/json_internal.cc +++ b/cpp/src/arrow/integration/json_internal.cc @@ -48,6 +48,7 @@ #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" #include "arrow/util/range.h" +#include "arrow/util/span.h" #include "arrow/util/string.h" #include "arrow/util/value_parsing.h" #include "arrow/visit_array_inline.h" @@ -106,6 +107,13 @@ std::string GetTimeUnitName(TimeUnit::type unit) { return "UNKNOWN"; } +Result GetStringView(const rj::Value& str) { + if (!str.IsString()) { + return Status::Invalid("field was not a string"); + } + return std::string_view{str.GetString(), str.GetStringLength()}; +} + class SchemaWriter { public: explicit SchemaWriter(const Schema& schema, const DictionaryFieldMapper& mapper, @@ -226,8 +234,9 @@ class SchemaWriter { template enable_if_t::value || is_primitive_ctype::value || - is_base_binary_type::value || is_var_length_list_type::value || - is_struct_type::value || is_run_end_encoded_type::value> + is_base_binary_type::value || is_binary_view_like_type::value || + is_var_length_list_type::value || is_struct_type::value || + is_run_end_encoded_type::value> WriteTypeMetadata(const T& type) {} void WriteTypeMetadata(const MapType& type) { @@ -382,6 +391,8 @@ class SchemaWriter { Status Visit(const TimeType& type) { return WritePrimitive("time", type); } Status Visit(const StringType& type) { return WriteVarBytes("utf8", type); } Status Visit(const BinaryType& type) { return WriteVarBytes("binary", type); } + Status Visit(const StringViewType& type) { return WritePrimitive("utf8view", type); } + Status Visit(const BinaryViewType& type) { return WritePrimitive("binaryview", type); } Status Visit(const LargeStringType& type) { return WriteVarBytes("largeutf8", type); } Status Visit(const LargeBinaryType& type) { return WriteVarBytes("largebinary", type); } Status Visit(const FixedSizeBinaryType& type) { @@ -528,22 +539,19 @@ class ArrayWriter { } } - // Binary, encode to hexadecimal. - template - enable_if_binary_like WriteDataValues( - const ArrayType& arr) { - for (int64_t i = 0; i < arr.length(); ++i) { - writer_->String(HexEncode(arr.GetView(i))); - } - } - - // UTF8 string, write as is - template - enable_if_string_like WriteDataValues( - const ArrayType& arr) { + template + std::enable_if_t::value || + is_fixed_size_binary_type::value> + WriteDataValues(const ArrayType& arr) { for (int64_t i = 0; i < arr.length(); ++i) { - auto view = arr.GetView(i); - writer_->String(view.data(), static_cast(view.size())); + if constexpr (Type::is_utf8) { + // UTF8 string, write as is + auto view = arr.GetView(i); + writer_->String(view.data(), static_cast(view.size())); + } else { + // Binary, encode to hexadecimal. + writer_->String(HexEncode(arr.GetView(i))); + } } } @@ -642,6 +650,50 @@ class ArrayWriter { writer_->EndArray(); } + template + void WriteBinaryViewField(const ArrayType& array) { + writer_->Key("VIEWS"); + writer_->StartArray(); + for (int64_t i = 0; i < array.length(); ++i) { + auto s = array.raw_values()[i]; + writer_->StartObject(); + writer_->Key("SIZE"); + writer_->Int64(s.size()); + if (s.is_inline()) { + writer_->Key("INLINED"); + if constexpr (ArrayType::TypeClass::is_utf8) { + writer_->String(reinterpret_cast(s.inline_data()), s.size()); + } else { + writer_->String(HexEncode(s.inline_data(), s.size())); + } + } else { + // Prefix is always 4 bytes so it may not be utf-8 even if the whole + // string view is + writer_->Key("PREFIX_HEX"); + writer_->String(HexEncode(s.inline_data(), BinaryViewType::kPrefixSize)); + writer_->Key("BUFFER_INDEX"); + writer_->Int64(s.ref.buffer_index); + writer_->Key("OFFSET"); + writer_->Int64(s.ref.offset); + } + writer_->EndObject(); + } + writer_->EndArray(); + } + + void WriteVariadicBuffersField(const BinaryViewArray& arr) { + writer_->Key("VARIADIC_DATA_BUFFERS"); + writer_->StartArray(); + const auto& buffers = arr.data()->buffers; + for (size_t i = 2; i < buffers.size(); ++i) { + // Encode the data buffers into hexadecimal strings. + // Even for arrays which contain utf-8, portions of the buffer not + // referenced by any view may be invalid. + writer_->String(buffers[i]->ToHexString()); + } + writer_->EndArray(); + } + void WriteValidityField(const Array& arr) { writer_->Key("VALIDITY"); writer_->StartArray(); @@ -682,8 +734,10 @@ class ArrayWriter { } template - enable_if_t::value, Status> Visit( - const ArrayType& array) { + enable_if_t::value && + !is_binary_view_like_type::value, + Status> + Visit(const ArrayType& array) { WriteValidityField(array); WriteDataField(array); SetNoChildren(); @@ -700,6 +754,17 @@ class ArrayWriter { return Status::OK(); } + template + enable_if_binary_view_like Visit( + const ArrayType& array) { + WriteValidityField(array); + WriteBinaryViewField(array); + WriteVariadicBuffersField(array); + + SetNoChildren(); + return Status::OK(); + } + Status Visit(const DictionaryArray& array) { return VisitArrayValues(*array.indices()); } @@ -1033,6 +1098,10 @@ Result> GetType(const RjObject& json_type, return utf8(); } else if (type_name == "binary") { return binary(); + } else if (type_name == "utf8view") { + return utf8_view(); + } else if (type_name == "binaryview") { + return binary_view(); } else if (type_name == "largeutf8") { return large_utf8(); } else if (type_name == "largebinary") { @@ -1246,10 +1315,12 @@ class ArrayReader { return Status::OK(); } - Result GetDataArray(const RjObject& obj) { - ARROW_ASSIGN_OR_RAISE(const auto json_data_arr, GetMemberArray(obj, kData)); + Result GetDataArray(const RjObject& obj, + const std::string& key = kData) { + ARROW_ASSIGN_OR_RAISE(const auto json_data_arr, GetMemberArray(obj, key)); if (static_cast(json_data_arr.Size()) != length_) { - return Status::Invalid("JSON DATA array size differs from advertised array length"); + return Status::Invalid("JSON ", key, " array size ", json_data_arr.Size(), + " differs from advertised array length ", length_); } return json_data_arr; } @@ -1293,10 +1364,7 @@ class ArrayReader { RETURN_NOT_OK(builder.AppendNull()); continue; } - - DCHECK(json_val.IsString()); - std::string_view val{ - json_val.GetString()}; // XXX can we use json_val.GetStringLength()? + ARROW_ASSIGN_OR_RAISE(auto val, GetStringView(json_val)); int64_t offset_start = ParseOffset(json_offsets[i]); int64_t offset_end = ParseOffset(json_offsets[i + 1]); @@ -1332,6 +1400,97 @@ class ArrayReader { return FinishBuilder(&builder); } + template + enable_if_binary_view_like Visit(const ViewType& type) { + ARROW_ASSIGN_OR_RAISE(const auto json_views, GetDataArray(obj_, "VIEWS")); + ARROW_ASSIGN_OR_RAISE(const auto json_variadic_bufs, + GetMemberArray(obj_, "VARIADIC_DATA_BUFFERS")); + + using internal::Zip; + using util::span; + + BufferVector buffers; + buffers.resize(json_variadic_bufs.Size() + 2); + for (auto [json_buf, buf] : Zip(json_variadic_bufs, span{buffers}.subspan(2))) { + ARROW_ASSIGN_OR_RAISE(auto hex_string, GetStringView(json_buf)); + ARROW_ASSIGN_OR_RAISE( + buf, AllocateBuffer(static_cast(hex_string.size()) / 2, pool_)); + RETURN_NOT_OK(ParseHexValues(hex_string, buf->mutable_data())); + } + + TypedBufferBuilder validity_builder{pool_}; + RETURN_NOT_OK(validity_builder.Resize(length_)); + for (bool is_valid : is_valid_) { + validity_builder.UnsafeAppend(is_valid); + } + ARROW_ASSIGN_OR_RAISE(buffers[0], validity_builder.Finish()); + + ARROW_ASSIGN_OR_RAISE( + buffers[1], AllocateBuffer(length_ * sizeof(BinaryViewType::c_type), pool_)); + + span views{buffers[1]->mutable_data_as(), + static_cast(length_)}; + + int64_t null_count = 0; + for (auto [json_view, out_view, is_valid] : Zip(json_views, views, is_valid_)) { + if (!is_valid) { + out_view = {}; + ++null_count; + continue; + } + + DCHECK(json_view.IsObject()); + const auto& json_view_obj = json_view.GetObject(); + + auto json_size = json_view_obj.FindMember("SIZE"); + RETURN_NOT_INT("SIZE", json_size, json_view_obj); + DCHECK_GE(json_size->value.GetInt64(), 0); + auto size = static_cast(json_size->value.GetInt64()); + + if (size <= BinaryViewType::kInlineSize) { + auto json_inlined = json_view_obj.FindMember("INLINED"); + RETURN_NOT_STRING("INLINED", json_inlined, json_view_obj); + out_view.inlined = {size, {}}; + + if constexpr (ViewType::is_utf8) { + DCHECK_LE(json_inlined->value.GetStringLength(), BinaryViewType::kInlineSize); + memcpy(&out_view.inlined.data, json_inlined->value.GetString(), size); + } else { + DCHECK_LE(json_inlined->value.GetStringLength(), + BinaryViewType::kInlineSize * 2); + ARROW_ASSIGN_OR_RAISE(auto inlined, GetStringView(json_inlined->value)); + RETURN_NOT_OK(ParseHexValues(inlined, out_view.inlined.data.data())); + } + continue; + } + + auto json_prefix = json_view_obj.FindMember("PREFIX_HEX"); + auto json_buffer_index = json_view_obj.FindMember("BUFFER_INDEX"); + auto json_offset = json_view_obj.FindMember("OFFSET"); + RETURN_NOT_STRING("PREFIX_HEX", json_prefix, json_view_obj); + RETURN_NOT_INT("BUFFER_INDEX", json_buffer_index, json_view_obj); + RETURN_NOT_INT("OFFSET", json_offset, json_view_obj); + + out_view.ref = { + size, + {}, + static_cast(json_buffer_index->value.GetInt64()), + static_cast(json_offset->value.GetInt64()), + }; + + DCHECK_EQ(json_prefix->value.GetStringLength(), BinaryViewType::kPrefixSize * 2); + ARROW_ASSIGN_OR_RAISE(auto prefix, GetStringView(json_prefix->value)); + RETURN_NOT_OK(ParseHexValues(prefix, out_view.ref.prefix.data())); + + DCHECK_LE(static_cast(out_view.ref.buffer_index), buffers.size() - 2); + DCHECK_LE(static_cast(out_view.ref.offset) + out_view.size(), + buffers[out_view.ref.buffer_index + 2]->size()); + } + + data_ = ArrayData::Make(type_, length_, std::move(buffers), null_count); + return Status::OK(); + } + Status Visit(const DayTimeIntervalType& type) { DayTimeIntervalBuilder builder(pool_); diff --git a/cpp/src/arrow/ipc/feather.cc b/cpp/src/arrow/ipc/feather.cc index b6d3a3d7d8cbb..1ef076fac40e2 100644 --- a/cpp/src/arrow/ipc/feather.cc +++ b/cpp/src/arrow/ipc/feather.cc @@ -536,8 +536,8 @@ struct ArrayWriterV1 { is_nested_type::value || is_null_type::value || is_decimal_type::value || std::is_same::value || is_duration_type::value || is_interval_type::value || is_fixed_size_binary_type::value || - std::is_same::value || std::is_same::value || - std::is_same::value, + is_binary_view_like_type::value || std::is_same::value || + std::is_same::value || std::is_same::value, Status>::type Visit(const T& type) { return Status::NotImplemented(type.ToString()); diff --git a/cpp/src/arrow/ipc/feather_test.cc b/cpp/src/arrow/ipc/feather_test.cc index e1d4282cb2635..0b6ae4f620647 100644 --- a/cpp/src/arrow/ipc/feather_test.cc +++ b/cpp/src/arrow/ipc/feather_test.cc @@ -264,7 +264,8 @@ TEST_P(TestFeather, TimeTypes) { TEST_P(TestFeather, VLenPrimitiveRoundTrip) { std::shared_ptr batch; - ASSERT_OK(ipc::test::MakeStringTypesRecordBatch(&batch)); + ASSERT_OK(ipc::test::MakeStringTypesRecordBatch(&batch, /*with_nulls=*/true, + /*with_view_types=*/false)); CheckRoundtrip(batch); } @@ -306,7 +307,8 @@ TEST_P(TestFeather, SliceFloatRoundTrip) { TEST_P(TestFeather, SliceStringsRoundTrip) { std::shared_ptr batch; - ASSERT_OK(ipc::test::MakeStringTypesRecordBatch(&batch, /*with_nulls=*/true)); + ASSERT_OK(ipc::test::MakeStringTypesRecordBatch(&batch, /*with_nulls=*/true, + /*with_view_types=*/false)); CheckSlices(batch); } diff --git a/cpp/src/arrow/ipc/json_simple.cc b/cpp/src/arrow/ipc/json_simple.cc index eea0c9730283e..4d2d803f3f65e 100644 --- a/cpp/src/arrow/ipc/json_simple.cc +++ b/cpp/src/arrow/ipc/json_simple.cc @@ -847,6 +847,8 @@ Status GetDictConverter(const std::shared_ptr& type, PARAM_CONVERTER_CASE(Type::BINARY, StringConverter, BinaryType) PARAM_CONVERTER_CASE(Type::LARGE_STRING, StringConverter, LargeStringType) PARAM_CONVERTER_CASE(Type::LARGE_BINARY, StringConverter, LargeBinaryType) + PARAM_CONVERTER_CASE(Type::STRING_VIEW, StringConverter, StringViewType) + PARAM_CONVERTER_CASE(Type::BINARY_VIEW, StringConverter, BinaryViewType) SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter, FixedSizeBinaryType) SIMPLE_CONVERTER_CASE(Type::DECIMAL128, Decimal128Converter, Decimal128Type) @@ -905,6 +907,8 @@ Status GetConverter(const std::shared_ptr& type, SIMPLE_CONVERTER_CASE(Type::BINARY, StringConverter) SIMPLE_CONVERTER_CASE(Type::LARGE_STRING, StringConverter) SIMPLE_CONVERTER_CASE(Type::LARGE_BINARY, StringConverter) + SIMPLE_CONVERTER_CASE(Type::STRING_VIEW, StringConverter) + SIMPLE_CONVERTER_CASE(Type::BINARY_VIEW, StringConverter) SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter<>) SIMPLE_CONVERTER_CASE(Type::DECIMAL128, Decimal128Converter<>) SIMPLE_CONVERTER_CASE(Type::DECIMAL256, Decimal256Converter<>) diff --git a/cpp/src/arrow/ipc/json_simple_test.cc b/cpp/src/arrow/ipc/json_simple_test.cc index 6eee5955242aa..b67c26999945b 100644 --- a/cpp/src/arrow/ipc/json_simple_test.cc +++ b/cpp/src/arrow/ipc/json_simple_test.cc @@ -271,7 +271,13 @@ INSTANTIATE_TYPED_TEST_SUITE_P(TestHalfFloat, TestIntegers, HalfFloatType); template class TestStrings : public ::testing::Test { public: - std::shared_ptr type() { return TypeTraits::type_singleton(); } + std::shared_ptr type() const { + if constexpr (is_binary_view_like_type::value) { + return T::is_utf8 ? utf8_view() : binary_view(); + } else { + return TypeTraits::type_singleton(); + } + } }; TYPED_TEST_SUITE_P(TestStrings); @@ -327,6 +333,8 @@ INSTANTIATE_TYPED_TEST_SUITE_P(TestString, TestStrings, StringType); INSTANTIATE_TYPED_TEST_SUITE_P(TestBinary, TestStrings, BinaryType); INSTANTIATE_TYPED_TEST_SUITE_P(TestLargeString, TestStrings, LargeStringType); INSTANTIATE_TYPED_TEST_SUITE_P(TestLargeBinary, TestStrings, LargeBinaryType); +INSTANTIATE_TYPED_TEST_SUITE_P(TestStringView, TestStrings, StringViewType); +INSTANTIATE_TYPED_TEST_SUITE_P(TestBinaryView, TestStrings, BinaryViewType); TEST(TestNull, Basics) { std::shared_ptr type = null(); diff --git a/cpp/src/arrow/ipc/metadata_internal.cc b/cpp/src/arrow/ipc/metadata_internal.cc index 1394516ecd5ce..ab1a58dd1df8b 100644 --- a/cpp/src/arrow/ipc/metadata_internal.cc +++ b/cpp/src/arrow/ipc/metadata_internal.cc @@ -258,6 +258,9 @@ Status ConcreteTypeFromFlatbuffer(flatbuf::Type type, const void* type_data, case flatbuf::Type::LargeBinary: *out = large_binary(); return Status::OK(); + case flatbuf::Type::BinaryView: + *out = binary_view(); + return Status::OK(); case flatbuf::Type::FixedSizeBinary: { auto fw_binary = static_cast(type_data); return FixedSizeBinaryType::Make(fw_binary->byteWidth()).Value(out); @@ -268,6 +271,9 @@ Status ConcreteTypeFromFlatbuffer(flatbuf::Type type, const void* type_data, case flatbuf::Type::LargeUtf8: *out = large_utf8(); return Status::OK(); + case flatbuf::Type::Utf8View: + *out = utf8_view(); + return Status::OK(); case flatbuf::Type::Bool: *out = boolean(); return Status::OK(); @@ -534,6 +540,18 @@ class FieldToFlatbufferVisitor { return Status::OK(); } + Status Visit(const BinaryViewType& type) { + fb_type_ = flatbuf::Type::BinaryView; + type_offset_ = flatbuf::CreateBinaryView(fbb_).Union(); + return Status::OK(); + } + + Status Visit(const StringViewType& type) { + fb_type_ = flatbuf::Type::Utf8View; + type_offset_ = flatbuf::CreateUtf8View(fbb_).Union(); + return Status::OK(); + } + Status Visit(const LargeBinaryType& type) { fb_type_ = flatbuf::Type::LargeBinary; type_offset_ = flatbuf::CreateLargeBinary(fbb_).Union(); @@ -967,6 +985,7 @@ static Status GetBodyCompression(FBB& fbb, const IpcWriteOptions& options, static Status MakeRecordBatch(FBB& fbb, int64_t length, int64_t body_length, const std::vector& nodes, const std::vector& buffers, + const std::vector& variadic_buffer_counts, const IpcWriteOptions& options, RecordBatchOffset* offset) { FieldNodeVector fb_nodes; RETURN_NOT_OK(WriteFieldNodes(fbb, nodes, &fb_nodes)); @@ -977,7 +996,13 @@ static Status MakeRecordBatch(FBB& fbb, int64_t length, int64_t body_length, BodyCompressionOffset fb_compression; RETURN_NOT_OK(GetBodyCompression(fbb, options, &fb_compression)); - *offset = flatbuf::CreateRecordBatch(fbb, length, fb_nodes, fb_buffers, fb_compression); + flatbuffers::Offset> fb_variadic_buffer_counts{}; + if (!variadic_buffer_counts.empty()) { + fb_variadic_buffer_counts = fbb.CreateVector(variadic_buffer_counts); + } + + *offset = flatbuf::CreateRecordBatch(fbb, length, fb_nodes, fb_buffers, fb_compression, + fb_variadic_buffer_counts); return Status::OK(); } @@ -1224,11 +1249,12 @@ Status WriteRecordBatchMessage( int64_t length, int64_t body_length, const std::shared_ptr& custom_metadata, const std::vector& nodes, const std::vector& buffers, - const IpcWriteOptions& options, std::shared_ptr* out) { + const std::vector& variadic_buffer_counts, const IpcWriteOptions& options, + std::shared_ptr* out) { FBB fbb; RecordBatchOffset record_batch; - RETURN_NOT_OK( - MakeRecordBatch(fbb, length, body_length, nodes, buffers, options, &record_batch)); + RETURN_NOT_OK(MakeRecordBatch(fbb, length, body_length, nodes, buffers, + variadic_buffer_counts, options, &record_batch)); return WriteFBMessage(fbb, flatbuf::MessageHeader::RecordBatch, record_batch.Union(), body_length, options.metadata_version, custom_metadata, options.memory_pool) @@ -1285,11 +1311,12 @@ Status WriteDictionaryMessage( int64_t id, bool is_delta, int64_t length, int64_t body_length, const std::shared_ptr& custom_metadata, const std::vector& nodes, const std::vector& buffers, - const IpcWriteOptions& options, std::shared_ptr* out) { + const std::vector& variadic_buffer_counts, const IpcWriteOptions& options, + std::shared_ptr* out) { FBB fbb; RecordBatchOffset record_batch; - RETURN_NOT_OK( - MakeRecordBatch(fbb, length, body_length, nodes, buffers, options, &record_batch)); + RETURN_NOT_OK(MakeRecordBatch(fbb, length, body_length, nodes, buffers, + variadic_buffer_counts, options, &record_batch)); auto dictionary_batch = flatbuf::CreateDictionaryBatch(fbb, id, record_batch, is_delta).Union(); return WriteFBMessage(fbb, flatbuf::MessageHeader::DictionaryBatch, dictionary_batch, diff --git a/cpp/src/arrow/ipc/metadata_internal.h b/cpp/src/arrow/ipc/metadata_internal.h index abbed5b2dace0..631a336f75a9a 100644 --- a/cpp/src/arrow/ipc/metadata_internal.h +++ b/cpp/src/arrow/ipc/metadata_internal.h @@ -201,7 +201,8 @@ Status WriteRecordBatchMessage( const int64_t length, const int64_t body_length, const std::shared_ptr& custom_metadata, const std::vector& nodes, const std::vector& buffers, - const IpcWriteOptions& options, std::shared_ptr* out); + const std::vector& variadic_counts, const IpcWriteOptions& options, + std::shared_ptr* out); ARROW_EXPORT Result> WriteTensorMessage(const Tensor& tensor, @@ -225,7 +226,8 @@ Status WriteDictionaryMessage( const int64_t body_length, const std::shared_ptr& custom_metadata, const std::vector& nodes, const std::vector& buffers, - const IpcWriteOptions& options, std::shared_ptr* out); + const std::vector& variadic_counts, const IpcWriteOptions& options, + std::shared_ptr* out); static inline Result> WriteFlatbufferBuilder( flatbuffers::FlatBufferBuilder& fbb, // NOLINT non-const reference diff --git a/cpp/src/arrow/ipc/read_write_test.cc b/cpp/src/arrow/ipc/read_write_test.cc index 3ae007c20efe7..05a48aec2c7f3 100644 --- a/cpp/src/arrow/ipc/read_write_test.cc +++ b/cpp/src/arrow/ipc/read_write_test.cc @@ -159,7 +159,7 @@ TEST_P(TestMessage, SerializeCustomMetadata) { ASSERT_OK(internal::WriteRecordBatchMessage( /*length=*/0, /*body_length=*/0, metadata, /*nodes=*/{}, - /*buffers=*/{}, options_, &serialized)); + /*buffers=*/{}, /*variadic_counts=*/{}, options_, &serialized)); ASSERT_OK_AND_ASSIGN(std::unique_ptr message, Message::Open(serialized, /*body=*/nullptr)); @@ -240,23 +240,33 @@ class TestSchemaMetadata : public ::testing::Test { } }; -const std::shared_ptr INT32 = std::make_shared(); - TEST_F(TestSchemaMetadata, PrimitiveFields) { - auto f0 = field("f0", std::make_shared()); - auto f1 = field("f1", std::make_shared(), false); - auto f2 = field("f2", std::make_shared()); - auto f3 = field("f3", std::make_shared()); - auto f4 = field("f4", std::make_shared()); - auto f5 = field("f5", std::make_shared()); - auto f6 = field("f6", std::make_shared()); - auto f7 = field("f7", std::make_shared()); - auto f8 = field("f8", std::make_shared()); - auto f9 = field("f9", std::make_shared(), false); - auto f10 = field("f10", std::make_shared()); - - Schema schema({f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10}); - CheckSchemaRoundtrip(schema); + CheckSchemaRoundtrip(Schema({ + field("f0", int8()), + field("f1", int16(), false), + field("f2", int32()), + field("f3", int64()), + field("f4", uint8()), + field("f5", uint16()), + field("f6", uint32()), + field("f7", uint64()), + field("f8", float32()), + field("f9", float64(), false), + field("f10", boolean()), + })); +} + +TEST_F(TestSchemaMetadata, BinaryFields) { + CheckSchemaRoundtrip(Schema({ + field("f0", utf8()), + field("f1", binary()), + field("f2", large_utf8()), + field("f3", large_binary()), + field("f4", utf8_view()), + field("f5", binary_view()), + field("f6", fixed_size_binary(3)), + field("f7", fixed_size_binary(33)), + })); } TEST_F(TestSchemaMetadata, PrimitiveFieldsWithKeyValueMetadata) { @@ -269,15 +279,14 @@ TEST_F(TestSchemaMetadata, PrimitiveFieldsWithKeyValueMetadata) { } TEST_F(TestSchemaMetadata, NestedFields) { - auto type = list(int32()); - auto f0 = field("f0", type); - - std::shared_ptr type2( - new StructType({field("k1", INT32), field("k2", INT32), field("k3", INT32)})); - auto f1 = field("f1", type2); - - Schema schema({f0, f1}); - CheckSchemaRoundtrip(schema); + CheckSchemaRoundtrip(Schema({ + field("f0", list(int32())), + field("f1", struct_({ + field("k1", int32()), + field("k2", int32()), + field("k3", int32()), + })), + })); } // Verify that nullable=false is well-preserved for child fields of map type. @@ -305,19 +314,15 @@ TEST_F(TestSchemaMetadata, NestedFieldsWithKeyValueMetadata) { TEST_F(TestSchemaMetadata, DictionaryFields) { { - auto dict_type = dictionary(int8(), int32(), true /* ordered */); - auto f0 = field("f0", dict_type); - auto f1 = field("f1", list(dict_type)); - - Schema schema({f0, f1}); - CheckSchemaRoundtrip(schema); + auto dict_type = dictionary(int8(), int32(), /*ordered=*/true); + CheckSchemaRoundtrip(Schema({ + field("f0", dict_type), + field("f1", list(dict_type)), + })); } { auto dict_type = dictionary(int8(), list(int32())); - auto f0 = field("f0", dict_type); - - Schema schema({f0}); - CheckSchemaRoundtrip(schema); + CheckSchemaRoundtrip(Schema({field("f0", dict_type)})); } } @@ -325,9 +330,7 @@ TEST_F(TestSchemaMetadata, NestedDictionaryFields) { { auto inner_dict_type = dictionary(int8(), int32(), /*ordered=*/true); auto dict_type = dictionary(int16(), list(inner_dict_type)); - - Schema schema({field("f0", dict_type)}); - CheckSchemaRoundtrip(schema); + CheckSchemaRoundtrip(Schema({field("f0", dict_type)})); } { auto dict_type1 = dictionary(int8(), utf8(), /*ordered=*/true); @@ -2910,21 +2913,21 @@ void GetReadRecordBatchReadRanges( // 1) read magic and footer length IO // 2) read footer IO // 3) read record batch metadata IO - ASSERT_EQ(read_ranges.size(), 3 + expected_body_read_lengths.size()); + EXPECT_EQ(read_ranges.size(), 3 + expected_body_read_lengths.size()); const int32_t magic_size = static_cast(strlen(ipc::internal::kArrowMagicBytes)); // read magic and footer length IO auto file_end_size = magic_size + sizeof(int32_t); auto footer_length_offset = buffer->size() - file_end_size; auto footer_length = bit_util::FromLittleEndian( util::SafeLoadAs(buffer->data() + footer_length_offset)); - ASSERT_EQ(read_ranges[0].length, file_end_size); + EXPECT_EQ(read_ranges[0].length, file_end_size); // read footer IO - ASSERT_EQ(read_ranges[1].length, footer_length); + EXPECT_EQ(read_ranges[1].length, footer_length); // read record batch metadata. The exact size is tricky to determine but it doesn't // matter for this test and it should be smaller than the footer. - ASSERT_LT(read_ranges[2].length, footer_length); + EXPECT_LE(read_ranges[2].length, footer_length); for (uint32_t i = 0; i < expected_body_read_lengths.size(); i++) { - ASSERT_EQ(read_ranges[3 + i].length, expected_body_read_lengths[i]); + EXPECT_EQ(read_ranges[3 + i].length, expected_body_read_lengths[i]); } } diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 6e801e1f8adb7..d603062d81d4a 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -248,6 +248,15 @@ class ArrayLoader { } } + Result GetVariadicCount(int i) { + auto* variadic_counts = metadata_->variadicBufferCounts(); + CHECK_FLATBUFFERS_NOT_NULL(variadic_counts, "RecordBatch.variadicBufferCounts"); + if (i >= static_cast(variadic_counts->size())) { + return Status::IOError("variadic_count_index out of range."); + } + return static_cast(variadic_counts->Get(i)); + } + Status GetFieldMetadata(int field_index, ArrayData* out) { auto nodes = metadata_->nodes(); CHECK_FLATBUFFERS_NOT_NULL(nodes, "Table.nodes"); @@ -296,7 +305,6 @@ class ArrayLoader { return Status::OK(); } - template Status LoadBinary(Type::type type_id) { DCHECK_NE(out_, nullptr); out_->buffers.resize(3); @@ -355,7 +363,22 @@ class ArrayLoader { template enable_if_base_binary Visit(const T& type) { - return LoadBinary(type.id()); + return LoadBinary(type.id()); + } + + Status Visit(const BinaryViewType& type) { + out_->buffers.resize(2); + + RETURN_NOT_OK(LoadCommon(type.id())); + RETURN_NOT_OK(GetBuffer(buffer_index_++, &out_->buffers[1])); + + ARROW_ASSIGN_OR_RAISE(auto character_buffer_count, + GetVariadicCount(variadic_count_index_++)); + out_->buffers.resize(character_buffer_count + 2); + for (size_t i = 0; i < character_buffer_count; ++i) { + RETURN_NOT_OK(GetBuffer(buffer_index_++, &out_->buffers[i + 2])); + } + return Status::OK(); } Status Visit(const FixedSizeBinaryType& type) { @@ -450,6 +473,7 @@ class ArrayLoader { int buffer_index_ = 0; int field_index_ = 0; bool skip_io_ = false; + int variadic_count_index_ = 0; BatchDataReadRequest read_request_; const Field* field_ = nullptr; @@ -580,10 +604,9 @@ Result> LoadRecordBatchSubset( // swap endian in a set of ArrayData if necessary (swap_endian == true) if (context.swap_endian) { - for (int i = 0; i < static_cast(filtered_columns.size()); ++i) { - ARROW_ASSIGN_OR_RAISE(filtered_columns[i], - arrow::internal::SwapEndianArrayData( - filtered_columns[i], context.options.memory_pool)); + for (auto& filtered_column : filtered_columns) { + ARROW_ASSIGN_OR_RAISE(filtered_column, + arrow::internal::SwapEndianArrayData(filtered_column)); } } return RecordBatch::Make(std::move(filtered_schema), metadata->length(), diff --git a/cpp/src/arrow/ipc/test_common.cc b/cpp/src/arrow/ipc/test_common.cc index 53721c0b20fbc..6faaf96b332d4 100644 --- a/cpp/src/arrow/ipc/test_common.cc +++ b/cpp/src/arrow/ipc/test_common.cc @@ -351,39 +351,32 @@ static Status MakeBinaryArrayWithUniqueValues(int64_t length, bool include_nulls return builder.Finish(out); } -Status MakeStringTypesRecordBatch(std::shared_ptr* out, bool with_nulls) { +Status MakeStringTypesRecordBatch(std::shared_ptr* out, bool with_nulls, + bool with_view_types) { const int64_t length = 500; - auto f0 = field("strings", utf8()); - auto f1 = field("binaries", binary()); - auto f2 = field("large_strings", large_utf8()); - auto f3 = field("large_binaries", large_binary()); - auto schema = ::arrow::schema({f0, f1, f2, f3}); - - std::shared_ptr a0, a1, a2, a3; - MemoryPool* pool = default_memory_pool(); - // Quirk with RETURN_NOT_OK macro and templated functions - { - auto s = - MakeBinaryArrayWithUniqueValues(length, with_nulls, pool, &a0); - RETURN_NOT_OK(s); + ArrayVector arrays; + FieldVector fields; + + auto AppendColumn = [&](auto& MakeArray) { + arrays.emplace_back(); + RETURN_NOT_OK(MakeArray(length, with_nulls, default_memory_pool(), &arrays.back())); + + const auto& type = arrays.back()->type(); + fields.push_back(field(type->ToString(), type)); + return Status::OK(); + }; + + RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); + RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); + RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); + RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); + if (with_view_types) { + RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); + RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); } - { - auto s = - MakeBinaryArrayWithUniqueValues(length, with_nulls, pool, &a1); - RETURN_NOT_OK(s); - } - { - auto s = MakeBinaryArrayWithUniqueValues(length, with_nulls, pool, - &a2); - RETURN_NOT_OK(s); - } - { - auto s = MakeBinaryArrayWithUniqueValues(length, with_nulls, pool, - &a3); - RETURN_NOT_OK(s); - } - *out = RecordBatch::Make(schema, length, {a0, a1, a2, a3}); + + *out = RecordBatch::Make(schema(std::move(fields)), length, std::move(arrays)); return Status::OK(); } diff --git a/cpp/src/arrow/ipc/test_common.h b/cpp/src/arrow/ipc/test_common.h index 5e0c65556c630..fc0c8ddbea319 100644 --- a/cpp/src/arrow/ipc/test_common.h +++ b/cpp/src/arrow/ipc/test_common.h @@ -96,7 +96,7 @@ Status MakeRandomStringArray(int64_t length, bool include_nulls, MemoryPool* poo ARROW_TESTING_EXPORT Status MakeStringTypesRecordBatch(std::shared_ptr* out, - bool with_nulls = true); + bool with_nulls = true, bool with_view_types = true); ARROW_TESTING_EXPORT Status MakeStringTypesRecordBatchWithNulls(std::shared_ptr* out); diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc index e4b49ed56464e..9668f459d0d31 100644 --- a/cpp/src/arrow/ipc/writer.cc +++ b/cpp/src/arrow/ipc/writer.cc @@ -52,10 +52,12 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/compression.h" #include "arrow/util/endian.h" +#include "arrow/util/int_util_overflow.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" #include "arrow/util/parallel.h" #include "arrow/visit_array_inline.h" +#include "arrow/visit_data_inline.h" #include "arrow/visit_type_inline.h" namespace arrow { @@ -174,7 +176,8 @@ class RecordBatchSerializer { // Override this for writing dictionary metadata virtual Status SerializeMetadata(int64_t num_rows) { return WriteRecordBatchMessage(num_rows, out_->body_length, custom_metadata_, - field_nodes_, buffer_meta_, options_, &out_->metadata); + field_nodes_, buffer_meta_, variadic_counts_, options_, + &out_->metadata); } bool ShouldCompress(int64_t uncompressed_size, int64_t compressed_size) const { @@ -296,6 +299,8 @@ class RecordBatchSerializer { offset += size + padding; } + variadic_counts_ = out_->variadic_buffer_counts; + out_->body_length = offset - buffer_start_offset_; DCHECK(bit_util::IsMultipleOf8(out_->body_length)); @@ -403,6 +408,18 @@ class RecordBatchSerializer { return Status::OK(); } + Status Visit(const BinaryViewArray& array) { + auto views = SliceBuffer(array.values(), array.offset() * BinaryViewType::kSize, + array.length() * BinaryViewType::kSize); + out_->body_buffers.emplace_back(std::move(views)); + + out_->variadic_buffer_counts.emplace_back(array.data()->buffers.size() - 2); + for (size_t i = 2; i < array.data()->buffers.size(); ++i) { + out_->body_buffers.emplace_back(array.data()->buffers[i]); + } + return Status::OK(); + } + template enable_if_var_size_list Visit(const T& array) { using offset_type = typename T::offset_type; @@ -590,6 +607,7 @@ class RecordBatchSerializer { std::vector field_nodes_; std::vector buffer_meta_; + std::vector variadic_counts_; const IpcWriteOptions& options_; int64_t max_recursion_depth_; @@ -606,8 +624,8 @@ class DictionarySerializer : public RecordBatchSerializer { Status SerializeMetadata(int64_t num_rows) override { return WriteDictionaryMessage(dictionary_id_, is_delta_, num_rows, out_->body_length, - custom_metadata_, field_nodes_, buffer_meta_, options_, - &out_->metadata); + custom_metadata_, field_nodes_, buffer_meta_, + variadic_counts_, options_, &out_->metadata); } Status Assemble(const std::shared_ptr& dictionary) { diff --git a/cpp/src/arrow/ipc/writer.h b/cpp/src/arrow/ipc/writer.h index 9e18a213ba3f2..4e0ee3dfc8b44 100644 --- a/cpp/src/arrow/ipc/writer.h +++ b/cpp/src/arrow/ipc/writer.h @@ -57,6 +57,7 @@ struct IpcPayload { MessageType type = MessageType::NONE; std::shared_ptr metadata; std::vector> body_buffers; + std::vector variadic_buffer_counts; int64_t body_length = 0; // serialized body length (padded, maybe compressed) int64_t raw_body_length = 0; // initial uncompressed body length }; diff --git a/cpp/src/arrow/json/converter.cc b/cpp/src/arrow/json/converter.cc index 04ebe4714ceec..c393b77acf334 100644 --- a/cpp/src/arrow/json/converter.cc +++ b/cpp/src/arrow/json/converter.cc @@ -304,6 +304,8 @@ Status MakeConverter(const std::shared_ptr& out_type, MemoryPool* pool CONVERTER_CASE(Type::STRING, BinaryConverter); CONVERTER_CASE(Type::LARGE_BINARY, BinaryConverter); CONVERTER_CASE(Type::LARGE_STRING, BinaryConverter); + CONVERTER_CASE(Type::BINARY_VIEW, BinaryConverter); + CONVERTER_CASE(Type::STRING_VIEW, BinaryConverter); CONVERTER_CASE(Type::DECIMAL128, DecimalConverter); CONVERTER_CASE(Type::DECIMAL256, DecimalConverter); default: diff --git a/cpp/src/arrow/json/converter_test.cc b/cpp/src/arrow/json/converter_test.cc index 378a4491c0e62..cfc44c99976d5 100644 --- a/cpp/src/arrow/json/converter_test.cc +++ b/cpp/src/arrow/json/converter_test.cc @@ -131,8 +131,8 @@ TEST(ConverterTest, Floats) { } } -TEST(ConverterTest, StringAndLargeString) { - for (auto string_type : {utf8(), large_utf8()}) { +TEST(ConverterTest, StringAndLargeStringAndStringView) { + for (auto string_type : {utf8(), large_utf8(), utf8_view()}) { ParseOptions options; options.explicit_schema = schema({field("", string_type)}); diff --git a/cpp/src/arrow/json/parser.cc b/cpp/src/arrow/json/parser.cc index e2941a29ab9bd..185dcde355f0a 100644 --- a/cpp/src/arrow/json/parser.cc +++ b/cpp/src/arrow/json/parser.cc @@ -104,6 +104,7 @@ Status Kind::ForType(const DataType& type, Kind::type* kind) { Status Visit(const DateType&) { return SetKind(Kind::kNumber); } Status Visit(const BinaryType&) { return SetKind(Kind::kString); } Status Visit(const LargeBinaryType&) { return SetKind(Kind::kString); } + Status Visit(const BinaryViewType&) { return SetKind(Kind::kString); } Status Visit(const TimestampType&) { return SetKind(Kind::kString); } Status Visit(const DecimalType&) { return SetKind(Kind::kNumberOrString); } Status Visit(const DictionaryType& dict_type) { diff --git a/cpp/src/arrow/json/test_common.h b/cpp/src/arrow/json/test_common.h index 0f7b3466fdbc9..f7ab6fd10275f 100644 --- a/cpp/src/arrow/json/test_common.h +++ b/cpp/src/arrow/json/test_common.h @@ -110,8 +110,7 @@ struct GenerateImpl { return OK(writer.Double(val)); } - template - enable_if_base_binary Visit(const T&) { + Status GenerateAscii(const DataType&) { auto size = std::poisson_distribution<>{4}(e); std::uniform_int_distribution gen_char(32, 126); // FIXME generate UTF8 std::string s(size, '\0'); @@ -119,6 +118,13 @@ struct GenerateImpl { return OK(writer.String(s.c_str())); } + template + enable_if_base_binary Visit(const T& t) { + return GenerateAscii(t); + } + + Status Visit(const BinaryViewType& t) { return GenerateAscii(t); } + template enable_if_list_like Visit(const T& t) { auto size = std::poisson_distribution<>{4}(e); diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index a5410df7e9ae2..b392e027a6b89 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -229,18 +229,13 @@ class ArrayPrinter : public PrettyPrinter { } template - enable_if_string_like WriteDataValues(const ArrayType& array) { + enable_if_has_string_view WriteDataValues(const ArrayType& array) { return WriteValues(array, [&](int64_t i) { - (*sink_) << "\"" << array.GetView(i) << "\""; - return Status::OK(); - }); - } - - template - enable_if_t::value && !is_decimal_type::value, Status> - WriteDataValues(const ArrayType& array) { - return WriteValues(array, [&](int64_t i) { - (*sink_) << HexEncode(array.GetView(i)); + if constexpr (T::is_utf8) { + (*sink_) << "\"" << array.GetView(i) << "\""; + } else { + (*sink_) << HexEncode(array.GetView(i)); + } return Status::OK(); }); } @@ -302,6 +297,7 @@ class ArrayPrinter : public PrettyPrinter { std::is_base_of::value || std::is_base_of::value || std::is_base_of::value || + std::is_base_of::value || std::is_base_of::value || std::is_base_of::value || std::is_base_of::value || diff --git a/cpp/src/arrow/scalar.cc b/cpp/src/arrow/scalar.cc index b2ad1ad519bb2..167e272705268 100644 --- a/cpp/src/arrow/scalar.cc +++ b/cpp/src/arrow/scalar.cc @@ -263,6 +263,12 @@ struct ScalarValidateImpl { Status Visit(const StringScalar& s) { return ValidateStringScalar(s); } + Status Visit(const BinaryViewScalar& s) { return ValidateBinaryScalar(s); } + + Status Visit(const StringViewScalar& s) { return ValidateStringScalar(s); } + + Status Visit(const LargeBinaryScalar& s) { return ValidateBinaryScalar(s); } + Status Visit(const LargeStringScalar& s) { return ValidateStringScalar(s); } template @@ -548,17 +554,8 @@ Status Scalar::ValidateFull() const { return ScalarValidateImpl(/*full_validation=*/true).Validate(*this); } -BinaryScalar::BinaryScalar(std::string s) - : BinaryScalar(Buffer::FromString(std::move(s))) {} - -StringScalar::StringScalar(std::string s) - : StringScalar(Buffer::FromString(std::move(s))) {} - -LargeBinaryScalar::LargeBinaryScalar(std::string s) - : LargeBinaryScalar(Buffer::FromString(std::move(s))) {} - -LargeStringScalar::LargeStringScalar(std::string s) - : LargeStringScalar(Buffer::FromString(std::move(s))) {} +BaseBinaryScalar::BaseBinaryScalar(std::string s, std::shared_ptr type) + : BaseBinaryScalar(Buffer::FromString(std::move(s)), std::move(type)) {} FixedSizeBinaryScalar::FixedSizeBinaryScalar(std::shared_ptr value, std::shared_ptr type, diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h index 1d1ce4aa72948..5175b0128524c 100644 --- a/cpp/src/arrow/scalar.h +++ b/cpp/src/arrow/scalar.h @@ -263,24 +263,21 @@ struct ARROW_EXPORT BaseBinaryScalar return value ? std::string_view(*value) : std::string_view(); } - protected: BaseBinaryScalar(std::shared_ptr value, std::shared_ptr type) : internal::PrimitiveScalarBase{std::move(type), true}, value(std::move(value)) {} friend ArraySpan; + BaseBinaryScalar(std::string s, std::shared_ptr type); }; struct ARROW_EXPORT BinaryScalar : public BaseBinaryScalar { using BaseBinaryScalar::BaseBinaryScalar; using TypeClass = BinaryType; - BinaryScalar(std::shared_ptr value, std::shared_ptr type) - : BaseBinaryScalar(std::move(value), std::move(type)) {} - explicit BinaryScalar(std::shared_ptr value) : BinaryScalar(std::move(value), binary()) {} - explicit BinaryScalar(std::string s); + explicit BinaryScalar(std::string s) : BaseBinaryScalar(std::move(s), binary()) {} BinaryScalar() : BinaryScalar(binary()) {} }; @@ -292,11 +289,39 @@ struct ARROW_EXPORT StringScalar : public BinaryScalar { explicit StringScalar(std::shared_ptr value) : StringScalar(std::move(value), utf8()) {} - explicit StringScalar(std::string s); + explicit StringScalar(std::string s) : BinaryScalar(std::move(s), utf8()) {} StringScalar() : StringScalar(utf8()) {} }; +struct ARROW_EXPORT BinaryViewScalar : public BaseBinaryScalar { + using BaseBinaryScalar::BaseBinaryScalar; + using TypeClass = BinaryViewType; + + explicit BinaryViewScalar(std::shared_ptr value) + : BinaryViewScalar(std::move(value), binary_view()) {} + + explicit BinaryViewScalar(std::string s) + : BaseBinaryScalar(std::move(s), binary_view()) {} + + BinaryViewScalar() : BinaryViewScalar(binary_view()) {} + + std::string_view view() const override { return std::string_view(*this->value); } +}; + +struct ARROW_EXPORT StringViewScalar : public BinaryViewScalar { + using BinaryViewScalar::BinaryViewScalar; + using TypeClass = StringViewType; + + explicit StringViewScalar(std::shared_ptr value) + : StringViewScalar(std::move(value), utf8_view()) {} + + explicit StringViewScalar(std::string s) + : BinaryViewScalar(std::move(s), utf8_view()) {} + + StringViewScalar() : StringViewScalar(utf8_view()) {} +}; + struct ARROW_EXPORT LargeBinaryScalar : public BaseBinaryScalar { using BaseBinaryScalar::BaseBinaryScalar; using TypeClass = LargeBinaryType; @@ -307,7 +332,8 @@ struct ARROW_EXPORT LargeBinaryScalar : public BaseBinaryScalar { explicit LargeBinaryScalar(std::shared_ptr value) : LargeBinaryScalar(std::move(value), large_binary()) {} - explicit LargeBinaryScalar(std::string s); + explicit LargeBinaryScalar(std::string s) + : BaseBinaryScalar(std::move(s), large_binary()) {} LargeBinaryScalar() : LargeBinaryScalar(large_binary()) {} }; @@ -319,7 +345,8 @@ struct ARROW_EXPORT LargeStringScalar : public LargeBinaryScalar { explicit LargeStringScalar(std::shared_ptr value) : LargeStringScalar(std::move(value), large_utf8()) {} - explicit LargeStringScalar(std::string s); + explicit LargeStringScalar(std::string s) + : LargeBinaryScalar(std::move(s), large_utf8()) {} LargeStringScalar() : LargeStringScalar(large_utf8()) {} }; diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h index bb462af86a5f2..641aae5a5e2e4 100644 --- a/cpp/src/arrow/testing/gtest_util.h +++ b/cpp/src/arrow/testing/gtest_util.h @@ -176,10 +176,17 @@ using DecimalArrowTypes = ::testing::Types; using BaseBinaryArrowTypes = ::testing::Types; +using BaseBinaryOrBinaryViewLikeArrowTypes = + ::testing::Types; + using BinaryArrowTypes = ::testing::Types; using StringArrowTypes = ::testing::Types; +using StringOrStringViewArrowTypes = + ::testing::Types; + using ListArrowTypes = ::testing::Types; using UnionArrowTypes = ::testing::Types; diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index b74c41f75e452..1386075397e20 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -363,13 +363,11 @@ std::shared_ptr RandomArrayGenerator::Decimal256(std::shared_ptr -static std::shared_ptr GenerateBinaryArray(RandomArrayGenerator* gen, int64_t size, - int32_t min_length, int32_t max_length, - double null_probability, - int64_t alignment, - MemoryPool* memory_pool) { - using offset_type = typename TypeClass::offset_type; +template +static std::shared_ptr GenerateBinaryArray( + RandomArrayGenerator* gen, int64_t size, int32_t min_length, int32_t max_length, + double null_probability, std::optional max_data_buffer_length, + int64_t alignment, MemoryPool* memory_pool) { using BuilderType = typename TypeTraits::BuilderType; using OffsetArrowType = typename CTypeTraits::ArrowType; using OffsetArrayType = typename TypeTraits::ArrayType; @@ -387,7 +385,12 @@ static std::shared_ptr GenerateBinaryArray(RandomArrayGenerator* gen, int /*null_probability=*/0); std::vector str_buffer(max_length); - BuilderType builder(memory_pool, alignment); + BuilderType builder{memory_pool, alignment}; + if constexpr (std::is_base_of_v) { + if (max_data_buffer_length) { + builder.SetBlockSize(*max_data_buffer_length); + } + } for (int64_t i = 0; i < size; ++i) { if (lengths->IsValid(i)) { @@ -409,7 +412,8 @@ std::shared_ptr RandomArrayGenerator::String(int64_t size, int32_t min_le int64_t alignment, MemoryPool* memory_pool) { return GenerateBinaryArray(this, size, min_length, max_length, - null_probability, alignment, memory_pool); + null_probability, /*max_data_buffer_length=*/{}, + alignment, memory_pool); } std::shared_ptr RandomArrayGenerator::LargeString(int64_t size, int32_t min_length, @@ -417,8 +421,9 @@ std::shared_ptr RandomArrayGenerator::LargeString(int64_t size, int32_t m double null_probability, int64_t alignment, MemoryPool* memory_pool) { - return GenerateBinaryArray(this, size, min_length, max_length, - null_probability, alignment, memory_pool); + return GenerateBinaryArray( + this, size, min_length, max_length, null_probability, /*max_data_buffer_length=*/{}, + alignment, memory_pool); } std::shared_ptr RandomArrayGenerator::BinaryWithRepeats( @@ -430,6 +435,15 @@ std::shared_ptr RandomArrayGenerator::BinaryWithRepeats( return *strings->View(binary()); } +std::shared_ptr RandomArrayGenerator::StringView( + int64_t size, int32_t min_length, int32_t max_length, double null_probability, + std::optional max_data_buffer_length, int64_t alignment, + MemoryPool* memory_pool) { + return GenerateBinaryArray( + this, size, min_length, max_length, null_probability, max_data_buffer_length, + alignment, memory_pool); +} + std::shared_ptr RandomArrayGenerator::StringWithRepeats( int64_t size, int64_t unique, int32_t min_length, int32_t max_length, double null_probability, int64_t alignment, MemoryPool* memory_pool) { @@ -843,6 +857,24 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(const Field& field, int64_t ->View(field.type()); } + case Type::type::STRING_VIEW: + case Type::type::BINARY_VIEW: { + const auto min_length = + GetMetadata(field.metadata().get(), "min_length", 0); + const auto max_length = + GetMetadata(field.metadata().get(), "max_length", 20); + std::optional max_data_buffer_length = + GetMetadata(field.metadata().get(), "max_data_buffer_length", 0); + if (*max_data_buffer_length == 0) { + *max_data_buffer_length = {}; + } + + return StringView(length, min_length, max_length, null_probability, + max_data_buffer_length, alignment) + ->View(field.type()) + .ValueOrDie(); + } + case Type::type::DECIMAL128: return Decimal128(field.type(), length, null_probability, alignment, memory_pool); diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h index de9ea6d05648d..cbdac3baa0109 100644 --- a/cpp/src/arrow/testing/random.h +++ b/cpp/src/arrow/testing/random.h @@ -367,6 +367,26 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator { int64_t alignment = kDefaultBufferAlignment, MemoryPool* memory_pool = default_memory_pool()); + /// \brief Generate a random StringViewArray + /// + /// \param[in] size the size of the array to generate + /// \param[in] min_length the lower bound of the string length + /// determined by the uniform distribution + /// \param[in] max_length the upper bound of the string length + /// determined by the uniform distribution + /// \param[in] null_probability the probability of a value being null + /// \param[in] max_data_buffer_length the data buffer size at which + /// a new chunk will be generated + /// \param[in] alignment alignment for memory allocations (in bytes) + /// \param[in] memory_pool memory pool to allocate memory from + /// + /// \return a generated Array + std::shared_ptr StringView(int64_t size, int32_t min_length, int32_t max_length, + double null_probability = 0, + std::optional max_data_buffer_length = {}, + int64_t alignment = kDefaultBufferAlignment, + MemoryPool* memory_pool = default_memory_pool()); + /// \brief Generate a random LargeStringArray /// /// \param[in] size the size of the array to generate @@ -556,10 +576,14 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator { /// - max_length (T::offset_type): the minimum length of the child to generate, /// default 1024 /// - /// For string and binary types T (not including their large variants): + /// For string and binary types T (not including their large or view variants): /// - unique (int32_t): if positive, this many distinct values will be generated /// and all array values will be one of these values, default -1 /// + /// For string and binary view types T: + /// - max_data_buffer_length (int64_t): the data buffer size at which a new chunk + /// will be generated, default 32KB + /// /// For MapType: /// - values (int32_t): the number of key-value pairs to generate, which will be /// partitioned among the array values. diff --git a/cpp/src/arrow/testing/random_test.cc b/cpp/src/arrow/testing/random_test.cc index f269818e83a3d..951b654e56f73 100644 --- a/cpp/src/arrow/testing/random_test.cc +++ b/cpp/src/arrow/testing/random_test.cc @@ -160,6 +160,7 @@ auto values = ::testing::Values( field("uint32", uint32()), field("int32", int32()), field("uint64", uint64()), field("int64", int64()), field("float16", float16()), field("float32", float32()), field("float64", float64()), field("string", utf8()), field("binary", binary()), + field("string_view", utf8_view()), field("binary_view", binary_view()), field("fixed_size_binary", fixed_size_binary(8)), field("decimal128", decimal128(8, 3)), field("decimal128", decimal128(29, -5)), field("decimal256", decimal256(16, 4)), field("decimal256", decimal256(57, -6)), diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index a4f43256827da..f378bd974047d 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -64,10 +64,14 @@ constexpr Type::type FixedSizeListType::type_id; constexpr Type::type BinaryType::type_id; +constexpr Type::type BinaryViewType::type_id; + constexpr Type::type LargeBinaryType::type_id; constexpr Type::type StringType::type_id; +constexpr Type::type StringViewType::type_id; + constexpr Type::type LargeStringType::type_id; constexpr Type::type FixedSizeBinaryType::type_id; @@ -130,6 +134,8 @@ std::vector AllTypeIds() { Type::BINARY, Type::LARGE_STRING, Type::LARGE_BINARY, + Type::STRING_VIEW, + Type::BINARY_VIEW, Type::FIXED_SIZE_BINARY, Type::STRUCT, Type::LIST, @@ -194,7 +200,9 @@ std::string ToString(Type::type id) { TO_STRING_CASE(INTERVAL_MONTHS) TO_STRING_CASE(DURATION) TO_STRING_CASE(STRING) + TO_STRING_CASE(STRING_VIEW) TO_STRING_CASE(BINARY) + TO_STRING_CASE(BINARY_VIEW) TO_STRING_CASE(LARGE_STRING) TO_STRING_CASE(LARGE_BINARY) TO_STRING_CASE(FIXED_SIZE_BINARY) @@ -247,7 +255,7 @@ struct PhysicalTypeVisitor { } template - Status Visit(const Type&) { + Status Visit(const Type& type) { result = TypeTraits::type_singleton(); return Status::OK(); } @@ -1058,10 +1066,14 @@ std::string FixedSizeListType::ToString() const { std::string BinaryType::ToString() const { return "binary"; } +std::string BinaryViewType::ToString() const { return "binary_view"; } + std::string LargeBinaryType::ToString() const { return "large_binary"; } std::string StringType::ToString() const { return "string"; } +std::string StringViewType::ToString() const { return "string_view"; } + std::string LargeStringType::ToString() const { return "large_string"; } int FixedSizeBinaryType::bit_width() const { return CHAR_BIT * byte_width(); } @@ -2821,8 +2833,10 @@ PARAMETER_LESS_FINGERPRINT(HalfFloat) PARAMETER_LESS_FINGERPRINT(Float) PARAMETER_LESS_FINGERPRINT(Double) PARAMETER_LESS_FINGERPRINT(Binary) +PARAMETER_LESS_FINGERPRINT(BinaryView) PARAMETER_LESS_FINGERPRINT(LargeBinary) PARAMETER_LESS_FINGERPRINT(String) +PARAMETER_LESS_FINGERPRINT(StringView) PARAMETER_LESS_FINGERPRINT(LargeString) PARAMETER_LESS_FINGERPRINT(Date32) PARAMETER_LESS_FINGERPRINT(Date64) @@ -3034,6 +3048,16 @@ TYPE_FACTORY(large_binary, LargeBinaryType) TYPE_FACTORY(date64, Date64Type) TYPE_FACTORY(date32, Date32Type) +const std::shared_ptr& utf8_view() { + static std::shared_ptr type = std::make_shared(); + return type; +} + +const std::shared_ptr& binary_view() { + static std::shared_ptr type = std::make_shared(); + return type; +} + std::shared_ptr fixed_size_binary(int32_t byte_width) { return std::make_shared(byte_width); } @@ -3294,7 +3318,7 @@ void InitStaticData() { // * Time32 // * Time64 // * Timestamp - g_primitive_types = {null(), boolean(), date32(), date64()}; + g_primitive_types = {null(), boolean(), date32(), date64(), binary_view(), utf8_view()}; Extend(g_numeric_types, &g_primitive_types); Extend(g_base_binary_types, &g_primitive_types); } diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 3f4dd5c9b21fa..a905192e4a54e 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -113,8 +114,14 @@ struct ARROW_EXPORT DataTypeLayout { std::vector buffers; /// Whether this type expects an associated dictionary array. bool has_dictionary = false; + /// If this is provided, the number of buffers expected is only lower-bounded by + /// buffers.size(). Buffers beyond this lower bound are expected to conform to + /// variadic_spec. + std::optional variadic_spec; - explicit DataTypeLayout(std::vector v) : buffers(std::move(v)) {} + explicit DataTypeLayout(std::vector buffers, + std::optional variadic_spec = {}) + : buffers(std::move(buffers)), variadic_spec(variadic_spec) {} }; /// \brief Base class for all data types @@ -772,6 +779,103 @@ class ARROW_EXPORT BinaryType : public BaseBinaryType { explicit BinaryType(Type::type logical_type) : BaseBinaryType(logical_type) {} }; +/// \brief Concrete type class for variable-size binary view data +class ARROW_EXPORT BinaryViewType : public DataType { + public: + static constexpr Type::type type_id = Type::BINARY_VIEW; + static constexpr bool is_utf8 = false; + using PhysicalType = BinaryViewType; + + static constexpr int kSize = 16; + static constexpr int kInlineSize = 12; + static constexpr int kPrefixSize = 4; + + /// Variable length string or binary with inline optimization for small values (12 bytes + /// or fewer). This is similar to std::string_view except limited in size to INT32_MAX + /// and at least the first four bytes of the string are copied inline (accessible + /// without pointer dereference). This inline prefix allows failing comparisons early. + /// Furthermore when dealing with short strings the CPU cache working set is reduced + /// since many can be inline. + /// + /// This union supports two states: + /// + /// - Entirely inlined string data + /// |----|--------------| + /// ^ ^ + /// | | + /// size in-line string data, zero padded + /// + /// - Reference into a buffer + /// |----|----|----|----| + /// ^ ^ ^ ^ + /// | | | | + /// size | | `------. + /// prefix | | + /// buffer index | + /// offset in buffer + /// + /// Adapted from TU Munich's UmbraDB [1], Velox, DuckDB. + /// + /// [1]: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf + /// + /// Alignment to 64 bits enables an aligned load of the size and prefix into + /// a single 64 bit integer, which is useful to the comparison fast path. + union alignas(int64_t) c_type { + struct { + int32_t size; + std::array data; + } inlined; + + struct { + int32_t size; + std::array prefix; + int32_t buffer_index; + int32_t offset; + } ref; + + /// The number of bytes viewed. + int32_t size() const { + // Size is in the common initial subsequence of each member of the union, + // so accessing `inlined.size` is legal even if another member is active. + return inlined.size; + } + + /// True if the view's data is entirely stored inline. + bool is_inline() const { return size() <= kInlineSize; } + + /// Return a pointer to the inline data of a view. + /// + /// For inline views, this points to the entire data of the view. + /// For other views, this points to the 4 byte prefix. + const uint8_t* inline_data() const& { + // Since `ref.prefix` has the same address as `inlined.data`, + // the branch will be trivially optimized out. + return is_inline() ? inlined.data.data() : ref.prefix.data(); + } + const uint8_t* inline_data() && = delete; + }; + static_assert(sizeof(c_type) == kSize); + static_assert(std::is_trivial_v); + + static constexpr const char* type_name() { return "binary_view"; } + + BinaryViewType() : BinaryViewType(Type::BINARY_VIEW) {} + + DataTypeLayout layout() const override { + return DataTypeLayout({DataTypeLayout::Bitmap(), DataTypeLayout::FixedWidth(kSize)}, + DataTypeLayout::VariableWidth()); + } + + std::string ToString() const override; + std::string name() const override { return "binary_view"; } + + protected: + std::string ComputeFingerprint() const override; + + // Allow subclasses like StringType to change the logical type. + explicit BinaryViewType(Type::type logical_type) : DataType(logical_type) {} +}; + /// \brief Concrete type class for large variable-size binary data class ARROW_EXPORT LargeBinaryType : public BaseBinaryType { public: @@ -818,6 +922,24 @@ class ARROW_EXPORT StringType : public BinaryType { std::string ComputeFingerprint() const override; }; +/// \brief Concrete type class for variable-size string data, utf8-encoded +class ARROW_EXPORT StringViewType : public BinaryViewType { + public: + static constexpr Type::type type_id = Type::STRING_VIEW; + static constexpr bool is_utf8 = true; + using PhysicalType = BinaryViewType; + + static constexpr const char* type_name() { return "utf8_view"; } + + StringViewType() : BinaryViewType(Type::STRING_VIEW) {} + + std::string ToString() const override; + std::string name() const override { return "utf8_view"; } + + protected: + std::string ComputeFingerprint() const override; +}; + /// \brief Concrete type class for large variable-size string data, utf8-encoded class ARROW_EXPORT LargeStringType : public LargeBinaryType { public: diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 499684719feab..ca263b710317b 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -110,6 +110,11 @@ class BinaryArray; class BinaryBuilder; struct BinaryScalar; +class BinaryViewType; +class BinaryViewArray; +class BinaryViewBuilder; +struct BinaryViewScalar; + class LargeBinaryType; class LargeBinaryArray; class LargeBinaryBuilder; @@ -125,6 +130,11 @@ class StringArray; class StringBuilder; struct StringScalar; +class StringViewType; +class StringViewArray; +class StringViewBuilder; +struct StringViewScalar; + class LargeStringType; class LargeStringArray; class LargeStringBuilder; @@ -415,6 +425,13 @@ struct Type { /// Run-end encoded data. RUN_END_ENCODED = 38, + /// String (UTF8) view type with 4-byte prefix and inline small string + /// optimization + STRING_VIEW = 39, + + /// Bytes view type with 4-byte prefix and inline small string optimization + BINARY_VIEW = 40, + // Leave this at the end MAX_ID }; @@ -456,10 +473,14 @@ ARROW_EXPORT const std::shared_ptr& float32(); ARROW_EXPORT const std::shared_ptr& float64(); /// \brief Return a StringType instance ARROW_EXPORT const std::shared_ptr& utf8(); +/// \brief Return a StringViewType instance +ARROW_EXPORT const std::shared_ptr& utf8_view(); /// \brief Return a LargeStringType instance ARROW_EXPORT const std::shared_ptr& large_utf8(); /// \brief Return a BinaryType instance ARROW_EXPORT const std::shared_ptr& binary(); +/// \brief Return a BinaryViewType instance +ARROW_EXPORT const std::shared_ptr& binary_view(); /// \brief Return a LargeBinaryType instance ARROW_EXPORT const std::shared_ptr& large_binary(); /// \brief Return a Date32Type instance diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index 9ba8cf98dea4f..273f8933fa577 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -1469,9 +1469,21 @@ TEST(TestBinaryType, ToString) { TEST(TestStringType, ToString) { StringType str; ASSERT_EQ(str.id(), Type::STRING); + ASSERT_EQ(str.name(), std::string("utf8")); + ASSERT_EQ(str.type_name(), std::string("utf8")); ASSERT_EQ(str.ToString(), std::string("string")); } +TEST(TestBinaryViewType, ToString) { + BinaryViewType t1; + BinaryViewType e1; + StringViewType t2; + AssertTypeEqual(t1, e1); + AssertTypeNotEqual(t1, t2); + ASSERT_EQ(t1.id(), Type::BINARY_VIEW); + ASSERT_EQ(t1.ToString(), std::string("binary_view")); +} + TEST(TestLargeBinaryTypes, ToString) { BinaryType bt1; LargeBinaryType t1; diff --git a/cpp/src/arrow/type_traits.cc b/cpp/src/arrow/type_traits.cc index ac16afe4b8cd8..de328f322ad5f 100644 --- a/cpp/src/arrow/type_traits.cc +++ b/cpp/src/arrow/type_traits.cc @@ -88,6 +88,8 @@ int RequiredValueAlignmentForBuffer(Type::type type_id, int buffer_index) { case Type::DURATION: case Type::INTERVAL_MONTH_DAY_NANO: // Stored as two 32-bit integers and a 64-bit // integer + case Type::STRING_VIEW: + case Type::BINARY_VIEW: return 8; case Type::DICTIONARY: case Type::EXTENSION: diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index bcbde23ae4a4b..9d8cafacf397b 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -341,6 +341,16 @@ struct TypeTraits { static inline std::shared_ptr type_singleton() { return binary(); } }; +template <> +struct TypeTraits { + using ArrayType = BinaryViewArray; + using BuilderType = BinaryViewBuilder; + using ScalarType = BinaryViewScalar; + using CType = BinaryViewType::c_type; + constexpr static bool is_parameter_free = true; + static inline std::shared_ptr type_singleton() { return binary_view(); } +}; + template <> struct TypeTraits { using ArrayType = LargeBinaryArray; @@ -371,6 +381,16 @@ struct TypeTraits { static inline std::shared_ptr type_singleton() { return utf8(); } }; +template <> +struct TypeTraits { + using ArrayType = StringViewArray; + using BuilderType = StringViewBuilder; + using ScalarType = StringViewScalar; + using CType = BinaryViewType::c_type; + constexpr static bool is_parameter_free = true; + static inline std::shared_ptr type_singleton() { return utf8_view(); } +}; + template <> struct TypeTraits { using ArrayType = LargeStringArray; @@ -399,6 +419,11 @@ struct CTypeTraits : public TypeTraits { using ArrowType = StringType; }; +template <> +struct CTypeTraits : public TypeTraits { + using ArrowType = BinaryViewType; +}; + template <> struct CTypeTraits : public CTypeTraits {}; @@ -614,6 +639,24 @@ using is_string_type = template using enable_if_string = enable_if_t::value, R>; +template +using is_binary_view_like_type = std::is_base_of; + +template +using is_binary_view_type = std::is_same; + +template +using is_string_view_type = std::is_same; + +template +using enable_if_binary_view_like = enable_if_t::value, R>; + +template +using enable_if_binary_view = enable_if_t::value, R>; + +template +using enable_if_string_view = enable_if_t::value, R>; + template using is_string_like_type = std::integral_constant::value && T::is_utf8>; @@ -801,8 +844,10 @@ using enable_if_has_c_type = enable_if_t::value, R>; template using has_string_view = std::integral_constant::value || + std::is_same::value || std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value || std::is_same::value>; diff --git a/cpp/src/arrow/util/binary_view_util.h b/cpp/src/arrow/util/binary_view_util.h new file mode 100644 index 0000000000000..94f7a5bdfa667 --- /dev/null +++ b/cpp/src/arrow/util/binary_view_util.h @@ -0,0 +1,95 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/type.h" +#include "arrow/util/span.h" + +namespace arrow::util { + +inline BinaryViewType::c_type ToInlineBinaryView(const void* data, int32_t size) { + // Small string: inlined. Bytes beyond size are zeroed + BinaryViewType::c_type out; + out.inlined = {size, {}}; + memcpy(&out.inlined.data, data, size); + return out; +} + +inline BinaryViewType::c_type ToInlineBinaryView(std::string_view v) { + return ToInlineBinaryView(v.data(), static_cast(v.size())); +} + +inline BinaryViewType::c_type ToBinaryView(const void* data, int32_t size, + int32_t buffer_index, int32_t offset) { + if (size <= BinaryViewType::kInlineSize) { + return ToInlineBinaryView(data, size); + } + + // Large string: store index/offset. + BinaryViewType::c_type out; + out.ref = {size, {}, buffer_index, offset}; + memcpy(&out.ref.prefix, data, sizeof(out.ref.prefix)); + return out; +} + +inline BinaryViewType::c_type ToBinaryView(std::string_view v, int32_t buffer_index, + int32_t offset) { + return ToBinaryView(v.data(), static_cast(v.size()), buffer_index, offset); +} + +template +std::string_view FromBinaryView(const BinaryViewType::c_type& v, + const BufferPtr* data_buffers) { + auto* data = v.is_inline() ? v.inlined.data.data() + : data_buffers[v.ref.buffer_index]->data() + v.ref.offset; + return {reinterpret_cast(data), static_cast(v.size())}; +} +template +std::string_view FromBinaryView(BinaryViewType::c_type&&, const BufferPtr*) = delete; + +template +bool EqualBinaryView(BinaryViewType::c_type l, BinaryViewType::c_type r, + const BufferPtr* l_buffers, const BufferPtr* r_buffers) { + int64_t l_size_and_prefix, r_size_and_prefix; + memcpy(&l_size_and_prefix, &l, sizeof(l_size_and_prefix)); + memcpy(&r_size_and_prefix, &r, sizeof(r_size_and_prefix)); + + if (l_size_and_prefix != r_size_and_prefix) return false; + + if (l.is_inline()) { + // The columnar spec mandates that the inlined part be zero-padded, so we can compare + // a word at a time regardless of the exact size. + int64_t l_inlined, r_inlined; + memcpy(&l_inlined, l.inline_data() + BinaryViewType::kPrefixSize, sizeof(l_inlined)); + memcpy(&r_inlined, r.inline_data() + BinaryViewType::kPrefixSize, sizeof(r_inlined)); + return l_inlined == r_inlined; + } + + // Sizes are equal and this is not inline, therefore both are out + // of line and have kPrefixSize first in common. + const uint8_t* l_data = l_buffers[l.ref.buffer_index]->data() + l.ref.offset; + const uint8_t* r_data = r_buffers[r.ref.buffer_index]->data() + r.ref.offset; + return memcmp(l_data + BinaryViewType::kPrefixSize, + r_data + BinaryViewType::kPrefixSize, + l.size() - BinaryViewType::kPrefixSize) == 0; +} + +} // namespace arrow::util diff --git a/cpp/src/arrow/util/string.cc b/cpp/src/arrow/util/string.cc index 2055b4f47ea22..192173fa16ce9 100644 --- a/cpp/src/arrow/util/string.cc +++ b/cpp/src/arrow/util/string.cc @@ -25,15 +25,13 @@ namespace arrow { -static const char* kAsciiTable = "0123456789ABCDEF"; - std::string HexEncode(const uint8_t* data, size_t length) { - std::string hex_string; - hex_string.reserve(length * 2); - for (size_t j = 0; j < length; ++j) { + std::string hex_string(length * 2, '\0'); + for (size_t j = 0, i = 0; j < length; ++j) { // Convert to 2 base16 digits - hex_string.push_back(kAsciiTable[data[j] >> 4]); - hex_string.push_back(kAsciiTable[data[j] & 15]); + constexpr auto kHexDigitTable = "0123456789ABCDEF"; + hex_string[i++] = kHexDigitTable[data[j] >> 4]; + hex_string[i++] = kHexDigitTable[data[j] & 0b1111]; } return hex_string; } @@ -73,20 +71,34 @@ std::string HexEncode(std::string_view str) { return HexEncode(str.data(), str.s std::string Escape(std::string_view str) { return Escape(str.data(), str.size()); } -Status ParseHexValue(const char* data, uint8_t* out) { - char c1 = data[0]; - char c2 = data[1]; +constexpr uint8_t kInvalidHexDigit = -1; - const char* kAsciiTableEnd = kAsciiTable + 16; - const char* pos1 = std::lower_bound(kAsciiTable, kAsciiTableEnd, c1); - const char* pos2 = std::lower_bound(kAsciiTable, kAsciiTableEnd, c2); +constexpr uint8_t ParseHexDigit(char c) { + if (c >= '0' && c <= '9') return c - '0'; + if (c >= 'A' && c <= 'F') return c - 'A' + 10; + return kInvalidHexDigit; +} + +Status ParseHexValue(const char* data, uint8_t* out) { + uint8_t high = ParseHexDigit(data[0]); + uint8_t low = ParseHexDigit(data[1]); // Error checking - if (pos1 == kAsciiTableEnd || pos2 == kAsciiTableEnd || *pos1 != c1 || *pos2 != c2) { + if (high == kInvalidHexDigit || low == kInvalidHexDigit) { return Status::Invalid("Encountered non-hex digit"); } - *out = static_cast((pos1 - kAsciiTable) << 4 | (pos2 - kAsciiTable)); + *out = static_cast(high << 4 | low); + return Status::OK(); +} + +Status ParseHexValues(std::string_view hex_string, uint8_t* out) { + if (hex_string.size() % 2 != 0) { + return Status::Invalid("Expected base16 hex string"); + } + for (size_t j = 0; j < hex_string.size() / 2; ++j) { + RETURN_NOT_OK(ParseHexValue(hex_string.data() + j * 2, out + j)); + } return Status::OK(); } diff --git a/cpp/src/arrow/util/string.h b/cpp/src/arrow/util/string.h index d9777efc56a8c..d7e377773f62f 100644 --- a/cpp/src/arrow/util/string.h +++ b/cpp/src/arrow/util/string.h @@ -46,7 +46,9 @@ ARROW_EXPORT std::string HexEncode(std::string_view str); ARROW_EXPORT std::string Escape(std::string_view str); -ARROW_EXPORT Status ParseHexValue(const char* data, uint8_t* out); +ARROW_EXPORT Status ParseHexValue(const char* hex_pair, uint8_t* out); + +ARROW_EXPORT Status ParseHexValues(std::string_view hex_string, uint8_t* out); namespace internal { diff --git a/cpp/src/arrow/visit_data_inline.h b/cpp/src/arrow/visit_data_inline.h index 6a9b32d73a635..a2ba9cfc65071 100644 --- a/cpp/src/arrow/visit_data_inline.h +++ b/cpp/src/arrow/visit_data_inline.h @@ -23,6 +23,7 @@ #include "arrow/status.h" #include "arrow/type.h" #include "arrow/type_traits.h" +#include "arrow/util/binary_view_util.h" #include "arrow/util/bit_block_counter.h" #include "arrow/util/bit_util.h" #include "arrow/util/checked_cast.h" @@ -144,6 +145,42 @@ struct ArraySpanInlineVisitor> { } }; +// BinaryView, StringView... +template +struct ArraySpanInlineVisitor> { + using c_type = std::string_view; + + template + static Status VisitStatus(const ArraySpan& arr, ValidFunc&& valid_func, + NullFunc&& null_func) { + if (arr.length == 0) { + return Status::OK(); + } + auto* s = arr.GetValues(1); + auto* data_buffers = arr.GetVariadicBuffers().data(); + return VisitBitBlocks( + arr.buffers[0].data, arr.offset, arr.length, + [&](int64_t index) { + return valid_func(util::FromBinaryView(s[index], data_buffers)); + }, + [&]() { return null_func(); }); + } + + template + static void VisitVoid(const ArraySpan& arr, ValidFunc&& valid_func, + NullFunc&& null_func) { + if (arr.length == 0) { + return; + } + auto* s = arr.GetValues(1); + auto* data_buffers = arr.GetVariadicBuffers().data(); + VisitBitBlocksVoid( + arr.buffers[0].data, arr.offset, arr.length, + [&](int64_t index) { valid_func(util::FromBinaryView(s[index], data_buffers)); }, + std::forward(null_func)); + } +}; + // FixedSizeBinary, Decimal128 template struct ArraySpanInlineVisitor> { @@ -240,9 +277,8 @@ typename internal::call_traits::enable_if_return::type VisitNullBitmapInline(const uint8_t* valid_bits, int64_t valid_bits_offset, int64_t num_values, int64_t null_count, ValidFunc&& valid_func, NullFunc&& null_func) { - ARROW_UNUSED(null_count); - internal::OptionalBitBlockCounter bit_counter(valid_bits, valid_bits_offset, - num_values); + internal::OptionalBitBlockCounter bit_counter(null_count == 0 ? NULLPTR : valid_bits, + valid_bits_offset, num_values); int64_t position = 0; int64_t offset_position = valid_bits_offset; while (position < num_values) { @@ -273,9 +309,8 @@ typename internal::call_traits::enable_if_return::type VisitNullBitmapInline(const uint8_t* valid_bits, int64_t valid_bits_offset, int64_t num_values, int64_t null_count, ValidFunc&& valid_func, NullFunc&& null_func) { - ARROW_UNUSED(null_count); - internal::OptionalBitBlockCounter bit_counter(valid_bits, valid_bits_offset, - num_values); + internal::OptionalBitBlockCounter bit_counter(null_count == 0 ? NULLPTR : valid_bits, + valid_bits_offset, num_values); int64_t position = 0; int64_t offset_position = valid_bits_offset; while (position < num_values) { diff --git a/cpp/src/arrow/visitor.cc b/cpp/src/arrow/visitor.cc index ed3d5bc2c68d7..e057f6b12fb1b 100644 --- a/cpp/src/arrow/visitor.cc +++ b/cpp/src/arrow/visitor.cc @@ -45,8 +45,10 @@ ARRAY_VISITOR_DEFAULT(UInt64Array) ARRAY_VISITOR_DEFAULT(HalfFloatArray) ARRAY_VISITOR_DEFAULT(FloatArray) ARRAY_VISITOR_DEFAULT(DoubleArray) -ARRAY_VISITOR_DEFAULT(BinaryArray) ARRAY_VISITOR_DEFAULT(StringArray) +ARRAY_VISITOR_DEFAULT(StringViewArray) +ARRAY_VISITOR_DEFAULT(BinaryArray) +ARRAY_VISITOR_DEFAULT(BinaryViewArray) ARRAY_VISITOR_DEFAULT(LargeBinaryArray) ARRAY_VISITOR_DEFAULT(LargeStringArray) ARRAY_VISITOR_DEFAULT(FixedSizeBinaryArray) @@ -96,7 +98,9 @@ TYPE_VISITOR_DEFAULT(HalfFloatType) TYPE_VISITOR_DEFAULT(FloatType) TYPE_VISITOR_DEFAULT(DoubleType) TYPE_VISITOR_DEFAULT(StringType) +TYPE_VISITOR_DEFAULT(StringViewType) TYPE_VISITOR_DEFAULT(BinaryType) +TYPE_VISITOR_DEFAULT(BinaryViewType) TYPE_VISITOR_DEFAULT(LargeStringType) TYPE_VISITOR_DEFAULT(LargeBinaryType) TYPE_VISITOR_DEFAULT(FixedSizeBinaryType) @@ -147,7 +151,9 @@ SCALAR_VISITOR_DEFAULT(HalfFloatScalar) SCALAR_VISITOR_DEFAULT(FloatScalar) SCALAR_VISITOR_DEFAULT(DoubleScalar) SCALAR_VISITOR_DEFAULT(StringScalar) +SCALAR_VISITOR_DEFAULT(StringViewScalar) SCALAR_VISITOR_DEFAULT(BinaryScalar) +SCALAR_VISITOR_DEFAULT(BinaryViewScalar) SCALAR_VISITOR_DEFAULT(LargeStringScalar) SCALAR_VISITOR_DEFAULT(LargeBinaryScalar) SCALAR_VISITOR_DEFAULT(FixedSizeBinaryScalar) diff --git a/cpp/src/arrow/visitor.h b/cpp/src/arrow/visitor.h index b22d4d3c567e1..650b0e7ee0a30 100644 --- a/cpp/src/arrow/visitor.h +++ b/cpp/src/arrow/visitor.h @@ -45,7 +45,9 @@ class ARROW_EXPORT ArrayVisitor { virtual Status Visit(const FloatArray& array); virtual Status Visit(const DoubleArray& array); virtual Status Visit(const StringArray& array); + virtual Status Visit(const StringViewArray& array); virtual Status Visit(const BinaryArray& array); + virtual Status Visit(const BinaryViewArray& array); virtual Status Visit(const LargeStringArray& array); virtual Status Visit(const LargeBinaryArray& array); virtual Status Visit(const FixedSizeBinaryArray& array); @@ -94,7 +96,9 @@ class ARROW_EXPORT TypeVisitor { virtual Status Visit(const FloatType& type); virtual Status Visit(const DoubleType& type); virtual Status Visit(const StringType& type); + virtual Status Visit(const StringViewType& type); virtual Status Visit(const BinaryType& type); + virtual Status Visit(const BinaryViewType& type); virtual Status Visit(const LargeStringType& type); virtual Status Visit(const LargeBinaryType& type); virtual Status Visit(const FixedSizeBinaryType& type); @@ -143,7 +147,9 @@ class ARROW_EXPORT ScalarVisitor { virtual Status Visit(const FloatScalar& scalar); virtual Status Visit(const DoubleScalar& scalar); virtual Status Visit(const StringScalar& scalar); + virtual Status Visit(const StringViewScalar& scalar); virtual Status Visit(const BinaryScalar& scalar); + virtual Status Visit(const BinaryViewScalar& scalar); virtual Status Visit(const LargeStringScalar& scalar); virtual Status Visit(const LargeBinaryScalar& scalar); virtual Status Visit(const FixedSizeBinaryScalar& scalar); diff --git a/cpp/src/arrow/visitor_generate.h b/cpp/src/arrow/visitor_generate.h index 8f6b176ba8fea..4b57abe53ff14 100644 --- a/cpp/src/arrow/visitor_generate.h +++ b/cpp/src/arrow/visitor_generate.h @@ -40,7 +40,9 @@ namespace arrow { ACTION(Boolean); \ ARROW_GENERATE_FOR_ALL_NUMERIC_TYPES(ACTION); \ ACTION(String); \ + ACTION(StringView); \ ACTION(Binary); \ + ACTION(BinaryView); \ ACTION(LargeString); \ ACTION(LargeBinary); \ ACTION(FixedSizeBinary); \ diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 315405ef1e569..4e23d0fab5c69 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -3374,6 +3374,8 @@ TEST(ArrowReadWrite, NestedRequiredOuterOptional) { for (const auto& inner_type : types) { if (inner_type->id() == ::arrow::Type::NA) continue; + if (inner_type->id() == ::arrow::Type::BINARY_VIEW) continue; + if (inner_type->id() == ::arrow::Type::STRING_VIEW) continue; auto writer_props = WriterProperties::Builder(); auto arrow_writer_props = ArrowWriterProperties::Builder(); @@ -3389,7 +3391,6 @@ TEST(ArrowReadWrite, NestedRequiredOuterOptional) { arrow_writer_props.coerce_timestamps(unit); } } - ASSERT_NO_FATAL_FAILURE(DoNestedRequiredRoundtrip(inner_type, writer_props.build(), arrow_writer_props.build())); diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 72e984d7736fa..5dff533c1cce2 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -134,6 +134,8 @@ struct ValueBufferSlicer { NOT_IMPLEMENTED_VISIT(Dictionary); NOT_IMPLEMENTED_VISIT(RunEndEncoded); NOT_IMPLEMENTED_VISIT(Extension); + NOT_IMPLEMENTED_VISIT(BinaryView); + NOT_IMPLEMENTED_VISIT(StringView); #undef NOT_IMPLEMENTED_VISIT diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index 7635cfd98feda..1ce2775c160b8 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -1856,12 +1856,13 @@ def _temp_path(): .skip_tester('Rust'), generate_binary_view_case() - .skip_tester('C++') .skip_tester('C#') .skip_tester('Go') .skip_tester('Java') .skip_tester('JS') - .skip_tester('Rust'), + .skip_tester('Rust') + .skip_format(SKIP_C_SCHEMA, 'C++') + .skip_format(SKIP_C_ARRAY, 'C++'), generate_extension_case() .skip_tester('C#') diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc index 91c7b8a45718e..8ed5d4e216e8e 100644 --- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc +++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc @@ -1353,7 +1353,8 @@ struct ObjectWriterVisitor { std::is_same::value || (std::is_base_of::value && !std::is_same::value) || - std::is_base_of::value, + std::is_base_of::value || + std::is_base_of::value, Status> Visit(const Type& type) { return Status::NotImplemented("No implemented conversion to object dtype: ", diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc index 2143129356fd5..e47492499867a 100644 --- a/python/pyarrow/src/arrow/python/python_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc @@ -572,12 +572,18 @@ struct PyConverterTrait; template struct PyConverterTrait< - T, enable_if_t<(!is_nested_type::value && !is_interval_type::value && - !is_extension_type::value) || - std::is_same::value>> { + T, + enable_if_t<(!is_nested_type::value && !is_interval_type::value && + !is_extension_type::value && !is_binary_view_like_type::value) || + std::is_same::value>> { using type = PyPrimitiveConverter; }; +template +struct PyConverterTrait> { + // not implemented +}; + template struct PyConverterTrait> { using type = PyListConverter; diff --git a/r/src/r_to_arrow.cpp b/r/src/r_to_arrow.cpp index 19c23973a62fe..d9bf848e24292 100644 --- a/r/src/r_to_arrow.cpp +++ b/r/src/r_to_arrow.cpp @@ -1051,10 +1051,15 @@ struct RConverterTrait; template struct RConverterTrait< T, enable_if_t::value && !is_interval_type::value && - !is_extension_type::value>> { + !is_extension_type::value && !is_binary_view_like_type::value>> { using type = RPrimitiveConverter; }; +template +struct RConverterTrait> { + // not implemented +}; + template struct RConverterTrait> { using type = RListConverter; From fcbcb7dab71b14ff3436af7b21bd23132877e2fa Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 26 Oct 2023 18:35:05 -0300 Subject: [PATCH 3/6] GH-34569: [C++] Diffing of Run-End Encoded arrays (#35003) ### What changes are included in this PR? Ability to diff run-end encoded arrays without expanding them first. This is useful for debugging and understanding of unit test failures. Many successive random accesses to REE arrays by logical indices incur many successive binary searches on the run-ends array to find the physical index of the values array, so this PR implements a few interesting optimizations -- the most complicated one is encapsulated in a new utility available via `ree_util.h` as there are many potential uses for it other than diffing. +-------------+---------------------------+----------------------------------------------------+ | | Optimizations | +-------------+---------------------------+-----------------------+----------------------------+ | # bsearches | Null checks on Comparator | RunLengthOfEqualsFrom | Cached FindPhysicalIndex() | +-------------+---------------------------+----------------------+-----------------------------+ | 3798 | | | | +-------------+---------------------------+----------------------+-----------------------------+ | 1914 | X | | | +-------------+---------------------------+----------------------+-----------------------------+ | 180 | X | X | | +-------------+---------------------------+----------------------+-----------------------------+ | 153 | X | | X | +-------------+---------------------------+----------------------+-----------------------------+ | 75 | X | X | X | +-------------+---------------------------+----------------------+-----------------------------+ ### Are these changes tested? Yes. By existing and newly added REE-specific tests. ### Are there any user-facing changes? No. * Closes: #34569 Authored-by: Felipe Oliveira Carvalho Signed-off-by: Benjamin Kietzman --- cpp/src/arrow/array/diff.cc | 319 +++++++++++++++++++++++++------ cpp/src/arrow/array/diff_test.cc | 107 +++++++++++ cpp/src/arrow/util/ree_util.cc | 56 ++++++ cpp/src/arrow/util/ree_util.h | 68 +++++++ 4 files changed, 491 insertions(+), 59 deletions(-) diff --git a/cpp/src/arrow/array/diff.cc b/cpp/src/arrow/array/diff.cc index f267c52c0d696..be9597e59b378 100644 --- a/cpp/src/arrow/array/diff.cc +++ b/cpp/src/arrow/array/diff.cc @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -32,6 +33,7 @@ #include "arrow/array/array_decimal.h" #include "arrow/array/array_nested.h" #include "arrow/array/array_primitive.h" +#include "arrow/array/array_run_end.h" #include "arrow/buffer.h" #include "arrow/buffer_builder.h" #include "arrow/extension_type.h" @@ -43,7 +45,9 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" #include "arrow/util/range.h" +#include "arrow/util/ree_util.h" #include "arrow/util/string.h" +#include "arrow/util/unreachable.h" #include "arrow/vendored/datetime.h" #include "arrow/visit_type_inline.h" @@ -96,45 +100,247 @@ static UnitSlice GetView(const UnionArray& array, int64_t index) { return UnitSlice{&array, index}; } -using ValueComparator = std::function; +/// \brief A simple virtual comparator interface for two arrays. +/// +/// The base and target array ara bound at construction time. Then +/// Equals(base_index, target_index) should return true if the values +/// at the given indices are equal. +struct ValueComparator { + virtual ~ValueComparator() = default; + + /// \brief Compare the validity and values at the given indices in the base and target + /// arrays. + /// + /// \param base_index The index in the base array. + /// \param target_index The index in the target array. + /// \return true if the values at the given indices are equal, false otherwise. + /// \pre base_index and target_index are valid indices in their respective arrays. + virtual bool Equals(int64_t base_index, int64_t target_index) = 0; + + /// \brief Return the run length of equal values starting at the given indices in the + /// base and target arrays. + /// + /// \param base_index The starting index in the base array. + /// \param base_length The length of the base array. + /// \param target_index The starting index in the target array. + /// \param target_length The length of the target array. + /// \return The run length of equal values starting at the given indices in the base + /// and target arrays. + virtual int64_t RunLengthOfEqualsFrom(int64_t base_index, int64_t base_length, + int64_t target_index, int64_t target_length) { + int64_t run_length_of_equals = 0; + while (base_index < base_length && target_index < target_length) { + if (!Equals(base_index, target_index)) { + break; + } + base_index += 1; + target_index += 1; + run_length_of_equals += 1; + } + return run_length_of_equals; + } +}; + +template +struct DefaultValueComparator : public ValueComparator { + const ArrayType& base; + const ArrayType& target; + + DefaultValueComparator(const ArrayType& base, const ArrayType& target) + : base(base), target(target) {} + + ~DefaultValueComparator() override = default; + + bool Equals(int64_t base_index, int64_t target_index) override { + const bool base_valid = base.IsValid(base_index); + const bool target_valid = target.IsValid(target_index); + if (base_valid && target_valid) { + return GetView(base, base_index) == GetView(target, target_index); + } + return base_valid == target_valid; + } +}; + +template +class REEValueComparator : public ValueComparator { + private: + const RunEndEncodedArray& base_; + const RunEndEncodedArray& target_; + std::unique_ptr inner_value_comparator_; + ree_util::PhysicalIndexFinder base_physical_index_finder_; + ree_util::PhysicalIndexFinder target_physical_index_finder_; + + public: + REEValueComparator(const RunEndEncodedArray& base, const RunEndEncodedArray& target, + std::unique_ptr&& inner_value_comparator) + : base_(base), + target_(target), + inner_value_comparator_(std::move(inner_value_comparator)), + base_physical_index_finder_(*base_.data()), + target_physical_index_finder_(*target_.data()) { + DCHECK_EQ(*base_.type(), *target_.type()); + } + + ~REEValueComparator() override = default; + + private: + /// \pre 0 <= i < base_.length() + inline int64_t FindPhysicalIndexOnBase(int64_t i) { + return base_physical_index_finder_.FindPhysicalIndex(i); + } + + /// \pre 0 <= i < target_.length() + inline int64_t FindPhysicalIndexOnTarget(int64_t i) { + return target_physical_index_finder_.FindPhysicalIndex(i); + } + + const RunEndCType* base_run_ends() { return base_physical_index_finder_.run_ends; } + + const RunEndCType* target_run_ends() { return target_physical_index_finder_.run_ends; } + + public: + int64_t RunLengthOfEqualsFrom(int64_t base_index, int64_t base_length, + int64_t target_index, int64_t target_length) override { + // Ensure the first search for physical index on the values arrays is safe. + if (base_index >= base_length || target_index >= target_length) { + // Without values on either side, there is no run of equal values. + return 0; + } + + // Translate the two logical indices into physical indices. + int64_t physical_base_index = FindPhysicalIndexOnBase(base_index); + int64_t physical_target_index = FindPhysicalIndexOnTarget(target_index); + + int64_t run_length_of_equals = 0; + // The loop invariant (base_index < base_length && target_index < target_length) + // is valid when the loop starts because of the check above. + for (;;) { + const auto base_run_end = + static_cast(base_run_ends()[physical_base_index]) - base_.offset(); + const auto target_run_end = + static_cast(target_run_ends()[physical_target_index]) - + target_.offset(); + // The end of the runs containing the logical indices, by definition, ends + // after the logical indices. + DCHECK_LT(base_index, base_run_end); + DCHECK_LT(target_index, target_run_end); + + // Compare the physical values that make up the runs containing base_index + // and target_index. + if (!inner_value_comparator_->Equals(physical_base_index, physical_target_index)) { + // First difference found, stop because the run of equal values cannot + // be extended further. + break; + } + + const int64_t base_run = std::min(base_run_end, base_length) - base_index; + const int64_t target_run = std::min(target_run_end, target_length) - target_index; + // Due to the loop-invariant (base_index < base_length && target_index < + // target_length) and properties of the run-ends asserted above, both base_run and + // target_run are strictly greater than zero. + DCHECK_GT(base_run, 0); + DCHECK_GT(target_run, 0); + + // Skip the smallest run (or both runs if they are equal) + const int64_t increment = std::min(base_run, target_run); + physical_base_index += increment == base_run; + physical_target_index += increment == target_run; + + // Since both base_run and target_run are greater than zero, + // increment is also greater than zero... + DCHECK_GT(increment, 0); + // ...which implies that the loop will make progress and eventually terminate + // because base_index or target_index will equal base_length or target_length, + // respectively. + base_index += increment; + target_index += increment; + // The value representing the two runs are equal, so we can assume that at + // least `increment` (size of smallest run) values are equal. + run_length_of_equals += increment; + + if (base_index >= base_length || target_index >= target_length) { + break; + } + } + + return run_length_of_equals; + } -struct ValueComparatorVisitor { + bool Equals(int64_t base_index, int64_t target_index) override { + const int64_t physical_base_index = FindPhysicalIndexOnBase(base_index); + const int64_t physical_target_index = FindPhysicalIndexOnTarget(target_index); + return inner_value_comparator_->Equals(physical_base_index, physical_target_index); + } +}; + +class ValueComparatorFactory { + private: + std::unique_ptr comparator_; + + public: template - Status Visit(const T&) { + Status Visit(const T&, const Array& base, const Array& target) { using ArrayType = typename TypeTraits::ArrayType; - out = [](const Array& base, int64_t base_index, const Array& target, - int64_t target_index) { - return (GetView(checked_cast(base), base_index) == - GetView(checked_cast(target), target_index)); - }; + comparator_ = std::make_unique>( + checked_cast(base), checked_cast(target)); return Status::OK(); } - Status Visit(const NullType&) { return Status::NotImplemented("null type"); } + Status Visit(const NullType&, const Array&, const Array&) { + return Status::NotImplemented("null type"); + } - Status Visit(const ExtensionType&) { return Status::NotImplemented("extension type"); } + Status Visit(const ExtensionType&, const Array&, const Array&) { + return Status::NotImplemented("extension type"); + } - Status Visit(const DictionaryType&) { + Status Visit(const DictionaryType&, const Array& base, const Array& target) { return Status::NotImplemented("dictionary type"); } - Status Visit(const RunEndEncodedType&) { - return Status::NotImplemented("run-end encoded type"); + Status Visit(const RunEndEncodedType& ree_type, const Array& base, + const Array& target) { + const auto& base_ree = checked_cast(base); + const auto& target_ree = checked_cast(target); + + ARROW_ASSIGN_OR_RAISE( + auto inner_values_comparator, + Create(*ree_type.value_type(), *base_ree.values(), *target_ree.values())); + + // Instantiate the specialized comparator types with operator new instead of + // make_unique() to avoid binary bloat. unique_ptr's constructor is templated + // on the type of the deleter and we're fine with destructor calls being virtually + // dispatched via ValueComparator. + ValueComparator* ree_value_comparator = nullptr; + switch (ree_type.run_end_type()->id()) { + case Type::INT16: + ree_value_comparator = new REEValueComparator( + base_ree, target_ree, std::move(inner_values_comparator)); + break; + case Type::INT32: + ree_value_comparator = new REEValueComparator( + base_ree, target_ree, std::move(inner_values_comparator)); + break; + case Type::INT64: + ree_value_comparator = new REEValueComparator( + base_ree, target_ree, std::move(inner_values_comparator)); + break; + default: + Unreachable(); + } + comparator_.reset(ree_value_comparator); + return Status::OK(); } - ValueComparator Create(const DataType& type) { - DCHECK_OK(VisitTypeInline(type, this)); - return out; + static Result> Create(const DataType& type, + const Array& base, + const Array& target) { + ValueComparatorFactory self; + RETURN_NOT_OK(VisitTypeInline(type, &self, base, target)); + return std::move(self.comparator_); } - - ValueComparator out; }; -ValueComparator GetValueComparator(const DataType& type) { - ValueComparatorVisitor type_visitor; - return type_visitor.Create(type); -} - // represents an intermediate state in the comparison of two arrays struct EditPoint { int64_t base, target; @@ -161,33 +367,9 @@ struct EditPoint { class QuadraticSpaceMyersDiff { public: QuadraticSpaceMyersDiff(const Array& base, const Array& target, MemoryPool* pool) - : base_(base), - target_(target), - pool_(pool), - value_comparator_(GetValueComparator(*base.type())), - base_begin_(0), - base_end_(base.length()), - target_begin_(0), - target_end_(target.length()), - endpoint_base_({ExtendFrom({base_begin_, target_begin_}).base}), - insert_({true}) { - if ((base_end_ - base_begin_ == target_end_ - target_begin_) && - endpoint_base_[0] == base_end_) { - // trivial case: base == target - finish_index_ = 0; - } - } - - bool ValuesEqual(int64_t base_index, int64_t target_index) const { - bool base_null = base_.IsNull(base_index); - bool target_null = target_.IsNull(target_index); - if (base_null || target_null) { - // If only one is null, then this is false, otherwise true - return base_null && target_null; - } - return value_comparator_(base_, base_index, target_, target_index); - } + : base_(base), target_(target), pool_(pool) {} + private: // increment the position within base (the element pointed to was deleted) // then extend maximally EditPoint DeleteOne(EditPoint p) const { @@ -209,11 +391,10 @@ class QuadraticSpaceMyersDiff { // increment the position within base and target (the elements skipped in this way were // present in both sequences) EditPoint ExtendFrom(EditPoint p) const { - for (; p.base != base_end_ && p.target != target_end_; ++p.base, ++p.target) { - if (!ValuesEqual(p.base, p.target)) { - break; - } - } + const int64_t run_length_of_equals = + _comparator->RunLengthOfEqualsFrom(p.base, base_end_, p.target, target_end_); + p.base += run_length_of_equals; + p.target += run_length_of_equals; return p; } @@ -321,7 +502,24 @@ class QuadraticSpaceMyersDiff { {field("insert", boolean()), field("run_length", int64())}); } + public: Result> Diff() { + base_begin_ = 0; + base_end_ = base_.length(); + target_begin_ = 0; + target_end_ = target_.length(); + ARROW_ASSIGN_OR_RAISE(_comparator, + ValueComparatorFactory::Create(*base_.type(), base_, target_)); + + finish_index_ = -1; + edit_count_ = 0; + endpoint_base_ = {ExtendFrom({base_begin_, target_begin_}).base}; + insert_ = {true}; + if ((base_end_ - base_begin_ == target_end_ - target_begin_) && + endpoint_base_[0] == base_end_) { + // trivial case: base == target + finish_index_ = 0; + } while (!Done()) { Next(); } @@ -329,14 +527,19 @@ class QuadraticSpaceMyersDiff { } private: + // Constructor-injected references const Array& base_; const Array& target_; MemoryPool* pool_; - ValueComparator value_comparator_; + + // Initialized on Diff() and immutable thereafter + int64_t base_begin_ = 0, base_end_ = -1; + int64_t target_begin_ = 0, target_end_ = -1; + std::unique_ptr _comparator; + + // Initialized on Next() and mutated throughout the diffing process int64_t finish_index_ = -1; int64_t edit_count_ = 0; - int64_t base_begin_, base_end_; - int64_t target_begin_, target_end_; // each element of endpoint_base_ is the furthest position in base reachable given an // edit_count and (# insertions) - (# deletions). Each bit of insert_ records whether // the corresponding furthest position was reached via an insertion or a deletion @@ -386,8 +589,6 @@ Result> Diff(const Array& base, const Array& target return Diff(*base_storage, *target_storage, pool); } else if (base.type()->id() == Type::DICTIONARY) { return Status::NotImplemented("diffing arrays of type ", *base.type()); - } else if (base.type()->id() == Type::RUN_END_ENCODED) { - return Status::NotImplemented("diffing arrays of type ", *base.type()); } else { return QuadraticSpaceMyersDiff(base, target, pool).Diff(); } diff --git a/cpp/src/arrow/array/diff_test.cc b/cpp/src/arrow/array/diff_test.cc index 4c2f39eddf863..145978a91ad54 100644 --- a/cpp/src/arrow/array/diff_test.cc +++ b/cpp/src/arrow/array/diff_test.cc @@ -163,6 +163,107 @@ class DiffTest : public ::testing::Test { AssertRunLengthIs("[2, 0, 0]"); } + std::shared_ptr RunEndEncodedArrayFromJSON( + int64_t logical_length, const std::shared_ptr& ree_type, + std::string_view run_ends_json, std::string_view values_json, + int64_t logical_offset = 0) { + auto& ree_type_ref = checked_cast(*ree_type); + auto run_ends = ArrayFromJSON(ree_type_ref.run_end_type(), run_ends_json); + auto values = ArrayFromJSON(ree_type_ref.value_type(), values_json); + return RunEndEncodedArray::Make(logical_length, std::move(run_ends), + std::move(values), logical_offset) + .ValueOrDie(); + } + + template + void TestBasicsWithREEs() { + auto run_end_type = std::make_shared(); + auto value_type = utf8(); + auto ree_type = run_end_encoded(run_end_type, value_type); + + // empty REEs + base_ = RunEndEncodedArrayFromJSON(0, ree_type, "[]", "[]"); + target_ = RunEndEncodedArrayFromJSON(0, ree_type, "[]", "[]"); + DoDiff(); + AssertInsertIs("[false]"); + AssertRunLengthIs("[0]"); + + // null REE arrays of different lengths + base_ = RunEndEncodedArrayFromJSON(2, ree_type, "[2]", "[null]"); + target_ = RunEndEncodedArrayFromJSON(4, ree_type, "[4]", "[null]"); + DoDiff(); + AssertInsertIs("[false, true, true]"); + AssertRunLengthIs("[2, 0, 0]"); + + // identical REE arrays w/ offsets + base_ = + RunEndEncodedArrayFromJSON(110, ree_type, R"([20, 120])", R"(["a", "b"])", 10); + target_ = + RunEndEncodedArrayFromJSON(110, ree_type, R"([20, 120])", R"(["a", "b"])", 10); + DoDiff(); + AssertInsertIs("[false]"); + AssertRunLengthIs("[110]"); + + // equivalent REE arrays + base_ = RunEndEncodedArrayFromJSON(120, ree_type, R"([10, 20, 120])", + R"(["a", "a", "b"])"); + target_ = RunEndEncodedArrayFromJSON(120, ree_type, R"([20, 30, 120])", + R"(["a", "b", "b"])"); + DoDiff(); + AssertInsertIs("[false]"); + AssertRunLengthIs("[120]"); + + // slice so last run-end goes beyond length + base_ = base_->Slice(5, 105); + target_ = target_->Slice(5, 105); + DoDiff(); + AssertInsertIs("[false]"); + AssertRunLengthIs("[105]"); + + // insert one + base_ = RunEndEncodedArrayFromJSON(12, ree_type, R"([2, 12])", R"(["a", "b"])"); + target_ = RunEndEncodedArrayFromJSON(13, ree_type, R"([3, 13])", R"(["a", "b"])"); + DoDiff(); + AssertInsertIs("[false, true]"); + AssertRunLengthIs("[2, 10]"); + + // delete one + base_ = + RunEndEncodedArrayFromJSON(13, ree_type, R"([2, 5, 13])", R"(["a", "b", "c"])"); + target_ = + RunEndEncodedArrayFromJSON(12, ree_type, R"([2, 4, 12])", R"(["a", "b", "c"])"); + DoDiff(); + AssertInsertIs("[false, false]"); + AssertRunLengthIs("[4, 8]"); + + // null out one + base_ = + RunEndEncodedArrayFromJSON(12, ree_type, R"([2, 5, 12])", R"(["a", "b", "c"])"); + target_ = RunEndEncodedArrayFromJSON(12, ree_type, R"([2, 4, 5, 12])", + R"(["a", "b", null, "c"])"); + DoDiff(); + AssertInsertIs("[false, false, true]"); + AssertRunLengthIs("[4, 0, 7]"); + + // append some + base_ = RunEndEncodedArrayFromJSON(12, ree_type, R"([2, 4, 8, 12])", + R"(["a", "b", "c", "d"])"); + target_ = RunEndEncodedArrayFromJSON(15, ree_type, R"([2, 4, 8, 13, 15])", + R"(["a", "b", "c", "d", "e"])"); + DoDiff(); + AssertInsertIs("[false, true, true, true]"); + AssertRunLengthIs("[12, 0, 0, 0]"); + + // prepend some + base_ = RunEndEncodedArrayFromJSON(12, ree_type, R"([2, 4, 8, 12])", + R"(["c", "d", "e", "f"])"); + target_ = RunEndEncodedArrayFromJSON(15, ree_type, R"([1, 3, 5, 7, 11, 15])", + R"(["a", "b", "c", "d", "e", "f"])"); + DoDiff(); + AssertInsertIs("[false, true, true, true]"); + AssertRunLengthIs("[0, 0, 0, 12]"); + } + random::RandomArrayGenerator rng_; std::shared_ptr edits_; std::shared_ptr base_, target_; @@ -415,6 +516,12 @@ TEST_F(DiffTest, BasicsWithSparseUnions) { TestBasicsWithUnions(UnionMode::SPARS TEST_F(DiffTest, BasicsWithDenseUnions) { TestBasicsWithUnions(UnionMode::DENSE); } +TEST_F(DiffTest, BasicsWithREEs) { + TestBasicsWithREEs(); + TestBasicsWithREEs(); + TestBasicsWithREEs(); +} + TEST_F(DiffTest, UnifiedDiffFormatter) { // no changes base_ = ArrayFromJSON(utf8(), R"(["give", "me", "a", "break"])"); diff --git a/cpp/src/arrow/util/ree_util.cc b/cpp/src/arrow/util/ree_util.cc index fcd6c204e06b2..819de5eb60c63 100644 --- a/cpp/src/arrow/util/ree_util.cc +++ b/cpp/src/arrow/util/ree_util.cc @@ -61,6 +61,62 @@ int64_t LogicalNullCount(const ArraySpan& span) { return LogicalNullCount(span); } +namespace internal { + +/// \pre 0 <= i < array_span.length() +template +int64_t FindPhysicalIndexImpl(PhysicalIndexFinder& self, int64_t i) { + DCHECK_LT(i, self.array_span.length); + const int64_t run_ends_size = ree_util::RunEndsArray(self.array_span).length; + DCHECK_LT(self.last_physical_index, run_ends_size); + // This access to self.run_ends[last_physical_index] is alwas safe because: + // 1. 0 <= i < array_span.length() implies there is at least one run and the initial + // value 0 will be safe to index with. + // 2. last_physical_index > 0 is always the result of a valid call to + // internal::FindPhysicalIndex. + if (ARROW_PREDICT_TRUE(self.array_span.offset + i < + self.run_ends[self.last_physical_index])) { + // The cached value is an upper-bound, but is it the least upper-bound? + if (self.last_physical_index == 0 || + self.array_span.offset + i >= self.run_ends[self.last_physical_index - 1]) { + return self.last_physical_index; + } + // last_physical_index - 1 is a candidate for the least upper-bound, + // so search for the least upper-bound in the range that includes it. + const int64_t j = ree_util::internal::FindPhysicalIndex( + self.run_ends, /*run_ends_size=*/self.last_physical_index, i, + self.array_span.offset); + DCHECK_LT(j, self.last_physical_index); + return self.last_physical_index = j; + } + + // last_physical_index is not an upper-bound, and the logical index i MUST be + // in the runs that follow it. Since i is a valid logical index, we know that at least + // one extra run is present. + DCHECK_LT(self.last_physical_index + 1, run_ends_size); + const int64_t min_physical_index = self.last_physical_index + 1; + + const int64_t j = ree_util::internal::FindPhysicalIndex( + /*run_ends=*/self.run_ends + min_physical_index, + /*run_ends_size=*/run_ends_size - min_physical_index, i, self.array_span.offset); + DCHECK_LT(min_physical_index + j, run_ends_size); + return self.last_physical_index = min_physical_index + j; +} + +int64_t FindPhysicalIndexImpl16(PhysicalIndexFinder& self, int64_t i) { + return FindPhysicalIndexImpl(self, i); +} + +int64_t FindPhysicalIndexImpl32(PhysicalIndexFinder& self, int64_t i) { + return FindPhysicalIndexImpl(self, i); +} + +int64_t FindPhysicalIndexImpl64(PhysicalIndexFinder& self, int64_t i) { + return FindPhysicalIndexImpl(self, i); +} + +} // namespace internal + int64_t FindPhysicalIndex(const ArraySpan& span, int64_t i, int64_t absolute_offset) { const auto type_id = RunEndsArray(span).type->id(); if (type_id == Type::INT16) { diff --git a/cpp/src/arrow/util/ree_util.h b/cpp/src/arrow/util/ree_util.h index a3d3d16c0da95..2b7940154a50b 100644 --- a/cpp/src/arrow/util/ree_util.h +++ b/cpp/src/arrow/util/ree_util.h @@ -23,6 +23,7 @@ #include "arrow/array/data.h" #include "arrow/type_traits.h" +#include "arrow/util/checked_cast.h" #include "arrow/util/macros.h" namespace arrow { @@ -139,6 +140,69 @@ int64_t FindPhysicalLength(const ArraySpan& span) { /*offset=*/span.offset); } +template +struct PhysicalIndexFinder; + +// non-inline implementations for each run-end type +ARROW_EXPORT int64_t FindPhysicalIndexImpl16(PhysicalIndexFinder& self, + int64_t i); +ARROW_EXPORT int64_t FindPhysicalIndexImpl32(PhysicalIndexFinder& self, + int64_t i); +ARROW_EXPORT int64_t FindPhysicalIndexImpl64(PhysicalIndexFinder& self, + int64_t i); + +/// \brief Stateful version of FindPhysicalIndex() that caches the result of +/// the previous search and uses it to optimize the next search. +/// +/// When new queries for the physical index of a logical index come in, +/// binary search is performed again but the first candidate checked is the +/// result of the previous search (cached physical index) instead of the +/// midpoint of the run-ends array. +/// +/// If that test fails, internal::FindPhysicalIndex() is called with one of the +/// partitions defined by the cached index. If the queried logical indices +/// follow an increasing or decreasing pattern, this first test is much more +/// effective in (1) finding the answer right away (close logical indices belong +/// to the same runs) or (2) discarding many more candidates than probing +/// the midpoint would. +/// +/// The most adversarial case (i.e. alternating between 0 and length-1 queries) +/// only adds one extra binary search probe when compared to always starting +/// binary search from the midpoint without any of these optimizations. +/// +/// \tparam RunEndCType The numeric type of the run-ends array. +template +struct PhysicalIndexFinder { + const ArraySpan array_span; + const RunEndCType* run_ends; + int64_t last_physical_index = 0; + + explicit PhysicalIndexFinder(const ArrayData& data) + : array_span(data), + run_ends(RunEndsArray(array_span).template GetValues(1)) { + assert(CTypeTraits::ArrowType::type_id == + ::arrow::internal::checked_cast(*data.type) + .run_end_type() + ->id()); + } + + /// \brief Find the physical index into the values array of the REE array. + /// + /// \pre 0 <= i < array_span.length() + /// \param i the logical index into the REE array + /// \return the physical index into the values array + int64_t FindPhysicalIndex(int64_t i) { + if constexpr (std::is_same_v) { + return FindPhysicalIndexImpl16(*this, i); + } else if constexpr (std::is_same_v) { + return FindPhysicalIndexImpl32(*this, i); + } else { + static_assert(std::is_same_v, "Unsupported RunEndCType."); + return FindPhysicalIndexImpl64(*this, i); + } + } +}; + } // namespace internal /// \brief Find the physical index into the values array of the REE ArraySpan @@ -166,6 +230,10 @@ ARROW_EXPORT std::pair FindPhysicalRange(const ArraySpan& span int64_t offset, int64_t length); +// Publish PhysicalIndexFinder outside of the internal namespace. +template +using PhysicalIndexFinder = internal::PhysicalIndexFinder; + template class RunEndEncodedArraySpan { private: From 471334dbf4a1282de3afd52faa16eefbca941adb Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 27 Oct 2023 09:51:00 +0900 Subject: [PATCH 4/6] GH-38424: [CI][C++] Use Fedora 38 instead of 35 (#38425) ### Rationale for this change Because Fedora 35 reached EOL. ### What changes are included in this PR? Use Fedora 38. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * Closes: #38424 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- .env | 2 +- ...pp.dockerfile => fedora-38-cpp.dockerfile} | 9 ++--- cpp/CMakeLists.txt | 6 +-- .../{FindORC.cmake => FindorcAlt.cmake} | 37 +++++++++++-------- cpp/cmake_modules/ThirdpartyToolchain.cmake | 34 ++++++++++------- cpp/src/arrow/adapters/orc/CMakeLists.txt | 2 +- dev/tasks/tasks.yml | 8 ++-- docker-compose.yml | 6 +-- 8 files changed, 57 insertions(+), 47 deletions(-) rename ci/docker/{fedora-35-cpp.dockerfile => fedora-38-cpp.dockerfile} (95%) rename cpp/cmake_modules/{FindORC.cmake => FindorcAlt.cmake} (68%) diff --git a/.env b/.env index a551e2120a6fb..6746892fd4ed8 100644 --- a/.env +++ b/.env @@ -49,7 +49,7 @@ ULIMIT_CORE=-1 ALMALINUX=8 ALPINE_LINUX=3.16 DEBIAN=11 -FEDORA=35 +FEDORA=38 UBUNTU=20.04 # Default versions for various dependencies diff --git a/ci/docker/fedora-35-cpp.dockerfile b/ci/docker/fedora-38-cpp.dockerfile similarity index 95% rename from ci/docker/fedora-35-cpp.dockerfile rename to ci/docker/fedora-38-cpp.dockerfile index aefa25663ba14..2dcc094ee20c5 100644 --- a/ci/docker/fedora-35-cpp.dockerfile +++ b/ci/docker/fedora-38-cpp.dockerfile @@ -16,7 +16,7 @@ # under the License. ARG arch -FROM ${arch}/fedora:35 +FROM ${arch}/fedora:38 ARG arch # install dependencies @@ -46,9 +46,9 @@ RUN dnf update -y && \ java-latest-openjdk-devel \ java-latest-openjdk-headless \ json-devel \ + liborc-devel \ libzstd-devel \ llvm-devel \ - llvm-static \ lz4-devel \ make \ ninja-build \ @@ -64,6 +64,7 @@ RUN dnf update -y && \ utf8proc-devel \ wget \ which \ + xsimd-devel \ zlib-devel COPY ci/scripts/install_minio.sh /arrow/ci/scripts/ @@ -100,8 +101,6 @@ ENV absl_SOURCE=BUNDLED \ CC=gcc \ CXX=g++ \ google_cloud_cpp_storage_SOURCE=BUNDLED \ - ORC_SOURCE=BUNDLED \ PARQUET_BUILD_EXAMPLES=ON \ PARQUET_BUILD_EXECUTABLES=ON \ - PATH=/usr/lib/ccache/:$PATH \ - xsimd_SOURCE=BUNDLED + PATH=/usr/lib/ccache/:$PATH diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 8566508406bd4..7138139b13ec0 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -770,10 +770,10 @@ if(ARROW_WITH_ZSTD) endif() if(ARROW_ORC) - list(APPEND ARROW_SHARED_LINK_LIBS orc::liborc ${ARROW_PROTOBUF_LIBPROTOBUF}) - list(APPEND ARROW_STATIC_LINK_LIBS orc::liborc ${ARROW_PROTOBUF_LIBPROTOBUF}) + list(APPEND ARROW_SHARED_LINK_LIBS orc::orc ${ARROW_PROTOBUF_LIBPROTOBUF}) + list(APPEND ARROW_STATIC_LINK_LIBS orc::orc ${ARROW_PROTOBUF_LIBPROTOBUF}) if(ORC_SOURCE STREQUAL "SYSTEM") - list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS orc::liborc + list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS orc::orc ${ARROW_PROTOBUF_LIBPROTOBUF}) endif() endif() diff --git a/cpp/cmake_modules/FindORC.cmake b/cpp/cmake_modules/FindorcAlt.cmake similarity index 68% rename from cpp/cmake_modules/FindORC.cmake rename to cpp/cmake_modules/FindorcAlt.cmake index aca915acc13d0..dc3b978cf4037 100644 --- a/cpp/cmake_modules/FindORC.cmake +++ b/cpp/cmake_modules/FindorcAlt.cmake @@ -15,13 +15,20 @@ # specific language governing permissions and limitations # under the License. -# - Find Apache ORC C++ (orc/orc-config.h, liborc.a) -# This module defines -# ORC_INCLUDE_DIR, directory containing headers -# ORC_STATIC_LIB, path to liborc.a -# ORC_FOUND, whether orc has been found +if(orcAlt_FOUND) + return() +endif() -if(ORC_FOUND) +set(find_package_args) +if(orcAlt_FIND_VERSION) + list(APPEND find_package_args ${orcAlt_FIND_VERSION}) +endif() +if(orcAlt_FIND_QUIETLY) + list(APPEND find_package_args QUIET) +endif() +find_package(orc ${find_package_args}) +if(orc_FOUND) + set(orcAlt_FOUND TRUE) return() endif() @@ -45,15 +52,13 @@ else() PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) endif() -if(ORC_STATIC_LIB AND ORC_INCLUDE_DIR) - set(ORC_FOUND TRUE) - add_library(orc::liborc STATIC IMPORTED) - set_target_properties(orc::liborc - PROPERTIES IMPORTED_LOCATION "${ORC_STATIC_LIB}" - INTERFACE_INCLUDE_DIRECTORIES "${ORC_INCLUDE_DIR}") -else() - if(ORC_FIND_REQUIRED) - message(FATAL_ERROR "ORC library was required in toolchain and unable to locate") +find_package_handle_standard_args(orcAlt REQUIRED_VARS ORC_STATIC_LIB ORC_INCLUDE_DIR) + +if(orcAlt_FOUND) + if(NOT TARGET orc::orc) + add_library(orc::orc STATIC IMPORTED) + set_target_properties(orc::orc + PROPERTIES IMPORTED_LOCATION "${ORC_STATIC_LIB}" + INTERFACE_INCLUDE_DIRECTORIES "${ORC_INCLUDE_DIR}") endif() - set(ORC_FOUND FALSE) endif() diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 559ddf14f6a91..5de8ff9b1cb11 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -65,7 +65,7 @@ set(ARROW_THIRDPARTY_DEPENDENCIES lz4 nlohmann_json opentelemetry-cpp - ORC + orc re2 Protobuf RapidJSON @@ -94,6 +94,14 @@ if("${re2_SOURCE}" STREQUAL "" AND NOT "${RE2_SOURCE}" STREQUAL "") set(re2_SOURCE ${RE2_SOURCE}) endif() +# For backward compatibility. We use "ORC_SOURCE" if "orc_SOURCE" +# isn't specified and "ORC_SOURCE" is specified. +# We renamed "ORC" dependency name to "orc" in 15.0.0 because +# upstream uses "orc" not "ORC" as package name. +if("${orc_SOURCE}" STREQUAL "" AND NOT "${ORC_SOURCE}" STREQUAL "") + set(orc_SOURCE ${ORC_SOURCE}) +endif() + # For backward compatibility. We use "RE2_ROOT" if "re2_ROOT" # isn't specified and "RE2_ROOT" is specified. if("${re2_ROOT}" STREQUAL "" AND NOT "${RE2_ROOT}" STREQUAL "") @@ -193,7 +201,7 @@ macro(build_dependency DEPENDENCY_NAME) build_nlohmann_json() elseif("${DEPENDENCY_NAME}" STREQUAL "opentelemetry-cpp") build_opentelemetry() - elseif("${DEPENDENCY_NAME}" STREQUAL "ORC") + elseif("${DEPENDENCY_NAME}" STREQUAL "orc") build_orc() elseif("${DEPENDENCY_NAME}" STREQUAL "Protobuf") build_protobuf() @@ -4423,31 +4431,31 @@ macro(build_orc) set(ORC_VENDORED 1) - add_library(orc::liborc STATIC IMPORTED) - set_target_properties(orc::liborc PROPERTIES IMPORTED_LOCATION "${ORC_STATIC_LIB}") - target_include_directories(orc::liborc BEFORE INTERFACE "${ORC_INCLUDE_DIR}") - set(ORC_LINK_LIBRARIES LZ4::lz4 ZLIB::ZLIB ${ARROW_ZSTD_LIBZSTD} ${Snappy_TARGET}) + add_library(orc::orc STATIC IMPORTED) + set_target_properties(orc::orc PROPERTIES IMPORTED_LOCATION "${ORC_STATIC_LIB}") + target_include_directories(orc::orc BEFORE INTERFACE "${ORC_INCLUDE_DIR}") + target_link_libraries(orc::orc INTERFACE LZ4::lz4 ZLIB::ZLIB ${ARROW_ZSTD_LIBZSTD} + ${Snappy_TARGET}) # Protobuf generated files may use ABSL_DCHECK*() and # absl::log_internal_check_op is needed for them. if(TARGET absl::log_internal_check_op) - list(APPEND ORC_LINK_LIBRARIES absl::log_internal_check_op) + target_link_libraries(orc::orc INTERFACE absl::log_internal_check_op) endif() if(NOT MSVC) if(NOT APPLE AND ARROW_ENABLE_THREADING) - list(APPEND ORC_LINK_LIBRARIES Threads::Threads) + target_link_libraries(orc::orc INTERFACE Threads::Threads) endif() - list(APPEND ORC_LINK_LIBRARIES ${CMAKE_DL_LIBS}) + target_link_libraries(orc::orc INTERFACE ${CMAKE_DL_LIBS}) endif() - target_link_libraries(orc::liborc INTERFACE ${ORC_LINK_LIBRARIES}) add_dependencies(toolchain orc_ep) - add_dependencies(orc::liborc orc_ep) + add_dependencies(orc::orc orc_ep) - list(APPEND ARROW_BUNDLED_STATIC_LIBS orc::liborc) + list(APPEND ARROW_BUNDLED_STATIC_LIBS orc::orc) endmacro() if(ARROW_ORC) - resolve_dependency(ORC) + resolve_dependency(orc HAVE_ALT TRUE) message(STATUS "Found ORC static library: ${ORC_STATIC_LIB}") message(STATUS "Found ORC headers: ${ORC_INCLUDE_DIR}") endif() diff --git a/cpp/src/arrow/adapters/orc/CMakeLists.txt b/cpp/src/arrow/adapters/orc/CMakeLists.txt index e8ff69c191fb1..44719379994d9 100644 --- a/cpp/src/arrow/adapters/orc/CMakeLists.txt +++ b/cpp/src/arrow/adapters/orc/CMakeLists.txt @@ -32,7 +32,7 @@ else() set(ARROW_LIBRARIES_FOR_STATIC_TESTS arrow_testing_shared arrow_shared) endif() -set(ORC_STATIC_TEST_LINK_LIBS orc::liborc ${ARROW_LIBRARIES_FOR_STATIC_TESTS} +set(ORC_STATIC_TEST_LINK_LIBS orc::orc ${ARROW_LIBRARIES_FOR_STATIC_TESTS} ${ARROW_GTEST_GTEST_MAIN} ${ARROW_GTEST_GTEST}) add_arrow_test(adapter_test diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 17123ed2b8339..b620c9b8c8b0c 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1182,12 +1182,12 @@ tasks: image: debian-cpp {% endfor %} - test-fedora-35-cpp: + test-fedora-38-cpp: ci: github template: docker-tests/github.linux.yml params: env: - FEDORA: 35 + FEDORA: 38 image: fedora-cpp {% for cpp_standard in [20] %} @@ -1295,12 +1295,12 @@ tasks: UBUNTU: 22.04 image: ubuntu-python - test-fedora-35-python-3: + test-fedora-38-python-3: ci: azure template: docker-tests/azure.linux.yml params: env: - FEDORA: 35 + FEDORA: 38 image: fedora-python test-r-linux-valgrind: diff --git a/docker-compose.yml b/docker-compose.yml index c23d3ff08e487..bbe4e65c10c43 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -655,7 +655,7 @@ services: # docker-compose run --rm fedora-cpp # Parameters: # ARCH: amd64, arm64v8, ... - # FEDORA: 33 + # FEDORA: 38 image: ${REPO}:${ARCH}-fedora-${FEDORA}-cpp build: context: . @@ -671,7 +671,6 @@ services: <<: [*common, *ccache, *sccache, *cpp] ARROW_ENABLE_TIMING_TESTS: # inherit ARROW_MIMALLOC: "ON" - Protobuf_SOURCE: "BUNDLED" # Need Protobuf >= 3.15 volumes: &fedora-volumes - .:/arrow:delegated - ${DOCKER_VOLUME_PREFIX}fedora-ccache:/ccache:delegated @@ -957,7 +956,7 @@ services: # docker-compose run --rm fedora-python # Parameters: # ARCH: amd64, arm64v8, ... - # FEDORA: 33 + # FEDORA: 38 image: ${REPO}:${ARCH}-fedora-${FEDORA}-python-3 build: context: . @@ -969,7 +968,6 @@ services: shm_size: *shm-size environment: <<: [*common, *ccache] - Protobuf_SOURCE: "BUNDLED" # Need Protobuf >= 3.15 volumes: *fedora-volumes command: *python-command From 6e6fd0a30aebee6738bb4bb703344f0c866debea Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 27 Oct 2023 09:52:07 +0900 Subject: [PATCH 5/6] GH-38378: [C++][Parquet] Don't initialize OpenSSL explicitly with OpenSSL 1.1 (#38379) ### Rationale for this change This explicit initialization is for Valgrind and OpenSSL 1.1 will be unsupported eventually. So we can disable this explicit initialization for OpenSSL 1.1. ### What changes are included in this PR? Disable the explicit initialization with OpenSSL 1.1. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * Closes: #38378 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- cpp/src/parquet/encryption/openssl_internal.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/src/parquet/encryption/openssl_internal.cc b/cpp/src/parquet/encryption/openssl_internal.cc index 05f2773532353..61ec81b490802 100644 --- a/cpp/src/parquet/encryption/openssl_internal.cc +++ b/cpp/src/parquet/encryption/openssl_internal.cc @@ -24,11 +24,14 @@ namespace parquet::encryption::openssl { void EnsureInitialized() { +// OpenSSL 1.1 doesn't provide OPENSSL_INIT_ENGINE_ALL_BUILTIN. +#ifdef OPENSSL_INIT_ENGINE_ALL_BUILTIN // Initialize ciphers and random engines if (!OPENSSL_init_crypto(OPENSSL_INIT_ENGINE_ALL_BUILTIN | OPENSSL_INIT_ADD_ALL_CIPHERS, NULL)) { throw ParquetException("OpenSSL initialization failed"); } +#endif } } // namespace parquet::encryption::openssl From 547b2406d71bf9ad14e2bcc7064c6923165dd737 Mon Sep 17 00:00:00 2001 From: sgilmore10 <74676073+sgilmore10@users.noreply.github.com> Date: Fri, 27 Oct 2023 11:01:19 -0400 Subject: [PATCH 6/6] GH-38166: [MATLAB] Improve tabular object display (#38482) ### Rationale for this change Currently, the display for `arrow.tabular.RecordBatch` and `arrow.tabular.Table` are not very MATLAB-like. ### What changes are included in this PR? 1. Updated the display of both `arrow.tabular.Table` and `arrow.tabular.RecordBatch`. 2. Added a new utility function `arrow.internal.display.boldFontIfPossible` 3. Renamed `arrow.array.internal.display.pluralizeStringIfNeeded` to `arrow.internal.display.pluralizeStringIfNeeded` 4. Renamed `arrow.tabular.internal.displaySchema` to `arrow.tabular.internal.display.getSchemaString`. **Example RecordBatch Display** ```matlab >> t = table(1, false, datetime(2023, 1, 1), VariableNames=["Number", "Logical", "Date"]); >> rb = arrow.recordBatch(t) rb = Arrow RecordBatch with 1 row and 3 columns: Schema: Number: Float64 | Logical: Boolean | Date: Timestamp First Row: 1 | false | 2023-01-01 00:00:00.000000 ``` **Example Table Display** ```matlab >> t = table(1, false, datetime(2023, 1, 1), VariableNames=["Number", "Logical", "Date"]); >> arrowTable = arrow.table(t) arrowTable = Arrow Table with 1 row and 3 columns: Schema: Number: Float64 | Logical: Boolean | Date: Timestamp First Row: 1 | false | 2023-01-01 00:00:00.000000 ``` ### Are these changes tested? Yes, I added a new test class in `matlab/test/arrow/tabular` called `tTabularDisplay.m`. ### Are there any user-facing changes? Yes. Users will now see the new `Table`/`RecordBatch` display * Closes: #38166 Authored-by: Sarah Gilmore Signed-off-by: Kevin Gurney --- .../+array/+internal/+display/getHeader.m | 2 +- .../+internal/+display/boldFontIfPossible.m | 26 +++ .../+display/pluralizeStringIfNeeded.m | 0 .../getSchemaString.m} | 7 +- .../+internal/+display/getTabularDisplay.m | 44 ++++++ .../+internal/+display/getTabularHeader.m | 32 ++++ .../src/matlab/+arrow/+tabular/RecordBatch.m | 4 +- matlab/src/matlab/+arrow/+tabular/Schema.m | 2 +- matlab/src/matlab/+arrow/+tabular/Table.m | 5 +- matlab/test/arrow/tabular/tTabularDisplay.m | 148 ++++++++++++++++++ 10 files changed, 262 insertions(+), 8 deletions(-) create mode 100644 matlab/src/matlab/+arrow/+internal/+display/boldFontIfPossible.m rename matlab/src/matlab/+arrow/{+array => }/+internal/+display/pluralizeStringIfNeeded.m (100%) rename matlab/src/matlab/+arrow/+tabular/+internal/{displaySchema.m => +display/getSchemaString.m} (92%) create mode 100644 matlab/src/matlab/+arrow/+tabular/+internal/+display/getTabularDisplay.m create mode 100644 matlab/src/matlab/+arrow/+tabular/+internal/+display/getTabularHeader.m create mode 100644 matlab/test/arrow/tabular/tTabularDisplay.m diff --git a/matlab/src/matlab/+arrow/+array/+internal/+display/getHeader.m b/matlab/src/matlab/+arrow/+array/+internal/+display/getHeader.m index 5c8704d5bf2a4..85301ddefaada 100644 --- a/matlab/src/matlab/+arrow/+array/+internal/+display/getHeader.m +++ b/matlab/src/matlab/+arrow/+array/+internal/+display/getHeader.m @@ -16,7 +16,7 @@ % permissions and limitations under the License. function header = getHeader(className, numElements, numNulls) - import arrow.array.internal.display.pluralizeStringIfNeeded + import arrow.internal.display.pluralizeStringIfNeeded elementString = pluralizeStringIfNeeded(numElements, "element"); nullString = pluralizeStringIfNeeded(numNulls, "null value"); diff --git a/matlab/src/matlab/+arrow/+internal/+display/boldFontIfPossible.m b/matlab/src/matlab/+arrow/+internal/+display/boldFontIfPossible.m new file mode 100644 index 0000000000000..cb980cbff99e8 --- /dev/null +++ b/matlab/src/matlab/+arrow/+internal/+display/boldFontIfPossible.m @@ -0,0 +1,26 @@ +%BOLDFONTIFPOSSIBLE Bolds the input string if possible + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +function str = boldFontIfPossible(str) + + arguments + str(1, 1) string {mustBeNonzeroLengthText} + end + if usejava("desktop") + str = compose("%s", str); + end +end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+array/+internal/+display/pluralizeStringIfNeeded.m b/matlab/src/matlab/+arrow/+internal/+display/pluralizeStringIfNeeded.m similarity index 100% rename from matlab/src/matlab/+arrow/+array/+internal/+display/pluralizeStringIfNeeded.m rename to matlab/src/matlab/+arrow/+internal/+display/pluralizeStringIfNeeded.m diff --git a/matlab/src/matlab/+arrow/+tabular/+internal/displaySchema.m b/matlab/src/matlab/+arrow/+tabular/+internal/+display/getSchemaString.m similarity index 92% rename from matlab/src/matlab/+arrow/+tabular/+internal/displaySchema.m rename to matlab/src/matlab/+arrow/+tabular/+internal/+display/getSchemaString.m index 8d6740b195abc..7da945ca993ef 100644 --- a/matlab/src/matlab/+arrow/+tabular/+internal/displaySchema.m +++ b/matlab/src/matlab/+arrow/+tabular/+internal/+display/getSchemaString.m @@ -1,4 +1,5 @@ -%DISPLAYSCHEMA Generates arrow.tabular.Schema display text. +%GETSCHEMASTRING Generates a string representation of an +% arrow.tabular.Schema object. % Licensed to the Apache Software Foundation (ASF) under one or more % contributor license agreements. See the NOTICE file distributed with @@ -15,7 +16,7 @@ % implied. See the License for the specific language governing % permissions and limitations under the License. -function text = displaySchema(schema) +function text = getSchemaString(schema) fields = schema.Fields; names = [fields.Name]; types = [fields.Type]; @@ -46,5 +47,5 @@ end text = names + ": " + typeIDs; - text = " " + strjoin(text, " | "); + text = strjoin(text, " | "); end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+tabular/+internal/+display/getTabularDisplay.m b/matlab/src/matlab/+arrow/+tabular/+internal/+display/getTabularDisplay.m new file mode 100644 index 0000000000000..054922fa03c75 --- /dev/null +++ b/matlab/src/matlab/+arrow/+tabular/+internal/+display/getTabularDisplay.m @@ -0,0 +1,44 @@ +%GETTABULARDISPLAY Generates the display for arrow.tabular.Table and +% arrow.tabular.RecordBatch. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +function tabularDisplay = getTabularDisplay(tabularObj, className) + import arrow.tabular.internal.display.getSchemaString + import arrow.tabular.internal.display.getTabularHeader + + numRows = tabularObj.NumRows; + numColumns = tabularObj.NumColumns; + tabularDisplay = getTabularHeader(className, numRows, numColumns); + + if numColumns > 0 + twoNewLines = string([newline newline]); + fourSpaces = string(repmat(' ', 1, 4)); + eightSpaces = string(repmat(' ', 1, 8)); + + schemaHeader = fourSpaces + "Schema:"; + schemaBody = eightSpaces + getSchemaString(tabularObj.Schema); + schemaDisplay = schemaHeader + twoNewLines + schemaBody; + tabularDisplay = tabularDisplay + twoNewLines + schemaDisplay; + + if numRows > 0 + rowHeader = fourSpaces + "First Row:"; + rowBody = eightSpaces + tabularObj.Proxy.getRowAsString(struct(Index=int64(1))); + rowDisplay = rowHeader + twoNewLines + rowBody; + tabularDisplay = tabularDisplay + twoNewLines + rowDisplay; + end + end +end diff --git a/matlab/src/matlab/+arrow/+tabular/+internal/+display/getTabularHeader.m b/matlab/src/matlab/+arrow/+tabular/+internal/+display/getTabularHeader.m new file mode 100644 index 0000000000000..4c647986ce055 --- /dev/null +++ b/matlab/src/matlab/+arrow/+tabular/+internal/+display/getTabularHeader.m @@ -0,0 +1,32 @@ +%GETTABULARHEADER Generates the display header for arrow.tabular.Table and +% arrow.tabular.RecordBatch. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +function header = getTabularHeader(className, numRows, numColumns) + import arrow.internal.display.boldFontIfPossible + import arrow.internal.display.pluralizeStringIfNeeded + + numRowsString = boldFontIfPossible(numRows); + numColsString = boldFontIfPossible(numColumns); + rowWordString = pluralizeStringIfNeeded(numRows, "row"); + colWordString = pluralizeStringIfNeeded(numColumns, "column"); + formatSpec = " Arrow %s with %s %s and %s %s"; + if numColumns > 0 + formatSpec = formatSpec + ":"; + end + header = compose(formatSpec,className, numRowsString, rowWordString, numColsString, colWordString); +end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+tabular/RecordBatch.m b/matlab/src/matlab/+arrow/+tabular/RecordBatch.m index fdedaecb5eb3a..0225f3d771181 100644 --- a/matlab/src/matlab/+arrow/+tabular/RecordBatch.m +++ b/matlab/src/matlab/+arrow/+tabular/RecordBatch.m @@ -112,7 +112,9 @@ methods (Access=protected) function displayScalarObject(obj) - disp(obj.toString()); + className = matlab.mixin.CustomDisplay.getClassNameForHeader(obj); + tabularDisplay = arrow.tabular.internal.display.getTabularDisplay(obj, className); + disp(tabularDisplay + newline); end end diff --git a/matlab/src/matlab/+arrow/+tabular/Schema.m b/matlab/src/matlab/+arrow/+tabular/Schema.m index 3ee40f0e14293..a50522c6b5283 100644 --- a/matlab/src/matlab/+arrow/+tabular/Schema.m +++ b/matlab/src/matlab/+arrow/+tabular/Schema.m @@ -116,7 +116,7 @@ function displayScalarObject(obj) numFields = obj.NumFields; if numFields > 0 - text = arrow.tabular.internal.displaySchema(obj); + text = " " + arrow.tabular.internal.display.getSchemaString(obj); disp(text + newline); end diff --git a/matlab/src/matlab/+arrow/+tabular/Table.m b/matlab/src/matlab/+arrow/+tabular/Table.m index c2f73450408ef..1ed205d639747 100644 --- a/matlab/src/matlab/+arrow/+tabular/Table.m +++ b/matlab/src/matlab/+arrow/+tabular/Table.m @@ -112,9 +112,10 @@ end methods (Access=protected) - function displayScalarObject(obj) - disp(obj.toString()); + className = matlab.mixin.CustomDisplay.getClassNameForHeader(obj); + tabularDisplay = arrow.tabular.internal.display.getTabularDisplay(obj, className); + disp(tabularDisplay + newline); end end diff --git a/matlab/test/arrow/tabular/tTabularDisplay.m b/matlab/test/arrow/tabular/tTabularDisplay.m new file mode 100644 index 0000000000000..f99b25ab3452f --- /dev/null +++ b/matlab/test/arrow/tabular/tTabularDisplay.m @@ -0,0 +1,148 @@ +%TTABULARDISPLAY Unit tests verifying the display of arrow.tabular.Table +%and arrow.tabular.RecordBatch objects. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +classdef tTabularDisplay < matlab.unittest.TestCase + + properties (TestParameter) + TabularType + end + + methods (TestParameterDefinition, Static) + function TabularType = initializeTabularType() + + tableStruct = struct(FullClassName="arrow.tabular.Table", ... + ClassName="Table", FromTableFcn = @arrow.table); + + recordBatchStruct = struct(FullClassName="arrow.tabular.RecordBatch", ... + ClassName="RecordBatch", FromTableFcn=@arrow.recordBatch); + + TabularType = struct(Table=tableStruct, RecordBatch=recordBatchStruct); + end + + end + + methods (Test) + + function ZeroRowsZeroColumns(testCase, TabularType) + % Verify tabular object display when the object has zero rows + % and zero columns. + import arrow.internal.test.display.makeLinkString + + tabularObj = TabularType.FromTableFcn(table); %#ok + + classNameString = makeLinkString(FullClassName=TabularType.FullClassName, ... + ClassName=TabularType.ClassName, BoldFont=true); + zeroString = getNumString(0); + header = compose(" Arrow %s with %s rows and %s columns", classNameString, zeroString, zeroString); + expectedDisplay = char(header + newline + newline); + actualDisplay = evalc('disp(tabularObj)'); + + testCase.verifyEqual(actualDisplay, expectedDisplay); + end + + function ZeroRowsOneColumn(testCase, TabularType) + % Verify tabular object display when the object has zero rows + % and one column. + import arrow.internal.test.display.makeLinkString + + t = table(1, VariableNames="Number"); + tabularObj = TabularType.FromTableFcn(t(1:0, :)); %#ok + + classNameString = makeLinkString(FullClassName=TabularType.FullClassName, ... + ClassName=TabularType.ClassName, BoldFont=true); + header = compose(" Arrow %s with %s rows and %s column:", classNameString, getNumString(0), getNumString(1)); + + fieldString = makeFieldString("Number", "Float64", "arrow.type.Float64Type"); + schema = join([" Schema:" " " + fieldString], [newline newline]); + + expectedDisplay = char(join([header schema + newline + newline], [newline newline])); + actualDisplay = evalc('disp(tabularObj)'); + + testCase.verifyEqual(actualDisplay, expectedDisplay); + end + + function OneRowOneColumn(testCase, TabularType) + % Verify tabular object display when the object has one row + % and column. + import arrow.internal.test.display.makeLinkString + + t = table(1, VariableNames="Number"); + tabularObj = TabularType.FromTableFcn(t); %#ok + + classNameString = makeLinkString(FullClassName=TabularType.FullClassName, ... + ClassName=TabularType.ClassName, BoldFont=true); + header = compose(" Arrow %s with %s row and %s column:", classNameString, getNumString(1), getNumString(1)); + + fieldString = makeFieldString("Number", "Float64", "arrow.type.Float64Type"); + schema = join([" Schema:" " " + fieldString], [newline newline]); + row = join([" First Row:" " 1"], [newline newline]); + + + expectedDisplay = char(join([header schema row + newline + newline], [newline newline])); + actualDisplay = evalc('disp(tabularObj)'); + + testCase.verifyEqual(actualDisplay, expectedDisplay); + end + + function ManyRowsAndColumns(testCase, TabularType) + % Verify tabular object display when the object has many rows + % and columns. + import arrow.internal.test.display.makeLinkString + + t = table((1:2)', ["A"; "B"], true(2, 1), VariableNames=["Number", "Letter", "Logical"]); + tabularObj = TabularType.FromTableFcn(t); %#ok + + classNameString = makeLinkString(FullClassName=TabularType.FullClassName, ... + ClassName=TabularType.ClassName, BoldFont=true); + header = compose(" Arrow %s with %s rows and %s columns:", classNameString, getNumString(2), getNumString(3)); + + fieldOneString = makeFieldString("Number", "Float64", "arrow.type.Float64Type"); + fieldTwoString = makeFieldString("Letter", "String", "arrow.type.StringType"); + fieldThreeString = makeFieldString("Logical", "Boolean", "arrow.type.BooleanType"); + + fields = join([fieldOneString fieldTwoString fieldThreeString], " | "); + schema = join([" Schema:" " " + fields], [newline newline]); + row = join([" First Row:" " 1 | ""A"" | true"], [newline newline]); + + expectedDisplay = char(join([header schema row + newline + newline], [newline newline])); + actualDisplay = evalc('disp(tabularObj)'); + + testCase.verifyEqual(actualDisplay, expectedDisplay); + end + end +end + +function numString = getNumString(num) + if usejava("desktop") + numString = compose("%d", num); + else + numString = compose("%d", num); + end +end + +function str = makeFieldString(fieldName, classType, fullClassType) + import arrow.internal.test.display.makeLinkString + + if usejava("desktop") + name = compose("%s:", fieldName); + typeStr = makeLinkString(FullClassName=fullClassType, ClassName=classType, BoldFont=true); + str = name + " " + typeStr; + else + str = fieldName + ": " + classType; + end +end \ No newline at end of file