diff --git a/.env b/.env index a551e2120a6fb..6746892fd4ed8 100644 --- a/.env +++ b/.env @@ -49,7 +49,7 @@ ULIMIT_CORE=-1 ALMALINUX=8 ALPINE_LINUX=3.16 DEBIAN=11 -FEDORA=35 +FEDORA=38 UBUNTU=20.04 # Default versions for various dependencies diff --git a/ci/docker/fedora-35-cpp.dockerfile b/ci/docker/fedora-38-cpp.dockerfile similarity index 95% rename from ci/docker/fedora-35-cpp.dockerfile rename to ci/docker/fedora-38-cpp.dockerfile index aefa25663ba14..2dcc094ee20c5 100644 --- a/ci/docker/fedora-35-cpp.dockerfile +++ b/ci/docker/fedora-38-cpp.dockerfile @@ -16,7 +16,7 @@ # under the License. ARG arch -FROM ${arch}/fedora:35 +FROM ${arch}/fedora:38 ARG arch # install dependencies @@ -46,9 +46,9 @@ RUN dnf update -y && \ java-latest-openjdk-devel \ java-latest-openjdk-headless \ json-devel \ + liborc-devel \ libzstd-devel \ llvm-devel \ - llvm-static \ lz4-devel \ make \ ninja-build \ @@ -64,6 +64,7 @@ RUN dnf update -y && \ utf8proc-devel \ wget \ which \ + xsimd-devel \ zlib-devel COPY ci/scripts/install_minio.sh /arrow/ci/scripts/ @@ -100,8 +101,6 @@ ENV absl_SOURCE=BUNDLED \ CC=gcc \ CXX=g++ \ google_cloud_cpp_storage_SOURCE=BUNDLED \ - ORC_SOURCE=BUNDLED \ PARQUET_BUILD_EXAMPLES=ON \ PARQUET_BUILD_EXECUTABLES=ON \ - PATH=/usr/lib/ccache/:$PATH \ - xsimd_SOURCE=BUNDLED + PATH=/usr/lib/ccache/:$PATH diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 8566508406bd4..7138139b13ec0 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -770,10 +770,10 @@ if(ARROW_WITH_ZSTD) endif() if(ARROW_ORC) - list(APPEND ARROW_SHARED_LINK_LIBS orc::liborc ${ARROW_PROTOBUF_LIBPROTOBUF}) - list(APPEND ARROW_STATIC_LINK_LIBS orc::liborc ${ARROW_PROTOBUF_LIBPROTOBUF}) + list(APPEND ARROW_SHARED_LINK_LIBS orc::orc ${ARROW_PROTOBUF_LIBPROTOBUF}) + list(APPEND ARROW_STATIC_LINK_LIBS orc::orc ${ARROW_PROTOBUF_LIBPROTOBUF}) if(ORC_SOURCE STREQUAL "SYSTEM") - list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS orc::liborc + list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS orc::orc ${ARROW_PROTOBUF_LIBPROTOBUF}) endif() endif() diff --git a/cpp/cmake_modules/FindORC.cmake b/cpp/cmake_modules/FindorcAlt.cmake similarity index 68% rename from cpp/cmake_modules/FindORC.cmake rename to cpp/cmake_modules/FindorcAlt.cmake index aca915acc13d0..dc3b978cf4037 100644 --- a/cpp/cmake_modules/FindORC.cmake +++ b/cpp/cmake_modules/FindorcAlt.cmake @@ -15,13 +15,20 @@ # specific language governing permissions and limitations # under the License. -# - Find Apache ORC C++ (orc/orc-config.h, liborc.a) -# This module defines -# ORC_INCLUDE_DIR, directory containing headers -# ORC_STATIC_LIB, path to liborc.a -# ORC_FOUND, whether orc has been found +if(orcAlt_FOUND) + return() +endif() -if(ORC_FOUND) +set(find_package_args) +if(orcAlt_FIND_VERSION) + list(APPEND find_package_args ${orcAlt_FIND_VERSION}) +endif() +if(orcAlt_FIND_QUIETLY) + list(APPEND find_package_args QUIET) +endif() +find_package(orc ${find_package_args}) +if(orc_FOUND) + set(orcAlt_FOUND TRUE) return() endif() @@ -45,15 +52,13 @@ else() PATH_SUFFIXES ${ARROW_INCLUDE_PATH_SUFFIXES}) endif() -if(ORC_STATIC_LIB AND ORC_INCLUDE_DIR) - set(ORC_FOUND TRUE) - add_library(orc::liborc STATIC IMPORTED) - set_target_properties(orc::liborc - PROPERTIES IMPORTED_LOCATION "${ORC_STATIC_LIB}" - INTERFACE_INCLUDE_DIRECTORIES "${ORC_INCLUDE_DIR}") -else() - if(ORC_FIND_REQUIRED) - message(FATAL_ERROR "ORC library was required in toolchain and unable to locate") +find_package_handle_standard_args(orcAlt REQUIRED_VARS ORC_STATIC_LIB ORC_INCLUDE_DIR) + +if(orcAlt_FOUND) + if(NOT TARGET orc::orc) + add_library(orc::orc STATIC IMPORTED) + set_target_properties(orc::orc + PROPERTIES IMPORTED_LOCATION "${ORC_STATIC_LIB}" + INTERFACE_INCLUDE_DIRECTORIES "${ORC_INCLUDE_DIR}") endif() - set(ORC_FOUND FALSE) endif() diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 559ddf14f6a91..5de8ff9b1cb11 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -65,7 +65,7 @@ set(ARROW_THIRDPARTY_DEPENDENCIES lz4 nlohmann_json opentelemetry-cpp - ORC + orc re2 Protobuf RapidJSON @@ -94,6 +94,14 @@ if("${re2_SOURCE}" STREQUAL "" AND NOT "${RE2_SOURCE}" STREQUAL "") set(re2_SOURCE ${RE2_SOURCE}) endif() +# For backward compatibility. We use "ORC_SOURCE" if "orc_SOURCE" +# isn't specified and "ORC_SOURCE" is specified. +# We renamed "ORC" dependency name to "orc" in 15.0.0 because +# upstream uses "orc" not "ORC" as package name. +if("${orc_SOURCE}" STREQUAL "" AND NOT "${ORC_SOURCE}" STREQUAL "") + set(orc_SOURCE ${ORC_SOURCE}) +endif() + # For backward compatibility. We use "RE2_ROOT" if "re2_ROOT" # isn't specified and "RE2_ROOT" is specified. if("${re2_ROOT}" STREQUAL "" AND NOT "${RE2_ROOT}" STREQUAL "") @@ -193,7 +201,7 @@ macro(build_dependency DEPENDENCY_NAME) build_nlohmann_json() elseif("${DEPENDENCY_NAME}" STREQUAL "opentelemetry-cpp") build_opentelemetry() - elseif("${DEPENDENCY_NAME}" STREQUAL "ORC") + elseif("${DEPENDENCY_NAME}" STREQUAL "orc") build_orc() elseif("${DEPENDENCY_NAME}" STREQUAL "Protobuf") build_protobuf() @@ -4423,31 +4431,31 @@ macro(build_orc) set(ORC_VENDORED 1) - add_library(orc::liborc STATIC IMPORTED) - set_target_properties(orc::liborc PROPERTIES IMPORTED_LOCATION "${ORC_STATIC_LIB}") - target_include_directories(orc::liborc BEFORE INTERFACE "${ORC_INCLUDE_DIR}") - set(ORC_LINK_LIBRARIES LZ4::lz4 ZLIB::ZLIB ${ARROW_ZSTD_LIBZSTD} ${Snappy_TARGET}) + add_library(orc::orc STATIC IMPORTED) + set_target_properties(orc::orc PROPERTIES IMPORTED_LOCATION "${ORC_STATIC_LIB}") + target_include_directories(orc::orc BEFORE INTERFACE "${ORC_INCLUDE_DIR}") + target_link_libraries(orc::orc INTERFACE LZ4::lz4 ZLIB::ZLIB ${ARROW_ZSTD_LIBZSTD} + ${Snappy_TARGET}) # Protobuf generated files may use ABSL_DCHECK*() and # absl::log_internal_check_op is needed for them. if(TARGET absl::log_internal_check_op) - list(APPEND ORC_LINK_LIBRARIES absl::log_internal_check_op) + target_link_libraries(orc::orc INTERFACE absl::log_internal_check_op) endif() if(NOT MSVC) if(NOT APPLE AND ARROW_ENABLE_THREADING) - list(APPEND ORC_LINK_LIBRARIES Threads::Threads) + target_link_libraries(orc::orc INTERFACE Threads::Threads) endif() - list(APPEND ORC_LINK_LIBRARIES ${CMAKE_DL_LIBS}) + target_link_libraries(orc::orc INTERFACE ${CMAKE_DL_LIBS}) endif() - target_link_libraries(orc::liborc INTERFACE ${ORC_LINK_LIBRARIES}) add_dependencies(toolchain orc_ep) - add_dependencies(orc::liborc orc_ep) + add_dependencies(orc::orc orc_ep) - list(APPEND ARROW_BUNDLED_STATIC_LIBS orc::liborc) + list(APPEND ARROW_BUNDLED_STATIC_LIBS orc::orc) endmacro() if(ARROW_ORC) - resolve_dependency(ORC) + resolve_dependency(orc HAVE_ALT TRUE) message(STATUS "Found ORC static library: ${ORC_STATIC_LIB}") message(STATUS "Found ORC headers: ${ORC_INCLUDE_DIR}") endif() diff --git a/cpp/src/arrow/adapters/orc/CMakeLists.txt b/cpp/src/arrow/adapters/orc/CMakeLists.txt index e8ff69c191fb1..44719379994d9 100644 --- a/cpp/src/arrow/adapters/orc/CMakeLists.txt +++ b/cpp/src/arrow/adapters/orc/CMakeLists.txt @@ -32,7 +32,7 @@ else() set(ARROW_LIBRARIES_FOR_STATIC_TESTS arrow_testing_shared arrow_shared) endif() -set(ORC_STATIC_TEST_LINK_LIBS orc::liborc ${ARROW_LIBRARIES_FOR_STATIC_TESTS} +set(ORC_STATIC_TEST_LINK_LIBS orc::orc ${ARROW_LIBRARIES_FOR_STATIC_TESTS} ${ARROW_GTEST_GTEST_MAIN} ${ARROW_GTEST_GTEST}) add_arrow_test(adapter_test diff --git a/cpp/src/arrow/array/array_base.cc b/cpp/src/arrow/array/array_base.cc index f7b8d7954e1cf..eab71de27b11a 100644 --- a/cpp/src/arrow/array/array_base.cc +++ b/cpp/src/arrow/array/array_base.cc @@ -87,6 +87,8 @@ struct ScalarFromArraySlotImpl { return Finish(a.GetString(index_)); } + Status Visit(const BinaryViewArray& a) { return Finish(a.GetString(index_)); } + Status Visit(const FixedSizeBinaryArray& a) { return Finish(a.GetString(index_)); } Status Visit(const DayTimeIntervalArray& a) { return Finish(a.Value(index_)); } diff --git a/cpp/src/arrow/array/array_binary.cc b/cpp/src/arrow/array/array_binary.cc index 9466b5a48f9d7..d83ba0ca8936d 100644 --- a/cpp/src/arrow/array/array_binary.cc +++ b/cpp/src/arrow/array/array_binary.cc @@ -24,6 +24,7 @@ #include "arrow/array/validate.h" #include "arrow/type.h" #include "arrow/type_traits.h" +#include "arrow/util/binary_view_util.h" #include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" @@ -89,6 +90,33 @@ LargeStringArray::LargeStringArray(int64_t length, Status LargeStringArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); } +BinaryViewArray::BinaryViewArray(std::shared_ptr data) { + ARROW_CHECK_EQ(data->type->id(), Type::BINARY_VIEW); + SetData(std::move(data)); +} + +BinaryViewArray::BinaryViewArray(std::shared_ptr type, int64_t length, + std::shared_ptr views, BufferVector buffers, + std::shared_ptr null_bitmap, int64_t null_count, + int64_t offset) { + buffers.insert(buffers.begin(), std::move(views)); + buffers.insert(buffers.begin(), std::move(null_bitmap)); + SetData( + ArrayData::Make(std::move(type), length, std::move(buffers), null_count, offset)); +} + +std::string_view BinaryViewArray::GetView(int64_t i) const { + const std::shared_ptr* data_buffers = data_->buffers.data() + 2; + return util::FromBinaryView(raw_values_[i], data_buffers); +} + +StringViewArray::StringViewArray(std::shared_ptr data) { + ARROW_CHECK_EQ(data->type->id(), Type::STRING_VIEW); + SetData(std::move(data)); +} + +Status StringViewArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); } + FixedSizeBinaryArray::FixedSizeBinaryArray(const std::shared_ptr& data) { SetData(data); } diff --git a/cpp/src/arrow/array/array_binary.h b/cpp/src/arrow/array/array_binary.h index 7e58a96ff841a..fd68a379ddbfb 100644 --- a/cpp/src/arrow/array/array_binary.h +++ b/cpp/src/arrow/array/array_binary.h @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -217,6 +218,65 @@ class ARROW_EXPORT LargeStringArray : public LargeBinaryArray { Status ValidateUTF8() const; }; +// ---------------------------------------------------------------------- +// BinaryView and StringView + +/// Concrete Array class for variable-size binary view data using the +/// BinaryViewType::c_type struct to reference in-line or out-of-line string values +class ARROW_EXPORT BinaryViewArray : public FlatArray { + public: + using TypeClass = BinaryViewType; + using IteratorType = stl::ArrayIterator; + using c_type = BinaryViewType::c_type; + + explicit BinaryViewArray(std::shared_ptr data); + + BinaryViewArray(std::shared_ptr type, int64_t length, + std::shared_ptr views, BufferVector data_buffers, + std::shared_ptr null_bitmap = NULLPTR, + int64_t null_count = kUnknownNullCount, int64_t offset = 0); + + // For API compatibility with BinaryArray etc. + std::string_view GetView(int64_t i) const; + std::string GetString(int64_t i) const { return std::string{GetView(i)}; } + + const auto& values() const { return data_->buffers[1]; } + const c_type* raw_values() const { return raw_values_; } + + std::optional operator[](int64_t i) const { + return *IteratorType(*this, i); + } + + IteratorType begin() const { return IteratorType(*this); } + IteratorType end() const { return IteratorType(*this, length()); } + + protected: + using FlatArray::FlatArray; + + void SetData(std::shared_ptr data) { + FlatArray::SetData(std::move(data)); + raw_values_ = data_->GetValuesSafe(1); + } + + const c_type* raw_values_; +}; + +/// Concrete Array class for variable-size string view (utf-8) data using +/// BinaryViewType::c_type to reference in-line or out-of-line string values +class ARROW_EXPORT StringViewArray : public BinaryViewArray { + public: + using TypeClass = StringViewType; + + explicit StringViewArray(std::shared_ptr data); + + using BinaryViewArray::BinaryViewArray; + + /// \brief Validate that this array contains only valid UTF8 entries + /// + /// This check is also implied by ValidateFull() + Status ValidateUTF8() const; +}; + // ---------------------------------------------------------------------- // Fixed width binary diff --git a/cpp/src/arrow/array/array_binary_test.cc b/cpp/src/arrow/array/array_binary_test.cc index 3bc9bb91a022a..04391be0ac789 100644 --- a/cpp/src/arrow/array/array_binary_test.cc +++ b/cpp/src/arrow/array/array_binary_test.cc @@ -27,17 +27,21 @@ #include "arrow/array.h" #include "arrow/array/builder_binary.h" +#include "arrow/array/validate.h" #include "arrow/buffer.h" #include "arrow/memory_pool.h" #include "arrow/status.h" #include "arrow/testing/builder.h" #include "arrow/testing/gtest_util.h" +#include "arrow/testing/matchers.h" #include "arrow/testing/util.h" #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_builders.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/key_value_metadata.h" +#include "arrow/util/logging.h" #include "arrow/visit_data_inline.h" namespace arrow { @@ -365,38 +369,134 @@ TYPED_TEST(TestStringArray, TestValidateOffsets) { this->TestValidateOffsets(); TYPED_TEST(TestStringArray, TestValidateData) { this->TestValidateData(); } +// Produce an Array of index/offset views from a std::vector of index/offset +// BinaryViewType::c_type +Result> MakeBinaryViewArray( + BufferVector data_buffers, const std::vector& views, + bool validate = true) { + auto length = static_cast(views.size()); + auto arr = std::make_shared( + utf8_view(), length, Buffer::FromVector(views), std::move(data_buffers)); + if (validate) { + RETURN_NOT_OK(arr->ValidateFull()); + } + return arr; +} + +TEST(StringViewArray, Validate) { + // Since this is a test of validation, we need to be able to construct invalid arrays. + auto buffer_s = Buffer::FromString("supercalifragilistic(sp?)"); + auto buffer_y = Buffer::FromString("yyyyyyyyyyyyyyyyyyyyyyyyy"); + + // empty array is valid + EXPECT_THAT(MakeBinaryViewArray({}, {}), Ok()); + + // empty array with some data buffers is valid + EXPECT_THAT(MakeBinaryViewArray({buffer_s, buffer_y}, {}), Ok()); + + // inline views need not have a corresponding buffer + EXPECT_THAT(MakeBinaryViewArray({}, + { + util::ToInlineBinaryView("hello"), + util::ToInlineBinaryView("world"), + util::ToInlineBinaryView("inline me"), + }), + Ok()); + + // non-inline views are expected to reference only buffers managed by the array + EXPECT_THAT( + MakeBinaryViewArray( + {buffer_s, buffer_y}, + {util::ToBinaryView("supe", static_cast(buffer_s->size()), 0, 0), + util::ToBinaryView("yyyy", static_cast(buffer_y->size()), 1, 0)}), + Ok()); + + // views may not reference data buffers not present in the array + EXPECT_THAT( + MakeBinaryViewArray( + {}, {util::ToBinaryView("supe", static_cast(buffer_s->size()), 0, 0)}), + Raises(StatusCode::IndexError)); + // ... or ranges which overflow the referenced data buffer + EXPECT_THAT( + MakeBinaryViewArray( + {buffer_s}, {util::ToBinaryView( + "supe", static_cast(buffer_s->size() + 50), 0, 0)}), + Raises(StatusCode::IndexError)); + + // Additionally, the prefixes of non-inline views must match the data buffer + EXPECT_THAT( + MakeBinaryViewArray( + {buffer_s, buffer_y}, + {util::ToBinaryView("SUPE", static_cast(buffer_s->size()), 0, 0), + util::ToBinaryView("yyyy", static_cast(buffer_y->size()), 1, 0)}), + Raises(StatusCode::Invalid)); + + // Invalid string views which are masked by a null bit do not cause validation to fail + auto invalid_but_masked = + MakeBinaryViewArray( + {buffer_s}, + {util::ToBinaryView("SUPE", static_cast(buffer_s->size()), 0, 0), + util::ToBinaryView("yyyy", 50, 40, 30)}, + /*validate=*/false) + .ValueOrDie() + ->data(); + invalid_but_masked->null_count = 2; + invalid_but_masked->buffers[0] = *AllocateEmptyBitmap(2); + EXPECT_THAT(internal::ValidateArrayFull(*invalid_but_masked), Ok()); + + // overlapping views are allowed + EXPECT_THAT( + MakeBinaryViewArray( + {buffer_s}, + { + util::ToBinaryView("supe", static_cast(buffer_s->size()), 0, 0), + util::ToBinaryView("uper", static_cast(buffer_s->size() - 1), 0, + 1), + util::ToBinaryView("perc", static_cast(buffer_s->size() - 2), 0, + 2), + util::ToBinaryView("erca", static_cast(buffer_s->size() - 3), 0, + 3), + }), + Ok()); +} + template class TestUTF8Array : public ::testing::Test { public: using TypeClass = T; - using offset_type = typename TypeClass::offset_type; using ArrayType = typename TypeTraits::ArrayType; - Status ValidateUTF8(int64_t length, std::vector offsets, - std::string_view data, int64_t offset = 0) { - ArrayType arr(length, Buffer::Wrap(offsets), std::make_shared(data), - /*null_bitmap=*/nullptr, /*null_count=*/0, offset); - return arr.ValidateUTF8(); + std::shared_ptr type() const { + if constexpr (is_binary_view_like_type::value) { + return TypeClass::is_utf8 ? utf8_view() : binary_view(); + } else { + return TypeTraits::type_singleton(); + } } - Status ValidateUTF8(const std::string& json) { - auto ty = TypeTraits::type_singleton(); - auto arr = ArrayFromJSON(ty, json); - return checked_cast(*arr).ValidateUTF8(); + Status ValidateUTF8(const Array& arr) { + return checked_cast(arr).ValidateUTF8(); + } + + Status ValidateUTF8(std::vector values) { + std::shared_ptr arr; + ArrayFromVector(type(), values, &arr); + return ValidateUTF8(*arr); } void TestValidateUTF8() { - ASSERT_OK(ValidateUTF8(R"(["Voix", "ambiguë", "d’un", "cœur"])")); - ASSERT_OK(ValidateUTF8(1, {0, 4}, "\xf4\x8f\xbf\xbf")); // \U0010ffff + ASSERT_OK( + ValidateUTF8(*ArrayFromJSON(type(), R"(["Voix", "ambiguë", "d’un", "cœur"])"))); + ASSERT_OK(ValidateUTF8({"\xf4\x8f\xbf\xbf"})); // \U0010ffff - ASSERT_RAISES(Invalid, ValidateUTF8(1, {0, 1}, "\xf4")); + ASSERT_RAISES(Invalid, ValidateUTF8({"\xf4"})); // More tests in TestValidateData() above // (ValidateFull() calls ValidateUTF8() internally) } }; -TYPED_TEST_SUITE(TestUTF8Array, StringArrowTypes); +TYPED_TEST_SUITE(TestUTF8Array, StringOrStringViewArrowTypes); TYPED_TEST(TestUTF8Array, TestValidateUTF8) { this->TestValidateUTF8(); } @@ -883,11 +983,15 @@ class TestBaseBinaryDataVisitor : public ::testing::Test { void SetUp() override { type_ = TypeTraits::type_singleton(); } void TestBasics() { - auto array = ArrayFromJSON(type_, R"(["foo", null, "bar"])"); + auto array = ArrayFromJSON( + type_, + R"(["foo", null, "bar", "inline_me", "allocate_me_aaaaa", "allocate_me_bbbb"])"); BinaryAppender appender; ArraySpanVisitor visitor; ASSERT_OK(visitor.Visit(*array->data(), &appender)); - ASSERT_THAT(appender.data, ::testing::ElementsAreArray({"foo", "(null)", "bar"})); + ASSERT_THAT(appender.data, + ::testing::ElementsAreArray({"foo", "(null)", "bar", "inline_me", + "allocate_me_aaaaa", "allocate_me_bbbb"})); ARROW_UNUSED(visitor); // Workaround weird MSVC warning } @@ -904,7 +1008,7 @@ class TestBaseBinaryDataVisitor : public ::testing::Test { std::shared_ptr type_; }; -TYPED_TEST_SUITE(TestBaseBinaryDataVisitor, BaseBinaryArrowTypes); +TYPED_TEST_SUITE(TestBaseBinaryDataVisitor, BaseBinaryOrBinaryViewLikeArrowTypes); TYPED_TEST(TestBaseBinaryDataVisitor, Basics) { this->TestBasics(); } diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc index 2bef9d725d37f..46908439ef5f0 100644 --- a/cpp/src/arrow/array/array_test.cc +++ b/cpp/src/arrow/array/array_test.cc @@ -382,10 +382,12 @@ static std::vector> TestArrayUtilitiesAgainstTheseType float64(), binary(), large_binary(), + binary_view(), fixed_size_binary(3), decimal(16, 4), utf8(), large_utf8(), + utf8_view(), list(utf8()), list(int64()), // NOTE: Regression case for ARROW-9071/MakeArrayOfNull list(large_utf8()), @@ -601,12 +603,15 @@ static ScalarVector GetScalars() { std::make_shared(60, duration(TimeUnit::SECOND)), std::make_shared(hello), std::make_shared(hello), + std::make_shared(hello), std::make_shared( hello, fixed_size_binary(static_cast(hello->size()))), std::make_shared(Decimal128(10), decimal(16, 4)), std::make_shared(Decimal256(10), decimal(76, 38)), std::make_shared(hello), std::make_shared(hello), + std::make_shared(hello), + std::make_shared(Buffer::FromString("long string; not inlined")), std::make_shared(ArrayFromJSON(int8(), "[1, 2, 3]")), ScalarFromJSON(map(int8(), utf8()), R"([[1, "foo"], [2, "bar"]])"), std::make_shared(ArrayFromJSON(int8(), "[1, 1, 2, 2, 3, 3]")), @@ -647,13 +652,14 @@ TEST_F(TestArray, TestMakeArrayFromScalar) { for (int64_t length : {16}) { for (auto scalar : scalars) { + ARROW_SCOPED_TRACE("scalar type: ", scalar->type->ToString()); ASSERT_OK_AND_ASSIGN(auto array, MakeArrayFromScalar(*scalar, length)); ASSERT_OK(array->ValidateFull()); ASSERT_EQ(array->length(), length); ASSERT_EQ(array->null_count(), 0); // test case for ARROW-13321 - for (int64_t i : std::vector{0, length / 2, length - 1}) { + for (int64_t i : {int64_t{0}, length / 2, length - 1}) { ASSERT_OK_AND_ASSIGN(auto s, array->GetScalar(i)); AssertScalarsEqual(*s, *scalar, /*verbose=*/true); } diff --git a/cpp/src/arrow/array/array_view_test.cc b/cpp/src/arrow/array/array_view_test.cc index 07dc3014e4029..97110ea97f3fc 100644 --- a/cpp/src/arrow/array/array_view_test.cc +++ b/cpp/src/arrow/array/array_view_test.cc @@ -126,6 +126,38 @@ TEST(TestArrayView, StringAsBinary) { CheckView(expected, arr); } +TEST(TestArrayView, StringViewAsBinaryView) { + for (auto json : { + R"(["foox", "barz", null])", + R"(["foox", "barz_not_inlined", null])", + }) { + auto arr = ArrayFromJSON(utf8_view(), json); + auto expected = ArrayFromJSON(binary_view(), json); + CheckView(arr, expected); + CheckView(expected, arr); + } +} + +TEST(TestArrayView, StringViewAsBinaryViewInStruct) { + auto padl = ArrayFromJSON(list(int16()), "[[0, -1], [], [42]]"); + auto padr = ArrayFromJSON(utf8(), R"(["foox", "barz", null])"); + + for (auto json : { + R"(["foox", "barz", null])", + R"(["foox", "barz_not_inlined", null])", + }) { + auto arr = + StructArray::Make({padl, ArrayFromJSON(utf8_view(), json), padr}, {"", "", ""}) + .ValueOrDie(); + auto expected = + StructArray::Make({padl, ArrayFromJSON(binary_view(), json), padr}, {"", "", ""}) + .ValueOrDie(); + + CheckView(arr, expected); + CheckView(expected, arr); + } +} + TEST(TestArrayView, PrimitiveWrongSize) { auto arr = ArrayFromJSON(int16(), "[0, -1, 42]"); CheckViewFails(arr, int8()); diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc index 3000aea3e189a..d3502a0ab645a 100644 --- a/cpp/src/arrow/array/builder_base.cc +++ b/cpp/src/arrow/array/builder_base.cc @@ -96,10 +96,7 @@ namespace { template struct AppendScalarImpl { template - enable_if_t::value || is_decimal_type::value || - is_fixed_size_binary_type::value, - Status> - Visit(const T&) { + Status HandleFixedWidth(const T&) { auto builder = checked_cast::BuilderType*>(builder_); RETURN_NOT_OK(builder->Reserve(n_repeats_ * (scalars_end_ - scalars_begin_))); @@ -117,7 +114,16 @@ struct AppendScalarImpl { } template - enable_if_base_binary Visit(const T&) { + enable_if_t::value, Status> Visit(const T& t) { + return HandleFixedWidth(t); + } + + Status Visit(const FixedSizeBinaryType& t) { return HandleFixedWidth(t); } + Status Visit(const Decimal128Type& t) { return HandleFixedWidth(t); } + Status Visit(const Decimal256Type& t) { return HandleFixedWidth(t); } + + template + enable_if_has_string_view Visit(const T&) { int64_t data_size = 0; for (auto it = scalars_begin_; it != scalars_end_; ++it) { const auto& scalar = checked_cast::ScalarType&>(*it); diff --git a/cpp/src/arrow/array/builder_binary.cc b/cpp/src/arrow/array/builder_binary.cc index 571f450aab9c1..3ff22d4a3feeb 100644 --- a/cpp/src/arrow/array/builder_binary.cc +++ b/cpp/src/arrow/array/builder_binary.cc @@ -35,11 +35,70 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" #include "arrow/util/logging.h" +#include "arrow/visit_data_inline.h" namespace arrow { using internal::checked_cast; +// ---------------------------------------------------------------------- +// Binary/StringView +BinaryViewBuilder::BinaryViewBuilder(const std::shared_ptr& type, + MemoryPool* pool) + : BinaryViewBuilder(pool) {} + +Status BinaryViewBuilder::AppendArraySlice(const ArraySpan& array, int64_t offset, + int64_t length) { + auto bitmap = array.GetValues(0, 0); + auto values = array.GetValues(1) + offset; + + int64_t out_of_line_total = 0, i = 0; + VisitNullBitmapInline( + array.buffers[0].data, array.offset, array.length, array.null_count, + [&] { + if (!values[i].is_inline()) { + out_of_line_total += static_cast(values[i].size()); + } + ++i; + }, + [&] { ++i; }); + + RETURN_NOT_OK(Reserve(length)); + RETURN_NOT_OK(ReserveData(out_of_line_total)); + + for (int64_t i = 0; i < length; i++) { + if (bitmap && !bit_util::GetBit(bitmap, array.offset + offset + i)) { + UnsafeAppendNull(); + continue; + } + + UnsafeAppend(util::FromBinaryView(values[i], array.GetVariadicBuffers().data())); + } + return Status::OK(); +} + +Status BinaryViewBuilder::FinishInternal(std::shared_ptr* out) { + ARROW_ASSIGN_OR_RAISE(auto null_bitmap, null_bitmap_builder_.FinishWithLength(length_)); + ARROW_ASSIGN_OR_RAISE(auto data, data_builder_.FinishWithLength(length_)); + BufferVector buffers = {null_bitmap, data}; + for (auto&& buffer : data_heap_builder_.Finish()) { + buffers.push_back(std::move(buffer)); + } + *out = ArrayData::Make(type(), length_, std::move(buffers), null_count_); + Reset(); + return Status::OK(); +} + +Status BinaryViewBuilder::ReserveData(int64_t length) { + return data_heap_builder_.Reserve(length); +} + +void BinaryViewBuilder::Reset() { + ArrayBuilder::Reset(); + data_builder_.Reset(); + data_heap_builder_.Reset(); +} + // ---------------------------------------------------------------------- // Fixed width binary @@ -125,8 +184,8 @@ const uint8_t* FixedSizeBinaryBuilder::GetValue(int64_t i) const { std::string_view FixedSizeBinaryBuilder::GetView(int64_t i) const { const uint8_t* data_ptr = byte_builder_.data(); - return std::string_view(reinterpret_cast(data_ptr + i * byte_width_), - byte_width_); + return {reinterpret_cast(data_ptr + i * byte_width_), + static_cast(byte_width_)}; } // ---------------------------------------------------------------------- @@ -173,10 +232,10 @@ Status ChunkedStringBuilder::Finish(ArrayVector* out) { RETURN_NOT_OK(ChunkedBinaryBuilder::Finish(out)); // Change data type to string/utf8 - for (size_t i = 0; i < out->size(); ++i) { - std::shared_ptr data = (*out)[i]->data(); + for (auto& chunk : *out) { + std::shared_ptr data = chunk->data()->Copy(); data->type = ::arrow::utf8(); - (*out)[i] = std::make_shared(data); + chunk = std::make_shared(std::move(data)); } return Status::OK(); } diff --git a/cpp/src/arrow/array/builder_binary.h b/cpp/src/arrow/array/builder_binary.h index b0c4fe2fc81fd..3e87cf2403610 100644 --- a/cpp/src/arrow/array/builder_binary.h +++ b/cpp/src/arrow/array/builder_binary.h @@ -36,6 +36,7 @@ #include "arrow/buffer_builder.h" #include "arrow/status.h" #include "arrow/type.h" +#include "arrow/util/binary_view_util.h" #include "arrow/util/macros.h" #include "arrow/util/visibility.h" @@ -204,10 +205,10 @@ class BaseBinaryBuilder } } } else { - for (std::size_t i = 0; i < values.size(); ++i) { + for (const auto& value : values) { UnsafeAppendNextOffset(); - value_data_builder_.UnsafeAppend( - reinterpret_cast(values[i].data()), values[i].size()); + value_data_builder_.UnsafeAppend(reinterpret_cast(value.data()), + value.size()); } } @@ -463,6 +464,256 @@ class ARROW_EXPORT LargeStringBuilder : public LargeBinaryBuilder { std::shared_ptr type() const override { return large_utf8(); } }; +// ---------------------------------------------------------------------- +// BinaryViewBuilder, StringViewBuilder +// +// These builders do not support building raw pointer view arrays. + +namespace internal { + +// We allocate medium-sized memory chunks and accumulate data in those, which +// may result in some waste if there are many large-ish strings. If a string +// comes along that does not fit into a block, we allocate a new block and +// write into that. +// +// Later we can implement optimizations to continuing filling underfull blocks +// after encountering a large string that required allocating a new block. +class ARROW_EXPORT StringHeapBuilder { + public: + static constexpr int64_t kDefaultBlocksize = 32 << 10; // 32KB + + StringHeapBuilder(MemoryPool* pool, int64_t alignment) + : pool_(pool), alignment_(alignment) {} + + void SetBlockSize(int64_t blocksize) { blocksize_ = blocksize; } + + using c_type = BinaryViewType::c_type; + + template + std::conditional_t, c_type> Append(const uint8_t* value, + int64_t length) { + if (length <= BinaryViewType::kInlineSize) { + return util::ToInlineBinaryView(value, static_cast(length)); + } + + if constexpr (Safe) { + ARROW_RETURN_NOT_OK(Reserve(length)); + } + + auto v = + util::ToBinaryView(value, static_cast(length), + static_cast(blocks_.size() - 1), current_offset_); + + memcpy(current_out_buffer_, value, static_cast(length)); + current_out_buffer_ += length; + current_remaining_bytes_ -= length; + current_offset_ += static_cast(length); + return v; + } + + static constexpr int64_t ValueSizeLimit() { + return std::numeric_limits::max(); + } + + /// \brief Ensure that the indicated number of bytes can be appended via + /// UnsafeAppend operations without the need to allocate more memory + Status Reserve(int64_t num_bytes) { + if (ARROW_PREDICT_FALSE(num_bytes > ValueSizeLimit())) { + return Status::CapacityError( + "BinaryView or StringView elements cannot reference " + "strings larger than 2GB"); + } + if (num_bytes > current_remaining_bytes_) { + // Ensure the buffer is fully overwritten to avoid leaking uninitialized + // bytes from the allocator + if (current_remaining_bytes_ > 0) { + std::memset(current_out_buffer_, 0, current_remaining_bytes_); + blocks_.back() = SliceBuffer(blocks_.back(), 0, + blocks_.back()->size() - current_remaining_bytes_); + } + current_remaining_bytes_ = num_bytes > blocksize_ ? num_bytes : blocksize_; + ARROW_ASSIGN_OR_RAISE(std::shared_ptr new_block, + AllocateBuffer(current_remaining_bytes_, alignment_, pool_)); + current_offset_ = 0; + current_out_buffer_ = new_block->mutable_data(); + blocks_.emplace_back(std::move(new_block)); + } + return Status::OK(); + } + + void Reset() { + current_offset_ = 0; + current_out_buffer_ = NULLPTR; + current_remaining_bytes_ = 0; + blocks_.clear(); + } + + int64_t current_remaining_bytes() const { return current_remaining_bytes_; } + + std::vector> Finish() { + current_offset_ = 0; + current_out_buffer_ = NULLPTR; + current_remaining_bytes_ = 0; + return std::move(blocks_); + } + + private: + MemoryPool* pool_; + int64_t alignment_; + int64_t blocksize_ = kDefaultBlocksize; + std::vector> blocks_; + + int32_t current_offset_ = 0; + uint8_t* current_out_buffer_ = NULLPTR; + int64_t current_remaining_bytes_ = 0; +}; + +} // namespace internal + +class ARROW_EXPORT BinaryViewBuilder : public ArrayBuilder { + public: + using TypeClass = BinaryViewType; + + // this constructor provided for MakeBuilder compatibility + BinaryViewBuilder(const std::shared_ptr&, MemoryPool* pool); + + explicit BinaryViewBuilder(MemoryPool* pool = default_memory_pool(), + int64_t alignment = kDefaultBufferAlignment) + : ArrayBuilder(pool, alignment), + data_builder_(pool, alignment), + data_heap_builder_(pool, alignment) {} + + /// Set the size for future preallocated data buffers. + /// + /// The default size is 32KB, so after each 32KB of string data appended to the builder + /// a new data buffer will be allocated. Adjust this to a larger value to decrease the + /// frequency of allocation, or to a smaller value to lower the overhead of each + /// allocation. + void SetBlockSize(int64_t blocksize) { data_heap_builder_.SetBlockSize(blocksize); } + + /// The number of bytes which can be appended to this builder without allocating another + /// data buffer. + int64_t current_block_bytes_remaining() const { + return data_heap_builder_.current_remaining_bytes(); + } + + Status Append(const uint8_t* value, int64_t length) { + ARROW_RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(true); + ARROW_ASSIGN_OR_RAISE(auto v, + data_heap_builder_.Append(value, length)); + data_builder_.UnsafeAppend(v); + return Status::OK(); + } + + Status Append(const char* value, int64_t length) { + return Append(reinterpret_cast(value), length); + } + + Status Append(std::string_view value) { + return Append(value.data(), static_cast(value.size())); + } + + /// \brief Append without checking capacity + /// + /// Builder should have been presized using Reserve() and ReserveData(), + /// respectively, and the value must not be larger than 2GB + void UnsafeAppend(const uint8_t* value, int64_t length) { + UnsafeAppendToBitmap(true); + auto v = data_heap_builder_.Append(value, length); + data_builder_.UnsafeAppend(v); + } + + void UnsafeAppend(const char* value, int64_t length) { + UnsafeAppend(reinterpret_cast(value), length); + } + + void UnsafeAppend(const std::string& value) { + UnsafeAppend(value.c_str(), static_cast(value.size())); + } + + void UnsafeAppend(std::string_view value) { + UnsafeAppend(value.data(), static_cast(value.size())); + } + + /// \brief Ensures there is enough allocated available capacity in the + /// out-of-line data heap to append the indicated number of bytes without + /// additional allocations + Status ReserveData(int64_t length); + + Status AppendNulls(int64_t length) final { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(length, BinaryViewType::c_type{}); + UnsafeSetNull(length); + return Status::OK(); + } + + /// \brief Append a single null element + Status AppendNull() final { + ARROW_RETURN_NOT_OK(Reserve(1)); + data_builder_.UnsafeAppend(BinaryViewType::c_type{}); + UnsafeAppendToBitmap(false); + return Status::OK(); + } + + /// \brief Append a empty element (length-0 inline string) + Status AppendEmptyValue() final { + ARROW_RETURN_NOT_OK(Reserve(1)); + data_builder_.UnsafeAppend(BinaryViewType::c_type{}); + UnsafeAppendToBitmap(true); + return Status::OK(); + } + + /// \brief Append several empty elements + Status AppendEmptyValues(int64_t length) final { + ARROW_RETURN_NOT_OK(Reserve(length)); + data_builder_.UnsafeAppend(length, BinaryViewType::c_type{}); + UnsafeSetNotNull(length); + return Status::OK(); + } + + void UnsafeAppendNull() { + data_builder_.UnsafeAppend(BinaryViewType::c_type{}); + UnsafeAppendToBitmap(false); + } + + void UnsafeAppendEmptyValue() { + data_builder_.UnsafeAppend(BinaryViewType::c_type{}); + UnsafeAppendToBitmap(true); + } + + /// \brief Append a slice of a BinaryViewArray passed as an ArraySpan. Copies + /// the underlying out-of-line string memory to avoid memory lifetime issues + Status AppendArraySlice(const ArraySpan& array, int64_t offset, + int64_t length) override; + + void Reset() override; + + Status Resize(int64_t capacity) override { + ARROW_RETURN_NOT_OK(CheckCapacity(capacity)); + capacity = std::max(capacity, kMinBuilderCapacity); + ARROW_RETURN_NOT_OK(data_builder_.Resize(capacity)); + return ArrayBuilder::Resize(capacity); + } + + Status FinishInternal(std::shared_ptr* out) override; + + std::shared_ptr type() const override { return binary_view(); } + + protected: + TypedBufferBuilder data_builder_; + + // Accumulates out-of-line data in fixed-size chunks which are then attached + // to the resulting ArrayData + internal::StringHeapBuilder data_heap_builder_; +}; + +class ARROW_EXPORT StringViewBuilder : public BinaryViewBuilder { + public: + using BinaryViewBuilder::BinaryViewBuilder; + std::shared_ptr type() const override { return utf8_view(); } +}; + // ---------------------------------------------------------------------- // FixedSizeBinaryBuilder @@ -498,7 +749,7 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { Status Append(const Buffer& s) { ARROW_RETURN_NOT_OK(Reserve(1)); - UnsafeAppend(std::string_view(s)); + UnsafeAppend(s); return Status::OK(); } @@ -549,7 +800,7 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { UnsafeAppend(reinterpret_cast(value.data())); } - void UnsafeAppend(const Buffer& s) { UnsafeAppend(std::string_view(s)); } + void UnsafeAppend(const Buffer& s) { UnsafeAppend(std::string_view{s}); } void UnsafeAppend(const std::shared_ptr& s) { UnsafeAppend(*s); } diff --git a/cpp/src/arrow/array/builder_dict.cc b/cpp/src/arrow/array/builder_dict.cc index 525b0afbc908a..7a96463ec3c43 100644 --- a/cpp/src/arrow/array/builder_dict.cc +++ b/cpp/src/arrow/array/builder_dict.cc @@ -194,6 +194,12 @@ Status DictionaryMemoTable::GetOrInsert(const BinaryType*, std::string_view valu return impl_->GetOrInsert(value, out); } +Status DictionaryMemoTable::GetOrInsert(const BinaryViewType*, std::string_view value, + int32_t* out) { + // Create BinaryArray dictionary for now + return impl_->GetOrInsert(value, out); +} + Status DictionaryMemoTable::GetOrInsert(const LargeBinaryType*, std::string_view value, int32_t* out) { return impl_->GetOrInsert(value, out); diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h index cb0aaf309915b..3f0d711dc5bb5 100644 --- a/cpp/src/arrow/array/builder_dict.h +++ b/cpp/src/arrow/array/builder_dict.h @@ -60,6 +60,12 @@ struct DictionaryValue> { BinaryType, LargeBinaryType>::type; }; +template +struct DictionaryValue> { + using type = std::string_view; + using PhysicalType = BinaryViewType; +}; + template struct DictionaryValue> { using type = std::string_view; @@ -114,6 +120,7 @@ class ARROW_EXPORT DictionaryMemoTable { Status GetOrInsert(const BinaryType*, std::string_view value, int32_t* out); Status GetOrInsert(const LargeBinaryType*, std::string_view value, int32_t* out); + Status GetOrInsert(const BinaryViewType*, std::string_view value, int32_t* out); class DictionaryMemoTableImpl; std::unique_ptr impl_; diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index f7549fa9d1d1a..37c7271b5b95c 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -43,6 +43,7 @@ #include "arrow/util/int_util_overflow.h" #include "arrow/util/logging.h" #include "arrow/util/ree_util.h" +#include "arrow/visit_data_inline.h" #include "arrow/visit_type_inline.h" namespace arrow { @@ -230,6 +231,45 @@ class ConcatenateImpl { return ConcatenateBuffers(value_buffers, pool_).Value(&out_->buffers[2]); } + Status Visit(const BinaryViewType& type) { + out_->buffers.resize(2); + + for (const auto& in_data : in_) { + for (const auto& buf : util::span(in_data->buffers).subspan(2)) { + out_->buffers.push_back(buf); + } + } + + ARROW_ASSIGN_OR_RAISE(auto view_buffers, Buffers(1, BinaryViewType::kSize)); + ARROW_ASSIGN_OR_RAISE(auto view_buffer, ConcatenateBuffers(view_buffers, pool_)); + + auto* views = view_buffer->mutable_data_as(); + size_t preceding_buffer_count = 0; + + int64_t i = in_[0]->length; + for (size_t in_index = 1; in_index < in_.size(); ++in_index) { + preceding_buffer_count += in_[in_index - 1]->buffers.size() - 2; + + for (int64_t end_i = i + in_[in_index]->length; i < end_i; ++i) { + if (views[i].is_inline()) continue; + views[i].ref.buffer_index = SafeSignedAdd( + views[i].ref.buffer_index, static_cast(preceding_buffer_count)); + } + } + + if (out_->buffers[0] != nullptr) { + i = in_[0]->length; + VisitNullBitmapInline( + out_->buffers[0]->data(), i, out_->length - i, out_->null_count, [&] { ++i; }, + [&] { + views[i++] = {}; // overwrite views under null bits with an empty view + }); + } + + out_->buffers[1] = std::move(view_buffer); + return Status::OK(); + } + Status Visit(const ListType&) { std::vector value_ranges; ARROW_ASSIGN_OR_RAISE(auto index_buffers, Buffers(1, sizeof(int32_t))); diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc index 4c03fab731ffe..0ef1136ea78f8 100644 --- a/cpp/src/arrow/array/concatenate_test.cc +++ b/cpp/src/arrow/array/concatenate_test.cc @@ -92,9 +92,15 @@ class ConcatenateTest : public ::testing::Test { for (auto null_probability : this->null_probabilities_) { std::shared_ptr array; factory(size, null_probability, &array); + ASSERT_OK(array->ValidateFull()); auto expected = array->Slice(offsets.front(), offsets.back() - offsets.front()); + ASSERT_OK(expected->ValidateFull()); auto slices = this->Slices(array, offsets); + for (auto slice : slices) { + ASSERT_OK(slice->ValidateFull()); + } ASSERT_OK_AND_ASSIGN(auto actual, Concatenate(slices)); + ASSERT_OK(actual->ValidateFull()); AssertArraysEqual(*expected, *actual); if (actual->data()->buffers[0]) { CheckTrailingBitsAreZeroed(actual->data()->buffers[0], actual->length()); @@ -155,6 +161,14 @@ TEST_F(ConcatenateTest, StringType) { }); } +TEST_F(ConcatenateTest, StringViewType) { + Check([this](int32_t size, double null_probability, std::shared_ptr* out) { + *out = rng_.StringView(size, /*min_length =*/0, /*max_length =*/40, null_probability, + /*max_buffer_length=*/200); + ASSERT_OK((**out).ValidateFull()); + }); +} + TEST_F(ConcatenateTest, LargeStringType) { Check([this](int32_t size, double null_probability, std::shared_ptr* out) { *out = diff --git a/cpp/src/arrow/array/data.cc b/cpp/src/arrow/array/data.cc index 79595ab7c7c31..678513fd4d151 100644 --- a/cpp/src/arrow/array/data.cc +++ b/cpp/src/arrow/array/data.cc @@ -31,6 +31,7 @@ #include "arrow/status.h" #include "arrow/type.h" #include "arrow/type_traits.h" +#include "arrow/util/binary_view_util.h" #include "arrow/util/bitmap_ops.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" @@ -92,6 +93,11 @@ bool RunEndEncodedMayHaveLogicalNulls(const ArrayData& data) { return ArraySpan(data).MayHaveLogicalNulls(); } +BufferSpan PackVariadicBuffers(util::span> buffers) { + return {const_cast(reinterpret_cast(buffers.data())), + static_cast(buffers.size() * sizeof(std::shared_ptr))}; +} + } // namespace internal std::shared_ptr ArrayData::Make(std::shared_ptr type, int64_t length, @@ -187,7 +193,7 @@ void ArraySpan::SetMembers(const ArrayData& data) { } this->offset = data.offset; - for (int i = 0; i < static_cast(data.buffers.size()); ++i) { + for (int i = 0; i < std::min(static_cast(data.buffers.size()), 3); ++i) { const std::shared_ptr& buffer = data.buffers[i]; // It is the invoker-of-kernels's responsibility to ensure that // const buffers are not written to accidentally. @@ -200,7 +206,7 @@ void ArraySpan::SetMembers(const ArrayData& data) { Type::type type_id = this->type->id(); if (type_id == Type::EXTENSION) { - const ExtensionType* ext_type = checked_cast(this->type); + auto* ext_type = checked_cast(this->type); type_id = ext_type->storage_type()->id(); } @@ -215,6 +221,11 @@ void ArraySpan::SetMembers(const ArrayData& data) { this->buffers[i] = {}; } + if (type_id == Type::STRING_VIEW || type_id == Type::BINARY_VIEW) { + // store the span of data buffers in the third buffer + this->buffers[2] = internal::PackVariadicBuffers(util::span(data.buffers).subspan(2)); + } + if (type_id == Type::DICTIONARY) { this->child_data.resize(1); this->child_data[0].SetMembers(*data.dictionary); @@ -247,6 +258,8 @@ int GetNumBuffers(const DataType& type) { case Type::LARGE_BINARY: case Type::STRING: case Type::LARGE_STRING: + case Type::STRING_VIEW: + case Type::BINARY_VIEW: case Type::DENSE_UNION: return 3; case Type::EXTENSION: @@ -351,6 +364,19 @@ void ArraySpan::FillFromScalar(const Scalar& value) { } this->buffers[2].data = const_cast(data_buffer); this->buffers[2].size = data_size; + } else if (type_id == Type::BINARY_VIEW || type_id == Type::STRING_VIEW) { + const auto& scalar = checked_cast(value); + + this->buffers[1].size = BinaryViewType::kSize; + this->buffers[1].data = scalar.scratch_space_; + static_assert(sizeof(BinaryViewType::c_type) <= sizeof(scalar.scratch_space_)); + auto* view = new (&scalar.scratch_space_) BinaryViewType::c_type; + if (scalar.is_valid) { + *view = util::ToBinaryView(std::string_view{*scalar.value}, 0, 0); + this->buffers[2] = internal::PackVariadicBuffers({&scalar.value, 1}); + } else { + *view = {}; + } } else if (type_id == Type::FIXED_SIZE_BINARY) { const auto& scalar = checked_cast(value); this->buffers[1].data = const_cast(scalar.value->data()); @@ -513,6 +539,14 @@ std::shared_ptr ArraySpan::ToArrayData() const { type_id = ext_type->storage_type()->id(); } + if (HasVariadicBuffers()) { + DCHECK_EQ(result->buffers.size(), 3); + result->buffers.pop_back(); + for (const auto& data_buffer : GetVariadicBuffers()) { + result->buffers.push_back(data_buffer); + } + } + if (type_id == Type::NA) { result->null_count = this->length; } else if (this->buffers[0].data == nullptr) { @@ -531,6 +565,16 @@ std::shared_ptr ArraySpan::ToArrayData() const { return result; } +util::span> ArraySpan::GetVariadicBuffers() const { + DCHECK(HasVariadicBuffers()); + return {buffers[2].data_as>(), + buffers[2].size / sizeof(std::shared_ptr)}; +} + +bool ArraySpan::HasVariadicBuffers() const { + return type->id() == Type::BINARY_VIEW || type->id() == Type::STRING_VIEW; +} + std::shared_ptr ArraySpan::ToArray() const { return MakeArray(this->ToArrayData()); } @@ -722,7 +766,8 @@ struct ViewDataImpl { } RETURN_NOT_OK(CheckInputAvailable()); - const auto& in_spec = in_layouts[in_layout_idx].buffers[in_buffer_idx]; + const auto& in_layout = in_layouts[in_layout_idx]; + const auto& in_spec = in_layout.buffers[in_buffer_idx]; if (out_spec != in_spec) { return InvalidView("incompatible layouts"); } @@ -733,6 +778,18 @@ struct ViewDataImpl { DCHECK_GT(in_data_item->buffers.size(), in_buffer_idx); out_buffers.push_back(in_data_item->buffers[in_buffer_idx]); ++in_buffer_idx; + + if (in_buffer_idx == in_layout.buffers.size()) { + if (out_layout.variadic_spec != in_layout.variadic_spec) { + return InvalidView("incompatible layouts"); + } + + if (in_layout.variadic_spec) { + for (; in_buffer_idx < in_data_item->buffers.size(); ++in_buffer_idx) { + out_buffers.push_back(in_data_item->buffers[in_buffer_idx]); + } + } + } AdjustInputPointer(); } diff --git a/cpp/src/arrow/array/data.h b/cpp/src/arrow/array/data.h index 8c6b250b71adf..40a77640cd1e5 100644 --- a/cpp/src/arrow/array/data.h +++ b/cpp/src/arrow/array/data.h @@ -28,6 +28,7 @@ #include "arrow/type.h" #include "arrow/util/bit_util.h" #include "arrow/util/macros.h" +#include "arrow/util/span.h" #include "arrow/util/visibility.h" namespace arrow { @@ -472,10 +473,12 @@ struct ARROW_EXPORT ArraySpan { void SetSlice(int64_t offset, int64_t length) { this->offset = offset; this->length = length; - if (this->type->id() != Type::NA) { + if (this->type->id() == Type::NA) { + this->null_count = this->length; + } else if (this->MayHaveNulls()) { this->null_count = kUnknownNullCount; } else { - this->null_count = this->length; + this->null_count = 0; } } @@ -530,6 +533,16 @@ struct ARROW_EXPORT ArraySpan { /// \see GetNullCount int64_t ComputeLogicalNullCount() const; + /// Some DataTypes (StringView, BinaryView) may have an arbitrary number of variadic + /// buffers. Since ArraySpan only has 3 buffers, we pack the variadic buffers into + /// buffers[2]; IE buffers[2].data points to the first shared_ptr of the + /// variadic set and buffers[2].size is the number of variadic buffers times + /// sizeof(shared_ptr). + /// + /// \see HasVariadicBuffers + util::span> GetVariadicBuffers() const; + bool HasVariadicBuffers() const; + private: ARROW_FRIEND_EXPORT friend bool internal::IsNullRunEndEncoded(const ArrayData& span, int64_t i); diff --git a/cpp/src/arrow/array/dict_internal.h b/cpp/src/arrow/array/dict_internal.h index 3c1c8c453d1e7..63d8583b17324 100644 --- a/cpp/src/arrow/array/dict_internal.h +++ b/cpp/src/arrow/array/dict_internal.h @@ -150,6 +150,32 @@ struct DictionaryTraits> { } }; +template +struct DictionaryTraits> { + using MemoTableType = typename HashTraits::MemoTableType; + + static_assert(std::is_same_v>); + + // Instead of defining a custom memo table for StringView we reuse BinaryType's, + // then convert to views when we copy data out of the memo table. + static Result> GetDictionaryArrayData( + MemoryPool* pool, const std::shared_ptr& type, + const MemoTableType& memo_table, int64_t start_offset) { + DCHECK(type->id() == Type::STRING_VIEW || type->id() == Type::BINARY_VIEW); + + BinaryViewBuilder builder(pool); + RETURN_NOT_OK(builder.Resize(memo_table.size() - start_offset)); + RETURN_NOT_OK(builder.ReserveData(memo_table.values_size())); + memo_table.VisitValues(static_cast(start_offset), + [&](std::string_view s) { builder.UnsafeAppend(s); }); + + std::shared_ptr out; + RETURN_NOT_OK(builder.FinishInternal(&out)); + out->type = type; + return out; + } +}; + template struct DictionaryTraits> { using MemoTableType = typename HashTraits::MemoTableType; diff --git a/cpp/src/arrow/array/diff.cc b/cpp/src/arrow/array/diff.cc index 800f19b752726..be9597e59b378 100644 --- a/cpp/src/arrow/array/diff.cc +++ b/cpp/src/arrow/array/diff.cc @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -32,6 +33,7 @@ #include "arrow/array/array_decimal.h" #include "arrow/array/array_nested.h" #include "arrow/array/array_primitive.h" +#include "arrow/array/array_run_end.h" #include "arrow/buffer.h" #include "arrow/buffer_builder.h" #include "arrow/extension_type.h" @@ -43,7 +45,9 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" #include "arrow/util/range.h" +#include "arrow/util/ree_util.h" #include "arrow/util/string.h" +#include "arrow/util/unreachable.h" #include "arrow/vendored/datetime.h" #include "arrow/visit_type_inline.h" @@ -96,45 +100,247 @@ static UnitSlice GetView(const UnionArray& array, int64_t index) { return UnitSlice{&array, index}; } -using ValueComparator = std::function; +/// \brief A simple virtual comparator interface for two arrays. +/// +/// The base and target array ara bound at construction time. Then +/// Equals(base_index, target_index) should return true if the values +/// at the given indices are equal. +struct ValueComparator { + virtual ~ValueComparator() = default; + + /// \brief Compare the validity and values at the given indices in the base and target + /// arrays. + /// + /// \param base_index The index in the base array. + /// \param target_index The index in the target array. + /// \return true if the values at the given indices are equal, false otherwise. + /// \pre base_index and target_index are valid indices in their respective arrays. + virtual bool Equals(int64_t base_index, int64_t target_index) = 0; + + /// \brief Return the run length of equal values starting at the given indices in the + /// base and target arrays. + /// + /// \param base_index The starting index in the base array. + /// \param base_length The length of the base array. + /// \param target_index The starting index in the target array. + /// \param target_length The length of the target array. + /// \return The run length of equal values starting at the given indices in the base + /// and target arrays. + virtual int64_t RunLengthOfEqualsFrom(int64_t base_index, int64_t base_length, + int64_t target_index, int64_t target_length) { + int64_t run_length_of_equals = 0; + while (base_index < base_length && target_index < target_length) { + if (!Equals(base_index, target_index)) { + break; + } + base_index += 1; + target_index += 1; + run_length_of_equals += 1; + } + return run_length_of_equals; + } +}; + +template +struct DefaultValueComparator : public ValueComparator { + const ArrayType& base; + const ArrayType& target; + + DefaultValueComparator(const ArrayType& base, const ArrayType& target) + : base(base), target(target) {} + + ~DefaultValueComparator() override = default; + + bool Equals(int64_t base_index, int64_t target_index) override { + const bool base_valid = base.IsValid(base_index); + const bool target_valid = target.IsValid(target_index); + if (base_valid && target_valid) { + return GetView(base, base_index) == GetView(target, target_index); + } + return base_valid == target_valid; + } +}; + +template +class REEValueComparator : public ValueComparator { + private: + const RunEndEncodedArray& base_; + const RunEndEncodedArray& target_; + std::unique_ptr inner_value_comparator_; + ree_util::PhysicalIndexFinder base_physical_index_finder_; + ree_util::PhysicalIndexFinder target_physical_index_finder_; + + public: + REEValueComparator(const RunEndEncodedArray& base, const RunEndEncodedArray& target, + std::unique_ptr&& inner_value_comparator) + : base_(base), + target_(target), + inner_value_comparator_(std::move(inner_value_comparator)), + base_physical_index_finder_(*base_.data()), + target_physical_index_finder_(*target_.data()) { + DCHECK_EQ(*base_.type(), *target_.type()); + } + + ~REEValueComparator() override = default; + + private: + /// \pre 0 <= i < base_.length() + inline int64_t FindPhysicalIndexOnBase(int64_t i) { + return base_physical_index_finder_.FindPhysicalIndex(i); + } + + /// \pre 0 <= i < target_.length() + inline int64_t FindPhysicalIndexOnTarget(int64_t i) { + return target_physical_index_finder_.FindPhysicalIndex(i); + } + + const RunEndCType* base_run_ends() { return base_physical_index_finder_.run_ends; } + + const RunEndCType* target_run_ends() { return target_physical_index_finder_.run_ends; } + + public: + int64_t RunLengthOfEqualsFrom(int64_t base_index, int64_t base_length, + int64_t target_index, int64_t target_length) override { + // Ensure the first search for physical index on the values arrays is safe. + if (base_index >= base_length || target_index >= target_length) { + // Without values on either side, there is no run of equal values. + return 0; + } + + // Translate the two logical indices into physical indices. + int64_t physical_base_index = FindPhysicalIndexOnBase(base_index); + int64_t physical_target_index = FindPhysicalIndexOnTarget(target_index); + + int64_t run_length_of_equals = 0; + // The loop invariant (base_index < base_length && target_index < target_length) + // is valid when the loop starts because of the check above. + for (;;) { + const auto base_run_end = + static_cast(base_run_ends()[physical_base_index]) - base_.offset(); + const auto target_run_end = + static_cast(target_run_ends()[physical_target_index]) - + target_.offset(); + // The end of the runs containing the logical indices, by definition, ends + // after the logical indices. + DCHECK_LT(base_index, base_run_end); + DCHECK_LT(target_index, target_run_end); + + // Compare the physical values that make up the runs containing base_index + // and target_index. + if (!inner_value_comparator_->Equals(physical_base_index, physical_target_index)) { + // First difference found, stop because the run of equal values cannot + // be extended further. + break; + } + + const int64_t base_run = std::min(base_run_end, base_length) - base_index; + const int64_t target_run = std::min(target_run_end, target_length) - target_index; + // Due to the loop-invariant (base_index < base_length && target_index < + // target_length) and properties of the run-ends asserted above, both base_run and + // target_run are strictly greater than zero. + DCHECK_GT(base_run, 0); + DCHECK_GT(target_run, 0); + + // Skip the smallest run (or both runs if they are equal) + const int64_t increment = std::min(base_run, target_run); + physical_base_index += increment == base_run; + physical_target_index += increment == target_run; + + // Since both base_run and target_run are greater than zero, + // increment is also greater than zero... + DCHECK_GT(increment, 0); + // ...which implies that the loop will make progress and eventually terminate + // because base_index or target_index will equal base_length or target_length, + // respectively. + base_index += increment; + target_index += increment; + // The value representing the two runs are equal, so we can assume that at + // least `increment` (size of smallest run) values are equal. + run_length_of_equals += increment; + + if (base_index >= base_length || target_index >= target_length) { + break; + } + } + + return run_length_of_equals; + } + + bool Equals(int64_t base_index, int64_t target_index) override { + const int64_t physical_base_index = FindPhysicalIndexOnBase(base_index); + const int64_t physical_target_index = FindPhysicalIndexOnTarget(target_index); + return inner_value_comparator_->Equals(physical_base_index, physical_target_index); + } +}; + +class ValueComparatorFactory { + private: + std::unique_ptr comparator_; -struct ValueComparatorVisitor { + public: template - Status Visit(const T&) { + Status Visit(const T&, const Array& base, const Array& target) { using ArrayType = typename TypeTraits::ArrayType; - out = [](const Array& base, int64_t base_index, const Array& target, - int64_t target_index) { - return (GetView(checked_cast(base), base_index) == - GetView(checked_cast(target), target_index)); - }; + comparator_ = std::make_unique>( + checked_cast(base), checked_cast(target)); return Status::OK(); } - Status Visit(const NullType&) { return Status::NotImplemented("null type"); } + Status Visit(const NullType&, const Array&, const Array&) { + return Status::NotImplemented("null type"); + } - Status Visit(const ExtensionType&) { return Status::NotImplemented("extension type"); } + Status Visit(const ExtensionType&, const Array&, const Array&) { + return Status::NotImplemented("extension type"); + } - Status Visit(const DictionaryType&) { + Status Visit(const DictionaryType&, const Array& base, const Array& target) { return Status::NotImplemented("dictionary type"); } - Status Visit(const RunEndEncodedType&) { - return Status::NotImplemented("run-end encoded type"); + Status Visit(const RunEndEncodedType& ree_type, const Array& base, + const Array& target) { + const auto& base_ree = checked_cast(base); + const auto& target_ree = checked_cast(target); + + ARROW_ASSIGN_OR_RAISE( + auto inner_values_comparator, + Create(*ree_type.value_type(), *base_ree.values(), *target_ree.values())); + + // Instantiate the specialized comparator types with operator new instead of + // make_unique() to avoid binary bloat. unique_ptr's constructor is templated + // on the type of the deleter and we're fine with destructor calls being virtually + // dispatched via ValueComparator. + ValueComparator* ree_value_comparator = nullptr; + switch (ree_type.run_end_type()->id()) { + case Type::INT16: + ree_value_comparator = new REEValueComparator( + base_ree, target_ree, std::move(inner_values_comparator)); + break; + case Type::INT32: + ree_value_comparator = new REEValueComparator( + base_ree, target_ree, std::move(inner_values_comparator)); + break; + case Type::INT64: + ree_value_comparator = new REEValueComparator( + base_ree, target_ree, std::move(inner_values_comparator)); + break; + default: + Unreachable(); + } + comparator_.reset(ree_value_comparator); + return Status::OK(); } - ValueComparator Create(const DataType& type) { - DCHECK_OK(VisitTypeInline(type, this)); - return out; + static Result> Create(const DataType& type, + const Array& base, + const Array& target) { + ValueComparatorFactory self; + RETURN_NOT_OK(VisitTypeInline(type, &self, base, target)); + return std::move(self.comparator_); } - - ValueComparator out; }; -ValueComparator GetValueComparator(const DataType& type) { - ValueComparatorVisitor type_visitor; - return type_visitor.Create(type); -} - // represents an intermediate state in the comparison of two arrays struct EditPoint { int64_t base, target; @@ -161,33 +367,9 @@ struct EditPoint { class QuadraticSpaceMyersDiff { public: QuadraticSpaceMyersDiff(const Array& base, const Array& target, MemoryPool* pool) - : base_(base), - target_(target), - pool_(pool), - value_comparator_(GetValueComparator(*base.type())), - base_begin_(0), - base_end_(base.length()), - target_begin_(0), - target_end_(target.length()), - endpoint_base_({ExtendFrom({base_begin_, target_begin_}).base}), - insert_({true}) { - if ((base_end_ - base_begin_ == target_end_ - target_begin_) && - endpoint_base_[0] == base_end_) { - // trivial case: base == target - finish_index_ = 0; - } - } - - bool ValuesEqual(int64_t base_index, int64_t target_index) const { - bool base_null = base_.IsNull(base_index); - bool target_null = target_.IsNull(target_index); - if (base_null || target_null) { - // If only one is null, then this is false, otherwise true - return base_null && target_null; - } - return value_comparator_(base_, base_index, target_, target_index); - } + : base_(base), target_(target), pool_(pool) {} + private: // increment the position within base (the element pointed to was deleted) // then extend maximally EditPoint DeleteOne(EditPoint p) const { @@ -209,11 +391,10 @@ class QuadraticSpaceMyersDiff { // increment the position within base and target (the elements skipped in this way were // present in both sequences) EditPoint ExtendFrom(EditPoint p) const { - for (; p.base != base_end_ && p.target != target_end_; ++p.base, ++p.target) { - if (!ValuesEqual(p.base, p.target)) { - break; - } - } + const int64_t run_length_of_equals = + _comparator->RunLengthOfEqualsFrom(p.base, base_end_, p.target, target_end_); + p.base += run_length_of_equals; + p.target += run_length_of_equals; return p; } @@ -321,7 +502,24 @@ class QuadraticSpaceMyersDiff { {field("insert", boolean()), field("run_length", int64())}); } + public: Result> Diff() { + base_begin_ = 0; + base_end_ = base_.length(); + target_begin_ = 0; + target_end_ = target_.length(); + ARROW_ASSIGN_OR_RAISE(_comparator, + ValueComparatorFactory::Create(*base_.type(), base_, target_)); + + finish_index_ = -1; + edit_count_ = 0; + endpoint_base_ = {ExtendFrom({base_begin_, target_begin_}).base}; + insert_ = {true}; + if ((base_end_ - base_begin_ == target_end_ - target_begin_) && + endpoint_base_[0] == base_end_) { + // trivial case: base == target + finish_index_ = 0; + } while (!Done()) { Next(); } @@ -329,14 +527,19 @@ class QuadraticSpaceMyersDiff { } private: + // Constructor-injected references const Array& base_; const Array& target_; MemoryPool* pool_; - ValueComparator value_comparator_; + + // Initialized on Diff() and immutable thereafter + int64_t base_begin_ = 0, base_end_ = -1; + int64_t target_begin_ = 0, target_end_ = -1; + std::unique_ptr _comparator; + + // Initialized on Next() and mutated throughout the diffing process int64_t finish_index_ = -1; int64_t edit_count_ = 0; - int64_t base_begin_, base_end_; - int64_t target_begin_, target_end_; // each element of endpoint_base_ is the furthest position in base reachable given an // edit_count and (# insertions) - (# deletions). Each bit of insert_ records whether // the corresponding furthest position was reached via an insertion or a deletion @@ -386,8 +589,6 @@ Result> Diff(const Array& base, const Array& target return Diff(*base_storage, *target_storage, pool); } else if (base.type()->id() == Type::DICTIONARY) { return Status::NotImplemented("diffing arrays of type ", *base.type()); - } else if (base.type()->id() == Type::RUN_END_ENCODED) { - return Status::NotImplemented("diffing arrays of type ", *base.type()); } else { return QuadraticSpaceMyersDiff(base, target, pool).Diff(); } @@ -476,30 +677,31 @@ class MakeFormatterImpl { return Status::OK(); } - // format Binary, LargeBinary and FixedSizeBinary in hexadecimal template - enable_if_binary_like Visit(const T&) { + enable_if_has_string_view Visit(const T&) { using ArrayType = typename TypeTraits::ArrayType; impl_ = [](const Array& array, int64_t index, std::ostream* os) { - *os << HexEncode(checked_cast(array).GetView(index)); + std::string_view view = checked_cast(array).GetView(index); + if constexpr (T::is_utf8) { + // format String and StringView with \"\n\r\t\\ escaped + *os << '"' << Escape(view) << '"'; + } else { + // format Binary, LargeBinary, BinaryView, and FixedSizeBinary in hexadecimal + *os << HexEncode(view); + } }; return Status::OK(); } - // format Strings with \"\n\r\t\\ escaped + // format Decimals with Decimal___Array::FormatValue template - enable_if_string_like Visit(const T&) { - using ArrayType = typename TypeTraits::ArrayType; + enable_if_decimal Visit(const T&) { impl_ = [](const Array& array, int64_t index, std::ostream* os) { - *os << "\"" << Escape(checked_cast(array).GetView(index)) << "\""; - }; - return Status::OK(); - } - - // format Decimals with Decimal128Array::FormatValue - Status Visit(const Decimal128Type&) { - impl_ = [](const Array& array, int64_t index, std::ostream* os) { - *os << checked_cast(array).FormatValue(index); + if constexpr (T::type_id == Type::DECIMAL128) { + *os << checked_cast(array).FormatValue(index); + } else { + *os << checked_cast(array).FormatValue(index); + } }; return Status::OK(); } diff --git a/cpp/src/arrow/array/diff_test.cc b/cpp/src/arrow/array/diff_test.cc index e322006732f9a..145978a91ad54 100644 --- a/cpp/src/arrow/array/diff_test.cc +++ b/cpp/src/arrow/array/diff_test.cc @@ -163,6 +163,107 @@ class DiffTest : public ::testing::Test { AssertRunLengthIs("[2, 0, 0]"); } + std::shared_ptr RunEndEncodedArrayFromJSON( + int64_t logical_length, const std::shared_ptr& ree_type, + std::string_view run_ends_json, std::string_view values_json, + int64_t logical_offset = 0) { + auto& ree_type_ref = checked_cast(*ree_type); + auto run_ends = ArrayFromJSON(ree_type_ref.run_end_type(), run_ends_json); + auto values = ArrayFromJSON(ree_type_ref.value_type(), values_json); + return RunEndEncodedArray::Make(logical_length, std::move(run_ends), + std::move(values), logical_offset) + .ValueOrDie(); + } + + template + void TestBasicsWithREEs() { + auto run_end_type = std::make_shared(); + auto value_type = utf8(); + auto ree_type = run_end_encoded(run_end_type, value_type); + + // empty REEs + base_ = RunEndEncodedArrayFromJSON(0, ree_type, "[]", "[]"); + target_ = RunEndEncodedArrayFromJSON(0, ree_type, "[]", "[]"); + DoDiff(); + AssertInsertIs("[false]"); + AssertRunLengthIs("[0]"); + + // null REE arrays of different lengths + base_ = RunEndEncodedArrayFromJSON(2, ree_type, "[2]", "[null]"); + target_ = RunEndEncodedArrayFromJSON(4, ree_type, "[4]", "[null]"); + DoDiff(); + AssertInsertIs("[false, true, true]"); + AssertRunLengthIs("[2, 0, 0]"); + + // identical REE arrays w/ offsets + base_ = + RunEndEncodedArrayFromJSON(110, ree_type, R"([20, 120])", R"(["a", "b"])", 10); + target_ = + RunEndEncodedArrayFromJSON(110, ree_type, R"([20, 120])", R"(["a", "b"])", 10); + DoDiff(); + AssertInsertIs("[false]"); + AssertRunLengthIs("[110]"); + + // equivalent REE arrays + base_ = RunEndEncodedArrayFromJSON(120, ree_type, R"([10, 20, 120])", + R"(["a", "a", "b"])"); + target_ = RunEndEncodedArrayFromJSON(120, ree_type, R"([20, 30, 120])", + R"(["a", "b", "b"])"); + DoDiff(); + AssertInsertIs("[false]"); + AssertRunLengthIs("[120]"); + + // slice so last run-end goes beyond length + base_ = base_->Slice(5, 105); + target_ = target_->Slice(5, 105); + DoDiff(); + AssertInsertIs("[false]"); + AssertRunLengthIs("[105]"); + + // insert one + base_ = RunEndEncodedArrayFromJSON(12, ree_type, R"([2, 12])", R"(["a", "b"])"); + target_ = RunEndEncodedArrayFromJSON(13, ree_type, R"([3, 13])", R"(["a", "b"])"); + DoDiff(); + AssertInsertIs("[false, true]"); + AssertRunLengthIs("[2, 10]"); + + // delete one + base_ = + RunEndEncodedArrayFromJSON(13, ree_type, R"([2, 5, 13])", R"(["a", "b", "c"])"); + target_ = + RunEndEncodedArrayFromJSON(12, ree_type, R"([2, 4, 12])", R"(["a", "b", "c"])"); + DoDiff(); + AssertInsertIs("[false, false]"); + AssertRunLengthIs("[4, 8]"); + + // null out one + base_ = + RunEndEncodedArrayFromJSON(12, ree_type, R"([2, 5, 12])", R"(["a", "b", "c"])"); + target_ = RunEndEncodedArrayFromJSON(12, ree_type, R"([2, 4, 5, 12])", + R"(["a", "b", null, "c"])"); + DoDiff(); + AssertInsertIs("[false, false, true]"); + AssertRunLengthIs("[4, 0, 7]"); + + // append some + base_ = RunEndEncodedArrayFromJSON(12, ree_type, R"([2, 4, 8, 12])", + R"(["a", "b", "c", "d"])"); + target_ = RunEndEncodedArrayFromJSON(15, ree_type, R"([2, 4, 8, 13, 15])", + R"(["a", "b", "c", "d", "e"])"); + DoDiff(); + AssertInsertIs("[false, true, true, true]"); + AssertRunLengthIs("[12, 0, 0, 0]"); + + // prepend some + base_ = RunEndEncodedArrayFromJSON(12, ree_type, R"([2, 4, 8, 12])", + R"(["c", "d", "e", "f"])"); + target_ = RunEndEncodedArrayFromJSON(15, ree_type, R"([1, 3, 5, 7, 11, 15])", + R"(["a", "b", "c", "d", "e", "f"])"); + DoDiff(); + AssertInsertIs("[false, true, true, true]"); + AssertRunLengthIs("[0, 0, 0, 12]"); + } + random::RandomArrayGenerator rng_; std::shared_ptr edits_; std::shared_ptr base_, target_; @@ -415,6 +516,12 @@ TEST_F(DiffTest, BasicsWithSparseUnions) { TestBasicsWithUnions(UnionMode::SPARS TEST_F(DiffTest, BasicsWithDenseUnions) { TestBasicsWithUnions(UnionMode::DENSE); } +TEST_F(DiffTest, BasicsWithREEs) { + TestBasicsWithREEs(); + TestBasicsWithREEs(); + TestBasicsWithREEs(); +} + TEST_F(DiffTest, UnifiedDiffFormatter) { // no changes base_ = ArrayFromJSON(utf8(), R"(["give", "me", "a", "break"])"); @@ -596,6 +703,19 @@ TEST_F(DiffTest, UnifiedDiffFormatter) { @@ -5, +8 @@ +79 +11 +)"); + } + + for (const auto& type : { + decimal128(10, 4), + decimal256(10, 4), + }) { + base_ = ArrayFromJSON(type, R"(["123.4567", "-78.9000"])"); + target_ = ArrayFromJSON(type, R"(["123.4567", "-123.4567"])"); + AssertDiffAndFormat(R"( +@@ -1, +1 @@ +--78.9000 ++-123.4567 )"); } } diff --git a/cpp/src/arrow/array/util.cc b/cpp/src/arrow/array/util.cc index 98e9d51b5fc75..9ea2fc2b6f0a1 100644 --- a/cpp/src/arrow/array/util.cc +++ b/cpp/src/arrow/array/util.cc @@ -43,6 +43,9 @@ #include "arrow/util/decimal.h" #include "arrow/util/endian.h" #include "arrow/util/logging.h" +#include "arrow/util/sort.h" +#include "arrow/util/span.h" +#include "arrow/visit_data_inline.h" #include "arrow/visit_type_inline.h" namespace arrow { @@ -271,6 +274,13 @@ class ArrayDataEndianSwapper { return Status::OK(); } + Status Visit(const BinaryViewType& type) { + // TODO(GH-37879): This requires knowledge of whether the array is being swapped to + // native endian or from it so that we know what size to trust when deciding whether + // something is an inline view. + return Status::NotImplemented("Swapping endianness of ", type); + } + Status Visit(const ListType& type) { RETURN_NOT_OK(SwapOffsets(1)); return Status::OK(); @@ -379,6 +389,10 @@ class NullArrayFactory { return MaxOf(sizeof(typename T::offset_type) * (length_ + 1)); } + Status Visit(const BinaryViewType& type) { + return MaxOf(sizeof(BinaryViewType::c_type) * length_); + } + Status Visit(const FixedSizeListType& type) { return MaxOf(GetBufferLength(type.value_type(), type.list_size() * length_)); } @@ -498,6 +512,11 @@ class NullArrayFactory { return Status::OK(); } + Status Visit(const BinaryViewType&) { + out_->buffers.resize(2, buffer_); + return Status::OK(); + } + template enable_if_var_size_list Visit(const T& type) { out_->buffers.resize(2, buffer_); @@ -600,6 +619,11 @@ class RepeatedArrayFactory { RepeatedArrayFactory(MemoryPool* pool, const Scalar& scalar, int64_t length) : pool_(pool), scalar_(scalar), length_(length) {} + template + const auto& scalar() const { + return checked_cast::ScalarType&>(scalar_); + } + Result> Create() { RETURN_NOT_OK(VisitTypeInline(*scalar_.type, this)); return out_; @@ -621,7 +645,7 @@ class RepeatedArrayFactory { template enable_if_t::value || is_temporal_type::value, Status> Visit( const T&) { - auto value = checked_cast::ScalarType&>(scalar_).value; + auto value = scalar().value; return FinishFixedWidth(&value, sizeof(value)); } @@ -632,8 +656,7 @@ class RepeatedArrayFactory { template enable_if_decimal Visit(const T&) { - using ScalarType = typename TypeTraits::ScalarType; - auto value = checked_cast(scalar_).value.ToBytes(); + auto value = scalar().value.ToBytes(); return FinishFixedWidth(value.data(), value.size()); } @@ -644,29 +667,36 @@ class RepeatedArrayFactory { template enable_if_base_binary Visit(const T&) { - std::shared_ptr value = - checked_cast::ScalarType&>(scalar_).value; + const std::shared_ptr& value = scalar().value; std::shared_ptr values_buffer, offsets_buffer; RETURN_NOT_OK(CreateBufferOf(value->data(), value->size(), &values_buffer)); auto size = static_cast(value->size()); RETURN_NOT_OK(CreateOffsetsBuffer(size, &offsets_buffer)); - out_ = std::make_shared::ArrayType>(length_, offsets_buffer, - values_buffer); + out_ = std::make_shared::ArrayType>( + length_, std::move(offsets_buffer), std::move(values_buffer)); + return Status::OK(); + } + + template + enable_if_binary_view_like Visit(const T& type) { + std::string_view value{*scalar().value}; + auto s = util::ToBinaryView(value, 0, 0); + RETURN_NOT_OK(FinishFixedWidth(&s, sizeof(s))); + if (!s.is_inline()) { + out_->data()->buffers.push_back(scalar().value); + } return Status::OK(); } template enable_if_var_size_list Visit(const T& type) { - using ScalarType = typename TypeTraits::ScalarType; using ArrayType = typename TypeTraits::ArrayType; - auto value = checked_cast(scalar_).value; - - ArrayVector values(length_, value); + ArrayVector values(length_, scalar().value); ARROW_ASSIGN_OR_RAISE(auto value_array, Concatenate(values, pool_)); std::shared_ptr offsets_buffer; - auto size = static_cast(value->length()); + auto size = static_cast(scalar().value->length()); RETURN_NOT_OK(CreateOffsetsBuffer(size, &offsets_buffer)); out_ = diff --git a/cpp/src/arrow/array/validate.cc b/cpp/src/arrow/array/validate.cc index 19ff8e28b536c..3dde41b1450e8 100644 --- a/cpp/src/arrow/array/validate.cc +++ b/cpp/src/arrow/array/validate.cc @@ -31,41 +31,43 @@ #include "arrow/util/int_util_overflow.h" #include "arrow/util/logging.h" #include "arrow/util/ree_util.h" +#include "arrow/util/sort.h" +#include "arrow/util/string.h" +#include "arrow/util/unreachable.h" #include "arrow/util/utf8.h" #include "arrow/visit_data_inline.h" #include "arrow/visit_type_inline.h" -namespace arrow { -namespace internal { +namespace arrow::internal { namespace { struct UTF8DataValidator { const ArrayData& data; - Status Visit(const DataType&) { - // Default, should be unreachable - return Status::NotImplemented(""); - } + template + Status Visit(const T&) { + if constexpr (std::is_same_v || std::is_same_v || + std::is_same_v) { + util::InitializeUTF8(); - template - enable_if_string Visit(const StringType&) { - util::InitializeUTF8(); - - int64_t i = 0; - return VisitArraySpanInline( - data, - [&](std::string_view v) { - if (ARROW_PREDICT_FALSE(!util::ValidateUTF8(v))) { - return Status::Invalid("Invalid UTF8 sequence at string index ", i); - } - ++i; - return Status::OK(); - }, - [&]() { - ++i; - return Status::OK(); - }); + int64_t i = 0; + return VisitArraySpanInline( + data, + [&](std::string_view v) { + if (ARROW_PREDICT_FALSE(!util::ValidateUTF8(v))) { + return Status::Invalid("Invalid UTF8 sequence at string index ", i); + } + ++i; + return Status::OK(); + }, + [&]() { + ++i; + return Status::OK(); + }); + } else { + Unreachable("utf-8 validation of non string type"); + } } }; @@ -169,6 +171,14 @@ struct ValidateArrayImpl { return Status::OK(); } + Status Visit(const StringViewType& type) { + RETURN_NOT_OK(ValidateBinaryView(type)); + if (full_validation) { + RETURN_NOT_OK(ValidateUTF8(data)); + } + return Status::OK(); + } + Status Visit(const Date64Type& type) { RETURN_NOT_OK(ValidateFixedWidthBuffers()); @@ -248,6 +258,8 @@ struct ValidateArrayImpl { Status Visit(const LargeBinaryType& type) { return ValidateBinaryLike(type); } + Status Visit(const BinaryViewType& type) { return ValidateBinaryView(type); } + Status Visit(const ListType& type) { return ValidateListLike(type); } Status Visit(const LargeListType& type) { return ValidateListLike(type); } @@ -453,11 +465,16 @@ struct ValidateArrayImpl { return Status::Invalid("Array length is negative"); } - if (data.buffers.size() != layout.buffers.size()) { + if (layout.variadic_spec) { + if (data.buffers.size() < layout.buffers.size()) { + return Status::Invalid("Expected at least ", layout.buffers.size(), + " buffers in array of type ", type.ToString(), ", got ", + data.buffers.size()); + } + } else if (data.buffers.size() != layout.buffers.size()) { return Status::Invalid("Expected ", layout.buffers.size(), - " buffers in array " - "of type ", - type.ToString(), ", got ", data.buffers.size()); + " buffers in array of type ", type.ToString(), ", got ", + data.buffers.size()); } // This check is required to avoid addition overflow below @@ -469,7 +486,9 @@ struct ValidateArrayImpl { for (int i = 0; i < static_cast(data.buffers.size()); ++i) { const auto& buffer = data.buffers[i]; - const auto& spec = layout.buffers[i]; + const auto& spec = i < static_cast(layout.buffers.size()) + ? layout.buffers[i] + : *layout.variadic_spec; if (buffer == nullptr) { continue; @@ -595,6 +614,85 @@ struct ValidateArrayImpl { return Status::OK(); } + Status ValidateBinaryView(const BinaryViewType& type) { + int64_t views_byte_size = data.buffers[1]->size(); + int64_t required_view_count = data.length + data.offset; + if (static_cast(views_byte_size / BinaryViewType::kSize) < + required_view_count) { + return Status::Invalid("View buffer size (bytes): ", views_byte_size, + " isn't large enough for length: ", data.length, + " and offset: ", data.offset); + } + + if (!full_validation) return Status::OK(); + + auto CheckPrefix = [&](size_t i, + std::array prefix, + const uint8_t* data) { + if (std::memcmp(data, prefix.data(), BinaryViewType::kPrefixSize) == 0) { + return Status::OK(); + } + return Status::Invalid("View at slot ", i, " has inlined prefix 0x", + HexEncode(prefix.data(), BinaryViewType::kPrefixSize), + " but the out-of-line data begins with 0x", + HexEncode(data, BinaryViewType::kPrefixSize)); + }; + + util::span views(data.GetValues(1), + static_cast(data.length)); + util::span data_buffers(data.buffers.data() + 2, data.buffers.size() - 2); + + for (size_t i = 0; i < static_cast(data.length); ++i) { + if (data.IsNull(i)) continue; + + if (views[i].size() < 0) { + return Status::Invalid("View at slot ", i, " has negative size ", + views[i].size()); + } + + if (views[i].is_inline()) { + auto padding_bytes = util::span(views[i].inlined.data).subspan(views[i].size()); + for (auto padding_byte : padding_bytes) { + if (padding_byte != 0) { + return Status::Invalid("View at slot ", i, " was inline with size ", + views[i].size(), + " but its padding bytes were not all zero: ", + HexEncode(padding_bytes.data(), padding_bytes.size())); + } + } + continue; + } + + auto [size, prefix, buffer_index, offset] = views[i].ref; + + if (buffer_index < 0) { + return Status::Invalid("View at slot ", i, " has negative buffer index ", + buffer_index); + } + + if (offset < 0) { + return Status::Invalid("View at slot ", i, " has negative offset ", offset); + } + + if (static_cast(buffer_index) >= data_buffers.size()) { + return Status::IndexError("View at slot ", i, " references buffer ", buffer_index, + " but there are only ", data_buffers.size(), + " data buffers"); + } + const auto& buffer = data_buffers[buffer_index]; + + if (int64_t end = offset + static_cast(size); end > buffer->size()) { + return Status::IndexError( + "View at slot ", i, " references range ", offset, "-", end, " of buffer ", + buffer_index, " but that buffer is only ", buffer->size(), " bytes long"); + } + + RETURN_NOT_OK(CheckPrefix(i, prefix, buffer->data() + offset)); + } + + return Status::OK(); + } + template Status ValidateListLike(const ListType& type) { const ArrayData& values = *data.child_data[0]; @@ -798,7 +896,8 @@ Status ValidateArrayFull(const Array& array) { return ValidateArrayFull(*array.d ARROW_EXPORT Status ValidateUTF8(const ArrayData& data) { - DCHECK(data.type->id() == Type::STRING || data.type->id() == Type::LARGE_STRING); + DCHECK(data.type->id() == Type::STRING || data.type->id() == Type::STRING_VIEW || + data.type->id() == Type::LARGE_STRING); UTF8DataValidator validator{data}; return VisitTypeInline(*data.type, &validator); } @@ -806,5 +905,4 @@ Status ValidateUTF8(const ArrayData& data) { ARROW_EXPORT Status ValidateUTF8(const Array& array) { return ValidateUTF8(*array.data()); } -} // namespace internal -} // namespace arrow +} // namespace arrow::internal diff --git a/cpp/src/arrow/buffer_builder.h b/cpp/src/arrow/buffer_builder.h index e7eea64043ba8..a84c98b6b2491 100644 --- a/cpp/src/arrow/buffer_builder.h +++ b/cpp/src/arrow/buffer_builder.h @@ -143,7 +143,10 @@ class ARROW_EXPORT BufferBuilder { memcpy(data_ + size_, data, static_cast(length)); size_ += length; } - void UnsafeAppend(std::string_view v) { UnsafeAppend(v.data(), v.size()); } + + void UnsafeAppend(std::string_view v) { + UnsafeAppend(v.data(), static_cast(v.size())); + } void UnsafeAppend(const int64_t num_copies, uint8_t value) { memset(data_ + size_, value, static_cast(num_copies)); @@ -268,7 +271,7 @@ class TypedBufferBuilder< template void UnsafeAppend(Iter values_begin, Iter values_end) { - int64_t num_elements = static_cast(std::distance(values_begin, values_end)); + auto num_elements = static_cast(std::distance(values_begin, values_end)); auto data = mutable_data() + length(); bytes_builder_.UnsafeAdvance(num_elements * sizeof(T)); std::copy(values_begin, values_end, data); diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index caddbf9db5578..c7e6207bfefa4 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -148,6 +148,8 @@ struct DictionaryBuilderCase { Status Visit(const StringType&) { return CreateFor(); } Status Visit(const LargeBinaryType&) { return CreateFor(); } Status Visit(const LargeStringType&) { return CreateFor(); } + Status Visit(const BinaryViewType&) { return CreateFor(); } + Status Visit(const StringViewType&) { return CreateFor(); } Status Visit(const FixedSizeBinaryType&) { return CreateFor(); } Status Visit(const Decimal128Type&) { return CreateFor(); } Status Visit(const Decimal256Type&) { return CreateFor(); } @@ -190,7 +192,7 @@ struct DictionaryBuilderCase { struct MakeBuilderImpl { template - enable_if_not_nested Visit(const T&) { + enable_if_not_nested Visit(const T& t) { out.reset(new typename TypeTraits::BuilderType(type, pool)); return Status::OK(); } diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index df41cd22c9e06..50cfdd05a14bb 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -38,6 +38,7 @@ #include "arrow/tensor.h" #include "arrow/type.h" #include "arrow/type_traits.h" +#include "arrow/util/binary_view_util.h" #include "arrow/util/bit_run_reader.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" @@ -261,6 +262,25 @@ class RangeDataEqualsImpl { // Also matches StringType Status Visit(const BinaryType& type) { return CompareBinary(type); } + // Also matches StringViewType + Status Visit(const BinaryViewType& type) { + auto* left_values = left_.GetValues(1) + left_start_idx_; + auto* right_values = right_.GetValues(1) + right_start_idx_; + + auto* left_buffers = left_.buffers.data() + 2; + auto* right_buffers = right_.buffers.data() + 2; + VisitValidRuns([&](int64_t i, int64_t length) { + for (auto end_i = i + length; i < end_i; ++i) { + if (!util::EqualBinaryView(left_values[i], right_values[i], left_buffers, + right_buffers)) { + return false; + } + } + return true; + }); + return Status::OK(); + } + // Also matches LargeStringType Status Visit(const LargeBinaryType& type) { return CompareBinary(type); } @@ -632,6 +652,11 @@ class TypeEqualsVisitor { return Status::OK(); } + Status Visit(const BinaryViewType&) { + result_ = true; + return Status::OK(); + } + template enable_if_interval Visit(const T& left) { const auto& right = checked_cast(right_); @@ -802,8 +827,7 @@ class ScalarEqualsVisitor { Status Visit(const DoubleScalar& left) { return CompareFloating(left); } template - typename std::enable_if::value, Status>::type - Visit(const T& left) { + enable_if_t::value, Status> Visit(const T& left) { const auto& right = checked_cast(right_); result_ = internal::SharedPtrEquals(left.value, right.value); return Status::OK(); diff --git a/cpp/src/arrow/compute/kernels/vector_hash.cc b/cpp/src/arrow/compute/kernels/vector_hash.cc index d9143b760f32b..5426dc405429c 100644 --- a/cpp/src/arrow/compute/kernels/vector_hash.cc +++ b/cpp/src/arrow/compute/kernels/vector_hash.cc @@ -31,6 +31,7 @@ #include "arrow/compute/kernels/common_internal.h" #include "arrow/result.h" #include "arrow/util/hashing.h" +#include "arrow/util/unreachable.h" namespace arrow { @@ -262,7 +263,7 @@ class HashKernel : public KernelState { // Base class for all "regular" hash kernel implementations // (NullType has a separate implementation) -template class RegularHashKernel : public HashKernel { public: @@ -503,39 +504,13 @@ class DictionaryHashKernel : public HashKernel { }; // ---------------------------------------------------------------------- - -template -struct HashKernelTraits {}; - -template -struct HashKernelTraits> { - using HashKernel = NullHashKernel; -}; - -template -struct HashKernelTraits> { - using HashKernel = RegularHashKernel; -}; - -template -struct HashKernelTraits> { - using HashKernel = RegularHashKernel; -}; - -template -Result> HashInitImpl(KernelContext* ctx, - const KernelInitArgs& args) { - using HashKernelType = typename HashKernelTraits::HashKernel; - auto result = std::make_unique(args.inputs[0].GetSharedPtr(), - args.options, ctx->memory_pool()); - RETURN_NOT_OK(result->Reset()); - return std::move(result); -} - -template +template Result> HashInit(KernelContext* ctx, const KernelInitArgs& args) { - return HashInitImpl(ctx, args); + auto result = std::make_unique(args.inputs[0].GetSharedPtr(), args.options, + ctx->memory_pool()); + RETURN_NOT_OK(result->Reset()); + return std::move(result); } template @@ -544,22 +519,22 @@ KernelInit GetHashInit(Type::type type_id) { // representation switch (type_id) { case Type::NA: - return HashInit; + return HashInit>; case Type::BOOL: - return HashInit; + return HashInit>; case Type::INT8: case Type::UINT8: - return HashInit; + return HashInit>; case Type::INT16: case Type::UINT16: - return HashInit; + return HashInit>; case Type::INT32: case Type::UINT32: case Type::FLOAT: case Type::DATE32: case Type::TIME32: case Type::INTERVAL_MONTHS: - return HashInit; + return HashInit>; case Type::INT64: case Type::UINT64: case Type::DOUBLE: @@ -568,22 +543,24 @@ KernelInit GetHashInit(Type::type type_id) { case Type::TIMESTAMP: case Type::DURATION: case Type::INTERVAL_DAY_TIME: - return HashInit; + return HashInit>; case Type::BINARY: case Type::STRING: - return HashInit; + return HashInit>; case Type::LARGE_BINARY: case Type::LARGE_STRING: - return HashInit; + return HashInit>; + case Type::BINARY_VIEW: + case Type::STRING_VIEW: + return HashInit>; case Type::FIXED_SIZE_BINARY: case Type::DECIMAL128: case Type::DECIMAL256: - return HashInit; + return HashInit>; case Type::INTERVAL_MONTH_DAY_NANO: - return HashInit; + return HashInit>; default: - DCHECK(false); - return nullptr; + Unreachable("non hashable type"); } } @@ -593,31 +570,11 @@ template Result> DictionaryHashInit(KernelContext* ctx, const KernelInitArgs& args) { const auto& dict_type = checked_cast(*args.inputs[0].type); - Result> indices_hasher; - switch (dict_type.index_type()->id()) { - case Type::INT8: - case Type::UINT8: - indices_hasher = HashInitImpl(ctx, args); - break; - case Type::INT16: - case Type::UINT16: - indices_hasher = HashInitImpl(ctx, args); - break; - case Type::INT32: - case Type::UINT32: - indices_hasher = HashInitImpl(ctx, args); - break; - case Type::INT64: - case Type::UINT64: - indices_hasher = HashInitImpl(ctx, args); - break; - default: - DCHECK(false) << "Unsupported dictionary index type"; - break; - } - RETURN_NOT_OK(indices_hasher); - return std::make_unique(std::move(indices_hasher.ValueOrDie()), - dict_type.value_type()); + ARROW_ASSIGN_OR_RAISE(auto indices_hasher, + GetHashInit(dict_type.index_type()->id())(ctx, args)); + return std::make_unique( + checked_pointer_cast(std::move(indices_hasher)), + dict_type.value_type()); } Status HashExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { diff --git a/cpp/src/arrow/engine/substrait/expression_internal.cc b/cpp/src/arrow/engine/substrait/expression_internal.cc index 0df8425609ff1..d395261597696 100644 --- a/cpp/src/arrow/engine/substrait/expression_internal.cc +++ b/cpp/src/arrow/engine/substrait/expression_internal.cc @@ -727,6 +727,8 @@ struct ScalarToProtoImpl { s); } + Status Visit(const BinaryViewScalar& s) { return NotImplemented(s); } + Status Visit(const FixedSizeBinaryScalar& s) { return FromBuffer( [](Lit* lit, std::string&& s) { lit->set_fixed_binary(std::move(s)); }, s); diff --git a/cpp/src/arrow/engine/substrait/type_internal.cc b/cpp/src/arrow/engine/substrait/type_internal.cc index 1f9141f36ba6b..d3fb058137e6a 100644 --- a/cpp/src/arrow/engine/substrait/type_internal.cc +++ b/cpp/src/arrow/engine/substrait/type_internal.cc @@ -263,6 +263,8 @@ struct DataTypeToProtoImpl { return SetWith(&substrait::Type::set_allocated_binary); } + Status Visit(const BinaryViewType& t) { return NotImplemented(t); } + Status Visit(const FixedSizeBinaryType& t) { SetWithThen(&substrait::Type::set_allocated_fixed_binary)->set_length(t.byte_width()); return Status::OK(); diff --git a/cpp/src/arrow/integration/json_internal.cc b/cpp/src/arrow/integration/json_internal.cc index ed7be4b502985..59749c36a958e 100644 --- a/cpp/src/arrow/integration/json_internal.cc +++ b/cpp/src/arrow/integration/json_internal.cc @@ -48,6 +48,7 @@ #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" #include "arrow/util/range.h" +#include "arrow/util/span.h" #include "arrow/util/string.h" #include "arrow/util/value_parsing.h" #include "arrow/visit_array_inline.h" @@ -106,6 +107,13 @@ std::string GetTimeUnitName(TimeUnit::type unit) { return "UNKNOWN"; } +Result GetStringView(const rj::Value& str) { + if (!str.IsString()) { + return Status::Invalid("field was not a string"); + } + return std::string_view{str.GetString(), str.GetStringLength()}; +} + class SchemaWriter { public: explicit SchemaWriter(const Schema& schema, const DictionaryFieldMapper& mapper, @@ -226,8 +234,9 @@ class SchemaWriter { template enable_if_t::value || is_primitive_ctype::value || - is_base_binary_type::value || is_var_length_list_type::value || - is_struct_type::value || is_run_end_encoded_type::value> + is_base_binary_type::value || is_binary_view_like_type::value || + is_var_length_list_type::value || is_struct_type::value || + is_run_end_encoded_type::value> WriteTypeMetadata(const T& type) {} void WriteTypeMetadata(const MapType& type) { @@ -382,6 +391,8 @@ class SchemaWriter { Status Visit(const TimeType& type) { return WritePrimitive("time", type); } Status Visit(const StringType& type) { return WriteVarBytes("utf8", type); } Status Visit(const BinaryType& type) { return WriteVarBytes("binary", type); } + Status Visit(const StringViewType& type) { return WritePrimitive("utf8view", type); } + Status Visit(const BinaryViewType& type) { return WritePrimitive("binaryview", type); } Status Visit(const LargeStringType& type) { return WriteVarBytes("largeutf8", type); } Status Visit(const LargeBinaryType& type) { return WriteVarBytes("largebinary", type); } Status Visit(const FixedSizeBinaryType& type) { @@ -528,22 +539,19 @@ class ArrayWriter { } } - // Binary, encode to hexadecimal. - template - enable_if_binary_like WriteDataValues( - const ArrayType& arr) { - for (int64_t i = 0; i < arr.length(); ++i) { - writer_->String(HexEncode(arr.GetView(i))); - } - } - - // UTF8 string, write as is - template - enable_if_string_like WriteDataValues( - const ArrayType& arr) { + template + std::enable_if_t::value || + is_fixed_size_binary_type::value> + WriteDataValues(const ArrayType& arr) { for (int64_t i = 0; i < arr.length(); ++i) { - auto view = arr.GetView(i); - writer_->String(view.data(), static_cast(view.size())); + if constexpr (Type::is_utf8) { + // UTF8 string, write as is + auto view = arr.GetView(i); + writer_->String(view.data(), static_cast(view.size())); + } else { + // Binary, encode to hexadecimal. + writer_->String(HexEncode(arr.GetView(i))); + } } } @@ -642,6 +650,50 @@ class ArrayWriter { writer_->EndArray(); } + template + void WriteBinaryViewField(const ArrayType& array) { + writer_->Key("VIEWS"); + writer_->StartArray(); + for (int64_t i = 0; i < array.length(); ++i) { + auto s = array.raw_values()[i]; + writer_->StartObject(); + writer_->Key("SIZE"); + writer_->Int64(s.size()); + if (s.is_inline()) { + writer_->Key("INLINED"); + if constexpr (ArrayType::TypeClass::is_utf8) { + writer_->String(reinterpret_cast(s.inline_data()), s.size()); + } else { + writer_->String(HexEncode(s.inline_data(), s.size())); + } + } else { + // Prefix is always 4 bytes so it may not be utf-8 even if the whole + // string view is + writer_->Key("PREFIX_HEX"); + writer_->String(HexEncode(s.inline_data(), BinaryViewType::kPrefixSize)); + writer_->Key("BUFFER_INDEX"); + writer_->Int64(s.ref.buffer_index); + writer_->Key("OFFSET"); + writer_->Int64(s.ref.offset); + } + writer_->EndObject(); + } + writer_->EndArray(); + } + + void WriteVariadicBuffersField(const BinaryViewArray& arr) { + writer_->Key("VARIADIC_DATA_BUFFERS"); + writer_->StartArray(); + const auto& buffers = arr.data()->buffers; + for (size_t i = 2; i < buffers.size(); ++i) { + // Encode the data buffers into hexadecimal strings. + // Even for arrays which contain utf-8, portions of the buffer not + // referenced by any view may be invalid. + writer_->String(buffers[i]->ToHexString()); + } + writer_->EndArray(); + } + void WriteValidityField(const Array& arr) { writer_->Key("VALIDITY"); writer_->StartArray(); @@ -682,8 +734,10 @@ class ArrayWriter { } template - enable_if_t::value, Status> Visit( - const ArrayType& array) { + enable_if_t::value && + !is_binary_view_like_type::value, + Status> + Visit(const ArrayType& array) { WriteValidityField(array); WriteDataField(array); SetNoChildren(); @@ -700,6 +754,17 @@ class ArrayWriter { return Status::OK(); } + template + enable_if_binary_view_like Visit( + const ArrayType& array) { + WriteValidityField(array); + WriteBinaryViewField(array); + WriteVariadicBuffersField(array); + + SetNoChildren(); + return Status::OK(); + } + Status Visit(const DictionaryArray& array) { return VisitArrayValues(*array.indices()); } @@ -1033,6 +1098,10 @@ Result> GetType(const RjObject& json_type, return utf8(); } else if (type_name == "binary") { return binary(); + } else if (type_name == "utf8view") { + return utf8_view(); + } else if (type_name == "binaryview") { + return binary_view(); } else if (type_name == "largeutf8") { return large_utf8(); } else if (type_name == "largebinary") { @@ -1246,10 +1315,12 @@ class ArrayReader { return Status::OK(); } - Result GetDataArray(const RjObject& obj) { - ARROW_ASSIGN_OR_RAISE(const auto json_data_arr, GetMemberArray(obj, kData)); + Result GetDataArray(const RjObject& obj, + const std::string& key = kData) { + ARROW_ASSIGN_OR_RAISE(const auto json_data_arr, GetMemberArray(obj, key)); if (static_cast(json_data_arr.Size()) != length_) { - return Status::Invalid("JSON DATA array size differs from advertised array length"); + return Status::Invalid("JSON ", key, " array size ", json_data_arr.Size(), + " differs from advertised array length ", length_); } return json_data_arr; } @@ -1293,10 +1364,7 @@ class ArrayReader { RETURN_NOT_OK(builder.AppendNull()); continue; } - - DCHECK(json_val.IsString()); - std::string_view val{ - json_val.GetString()}; // XXX can we use json_val.GetStringLength()? + ARROW_ASSIGN_OR_RAISE(auto val, GetStringView(json_val)); int64_t offset_start = ParseOffset(json_offsets[i]); int64_t offset_end = ParseOffset(json_offsets[i + 1]); @@ -1332,6 +1400,97 @@ class ArrayReader { return FinishBuilder(&builder); } + template + enable_if_binary_view_like Visit(const ViewType& type) { + ARROW_ASSIGN_OR_RAISE(const auto json_views, GetDataArray(obj_, "VIEWS")); + ARROW_ASSIGN_OR_RAISE(const auto json_variadic_bufs, + GetMemberArray(obj_, "VARIADIC_DATA_BUFFERS")); + + using internal::Zip; + using util::span; + + BufferVector buffers; + buffers.resize(json_variadic_bufs.Size() + 2); + for (auto [json_buf, buf] : Zip(json_variadic_bufs, span{buffers}.subspan(2))) { + ARROW_ASSIGN_OR_RAISE(auto hex_string, GetStringView(json_buf)); + ARROW_ASSIGN_OR_RAISE( + buf, AllocateBuffer(static_cast(hex_string.size()) / 2, pool_)); + RETURN_NOT_OK(ParseHexValues(hex_string, buf->mutable_data())); + } + + TypedBufferBuilder validity_builder{pool_}; + RETURN_NOT_OK(validity_builder.Resize(length_)); + for (bool is_valid : is_valid_) { + validity_builder.UnsafeAppend(is_valid); + } + ARROW_ASSIGN_OR_RAISE(buffers[0], validity_builder.Finish()); + + ARROW_ASSIGN_OR_RAISE( + buffers[1], AllocateBuffer(length_ * sizeof(BinaryViewType::c_type), pool_)); + + span views{buffers[1]->mutable_data_as(), + static_cast(length_)}; + + int64_t null_count = 0; + for (auto [json_view, out_view, is_valid] : Zip(json_views, views, is_valid_)) { + if (!is_valid) { + out_view = {}; + ++null_count; + continue; + } + + DCHECK(json_view.IsObject()); + const auto& json_view_obj = json_view.GetObject(); + + auto json_size = json_view_obj.FindMember("SIZE"); + RETURN_NOT_INT("SIZE", json_size, json_view_obj); + DCHECK_GE(json_size->value.GetInt64(), 0); + auto size = static_cast(json_size->value.GetInt64()); + + if (size <= BinaryViewType::kInlineSize) { + auto json_inlined = json_view_obj.FindMember("INLINED"); + RETURN_NOT_STRING("INLINED", json_inlined, json_view_obj); + out_view.inlined = {size, {}}; + + if constexpr (ViewType::is_utf8) { + DCHECK_LE(json_inlined->value.GetStringLength(), BinaryViewType::kInlineSize); + memcpy(&out_view.inlined.data, json_inlined->value.GetString(), size); + } else { + DCHECK_LE(json_inlined->value.GetStringLength(), + BinaryViewType::kInlineSize * 2); + ARROW_ASSIGN_OR_RAISE(auto inlined, GetStringView(json_inlined->value)); + RETURN_NOT_OK(ParseHexValues(inlined, out_view.inlined.data.data())); + } + continue; + } + + auto json_prefix = json_view_obj.FindMember("PREFIX_HEX"); + auto json_buffer_index = json_view_obj.FindMember("BUFFER_INDEX"); + auto json_offset = json_view_obj.FindMember("OFFSET"); + RETURN_NOT_STRING("PREFIX_HEX", json_prefix, json_view_obj); + RETURN_NOT_INT("BUFFER_INDEX", json_buffer_index, json_view_obj); + RETURN_NOT_INT("OFFSET", json_offset, json_view_obj); + + out_view.ref = { + size, + {}, + static_cast(json_buffer_index->value.GetInt64()), + static_cast(json_offset->value.GetInt64()), + }; + + DCHECK_EQ(json_prefix->value.GetStringLength(), BinaryViewType::kPrefixSize * 2); + ARROW_ASSIGN_OR_RAISE(auto prefix, GetStringView(json_prefix->value)); + RETURN_NOT_OK(ParseHexValues(prefix, out_view.ref.prefix.data())); + + DCHECK_LE(static_cast(out_view.ref.buffer_index), buffers.size() - 2); + DCHECK_LE(static_cast(out_view.ref.offset) + out_view.size(), + buffers[out_view.ref.buffer_index + 2]->size()); + } + + data_ = ArrayData::Make(type_, length_, std::move(buffers), null_count); + return Status::OK(); + } + Status Visit(const DayTimeIntervalType& type) { DayTimeIntervalBuilder builder(pool_); diff --git a/cpp/src/arrow/ipc/feather.cc b/cpp/src/arrow/ipc/feather.cc index b6d3a3d7d8cbb..1ef076fac40e2 100644 --- a/cpp/src/arrow/ipc/feather.cc +++ b/cpp/src/arrow/ipc/feather.cc @@ -536,8 +536,8 @@ struct ArrayWriterV1 { is_nested_type::value || is_null_type::value || is_decimal_type::value || std::is_same::value || is_duration_type::value || is_interval_type::value || is_fixed_size_binary_type::value || - std::is_same::value || std::is_same::value || - std::is_same::value, + is_binary_view_like_type::value || std::is_same::value || + std::is_same::value || std::is_same::value, Status>::type Visit(const T& type) { return Status::NotImplemented(type.ToString()); diff --git a/cpp/src/arrow/ipc/feather_test.cc b/cpp/src/arrow/ipc/feather_test.cc index e1d4282cb2635..0b6ae4f620647 100644 --- a/cpp/src/arrow/ipc/feather_test.cc +++ b/cpp/src/arrow/ipc/feather_test.cc @@ -264,7 +264,8 @@ TEST_P(TestFeather, TimeTypes) { TEST_P(TestFeather, VLenPrimitiveRoundTrip) { std::shared_ptr batch; - ASSERT_OK(ipc::test::MakeStringTypesRecordBatch(&batch)); + ASSERT_OK(ipc::test::MakeStringTypesRecordBatch(&batch, /*with_nulls=*/true, + /*with_view_types=*/false)); CheckRoundtrip(batch); } @@ -306,7 +307,8 @@ TEST_P(TestFeather, SliceFloatRoundTrip) { TEST_P(TestFeather, SliceStringsRoundTrip) { std::shared_ptr batch; - ASSERT_OK(ipc::test::MakeStringTypesRecordBatch(&batch, /*with_nulls=*/true)); + ASSERT_OK(ipc::test::MakeStringTypesRecordBatch(&batch, /*with_nulls=*/true, + /*with_view_types=*/false)); CheckSlices(batch); } diff --git a/cpp/src/arrow/ipc/json_simple.cc b/cpp/src/arrow/ipc/json_simple.cc index eea0c9730283e..4d2d803f3f65e 100644 --- a/cpp/src/arrow/ipc/json_simple.cc +++ b/cpp/src/arrow/ipc/json_simple.cc @@ -847,6 +847,8 @@ Status GetDictConverter(const std::shared_ptr& type, PARAM_CONVERTER_CASE(Type::BINARY, StringConverter, BinaryType) PARAM_CONVERTER_CASE(Type::LARGE_STRING, StringConverter, LargeStringType) PARAM_CONVERTER_CASE(Type::LARGE_BINARY, StringConverter, LargeBinaryType) + PARAM_CONVERTER_CASE(Type::STRING_VIEW, StringConverter, StringViewType) + PARAM_CONVERTER_CASE(Type::BINARY_VIEW, StringConverter, BinaryViewType) SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter, FixedSizeBinaryType) SIMPLE_CONVERTER_CASE(Type::DECIMAL128, Decimal128Converter, Decimal128Type) @@ -905,6 +907,8 @@ Status GetConverter(const std::shared_ptr& type, SIMPLE_CONVERTER_CASE(Type::BINARY, StringConverter) SIMPLE_CONVERTER_CASE(Type::LARGE_STRING, StringConverter) SIMPLE_CONVERTER_CASE(Type::LARGE_BINARY, StringConverter) + SIMPLE_CONVERTER_CASE(Type::STRING_VIEW, StringConverter) + SIMPLE_CONVERTER_CASE(Type::BINARY_VIEW, StringConverter) SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_BINARY, FixedSizeBinaryConverter<>) SIMPLE_CONVERTER_CASE(Type::DECIMAL128, Decimal128Converter<>) SIMPLE_CONVERTER_CASE(Type::DECIMAL256, Decimal256Converter<>) diff --git a/cpp/src/arrow/ipc/json_simple_test.cc b/cpp/src/arrow/ipc/json_simple_test.cc index 6eee5955242aa..b67c26999945b 100644 --- a/cpp/src/arrow/ipc/json_simple_test.cc +++ b/cpp/src/arrow/ipc/json_simple_test.cc @@ -271,7 +271,13 @@ INSTANTIATE_TYPED_TEST_SUITE_P(TestHalfFloat, TestIntegers, HalfFloatType); template class TestStrings : public ::testing::Test { public: - std::shared_ptr type() { return TypeTraits::type_singleton(); } + std::shared_ptr type() const { + if constexpr (is_binary_view_like_type::value) { + return T::is_utf8 ? utf8_view() : binary_view(); + } else { + return TypeTraits::type_singleton(); + } + } }; TYPED_TEST_SUITE_P(TestStrings); @@ -327,6 +333,8 @@ INSTANTIATE_TYPED_TEST_SUITE_P(TestString, TestStrings, StringType); INSTANTIATE_TYPED_TEST_SUITE_P(TestBinary, TestStrings, BinaryType); INSTANTIATE_TYPED_TEST_SUITE_P(TestLargeString, TestStrings, LargeStringType); INSTANTIATE_TYPED_TEST_SUITE_P(TestLargeBinary, TestStrings, LargeBinaryType); +INSTANTIATE_TYPED_TEST_SUITE_P(TestStringView, TestStrings, StringViewType); +INSTANTIATE_TYPED_TEST_SUITE_P(TestBinaryView, TestStrings, BinaryViewType); TEST(TestNull, Basics) { std::shared_ptr type = null(); diff --git a/cpp/src/arrow/ipc/metadata_internal.cc b/cpp/src/arrow/ipc/metadata_internal.cc index 1394516ecd5ce..ab1a58dd1df8b 100644 --- a/cpp/src/arrow/ipc/metadata_internal.cc +++ b/cpp/src/arrow/ipc/metadata_internal.cc @@ -258,6 +258,9 @@ Status ConcreteTypeFromFlatbuffer(flatbuf::Type type, const void* type_data, case flatbuf::Type::LargeBinary: *out = large_binary(); return Status::OK(); + case flatbuf::Type::BinaryView: + *out = binary_view(); + return Status::OK(); case flatbuf::Type::FixedSizeBinary: { auto fw_binary = static_cast(type_data); return FixedSizeBinaryType::Make(fw_binary->byteWidth()).Value(out); @@ -268,6 +271,9 @@ Status ConcreteTypeFromFlatbuffer(flatbuf::Type type, const void* type_data, case flatbuf::Type::LargeUtf8: *out = large_utf8(); return Status::OK(); + case flatbuf::Type::Utf8View: + *out = utf8_view(); + return Status::OK(); case flatbuf::Type::Bool: *out = boolean(); return Status::OK(); @@ -534,6 +540,18 @@ class FieldToFlatbufferVisitor { return Status::OK(); } + Status Visit(const BinaryViewType& type) { + fb_type_ = flatbuf::Type::BinaryView; + type_offset_ = flatbuf::CreateBinaryView(fbb_).Union(); + return Status::OK(); + } + + Status Visit(const StringViewType& type) { + fb_type_ = flatbuf::Type::Utf8View; + type_offset_ = flatbuf::CreateUtf8View(fbb_).Union(); + return Status::OK(); + } + Status Visit(const LargeBinaryType& type) { fb_type_ = flatbuf::Type::LargeBinary; type_offset_ = flatbuf::CreateLargeBinary(fbb_).Union(); @@ -967,6 +985,7 @@ static Status GetBodyCompression(FBB& fbb, const IpcWriteOptions& options, static Status MakeRecordBatch(FBB& fbb, int64_t length, int64_t body_length, const std::vector& nodes, const std::vector& buffers, + const std::vector& variadic_buffer_counts, const IpcWriteOptions& options, RecordBatchOffset* offset) { FieldNodeVector fb_nodes; RETURN_NOT_OK(WriteFieldNodes(fbb, nodes, &fb_nodes)); @@ -977,7 +996,13 @@ static Status MakeRecordBatch(FBB& fbb, int64_t length, int64_t body_length, BodyCompressionOffset fb_compression; RETURN_NOT_OK(GetBodyCompression(fbb, options, &fb_compression)); - *offset = flatbuf::CreateRecordBatch(fbb, length, fb_nodes, fb_buffers, fb_compression); + flatbuffers::Offset> fb_variadic_buffer_counts{}; + if (!variadic_buffer_counts.empty()) { + fb_variadic_buffer_counts = fbb.CreateVector(variadic_buffer_counts); + } + + *offset = flatbuf::CreateRecordBatch(fbb, length, fb_nodes, fb_buffers, fb_compression, + fb_variadic_buffer_counts); return Status::OK(); } @@ -1224,11 +1249,12 @@ Status WriteRecordBatchMessage( int64_t length, int64_t body_length, const std::shared_ptr& custom_metadata, const std::vector& nodes, const std::vector& buffers, - const IpcWriteOptions& options, std::shared_ptr* out) { + const std::vector& variadic_buffer_counts, const IpcWriteOptions& options, + std::shared_ptr* out) { FBB fbb; RecordBatchOffset record_batch; - RETURN_NOT_OK( - MakeRecordBatch(fbb, length, body_length, nodes, buffers, options, &record_batch)); + RETURN_NOT_OK(MakeRecordBatch(fbb, length, body_length, nodes, buffers, + variadic_buffer_counts, options, &record_batch)); return WriteFBMessage(fbb, flatbuf::MessageHeader::RecordBatch, record_batch.Union(), body_length, options.metadata_version, custom_metadata, options.memory_pool) @@ -1285,11 +1311,12 @@ Status WriteDictionaryMessage( int64_t id, bool is_delta, int64_t length, int64_t body_length, const std::shared_ptr& custom_metadata, const std::vector& nodes, const std::vector& buffers, - const IpcWriteOptions& options, std::shared_ptr* out) { + const std::vector& variadic_buffer_counts, const IpcWriteOptions& options, + std::shared_ptr* out) { FBB fbb; RecordBatchOffset record_batch; - RETURN_NOT_OK( - MakeRecordBatch(fbb, length, body_length, nodes, buffers, options, &record_batch)); + RETURN_NOT_OK(MakeRecordBatch(fbb, length, body_length, nodes, buffers, + variadic_buffer_counts, options, &record_batch)); auto dictionary_batch = flatbuf::CreateDictionaryBatch(fbb, id, record_batch, is_delta).Union(); return WriteFBMessage(fbb, flatbuf::MessageHeader::DictionaryBatch, dictionary_batch, diff --git a/cpp/src/arrow/ipc/metadata_internal.h b/cpp/src/arrow/ipc/metadata_internal.h index abbed5b2dace0..631a336f75a9a 100644 --- a/cpp/src/arrow/ipc/metadata_internal.h +++ b/cpp/src/arrow/ipc/metadata_internal.h @@ -201,7 +201,8 @@ Status WriteRecordBatchMessage( const int64_t length, const int64_t body_length, const std::shared_ptr& custom_metadata, const std::vector& nodes, const std::vector& buffers, - const IpcWriteOptions& options, std::shared_ptr* out); + const std::vector& variadic_counts, const IpcWriteOptions& options, + std::shared_ptr* out); ARROW_EXPORT Result> WriteTensorMessage(const Tensor& tensor, @@ -225,7 +226,8 @@ Status WriteDictionaryMessage( const int64_t body_length, const std::shared_ptr& custom_metadata, const std::vector& nodes, const std::vector& buffers, - const IpcWriteOptions& options, std::shared_ptr* out); + const std::vector& variadic_counts, const IpcWriteOptions& options, + std::shared_ptr* out); static inline Result> WriteFlatbufferBuilder( flatbuffers::FlatBufferBuilder& fbb, // NOLINT non-const reference diff --git a/cpp/src/arrow/ipc/read_write_test.cc b/cpp/src/arrow/ipc/read_write_test.cc index 3ae007c20efe7..05a48aec2c7f3 100644 --- a/cpp/src/arrow/ipc/read_write_test.cc +++ b/cpp/src/arrow/ipc/read_write_test.cc @@ -159,7 +159,7 @@ TEST_P(TestMessage, SerializeCustomMetadata) { ASSERT_OK(internal::WriteRecordBatchMessage( /*length=*/0, /*body_length=*/0, metadata, /*nodes=*/{}, - /*buffers=*/{}, options_, &serialized)); + /*buffers=*/{}, /*variadic_counts=*/{}, options_, &serialized)); ASSERT_OK_AND_ASSIGN(std::unique_ptr message, Message::Open(serialized, /*body=*/nullptr)); @@ -240,23 +240,33 @@ class TestSchemaMetadata : public ::testing::Test { } }; -const std::shared_ptr INT32 = std::make_shared(); - TEST_F(TestSchemaMetadata, PrimitiveFields) { - auto f0 = field("f0", std::make_shared()); - auto f1 = field("f1", std::make_shared(), false); - auto f2 = field("f2", std::make_shared()); - auto f3 = field("f3", std::make_shared()); - auto f4 = field("f4", std::make_shared()); - auto f5 = field("f5", std::make_shared()); - auto f6 = field("f6", std::make_shared()); - auto f7 = field("f7", std::make_shared()); - auto f8 = field("f8", std::make_shared()); - auto f9 = field("f9", std::make_shared(), false); - auto f10 = field("f10", std::make_shared()); - - Schema schema({f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10}); - CheckSchemaRoundtrip(schema); + CheckSchemaRoundtrip(Schema({ + field("f0", int8()), + field("f1", int16(), false), + field("f2", int32()), + field("f3", int64()), + field("f4", uint8()), + field("f5", uint16()), + field("f6", uint32()), + field("f7", uint64()), + field("f8", float32()), + field("f9", float64(), false), + field("f10", boolean()), + })); +} + +TEST_F(TestSchemaMetadata, BinaryFields) { + CheckSchemaRoundtrip(Schema({ + field("f0", utf8()), + field("f1", binary()), + field("f2", large_utf8()), + field("f3", large_binary()), + field("f4", utf8_view()), + field("f5", binary_view()), + field("f6", fixed_size_binary(3)), + field("f7", fixed_size_binary(33)), + })); } TEST_F(TestSchemaMetadata, PrimitiveFieldsWithKeyValueMetadata) { @@ -269,15 +279,14 @@ TEST_F(TestSchemaMetadata, PrimitiveFieldsWithKeyValueMetadata) { } TEST_F(TestSchemaMetadata, NestedFields) { - auto type = list(int32()); - auto f0 = field("f0", type); - - std::shared_ptr type2( - new StructType({field("k1", INT32), field("k2", INT32), field("k3", INT32)})); - auto f1 = field("f1", type2); - - Schema schema({f0, f1}); - CheckSchemaRoundtrip(schema); + CheckSchemaRoundtrip(Schema({ + field("f0", list(int32())), + field("f1", struct_({ + field("k1", int32()), + field("k2", int32()), + field("k3", int32()), + })), + })); } // Verify that nullable=false is well-preserved for child fields of map type. @@ -305,19 +314,15 @@ TEST_F(TestSchemaMetadata, NestedFieldsWithKeyValueMetadata) { TEST_F(TestSchemaMetadata, DictionaryFields) { { - auto dict_type = dictionary(int8(), int32(), true /* ordered */); - auto f0 = field("f0", dict_type); - auto f1 = field("f1", list(dict_type)); - - Schema schema({f0, f1}); - CheckSchemaRoundtrip(schema); + auto dict_type = dictionary(int8(), int32(), /*ordered=*/true); + CheckSchemaRoundtrip(Schema({ + field("f0", dict_type), + field("f1", list(dict_type)), + })); } { auto dict_type = dictionary(int8(), list(int32())); - auto f0 = field("f0", dict_type); - - Schema schema({f0}); - CheckSchemaRoundtrip(schema); + CheckSchemaRoundtrip(Schema({field("f0", dict_type)})); } } @@ -325,9 +330,7 @@ TEST_F(TestSchemaMetadata, NestedDictionaryFields) { { auto inner_dict_type = dictionary(int8(), int32(), /*ordered=*/true); auto dict_type = dictionary(int16(), list(inner_dict_type)); - - Schema schema({field("f0", dict_type)}); - CheckSchemaRoundtrip(schema); + CheckSchemaRoundtrip(Schema({field("f0", dict_type)})); } { auto dict_type1 = dictionary(int8(), utf8(), /*ordered=*/true); @@ -2910,21 +2913,21 @@ void GetReadRecordBatchReadRanges( // 1) read magic and footer length IO // 2) read footer IO // 3) read record batch metadata IO - ASSERT_EQ(read_ranges.size(), 3 + expected_body_read_lengths.size()); + EXPECT_EQ(read_ranges.size(), 3 + expected_body_read_lengths.size()); const int32_t magic_size = static_cast(strlen(ipc::internal::kArrowMagicBytes)); // read magic and footer length IO auto file_end_size = magic_size + sizeof(int32_t); auto footer_length_offset = buffer->size() - file_end_size; auto footer_length = bit_util::FromLittleEndian( util::SafeLoadAs(buffer->data() + footer_length_offset)); - ASSERT_EQ(read_ranges[0].length, file_end_size); + EXPECT_EQ(read_ranges[0].length, file_end_size); // read footer IO - ASSERT_EQ(read_ranges[1].length, footer_length); + EXPECT_EQ(read_ranges[1].length, footer_length); // read record batch metadata. The exact size is tricky to determine but it doesn't // matter for this test and it should be smaller than the footer. - ASSERT_LT(read_ranges[2].length, footer_length); + EXPECT_LE(read_ranges[2].length, footer_length); for (uint32_t i = 0; i < expected_body_read_lengths.size(); i++) { - ASSERT_EQ(read_ranges[3 + i].length, expected_body_read_lengths[i]); + EXPECT_EQ(read_ranges[3 + i].length, expected_body_read_lengths[i]); } } diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc index 6e801e1f8adb7..d603062d81d4a 100644 --- a/cpp/src/arrow/ipc/reader.cc +++ b/cpp/src/arrow/ipc/reader.cc @@ -248,6 +248,15 @@ class ArrayLoader { } } + Result GetVariadicCount(int i) { + auto* variadic_counts = metadata_->variadicBufferCounts(); + CHECK_FLATBUFFERS_NOT_NULL(variadic_counts, "RecordBatch.variadicBufferCounts"); + if (i >= static_cast(variadic_counts->size())) { + return Status::IOError("variadic_count_index out of range."); + } + return static_cast(variadic_counts->Get(i)); + } + Status GetFieldMetadata(int field_index, ArrayData* out) { auto nodes = metadata_->nodes(); CHECK_FLATBUFFERS_NOT_NULL(nodes, "Table.nodes"); @@ -296,7 +305,6 @@ class ArrayLoader { return Status::OK(); } - template Status LoadBinary(Type::type type_id) { DCHECK_NE(out_, nullptr); out_->buffers.resize(3); @@ -355,7 +363,22 @@ class ArrayLoader { template enable_if_base_binary Visit(const T& type) { - return LoadBinary(type.id()); + return LoadBinary(type.id()); + } + + Status Visit(const BinaryViewType& type) { + out_->buffers.resize(2); + + RETURN_NOT_OK(LoadCommon(type.id())); + RETURN_NOT_OK(GetBuffer(buffer_index_++, &out_->buffers[1])); + + ARROW_ASSIGN_OR_RAISE(auto character_buffer_count, + GetVariadicCount(variadic_count_index_++)); + out_->buffers.resize(character_buffer_count + 2); + for (size_t i = 0; i < character_buffer_count; ++i) { + RETURN_NOT_OK(GetBuffer(buffer_index_++, &out_->buffers[i + 2])); + } + return Status::OK(); } Status Visit(const FixedSizeBinaryType& type) { @@ -450,6 +473,7 @@ class ArrayLoader { int buffer_index_ = 0; int field_index_ = 0; bool skip_io_ = false; + int variadic_count_index_ = 0; BatchDataReadRequest read_request_; const Field* field_ = nullptr; @@ -580,10 +604,9 @@ Result> LoadRecordBatchSubset( // swap endian in a set of ArrayData if necessary (swap_endian == true) if (context.swap_endian) { - for (int i = 0; i < static_cast(filtered_columns.size()); ++i) { - ARROW_ASSIGN_OR_RAISE(filtered_columns[i], - arrow::internal::SwapEndianArrayData( - filtered_columns[i], context.options.memory_pool)); + for (auto& filtered_column : filtered_columns) { + ARROW_ASSIGN_OR_RAISE(filtered_column, + arrow::internal::SwapEndianArrayData(filtered_column)); } } return RecordBatch::Make(std::move(filtered_schema), metadata->length(), diff --git a/cpp/src/arrow/ipc/test_common.cc b/cpp/src/arrow/ipc/test_common.cc index 53721c0b20fbc..6faaf96b332d4 100644 --- a/cpp/src/arrow/ipc/test_common.cc +++ b/cpp/src/arrow/ipc/test_common.cc @@ -351,39 +351,32 @@ static Status MakeBinaryArrayWithUniqueValues(int64_t length, bool include_nulls return builder.Finish(out); } -Status MakeStringTypesRecordBatch(std::shared_ptr* out, bool with_nulls) { +Status MakeStringTypesRecordBatch(std::shared_ptr* out, bool with_nulls, + bool with_view_types) { const int64_t length = 500; - auto f0 = field("strings", utf8()); - auto f1 = field("binaries", binary()); - auto f2 = field("large_strings", large_utf8()); - auto f3 = field("large_binaries", large_binary()); - auto schema = ::arrow::schema({f0, f1, f2, f3}); - - std::shared_ptr a0, a1, a2, a3; - MemoryPool* pool = default_memory_pool(); - // Quirk with RETURN_NOT_OK macro and templated functions - { - auto s = - MakeBinaryArrayWithUniqueValues(length, with_nulls, pool, &a0); - RETURN_NOT_OK(s); + ArrayVector arrays; + FieldVector fields; + + auto AppendColumn = [&](auto& MakeArray) { + arrays.emplace_back(); + RETURN_NOT_OK(MakeArray(length, with_nulls, default_memory_pool(), &arrays.back())); + + const auto& type = arrays.back()->type(); + fields.push_back(field(type->ToString(), type)); + return Status::OK(); + }; + + RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); + RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); + RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); + RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); + if (with_view_types) { + RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); + RETURN_NOT_OK(AppendColumn(MakeBinaryArrayWithUniqueValues)); } - { - auto s = - MakeBinaryArrayWithUniqueValues(length, with_nulls, pool, &a1); - RETURN_NOT_OK(s); - } - { - auto s = MakeBinaryArrayWithUniqueValues(length, with_nulls, pool, - &a2); - RETURN_NOT_OK(s); - } - { - auto s = MakeBinaryArrayWithUniqueValues(length, with_nulls, pool, - &a3); - RETURN_NOT_OK(s); - } - *out = RecordBatch::Make(schema, length, {a0, a1, a2, a3}); + + *out = RecordBatch::Make(schema(std::move(fields)), length, std::move(arrays)); return Status::OK(); } diff --git a/cpp/src/arrow/ipc/test_common.h b/cpp/src/arrow/ipc/test_common.h index 5e0c65556c630..fc0c8ddbea319 100644 --- a/cpp/src/arrow/ipc/test_common.h +++ b/cpp/src/arrow/ipc/test_common.h @@ -96,7 +96,7 @@ Status MakeRandomStringArray(int64_t length, bool include_nulls, MemoryPool* poo ARROW_TESTING_EXPORT Status MakeStringTypesRecordBatch(std::shared_ptr* out, - bool with_nulls = true); + bool with_nulls = true, bool with_view_types = true); ARROW_TESTING_EXPORT Status MakeStringTypesRecordBatchWithNulls(std::shared_ptr* out); diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc index e4b49ed56464e..9668f459d0d31 100644 --- a/cpp/src/arrow/ipc/writer.cc +++ b/cpp/src/arrow/ipc/writer.cc @@ -52,10 +52,12 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/compression.h" #include "arrow/util/endian.h" +#include "arrow/util/int_util_overflow.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" #include "arrow/util/parallel.h" #include "arrow/visit_array_inline.h" +#include "arrow/visit_data_inline.h" #include "arrow/visit_type_inline.h" namespace arrow { @@ -174,7 +176,8 @@ class RecordBatchSerializer { // Override this for writing dictionary metadata virtual Status SerializeMetadata(int64_t num_rows) { return WriteRecordBatchMessage(num_rows, out_->body_length, custom_metadata_, - field_nodes_, buffer_meta_, options_, &out_->metadata); + field_nodes_, buffer_meta_, variadic_counts_, options_, + &out_->metadata); } bool ShouldCompress(int64_t uncompressed_size, int64_t compressed_size) const { @@ -296,6 +299,8 @@ class RecordBatchSerializer { offset += size + padding; } + variadic_counts_ = out_->variadic_buffer_counts; + out_->body_length = offset - buffer_start_offset_; DCHECK(bit_util::IsMultipleOf8(out_->body_length)); @@ -403,6 +408,18 @@ class RecordBatchSerializer { return Status::OK(); } + Status Visit(const BinaryViewArray& array) { + auto views = SliceBuffer(array.values(), array.offset() * BinaryViewType::kSize, + array.length() * BinaryViewType::kSize); + out_->body_buffers.emplace_back(std::move(views)); + + out_->variadic_buffer_counts.emplace_back(array.data()->buffers.size() - 2); + for (size_t i = 2; i < array.data()->buffers.size(); ++i) { + out_->body_buffers.emplace_back(array.data()->buffers[i]); + } + return Status::OK(); + } + template enable_if_var_size_list Visit(const T& array) { using offset_type = typename T::offset_type; @@ -590,6 +607,7 @@ class RecordBatchSerializer { std::vector field_nodes_; std::vector buffer_meta_; + std::vector variadic_counts_; const IpcWriteOptions& options_; int64_t max_recursion_depth_; @@ -606,8 +624,8 @@ class DictionarySerializer : public RecordBatchSerializer { Status SerializeMetadata(int64_t num_rows) override { return WriteDictionaryMessage(dictionary_id_, is_delta_, num_rows, out_->body_length, - custom_metadata_, field_nodes_, buffer_meta_, options_, - &out_->metadata); + custom_metadata_, field_nodes_, buffer_meta_, + variadic_counts_, options_, &out_->metadata); } Status Assemble(const std::shared_ptr& dictionary) { diff --git a/cpp/src/arrow/ipc/writer.h b/cpp/src/arrow/ipc/writer.h index 9e18a213ba3f2..4e0ee3dfc8b44 100644 --- a/cpp/src/arrow/ipc/writer.h +++ b/cpp/src/arrow/ipc/writer.h @@ -57,6 +57,7 @@ struct IpcPayload { MessageType type = MessageType::NONE; std::shared_ptr metadata; std::vector> body_buffers; + std::vector variadic_buffer_counts; int64_t body_length = 0; // serialized body length (padded, maybe compressed) int64_t raw_body_length = 0; // initial uncompressed body length }; diff --git a/cpp/src/arrow/json/converter.cc b/cpp/src/arrow/json/converter.cc index 04ebe4714ceec..c393b77acf334 100644 --- a/cpp/src/arrow/json/converter.cc +++ b/cpp/src/arrow/json/converter.cc @@ -304,6 +304,8 @@ Status MakeConverter(const std::shared_ptr& out_type, MemoryPool* pool CONVERTER_CASE(Type::STRING, BinaryConverter); CONVERTER_CASE(Type::LARGE_BINARY, BinaryConverter); CONVERTER_CASE(Type::LARGE_STRING, BinaryConverter); + CONVERTER_CASE(Type::BINARY_VIEW, BinaryConverter); + CONVERTER_CASE(Type::STRING_VIEW, BinaryConverter); CONVERTER_CASE(Type::DECIMAL128, DecimalConverter); CONVERTER_CASE(Type::DECIMAL256, DecimalConverter); default: diff --git a/cpp/src/arrow/json/converter_test.cc b/cpp/src/arrow/json/converter_test.cc index 378a4491c0e62..cfc44c99976d5 100644 --- a/cpp/src/arrow/json/converter_test.cc +++ b/cpp/src/arrow/json/converter_test.cc @@ -131,8 +131,8 @@ TEST(ConverterTest, Floats) { } } -TEST(ConverterTest, StringAndLargeString) { - for (auto string_type : {utf8(), large_utf8()}) { +TEST(ConverterTest, StringAndLargeStringAndStringView) { + for (auto string_type : {utf8(), large_utf8(), utf8_view()}) { ParseOptions options; options.explicit_schema = schema({field("", string_type)}); diff --git a/cpp/src/arrow/json/parser.cc b/cpp/src/arrow/json/parser.cc index e2941a29ab9bd..185dcde355f0a 100644 --- a/cpp/src/arrow/json/parser.cc +++ b/cpp/src/arrow/json/parser.cc @@ -104,6 +104,7 @@ Status Kind::ForType(const DataType& type, Kind::type* kind) { Status Visit(const DateType&) { return SetKind(Kind::kNumber); } Status Visit(const BinaryType&) { return SetKind(Kind::kString); } Status Visit(const LargeBinaryType&) { return SetKind(Kind::kString); } + Status Visit(const BinaryViewType&) { return SetKind(Kind::kString); } Status Visit(const TimestampType&) { return SetKind(Kind::kString); } Status Visit(const DecimalType&) { return SetKind(Kind::kNumberOrString); } Status Visit(const DictionaryType& dict_type) { diff --git a/cpp/src/arrow/json/test_common.h b/cpp/src/arrow/json/test_common.h index 0f7b3466fdbc9..f7ab6fd10275f 100644 --- a/cpp/src/arrow/json/test_common.h +++ b/cpp/src/arrow/json/test_common.h @@ -110,8 +110,7 @@ struct GenerateImpl { return OK(writer.Double(val)); } - template - enable_if_base_binary Visit(const T&) { + Status GenerateAscii(const DataType&) { auto size = std::poisson_distribution<>{4}(e); std::uniform_int_distribution gen_char(32, 126); // FIXME generate UTF8 std::string s(size, '\0'); @@ -119,6 +118,13 @@ struct GenerateImpl { return OK(writer.String(s.c_str())); } + template + enable_if_base_binary Visit(const T& t) { + return GenerateAscii(t); + } + + Status Visit(const BinaryViewType& t) { return GenerateAscii(t); } + template enable_if_list_like Visit(const T& t) { auto size = std::poisson_distribution<>{4}(e); diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc index a5410df7e9ae2..b392e027a6b89 100644 --- a/cpp/src/arrow/pretty_print.cc +++ b/cpp/src/arrow/pretty_print.cc @@ -229,18 +229,13 @@ class ArrayPrinter : public PrettyPrinter { } template - enable_if_string_like WriteDataValues(const ArrayType& array) { + enable_if_has_string_view WriteDataValues(const ArrayType& array) { return WriteValues(array, [&](int64_t i) { - (*sink_) << "\"" << array.GetView(i) << "\""; - return Status::OK(); - }); - } - - template - enable_if_t::value && !is_decimal_type::value, Status> - WriteDataValues(const ArrayType& array) { - return WriteValues(array, [&](int64_t i) { - (*sink_) << HexEncode(array.GetView(i)); + if constexpr (T::is_utf8) { + (*sink_) << "\"" << array.GetView(i) << "\""; + } else { + (*sink_) << HexEncode(array.GetView(i)); + } return Status::OK(); }); } @@ -302,6 +297,7 @@ class ArrayPrinter : public PrettyPrinter { std::is_base_of::value || std::is_base_of::value || std::is_base_of::value || + std::is_base_of::value || std::is_base_of::value || std::is_base_of::value || std::is_base_of::value || diff --git a/cpp/src/arrow/scalar.cc b/cpp/src/arrow/scalar.cc index b2ad1ad519bb2..167e272705268 100644 --- a/cpp/src/arrow/scalar.cc +++ b/cpp/src/arrow/scalar.cc @@ -263,6 +263,12 @@ struct ScalarValidateImpl { Status Visit(const StringScalar& s) { return ValidateStringScalar(s); } + Status Visit(const BinaryViewScalar& s) { return ValidateBinaryScalar(s); } + + Status Visit(const StringViewScalar& s) { return ValidateStringScalar(s); } + + Status Visit(const LargeBinaryScalar& s) { return ValidateBinaryScalar(s); } + Status Visit(const LargeStringScalar& s) { return ValidateStringScalar(s); } template @@ -548,17 +554,8 @@ Status Scalar::ValidateFull() const { return ScalarValidateImpl(/*full_validation=*/true).Validate(*this); } -BinaryScalar::BinaryScalar(std::string s) - : BinaryScalar(Buffer::FromString(std::move(s))) {} - -StringScalar::StringScalar(std::string s) - : StringScalar(Buffer::FromString(std::move(s))) {} - -LargeBinaryScalar::LargeBinaryScalar(std::string s) - : LargeBinaryScalar(Buffer::FromString(std::move(s))) {} - -LargeStringScalar::LargeStringScalar(std::string s) - : LargeStringScalar(Buffer::FromString(std::move(s))) {} +BaseBinaryScalar::BaseBinaryScalar(std::string s, std::shared_ptr type) + : BaseBinaryScalar(Buffer::FromString(std::move(s)), std::move(type)) {} FixedSizeBinaryScalar::FixedSizeBinaryScalar(std::shared_ptr value, std::shared_ptr type, diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h index 1d1ce4aa72948..5175b0128524c 100644 --- a/cpp/src/arrow/scalar.h +++ b/cpp/src/arrow/scalar.h @@ -263,24 +263,21 @@ struct ARROW_EXPORT BaseBinaryScalar return value ? std::string_view(*value) : std::string_view(); } - protected: BaseBinaryScalar(std::shared_ptr value, std::shared_ptr type) : internal::PrimitiveScalarBase{std::move(type), true}, value(std::move(value)) {} friend ArraySpan; + BaseBinaryScalar(std::string s, std::shared_ptr type); }; struct ARROW_EXPORT BinaryScalar : public BaseBinaryScalar { using BaseBinaryScalar::BaseBinaryScalar; using TypeClass = BinaryType; - BinaryScalar(std::shared_ptr value, std::shared_ptr type) - : BaseBinaryScalar(std::move(value), std::move(type)) {} - explicit BinaryScalar(std::shared_ptr value) : BinaryScalar(std::move(value), binary()) {} - explicit BinaryScalar(std::string s); + explicit BinaryScalar(std::string s) : BaseBinaryScalar(std::move(s), binary()) {} BinaryScalar() : BinaryScalar(binary()) {} }; @@ -292,11 +289,39 @@ struct ARROW_EXPORT StringScalar : public BinaryScalar { explicit StringScalar(std::shared_ptr value) : StringScalar(std::move(value), utf8()) {} - explicit StringScalar(std::string s); + explicit StringScalar(std::string s) : BinaryScalar(std::move(s), utf8()) {} StringScalar() : StringScalar(utf8()) {} }; +struct ARROW_EXPORT BinaryViewScalar : public BaseBinaryScalar { + using BaseBinaryScalar::BaseBinaryScalar; + using TypeClass = BinaryViewType; + + explicit BinaryViewScalar(std::shared_ptr value) + : BinaryViewScalar(std::move(value), binary_view()) {} + + explicit BinaryViewScalar(std::string s) + : BaseBinaryScalar(std::move(s), binary_view()) {} + + BinaryViewScalar() : BinaryViewScalar(binary_view()) {} + + std::string_view view() const override { return std::string_view(*this->value); } +}; + +struct ARROW_EXPORT StringViewScalar : public BinaryViewScalar { + using BinaryViewScalar::BinaryViewScalar; + using TypeClass = StringViewType; + + explicit StringViewScalar(std::shared_ptr value) + : StringViewScalar(std::move(value), utf8_view()) {} + + explicit StringViewScalar(std::string s) + : BinaryViewScalar(std::move(s), utf8_view()) {} + + StringViewScalar() : StringViewScalar(utf8_view()) {} +}; + struct ARROW_EXPORT LargeBinaryScalar : public BaseBinaryScalar { using BaseBinaryScalar::BaseBinaryScalar; using TypeClass = LargeBinaryType; @@ -307,7 +332,8 @@ struct ARROW_EXPORT LargeBinaryScalar : public BaseBinaryScalar { explicit LargeBinaryScalar(std::shared_ptr value) : LargeBinaryScalar(std::move(value), large_binary()) {} - explicit LargeBinaryScalar(std::string s); + explicit LargeBinaryScalar(std::string s) + : BaseBinaryScalar(std::move(s), large_binary()) {} LargeBinaryScalar() : LargeBinaryScalar(large_binary()) {} }; @@ -319,7 +345,8 @@ struct ARROW_EXPORT LargeStringScalar : public LargeBinaryScalar { explicit LargeStringScalar(std::shared_ptr value) : LargeStringScalar(std::move(value), large_utf8()) {} - explicit LargeStringScalar(std::string s); + explicit LargeStringScalar(std::string s) + : LargeBinaryScalar(std::move(s), large_utf8()) {} LargeStringScalar() : LargeStringScalar(large_utf8()) {} }; diff --git a/cpp/src/arrow/testing/gtest_util.h b/cpp/src/arrow/testing/gtest_util.h index bb462af86a5f2..641aae5a5e2e4 100644 --- a/cpp/src/arrow/testing/gtest_util.h +++ b/cpp/src/arrow/testing/gtest_util.h @@ -176,10 +176,17 @@ using DecimalArrowTypes = ::testing::Types; using BaseBinaryArrowTypes = ::testing::Types; +using BaseBinaryOrBinaryViewLikeArrowTypes = + ::testing::Types; + using BinaryArrowTypes = ::testing::Types; using StringArrowTypes = ::testing::Types; +using StringOrStringViewArrowTypes = + ::testing::Types; + using ListArrowTypes = ::testing::Types; using UnionArrowTypes = ::testing::Types; diff --git a/cpp/src/arrow/testing/random.cc b/cpp/src/arrow/testing/random.cc index b74c41f75e452..1386075397e20 100644 --- a/cpp/src/arrow/testing/random.cc +++ b/cpp/src/arrow/testing/random.cc @@ -363,13 +363,11 @@ std::shared_ptr RandomArrayGenerator::Decimal256(std::shared_ptr -static std::shared_ptr GenerateBinaryArray(RandomArrayGenerator* gen, int64_t size, - int32_t min_length, int32_t max_length, - double null_probability, - int64_t alignment, - MemoryPool* memory_pool) { - using offset_type = typename TypeClass::offset_type; +template +static std::shared_ptr GenerateBinaryArray( + RandomArrayGenerator* gen, int64_t size, int32_t min_length, int32_t max_length, + double null_probability, std::optional max_data_buffer_length, + int64_t alignment, MemoryPool* memory_pool) { using BuilderType = typename TypeTraits::BuilderType; using OffsetArrowType = typename CTypeTraits::ArrowType; using OffsetArrayType = typename TypeTraits::ArrayType; @@ -387,7 +385,12 @@ static std::shared_ptr GenerateBinaryArray(RandomArrayGenerator* gen, int /*null_probability=*/0); std::vector str_buffer(max_length); - BuilderType builder(memory_pool, alignment); + BuilderType builder{memory_pool, alignment}; + if constexpr (std::is_base_of_v) { + if (max_data_buffer_length) { + builder.SetBlockSize(*max_data_buffer_length); + } + } for (int64_t i = 0; i < size; ++i) { if (lengths->IsValid(i)) { @@ -409,7 +412,8 @@ std::shared_ptr RandomArrayGenerator::String(int64_t size, int32_t min_le int64_t alignment, MemoryPool* memory_pool) { return GenerateBinaryArray(this, size, min_length, max_length, - null_probability, alignment, memory_pool); + null_probability, /*max_data_buffer_length=*/{}, + alignment, memory_pool); } std::shared_ptr RandomArrayGenerator::LargeString(int64_t size, int32_t min_length, @@ -417,8 +421,9 @@ std::shared_ptr RandomArrayGenerator::LargeString(int64_t size, int32_t m double null_probability, int64_t alignment, MemoryPool* memory_pool) { - return GenerateBinaryArray(this, size, min_length, max_length, - null_probability, alignment, memory_pool); + return GenerateBinaryArray( + this, size, min_length, max_length, null_probability, /*max_data_buffer_length=*/{}, + alignment, memory_pool); } std::shared_ptr RandomArrayGenerator::BinaryWithRepeats( @@ -430,6 +435,15 @@ std::shared_ptr RandomArrayGenerator::BinaryWithRepeats( return *strings->View(binary()); } +std::shared_ptr RandomArrayGenerator::StringView( + int64_t size, int32_t min_length, int32_t max_length, double null_probability, + std::optional max_data_buffer_length, int64_t alignment, + MemoryPool* memory_pool) { + return GenerateBinaryArray( + this, size, min_length, max_length, null_probability, max_data_buffer_length, + alignment, memory_pool); +} + std::shared_ptr RandomArrayGenerator::StringWithRepeats( int64_t size, int64_t unique, int32_t min_length, int32_t max_length, double null_probability, int64_t alignment, MemoryPool* memory_pool) { @@ -843,6 +857,24 @@ std::shared_ptr RandomArrayGenerator::ArrayOf(const Field& field, int64_t ->View(field.type()); } + case Type::type::STRING_VIEW: + case Type::type::BINARY_VIEW: { + const auto min_length = + GetMetadata(field.metadata().get(), "min_length", 0); + const auto max_length = + GetMetadata(field.metadata().get(), "max_length", 20); + std::optional max_data_buffer_length = + GetMetadata(field.metadata().get(), "max_data_buffer_length", 0); + if (*max_data_buffer_length == 0) { + *max_data_buffer_length = {}; + } + + return StringView(length, min_length, max_length, null_probability, + max_data_buffer_length, alignment) + ->View(field.type()) + .ValueOrDie(); + } + case Type::type::DECIMAL128: return Decimal128(field.type(), length, null_probability, alignment, memory_pool); diff --git a/cpp/src/arrow/testing/random.h b/cpp/src/arrow/testing/random.h index de9ea6d05648d..cbdac3baa0109 100644 --- a/cpp/src/arrow/testing/random.h +++ b/cpp/src/arrow/testing/random.h @@ -367,6 +367,26 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator { int64_t alignment = kDefaultBufferAlignment, MemoryPool* memory_pool = default_memory_pool()); + /// \brief Generate a random StringViewArray + /// + /// \param[in] size the size of the array to generate + /// \param[in] min_length the lower bound of the string length + /// determined by the uniform distribution + /// \param[in] max_length the upper bound of the string length + /// determined by the uniform distribution + /// \param[in] null_probability the probability of a value being null + /// \param[in] max_data_buffer_length the data buffer size at which + /// a new chunk will be generated + /// \param[in] alignment alignment for memory allocations (in bytes) + /// \param[in] memory_pool memory pool to allocate memory from + /// + /// \return a generated Array + std::shared_ptr StringView(int64_t size, int32_t min_length, int32_t max_length, + double null_probability = 0, + std::optional max_data_buffer_length = {}, + int64_t alignment = kDefaultBufferAlignment, + MemoryPool* memory_pool = default_memory_pool()); + /// \brief Generate a random LargeStringArray /// /// \param[in] size the size of the array to generate @@ -556,10 +576,14 @@ class ARROW_TESTING_EXPORT RandomArrayGenerator { /// - max_length (T::offset_type): the minimum length of the child to generate, /// default 1024 /// - /// For string and binary types T (not including their large variants): + /// For string and binary types T (not including their large or view variants): /// - unique (int32_t): if positive, this many distinct values will be generated /// and all array values will be one of these values, default -1 /// + /// For string and binary view types T: + /// - max_data_buffer_length (int64_t): the data buffer size at which a new chunk + /// will be generated, default 32KB + /// /// For MapType: /// - values (int32_t): the number of key-value pairs to generate, which will be /// partitioned among the array values. diff --git a/cpp/src/arrow/testing/random_test.cc b/cpp/src/arrow/testing/random_test.cc index f269818e83a3d..951b654e56f73 100644 --- a/cpp/src/arrow/testing/random_test.cc +++ b/cpp/src/arrow/testing/random_test.cc @@ -160,6 +160,7 @@ auto values = ::testing::Values( field("uint32", uint32()), field("int32", int32()), field("uint64", uint64()), field("int64", int64()), field("float16", float16()), field("float32", float32()), field("float64", float64()), field("string", utf8()), field("binary", binary()), + field("string_view", utf8_view()), field("binary_view", binary_view()), field("fixed_size_binary", fixed_size_binary(8)), field("decimal128", decimal128(8, 3)), field("decimal128", decimal128(29, -5)), field("decimal256", decimal256(16, 4)), field("decimal256", decimal256(57, -6)), diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index a4f43256827da..f378bd974047d 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -64,10 +64,14 @@ constexpr Type::type FixedSizeListType::type_id; constexpr Type::type BinaryType::type_id; +constexpr Type::type BinaryViewType::type_id; + constexpr Type::type LargeBinaryType::type_id; constexpr Type::type StringType::type_id; +constexpr Type::type StringViewType::type_id; + constexpr Type::type LargeStringType::type_id; constexpr Type::type FixedSizeBinaryType::type_id; @@ -130,6 +134,8 @@ std::vector AllTypeIds() { Type::BINARY, Type::LARGE_STRING, Type::LARGE_BINARY, + Type::STRING_VIEW, + Type::BINARY_VIEW, Type::FIXED_SIZE_BINARY, Type::STRUCT, Type::LIST, @@ -194,7 +200,9 @@ std::string ToString(Type::type id) { TO_STRING_CASE(INTERVAL_MONTHS) TO_STRING_CASE(DURATION) TO_STRING_CASE(STRING) + TO_STRING_CASE(STRING_VIEW) TO_STRING_CASE(BINARY) + TO_STRING_CASE(BINARY_VIEW) TO_STRING_CASE(LARGE_STRING) TO_STRING_CASE(LARGE_BINARY) TO_STRING_CASE(FIXED_SIZE_BINARY) @@ -247,7 +255,7 @@ struct PhysicalTypeVisitor { } template - Status Visit(const Type&) { + Status Visit(const Type& type) { result = TypeTraits::type_singleton(); return Status::OK(); } @@ -1058,10 +1066,14 @@ std::string FixedSizeListType::ToString() const { std::string BinaryType::ToString() const { return "binary"; } +std::string BinaryViewType::ToString() const { return "binary_view"; } + std::string LargeBinaryType::ToString() const { return "large_binary"; } std::string StringType::ToString() const { return "string"; } +std::string StringViewType::ToString() const { return "string_view"; } + std::string LargeStringType::ToString() const { return "large_string"; } int FixedSizeBinaryType::bit_width() const { return CHAR_BIT * byte_width(); } @@ -2821,8 +2833,10 @@ PARAMETER_LESS_FINGERPRINT(HalfFloat) PARAMETER_LESS_FINGERPRINT(Float) PARAMETER_LESS_FINGERPRINT(Double) PARAMETER_LESS_FINGERPRINT(Binary) +PARAMETER_LESS_FINGERPRINT(BinaryView) PARAMETER_LESS_FINGERPRINT(LargeBinary) PARAMETER_LESS_FINGERPRINT(String) +PARAMETER_LESS_FINGERPRINT(StringView) PARAMETER_LESS_FINGERPRINT(LargeString) PARAMETER_LESS_FINGERPRINT(Date32) PARAMETER_LESS_FINGERPRINT(Date64) @@ -3034,6 +3048,16 @@ TYPE_FACTORY(large_binary, LargeBinaryType) TYPE_FACTORY(date64, Date64Type) TYPE_FACTORY(date32, Date32Type) +const std::shared_ptr& utf8_view() { + static std::shared_ptr type = std::make_shared(); + return type; +} + +const std::shared_ptr& binary_view() { + static std::shared_ptr type = std::make_shared(); + return type; +} + std::shared_ptr fixed_size_binary(int32_t byte_width) { return std::make_shared(byte_width); } @@ -3294,7 +3318,7 @@ void InitStaticData() { // * Time32 // * Time64 // * Timestamp - g_primitive_types = {null(), boolean(), date32(), date64()}; + g_primitive_types = {null(), boolean(), date32(), date64(), binary_view(), utf8_view()}; Extend(g_numeric_types, &g_primitive_types); Extend(g_base_binary_types, &g_primitive_types); } diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 3f4dd5c9b21fa..a905192e4a54e 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -113,8 +114,14 @@ struct ARROW_EXPORT DataTypeLayout { std::vector buffers; /// Whether this type expects an associated dictionary array. bool has_dictionary = false; + /// If this is provided, the number of buffers expected is only lower-bounded by + /// buffers.size(). Buffers beyond this lower bound are expected to conform to + /// variadic_spec. + std::optional variadic_spec; - explicit DataTypeLayout(std::vector v) : buffers(std::move(v)) {} + explicit DataTypeLayout(std::vector buffers, + std::optional variadic_spec = {}) + : buffers(std::move(buffers)), variadic_spec(variadic_spec) {} }; /// \brief Base class for all data types @@ -772,6 +779,103 @@ class ARROW_EXPORT BinaryType : public BaseBinaryType { explicit BinaryType(Type::type logical_type) : BaseBinaryType(logical_type) {} }; +/// \brief Concrete type class for variable-size binary view data +class ARROW_EXPORT BinaryViewType : public DataType { + public: + static constexpr Type::type type_id = Type::BINARY_VIEW; + static constexpr bool is_utf8 = false; + using PhysicalType = BinaryViewType; + + static constexpr int kSize = 16; + static constexpr int kInlineSize = 12; + static constexpr int kPrefixSize = 4; + + /// Variable length string or binary with inline optimization for small values (12 bytes + /// or fewer). This is similar to std::string_view except limited in size to INT32_MAX + /// and at least the first four bytes of the string are copied inline (accessible + /// without pointer dereference). This inline prefix allows failing comparisons early. + /// Furthermore when dealing with short strings the CPU cache working set is reduced + /// since many can be inline. + /// + /// This union supports two states: + /// + /// - Entirely inlined string data + /// |----|--------------| + /// ^ ^ + /// | | + /// size in-line string data, zero padded + /// + /// - Reference into a buffer + /// |----|----|----|----| + /// ^ ^ ^ ^ + /// | | | | + /// size | | `------. + /// prefix | | + /// buffer index | + /// offset in buffer + /// + /// Adapted from TU Munich's UmbraDB [1], Velox, DuckDB. + /// + /// [1]: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf + /// + /// Alignment to 64 bits enables an aligned load of the size and prefix into + /// a single 64 bit integer, which is useful to the comparison fast path. + union alignas(int64_t) c_type { + struct { + int32_t size; + std::array data; + } inlined; + + struct { + int32_t size; + std::array prefix; + int32_t buffer_index; + int32_t offset; + } ref; + + /// The number of bytes viewed. + int32_t size() const { + // Size is in the common initial subsequence of each member of the union, + // so accessing `inlined.size` is legal even if another member is active. + return inlined.size; + } + + /// True if the view's data is entirely stored inline. + bool is_inline() const { return size() <= kInlineSize; } + + /// Return a pointer to the inline data of a view. + /// + /// For inline views, this points to the entire data of the view. + /// For other views, this points to the 4 byte prefix. + const uint8_t* inline_data() const& { + // Since `ref.prefix` has the same address as `inlined.data`, + // the branch will be trivially optimized out. + return is_inline() ? inlined.data.data() : ref.prefix.data(); + } + const uint8_t* inline_data() && = delete; + }; + static_assert(sizeof(c_type) == kSize); + static_assert(std::is_trivial_v); + + static constexpr const char* type_name() { return "binary_view"; } + + BinaryViewType() : BinaryViewType(Type::BINARY_VIEW) {} + + DataTypeLayout layout() const override { + return DataTypeLayout({DataTypeLayout::Bitmap(), DataTypeLayout::FixedWidth(kSize)}, + DataTypeLayout::VariableWidth()); + } + + std::string ToString() const override; + std::string name() const override { return "binary_view"; } + + protected: + std::string ComputeFingerprint() const override; + + // Allow subclasses like StringType to change the logical type. + explicit BinaryViewType(Type::type logical_type) : DataType(logical_type) {} +}; + /// \brief Concrete type class for large variable-size binary data class ARROW_EXPORT LargeBinaryType : public BaseBinaryType { public: @@ -818,6 +922,24 @@ class ARROW_EXPORT StringType : public BinaryType { std::string ComputeFingerprint() const override; }; +/// \brief Concrete type class for variable-size string data, utf8-encoded +class ARROW_EXPORT StringViewType : public BinaryViewType { + public: + static constexpr Type::type type_id = Type::STRING_VIEW; + static constexpr bool is_utf8 = true; + using PhysicalType = BinaryViewType; + + static constexpr const char* type_name() { return "utf8_view"; } + + StringViewType() : BinaryViewType(Type::STRING_VIEW) {} + + std::string ToString() const override; + std::string name() const override { return "utf8_view"; } + + protected: + std::string ComputeFingerprint() const override; +}; + /// \brief Concrete type class for large variable-size string data, utf8-encoded class ARROW_EXPORT LargeStringType : public LargeBinaryType { public: diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 499684719feab..ca263b710317b 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -110,6 +110,11 @@ class BinaryArray; class BinaryBuilder; struct BinaryScalar; +class BinaryViewType; +class BinaryViewArray; +class BinaryViewBuilder; +struct BinaryViewScalar; + class LargeBinaryType; class LargeBinaryArray; class LargeBinaryBuilder; @@ -125,6 +130,11 @@ class StringArray; class StringBuilder; struct StringScalar; +class StringViewType; +class StringViewArray; +class StringViewBuilder; +struct StringViewScalar; + class LargeStringType; class LargeStringArray; class LargeStringBuilder; @@ -415,6 +425,13 @@ struct Type { /// Run-end encoded data. RUN_END_ENCODED = 38, + /// String (UTF8) view type with 4-byte prefix and inline small string + /// optimization + STRING_VIEW = 39, + + /// Bytes view type with 4-byte prefix and inline small string optimization + BINARY_VIEW = 40, + // Leave this at the end MAX_ID }; @@ -456,10 +473,14 @@ ARROW_EXPORT const std::shared_ptr& float32(); ARROW_EXPORT const std::shared_ptr& float64(); /// \brief Return a StringType instance ARROW_EXPORT const std::shared_ptr& utf8(); +/// \brief Return a StringViewType instance +ARROW_EXPORT const std::shared_ptr& utf8_view(); /// \brief Return a LargeStringType instance ARROW_EXPORT const std::shared_ptr& large_utf8(); /// \brief Return a BinaryType instance ARROW_EXPORT const std::shared_ptr& binary(); +/// \brief Return a BinaryViewType instance +ARROW_EXPORT const std::shared_ptr& binary_view(); /// \brief Return a LargeBinaryType instance ARROW_EXPORT const std::shared_ptr& large_binary(); /// \brief Return a Date32Type instance diff --git a/cpp/src/arrow/type_test.cc b/cpp/src/arrow/type_test.cc index 9ba8cf98dea4f..273f8933fa577 100644 --- a/cpp/src/arrow/type_test.cc +++ b/cpp/src/arrow/type_test.cc @@ -1469,9 +1469,21 @@ TEST(TestBinaryType, ToString) { TEST(TestStringType, ToString) { StringType str; ASSERT_EQ(str.id(), Type::STRING); + ASSERT_EQ(str.name(), std::string("utf8")); + ASSERT_EQ(str.type_name(), std::string("utf8")); ASSERT_EQ(str.ToString(), std::string("string")); } +TEST(TestBinaryViewType, ToString) { + BinaryViewType t1; + BinaryViewType e1; + StringViewType t2; + AssertTypeEqual(t1, e1); + AssertTypeNotEqual(t1, t2); + ASSERT_EQ(t1.id(), Type::BINARY_VIEW); + ASSERT_EQ(t1.ToString(), std::string("binary_view")); +} + TEST(TestLargeBinaryTypes, ToString) { BinaryType bt1; LargeBinaryType t1; diff --git a/cpp/src/arrow/type_traits.cc b/cpp/src/arrow/type_traits.cc index ac16afe4b8cd8..de328f322ad5f 100644 --- a/cpp/src/arrow/type_traits.cc +++ b/cpp/src/arrow/type_traits.cc @@ -88,6 +88,8 @@ int RequiredValueAlignmentForBuffer(Type::type type_id, int buffer_index) { case Type::DURATION: case Type::INTERVAL_MONTH_DAY_NANO: // Stored as two 32-bit integers and a 64-bit // integer + case Type::STRING_VIEW: + case Type::BINARY_VIEW: return 8; case Type::DICTIONARY: case Type::EXTENSION: diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index bcbde23ae4a4b..9d8cafacf397b 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -341,6 +341,16 @@ struct TypeTraits { static inline std::shared_ptr type_singleton() { return binary(); } }; +template <> +struct TypeTraits { + using ArrayType = BinaryViewArray; + using BuilderType = BinaryViewBuilder; + using ScalarType = BinaryViewScalar; + using CType = BinaryViewType::c_type; + constexpr static bool is_parameter_free = true; + static inline std::shared_ptr type_singleton() { return binary_view(); } +}; + template <> struct TypeTraits { using ArrayType = LargeBinaryArray; @@ -371,6 +381,16 @@ struct TypeTraits { static inline std::shared_ptr type_singleton() { return utf8(); } }; +template <> +struct TypeTraits { + using ArrayType = StringViewArray; + using BuilderType = StringViewBuilder; + using ScalarType = StringViewScalar; + using CType = BinaryViewType::c_type; + constexpr static bool is_parameter_free = true; + static inline std::shared_ptr type_singleton() { return utf8_view(); } +}; + template <> struct TypeTraits { using ArrayType = LargeStringArray; @@ -399,6 +419,11 @@ struct CTypeTraits : public TypeTraits { using ArrowType = StringType; }; +template <> +struct CTypeTraits : public TypeTraits { + using ArrowType = BinaryViewType; +}; + template <> struct CTypeTraits : public CTypeTraits {}; @@ -614,6 +639,24 @@ using is_string_type = template using enable_if_string = enable_if_t::value, R>; +template +using is_binary_view_like_type = std::is_base_of; + +template +using is_binary_view_type = std::is_same; + +template +using is_string_view_type = std::is_same; + +template +using enable_if_binary_view_like = enable_if_t::value, R>; + +template +using enable_if_binary_view = enable_if_t::value, R>; + +template +using enable_if_string_view = enable_if_t::value, R>; + template using is_string_like_type = std::integral_constant::value && T::is_utf8>; @@ -801,8 +844,10 @@ using enable_if_has_c_type = enable_if_t::value, R>; template using has_string_view = std::integral_constant::value || + std::is_same::value || std::is_same::value || std::is_same::value || + std::is_same::value || std::is_same::value || std::is_same::value>; diff --git a/cpp/src/arrow/util/binary_view_util.h b/cpp/src/arrow/util/binary_view_util.h new file mode 100644 index 0000000000000..94f7a5bdfa667 --- /dev/null +++ b/cpp/src/arrow/util/binary_view_util.h @@ -0,0 +1,95 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/type.h" +#include "arrow/util/span.h" + +namespace arrow::util { + +inline BinaryViewType::c_type ToInlineBinaryView(const void* data, int32_t size) { + // Small string: inlined. Bytes beyond size are zeroed + BinaryViewType::c_type out; + out.inlined = {size, {}}; + memcpy(&out.inlined.data, data, size); + return out; +} + +inline BinaryViewType::c_type ToInlineBinaryView(std::string_view v) { + return ToInlineBinaryView(v.data(), static_cast(v.size())); +} + +inline BinaryViewType::c_type ToBinaryView(const void* data, int32_t size, + int32_t buffer_index, int32_t offset) { + if (size <= BinaryViewType::kInlineSize) { + return ToInlineBinaryView(data, size); + } + + // Large string: store index/offset. + BinaryViewType::c_type out; + out.ref = {size, {}, buffer_index, offset}; + memcpy(&out.ref.prefix, data, sizeof(out.ref.prefix)); + return out; +} + +inline BinaryViewType::c_type ToBinaryView(std::string_view v, int32_t buffer_index, + int32_t offset) { + return ToBinaryView(v.data(), static_cast(v.size()), buffer_index, offset); +} + +template +std::string_view FromBinaryView(const BinaryViewType::c_type& v, + const BufferPtr* data_buffers) { + auto* data = v.is_inline() ? v.inlined.data.data() + : data_buffers[v.ref.buffer_index]->data() + v.ref.offset; + return {reinterpret_cast(data), static_cast(v.size())}; +} +template +std::string_view FromBinaryView(BinaryViewType::c_type&&, const BufferPtr*) = delete; + +template +bool EqualBinaryView(BinaryViewType::c_type l, BinaryViewType::c_type r, + const BufferPtr* l_buffers, const BufferPtr* r_buffers) { + int64_t l_size_and_prefix, r_size_and_prefix; + memcpy(&l_size_and_prefix, &l, sizeof(l_size_and_prefix)); + memcpy(&r_size_and_prefix, &r, sizeof(r_size_and_prefix)); + + if (l_size_and_prefix != r_size_and_prefix) return false; + + if (l.is_inline()) { + // The columnar spec mandates that the inlined part be zero-padded, so we can compare + // a word at a time regardless of the exact size. + int64_t l_inlined, r_inlined; + memcpy(&l_inlined, l.inline_data() + BinaryViewType::kPrefixSize, sizeof(l_inlined)); + memcpy(&r_inlined, r.inline_data() + BinaryViewType::kPrefixSize, sizeof(r_inlined)); + return l_inlined == r_inlined; + } + + // Sizes are equal and this is not inline, therefore both are out + // of line and have kPrefixSize first in common. + const uint8_t* l_data = l_buffers[l.ref.buffer_index]->data() + l.ref.offset; + const uint8_t* r_data = r_buffers[r.ref.buffer_index]->data() + r.ref.offset; + return memcmp(l_data + BinaryViewType::kPrefixSize, + r_data + BinaryViewType::kPrefixSize, + l.size() - BinaryViewType::kPrefixSize) == 0; +} + +} // namespace arrow::util diff --git a/cpp/src/arrow/util/ree_util.cc b/cpp/src/arrow/util/ree_util.cc index fcd6c204e06b2..819de5eb60c63 100644 --- a/cpp/src/arrow/util/ree_util.cc +++ b/cpp/src/arrow/util/ree_util.cc @@ -61,6 +61,62 @@ int64_t LogicalNullCount(const ArraySpan& span) { return LogicalNullCount(span); } +namespace internal { + +/// \pre 0 <= i < array_span.length() +template +int64_t FindPhysicalIndexImpl(PhysicalIndexFinder& self, int64_t i) { + DCHECK_LT(i, self.array_span.length); + const int64_t run_ends_size = ree_util::RunEndsArray(self.array_span).length; + DCHECK_LT(self.last_physical_index, run_ends_size); + // This access to self.run_ends[last_physical_index] is alwas safe because: + // 1. 0 <= i < array_span.length() implies there is at least one run and the initial + // value 0 will be safe to index with. + // 2. last_physical_index > 0 is always the result of a valid call to + // internal::FindPhysicalIndex. + if (ARROW_PREDICT_TRUE(self.array_span.offset + i < + self.run_ends[self.last_physical_index])) { + // The cached value is an upper-bound, but is it the least upper-bound? + if (self.last_physical_index == 0 || + self.array_span.offset + i >= self.run_ends[self.last_physical_index - 1]) { + return self.last_physical_index; + } + // last_physical_index - 1 is a candidate for the least upper-bound, + // so search for the least upper-bound in the range that includes it. + const int64_t j = ree_util::internal::FindPhysicalIndex( + self.run_ends, /*run_ends_size=*/self.last_physical_index, i, + self.array_span.offset); + DCHECK_LT(j, self.last_physical_index); + return self.last_physical_index = j; + } + + // last_physical_index is not an upper-bound, and the logical index i MUST be + // in the runs that follow it. Since i is a valid logical index, we know that at least + // one extra run is present. + DCHECK_LT(self.last_physical_index + 1, run_ends_size); + const int64_t min_physical_index = self.last_physical_index + 1; + + const int64_t j = ree_util::internal::FindPhysicalIndex( + /*run_ends=*/self.run_ends + min_physical_index, + /*run_ends_size=*/run_ends_size - min_physical_index, i, self.array_span.offset); + DCHECK_LT(min_physical_index + j, run_ends_size); + return self.last_physical_index = min_physical_index + j; +} + +int64_t FindPhysicalIndexImpl16(PhysicalIndexFinder& self, int64_t i) { + return FindPhysicalIndexImpl(self, i); +} + +int64_t FindPhysicalIndexImpl32(PhysicalIndexFinder& self, int64_t i) { + return FindPhysicalIndexImpl(self, i); +} + +int64_t FindPhysicalIndexImpl64(PhysicalIndexFinder& self, int64_t i) { + return FindPhysicalIndexImpl(self, i); +} + +} // namespace internal + int64_t FindPhysicalIndex(const ArraySpan& span, int64_t i, int64_t absolute_offset) { const auto type_id = RunEndsArray(span).type->id(); if (type_id == Type::INT16) { diff --git a/cpp/src/arrow/util/ree_util.h b/cpp/src/arrow/util/ree_util.h index a3d3d16c0da95..2b7940154a50b 100644 --- a/cpp/src/arrow/util/ree_util.h +++ b/cpp/src/arrow/util/ree_util.h @@ -23,6 +23,7 @@ #include "arrow/array/data.h" #include "arrow/type_traits.h" +#include "arrow/util/checked_cast.h" #include "arrow/util/macros.h" namespace arrow { @@ -139,6 +140,69 @@ int64_t FindPhysicalLength(const ArraySpan& span) { /*offset=*/span.offset); } +template +struct PhysicalIndexFinder; + +// non-inline implementations for each run-end type +ARROW_EXPORT int64_t FindPhysicalIndexImpl16(PhysicalIndexFinder& self, + int64_t i); +ARROW_EXPORT int64_t FindPhysicalIndexImpl32(PhysicalIndexFinder& self, + int64_t i); +ARROW_EXPORT int64_t FindPhysicalIndexImpl64(PhysicalIndexFinder& self, + int64_t i); + +/// \brief Stateful version of FindPhysicalIndex() that caches the result of +/// the previous search and uses it to optimize the next search. +/// +/// When new queries for the physical index of a logical index come in, +/// binary search is performed again but the first candidate checked is the +/// result of the previous search (cached physical index) instead of the +/// midpoint of the run-ends array. +/// +/// If that test fails, internal::FindPhysicalIndex() is called with one of the +/// partitions defined by the cached index. If the queried logical indices +/// follow an increasing or decreasing pattern, this first test is much more +/// effective in (1) finding the answer right away (close logical indices belong +/// to the same runs) or (2) discarding many more candidates than probing +/// the midpoint would. +/// +/// The most adversarial case (i.e. alternating between 0 and length-1 queries) +/// only adds one extra binary search probe when compared to always starting +/// binary search from the midpoint without any of these optimizations. +/// +/// \tparam RunEndCType The numeric type of the run-ends array. +template +struct PhysicalIndexFinder { + const ArraySpan array_span; + const RunEndCType* run_ends; + int64_t last_physical_index = 0; + + explicit PhysicalIndexFinder(const ArrayData& data) + : array_span(data), + run_ends(RunEndsArray(array_span).template GetValues(1)) { + assert(CTypeTraits::ArrowType::type_id == + ::arrow::internal::checked_cast(*data.type) + .run_end_type() + ->id()); + } + + /// \brief Find the physical index into the values array of the REE array. + /// + /// \pre 0 <= i < array_span.length() + /// \param i the logical index into the REE array + /// \return the physical index into the values array + int64_t FindPhysicalIndex(int64_t i) { + if constexpr (std::is_same_v) { + return FindPhysicalIndexImpl16(*this, i); + } else if constexpr (std::is_same_v) { + return FindPhysicalIndexImpl32(*this, i); + } else { + static_assert(std::is_same_v, "Unsupported RunEndCType."); + return FindPhysicalIndexImpl64(*this, i); + } + } +}; + } // namespace internal /// \brief Find the physical index into the values array of the REE ArraySpan @@ -166,6 +230,10 @@ ARROW_EXPORT std::pair FindPhysicalRange(const ArraySpan& span int64_t offset, int64_t length); +// Publish PhysicalIndexFinder outside of the internal namespace. +template +using PhysicalIndexFinder = internal::PhysicalIndexFinder; + template class RunEndEncodedArraySpan { private: diff --git a/cpp/src/arrow/util/string.cc b/cpp/src/arrow/util/string.cc index 2055b4f47ea22..192173fa16ce9 100644 --- a/cpp/src/arrow/util/string.cc +++ b/cpp/src/arrow/util/string.cc @@ -25,15 +25,13 @@ namespace arrow { -static const char* kAsciiTable = "0123456789ABCDEF"; - std::string HexEncode(const uint8_t* data, size_t length) { - std::string hex_string; - hex_string.reserve(length * 2); - for (size_t j = 0; j < length; ++j) { + std::string hex_string(length * 2, '\0'); + for (size_t j = 0, i = 0; j < length; ++j) { // Convert to 2 base16 digits - hex_string.push_back(kAsciiTable[data[j] >> 4]); - hex_string.push_back(kAsciiTable[data[j] & 15]); + constexpr auto kHexDigitTable = "0123456789ABCDEF"; + hex_string[i++] = kHexDigitTable[data[j] >> 4]; + hex_string[i++] = kHexDigitTable[data[j] & 0b1111]; } return hex_string; } @@ -73,20 +71,34 @@ std::string HexEncode(std::string_view str) { return HexEncode(str.data(), str.s std::string Escape(std::string_view str) { return Escape(str.data(), str.size()); } -Status ParseHexValue(const char* data, uint8_t* out) { - char c1 = data[0]; - char c2 = data[1]; +constexpr uint8_t kInvalidHexDigit = -1; - const char* kAsciiTableEnd = kAsciiTable + 16; - const char* pos1 = std::lower_bound(kAsciiTable, kAsciiTableEnd, c1); - const char* pos2 = std::lower_bound(kAsciiTable, kAsciiTableEnd, c2); +constexpr uint8_t ParseHexDigit(char c) { + if (c >= '0' && c <= '9') return c - '0'; + if (c >= 'A' && c <= 'F') return c - 'A' + 10; + return kInvalidHexDigit; +} + +Status ParseHexValue(const char* data, uint8_t* out) { + uint8_t high = ParseHexDigit(data[0]); + uint8_t low = ParseHexDigit(data[1]); // Error checking - if (pos1 == kAsciiTableEnd || pos2 == kAsciiTableEnd || *pos1 != c1 || *pos2 != c2) { + if (high == kInvalidHexDigit || low == kInvalidHexDigit) { return Status::Invalid("Encountered non-hex digit"); } - *out = static_cast((pos1 - kAsciiTable) << 4 | (pos2 - kAsciiTable)); + *out = static_cast(high << 4 | low); + return Status::OK(); +} + +Status ParseHexValues(std::string_view hex_string, uint8_t* out) { + if (hex_string.size() % 2 != 0) { + return Status::Invalid("Expected base16 hex string"); + } + for (size_t j = 0; j < hex_string.size() / 2; ++j) { + RETURN_NOT_OK(ParseHexValue(hex_string.data() + j * 2, out + j)); + } return Status::OK(); } diff --git a/cpp/src/arrow/util/string.h b/cpp/src/arrow/util/string.h index d9777efc56a8c..d7e377773f62f 100644 --- a/cpp/src/arrow/util/string.h +++ b/cpp/src/arrow/util/string.h @@ -46,7 +46,9 @@ ARROW_EXPORT std::string HexEncode(std::string_view str); ARROW_EXPORT std::string Escape(std::string_view str); -ARROW_EXPORT Status ParseHexValue(const char* data, uint8_t* out); +ARROW_EXPORT Status ParseHexValue(const char* hex_pair, uint8_t* out); + +ARROW_EXPORT Status ParseHexValues(std::string_view hex_string, uint8_t* out); namespace internal { diff --git a/cpp/src/arrow/visit_data_inline.h b/cpp/src/arrow/visit_data_inline.h index 6a9b32d73a635..a2ba9cfc65071 100644 --- a/cpp/src/arrow/visit_data_inline.h +++ b/cpp/src/arrow/visit_data_inline.h @@ -23,6 +23,7 @@ #include "arrow/status.h" #include "arrow/type.h" #include "arrow/type_traits.h" +#include "arrow/util/binary_view_util.h" #include "arrow/util/bit_block_counter.h" #include "arrow/util/bit_util.h" #include "arrow/util/checked_cast.h" @@ -144,6 +145,42 @@ struct ArraySpanInlineVisitor> { } }; +// BinaryView, StringView... +template +struct ArraySpanInlineVisitor> { + using c_type = std::string_view; + + template + static Status VisitStatus(const ArraySpan& arr, ValidFunc&& valid_func, + NullFunc&& null_func) { + if (arr.length == 0) { + return Status::OK(); + } + auto* s = arr.GetValues(1); + auto* data_buffers = arr.GetVariadicBuffers().data(); + return VisitBitBlocks( + arr.buffers[0].data, arr.offset, arr.length, + [&](int64_t index) { + return valid_func(util::FromBinaryView(s[index], data_buffers)); + }, + [&]() { return null_func(); }); + } + + template + static void VisitVoid(const ArraySpan& arr, ValidFunc&& valid_func, + NullFunc&& null_func) { + if (arr.length == 0) { + return; + } + auto* s = arr.GetValues(1); + auto* data_buffers = arr.GetVariadicBuffers().data(); + VisitBitBlocksVoid( + arr.buffers[0].data, arr.offset, arr.length, + [&](int64_t index) { valid_func(util::FromBinaryView(s[index], data_buffers)); }, + std::forward(null_func)); + } +}; + // FixedSizeBinary, Decimal128 template struct ArraySpanInlineVisitor> { @@ -240,9 +277,8 @@ typename internal::call_traits::enable_if_return::type VisitNullBitmapInline(const uint8_t* valid_bits, int64_t valid_bits_offset, int64_t num_values, int64_t null_count, ValidFunc&& valid_func, NullFunc&& null_func) { - ARROW_UNUSED(null_count); - internal::OptionalBitBlockCounter bit_counter(valid_bits, valid_bits_offset, - num_values); + internal::OptionalBitBlockCounter bit_counter(null_count == 0 ? NULLPTR : valid_bits, + valid_bits_offset, num_values); int64_t position = 0; int64_t offset_position = valid_bits_offset; while (position < num_values) { @@ -273,9 +309,8 @@ typename internal::call_traits::enable_if_return::type VisitNullBitmapInline(const uint8_t* valid_bits, int64_t valid_bits_offset, int64_t num_values, int64_t null_count, ValidFunc&& valid_func, NullFunc&& null_func) { - ARROW_UNUSED(null_count); - internal::OptionalBitBlockCounter bit_counter(valid_bits, valid_bits_offset, - num_values); + internal::OptionalBitBlockCounter bit_counter(null_count == 0 ? NULLPTR : valid_bits, + valid_bits_offset, num_values); int64_t position = 0; int64_t offset_position = valid_bits_offset; while (position < num_values) { diff --git a/cpp/src/arrow/visitor.cc b/cpp/src/arrow/visitor.cc index ed3d5bc2c68d7..e057f6b12fb1b 100644 --- a/cpp/src/arrow/visitor.cc +++ b/cpp/src/arrow/visitor.cc @@ -45,8 +45,10 @@ ARRAY_VISITOR_DEFAULT(UInt64Array) ARRAY_VISITOR_DEFAULT(HalfFloatArray) ARRAY_VISITOR_DEFAULT(FloatArray) ARRAY_VISITOR_DEFAULT(DoubleArray) -ARRAY_VISITOR_DEFAULT(BinaryArray) ARRAY_VISITOR_DEFAULT(StringArray) +ARRAY_VISITOR_DEFAULT(StringViewArray) +ARRAY_VISITOR_DEFAULT(BinaryArray) +ARRAY_VISITOR_DEFAULT(BinaryViewArray) ARRAY_VISITOR_DEFAULT(LargeBinaryArray) ARRAY_VISITOR_DEFAULT(LargeStringArray) ARRAY_VISITOR_DEFAULT(FixedSizeBinaryArray) @@ -96,7 +98,9 @@ TYPE_VISITOR_DEFAULT(HalfFloatType) TYPE_VISITOR_DEFAULT(FloatType) TYPE_VISITOR_DEFAULT(DoubleType) TYPE_VISITOR_DEFAULT(StringType) +TYPE_VISITOR_DEFAULT(StringViewType) TYPE_VISITOR_DEFAULT(BinaryType) +TYPE_VISITOR_DEFAULT(BinaryViewType) TYPE_VISITOR_DEFAULT(LargeStringType) TYPE_VISITOR_DEFAULT(LargeBinaryType) TYPE_VISITOR_DEFAULT(FixedSizeBinaryType) @@ -147,7 +151,9 @@ SCALAR_VISITOR_DEFAULT(HalfFloatScalar) SCALAR_VISITOR_DEFAULT(FloatScalar) SCALAR_VISITOR_DEFAULT(DoubleScalar) SCALAR_VISITOR_DEFAULT(StringScalar) +SCALAR_VISITOR_DEFAULT(StringViewScalar) SCALAR_VISITOR_DEFAULT(BinaryScalar) +SCALAR_VISITOR_DEFAULT(BinaryViewScalar) SCALAR_VISITOR_DEFAULT(LargeStringScalar) SCALAR_VISITOR_DEFAULT(LargeBinaryScalar) SCALAR_VISITOR_DEFAULT(FixedSizeBinaryScalar) diff --git a/cpp/src/arrow/visitor.h b/cpp/src/arrow/visitor.h index b22d4d3c567e1..650b0e7ee0a30 100644 --- a/cpp/src/arrow/visitor.h +++ b/cpp/src/arrow/visitor.h @@ -45,7 +45,9 @@ class ARROW_EXPORT ArrayVisitor { virtual Status Visit(const FloatArray& array); virtual Status Visit(const DoubleArray& array); virtual Status Visit(const StringArray& array); + virtual Status Visit(const StringViewArray& array); virtual Status Visit(const BinaryArray& array); + virtual Status Visit(const BinaryViewArray& array); virtual Status Visit(const LargeStringArray& array); virtual Status Visit(const LargeBinaryArray& array); virtual Status Visit(const FixedSizeBinaryArray& array); @@ -94,7 +96,9 @@ class ARROW_EXPORT TypeVisitor { virtual Status Visit(const FloatType& type); virtual Status Visit(const DoubleType& type); virtual Status Visit(const StringType& type); + virtual Status Visit(const StringViewType& type); virtual Status Visit(const BinaryType& type); + virtual Status Visit(const BinaryViewType& type); virtual Status Visit(const LargeStringType& type); virtual Status Visit(const LargeBinaryType& type); virtual Status Visit(const FixedSizeBinaryType& type); @@ -143,7 +147,9 @@ class ARROW_EXPORT ScalarVisitor { virtual Status Visit(const FloatScalar& scalar); virtual Status Visit(const DoubleScalar& scalar); virtual Status Visit(const StringScalar& scalar); + virtual Status Visit(const StringViewScalar& scalar); virtual Status Visit(const BinaryScalar& scalar); + virtual Status Visit(const BinaryViewScalar& scalar); virtual Status Visit(const LargeStringScalar& scalar); virtual Status Visit(const LargeBinaryScalar& scalar); virtual Status Visit(const FixedSizeBinaryScalar& scalar); diff --git a/cpp/src/arrow/visitor_generate.h b/cpp/src/arrow/visitor_generate.h index 8f6b176ba8fea..4b57abe53ff14 100644 --- a/cpp/src/arrow/visitor_generate.h +++ b/cpp/src/arrow/visitor_generate.h @@ -40,7 +40,9 @@ namespace arrow { ACTION(Boolean); \ ARROW_GENERATE_FOR_ALL_NUMERIC_TYPES(ACTION); \ ACTION(String); \ + ACTION(StringView); \ ACTION(Binary); \ + ACTION(BinaryView); \ ACTION(LargeString); \ ACTION(LargeBinary); \ ACTION(FixedSizeBinary); \ diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 315405ef1e569..4e23d0fab5c69 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -3374,6 +3374,8 @@ TEST(ArrowReadWrite, NestedRequiredOuterOptional) { for (const auto& inner_type : types) { if (inner_type->id() == ::arrow::Type::NA) continue; + if (inner_type->id() == ::arrow::Type::BINARY_VIEW) continue; + if (inner_type->id() == ::arrow::Type::STRING_VIEW) continue; auto writer_props = WriterProperties::Builder(); auto arrow_writer_props = ArrowWriterProperties::Builder(); @@ -3389,7 +3391,6 @@ TEST(ArrowReadWrite, NestedRequiredOuterOptional) { arrow_writer_props.coerce_timestamps(unit); } } - ASSERT_NO_FATAL_FAILURE(DoNestedRequiredRoundtrip(inner_type, writer_props.build(), arrow_writer_props.build())); diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 72e984d7736fa..5dff533c1cce2 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -134,6 +134,8 @@ struct ValueBufferSlicer { NOT_IMPLEMENTED_VISIT(Dictionary); NOT_IMPLEMENTED_VISIT(RunEndEncoded); NOT_IMPLEMENTED_VISIT(Extension); + NOT_IMPLEMENTED_VISIT(BinaryView); + NOT_IMPLEMENTED_VISIT(StringView); #undef NOT_IMPLEMENTED_VISIT diff --git a/cpp/src/parquet/encryption/openssl_internal.cc b/cpp/src/parquet/encryption/openssl_internal.cc index 05f2773532353..61ec81b490802 100644 --- a/cpp/src/parquet/encryption/openssl_internal.cc +++ b/cpp/src/parquet/encryption/openssl_internal.cc @@ -24,11 +24,14 @@ namespace parquet::encryption::openssl { void EnsureInitialized() { +// OpenSSL 1.1 doesn't provide OPENSSL_INIT_ENGINE_ALL_BUILTIN. +#ifdef OPENSSL_INIT_ENGINE_ALL_BUILTIN // Initialize ciphers and random engines if (!OPENSSL_init_crypto(OPENSSL_INIT_ENGINE_ALL_BUILTIN | OPENSSL_INIT_ADD_ALL_CIPHERS, NULL)) { throw ParquetException("OpenSSL initialization failed"); } +#endif } } // namespace parquet::encryption::openssl diff --git a/dev/archery/archery/integration/datagen.py b/dev/archery/archery/integration/datagen.py index 7635cfd98feda..1ce2775c160b8 100644 --- a/dev/archery/archery/integration/datagen.py +++ b/dev/archery/archery/integration/datagen.py @@ -1856,12 +1856,13 @@ def _temp_path(): .skip_tester('Rust'), generate_binary_view_case() - .skip_tester('C++') .skip_tester('C#') .skip_tester('Go') .skip_tester('Java') .skip_tester('JS') - .skip_tester('Rust'), + .skip_tester('Rust') + .skip_format(SKIP_C_SCHEMA, 'C++') + .skip_format(SKIP_C_ARRAY, 'C++'), generate_extension_case() .skip_tester('C#') diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 17123ed2b8339..b620c9b8c8b0c 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1182,12 +1182,12 @@ tasks: image: debian-cpp {% endfor %} - test-fedora-35-cpp: + test-fedora-38-cpp: ci: github template: docker-tests/github.linux.yml params: env: - FEDORA: 35 + FEDORA: 38 image: fedora-cpp {% for cpp_standard in [20] %} @@ -1295,12 +1295,12 @@ tasks: UBUNTU: 22.04 image: ubuntu-python - test-fedora-35-python-3: + test-fedora-38-python-3: ci: azure template: docker-tests/azure.linux.yml params: env: - FEDORA: 35 + FEDORA: 38 image: fedora-python test-r-linux-valgrind: diff --git a/docker-compose.yml b/docker-compose.yml index c23d3ff08e487..bbe4e65c10c43 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -655,7 +655,7 @@ services: # docker-compose run --rm fedora-cpp # Parameters: # ARCH: amd64, arm64v8, ... - # FEDORA: 33 + # FEDORA: 38 image: ${REPO}:${ARCH}-fedora-${FEDORA}-cpp build: context: . @@ -671,7 +671,6 @@ services: <<: [*common, *ccache, *sccache, *cpp] ARROW_ENABLE_TIMING_TESTS: # inherit ARROW_MIMALLOC: "ON" - Protobuf_SOURCE: "BUNDLED" # Need Protobuf >= 3.15 volumes: &fedora-volumes - .:/arrow:delegated - ${DOCKER_VOLUME_PREFIX}fedora-ccache:/ccache:delegated @@ -957,7 +956,7 @@ services: # docker-compose run --rm fedora-python # Parameters: # ARCH: amd64, arm64v8, ... - # FEDORA: 33 + # FEDORA: 38 image: ${REPO}:${ARCH}-fedora-${FEDORA}-python-3 build: context: . @@ -969,7 +968,6 @@ services: shm_size: *shm-size environment: <<: [*common, *ccache] - Protobuf_SOURCE: "BUNDLED" # Need Protobuf >= 3.15 volumes: *fedora-volumes command: *python-command diff --git a/matlab/src/cpp/arrow/matlab/error/error.h b/matlab/src/cpp/arrow/matlab/error/error.h index 2d8f5c432c96e..33e80bca8cc5a 100644 --- a/matlab/src/cpp/arrow/matlab/error/error.h +++ b/matlab/src/cpp/arrow/matlab/error/error.h @@ -202,4 +202,5 @@ namespace arrow::matlab::error { static const char* INDEX_OUT_OF_RANGE = "arrow:index:OutOfRange"; static const char* BUFFER_VIEW_OR_COPY_FAILED = "arrow:buffer:ViewOrCopyFailed"; static const char* ARRAY_PRETTY_PRINT_FAILED = "arrow:array:PrettyPrintFailed"; + static const char* TABULAR_GET_ROW_AS_STRING_FAILED = "arrow:tabular:GetRowAsStringFailed"; } diff --git a/matlab/src/cpp/arrow/matlab/tabular/get_row_as_string.h b/matlab/src/cpp/arrow/matlab/tabular/get_row_as_string.h new file mode 100644 index 0000000000000..824b6c19a7109 --- /dev/null +++ b/matlab/src/cpp/arrow/matlab/tabular/get_row_as_string.h @@ -0,0 +1,77 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/pretty_print.h" + +#include + +namespace arrow::matlab::tabular { + + namespace { + arrow::PrettyPrintOptions make_pretty_print_options() { + auto opts = arrow::PrettyPrintOptions::Defaults(); + opts.skip_new_lines = true; + opts.array_delimiters.open = ""; + opts.array_delimiters.close = ""; + opts.chunked_array_delimiters.open = ""; + opts.chunked_array_delimiters.close = ""; + return opts; + } + } + + template + arrow::Result get_row_as_string(const std::shared_ptr& tabular_object, const int64_t matlab_row_index) { + std::stringstream ss; + const int64_t row_index = matlab_row_index - 1; + if (row_index >= tabular_object->num_rows() || row_index < 0) { + ss << "Invalid Row Index: " << matlab_row_index; + return arrow::Status::Invalid(ss.str()); + } + + const auto opts = make_pretty_print_options(); + const auto num_columns = tabular_object->num_columns(); + const auto& columns = tabular_object->columns(); + + for (int32_t i = 0; i < num_columns; ++i) { + const auto& column = columns[i]; + const auto type_id = column->type()->id(); + if (arrow::is_primitive(type_id) || arrow::is_string(type_id)) { + auto slice = column->Slice(row_index, 1); + ARROW_RETURN_NOT_OK(arrow::PrettyPrint(*slice, opts, &ss)); + } else if (type_id == arrow::Type::type::STRUCT) { + // Use as a placeholder since we don't have a good + // way to display StructArray elements horiztonally on screen. + ss << ""; + } else if (type_id == arrow::Type::type::LIST) { + // Use as a placeholder since we don't have a good + // way to display ListArray elements horiztonally on screen. + ss << ""; + } else { + return arrow::Status::NotImplemented("Datatype " + column->type()->ToString() + "is not currently supported for display."); + } + + if (i + 1 < num_columns) { + // Only add the delimiter if there is at least + // one more element to print. + ss << " | "; + } + } + return ss.str(); + } +} \ No newline at end of file diff --git a/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.cc b/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.cc index 679c7382f6532..7d24ad01d7e73 100644 --- a/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.cc +++ b/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.cc @@ -23,6 +23,7 @@ #include "arrow/matlab/error/error.h" #include "arrow/matlab/tabular/proxy/record_batch.h" #include "arrow/matlab/tabular/proxy/schema.h" +#include "arrow/matlab/tabular/get_row_as_string.h" #include "arrow/type.h" #include "arrow/util/utf8.h" @@ -58,6 +59,7 @@ namespace arrow::matlab::tabular::proxy { REGISTER_METHOD(RecordBatch, getColumnByIndex); REGISTER_METHOD(RecordBatch, getColumnByName); REGISTER_METHOD(RecordBatch, getSchema); + REGISTER_METHOD(RecordBatch, getRowAsString); } std::shared_ptr RecordBatch::unwrap() { @@ -218,4 +220,20 @@ namespace arrow::matlab::tabular::proxy { context.outputs[0] = schema_proxy_id_mda; } + void RecordBatch::getRowAsString(libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + using namespace libmexclass::proxy; + mda::ArrayFactory factory; + + mda::StructArray args = context.inputs[0]; + const mda::TypedArray index_mda = args[0]["Index"]; + const auto matlab_row_index = int64_t(index_mda[0]); + + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto row_str_utf8, arrow::matlab::tabular::get_row_as_string(record_batch, matlab_row_index), + context, error::TABULAR_GET_ROW_AS_STRING_FAILED); + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto row_str_utf16, arrow::util::UTF8StringToUTF16(row_str_utf8), + context, error::UNICODE_CONVERSION_ERROR_ID); + context.outputs[0] = factory.createScalar(row_str_utf16); + } + } diff --git a/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.h b/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.h index b136ad1ea5db1..c417d8198f9ad 100644 --- a/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.h +++ b/matlab/src/cpp/arrow/matlab/tabular/proxy/record_batch.h @@ -41,6 +41,7 @@ namespace arrow::matlab::tabular::proxy { void getColumnByIndex(libmexclass::proxy::method::Context& context); void getColumnByName(libmexclass::proxy::method::Context& context); void getSchema(libmexclass::proxy::method::Context& context); + void getRowAsString(libmexclass::proxy::method::Context& context); std::shared_ptr record_batch; }; diff --git a/matlab/src/cpp/arrow/matlab/tabular/proxy/table.cc b/matlab/src/cpp/arrow/matlab/tabular/proxy/table.cc index 228e28dad9e9c..cf628407b1742 100644 --- a/matlab/src/cpp/arrow/matlab/tabular/proxy/table.cc +++ b/matlab/src/cpp/arrow/matlab/tabular/proxy/table.cc @@ -24,6 +24,8 @@ #include "arrow/matlab/error/error.h" #include "arrow/matlab/tabular/proxy/table.h" #include "arrow/matlab/tabular/proxy/schema.h" +#include "arrow/matlab/tabular/get_row_as_string.h" + #include "arrow/type.h" #include "arrow/util/utf8.h" @@ -57,6 +59,7 @@ namespace arrow::matlab::tabular::proxy { REGISTER_METHOD(Table, getSchema); REGISTER_METHOD(Table, getColumnByIndex); REGISTER_METHOD(Table, getColumnByName); + REGISTER_METHOD(Table, getRowAsString); } std::shared_ptr Table::unwrap() { @@ -212,4 +215,20 @@ namespace arrow::matlab::tabular::proxy { context.outputs[0] = chunked_array_proxy_id_mda; } + void Table::getRowAsString(libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + using namespace libmexclass::proxy; + mda::ArrayFactory factory; + + mda::StructArray args = context.inputs[0]; + const mda::TypedArray index_mda = args[0]["Index"]; + const auto matlab_row_index = int64_t(index_mda[0]); + + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto row_str_utf8, arrow::matlab::tabular::get_row_as_string(table, matlab_row_index), + context, error::TABULAR_GET_ROW_AS_STRING_FAILED); + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto row_str_utf16, arrow::util::UTF8StringToUTF16(row_str_utf8), + context, error::UNICODE_CONVERSION_ERROR_ID); + context.outputs[0] = factory.createScalar(row_str_utf16); + } + } diff --git a/matlab/src/cpp/arrow/matlab/tabular/proxy/table.h b/matlab/src/cpp/arrow/matlab/tabular/proxy/table.h index dae86a180b7a6..bfcea15bbd1c3 100644 --- a/matlab/src/cpp/arrow/matlab/tabular/proxy/table.h +++ b/matlab/src/cpp/arrow/matlab/tabular/proxy/table.h @@ -41,6 +41,7 @@ namespace arrow::matlab::tabular::proxy { void getSchema(libmexclass::proxy::method::Context& context); void getColumnByIndex(libmexclass::proxy::method::Context& context); void getColumnByName(libmexclass::proxy::method::Context& context); + void getRowAsString(libmexclass::proxy::method::Context& context); std::shared_ptr table; }; diff --git a/matlab/src/matlab/+arrow/+array/+internal/+display/getHeader.m b/matlab/src/matlab/+arrow/+array/+internal/+display/getHeader.m index 5c8704d5bf2a4..85301ddefaada 100644 --- a/matlab/src/matlab/+arrow/+array/+internal/+display/getHeader.m +++ b/matlab/src/matlab/+arrow/+array/+internal/+display/getHeader.m @@ -16,7 +16,7 @@ % permissions and limitations under the License. function header = getHeader(className, numElements, numNulls) - import arrow.array.internal.display.pluralizeStringIfNeeded + import arrow.internal.display.pluralizeStringIfNeeded elementString = pluralizeStringIfNeeded(numElements, "element"); nullString = pluralizeStringIfNeeded(numNulls, "null value"); diff --git a/matlab/src/matlab/+arrow/+internal/+display/boldFontIfPossible.m b/matlab/src/matlab/+arrow/+internal/+display/boldFontIfPossible.m new file mode 100644 index 0000000000000..cb980cbff99e8 --- /dev/null +++ b/matlab/src/matlab/+arrow/+internal/+display/boldFontIfPossible.m @@ -0,0 +1,26 @@ +%BOLDFONTIFPOSSIBLE Bolds the input string if possible + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +function str = boldFontIfPossible(str) + + arguments + str(1, 1) string {mustBeNonzeroLengthText} + end + if usejava("desktop") + str = compose("%s", str); + end +end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+array/+internal/+display/pluralizeStringIfNeeded.m b/matlab/src/matlab/+arrow/+internal/+display/pluralizeStringIfNeeded.m similarity index 100% rename from matlab/src/matlab/+arrow/+array/+internal/+display/pluralizeStringIfNeeded.m rename to matlab/src/matlab/+arrow/+internal/+display/pluralizeStringIfNeeded.m diff --git a/matlab/src/matlab/+arrow/+internal/+test/+tabular/createAllSupportedArrayTypes.m b/matlab/src/matlab/+arrow/+internal/+test/+tabular/createAllSupportedArrayTypes.m index ad2f026d64e20..a9682d317354b 100644 --- a/matlab/src/matlab/+arrow/+internal/+test/+tabular/createAllSupportedArrayTypes.m +++ b/matlab/src/matlab/+arrow/+internal/+test/+tabular/createAllSupportedArrayTypes.m @@ -24,8 +24,8 @@ end % Seed the random number generator to ensure - % reproducible results in tests. - rng(1); + % reproducible results in tests across MATLAB sessions. + rng(1, "twister"); import arrow.type.ID import arrow.array.* @@ -101,6 +101,7 @@ % Return the class names as a string array classes = string({metaClass.Name}); + classes = sort(classes); end function dict = getNumericArrayToMatlabDictionary() diff --git a/matlab/src/matlab/+arrow/+tabular/+internal/displaySchema.m b/matlab/src/matlab/+arrow/+tabular/+internal/+display/getSchemaString.m similarity index 92% rename from matlab/src/matlab/+arrow/+tabular/+internal/displaySchema.m rename to matlab/src/matlab/+arrow/+tabular/+internal/+display/getSchemaString.m index 8d6740b195abc..7da945ca993ef 100644 --- a/matlab/src/matlab/+arrow/+tabular/+internal/displaySchema.m +++ b/matlab/src/matlab/+arrow/+tabular/+internal/+display/getSchemaString.m @@ -1,4 +1,5 @@ -%DISPLAYSCHEMA Generates arrow.tabular.Schema display text. +%GETSCHEMASTRING Generates a string representation of an +% arrow.tabular.Schema object. % Licensed to the Apache Software Foundation (ASF) under one or more % contributor license agreements. See the NOTICE file distributed with @@ -15,7 +16,7 @@ % implied. See the License for the specific language governing % permissions and limitations under the License. -function text = displaySchema(schema) +function text = getSchemaString(schema) fields = schema.Fields; names = [fields.Name]; types = [fields.Type]; @@ -46,5 +47,5 @@ end text = names + ": " + typeIDs; - text = " " + strjoin(text, " | "); + text = strjoin(text, " | "); end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+tabular/+internal/+display/getTabularDisplay.m b/matlab/src/matlab/+arrow/+tabular/+internal/+display/getTabularDisplay.m new file mode 100644 index 0000000000000..054922fa03c75 --- /dev/null +++ b/matlab/src/matlab/+arrow/+tabular/+internal/+display/getTabularDisplay.m @@ -0,0 +1,44 @@ +%GETTABULARDISPLAY Generates the display for arrow.tabular.Table and +% arrow.tabular.RecordBatch. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +function tabularDisplay = getTabularDisplay(tabularObj, className) + import arrow.tabular.internal.display.getSchemaString + import arrow.tabular.internal.display.getTabularHeader + + numRows = tabularObj.NumRows; + numColumns = tabularObj.NumColumns; + tabularDisplay = getTabularHeader(className, numRows, numColumns); + + if numColumns > 0 + twoNewLines = string([newline newline]); + fourSpaces = string(repmat(' ', 1, 4)); + eightSpaces = string(repmat(' ', 1, 8)); + + schemaHeader = fourSpaces + "Schema:"; + schemaBody = eightSpaces + getSchemaString(tabularObj.Schema); + schemaDisplay = schemaHeader + twoNewLines + schemaBody; + tabularDisplay = tabularDisplay + twoNewLines + schemaDisplay; + + if numRows > 0 + rowHeader = fourSpaces + "First Row:"; + rowBody = eightSpaces + tabularObj.Proxy.getRowAsString(struct(Index=int64(1))); + rowDisplay = rowHeader + twoNewLines + rowBody; + tabularDisplay = tabularDisplay + twoNewLines + rowDisplay; + end + end +end diff --git a/matlab/src/matlab/+arrow/+tabular/+internal/+display/getTabularHeader.m b/matlab/src/matlab/+arrow/+tabular/+internal/+display/getTabularHeader.m new file mode 100644 index 0000000000000..4c647986ce055 --- /dev/null +++ b/matlab/src/matlab/+arrow/+tabular/+internal/+display/getTabularHeader.m @@ -0,0 +1,32 @@ +%GETTABULARHEADER Generates the display header for arrow.tabular.Table and +% arrow.tabular.RecordBatch. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +function header = getTabularHeader(className, numRows, numColumns) + import arrow.internal.display.boldFontIfPossible + import arrow.internal.display.pluralizeStringIfNeeded + + numRowsString = boldFontIfPossible(numRows); + numColsString = boldFontIfPossible(numColumns); + rowWordString = pluralizeStringIfNeeded(numRows, "row"); + colWordString = pluralizeStringIfNeeded(numColumns, "column"); + formatSpec = " Arrow %s with %s %s and %s %s"; + if numColumns > 0 + formatSpec = formatSpec + ":"; + end + header = compose(formatSpec,className, numRowsString, rowWordString, numColsString, colWordString); +end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+tabular/RecordBatch.m b/matlab/src/matlab/+arrow/+tabular/RecordBatch.m index fdedaecb5eb3a..0225f3d771181 100644 --- a/matlab/src/matlab/+arrow/+tabular/RecordBatch.m +++ b/matlab/src/matlab/+arrow/+tabular/RecordBatch.m @@ -112,7 +112,9 @@ methods (Access=protected) function displayScalarObject(obj) - disp(obj.toString()); + className = matlab.mixin.CustomDisplay.getClassNameForHeader(obj); + tabularDisplay = arrow.tabular.internal.display.getTabularDisplay(obj, className); + disp(tabularDisplay + newline); end end diff --git a/matlab/src/matlab/+arrow/+tabular/Schema.m b/matlab/src/matlab/+arrow/+tabular/Schema.m index 3ee40f0e14293..a50522c6b5283 100644 --- a/matlab/src/matlab/+arrow/+tabular/Schema.m +++ b/matlab/src/matlab/+arrow/+tabular/Schema.m @@ -116,7 +116,7 @@ function displayScalarObject(obj) numFields = obj.NumFields; if numFields > 0 - text = arrow.tabular.internal.displaySchema(obj); + text = " " + arrow.tabular.internal.display.getSchemaString(obj); disp(text + newline); end diff --git a/matlab/src/matlab/+arrow/+tabular/Table.m b/matlab/src/matlab/+arrow/+tabular/Table.m index c2f73450408ef..1ed205d639747 100644 --- a/matlab/src/matlab/+arrow/+tabular/Table.m +++ b/matlab/src/matlab/+arrow/+tabular/Table.m @@ -112,9 +112,10 @@ end methods (Access=protected) - function displayScalarObject(obj) - disp(obj.toString()); + className = matlab.mixin.CustomDisplay.getClassNameForHeader(obj); + tabularDisplay = arrow.tabular.internal.display.getTabularDisplay(obj, className); + disp(tabularDisplay + newline); end end diff --git a/matlab/test/arrow/tabular/tTabularDisplay.m b/matlab/test/arrow/tabular/tTabularDisplay.m new file mode 100644 index 0000000000000..f99b25ab3452f --- /dev/null +++ b/matlab/test/arrow/tabular/tTabularDisplay.m @@ -0,0 +1,148 @@ +%TTABULARDISPLAY Unit tests verifying the display of arrow.tabular.Table +%and arrow.tabular.RecordBatch objects. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +classdef tTabularDisplay < matlab.unittest.TestCase + + properties (TestParameter) + TabularType + end + + methods (TestParameterDefinition, Static) + function TabularType = initializeTabularType() + + tableStruct = struct(FullClassName="arrow.tabular.Table", ... + ClassName="Table", FromTableFcn = @arrow.table); + + recordBatchStruct = struct(FullClassName="arrow.tabular.RecordBatch", ... + ClassName="RecordBatch", FromTableFcn=@arrow.recordBatch); + + TabularType = struct(Table=tableStruct, RecordBatch=recordBatchStruct); + end + + end + + methods (Test) + + function ZeroRowsZeroColumns(testCase, TabularType) + % Verify tabular object display when the object has zero rows + % and zero columns. + import arrow.internal.test.display.makeLinkString + + tabularObj = TabularType.FromTableFcn(table); %#ok + + classNameString = makeLinkString(FullClassName=TabularType.FullClassName, ... + ClassName=TabularType.ClassName, BoldFont=true); + zeroString = getNumString(0); + header = compose(" Arrow %s with %s rows and %s columns", classNameString, zeroString, zeroString); + expectedDisplay = char(header + newline + newline); + actualDisplay = evalc('disp(tabularObj)'); + + testCase.verifyEqual(actualDisplay, expectedDisplay); + end + + function ZeroRowsOneColumn(testCase, TabularType) + % Verify tabular object display when the object has zero rows + % and one column. + import arrow.internal.test.display.makeLinkString + + t = table(1, VariableNames="Number"); + tabularObj = TabularType.FromTableFcn(t(1:0, :)); %#ok + + classNameString = makeLinkString(FullClassName=TabularType.FullClassName, ... + ClassName=TabularType.ClassName, BoldFont=true); + header = compose(" Arrow %s with %s rows and %s column:", classNameString, getNumString(0), getNumString(1)); + + fieldString = makeFieldString("Number", "Float64", "arrow.type.Float64Type"); + schema = join([" Schema:" " " + fieldString], [newline newline]); + + expectedDisplay = char(join([header schema + newline + newline], [newline newline])); + actualDisplay = evalc('disp(tabularObj)'); + + testCase.verifyEqual(actualDisplay, expectedDisplay); + end + + function OneRowOneColumn(testCase, TabularType) + % Verify tabular object display when the object has one row + % and column. + import arrow.internal.test.display.makeLinkString + + t = table(1, VariableNames="Number"); + tabularObj = TabularType.FromTableFcn(t); %#ok + + classNameString = makeLinkString(FullClassName=TabularType.FullClassName, ... + ClassName=TabularType.ClassName, BoldFont=true); + header = compose(" Arrow %s with %s row and %s column:", classNameString, getNumString(1), getNumString(1)); + + fieldString = makeFieldString("Number", "Float64", "arrow.type.Float64Type"); + schema = join([" Schema:" " " + fieldString], [newline newline]); + row = join([" First Row:" " 1"], [newline newline]); + + + expectedDisplay = char(join([header schema row + newline + newline], [newline newline])); + actualDisplay = evalc('disp(tabularObj)'); + + testCase.verifyEqual(actualDisplay, expectedDisplay); + end + + function ManyRowsAndColumns(testCase, TabularType) + % Verify tabular object display when the object has many rows + % and columns. + import arrow.internal.test.display.makeLinkString + + t = table((1:2)', ["A"; "B"], true(2, 1), VariableNames=["Number", "Letter", "Logical"]); + tabularObj = TabularType.FromTableFcn(t); %#ok + + classNameString = makeLinkString(FullClassName=TabularType.FullClassName, ... + ClassName=TabularType.ClassName, BoldFont=true); + header = compose(" Arrow %s with %s rows and %s columns:", classNameString, getNumString(2), getNumString(3)); + + fieldOneString = makeFieldString("Number", "Float64", "arrow.type.Float64Type"); + fieldTwoString = makeFieldString("Letter", "String", "arrow.type.StringType"); + fieldThreeString = makeFieldString("Logical", "Boolean", "arrow.type.BooleanType"); + + fields = join([fieldOneString fieldTwoString fieldThreeString], " | "); + schema = join([" Schema:" " " + fields], [newline newline]); + row = join([" First Row:" " 1 | ""A"" | true"], [newline newline]); + + expectedDisplay = char(join([header schema row + newline + newline], [newline newline])); + actualDisplay = evalc('disp(tabularObj)'); + + testCase.verifyEqual(actualDisplay, expectedDisplay); + end + end +end + +function numString = getNumString(num) + if usejava("desktop") + numString = compose("%d", num); + else + numString = compose("%d", num); + end +end + +function str = makeFieldString(fieldName, classType, fullClassType) + import arrow.internal.test.display.makeLinkString + + if usejava("desktop") + name = compose("%s:", fieldName); + typeStr = makeLinkString(FullClassName=fullClassType, ClassName=classType, BoldFont=true); + str = name + " " + typeStr; + else + str = fieldName + ": " + classType; + end +end \ No newline at end of file diff --git a/matlab/test/arrow/tabular/tTabularInternal.m b/matlab/test/arrow/tabular/tTabularInternal.m new file mode 100644 index 0000000000000..28075d7763dea --- /dev/null +++ b/matlab/test/arrow/tabular/tTabularInternal.m @@ -0,0 +1,110 @@ +%TTABULARINTERNAL Unit tests for internal functionality of tabular types. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +classdef tTabularInternal < matlab.unittest.TestCase + + properties(TestParameter) + TabularObjectWithAllTypes + + TabularObjectWithOneColumn + + TabularObjectWithThreeRows + end + + methods (TestParameterDefinition, Static) + function TabularObjectWithAllTypes = initializeTabularObjectWithAllTypes() + arrays = arrow.internal.test.tabular.createAllSupportedArrayTypes(NumRows=1); + arrowTable = arrow.tabular.Table.fromArrays(arrays{:}); + arrowRecordBatch = arrow.tabular.Table.fromArrays(arrays{:}); + TabularObjectWithAllTypes = struct(Table=arrowTable, ... + RecordBatch=arrowRecordBatch); + end + + function TabularObjectWithOneColumn = initializeTabularObjectWithOneColumn() + t = table((1:3)'); + arrowTable = arrow.table(t); + arrowRecordBatch = arrow.recordBatch(t); + TabularObjectWithOneColumn = struct(Table=arrowTable, ... + RecordBatch=arrowRecordBatch); + end + + function TabularObjectWithThreeRows = initializeTabularObjectWithThreeRows() + t = table((1:3)', ["A"; "B"; "C"]); + arrowTable = arrow.table(t); + arrowRecordBatch = arrow.recordBatch(t); + TabularObjectWithThreeRows = struct(Table=arrowTable, ... + RecordBatch=arrowRecordBatch); + end + end + + methods (Test) + function RowWithAllTypes(testCase, TabularObjectWithAllTypes) + % Verify getRowAsString successfully returns the expected string + % when called on a Table/RecordBatch that contains all + % supported array types. + proxy = TabularObjectWithAllTypes.Proxy; + columnStrs = ["false", "2024-02-23", "2023-08-24", "78", "38", ... + "24", "48", "89", "102", "", """107""", "", ... + "00:03:44", "00:00:07.000000", "2024-02-10 00:00:00.000000", ... + "107", "143", "36", "51"]; + expectedString = strjoin(columnStrs, " | "); + actualString = proxy.getRowAsString(struct(Index=int64(1))); + testCase.verifyEqual(actualString, expectedString); + end + + function RowWithOneColumn(testCase, TabularObjectWithOneColumn) + % Verify getRowAsString successfully returns the expected string + % when called on a Table/RecordBatch with one column. + proxy = TabularObjectWithOneColumn.Proxy; + expectedString = "1"; + actualString = proxy.getRowAsString(struct(Index=int64(1))); + testCase.verifyEqual(actualString, expectedString); + end + + function RowIndex(testCase, TabularObjectWithThreeRows) + % Verify getRowAsString returns the expected string for + % the provided row index. + proxy = TabularObjectWithThreeRows.Proxy; + + actualString = proxy.getRowAsString(struct(Index=int64(1))); + expectedString = "1 | ""A"""; + testCase.verifyEqual(actualString, expectedString); + + actualString = proxy.getRowAsString(struct(Index=int64(2))); + expectedString = "2 | ""B"""; + testCase.verifyEqual(actualString, expectedString); + + actualString = proxy.getRowAsString(struct(Index=int64(3))); + expectedString = "3 | ""C"""; + testCase.verifyEqual(actualString, expectedString); + end + + function GetRowAsStringFailed(testCase, TabularObjectWithThreeRows) + % Verify getRowAsString throws an error with the ID + % arrow:tabular:GetRowAsStringFailed if provided invalid index + % values. + proxy = TabularObjectWithThreeRows.Proxy; + fcn = @() proxy.getRowAsString(struct(Index=int64(0))); + testCase.verifyError(fcn, "arrow:tabular:GetRowAsStringFailed"); + + fcn = @() proxy.getRowAsString(struct(Index=int64(4))); + testCase.verifyError(fcn, "arrow:tabular:GetRowAsStringFailed"); + end + + end + +end \ No newline at end of file diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc index 91c7b8a45718e..8ed5d4e216e8e 100644 --- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc +++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc @@ -1353,7 +1353,8 @@ struct ObjectWriterVisitor { std::is_same::value || (std::is_base_of::value && !std::is_same::value) || - std::is_base_of::value, + std::is_base_of::value || + std::is_base_of::value, Status> Visit(const Type& type) { return Status::NotImplemented("No implemented conversion to object dtype: ", diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc index 2143129356fd5..e47492499867a 100644 --- a/python/pyarrow/src/arrow/python/python_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc @@ -572,12 +572,18 @@ struct PyConverterTrait; template struct PyConverterTrait< - T, enable_if_t<(!is_nested_type::value && !is_interval_type::value && - !is_extension_type::value) || - std::is_same::value>> { + T, + enable_if_t<(!is_nested_type::value && !is_interval_type::value && + !is_extension_type::value && !is_binary_view_like_type::value) || + std::is_same::value>> { using type = PyPrimitiveConverter; }; +template +struct PyConverterTrait> { + // not implemented +}; + template struct PyConverterTrait> { using type = PyListConverter; diff --git a/r/src/r_to_arrow.cpp b/r/src/r_to_arrow.cpp index 19c23973a62fe..d9bf848e24292 100644 --- a/r/src/r_to_arrow.cpp +++ b/r/src/r_to_arrow.cpp @@ -1051,10 +1051,15 @@ struct RConverterTrait; template struct RConverterTrait< T, enable_if_t::value && !is_interval_type::value && - !is_extension_type::value>> { + !is_extension_type::value && !is_binary_view_like_type::value>> { using type = RPrimitiveConverter; }; +template +struct RConverterTrait> { + // not implemented +}; + template struct RConverterTrait> { using type = RListConverter;