From 9d40a6a6630f951b9ccf8e8984c58dc0602921eb Mon Sep 17 00:00:00 2001 From: "yihao.dai" <954206947@qq.com> Date: Wed, 28 Aug 2024 23:32:10 +0800 Subject: [PATCH 1/8] GH-43860: [Go][Parquet] Handle the error correctly (#43861) ### Rationale for this change Fixes: https://github.com/apache/arrow/issues/43860 ### What changes are included in this PR? Return error correctly ### Are these changes tested? Yes ### Are there any user-facing changes? Nope * GitHub Issue: #43860 Authored-by: bigsheeper Signed-off-by: Matt Topol --- go/parquet/file/file_reader_test.go | 49 +++++++++++++++++++++++++++++ go/parquet/file/record_reader.go | 2 +- 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/go/parquet/file/file_reader_test.go b/go/parquet/file/file_reader_test.go index 35f4da4e8667c..74926c958e2f7 100644 --- a/go/parquet/file/file_reader_test.go +++ b/go/parquet/file/file_reader_test.go @@ -452,6 +452,55 @@ func TestRleBooleanEncodingFileRead(t *testing.T) { assert.Equal(t, expected, values[:len(expected)]) } +type mockBadReader struct { + cnt int + reader *os.File +} + +func (m *mockBadReader) Seek(offset int64, whence int) (int64, error) { + return m.reader.Seek(offset, whence) +} + +func (m *mockBadReader) ReadAt(p []byte, off int64) (n int, err error) { + if m.cnt == 0 { + return 0, fmt.Errorf("mock error") + } + m.cnt-- + return m.reader.ReadAt(p, off) +} + +func TestBadReader(t *testing.T) { + dir := os.Getenv("PARQUET_TEST_DATA") + if dir == "" { + t.Skip("no path supplied with PARQUET_TEST_DATA") + } + require.DirExists(t, dir) + + filePath := path.Join(dir, "byte_stream_split_extended.gzip.parquet") + f, err := os.Open(filePath) + assert.NoError(t, err) + defer f.Close() + + reader := &mockBadReader{ + cnt: 2, + reader: f, + } + r, err := file.NewParquetReader(reader, file.WithReadProps(&parquet.ReaderProperties{ + BufferSize: int64(1024), + BufferedStreamEnabled: true, + })) + assert.NoError(t, err) + + fileReader, err := pqarrow.NewFileReader(r, pqarrow.ArrowReadProperties{}, memory.DefaultAllocator) + assert.NoError(t, err) + + columnReader, err := fileReader.GetColumn(context.Background(), 0) + assert.NoError(t, err) + + _, err = columnReader.NextBatch(1) + assert.ErrorContains(t, err, "mock error") // Expect an error to occur. +} + func TestByteStreamSplitEncodingFileRead(t *testing.T) { dir := os.Getenv("PARQUET_TEST_DATA") if dir == "" { diff --git a/go/parquet/file/record_reader.go b/go/parquet/file/record_reader.go index 667ffca77a8d1..765f4a9d34b33 100755 --- a/go/parquet/file/record_reader.go +++ b/go/parquet/file/record_reader.go @@ -645,7 +645,7 @@ func (rr *recordReader) ReadRecords(numRecords int64) (int64, error) { } } - return recordsRead, nil + return recordsRead, rr.Err() } func (rr *recordReader) ReleaseValidBits() *memory.Buffer { From 0bc91dd2447696a208adec266270ab722099b0e2 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 28 Aug 2024 15:07:02 -0300 Subject: [PATCH 2/8] GH-43854: [C++] Expose the set of device types where a ChunkedArray is allocated (#43853) ### Rationale for this change `ChunkedArray`s allow flexible allocation of arrays -- the whole array doesn't have to be allocated in huge contiguous buffers. Nothing today prevents chunked arrays from being made of chunks allocated in different devices and that is good. But we need a way to query the set of devices where a chunked array is allocated at. This PR adds that missing part. ### What changes are included in this PR? 
Addition of:

- the `DeviceAllocationTypeSet` class
- `ChunkedArray::device_types()`
- `Datum::device_types()`

Moved `enum DeviceAllocationType` to the `type_fwd.h` header because `device.h` is too expensive a header to hold this widely used `enum`.

### Are these changes tested?

Added more asserts to `chunked_array_test.cc`.

### Are there any user-facing changes?

New APIs.
* GitHub Issue: #43854

Authored-by: Felipe Oliveira Carvalho
Signed-off-by: Felipe Oliveira Carvalho
---
 cpp/src/arrow/CMakeLists.txt                |  1 +
 cpp/src/arrow/chunked_array.cc              | 13 +++
 cpp/src/arrow/chunked_array.h               |  8 ++
 cpp/src/arrow/chunked_array_test.cc         |  5 ++
 cpp/src/arrow/compute/function.cc           |  1 +
 cpp/src/arrow/compute/kernel.cc             |  1 +
 cpp/src/arrow/compute/kernel.h              |  1 +
 cpp/src/arrow/datum.cc                      | 40 +++++++++
 cpp/src/arrow/datum.h                       |  3 +
 cpp/src/arrow/device.h                      | 18 ----
 cpp/src/arrow/device_allocation_type_set.cc | 80 +++++++++++++++++
 cpp/src/arrow/device_allocation_type_set.h  | 97 +++++++++++++++++++++
 cpp/src/arrow/type_fwd.h                    | 21 +++++
 13 files changed, 271 insertions(+), 18 deletions(-)
 create mode 100644 cpp/src/arrow/device_allocation_type_set.cc
 create mode 100644 cpp/src/arrow/device_allocation_type_set.h

diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index 6b0ac8c23c75a..65343df1291ba 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -373,6 +373,7 @@ set(ARROW_SRCS
     config.cc
     datum.cc
     device.cc
+    device_allocation_type_set.cc
     extension_type.cc
     extension/bool8.cc
     extension/uuid.cc
diff --git a/cpp/src/arrow/chunked_array.cc b/cpp/src/arrow/chunked_array.cc
index c36b736d5d5df..dd6aa51534fcb 100644
--- a/cpp/src/arrow/chunked_array.cc
+++ b/cpp/src/arrow/chunked_array.cc
@@ -27,6 +27,7 @@
 #include "arrow/array/array_nested.h"
 #include "arrow/array/util.h"
 #include "arrow/array/validate.h"
+#include "arrow/device_allocation_type_set.h"
 #include "arrow/pretty_print.h"
 #include "arrow/status.h"
 #include "arrow/type.h"
@@ -86,6 +87,18 @@ Result<std::shared_ptr<ChunkedArray>> ChunkedArray::MakeEmpty(
   return std::make_shared<ChunkedArray>(std::move(new_chunks));
 }
 
+DeviceAllocationTypeSet ChunkedArray::device_types() const {
+  if (chunks_.empty()) {
+    // An empty ChunkedArray is considered to be CPU-only.
+    return DeviceAllocationTypeSet::CpuOnly();
+  }
+  DeviceAllocationTypeSet set;
+  for (const auto& chunk : chunks_) {
+    set.add(chunk->device_type());
+  }
+  return set;
+}
+
 bool ChunkedArray::Equals(const ChunkedArray& other, const EqualOptions& opts) const {
   if (length_ != other.length()) {
     return false;
diff --git a/cpp/src/arrow/chunked_array.h b/cpp/src/arrow/chunked_array.h
index 5d300861d85c2..c65b6cb6e227f 100644
--- a/cpp/src/arrow/chunked_array.h
+++ b/cpp/src/arrow/chunked_array.h
@@ -25,6 +25,7 @@
 
 #include "arrow/chunk_resolver.h"
 #include "arrow/compare.h"
+#include "arrow/device_allocation_type_set.h"
 #include "arrow/result.h"
 #include "arrow/status.h"
 #include "arrow/type_fwd.h"
@@ -116,6 +117,13 @@
   /// \return an ArrayVector of chunks
   const ArrayVector& chunks() const { return chunks_; }
 
+  /// \return The set of device allocation types used by the chunks in this
+  /// chunked array.
+  DeviceAllocationTypeSet device_types() const;
+
+  /// \return true if all chunks are allocated on CPU-accessible memory.
+  bool is_cpu() const { return device_types().is_cpu_only(); }
+
   /// \brief Construct a zero-copy slice of the chunked array with the
   /// indicated offset and length
   ///
diff --git a/cpp/src/arrow/chunked_array_test.cc b/cpp/src/arrow/chunked_array_test.cc
index e9cc283b53cd5..b796e9250008a 100644
--- a/cpp/src/arrow/chunked_array_test.cc
+++ b/cpp/src/arrow/chunked_array_test.cc
@@ -61,12 +61,17 @@ TEST_F(TestChunkedArray, Make) {
                        ChunkedArray::Make({}, int64()));
   AssertTypeEqual(*int64(), *result->type());
   ASSERT_EQ(result->num_chunks(), 0);
+  // Empty chunked arrays are treated as CPU-allocated.
+  ASSERT_TRUE(result->is_cpu());
 
   auto chunk0 = ArrayFromJSON(int8(), "[0, 1, 2]");
   auto chunk1 = ArrayFromJSON(int16(), "[3, 4, 5]");
 
   ASSERT_OK_AND_ASSIGN(result, ChunkedArray::Make({chunk0, chunk0}));
   ASSERT_OK_AND_ASSIGN(auto result2, ChunkedArray::Make({chunk0, chunk0}, int8()));
+  // All chunks are CPU-accessible.
+  ASSERT_TRUE(result->is_cpu());
+  ASSERT_TRUE(result2->is_cpu());
   AssertChunkedEqual(*result, *result2);
 
   ASSERT_RAISES(TypeError, ChunkedArray::Make({chunk0, chunk1}));
diff --git a/cpp/src/arrow/compute/function.cc b/cpp/src/arrow/compute/function.cc
index e1a2e8c5d8879..0478a3d1e801a 100644
--- a/cpp/src/arrow/compute/function.cc
+++ b/cpp/src/arrow/compute/function.cc
@@ -30,6 +30,7 @@
 #include "arrow/compute/kernels/common_internal.h"
 #include "arrow/compute/registry.h"
 #include "arrow/datum.h"
+#include "arrow/device_allocation_type_set.h"
 #include "arrow/util/cpu_info.h"
 #include "arrow/util/logging.h"
 #include "arrow/util/tracing_internal.h"
diff --git a/cpp/src/arrow/compute/kernel.cc b/cpp/src/arrow/compute/kernel.cc
index 5c87ef2cd0561..5e7461cc52d0e 100644
--- a/cpp/src/arrow/compute/kernel.cc
+++ b/cpp/src/arrow/compute/kernel.cc
@@ -24,6 +24,7 @@
 
 #include "arrow/buffer.h"
 #include "arrow/compute/exec.h"
+#include "arrow/device_allocation_type_set.h"
 #include "arrow/result.h"
 #include "arrow/type_traits.h"
 #include "arrow/util/bit_util.h"
diff --git a/cpp/src/arrow/compute/kernel.h b/cpp/src/arrow/compute/kernel.h
index 1adb3e96c97c8..cfa1cd8193f36 100644
--- a/cpp/src/arrow/compute/kernel.h
+++ b/cpp/src/arrow/compute/kernel.h
@@ -31,6 +31,7 @@
 #include "arrow/buffer.h"
 #include "arrow/compute/exec.h"
 #include "arrow/datum.h"
+#include "arrow/device_allocation_type_set.h"
 #include "arrow/memory_pool.h"
 #include "arrow/result.h"
 #include "arrow/status.h"
diff --git a/cpp/src/arrow/datum.cc b/cpp/src/arrow/datum.cc
index 2ac230232e1b7..b19d186447547 100644
--- a/cpp/src/arrow/datum.cc
+++ b/cpp/src/arrow/datum.cc
@@ -25,6 +25,7 @@
 #include "arrow/array/array_base.h"
 #include "arrow/array/util.h"
 #include "arrow/chunked_array.h"
+#include "arrow/device_allocation_type_set.h"
 #include "arrow/record_batch.h"
 #include "arrow/scalar.h"
 #include "arrow/table.h"
@@ -156,6 +157,45 @@ ArrayVector Datum::chunks() const {
   return this->chunked_array()->chunks();
 }
 
+DeviceAllocationTypeSet Datum::device_types() const {
+  switch (kind()) {
+    case NONE:
+      break;
+    case SCALAR:
+      // Scalars are assumed to always reside in CPU memory for now.
+      return DeviceAllocationTypeSet::CpuOnly();
+    case ARRAY:
+      return DeviceAllocationTypeSet{array()->device_type()};
+    case CHUNKED_ARRAY:
+      return chunked_array()->device_types();
+    case RECORD_BATCH: {
+      auto& columns = record_batch()->columns();
+      if (columns.empty()) {
+        // An empty RecordBatch is considered to be CPU-only.
+ return DeviceAllocationTypeSet::CpuOnly(); + } + DeviceAllocationTypeSet set; + for (const auto& column : columns) { + set.add(column->device_type()); + } + return set; + } + case TABLE: { + auto& columns = table()->columns(); + if (columns.empty()) { + // An empty Table is considered to be CPU-only. + return DeviceAllocationTypeSet::CpuOnly(); + } + DeviceAllocationTypeSet set; + for (const auto& column : columns) { + set.Add(column->device_types()); + } + return set; + } + } + return {}; +} + bool Datum::Equals(const Datum& other) const { if (this->kind() != other.kind()) return false; diff --git a/cpp/src/arrow/datum.h b/cpp/src/arrow/datum.h index 31b2d2274c900..4a88e7a81125c 100644 --- a/cpp/src/arrow/datum.h +++ b/cpp/src/arrow/datum.h @@ -26,6 +26,7 @@ #include #include "arrow/array/data.h" +#include "arrow/device_allocation_type_set.h" #include "arrow/scalar.h" #include "arrow/type.h" #include "arrow/type_traits.h" @@ -295,6 +296,8 @@ struct ARROW_EXPORT Datum { /// \return empty if not arraylike ArrayVector chunks() const; + DeviceAllocationTypeSet device_types() const; + /// \brief True if the two data are equal bool Equals(const Datum& other) const; diff --git a/cpp/src/arrow/device.h b/cpp/src/arrow/device.h index f5cca0d27d7b2..1dbe5b4b13e89 100644 --- a/cpp/src/arrow/device.h +++ b/cpp/src/arrow/device.h @@ -32,24 +32,6 @@ namespace arrow { -/// \brief EXPERIMENTAL: Device type enum which matches up with C Data Device types -enum class DeviceAllocationType : char { - kCPU = 1, - kCUDA = 2, - kCUDA_HOST = 3, - kOPENCL = 4, - kVULKAN = 7, - kMETAL = 8, - kVPI = 9, - kROCM = 10, - kROCM_HOST = 11, - kEXT_DEV = 12, - kCUDA_MANAGED = 13, - kONEAPI = 14, - kWEBGPU = 15, - kHEXAGON = 16, -}; - class MemoryManager; /// \brief EXPERIMENTAL: Abstract interface for hardware devices diff --git a/cpp/src/arrow/device_allocation_type_set.cc b/cpp/src/arrow/device_allocation_type_set.cc new file mode 100644 index 0000000000000..83e9e57f2ee47 --- /dev/null +++ b/cpp/src/arrow/device_allocation_type_set.cc @@ -0,0 +1,80 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+#include <string>
+
+#include "arrow/device_allocation_type_set.h"
+#include "arrow/type_fwd.h"
+
+namespace arrow {
+
+const char* DeviceAllocationTypeToCStr(DeviceAllocationType type) {
+  switch (type) {
+    case DeviceAllocationType::kCPU:
+      return "CPU";
+    case DeviceAllocationType::kCUDA:
+      return "CUDA";
+    case DeviceAllocationType::kCUDA_HOST:
+      return "CUDA_HOST";
+    case DeviceAllocationType::kOPENCL:
+      return "OPENCL";
+    case DeviceAllocationType::kVULKAN:
+      return "VULKAN";
+    case DeviceAllocationType::kMETAL:
+      return "METAL";
+    case DeviceAllocationType::kVPI:
+      return "VPI";
+    case DeviceAllocationType::kROCM:
+      return "ROCM";
+    case DeviceAllocationType::kROCM_HOST:
+      return "ROCM_HOST";
+    case DeviceAllocationType::kEXT_DEV:
+      return "EXT_DEV";
+    case DeviceAllocationType::kCUDA_MANAGED:
+      return "CUDA_MANAGED";
+    case DeviceAllocationType::kONEAPI:
+      return "ONEAPI";
+    case DeviceAllocationType::kWEBGPU:
+      return "WEBGPU";
+    case DeviceAllocationType::kHEXAGON:
+      return "HEXAGON";
+  }
+  return "<unknown>";
+}
+
+std::string DeviceAllocationTypeSet::ToString() const {
+  std::string result = "{";
+  for (int i = 1; i <= kDeviceAllocationTypeMax; i++) {
+    if (device_type_bitset_.test(i)) {
+      // Skip all the unused values in the enum.
+      switch (i) {
+        case 0:
+        case 5:
+        case 6:
+          continue;
+      }
+      if (result.size() > 1) {
+        result += ", ";
+      }
+      result += DeviceAllocationTypeToCStr(static_cast<DeviceAllocationType>(i));
+    }
+  }
+  result += "}";
+  return result;
+}
+
+}  // namespace arrow
diff --git a/cpp/src/arrow/device_allocation_type_set.h b/cpp/src/arrow/device_allocation_type_set.h
new file mode 100644
index 0000000000000..974367307e6d4
--- /dev/null
+++ b/cpp/src/arrow/device_allocation_type_set.h
@@ -0,0 +1,97 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <bitset>
+#include <string>
+
+#include "arrow/type_fwd.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+ARROW_EXPORT
+const char* DeviceAllocationTypeToCStr(DeviceAllocationType type);
+
+class ARROW_EXPORT DeviceAllocationTypeSet {
+ private:
+  std::bitset<kDeviceAllocationTypeMax + 1> device_type_bitset_;
+
+ public:
+  /// \brief Construct an empty set of device types.
+  DeviceAllocationTypeSet() = default;
+
+  /// \brief Construct a set of device types with a single device type.
+  DeviceAllocationTypeSet(  // NOLINT implicit construction
+      DeviceAllocationType accepted_device_type) {
+    add(accepted_device_type);
+  }
+
+  /// \brief Construct a set of device types containing only "kCPU".
+  static DeviceAllocationTypeSet CpuOnly() {
+    return DeviceAllocationTypeSet{DeviceAllocationType::kCPU};
+  }
+
+  /// \brief Construct a set of device types containing all device types.
+ static DeviceAllocationTypeSet All() { + DeviceAllocationTypeSet all; + all.device_type_bitset_.set(); + // Don't set the invalid enum values. + all.device_type_bitset_.reset(0); + all.device_type_bitset_.reset(5); + all.device_type_bitset_.reset(6); + return all; + } + + /// \brief Add a device type to the set of device types. + void add(DeviceAllocationType device_type) { + device_type_bitset_.set(static_cast(device_type)); + } + + /// \brief Remove a device type from the set of device types. + void remove(DeviceAllocationType device_type) { + device_type_bitset_.reset(static_cast(device_type)); + } + + /// \brief Return true iff the set only contains the CPU device type. + bool is_cpu_only() const { + return device_type_bitset_ == CpuOnly().device_type_bitset_; + } + + /// \brief Return true if the set of accepted device types includes the + /// device type. + bool contains(DeviceAllocationType device_type) const { + return device_type_bitset_.test(static_cast(device_type)); + } + + /// \brief Add all device types from another set to this set. + void Add(DeviceAllocationTypeSet other) { + device_type_bitset_ |= other.device_type_bitset_; + } + + /// \brief Return true if the set of accepted device types includes all the + /// device types in the other set. + bool Contains(DeviceAllocationTypeSet other) const { + // other \subseteq this <==> (other \intersect this == other) + return (other.device_type_bitset_ & device_type_bitset_) == other.device_type_bitset_; + } + + std::string ToString() const; +}; + +} // namespace arrow diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 08777d247edbf..8faebe217f141 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -724,4 +724,25 @@ ARROW_EXPORT MemoryPool* default_memory_pool(); constexpr int64_t kDefaultBufferAlignment = 64; +/// \brief EXPERIMENTAL: Device type enum which matches up with C Data Device types +enum class DeviceAllocationType : char { + kCPU = 1, + kCUDA = 2, + kCUDA_HOST = 3, + kOPENCL = 4, + kVULKAN = 7, + kMETAL = 8, + kVPI = 9, + kROCM = 10, + kROCM_HOST = 11, + kEXT_DEV = 12, + kCUDA_MANAGED = 13, + kONEAPI = 14, + kWEBGPU = 15, + kHEXAGON = 16, +}; +constexpr int kDeviceAllocationTypeMax = 16; + +class DeviceAllocationTypeSet; + } // namespace arrow From 58415d1fac50cb829b3dcf08526033d6db8c30db Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 29 Aug 2024 02:54:32 +0200 Subject: [PATCH 3/8] GH-38183: [CI][Python] Use pipx to install GCS testbench (#43852) ### Rationale for this change Installing the GCS testbench using the same Python that's being used to test PyArrow is fragile: some testbench versions may not be compatible, or there could be conflicts among the dependencies of the respective libraries. ### What changes are included in this PR? Use `pipx` to install the GCS testbench in a separate, controlled environment, using an appropriate Python version. ### Are these changes tested? Yes, by CI. ### Are there any user-facing changes? No. 
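For reference, the isolation pattern condenses to the following shell sketch (what `ci/scripts/install_gcs_testbench.sh` below effectively runs; the real script additionally handles `PIPX_PYTHON` overrides, root installs, and skipping Python 3.13):

```sh
# pipx installs the testbench into its own virtual environment, so its
# dependencies cannot conflict with the packages used to test PyArrow.
# PIPX_BIN_DIR decides where the storage-testbench entry point lands.
export PIPX_BIN_DIR=/usr/local/bin
python3 -m pip install -U pipx
python3 -m pipx install --verbose \
  "https://github.com/googleapis/storage-testbench/archive/v0.39.0.tar.gz"
storage-testbench -h  # smoke test: the CLI must now be on PATH
```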
* GitHub Issue: #38183 Authored-by: Antoine Pitrou Signed-off-by: Sutou Kouhei --- .github/workflows/cpp.yml | 8 ++- appveyor.yml | 1 + ci/appveyor-cpp-build.bat | 2 + ci/docker/conda-cpp.dockerfile | 12 ++-- ci/docker/conda-python.dockerfile | 5 -- ...ython-wheel-windows-test-vs2019.dockerfile | 27 +++++--- ci/docker/ubuntu-20.04-cpp-minimal.dockerfile | 1 + ci/docker/ubuntu-22.04-cpp-minimal.dockerfile | 1 + ci/docker/ubuntu-24.04-cpp-minimal.dockerfile | 1 + ci/scripts/install_gcs_testbench.bat | 13 +++- ci/scripts/install_gcs_testbench.sh | 20 +++--- ci/scripts/python_wheel_windows_test.bat | 40 ++++++----- cpp/src/arrow/filesystem/gcsfs_test.cc | 68 +++++++++---------- python/pyarrow/tests/conftest.py | 7 +- python/scripts/run_emscripten_tests.py | 2 +- r/tests/testthat/test-gcs.R | 4 +- 16 files changed, 122 insertions(+), 90 deletions(-) diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index c5482f730823b..fd23e0cf217e6 100644 --- a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -465,15 +465,17 @@ jobs: chmod +x /usr/local/bin/minio.exe - name: Set up Python uses: actions/setup-python@v5.1.1 + id: python-install with: python-version: 3.9 - name: Install Google Cloud Storage Testbench - shell: bash + shell: msys2 {0} + env: + PIPX_BIN_DIR: /usr/local/bin + PIPX_PYTHON: ${{ steps.python-install.outputs.python-path }} run: | ci/scripts/install_gcs_testbench.sh default - echo "PYTHON_BIN_DIR=$(cygpath --windows $(dirname $(which python3.exe)))" >> $GITHUB_ENV - name: Test shell: msys2 {0} run: | - PATH="$(cygpath --unix ${PYTHON_BIN_DIR}):${PATH}" ci/scripts/cpp_test.sh "$(pwd)" "$(pwd)/build" diff --git a/appveyor.yml b/appveyor.yml index 5954251d34733..9e4582f1d8d7f 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -24,6 +24,7 @@ only_commits: - appveyor.yml - ci/appveyor* - ci/conda* + - ci/scripts/*.bat - cpp/ - format/ - python/ diff --git a/ci/appveyor-cpp-build.bat b/ci/appveyor-cpp-build.bat index f688fbb63a9ad..08a052e82f24d 100644 --- a/ci/appveyor-cpp-build.bat +++ b/ci/appveyor-cpp-build.bat @@ -46,7 +46,9 @@ set ARROW_CMAKE_ARGS=-DARROW_DEPENDENCY_SOURCE=CONDA -DARROW_WITH_BZ2=ON set ARROW_CXXFLAGS=/WX /MP @rem Install GCS testbench +set PIPX_BIN_DIR=C:\Windows\ call %CD%\ci\scripts\install_gcs_testbench.bat +storage-testbench -h || exit /B @rem @rem Build and test Arrow C++ libraries (including Parquet) diff --git a/ci/docker/conda-cpp.dockerfile b/ci/docker/conda-cpp.dockerfile index dff1f2224809a..eb035d887a158 100644 --- a/ci/docker/conda-cpp.dockerfile +++ b/ci/docker/conda-cpp.dockerfile @@ -42,17 +42,19 @@ RUN mamba install -q -y \ valgrind && \ mamba clean --all +# We want to install the GCS testbench using the Conda base environment's Python, +# because the test environment's Python may later change. +ENV PIPX_PYTHON=/opt/conda/bin/python3 +COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts +RUN /arrow/ci/scripts/install_gcs_testbench.sh default + # Ensure npm, node and azurite are on path. npm and node are required to install azurite, which will then need to -# be on the path for the tests to run. +# be on the path for the tests to run. ENV PATH=/opt/conda/envs/arrow/bin:$PATH COPY ci/scripts/install_azurite.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_azurite.sh -# We want to install the GCS testbench using the same Python binary that the Conda code will use. 
-COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts -RUN /arrow/ci/scripts/install_gcs_testbench.sh default - COPY ci/scripts/install_sccache.sh /arrow/ci/scripts/ RUN /arrow/ci/scripts/install_sccache.sh unknown-linux-musl /usr/local/bin diff --git a/ci/docker/conda-python.dockerfile b/ci/docker/conda-python.dockerfile index 027fd589cecca..7e8dbe76f6248 100644 --- a/ci/docker/conda-python.dockerfile +++ b/ci/docker/conda-python.dockerfile @@ -32,11 +32,6 @@ RUN mamba install -q -y \ nomkl && \ mamba clean --all -# XXX The GCS testbench was already installed in conda-cpp.dockerfile, -# but we changed the installed Python version above, so we need to reinstall it. -COPY ci/scripts/install_gcs_testbench.sh /arrow/ci/scripts -RUN /arrow/ci/scripts/install_gcs_testbench.sh default - ENV ARROW_ACERO=ON \ ARROW_BUILD_STATIC=OFF \ ARROW_BUILD_TESTS=OFF \ diff --git a/ci/docker/python-wheel-windows-test-vs2019.dockerfile b/ci/docker/python-wheel-windows-test-vs2019.dockerfile index 5f488a4c285ff..625ab25f848f2 100644 --- a/ci/docker/python-wheel-windows-test-vs2019.dockerfile +++ b/ci/docker/python-wheel-windows-test-vs2019.dockerfile @@ -35,16 +35,27 @@ RUN setx path "%path%;C:\Program Files\Git\usr\bin" RUN wmic product where "name like 'python%%'" call uninstall /nointeractive && \ rm -rf Python* +# Install the GCS testbench using a well-known Python version. +# NOTE: cannot use pipx's `--fetch-missing-python` because of +# https://github.com/pypa/pipx/issues/1521, therefore download Python ourselves. +RUN choco install -r -y --pre --no-progress python --version=3.11.9 +ENV PIPX_BIN_DIR=C:\\Windows\\ +ENV PIPX_PYTHON="C:\Python311\python.exe" +COPY ci/scripts/install_gcs_testbench.bat C:/arrow/ci/scripts/ +RUN call "C:\arrow\ci\scripts\install_gcs_testbench.bat" && \ + storage-testbench -h + # Define the full version number otherwise choco falls back to patch number 0 (3.8 => 3.8.0) ARG python=3.8 -RUN (if "%python%"=="3.8" setx PYTHON_VERSION "3.8.10" && setx PATH "%PATH%;C:\Python38;C:\Python38\Scripts") & \ - (if "%python%"=="3.9" setx PYTHON_VERSION "3.9.13" && setx PATH "%PATH%;C:\Python39;C:\Python39\Scripts") & \ - (if "%python%"=="3.10" setx PYTHON_VERSION "3.10.11" && setx PATH "%PATH%;C:\Python310;C:\Python310\Scripts") & \ - (if "%python%"=="3.11" setx PYTHON_VERSION "3.11.9" && setx PATH "%PATH%;C:\Python311;C:\Python311\Scripts") & \ - (if "%python%"=="3.12" setx PYTHON_VERSION "3.12.4" && setx PATH "%PATH%;C:\Python312;C:\Python312\Scripts") & \ - (if "%python%"=="3.13" setx PYTHON_VERSION "3.13.0-rc1" && setx PATH "%PATH%;C:\Python313;C:\Python313\Scripts") +RUN (if "%python%"=="3.8" setx PYTHON_VERSION "3.8.10") & \ + (if "%python%"=="3.9" setx PYTHON_VERSION "3.9.13") & \ + (if "%python%"=="3.10" setx PYTHON_VERSION "3.10.11") & \ + (if "%python%"=="3.11" setx PYTHON_VERSION "3.11.9") & \ + (if "%python%"=="3.12" setx PYTHON_VERSION "3.12.4") & \ + (if "%python%"=="3.13" setx PYTHON_VERSION "3.13.0-rc1") # Install archiver to extract xz archives -RUN choco install -r -y --pre --no-progress python --version=%PYTHON_VERSION% & \ - python -m pip install --no-cache-dir -U pip setuptools & \ +RUN choco install -r -y --pre --no-progress --force python --version=%PYTHON_VERSION% && \ choco install --no-progress -r -y archiver + +ENV PYTHON=$python diff --git a/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile b/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile index e17c0306f115d..4d867a448c994 100644 --- a/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile +++ 
b/ci/docker/ubuntu-20.04-cpp-minimal.dockerfile @@ -33,6 +33,7 @@ RUN apt-get update -y -q && \ libssl-dev \ libcurl4-openssl-dev \ python3-pip \ + python3-venv \ tzdata \ wget && \ apt-get clean && \ diff --git a/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile b/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile index 341d8a87e8661..f26cad51f0983 100644 --- a/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile +++ b/ci/docker/ubuntu-22.04-cpp-minimal.dockerfile @@ -33,6 +33,7 @@ RUN apt-get update -y -q && \ libssl-dev \ libcurl4-openssl-dev \ python3-pip \ + python3-venv \ tzdata \ wget && \ apt-get clean && \ diff --git a/ci/docker/ubuntu-24.04-cpp-minimal.dockerfile b/ci/docker/ubuntu-24.04-cpp-minimal.dockerfile index a995ab2a8bc2d..125bc7ba46a81 100644 --- a/ci/docker/ubuntu-24.04-cpp-minimal.dockerfile +++ b/ci/docker/ubuntu-24.04-cpp-minimal.dockerfile @@ -33,6 +33,7 @@ RUN apt-get update -y -q && \ libssl-dev \ libcurl4-openssl-dev \ python3-pip \ + python3-venv \ tzdata \ tzdata-legacy \ wget && \ diff --git a/ci/scripts/install_gcs_testbench.bat b/ci/scripts/install_gcs_testbench.bat index b03d0c2ad6608..f54f98db7cac8 100644 --- a/ci/scripts/install_gcs_testbench.bat +++ b/ci/scripts/install_gcs_testbench.bat @@ -17,9 +17,18 @@ @echo on -set GCS_TESTBENCH_VERSION="v0.36.0" +set GCS_TESTBENCH_VERSION="v0.40.0" + +set PIPX_FLAGS=--verbose +if NOT "%PIPX_PYTHON%"=="" ( + set PIPX_FLAGS=--python %PIPX_PYTHON% %PIPX_FLAGS% +) + +python -m pip install -U pipx || exit /B 1 @REM Install GCS testbench %GCS_TESTBENCH_VERSION% -python -m pip install ^ +pipx install %PIPX_FLAGS% ^ "https://github.com/googleapis/storage-testbench/archive/%GCS_TESTBENCH_VERSION%.tar.gz" ^ || exit /B 1 + +pipx list --verbose diff --git a/ci/scripts/install_gcs_testbench.sh b/ci/scripts/install_gcs_testbench.sh index 5471b3cc238ca..78826e94d3294 100755 --- a/ci/scripts/install_gcs_testbench.sh +++ b/ci/scripts/install_gcs_testbench.sh @@ -17,7 +17,7 @@ # specific language governing permissions and limitations # under the License. -set -e +set -ex if [ "$#" -ne 1 ]; then echo "Usage: $0 " @@ -34,19 +34,23 @@ case "$(uname -m)" in ;; esac -# On newer pythons install into the system will fail, so override that -export PIP_BREAK_SYSTEM_PACKAGES=1 - version=$1 if [[ "${version}" -eq "default" ]]; then version="v0.39.0" - # Latests versions of Testbench require newer setuptools - python3 -m pip install --upgrade setuptools fi +: ${PIPX_PYTHON:=$(which python3)} + +export PIP_BREAK_SYSTEM_PACKAGES=1 +${PIPX_PYTHON} -m pip install -U pipx + # This script is run with PYTHON undefined in some places, # but those only use older pythons. 
if [[ -z "${PYTHON_VERSION}" ]] || [[ "${PYTHON_VERSION}" != "3.13" ]]; then - python3 -m pip install \ - "https://github.com/googleapis/storage-testbench/archive/${version}.tar.gz" + pipx_flags=--verbose + if [[ $(id -un) == "root" ]]; then + # Install globally as /root/.local/bin is typically not in $PATH + pipx_flags="${pipx_flags} --global" + fi + ${PIPX_PYTHON} -m pipx install ${pipx_flags} "https://github.com/googleapis/storage-testbench/archive/${version}.tar.gz" fi diff --git a/ci/scripts/python_wheel_windows_test.bat b/ci/scripts/python_wheel_windows_test.bat index 87c0bb1252024..cac3f18434b6c 100755 --- a/ci/scripts/python_wheel_windows_test.bat +++ b/ci/scripts/python_wheel_windows_test.bat @@ -37,28 +37,32 @@ set PYARROW_TEST_TENSORFLOW=ON set ARROW_TEST_DATA=C:\arrow\testing\data set PARQUET_TEST_DATA=C:\arrow\cpp\submodules\parquet-testing\data -@REM Install testing dependencies -pip install -r C:\arrow\python\requirements-wheel-test.txt || exit /B 1 +@REM List installed Pythons +py -0p + +set PYTHON_CMD=py -%PYTHON% -@REM Install GCS testbench -call "C:\arrow\ci\scripts\install_gcs_testbench.bat" +%PYTHON_CMD% -m pip install -U pip setuptools || exit /B 1 + +@REM Install testing dependencies +%PYTHON_CMD% -m pip install -r C:\arrow\python\requirements-wheel-test.txt || exit /B 1 @REM Install the built wheels -python -m pip install --no-index --find-links=C:\arrow\python\dist\ pyarrow || exit /B 1 +%PYTHON_CMD% -m pip install --no-index --find-links=C:\arrow\python\dist\ pyarrow || exit /B 1 @REM Test that the modules are importable -python -c "import pyarrow" || exit /B 1 -python -c "import pyarrow._gcsfs" || exit /B 1 -python -c "import pyarrow._hdfs" || exit /B 1 -python -c "import pyarrow._s3fs" || exit /B 1 -python -c "import pyarrow.csv" || exit /B 1 -python -c "import pyarrow.dataset" || exit /B 1 -python -c "import pyarrow.flight" || exit /B 1 -python -c "import pyarrow.fs" || exit /B 1 -python -c "import pyarrow.json" || exit /B 1 -python -c "import pyarrow.orc" || exit /B 1 -python -c "import pyarrow.parquet" || exit /B 1 -python -c "import pyarrow.substrait" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow._gcsfs" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow._hdfs" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow._s3fs" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow.csv" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow.dataset" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow.flight" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow.fs" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow.json" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow.orc" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow.parquet" || exit /B 1 +%PYTHON_CMD% -c "import pyarrow.substrait" || exit /B 1 @rem Download IANA Timezone Database for ORC C++ curl https://cygwin.osuosl.org/noarch/release/tzdata/tzdata-2024a-1.tar.xz --output tzdata.tar.xz || exit /B @@ -67,4 +71,4 @@ arc unarchive tzdata.tar.xz %USERPROFILE%\Downloads\test\tzdata set TZDIR=%USERPROFILE%\Downloads\test\tzdata\usr\share\zoneinfo @REM Execute unittest -pytest -r s --pyargs pyarrow || exit /B 1 +%PYTHON_CMD% -m pytest -r s --pyargs pyarrow || exit /B 1 diff --git a/cpp/src/arrow/filesystem/gcsfs_test.cc b/cpp/src/arrow/filesystem/gcsfs_test.cc index a6022a8d21681..2098cf4d7f319 100644 --- a/cpp/src/arrow/filesystem/gcsfs_test.cc +++ b/cpp/src/arrow/filesystem/gcsfs_test.cc @@ -95,44 +95,41 @@ class GcsTestbench : public ::testing::Environment { if (const auto* env = std::getenv("PYTHON")) { names = {env}; 
   }
-    auto error = std::string(
-        "Could not start GCS emulator."
-        " Used the following list of python interpreter names:");
-    for (const auto& interpreter : names) {
-      auto exe_path = bp::search_path(interpreter);
-      error += " " + interpreter;
-      if (exe_path.empty()) {
-        error += " (exe not found)";
-        continue;
-      }
+    auto error = std::string("Could not start GCS emulator 'storage-testbench'");
 
-      bp::ipstream output;
-      server_process_ = bp::child(exe_path, "-m", "testbench", "--port", port_, group_,
-                                  bp::std_err > output);
+    auto testbench_is_running = [](bp::child& process, bp::ipstream& output) {
       // Wait for message: "* Restarting with"
-      auto testbench_is_running = [&output, this](bp::child& process) {
-        std::string line;
-        std::chrono::time_point<std::chrono::steady_clock> end =
-            std::chrono::steady_clock::now() + std::chrono::seconds(10);
-        while (server_process_.valid() && server_process_.running() &&
-               std::chrono::steady_clock::now() < end) {
-          if (output.peek() && std::getline(output, line)) {
-            std::cerr << line << std::endl;
-            if (line.find("* Restarting with") != std::string::npos) return true;
-          } else {
-            std::this_thread::sleep_for(std::chrono::milliseconds(20));
-          }
+      std::string line;
+      std::chrono::time_point<std::chrono::steady_clock> end =
+          std::chrono::steady_clock::now() + std::chrono::seconds(10);
+      while (process.valid() && process.running() &&
+             std::chrono::steady_clock::now() < end) {
+        if (output.peek() && std::getline(output, line)) {
+          std::cerr << line << std::endl;
+          if (line.find("* Restarting with") != std::string::npos) return true;
+        } else {
+          std::this_thread::sleep_for(std::chrono::milliseconds(20));
         }
-        return false;
-      };
+      }
+      return false;
+    };
 
-      if (testbench_is_running(server_process_)) break;
-      error += " (failed to start)";
-      server_process_.terminate();
-      server_process_.wait();
+    auto exe_path = bp::search_path("storage-testbench");
+    if (!exe_path.empty()) {
+      bp::ipstream output;
+      server_process_ =
+          bp::child(exe_path, "--port", port_, group_, bp::std_err > output);
+      if (!testbench_is_running(server_process_, output)) {
+        error += " (failed to start)";
+        server_process_.terminate();
+        server_process_.wait();
+      }
+    } else {
+      error += " (exe not found)";
+    }
+    if (!server_process_.valid()) {
+      error_ = std::move(error);
     }
-    if (server_process_.valid() && server_process_.valid()) return;
-    error_ = std::move(error);
   }
 
   bool running() { return server_process_.running(); }
@@ -140,7 +137,10 @@ class GcsTestbench : public ::testing::Environment {
   ~GcsTestbench() override {
     // Brutal shutdown, kill the full process group because the GCS testbench may launch
     // additional children.
-    group_.terminate();
+    try {
+      group_.terminate();
+    } catch (bp::process_error&) {
+    }
     if (server_process_.valid()) {
       server_process_.wait();
     }
diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py
index e1919497b5116..7a222cec8a7c4 100644
--- a/python/pyarrow/tests/conftest.py
+++ b/python/pyarrow/tests/conftest.py
@@ -233,17 +233,16 @@ def minio_server_health_check(address):
 def gcs_server():
     port = find_free_port()
     env = os.environ.copy()
-    args = [sys.executable, '-m', 'testbench', '--port', str(port)]
+    exe = 'storage-testbench'
+    args = [exe, '--port', str(port)]
     proc = None
     try:
-        # check first if testbench module is available
-        import testbench  # noqa:F401
         # start server
         proc = subprocess.Popen(args, env=env)
         # Make sure the server is alive.
if proc.poll() is not None: pytest.skip(f"Command {args} did not start server successfully!") - except (ModuleNotFoundError, OSError) as e: + except OSError as e: pytest.skip(f"Command {args} failed to execute: {e}") else: yield { diff --git a/python/scripts/run_emscripten_tests.py b/python/scripts/run_emscripten_tests.py index 1a4b4a4e05614..53d3dd52bd8a6 100644 --- a/python/scripts/run_emscripten_tests.py +++ b/python/scripts/run_emscripten_tests.py @@ -335,7 +335,7 @@ def _load_pyarrow_in_runner(driver, wheel_name): """ import pyarrow,pathlib pyarrow_dir = pathlib.Path(pyarrow.__file__).parent -pytest.main([pyarrow_dir, '-v']) +pytest.main([pyarrow_dir, '-r', 's']) """, wait_for_terminate=False, ) diff --git a/r/tests/testthat/test-gcs.R b/r/tests/testthat/test-gcs.R index d671c12138c60..54159e82ca60f 100644 --- a/r/tests/testthat/test-gcs.R +++ b/r/tests/testthat/test-gcs.R @@ -116,12 +116,12 @@ test_that("GcsFileSystem$create() can read json_credentials", { }) skip_on_cran() -skip_if_not(system('python -c "import testbench"') == 0, message = "googleapis-storage-testbench is not installed.") +skip_if_not(system("storage-testbench -h") == 0, message = "googleapis-storage-testbench is not installed.") library(dplyr) testbench_port <- Sys.getenv("TESTBENCH_PORT", "9001") -pid_minio <- sys::exec_background("python", c("-m", "testbench", "--port", testbench_port), +pid_minio <- sys::exec_background("storage-testbench", c("--port", testbench_port), std_out = FALSE, std_err = FALSE # TODO: is there a good place to send output? ) From 6c17b794509d3931225cf295ae864204162c786f Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Thu, 29 Aug 2024 17:53:14 +0900 Subject: [PATCH 4/8] GH-43877: [Ruby] Add support for 0 decimal value (#43882) ### Rationale for this change Apache Arrow C++ may use "0.EXXX" string such as "0.E-9" for 0 decimal value. Ruby's BigDecimal doesn't accept it. ### What changes are included in this PR? We convert "0.EXXX" to "0.0EXXX" in Ruby. ### Are these changes tested? Yes. ### Are there any user-facing changes? Yes. 
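To make the failure concrete, a minimal standalone reproduction and the workaround applied below (a sketch; the `"0.E-9"` sample comes from the rationale above):

```ruby
require "bigdecimal"

# Arrow C++ may format a zero decimal as "0.E-9" (no digit between the
# decimal point and the exponent), a spelling Ruby's BigDecimal rejects:
#   BigDecimal("0.E-9")  # => ArgumentError
string = "0.E-9"
# Inserting a "0" after the point yields an accepted spelling:
string = string.sub(".E", ".0E") if string.include?(".E")
p BigDecimal(string)  # => 0.0
```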
* GitHub Issue: #43877 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- ruby/red-arrow/lib/arrow/decimal128-array.rb | 4 +++- ruby/red-arrow/lib/arrow/decimal256-array.rb | 4 +++- ruby/red-arrow/test/test-decimal128-array.rb | 6 ++++++ ruby/red-arrow/test/test-decimal256-array.rb | 6 ++++++ 4 files changed, 18 insertions(+), 2 deletions(-) diff --git a/ruby/red-arrow/lib/arrow/decimal128-array.rb b/ruby/red-arrow/lib/arrow/decimal128-array.rb index a5ee53be7b229..528c878a859b5 100644 --- a/ruby/red-arrow/lib/arrow/decimal128-array.rb +++ b/ruby/red-arrow/lib/arrow/decimal128-array.rb @@ -18,7 +18,9 @@ module Arrow class Decimal128Array def get_value(i) - BigDecimal(format_value(i)) + string = format_value(i) + string.sub!(".E", ".0E") if string.include?(".E") + BigDecimal(string) end end end diff --git a/ruby/red-arrow/lib/arrow/decimal256-array.rb b/ruby/red-arrow/lib/arrow/decimal256-array.rb index 8c2306dfe3627..32841ca4862f5 100644 --- a/ruby/red-arrow/lib/arrow/decimal256-array.rb +++ b/ruby/red-arrow/lib/arrow/decimal256-array.rb @@ -19,7 +19,9 @@ module Arrow class Decimal256Array # @since 3.0.0 def get_value(i) - BigDecimal(format_value(i)) + string = format_value(i) + string.sub!(".E", ".0E") if string.include?(".E") + BigDecimal(string) end end end diff --git a/ruby/red-arrow/test/test-decimal128-array.rb b/ruby/red-arrow/test/test-decimal128-array.rb index a50e2cf4a4832..a6e7c4e1ac433 100644 --- a/ruby/red-arrow/test/test-decimal128-array.rb +++ b/ruby/red-arrow/test/test-decimal128-array.rb @@ -38,4 +38,10 @@ class Decimal128ArrayTest < Test::Unit::TestCase array.to_a) end end + + def test_zero + array = Arrow::Decimal128Array.new({precision: 38, scale: 9}, + [BigDecimal("0")]) + assert_equal(BigDecimal("0"), array[0]) + end end diff --git a/ruby/red-arrow/test/test-decimal256-array.rb b/ruby/red-arrow/test/test-decimal256-array.rb index ed542f2d6c75e..053e948fc84b7 100644 --- a/ruby/red-arrow/test/test-decimal256-array.rb +++ b/ruby/red-arrow/test/test-decimal256-array.rb @@ -38,4 +38,10 @@ class Decimal256ArrayTest < Test::Unit::TestCase array.to_a) end end + + def test_zero + array = Arrow::Decimal256Array.new({precision: 38, scale: 9}, + [BigDecimal("0")]) + assert_equal(BigDecimal("0"), array[0]) + end end From 30893876e0650d9c3c003c5646f94c274ade9669 Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Thu, 29 Aug 2024 19:09:52 +0800 Subject: [PATCH 5/8] GH-43870: [C++][Acero] Fix typos in join benchmark (#43871) ### Rationale for this change These are rather obvious typos. ### What changes are included in this PR? ### Are these changes tested? ### Are there any user-facing changes? 
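(Presumably none: per the diffstat, only benchmark code is touched.)

For reference, both fixed lines were self-assignments, which compile cleanly but are silent no-ops; a minimal sketch of the bug class (the `Settings` struct here is hypothetical, not the benchmark's actual type):

```cpp
struct Settings {
  int num_build_batches = 1;
  int num_probe_batches = 1;
};

int main() {
  Settings settings;
  settings.num_build_batches = 1024;
  // The typo: assigning a member to itself changes nothing, so the probe
  // count silently keeps its default instead of following the build count.
  settings.num_probe_batches = settings.num_probe_batches;
  // The intended assignment:
  settings.num_probe_batches = settings.num_build_batches;
  return 0;
}
```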
* GitHub Issue: #43870

Authored-by: Ruoxi Sun
Signed-off-by: Antoine Pitrou
---
 cpp/src/arrow/acero/hash_join_benchmark.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpp/src/arrow/acero/hash_join_benchmark.cc b/cpp/src/arrow/acero/hash_join_benchmark.cc
index 470960b1c5062..e3e37e249e6a3 100644
--- a/cpp/src/arrow/acero/hash_join_benchmark.cc
+++ b/cpp/src/arrow/acero/hash_join_benchmark.cc
@@ -104,7 +104,7 @@ class JoinBenchmark {
       key_cmp.push_back(JoinKeyCmp::EQ);
     }
 
-    for (size_t i = 0; i < settings.build_payload_types.size(); i++) {
+    for (size_t i = 0; i < settings.probe_payload_types.size(); i++) {
       std::string name = "lp" + std::to_string(i);
       DCHECK_OK(l_schema_builder.AddField(field(name, settings.probe_payload_types[i])));
     }
@@ -279,7 +279,7 @@ static void BM_HashJoinBasic_MatchesPerRow(benchmark::State& st) {
 
   settings.cardinality = 1.0 / static_cast<double>(st.range(0));
   settings.num_build_batches = static_cast<int>(st.range(1));
-  settings.num_probe_batches = settings.num_probe_batches;
+  settings.num_probe_batches = settings.num_build_batches;
 
   HashJoinBasicBenchmarkImpl(st, settings);
 }
@@ -291,7 +291,7 @@ static void BM_HashJoinBasic_PayloadSize(benchmark::State& st) {
 
   settings.cardinality = 1.0 / static_cast<double>(st.range(1));
   settings.num_build_batches = static_cast<int>(st.range(2));
-  settings.num_probe_batches = settings.num_probe_batches;
+  settings.num_probe_batches = settings.num_build_batches;
 
   HashJoinBasicBenchmarkImpl(st, settings);
 }

From 6db12f2ca7cccb5f90e1cd0e753d5e92fe3b17bd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?=
Date: Thu, 29 Aug 2024 13:36:17 +0200
Subject: [PATCH 6/8] GH-41696: [Python][Packaging] Bump
 MACOSX_DEPLOYMENT_TARGET to 12 instead of 11 (#43137)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### Rationale for this change

As shown on the associated issue, there seems to be a problem with `MACOSX_DEPLOYMENT_TARGET` 11 on the wheels.

### What changes are included in this PR?

Update `MACOSX_DEPLOYMENT_TARGET` everywhere to the latest supported macOS version.

### Are these changes tested?

Via CI, even though the issue was not reproducible on CI.

### Are there any user-facing changes?

Yes, wheels won't be available for macOS 11 but those were crashing on the previous release.
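(As an aside, one way to confirm the target baked into a built artifact; this check is not part of the PR, and `libarrow.dylib` stands in for any produced binary:)

```sh
# Mach-O binaries record their minimum supported OS in LC_BUILD_VERSION
# (newer toolchains) or LC_VERSION_MIN_MACOSX (older ones); after this
# change the reported minos/version should read 12.0 instead of 10.15/11.0.
otool -l libarrow.dylib | grep -E -A3 'LC_BUILD_VERSION|LC_VERSION_MIN_MACOSX'
```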
* GitHub Issue: #41696 Authored-by: Raúl Cumplido Signed-off-by: Raúl Cumplido --- .github/workflows/python.yml | 2 +- ci/scripts/python_wheel_macos_build.sh | 2 +- ci/vcpkg/arm64-osx-static-debug.cmake | 2 +- ci/vcpkg/arm64-osx-static-release.cmake | 2 +- ci/vcpkg/universal2-osx-static-debug.cmake | 2 +- ci/vcpkg/universal2-osx-static-release.cmake | 2 +- cpp/src/arrow/flight/CMakeLists.txt | 6 ++++++ dev/tasks/tasks.yml | 10 +++++----- dev/tasks/verify-rc/github.macos.yml | 2 +- python/CMakeLists.txt | 2 +- ruby/red-arrow/ext/arrow/extconf.rb | 2 +- 11 files changed, 20 insertions(+), 14 deletions(-) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 916db2580e371..854d792f3100d 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -163,7 +163,7 @@ jobs: ARROW_BUILD_TESTS: OFF PYARROW_TEST_LARGE_MEMORY: ON # Current oldest supported version according to https://endoflife.date/macos - MACOSX_DEPLOYMENT_TARGET: 10.15 + MACOSX_DEPLOYMENT_TARGET: 12.0 steps: - name: Checkout Arrow uses: actions/checkout@v4 diff --git a/ci/scripts/python_wheel_macos_build.sh b/ci/scripts/python_wheel_macos_build.sh index d5430f26748eb..92b962f1740bd 100755 --- a/ci/scripts/python_wheel_macos_build.sh +++ b/ci/scripts/python_wheel_macos_build.sh @@ -34,7 +34,7 @@ rm -rf ${source_dir}/python/pyarrow/*.so.* echo "=== (${PYTHON_VERSION}) Set SDK, C++ and Wheel flags ===" export _PYTHON_HOST_PLATFORM="macosx-${MACOSX_DEPLOYMENT_TARGET}-${arch}" -export MACOSX_DEPLOYMENT_TARGET=${MACOSX_DEPLOYMENT_TARGET:-10.15} +export MACOSX_DEPLOYMENT_TARGET=${MACOSX_DEPLOYMENT_TARGET:-12.0} export SDKROOT=${SDKROOT:-$(xcrun --sdk macosx --show-sdk-path)} if [ $arch = "arm64" ]; then diff --git a/ci/vcpkg/arm64-osx-static-debug.cmake b/ci/vcpkg/arm64-osx-static-debug.cmake index f511819a2edd9..32ae7bc433489 100644 --- a/ci/vcpkg/arm64-osx-static-debug.cmake +++ b/ci/vcpkg/arm64-osx-static-debug.cmake @@ -21,6 +21,6 @@ set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_CMAKE_SYSTEM_NAME Darwin) set(VCPKG_OSX_ARCHITECTURES arm64) -set(VCPKG_OSX_DEPLOYMENT_TARGET "11.0") +set(VCPKG_OSX_DEPLOYMENT_TARGET "12.0") set(VCPKG_BUILD_TYPE debug) diff --git a/ci/vcpkg/arm64-osx-static-release.cmake b/ci/vcpkg/arm64-osx-static-release.cmake index 43d65efb2651b..dde46cd763afe 100644 --- a/ci/vcpkg/arm64-osx-static-release.cmake +++ b/ci/vcpkg/arm64-osx-static-release.cmake @@ -21,6 +21,6 @@ set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_CMAKE_SYSTEM_NAME Darwin) set(VCPKG_OSX_ARCHITECTURES arm64) -set(VCPKG_OSX_DEPLOYMENT_TARGET "11.0") +set(VCPKG_OSX_DEPLOYMENT_TARGET "12.0") set(VCPKG_BUILD_TYPE release) diff --git a/ci/vcpkg/universal2-osx-static-debug.cmake b/ci/vcpkg/universal2-osx-static-debug.cmake index 8abc1ebf838f1..d3ef0d67eb719 100644 --- a/ci/vcpkg/universal2-osx-static-debug.cmake +++ b/ci/vcpkg/universal2-osx-static-debug.cmake @@ -21,6 +21,6 @@ set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_CMAKE_SYSTEM_NAME Darwin) set(VCPKG_OSX_ARCHITECTURES "x86_64;arm64") -set(VCPKG_OSX_DEPLOYMENT_TARGET "10.15") +set(VCPKG_OSX_DEPLOYMENT_TARGET "12.0") set(VCPKG_BUILD_TYPE debug) diff --git a/ci/vcpkg/universal2-osx-static-release.cmake b/ci/vcpkg/universal2-osx-static-release.cmake index 2eb36c15175b2..3018aa93e5fbb 100644 --- a/ci/vcpkg/universal2-osx-static-release.cmake +++ b/ci/vcpkg/universal2-osx-static-release.cmake @@ -21,6 +21,6 @@ set(VCPKG_LIBRARY_LINKAGE static) set(VCPKG_CMAKE_SYSTEM_NAME Darwin) set(VCPKG_OSX_ARCHITECTURES "x86_64;arm64") -set(VCPKG_OSX_DEPLOYMENT_TARGET "10.15") 
+set(VCPKG_OSX_DEPLOYMENT_TARGET "12.0") set(VCPKG_BUILD_TYPE release) diff --git a/cpp/src/arrow/flight/CMakeLists.txt b/cpp/src/arrow/flight/CMakeLists.txt index 98f93705f6f56..835c4fc83bf18 100644 --- a/cpp/src/arrow/flight/CMakeLists.txt +++ b/cpp/src/arrow/flight/CMakeLists.txt @@ -26,6 +26,12 @@ endif() if(WIN32) list(APPEND ARROW_FLIGHT_LINK_LIBS ws2_32.lib) endif() +# Updating the MACOSX_DEPLOYMENT_TARGET to 12 required us to explicitly +# link Flight with OpenSSL on macOS. Read this comment for more details: +# https://github.com/apache/arrow/pull/43137#pullrequestreview-2267476893 +if(APPLE AND ARROW_USE_OPENSSL) + list(APPEND ARROW_FLIGHT_LINK_LIBS ${ARROW_OPENSSL_LIBS}) +endif() set(ARROW_FLIGHT_TEST_LINKAGE "${ARROW_TEST_LINKAGE}") if(Protobuf_USE_STATIC_LIBS) diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index cae34c3231381..7f52fe7b05232 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -413,7 +413,7 @@ tasks: {############################## Wheel macOS ####################################} -{% for macos_version, macos_codename in [("10.15", "catalina")] %} +{% for macos_version, macos_codename in [("12.0", "monterey")] %} {% set platform_tag = "macosx_{}_x86_64".format(macos_version.replace('.', '_')) %} wheel-macos-{{ macos_codename }}-{{ python_tag }}-amd64: @@ -424,25 +424,25 @@ tasks: arrow_jemalloc: "ON" python_version: "{{ python_version }}" macos_deployment_target: "{{ macos_version }}" - runs_on: "macos-13" + runs_on: "macos-12" vcpkg_arch: "amd64" artifacts: - pyarrow-{no_rc_version}-{{ python_tag }}-{{ abi_tag }}-{{ platform_tag }}.whl {% endfor %} - wheel-macos-big-sur-{{ python_tag }}-arm64: + wheel-macos-monterey-{{ python_tag }}-arm64: ci: github template: python-wheels/github.osx.yml params: arch: "arm64" arrow_jemalloc: "OFF" python_version: "{{ python_version }}" - macos_deployment_target: "11.0" + macos_deployment_target: "12.0" runs_on: "macos-14" vcpkg_arch: "arm64" artifacts: - - pyarrow-{no_rc_version}-{{ python_tag }}-{{ python_tag }}-macosx_11_0_arm64.whl + - pyarrow-{no_rc_version}-{{ python_tag }}-{{ python_tag }}-macosx_12_0_arm64.whl {############################## Wheel Windows ################################} diff --git a/dev/tasks/verify-rc/github.macos.yml b/dev/tasks/verify-rc/github.macos.yml index 4bc3fff71b64a..e2bc7895c6d05 100644 --- a/dev/tasks/verify-rc/github.macos.yml +++ b/dev/tasks/verify-rc/github.macos.yml @@ -22,7 +22,7 @@ {% set use_conda = use_conda|default(False) %} # env: is generated by macros.github_header() # Current oldest supported version according to https://endoflife.date/macos - MACOSX_DEPLOYMENT_TARGET: "10.15" + MACOSX_DEPLOYMENT_TARGET: "12.0" jobs: verify: diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 5d5eeaf8157b4..1a18b2b173acb 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -84,7 +84,7 @@ set(CMAKE_MACOSX_RPATH 1) if(DEFINED ENV{MACOSX_DEPLOYMENT_TARGET}) set(CMAKE_OSX_DEPLOYMENT_TARGET $ENV{MACOSX_DEPLOYMENT_TARGET}) else() - set(CMAKE_OSX_DEPLOYMENT_TARGET 10.15) + set(CMAKE_OSX_DEPLOYMENT_TARGET 12.0) endif() # Generate a Clang compile_commands.json "compilation database" file for use diff --git a/ruby/red-arrow/ext/arrow/extconf.rb b/ruby/red-arrow/ext/arrow/extconf.rb index 28ccd0b2d59e1..a3005cd56f270 100644 --- a/ruby/red-arrow/ext/arrow/extconf.rb +++ b/ruby/red-arrow/ext/arrow/extconf.rb @@ -91,7 +91,7 @@ symbols_in_external_bundles.each do |symbol| $DLDFLAGS << " -Wl,-U,#{symbol}" end - mmacosx_version_min = "-mmacosx-version-min=10.15" + 
mmacosx_version_min = "-mmacosx-version-min=12.0" $CFLAGS << " #{mmacosx_version_min}" $CXXFLAGS << " #{mmacosx_version_min}" end From 45592f9e1d98da75a7bdc534375b32a004f13e02 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Thu, 29 Aug 2024 22:53:54 +0800 Subject: [PATCH 7/8] GH-43732: [Go] Require Go 1.22 or above (#43864) ### Rationale for this change https://github.com/apache/arrow/issues/43732 ### What changes are included in this PR? ### Are these changes tested? ### Are there any user-facing changes? * GitHub Issue: #43732 Authored-by: Xin Hao Signed-off-by: Matt Topol --- .env | 4 ++-- .github/workflows/go.yml | 22 +++++++++++----------- ci/docker/conda-integration.dockerfile | 2 +- ci/docker/debian-12-go.dockerfile | 4 ++-- dev/release/verify-release-candidate.sh | 8 ++++---- dev/tasks/tasks.yml | 2 +- go/arrow/compute/cast_test.go | 2 +- go/arrow/scalar/parse.go | 2 +- go/go.mod | 2 +- go/parquet/file/file_reader.go | 2 +- go/parquet/schema/reflection.go | 8 ++++---- 11 files changed, 29 insertions(+), 29 deletions(-) diff --git a/.env b/.env index 21f904c3208f6..af647fc8b7a7f 100644 --- a/.env +++ b/.env @@ -58,8 +58,8 @@ CUDA=11.2.2 DASK=latest DOTNET=8.0 GCC_VERSION="" -GO=1.21.8 -STATICCHECK=v0.4.7 +GO=1.22.6 +STATICCHECK=v0.5.1 HDFS=3.2.1 JDK=11 KARTOTHEK=latest diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index ffd543691d5b2..b9a19d182d5c4 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -62,13 +62,13 @@ jobs: { "arch-label": "AMD64", "arch": "amd64", - "go": "1.21", + "go": "1.22", "runs-on": "ubuntu-latest" }, { "arch-label": "AMD64", "arch": "amd64", - "go": "1.22", + "go": "1.23", "runs-on": "ubuntu-latest" } JSON @@ -78,13 +78,13 @@ jobs: { "arch-label": "ARM64", "arch": "arm64v8", - "go": "1.21", + "go": "1.22", "runs-on": ["self-hosted", "arm", "linux"] }, { "arch-label": "ARM64", "arch": "arm64v8", - "go": "1.22", + "go": "1.23", "runs-on": ["self-hosted", "arm", "linux"] } JSON @@ -197,7 +197,7 @@ jobs: strategy: fail-fast: false matrix: - go: ['1.21', '1.22'] + go: ['1.22', '1.23'] env: GO: ${{ matrix.go }} steps: @@ -238,7 +238,7 @@ jobs: strategy: fail-fast: false matrix: - go: ['1.21', '1.22'] + go: ['1.22', '1.23'] env: GO: ${{ matrix.go }} steps: @@ -277,7 +277,7 @@ jobs: strategy: fail-fast: false matrix: - go: ['1.21', '1.22'] + go: ['1.22', '1.23'] steps: - name: Checkout Arrow uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 @@ -310,7 +310,7 @@ jobs: strategy: fail-fast: false matrix: - go: ['1.21', '1.22'] + go: ['1.22', '1.23'] steps: - name: Checkout Arrow uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 @@ -323,7 +323,7 @@ jobs: go-version: ${{ matrix.go }} cache: true cache-dependency-path: go/go.sum - - name: Install staticcheck + - name: Install staticcheck run: | . .env go install honnef.co/go/tools/cmd/staticcheck@${STATICCHECK} @@ -368,7 +368,7 @@ jobs: strategy: fail-fast: false matrix: - go: ['1.21', '1.22'] + go: ['1.22', '1.23'] env: ARROW_GO_TESTCGO: "1" steps: @@ -439,7 +439,7 @@ jobs: ci/scripts/msys2_setup.sh cgo - name: Get required Go version run: | - (. .env && echo "GO_VERSION=${GO}") >> $GITHUB_ENV + (. 
.env && echo "GO_VERSION=${GO}") >> $GITHUB_ENV - name: Update CGO Env vars shell: msys2 {0} run: | diff --git a/ci/docker/conda-integration.dockerfile b/ci/docker/conda-integration.dockerfile index c602490d6b729..7ad2e5c0e8008 100644 --- a/ci/docker/conda-integration.dockerfile +++ b/ci/docker/conda-integration.dockerfile @@ -24,7 +24,7 @@ ARG maven=3.8.7 ARG node=16 ARG yarn=1.22 ARG jdk=11 -ARG go=1.21.8 +ARG go=1.22.6 # Install Archery and integration dependencies COPY ci/conda_env_archery.txt /arrow/ci/ diff --git a/ci/docker/debian-12-go.dockerfile b/ci/docker/debian-12-go.dockerfile index c958e6bdee211..4bc683c109eb8 100644 --- a/ci/docker/debian-12-go.dockerfile +++ b/ci/docker/debian-12-go.dockerfile @@ -16,8 +16,8 @@ # under the License. ARG arch=amd64 -ARG go=1.21 -ARG staticcheck=v0.4.7 +ARG go=1.22 +ARG staticcheck=v0.5.1 FROM ${arch}/golang:${go}-bookworm # FROM collects all the args, get back the staticcheck version arg diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index 07e765a759ea0..cdea4ca0d00a1 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -24,7 +24,7 @@ # - JDK >= 11 # - gcc >= 4.8 # - Node.js >= 18 -# - Go >= 1.21 +# - Go >= 1.22 # - Docker # # If using a non-system Boost, set BOOST_ROOT and add Boost libraries to @@ -403,7 +403,7 @@ install_go() { return 0 fi - local version=1.21.8 + local version=1.22.6 show_info "Installing go version ${version}..." local arch="$(uname -m)" @@ -512,7 +512,7 @@ install_maven() { show_info "System Maven version ${SYSTEM_MAVEN_VERSION} matches required Maven version ${MAVEN_VERSION}. Skipping installation." else # Append pipe character to make preview release versions like "X.Y.Z-beta-1" sort - # as older than their corresponding release version "X.Y.Z". This works because + # as older than their corresponding release version "X.Y.Z". This works because # `sort -V` orders the pipe character lower than any version number character. older_version=$(printf '%s\n%s\n' "$SYSTEM_MAVEN_VERSION" "$MAVEN_VERSION" | sed 's/$/|/' | sort -V | sed 's/|$//' | head -n1) if [[ "$older_version" == "$SYSTEM_MAVEN_VERSION" ]]; then @@ -953,7 +953,7 @@ test_go() { show_header "Build and test Go libraries" maybe_setup_go - maybe_setup_conda compilers go=1.21 + maybe_setup_conda compilers go=1.22 pushd go go get -v ./... 
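(Aside: beyond the version bumps, the Go diffs that follow also move off two patterns flagged by newer tooling, `reflect.PtrTo` and redundant nil-plus-length checks. A runnable sketch of both under Go 1.22, with illustrative values:)

```go
package main

import (
	"fmt"
	"reflect"
)

func main() {
	// reflect.PtrTo is deprecated; reflect.PointerTo is the identical
	// replacement adopted in the diffs below.
	t := reflect.PointerTo(reflect.TypeOf(int64(0)))
	fmt.Println(t) // *int64

	// For slices, `s != nil && len(s) > 0` is redundant: len of a nil
	// slice is 0, so the length check alone is equivalent.
	var s []byte
	fmt.Println(len(s) > 0) // false
}
```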
diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 7f52fe7b05232..c6d2f2175d44c 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1475,7 +1475,7 @@ tasks: R_PRUNE_DEPS: TRUE image: r-clang-sanitizer - {% for go_version, staticcheck in [("1.21", "v0.4.7"), ("1.22", "latest")] %} + {% for go_version, staticcheck in [("1.22", "v0.5.1"), ("1.23", "latest")] %} test-debian-12-go-{{ go_version }}: ci: github template: docker-tests/github.linux.yml diff --git a/go/arrow/compute/cast_test.go b/go/arrow/compute/cast_test.go index fa08467dd3946..db6098225dda8 100644 --- a/go/arrow/compute/cast_test.go +++ b/go/arrow/compute/cast_test.go @@ -129,7 +129,7 @@ func checkScalarWithScalars(t *testing.T, funcName string, inputs []scalar.Scala fmt.Fprintf(&b, " (types differed: %s vs %s)", out.(*compute.ScalarDatum).Type(), expected.DataType()) } - t.Fatalf(b.String()) + t.Fatal(b.String()) } } diff --git a/go/arrow/scalar/parse.go b/go/arrow/scalar/parse.go index 866e627113d88..27db42afa69b1 100644 --- a/go/arrow/scalar/parse.go +++ b/go/arrow/scalar/parse.go @@ -329,7 +329,7 @@ func fromListScalar(s ListScalar, v reflect.Value) error { } case *array.Map: // only implementing slice of metadata for now - if v.Type().Elem() != reflect.PtrTo(reflect.TypeOf(arrow.Metadata{})) { + if v.Type().Elem() != reflect.PointerTo(reflect.TypeOf(arrow.Metadata{})) { return fmt.Errorf("unimplemented fromListScalar type %s to %s", arr.DataType(), v.Type().String()) } diff --git a/go/go.mod b/go/go.mod index a995eee24d563..77f98cefb0f0e 100644 --- a/go/go.mod +++ b/go/go.mod @@ -16,7 +16,7 @@ module github.com/apache/arrow/go/v18 -go 1.21 +go 1.22 require ( github.com/JohnCGriffin/overflow v0.0.0-20211019200055-46fa312c352c diff --git a/go/parquet/file/file_reader.go b/go/parquet/file/file_reader.go index f838482fbb0e9..f25b882e00647 100644 --- a/go/parquet/file/file_reader.go +++ b/go/parquet/file/file_reader.go @@ -233,7 +233,7 @@ func (f *Reader) parseMetaData() error { func (f *Reader) handleAadPrefix(fileDecrypt *parquet.FileDecryptionProperties, algo *parquet.Algorithm) (string, error) { aadPrefixInProps := fileDecrypt.AadPrefix() aadPrefix := []byte(aadPrefixInProps) - fileHasAadPrefix := algo.Aad.AadPrefix != nil && len(algo.Aad.AadPrefix) > 0 + fileHasAadPrefix := len(algo.Aad.AadPrefix) > 0 aadPrefixInFile := algo.Aad.AadPrefix if algo.Aad.SupplyAadPrefix && aadPrefixInProps == "" { diff --git a/go/parquet/schema/reflection.go b/go/parquet/schema/reflection.go index 0bec9eb599dc8..51d0a84f2244f 100644 --- a/go/parquet/schema/reflection.go +++ b/go/parquet/schema/reflection.go @@ -639,7 +639,7 @@ func typeFromNode(n Node) reflect.Type { } if n.RepetitionType() == parquet.Repetitions.Optional { - typ = reflect.PtrTo(typ) + typ = reflect.PointerTo(typ) } else if n.RepetitionType() == parquet.Repetitions.Repeated { typ = reflect.SliceOf(typ) } @@ -707,7 +707,7 @@ func typeFromNode(n Node) reflect.Type { elemType = reflect.SliceOf(elemType) } if gnode.RepetitionType() == parquet.Repetitions.Optional { - elemType = reflect.PtrTo(elemType) + elemType = reflect.PointerTo(elemType) } return elemType case ConvertedTypes.Map, ConvertedTypes.MapKeyValue: @@ -778,7 +778,7 @@ func typeFromNode(n Node) reflect.Type { mapType := reflect.MapOf(keyType, valType) if gnode.RepetitionType() == parquet.Repetitions.Optional { - mapType = reflect.PtrTo(mapType) + mapType = reflect.PointerTo(mapType) } return mapType default: @@ -796,7 +796,7 @@ func typeFromNode(n Node) reflect.Type { return 
 		return reflect.SliceOf(structType)
 	}
 	if gnode.RepetitionType() == parquet.Repetitions.Optional {
-		return reflect.PtrTo(structType)
+		return reflect.PointerTo(structType)
 	}
 	return structType
 }

From 4f91c8f144125bd147c25cb49ac0071c8d28765c Mon Sep 17 00:00:00 2001
From: mwish
Date: Thu, 29 Aug 2024 23:38:41 +0800
Subject: [PATCH 8/8] GH-43759: [C++] Acero: Minor code enhancement for Join (#43760)

### Rationale for this change

Minor style enhancement for join

### What changes are included in this PR?

Minor style enhancement for join

### Are these changes tested?

Covered by existing tests

### Are there any user-facing changes?

no

* GitHub Issue: #43759

Authored-by: mwish
Signed-off-by: mwish
---
 cpp/src/arrow/acero/hash_join_dict.cc         |  9 ++-
 cpp/src/arrow/acero/hash_join_node.cc         | 16 ++---
 cpp/src/arrow/acero/hash_join_node.h          |  6 +-
 cpp/src/arrow/acero/swiss_join.cc             |  7 +-
 cpp/src/arrow/compute/light_array_internal.cc | 68 +++++++++----------
 cpp/src/arrow/compute/light_array_internal.h  |  6 +-
 cpp/src/arrow/compute/light_array_test.cc     |  4 +-
 7 files changed, 57 insertions(+), 59 deletions(-)

diff --git a/cpp/src/arrow/acero/hash_join_dict.cc b/cpp/src/arrow/acero/hash_join_dict.cc
index 3aef08e6e9ccf..8db9dddb2c3a0 100644
--- a/cpp/src/arrow/acero/hash_join_dict.cc
+++ b/cpp/src/arrow/acero/hash_join_dict.cc
@@ -225,21 +225,20 @@ Status HashJoinDictBuild::Init(ExecContext* ctx, std::shared_ptr<Array> dictiona
     return Status::OK();
   }
 
-  dictionary_ = dictionary;
+  dictionary_ = std::move(dictionary);
 
   // Initialize encoder
   RowEncoder encoder;
-  std::vector<TypeHolder> encoder_types;
-  encoder_types.emplace_back(value_type_);
+  std::vector<TypeHolder> encoder_types{value_type_};
   encoder.Init(encoder_types, ctx);
 
   // Encode all dictionary values
-  int64_t length = dictionary->data()->length;
+  int64_t length = dictionary_->data()->length;
   if (length >= std::numeric_limits<int32_t>::max()) {
     return Status::Invalid(
         "Dictionary length in hash join must fit into signed 32-bit integer.");
   }
-  RETURN_NOT_OK(encoder.EncodeAndAppend(ExecSpan({*dictionary->data()}, length)));
+  RETURN_NOT_OK(encoder.EncodeAndAppend(ExecSpan({*dictionary_->data()}, length)));
 
   std::vector<int32_t> entries_to_take;
 
diff --git a/cpp/src/arrow/acero/hash_join_node.cc b/cpp/src/arrow/acero/hash_join_node.cc
index 67f902e64be93..80dd163ced740 100644
--- a/cpp/src/arrow/acero/hash_join_node.cc
+++ b/cpp/src/arrow/acero/hash_join_node.cc
@@ -61,30 +61,30 @@ Result<std::vector<FieldRef>> HashJoinSchema::ComputePayload(
     const std::vector<FieldRef>& filter, const std::vector<FieldRef>& keys) {
   // payload = (output + filter) - keys, with no duplicates
   std::unordered_set<int> payload_fields;
-  for (auto ref : output) {
+  for (const auto& ref : output) {
     ARROW_ASSIGN_OR_RAISE(auto match, ref.FindOne(schema));
     payload_fields.insert(match[0]);
   }
 
-  for (auto ref : filter) {
+  for (const auto& ref : filter) {
     ARROW_ASSIGN_OR_RAISE(auto match, ref.FindOne(schema));
     payload_fields.insert(match[0]);
   }
 
-  for (auto ref : keys) {
+  for (const auto& ref : keys) {
     ARROW_ASSIGN_OR_RAISE(auto match, ref.FindOne(schema));
     payload_fields.erase(match[0]);
   }
 
   std::vector<FieldRef> payload_refs;
-  for (auto ref : output) {
+  for (const auto& ref : output) {
     ARROW_ASSIGN_OR_RAISE(auto match, ref.FindOne(schema));
     if (payload_fields.find(match[0]) != payload_fields.end()) {
       payload_refs.push_back(ref);
       payload_fields.erase(match[0]);
     }
   }
-  for (auto ref : filter) {
+  for (const auto& ref : filter) {
     ARROW_ASSIGN_OR_RAISE(auto match, ref.FindOne(schema));
     if (payload_fields.find(match[0]) != payload_fields.end()) {
       payload_refs.push_back(ref);
@@ -198,7 +198,7 @@ Status HashJoinSchema::ValidateSchemas(JoinType join_type, const Schema& left_sc
     return Status::Invalid("Different number of key fields on left (",
                            left_keys.size(), ") and right (", right_keys.size(),
                            ") side of the join");
   }
-  if (left_keys.size() < 1) {
+  if (left_keys.empty()) {
     return Status::Invalid("Join key cannot be empty");
   }
   for (size_t i = 0; i < left_keys.size() + right_keys.size(); ++i) {
@@ -432,7 +432,7 @@ Status HashJoinSchema::CollectFilterColumns(std::vector<FieldRef>& left_filter,
       indices[0] -= left_schema.num_fields();
       FieldPath corrected_path(std::move(indices));
       if (right_seen_paths.find(*path) == right_seen_paths.end()) {
-        right_filter.push_back(corrected_path);
+        right_filter.emplace_back(corrected_path);
         right_seen_paths.emplace(std::move(corrected_path));
       }
     } else if (left_seen_paths.find(*path) == left_seen_paths.end()) {
@@ -698,7 +698,7 @@ class HashJoinNode : public ExecNode, public TracedNode {
                std::shared_ptr<Schema> output_schema,
                std::unique_ptr<HashJoinSchema> schema_mgr, Expression filter,
                std::unique_ptr<HashJoinImpl> impl)
-      : ExecNode(plan, inputs, {"left", "right"},
+      : ExecNode(plan, std::move(inputs), {"left", "right"},
                  /*output_schema=*/std::move(output_schema)),
         TracedNode(this),
         join_type_(join_options.join_type),
diff --git a/cpp/src/arrow/acero/hash_join_node.h b/cpp/src/arrow/acero/hash_join_node.h
index ad60019ceabc4..19745b8675cf0 100644
--- a/cpp/src/arrow/acero/hash_join_node.h
+++ b/cpp/src/arrow/acero/hash_join_node.h
@@ -65,9 +65,9 @@ class ARROW_ACERO_EXPORT HashJoinSchema {
   std::shared_ptr<Schema> MakeOutputSchema(const std::string& left_field_name_suffix,
                                            const std::string& right_field_name_suffix);
 
-  bool LeftPayloadIsEmpty() { return PayloadIsEmpty(0); }
+  bool LeftPayloadIsEmpty() const { return PayloadIsEmpty(0); }
 
-  bool RightPayloadIsEmpty() { return PayloadIsEmpty(1); }
+  bool RightPayloadIsEmpty() const { return PayloadIsEmpty(1); }
 
   static int kMissingField() {
     return SchemaProjectionMaps<HashJoinProjection>::kMissingField;
@@ -88,7 +88,7 @@ class ARROW_ACERO_EXPORT HashJoinSchema {
                                 const SchemaProjectionMap& right_to_filter,
                                 const Expression& filter);
 
-  bool PayloadIsEmpty(int side) {
+  bool PayloadIsEmpty(int side) const {
     assert(side == 0 || side == 1);
     return proj_maps[side].num_cols(HashJoinProjection::PAYLOAD) == 0;
   }
diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc
index 4d0c8187ac6e2..6c783110af571 100644
--- a/cpp/src/arrow/acero/swiss_join.cc
+++ b/cpp/src/arrow/acero/swiss_join.cc
@@ -1667,7 +1667,7 @@ Result<std::shared_ptr<ArrayData>> JoinResultMaterialize::FlushBuildColumn(
     const std::shared_ptr<DataType>& data_type, const RowArray* row_array,
     int column_id, uint32_t* row_ids) {
   ResizableArrayData output;
-  output.Init(data_type, pool_, bit_util::Log2(num_rows_));
+  RETURN_NOT_OK(output.Init(data_type, pool_, bit_util::Log2(num_rows_)));
 
   for (size_t i = 0; i <= null_ranges_.size(); ++i) {
     int row_id_begin =
@@ -2247,8 +2247,9 @@ Result<ExecBatch> JoinResidualFilter::MaterializeFilterInput(
       build_schemas_->map(HashJoinProjection::FILTER, HashJoinProjection::PAYLOAD);
   for (int i = 0; i < num_build_cols; ++i) {
     ResizableArrayData column_data;
-    column_data.Init(build_schemas_->data_type(HashJoinProjection::FILTER, i), pool_,
-                     bit_util::Log2(num_batch_rows));
+    RETURN_NOT_OK(
+        column_data.Init(build_schemas_->data_type(HashJoinProjection::FILTER, i),
+                         pool_, bit_util::Log2(num_batch_rows)));
     if (auto idx = to_key.get(i); idx != SchemaProjectionMap::kMissingField) {
       RETURN_NOT_OK(build_keys_->DecodeSelected(&column_data, idx, num_batch_rows,
                                                 key_ids_maybe_null, pool_));
diff --git a/cpp/src/arrow/compute/light_array_internal.cc b/cpp/src/arrow/compute/light_array_internal.cc
index 4f235925d0fb6..e4b1f1b8cdd63 100644
--- a/cpp/src/arrow/compute/light_array_internal.cc
+++ b/cpp/src/arrow/compute/light_array_internal.cc
@@ -118,10 +118,9 @@ Result<KeyColumnMetadata> ColumnMetadataFromDataType(
     const std::shared_ptr<DataType>& type) {
   const bool is_extension = type->id() == Type::EXTENSION;
   const std::shared_ptr<DataType>& typ =
-      is_extension
-          ? arrow::internal::checked_pointer_cast<ExtensionType>(type->GetSharedPtr())
-                ->storage_type()
-          : type;
+      is_extension ? arrow::internal::checked_cast<const ExtensionType*>(type.get())
+                         ->storage_type()
+                   : type;
 
   if (typ->id() == Type::DICTIONARY) {
     auto bit_width =
@@ -205,22 +204,25 @@ Status ColumnArraysFromExecBatch(const ExecBatch& batch,
                                  column_arrays);
 }
 
-void ResizableArrayData::Init(const std::shared_ptr<DataType>& data_type,
-                              MemoryPool* pool, int log_num_rows_min) {
+Status ResizableArrayData::Init(const std::shared_ptr<DataType>& data_type,
+                                MemoryPool* pool, int log_num_rows_min) {
 #ifndef NDEBUG
   if (num_rows_allocated_ > 0) {
-    ARROW_DCHECK(data_type_ != NULLPTR);
-    KeyColumnMetadata metadata_before =
-        ColumnMetadataFromDataType(data_type_).ValueOrDie();
-    KeyColumnMetadata metadata_after = ColumnMetadataFromDataType(data_type).ValueOrDie();
+    ARROW_DCHECK(data_type_ != nullptr);
+    const KeyColumnMetadata& metadata_before = column_metadata_;
+    ARROW_ASSIGN_OR_RAISE(KeyColumnMetadata metadata_after,
+                          ColumnMetadataFromDataType(data_type));
     ARROW_DCHECK(metadata_before.is_fixed_length == metadata_after.is_fixed_length &&
                  metadata_before.fixed_length == metadata_after.fixed_length);
   }
 #endif
+  ARROW_DCHECK(data_type != nullptr);
+  ARROW_ASSIGN_OR_RAISE(column_metadata_, ColumnMetadataFromDataType(data_type));
   Clear(/*release_buffers=*/false);
   log_num_rows_min_ = log_num_rows_min;
   data_type_ = data_type;
   pool_ = pool;
+  return Status::OK();
 }
 
 void ResizableArrayData::Clear(bool release_buffers) {
@@ -246,8 +248,6 @@ Status ResizableArrayData::ResizeFixedLengthBuffers(int num_rows_new) {
     num_rows_allocated_new *= 2;
   }
 
-  KeyColumnMetadata column_metadata = ColumnMetadataFromDataType(data_type_).ValueOrDie();
-
   if (buffers_[kFixedLengthBuffer] == NULLPTR) {
     ARROW_DCHECK(buffers_[kValidityBuffer] == NULLPTR &&
                  buffers_[kVariableLengthBuffer] == NULLPTR);
@@ -258,8 +258,8 @@ Status ResizableArrayData::ResizeFixedLengthBuffers(int num_rows_new) {
         bit_util::BytesForBits(num_rows_allocated_new) + kNumPaddingBytes, pool_));
     memset(mutable_data(kValidityBuffer), 0,
            bit_util::BytesForBits(num_rows_allocated_new) + kNumPaddingBytes);
-    if (column_metadata.is_fixed_length) {
-      if (column_metadata.fixed_length == 0) {
+    if (column_metadata_.is_fixed_length) {
+      if (column_metadata_.fixed_length == 0) {
         ARROW_ASSIGN_OR_RAISE(
             buffers_[kFixedLengthBuffer],
             AllocateResizableBuffer(
@@ -271,7 +271,7 @@ Status ResizableArrayData::ResizeFixedLengthBuffers(int num_rows_new) {
         ARROW_ASSIGN_OR_RAISE(
             buffers_[kFixedLengthBuffer],
             AllocateResizableBuffer(
-                num_rows_allocated_new * column_metadata.fixed_length + kNumPaddingBytes,
+                num_rows_allocated_new * column_metadata_.fixed_length + kNumPaddingBytes,
                 pool_));
       }
     } else {
@@ -300,15 +300,15 @@ Status ResizableArrayData::ResizeFixedLengthBuffers(int num_rows_new) {
     memset(mutable_data(kValidityBuffer) + bytes_for_bits_before, 0,
            bytes_for_bits_after - bytes_for_bits_before);
 
-    if (column_metadata.is_fixed_length) {
-      if (column_metadata.fixed_length == 0) {
+    if (column_metadata_.is_fixed_length) {
+      if (column_metadata_.fixed_length == 0) {
         RETURN_NOT_OK(buffers_[kFixedLengthBuffer]->Resize(
             bit_util::BytesForBits(num_rows_allocated_new) + kNumPaddingBytes));
         memset(mutable_data(kFixedLengthBuffer) + bytes_for_bits_before, 0,
                bytes_for_bits_after - bytes_for_bits_before);
       } else {
         RETURN_NOT_OK(buffers_[kFixedLengthBuffer]->Resize(
-            num_rows_allocated_new * column_metadata.fixed_length + kNumPaddingBytes));
+            num_rows_allocated_new * column_metadata_.fixed_length + kNumPaddingBytes));
       }
     } else {
       RETURN_NOT_OK(buffers_[kFixedLengthBuffer]->Resize(
@@ -323,10 +323,7 @@ Status ResizableArrayData::ResizeFixedLengthBuffers(int num_rows_new) {
 }
 
 Status ResizableArrayData::ResizeVaryingLengthBuffer() {
-  KeyColumnMetadata column_metadata;
-  column_metadata = ColumnMetadataFromDataType(data_type_).ValueOrDie();
-
-  if (!column_metadata.is_fixed_length) {
+  if (!column_metadata_.is_fixed_length) {
     int64_t min_new_size = buffers_[kFixedLengthBuffer]->data_as<int32_t>()[num_rows_];
     ARROW_DCHECK(var_len_buf_size_ > 0);
     if (var_len_buf_size_ < min_new_size) {
@@ -343,23 +340,19 @@ Status ResizableArrayData::ResizeVaryingLengthBuffer() {
 }
 
 KeyColumnArray ResizableArrayData::column_array() const {
-  KeyColumnMetadata column_metadata;
-  column_metadata = ColumnMetadataFromDataType(data_type_).ValueOrDie();
-
-  return KeyColumnArray(column_metadata, num_rows_,
+  return KeyColumnArray(column_metadata_, num_rows_,
                         buffers_[kValidityBuffer]->mutable_data(),
                         buffers_[kFixedLengthBuffer]->mutable_data(),
                         buffers_[kVariableLengthBuffer]->mutable_data());
 }
 
 std::shared_ptr<ArrayData> ResizableArrayData::array_data() const {
-  KeyColumnMetadata column_metadata;
-  column_metadata = ColumnMetadataFromDataType(data_type_).ValueOrDie();
-
-  auto valid_count = arrow::internal::CountSetBits(
-      buffers_[kValidityBuffer]->data(), /*offset=*/0, static_cast<int64_t>(num_rows_));
+  auto valid_count =
+      arrow::internal::CountSetBits(buffers_[kValidityBuffer]->data(), /*bit_offset=*/0,
+                                    static_cast<int64_t>(num_rows_));
   int null_count = static_cast<int>(num_rows_) - static_cast<int>(valid_count);
 
-  if (column_metadata.is_fixed_length) {
+  if (column_metadata_.is_fixed_length) {
     return ArrayData::Make(data_type_, num_rows_,
                            {buffers_[kValidityBuffer], buffers_[kFixedLengthBuffer]},
                            null_count);
@@ -493,10 +486,12 @@ Status ExecBatchBuilder::AppendSelected(const std::shared_ptr<ArrayData>& source
   ARROW_DCHECK(num_rows_before >= 0);
   int num_rows_after = num_rows_before + num_rows_to_append;
   if (target->num_rows() == 0) {
-    target->Init(source->type, pool, kLogNumRows);
+    RETURN_NOT_OK(target->Init(source->type, pool, kLogNumRows));
   }
   RETURN_NOT_OK(target->ResizeFixedLengthBuffers(num_rows_after));
 
+  // Since target->Init is called before, we can assume that the ColumnMetadata
+  // would never fail to be created
   KeyColumnMetadata column_metadata =
       ColumnMetadataFromDataType(source->type).ValueOrDie();
 
@@ -647,11 +642,12 @@ Status ExecBatchBuilder::AppendNulls(const std::shared_ptr<DataType>& type,
   int num_rows_before = target.num_rows();
   int num_rows_after = num_rows_before + num_rows_to_append;
   if (target.num_rows() == 0) {
-    target.Init(type, pool, kLogNumRows);
+    RETURN_NOT_OK(target.Init(type, pool, kLogNumRows));
   }
   RETURN_NOT_OK(target.ResizeFixedLengthBuffers(num_rows_after));
 
-  KeyColumnMetadata column_metadata = ColumnMetadataFromDataType(type).ValueOrDie();
+  ARROW_ASSIGN_OR_RAISE(KeyColumnMetadata column_metadata,
+                        ColumnMetadataFromDataType(type));
 
   // Process fixed length buffer
   //
@@ -708,7 +704,7 @@ Status ExecBatchBuilder::AppendSelected(MemoryPool* pool, const ExecBatch& batch
       const Datum& data = batch.values[col_ids ? col_ids[i] : i];
       ARROW_DCHECK(data.is_array());
       const std::shared_ptr<ArrayData>& array_data = data.array();
-      values_[i].Init(array_data->type, pool, kLogNumRows);
+      RETURN_NOT_OK(values_[i].Init(array_data->type, pool, kLogNumRows));
     }
   }
 
@@ -739,7 +735,7 @@ Status ExecBatchBuilder::AppendNulls(MemoryPool* pool,
   if (values_.empty()) {
     values_.resize(types.size());
     for (size_t i = 0; i < types.size(); ++i) {
-      values_[i].Init(types[i], pool, kLogNumRows);
+      RETURN_NOT_OK(values_[i].Init(types[i], pool, kLogNumRows));
     }
   }
 
diff --git a/cpp/src/arrow/compute/light_array_internal.h b/cpp/src/arrow/compute/light_array_internal.h
index 995c4211998e0..b8e48f096baeb 100644
--- a/cpp/src/arrow/compute/light_array_internal.h
+++ b/cpp/src/arrow/compute/light_array_internal.h
@@ -295,8 +295,8 @@ class ARROW_EXPORT ResizableArrayData {
   /// \param pool The pool to make allocations on
   /// \param log_num_rows_min All resize operations will allocate at least enough
   /// space for (1 << log_num_rows_min) rows
-  void Init(const std::shared_ptr<DataType>& data_type, MemoryPool* pool,
-            int log_num_rows_min);
+  Status Init(const std::shared_ptr<DataType>& data_type, MemoryPool* pool,
+              int log_num_rows_min);
 
   /// \brief Resets the array back to an empty state
   /// \param release_buffers If true then allocated memory is released and the
@@ -351,6 +351,8 @@ class ARROW_EXPORT ResizableArrayData {
   static constexpr int64_t kNumPaddingBytes = 64;
   int log_num_rows_min_;
   std::shared_ptr<DataType> data_type_;
+  // Valid only when data_type_ != NULLPTR.
+  KeyColumnMetadata column_metadata_{};
   MemoryPool* pool_;
   int num_rows_;
   int num_rows_allocated_;
diff --git a/cpp/src/arrow/compute/light_array_test.cc b/cpp/src/arrow/compute/light_array_test.cc
index cc02d489d138f..98a1ab8b7acae 100644
--- a/cpp/src/arrow/compute/light_array_test.cc
+++ b/cpp/src/arrow/compute/light_array_test.cc
@@ -295,7 +295,7 @@ TEST(ResizableArrayData, Basic) {
       arrow::internal::checked_pointer_cast<FixedWidthType>(type)->bit_width() / 8;
   {
     ResizableArrayData array;
-    array.Init(type, pool.get(), /*log_num_rows_min=*/16);
+    ASSERT_OK(array.Init(type, pool.get(), /*log_num_rows_min=*/16));
     ASSERT_EQ(0, array.num_rows());
     ASSERT_OK(array.ResizeFixedLengthBuffers(2));
     ASSERT_EQ(2, array.num_rows());
@@ -330,7 +330,7 @@ TEST(ResizableArrayData, Binary) {
   ARROW_SCOPED_TRACE("Type: ", type->ToString());
   {
     ResizableArrayData array;
-    array.Init(type, pool.get(), /*log_num_rows_min=*/4);
+    ASSERT_OK(array.Init(type, pool.get(), /*log_num_rows_min=*/4));
     ASSERT_EQ(0, array.num_rows());
     ASSERT_OK(array.ResizeFixedLengthBuffers(2));
     ASSERT_EQ(2, array.num_rows());
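A side note on the hash_join_dict.cc hunk above: replacing the shared_ptr copy with std::move(dictionary) only works because every later read now goes through the dictionary_ member; the moved-from parameter is left empty. A minimal standalone sketch of the rule (the DictBuild type here is hypothetical, not the Arrow class):

    #include <cstddef>
    #include <memory>
    #include <utility>
    #include <vector>

    class DictBuild {
     public:
      void Init(std::shared_ptr<std::vector<int>> dictionary) {
        dictionary_ = std::move(dictionary);
        // `dictionary` no longer owns anything here; dereferencing it would
        // crash. All later reads must use the member, as the patch does.
        length_ = dictionary_->size();
      }
      std::size_t length() const { return length_; }

     private:
      std::shared_ptr<std::vector<int>> dictionary_;
      std::size_t length_ = 0;
    };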
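The light_array changes turn ResizableArrayData::Init from void-plus-ValueOrDie() into a Status-returning function whose callers wrap it in RETURN_NOT_OK, so an allocation or metadata failure propagates instead of aborting the process. A toy sketch of that shape, with stand-ins for arrow::Status and the macro (not Arrow's actual definitions):

    #include <string>
    #include <utility>

    struct Status {
      bool ok = true;
      std::string message;
      static Status OK() { return {}; }
      static Status Invalid(std::string msg) { return {false, std::move(msg)}; }
    };

    #define RETURN_NOT_OK(expr)      \
      do {                           \
        Status _st = (expr);         \
        if (!_st.ok) return _st;     \
      } while (false)

    struct ResizableArray {
      // Previously a void Init that swallowed failures; now the caller decides.
      Status Init(int log_num_rows_min) {
        if (log_num_rows_min < 0) return Status::Invalid("negative size hint");
        return Status::OK();
      }
    };

    Status AppendNulls(ResizableArray& target) {
      RETURN_NOT_OK(target.Init(/*log_num_rows_min=*/4));  // failure bubbles up
      return Status::OK();
    }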
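The same commit also stops re-deriving KeyColumnMetadata (previously via ValueOrDie()) inside every resize and accessor call: Init computes it once and stores it in the new column_metadata_ member. The caching pattern in miniature (hypothetical names, not the Arrow types):

    struct KeyColumnMetadata {
      bool is_fixed_length = true;
      int fixed_length = 0;
    };

    class ResizableColumn {
     public:
      void Init(bool is_fixed, int width) {
        // Derived once per Init instead of once per resize/accessor call.
        column_metadata_ = KeyColumnMetadata{is_fixed, width};
      }
      long BytesForRows(long num_rows) const {
        // Hot path reads the cached member; no recomputation, no ValueOrDie.
        return column_metadata_.is_fixed_length
                   ? num_rows * column_metadata_.fixed_length
                   : num_rows;  // variable-length sizing elided
      }

     private:
      KeyColumnMetadata column_metadata_{};
    };

The member is only meaningful once Init has run, which is why the header hunk documents it as valid only when data_type_ is set.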