From ea47172bd80b5ee040c19e605f7e4a6f872b470f Mon Sep 17 00:00:00 2001 From: Rossi Sun Date: Tue, 14 Jan 2025 21:40:36 +0800 Subject: [PATCH] GH-45254: [C++][Acero] Fix the row offset truncation in row table merge (#45255) ### Rationale for this change See #45254 ### What changes are included in this PR? First, modify the test case to expose the suspected bug. Then apply the fix in the source. ### Are these changes tested? By existing tests. ### Are there any user-facing changes? None. * GitHub Issue: #45254 Authored-by: Rossi Sun Signed-off-by: Rossi Sun --- cpp/src/arrow/acero/hash_join_node_test.cc | 4 +++- cpp/src/arrow/acero/swiss_join.cc | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/acero/hash_join_node_test.cc b/cpp/src/arrow/acero/hash_join_node_test.cc index 76ad9c7d650eb..7dbed7163daca 100644 --- a/cpp/src/arrow/acero/hash_join_node_test.cc +++ b/cpp/src/arrow/acero/hash_join_node_test.cc @@ -3370,8 +3370,10 @@ TEST(HashJoin, LARGE_MEMORY_TEST(BuildSideOver4GBVarLength)) { constexpr int value_no_match_length_min = 128; constexpr int value_no_match_length_max = 129; constexpr int value_match_length = 130; + // The value "DDD..." will be hashed to the partition over 4GB of the hash table. + // Matching at this area gives us more coverage. 
const auto value_match = - std::make_shared(std::string(value_match_length, 'X')); + std::make_shared(std::string(value_match_length, 'D')); constexpr int16_t num_rows_per_batch_left = 128; constexpr int16_t num_rows_per_batch_right = 4096; const int64_t num_batches_left = 8; diff --git a/cpp/src/arrow/acero/swiss_join.cc b/cpp/src/arrow/acero/swiss_join.cc index c068eeb50ff0a..fc3be1b462e60 100644 --- a/cpp/src/arrow/acero/swiss_join.cc +++ b/cpp/src/arrow/acero/swiss_join.cc @@ -439,11 +439,11 @@ Status RowArrayMerge::PrepareForMerge(RowArray* target, num_rows = 0; num_bytes = 0; for (size_t i = 0; i < sources.size(); ++i) { - target->rows_.mutable_offsets()[num_rows] = static_cast(num_bytes); + target->rows_.mutable_offsets()[num_rows] = num_bytes; num_rows += sources[i]->rows_.length(); num_bytes += sources[i]->rows_.offsets()[sources[i]->rows_.length()]; } - target->rows_.mutable_offsets()[num_rows] = static_cast(num_bytes); + target->rows_.mutable_offsets()[num_rows] = num_bytes; } return Status::OK();