Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-43797: [C++] Attach arrow::ArrayStatistics to arrow::ArrayData #43801

Merged
merged 5 commits into from
Sep 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions cpp/src/arrow/array/array_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,14 @@ class ARROW_EXPORT Array {
/// \return DeviceAllocationType
DeviceAllocationType device_type() const { return data_->device_type(); }

/// \brief Return the statistics of this Array
///
/// This just delegates to calling statistics on the underlying ArrayData
/// object which backs this Array.
///
/// \return const ArrayStatistics&
std::shared_ptr<ArrayStatistics> statistics() const { return data_->statistics; }
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The return type should be std::shared_ptr<ArrayStatistics>& and we should probably add const ArrayStatistics to the shared_ptr so that callers can't mutate the statistics through the shared pointer.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree with you. I don't know why I missed const and & here...

Let's add them: GH-44590


protected:
Array() = default;
ARROW_DEFAULT_MOVE_AND_ASSIGN(Array);
Expand Down
126 changes: 126 additions & 0 deletions cpp/src/arrow/array/array_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3709,6 +3709,132 @@ TEST(TestSwapEndianArrayData, InvalidLength) {
}
}

class TestArrayDataStatistics : public ::testing::Test {
public:
void SetUp() {
valids_ = {1, 0, 1, 1};
null_count_ = std::count(valids_.begin(), valids_.end(), 0);
null_buffer_ = *internal::BytesToBits(valids_);
values_ = {1, 0, 3, -4};
min_ = *std::min_element(values_.begin(), values_.end());
max_ = *std::max_element(values_.begin(), values_.end());
values_buffer_ = Buffer::FromVector(values_);
data_ = ArrayData::Make(int32(), values_.size(), {null_buffer_, values_buffer_},
null_count_);
data_->statistics = std::make_shared<ArrayStatistics>();
data_->statistics->null_count = null_count_;
data_->statistics->min = min_;
data_->statistics->is_min_exact = true;
data_->statistics->max = max_;
data_->statistics->is_max_exact = true;
}

protected:
std::vector<uint8_t> valids_;
size_t null_count_;
std::shared_ptr<Buffer> null_buffer_;
std::vector<int32_t> values_;
int64_t min_;
int64_t max_;
std::shared_ptr<Buffer> values_buffer_;
std::shared_ptr<ArrayData> data_;
};

TEST_F(TestArrayDataStatistics, MoveConstructor) {
ArrayData copied_data(*data_);
ArrayData moved_data(std::move(copied_data));

ASSERT_TRUE(moved_data.statistics->null_count.has_value());
ASSERT_EQ(null_count_, moved_data.statistics->null_count.value());

ASSERT_TRUE(moved_data.statistics->min.has_value());
ASSERT_TRUE(std::holds_alternative<int64_t>(moved_data.statistics->min.value()));
ASSERT_EQ(min_, std::get<int64_t>(moved_data.statistics->min.value()));
ASSERT_TRUE(moved_data.statistics->is_min_exact);

ASSERT_TRUE(moved_data.statistics->max.has_value());
ASSERT_TRUE(std::holds_alternative<int64_t>(moved_data.statistics->max.value()));
ASSERT_EQ(max_, std::get<int64_t>(moved_data.statistics->max.value()));
ASSERT_TRUE(moved_data.statistics->is_max_exact);
}

TEST_F(TestArrayDataStatistics, CopyConstructor) {
ArrayData copied_data(*data_);

ASSERT_TRUE(copied_data.statistics->null_count.has_value());
ASSERT_EQ(null_count_, copied_data.statistics->null_count.value());

ASSERT_TRUE(copied_data.statistics->min.has_value());
ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data.statistics->min.value()));
ASSERT_EQ(min_, std::get<int64_t>(copied_data.statistics->min.value()));
ASSERT_TRUE(copied_data.statistics->is_min_exact);

ASSERT_TRUE(copied_data.statistics->max.has_value());
ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data.statistics->max.value()));
ASSERT_EQ(max_, std::get<int64_t>(copied_data.statistics->max.value()));
ASSERT_TRUE(copied_data.statistics->is_max_exact);
}

TEST_F(TestArrayDataStatistics, MoveAssignment) {
ArrayData copied_data(*data_);
ArrayData moved_data;
moved_data = std::move(copied_data);

ASSERT_TRUE(moved_data.statistics->null_count.has_value());
ASSERT_EQ(null_count_, moved_data.statistics->null_count.value());

ASSERT_TRUE(moved_data.statistics->min.has_value());
ASSERT_TRUE(std::holds_alternative<int64_t>(moved_data.statistics->min.value()));
ASSERT_EQ(min_, std::get<int64_t>(moved_data.statistics->min.value()));
ASSERT_TRUE(moved_data.statistics->is_min_exact);

ASSERT_TRUE(moved_data.statistics->max.has_value());
ASSERT_TRUE(std::holds_alternative<int64_t>(moved_data.statistics->max.value()));
ASSERT_EQ(max_, std::get<int64_t>(moved_data.statistics->max.value()));
ASSERT_TRUE(moved_data.statistics->is_max_exact);
}

TEST_F(TestArrayDataStatistics, CopyAssignment) {
ArrayData copied_data;
copied_data = *data_;

ASSERT_TRUE(copied_data.statistics->null_count.has_value());
ASSERT_EQ(null_count_, copied_data.statistics->null_count.value());

ASSERT_TRUE(copied_data.statistics->min.has_value());
ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data.statistics->min.value()));
ASSERT_EQ(min_, std::get<int64_t>(copied_data.statistics->min.value()));
ASSERT_TRUE(copied_data.statistics->is_min_exact);

ASSERT_TRUE(copied_data.statistics->max.has_value());
ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data.statistics->max.value()));
ASSERT_EQ(max_, std::get<int64_t>(copied_data.statistics->max.value()));
ASSERT_TRUE(copied_data.statistics->is_max_exact);
}

TEST_F(TestArrayDataStatistics, CopyTo) {
ASSERT_OK_AND_ASSIGN(auto copied_data,
data_->CopyTo(arrow::default_cpu_memory_manager()));

ASSERT_TRUE(copied_data->statistics->null_count.has_value());
ASSERT_EQ(null_count_, copied_data->statistics->null_count.value());

ASSERT_TRUE(copied_data->statistics->min.has_value());
ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data->statistics->min.value()));
ASSERT_EQ(min_, std::get<int64_t>(copied_data->statistics->min.value()));
ASSERT_TRUE(copied_data->statistics->is_min_exact);

ASSERT_TRUE(copied_data->statistics->max.has_value());
ASSERT_TRUE(std::holds_alternative<int64_t>(copied_data->statistics->max.value()));
ASSERT_EQ(max_, std::get<int64_t>(copied_data->statistics->max.value()));
ASSERT_TRUE(copied_data->statistics->is_max_exact);
}

TEST_F(TestArrayDataStatistics, Slice) {
auto sliced_data = data_->Slice(0, 1);
ASSERT_FALSE(sliced_data->statistics);
}

template <typename PType>
class TestPrimitiveArray : public ::testing::Test {
public:
Expand Down
3 changes: 3 additions & 0 deletions cpp/src/arrow/array/data.cc
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,8 @@ Result<std::shared_ptr<ArrayData>> CopyToImpl(const ArrayData& data,
ARROW_ASSIGN_OR_RAISE(output->dictionary, CopyToImpl(*data.dictionary, to, copy_fn));
}

output->statistics = data.statistics;

return output;
}
} // namespace
Expand Down Expand Up @@ -195,6 +197,7 @@ std::shared_ptr<ArrayData> ArrayData::Slice(int64_t off, int64_t len) const {
} else {
copy->null_count = null_count != 0 ? kUnknownNullCount : 0;
}
copy->statistics = nullptr;
mapleFU marked this conversation as resolved.
Show resolved Hide resolved
return copy;
}

Expand Down
24 changes: 22 additions & 2 deletions cpp/src/arrow/array/data.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <utility>
#include <vector>

#include "arrow/array/statistics.h"
#include "arrow/buffer.h"
#include "arrow/result.h"
#include "arrow/type.h"
Expand Down Expand Up @@ -152,7 +153,8 @@ struct ARROW_EXPORT ArrayData {
offset(other.offset),
buffers(std::move(other.buffers)),
child_data(std::move(other.child_data)),
dictionary(std::move(other.dictionary)) {
dictionary(std::move(other.dictionary)),
statistics(std::move(other.statistics)) {
SetNullCount(other.null_count);
}

Expand All @@ -163,7 +165,8 @@ struct ARROW_EXPORT ArrayData {
offset(other.offset),
buffers(other.buffers),
child_data(other.child_data),
dictionary(other.dictionary) {
dictionary(other.dictionary),
statistics(other.statistics) {
SetNullCount(other.null_count);
}

Expand All @@ -176,6 +179,7 @@ struct ARROW_EXPORT ArrayData {
buffers = std::move(other.buffers);
child_data = std::move(other.child_data);
dictionary = std::move(other.dictionary);
statistics = std::move(other.statistics);
return *this;
}

Expand All @@ -188,6 +192,7 @@ struct ARROW_EXPORT ArrayData {
buffers = other.buffers;
child_data = other.child_data;
dictionary = other.dictionary;
statistics = other.statistics;
return *this;
}

Expand Down Expand Up @@ -274,6 +279,18 @@ struct ARROW_EXPORT ArrayData {
}

/// \brief Construct a zero-copy slice of the data with the given offset and length
///
/// The associated `ArrayStatistics` is always discarded in a sliced
/// `ArrayData`. Because `ArrayStatistics` in the original
/// `ArrayData` may be invalid in a sliced `ArrayData`. If you want
/// to reuse statistics in the original `ArrayData`, you need to do
/// it by yourself.
///
/// If the specified slice range has the same range as the original
/// `ArrayData`, we can reuse statistics in the original
/// `ArrayData`. Because it has the same data as the original
/// `ArrayData`. But the associated `ArrayStatistics` is discarded
/// in this case too. Use `Copy()` instead for the case.
std::shared_ptr<ArrayData> Slice(int64_t offset, int64_t length) const;

/// \brief Input-checking variant of Slice
Expand Down Expand Up @@ -390,6 +407,9 @@ struct ARROW_EXPORT ArrayData {

// The dictionary for this Array, if any. Only used for dictionary type
std::shared_ptr<ArrayData> dictionary;

// The statistics for this Array.
std::shared_ptr<ArrayStatistics> statistics;
};

/// \brief A non-owning Buffer reference
Expand Down
Loading