From be8740d18145a2297fdf090604573ecccc56f3fa Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 26 Nov 2024 17:58:48 +0900 Subject: [PATCH] GH-44815: [C++][Parquet] Add an example to dump statistics read as `arrow::ArrayStatistics` (#44816) ### Rationale for this change I want to use this in the C data interface statistics documents: https://github.com/apache/arrow/pull/43553 ### What changes are included in this PR? Add an executable that reads an Apache Parquet file and dumps statistics read as `arrow::ArrayStatistics`. ### Are these changes tested? Yes. ### Are there any user-facing changes? No. * GitHub Issue: #44815 Authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- cpp/tools/parquet/CMakeLists.txt | 7 ++- .../parquet/parquet_dump_arrow_statistics.cc | 58 +++++++++++++++++++ 2 files changed, 64 insertions(+), 1 deletion(-) create mode 100644 cpp/tools/parquet/parquet_dump_arrow_statistics.cc diff --git a/cpp/tools/parquet/CMakeLists.txt b/cpp/tools/parquet/CMakeLists.txt index 87c3254607589..5aaa456dcae08 100644 --- a/cpp/tools/parquet/CMakeLists.txt +++ b/cpp/tools/parquet/CMakeLists.txt @@ -16,7 +16,12 @@ # under the License. if(PARQUET_BUILD_EXECUTABLES) - set(PARQUET_TOOLS parquet-dump-footer parquet-dump-schema parquet-reader parquet-scan) + set(PARQUET_TOOLS + parquet-dump-arrow-statistics + parquet-dump-footer + parquet-dump-schema + parquet-reader + parquet-scan) foreach(TOOL ${PARQUET_TOOLS}) string(REGEX REPLACE "-" "_" TOOL_SOURCE ${TOOL}) diff --git a/cpp/tools/parquet/parquet_dump_arrow_statistics.cc b/cpp/tools/parquet/parquet_dump_arrow_statistics.cc new file mode 100644 index 0000000000000..8aeced94f6a74 --- /dev/null +++ b/cpp/tools/parquet/parquet_dump_arrow_statistics.cc @@ -0,0 +1,58 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include +#include + +namespace { +arrow::Status PrintArrowStatistics(const char* path) { + ARROW_ASSIGN_OR_RAISE( + auto input, arrow::io::MemoryMappedFile::Open(path, arrow::io::FileMode::READ)); + ARROW_ASSIGN_OR_RAISE(auto reader, + parquet::arrow::OpenFile(input, arrow::default_memory_pool())); + ARROW_ASSIGN_OR_RAISE(auto record_batch_reader, reader->GetRecordBatchReader()); + while (true) { + ARROW_ASSIGN_OR_RAISE(auto record_batch, record_batch_reader->Next()); + if (!record_batch) { + break; + } + ARROW_ASSIGN_OR_RAISE(auto statistics_array, record_batch->MakeStatisticsArray()); + std::cout << statistics_array->ToString() << std::endl; + } + return arrow::Status::OK(); +} +}; // namespace + +int main(int argc, char** argv) { + if (argc != 2) { + std::cerr << "Usage: " << argv[0] << " PARQUET_PATH" << std::endl; + std::cerr << " e.g.: " << argv[0] << " sample.parquet" << std::endl; + return EXIT_FAILURE; + } + + auto status = PrintArrowStatistics(argv[1]); + if (status.ok()) { + return EXIT_SUCCESS; + } else { + std::cerr << status.ToString() << std::endl; + return EXIT_FAILURE; + } +}