diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc
index 1c9b2323de500..f42256540788a 100644
--- a/cpp/src/parquet/file_reader.cc
+++ b/cpp/src/parquet/file_reader.cc
@@ -29,6 +29,7 @@
 #include "arrow/io/caching.h"
 #include "arrow/io/file.h"
 #include "arrow/io/memory.h"
+#include "arrow/io/util_internal.h"
 #include "arrow/util/bit_util.h"
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/future.h"
@@ -400,6 +401,29 @@ class SerializedFile : public ParquetFileReader::Contents {
     PARQUET_THROW_NOT_OK(cached_source_->Cache(ranges));
   }
 
+  /// Compute the coalesced byte ranges covering the requested column chunks.
+  ///
+  /// One range is computed per (row group, column) pair, visiting row groups in
+  /// the given order and, within each, columns in the given order. The collected
+  /// ranges are passed to CoalesceReadRanges along with hole_size_limit and
+  /// range_size_limit, and its result (ranges or error) is returned unchanged.
+  ::arrow::Result<std::vector<::arrow::io::ReadRange>> GetReadRanges(
+      const std::vector<int>& row_groups, const std::vector<int>& column_indices,
+      int64_t hole_size_limit, int64_t range_size_limit) {
+    std::vector<::arrow::io::ReadRange> ranges;
+    // Exactly one range is appended per (row group, column) pair below.
+    ranges.reserve(row_groups.size() * column_indices.size());
+    for (int row_group : row_groups) {
+      for (int col : column_indices) {
+        ranges.push_back(
+            ComputeColumnChunkRange(file_metadata_.get(), source_size_, row_group, col));
+      }
+    }
+
+    return ::arrow::io::internal::CoalesceReadRanges(std::move(ranges), hole_size_limit,
+                                                     range_size_limit);
+  }
+
   ::arrow::Future<> WhenBuffered(const std::vector<int>& row_groups,
                                  const std::vector<int>& column_indices) const {
     if (!cached_source_) {
diff --git a/cpp/src/parquet/file_reader.h b/cpp/src/parquet/file_reader.h
index b59b59f95c2d8..c42163276cdaa 100644
--- a/cpp/src/parquet/file_reader.h
+++ b/cpp/src/parquet/file_reader.h
@@ -201,6 +201,32 @@ class PARQUET_EXPORT ParquetFileReader {
                  const ::arrow::io::IOContext& ctx,
                  const ::arrow::io::CacheOptions& options);
 
+  /// Retrieve the list of byte ranges that would need to be read to retrieve
+  /// the data for the specified row groups and column indices.
+  ///
+  /// A reader can optionally call this if they wish to handle their own
+  /// caching and management of file reads (or offload them to other readers).
+  /// Unlike PreBuffer, this method will not perform any actual caching or
+  /// reads, instead just using the file metadata to determine the byte ranges
+  /// that would need to be read if you were to consume the entirety of the column
+  /// chunks for the provided columns in the specified row groups.
+  ///
+  /// If row_groups or column_indices are empty, then the result of this will be empty.
+  ///
+  /// hole_size_limit represents the maximum distance, in bytes, between two
+  /// consecutive ranges; beyond this value, ranges will not be combined. The default
+  /// value is 1MB.
+  ///
+  /// range_size_limit is the maximum size in bytes of a combined range; if combining
+  /// two consecutive ranges would produce a range larger than this, they are not
+  /// combined. The default value is 64MB. This *must* be larger than hole_size_limit.
+  ///
+  /// This will not take into account page indexes or any other predicate push down
+  /// benefits that may be available.
+  ::arrow::Result<std::vector<::arrow::io::ReadRange>> GetReadRanges(
+      const std::vector<int>& row_groups, const std::vector<int>& column_indices,
+      int64_t hole_size_limit = 1024 * 1024, int64_t range_size_limit = 64 * 1024 * 1024);
+
   /// Wait for the specified row groups and column indices to be pre-buffered.
   ///
   /// After the returned Future completes, reading the specified row