Skip to content

Commit

Permalink
Minor: Improve parquet PageIndex documentation (#6042)
Browse files Browse the repository at this point in the history
* Minor: Improve parquet PageIndex documentation

* More improvements

* Add reasons for data page being without null

* Apply suggestions from code review

Co-authored-by: Val Lorentz <[email protected]>

* Update parquet/src/file/page_index/index.rs

---------

Co-authored-by: Val Lorentz <[email protected]>
  • Loading branch information
alamb and progval authored Jul 17, 2024
1 parent b44497e commit 9be0eb5
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 20 deletions.
37 changes: 32 additions & 5 deletions parquet/src/file/metadata/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,17 +54,27 @@ use crate::schema::types::{
Type as SchemaType,
};

/// [`Index`] for each row group of each column.
/// Page level statistics for each column chunk of each row group.
///
/// This structure is an in-memory representation of multiple [`ColumnIndex`]
/// structures in a parquet file footer, as described in the Parquet [PageIndex
/// documentation]. Each [`Index`] holds statistics about all the pages in a
/// particular column chunk.
///
/// `column_index[row_group_number][column_number]` holds the
/// [`Index`] corresponding to column `column_number` of row group
/// `row_group_number`.
///
/// For example `column_index[2][3]` holds the [`Index`] for the fourth
/// column in the third row group of the parquet file.
///
/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub type ParquetColumnIndex = Vec<Vec<Index>>;

/// [`PageLocation`] for each data page of each row group of each column.
/// [`PageLocation`] for each data page of each row group of each column
///
/// This structure is the parsed representation of the [`OffsetIndex`] from the
/// Parquet file footer, as described in the Parquet [PageIndex documentation].
///
/// `offset_index[row_group_number][column_number][page_number]` holds
/// the [`PageLocation`] corresponding to page `page_number` of column
Expand All @@ -73,6 +83,8 @@ pub type ParquetColumnIndex = Vec<Vec<Index>>;
/// For example `offset_index[2][3][4]` holds the [`PageLocation`] for
/// the fifth page of the fourth column in the third row group of the
/// parquet file.
///
/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub type ParquetOffsetIndex = Vec<Vec<Vec<PageLocation>>>;

/// Parsed metadata for a single Parquet file
Expand Down Expand Up @@ -946,14 +958,22 @@ impl ColumnChunkMetaDataBuilder {
}
}

/// Builder for column index
/// Builder for Parquet [`ColumnIndex`], part of the Parquet [PageIndex]
///
/// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub struct ColumnIndexBuilder {
// NOTE(review): the four vectors below presumably hold one entry per
// appended page, in page order -- confirm against `append`.
null_pages: Vec<bool>,
min_values: Vec<Vec<u8>>,
max_values: Vec<Vec<u8>>,
null_counts: Vec<i64>,
boundary_order: BoundaryOrder,
// If the index cannot be built for any one page, the index for the whole
// column must be ignored.
/// Is the information in the builder valid?
///
/// Set to `false` if any entry in the page doesn't have statistics for
/// some reason, so statistics for that page won't be written to the file.
/// This might happen if the page is entirely null, or
/// is a floating point column without any non-NaN values
/// e.g. <https://github.com/apache/parquet-format/pull/196>
valid: bool,
}

Expand All @@ -975,6 +995,7 @@ impl ColumnIndexBuilder {
}
}

/// Append statistics for the next page
pub fn append(
&mut self,
null_page: bool,
Expand All @@ -992,15 +1013,19 @@ impl ColumnIndexBuilder {
self.boundary_order = boundary_order;
}

/// Mark this column index as invalid
///
/// Sets the `valid` flag to `false`; no other field of the builder is
/// modified by this method.
pub fn to_invalid(&mut self) {
self.valid = false;
}

/// Is the information in the builder valid?
///
/// Returns the current value of the `valid` flag.
pub fn valid(&self) -> bool {
self.valid
}

/// Build and get the thrift metadata of column index
///
/// Note: callers should check [`Self::valid`] before calling this method
pub fn build_to_thrift(self) -> ColumnIndex {
ColumnIndex::new(
self.null_pages,
Expand All @@ -1012,7 +1037,9 @@ impl ColumnIndexBuilder {
}
}

/// Builder for offset index
/// Builder for offset index, part of the Parquet [PageIndex].
///
/// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub struct OffsetIndexBuilder {
offset_array: Vec<i64>,
compressed_page_size_array: Vec<i32>,
Expand Down
34 changes: 21 additions & 13 deletions parquet/src/file/page_index/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,9 @@ use crate::format::{BoundaryOrder, ColumnIndex};
use crate::util::bit_util::from_le_slice;
use std::fmt::Debug;

/// PageIndex Statistics for one data page, as described in [Column Index].
/// Typed statistics for one data page
///
/// One significant difference from the row group level
/// [`Statistics`](crate::format::Statistics) is that page level
/// statistics may not store actual column values as min and max
/// (e.g. they may store truncated strings to save space)
///
/// [Column Index]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
/// See [`NativeIndex`] for more details
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct PageIndex<T> {
/// The minimum value. It is `None` when all values are null
Expand Down Expand Up @@ -70,11 +65,9 @@ where

#[derive(Debug, Clone, PartialEq)]
#[allow(non_camel_case_types)]
/// Typed statistics for a data page in a column chunk.
/// Statistics for data pages in a column chunk.
///
/// This structure is part of the "Page Index" and is optionally part of
/// [ColumnIndex] in the parquet file and can be used to skip decoding pages
/// while reading the file data.
/// See [`NativeIndex`] for more information
pub enum Index {
/// Sometimes reading page index from parquet file
/// will only return pageLocations without min_max index,
Expand Down Expand Up @@ -117,10 +110,25 @@ impl Index {
}
}

/// Stores the [`PageIndex`] for each page of a column
/// Strongly typed statistics for data pages in a column chunk.
///
/// This structure is a natively typed, in memory representation of the
/// [`ColumnIndex`] structure in a parquet file footer, as described in the
/// Parquet [PageIndex documentation]. The statistics stored in this structure
/// can be used by query engines to skip decoding pages while reading parquet
/// data.
///
/// # Differences with Row Group Level Statistics
///
/// One significant difference between `NativeIndex` and row group level
/// [`Statistics`] is that page level statistics may not store actual column
/// values as min and max (e.g. they may store truncated strings to save space)
///
/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
/// [`Statistics`]: crate::file::statistics::Statistics
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct NativeIndex<T: ParquetValueType> {
/// The indexes, one item per page
/// The actual column indexes, one item per page
pub indexes: Vec<PageIndex<T>>,
/// If the min/max elements are ordered, and if so in which
/// direction. See [source] for details.
Expand Down
16 changes: 14 additions & 2 deletions parquet/src/file/statistics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,17 @@ pub fn to_thrift(stats: Option<&Statistics>) -> Option<TStatistics> {
Some(thrift_stats)
}

/// Statistics for a column chunk and data page.
/// Strongly typed statistics for a column chunk within a row group.
///
/// This structure is a natively typed, in memory representation of the
/// [`Statistics`] structure in a parquet file footer. The statistics stored in
/// this structure can be used by query engines to skip decoding pages while
/// reading parquet data.
///
/// Page level statistics are stored separately, in [NativeIndex].
///
/// [`Statistics`]: crate::format::Statistics
/// [NativeIndex]: crate::file::page_index::index::NativeIndex
#[derive(Debug, Clone, PartialEq)]
pub enum Statistics {
Boolean(ValueStatistics<bool>),
Expand Down Expand Up @@ -445,7 +455,9 @@ impl fmt::Display for Statistics {
/// Typed implementation for [`Statistics`].
///
/// Alias for [`ValueStatistics`] instantiated with `<T as DataType>::T`,
/// i.e. the associated type `T` of the [`DataType`] implementation.
pub type TypedStatistics<T> = ValueStatistics<<T as DataType>::T>;

/// Statistics for a particular `ParquetValueType`
/// Typed statistics for one column chunk
///
/// See [`Statistics`] for more details
#[derive(Clone, Eq, PartialEq)]
pub struct ValueStatistics<T> {
min: Option<T>,
Expand Down

0 comments on commit 9be0eb5

Please sign in to comment.