From a1435812460686a3be8b20f6fc813dd6be4e959f Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 11 Jul 2024 06:47:51 -0400 Subject: [PATCH 1/5] Minor: Improve parquet PageIndex documentation --- parquet/src/file/metadata/mod.rs | 28 ++++++++++++++++++++++++---- parquet/src/file/page_index/index.rs | 11 +++++++---- 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 40922d52bfd4..0b0bbcccfaff 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -60,7 +60,13 @@ use crate::schema::types::{ /// column in the third row group of the parquet file. pub type ParquetColumnIndex = Vec>; -/// [`PageLocation`] for each data page of each row group of each column. +/// [`PageLocation`] for each data page of each row group of each column +/// +/// This structure is the parsed representation of the [`OffsetIndex`] from the +/// Parquet file footer, as described in the Parquet [PageIndex documentation]. +/// +/// This structure is the parsed representation of the [`ColumnIndex`] in a parquet +/// file footer. /// /// `offset_index[row_group_number][column_number][page_number]` holds /// the [`PageLocation`] corresponding to page `page_number` of column @@ -69,6 +75,8 @@ pub type ParquetColumnIndex = Vec>; /// For example `offset_index[2][3][4]` holds the [`PageLocation`] for /// the fifth page of the forth column in the third row group of the /// parquet file. +/// +/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md pub type ParquetOffsetIndex = Vec>>; /// Parsed metadata for a single Parquet file @@ -942,14 +950,19 @@ impl ColumnChunkMetaDataBuilder { } } -/// Builder for column index +/// Builder for Parquet [`ColumnIndex`], part of the Parquet [PageIndex] +/// +/// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md pub struct ColumnIndexBuilder { null_pages: Vec, min_values: Vec>, max_values: Vec>, null_counts: Vec, boundary_order: BoundaryOrder, - // If one page can't get build index, need to ignore all index in this column + /// Is the information in the builder valid? + /// + /// Set to `false` if any entry in the page doesn't have statistics for + /// some reason valid: bool, } @@ -971,6 +984,7 @@ impl ColumnIndexBuilder { } } + /// Append statistics for the next page pub fn append( &mut self, null_page: bool, @@ -988,15 +1002,19 @@ impl ColumnIndexBuilder { self.boundary_order = boundary_order; } + /// Mark this column index as invalid pub fn to_invalid(&mut self) { self.valid = false; } + /// Is the information in the builder valid? pub fn valid(&self) -> bool { self.valid } /// Build and get the thrift metadata of column index + /// + /// Note: callers should check [`Self::valid`] before calling this method pub fn build_to_thrift(self) -> ColumnIndex { ColumnIndex::new( self.null_pages, @@ -1008,7 +1026,9 @@ impl ColumnIndexBuilder { } } -/// Builder for offset index +/// Builder for offset index, part of the Parquet [PageIndex]. +/// +/// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md pub struct OffsetIndexBuilder { offset_array: Vec, compressed_page_size_array: Vec, diff --git a/parquet/src/file/page_index/index.rs b/parquet/src/file/page_index/index.rs index 71fb47afa960..3c7edce08572 100644 --- a/parquet/src/file/page_index/index.rs +++ b/parquet/src/file/page_index/index.rs @@ -25,14 +25,17 @@ use crate::format::{BoundaryOrder, ColumnIndex}; use crate::util::bit_util::from_le_slice; use std::fmt::Debug; -/// PageIndex Statistics for one data page, as described in [Column Index]. +/// Statistics for one data page +/// +/// This structure is the parsed representation of the [`ColumnIndex`] from the +/// Parquet file footer, as described in the Parquet [PageIndex documentation]. /// /// One significant difference from the row group level /// [`Statistics`](crate::format::Statistics) is that page level /// statistics may not store actual column values as min and max /// (e.g. they may store truncated strings to save space) /// -/// [Column Index]: https://github.com/apache/parquet-format/blob/master/PageIndex.md +/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct PageIndex { /// The minimum value, It is None when all values are null @@ -72,7 +75,7 @@ where #[allow(non_camel_case_types)] /// Typed statistics for a data page in a column chunk. /// -/// This structure is part of the "Page Index" and is optionally part of +/// This structure is part of the "Page Index" and is stored in the /// [ColumnIndex] in the parquet file and can be used to skip decoding pages /// while reading the file data. pub enum Index { @@ -117,7 +120,7 @@ impl Index { } } -/// Stores the [`PageIndex`] for each page of a column +/// Stores the values for [`PageIndex`] #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct NativeIndex { /// The indexes, one item per page From 84596aac996f3b2b655febdd60c50308d34efd8c Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 11 Jul 2024 09:35:26 -0400 Subject: [PATCH 2/5] More improvements --- parquet/src/file/metadata/mod.rs | 12 ++++++--- parquet/src/file/page_index/index.rs | 37 ++++++++++++++++------------ parquet/src/file/statistics.rs | 16 ++++++++++-- 3 files changed, 43 insertions(+), 22 deletions(-) diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index 0b0bbcccfaff..c46d0bd4deaf 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -50,7 +50,12 @@ use crate::schema::types::{ Type as SchemaType, }; -/// [`Index`] for each row group of each column. +/// Page level statistics for each column chunk of each row group. +/// +/// This structure is an memory representation of multiple [`ColumnIndex`] +/// structures in a parquet file footer, as described in the Parquet [PageIndex +/// documentation]. Each [`Index`] holds statistics about all the pages in a +/// particular column chunk. /// /// `column_index[row_group_number][column_number]` holds the /// [`Index`] corresponding to column `column_number` of row group @@ -58,6 +63,8 @@ use crate::schema::types::{ /// /// For example `column_index[2][3]` holds the [`Index`] for the forth /// column in the third row group of the parquet file. +/// +/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md pub type ParquetColumnIndex = Vec>; /// [`PageLocation`] for each data page of each row group of each column @@ -65,9 +72,6 @@ pub type ParquetColumnIndex = Vec>; /// This structure is the parsed representation of the [`OffsetIndex`] from the /// Parquet file footer, as described in the Parquet [PageIndex documentation]. /// -/// This structure is the parsed representation of the [`ColumnIndex`] in a parquet -/// file footer. -/// /// `offset_index[row_group_number][column_number][page_number]` holds /// the [`PageLocation`] corresponding to page `page_number` of column /// `column_number`of row group `row_group_number`. diff --git a/parquet/src/file/page_index/index.rs b/parquet/src/file/page_index/index.rs index 3c7edce08572..c1bad35d2b67 100644 --- a/parquet/src/file/page_index/index.rs +++ b/parquet/src/file/page_index/index.rs @@ -25,17 +25,9 @@ use crate::format::{BoundaryOrder, ColumnIndex}; use crate::util::bit_util::from_le_slice; use std::fmt::Debug; -/// Statistics for one data page +/// Typed statistics for one data page /// -/// This structure is the parsed representation of the [`ColumnIndex`] from the -/// Parquet file footer, as described in the Parquet [PageIndex documentation]. -/// -/// One significant difference from the row group level -/// [`Statistics`](crate::format::Statistics) is that page level -/// statistics may not store actual column values as min and max -/// (e.g. they may store truncated strings to save space) -/// -/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md +/// See [`NativeIndex`] for more details #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct PageIndex { /// The minimum value, It is None when all values are null @@ -73,11 +65,9 @@ where #[derive(Debug, Clone, PartialEq)] #[allow(non_camel_case_types)] -/// Typed statistics for a data page in a column chunk. +/// Statistics for data pages in a column chunk. /// -/// This structure is part of the "Page Index" and is stored in the -/// [ColumnIndex] in the parquet file and can be used to skip decoding pages -/// while reading the file data. +/// See [NativeIndex] for more information pub enum Index { /// Sometimes reading page index from parquet file /// will only return pageLocations without min_max index, @@ -120,10 +110,25 @@ impl Index { } } -/// Stores the values for [`PageIndex`] +/// Strongly typed statistics for data pages in a column chunk. +/// +/// This structure is a natively typed, in memory representation of the +/// [`ColumnIndex`] structure in a parquet file footer, as described in the +/// Parquet [PageIndex documentation]. The statistics stored in this structure +/// can be used by query engines to skip decoding pages while reading parquet +/// data. +/// +/// # Differences with Row Group Level Statistics +/// +/// One significant difference between `NativeIndex` and row group level +/// [`Statistics`] is that page level statistics may not store actual column +/// values as min and max (e.g. they may store truncated strings to save space) +/// +/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md +/// [`Statistics`]: crate::file::statistics::Statistics #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct NativeIndex { - /// The indexes, one item per page + /// The actual column indexes, one item per page pub indexes: Vec>, /// If the min/max elements are ordered, and if so in which /// direction. See [source] for details. diff --git a/parquet/src/file/statistics.rs b/parquet/src/file/statistics.rs index 7d704cc138fc..dded9c9c1192 100644 --- a/parquet/src/file/statistics.rs +++ b/parquet/src/file/statistics.rs @@ -287,7 +287,17 @@ pub fn to_thrift(stats: Option<&Statistics>) -> Option { Some(thrift_stats) } -/// Statistics for a column chunk and data page. +/// Strongly typed statistics for a column chunk within a row group. +/// +/// This structure is a natively typed, in memory representation of the +/// [`Statistics`] structure in a parquet file footer. The statistics stored in +/// this structure can be used by query engines to skip decoding pages while +/// reading parquet data. +/// +/// Page level statistics are stored separately, in [NativeIndex]. +/// +/// [`Statistics`]: crate::format::Statistics +/// [NativeIndex]: crate::file::page_index::index::NativeIndex #[derive(Debug, Clone, PartialEq)] pub enum Statistics { Boolean(ValueStatistics), @@ -445,7 +455,9 @@ impl fmt::Display for Statistics { /// Typed implementation for [`Statistics`]. pub type TypedStatistics = ValueStatistics<::T>; -/// Statistics for a particular `ParquetValueType` +/// Typed statistics for one column chunk +/// +/// See [`Statistics`] for more details #[derive(Clone, Eq, PartialEq)] pub struct ValueStatistics { min: Option, From 4476adf7a67b0e2a17e03771cd9d0803d8db2970 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 11 Jul 2024 09:37:51 -0400 Subject: [PATCH 3/5] Add reasons for data page being without null --- parquet/src/file/metadata/mod.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index c46d0bd4deaf..a1f312f8d9d8 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -966,7 +966,9 @@ pub struct ColumnIndexBuilder { /// Is the information in the builder valid? /// /// Set to `false` if any entry in the page doesn't have statistics for - /// some reason + /// some reason. This might happen if the page is entirely null, or + /// is a floating point column without any non-nan values + /// e.g. valid: bool, } From e1524c05e80b2484aaf825db37c316ac8fc2615d Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 15 Jul 2024 14:29:21 -0400 Subject: [PATCH 4/5] Apply suggestions from code review Co-authored-by: Val Lorentz --- parquet/src/file/metadata/mod.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/parquet/src/file/metadata/mod.rs b/parquet/src/file/metadata/mod.rs index a1f312f8d9d8..02a7ffa0af94 100644 --- a/parquet/src/file/metadata/mod.rs +++ b/parquet/src/file/metadata/mod.rs @@ -52,7 +52,7 @@ use crate::schema::types::{ /// Page level statistics for each column chunk of each row group. /// -/// This structure is an memory representation of multiple [`ColumnIndex`] +/// This structure is an in-memory representation of multiple [`ColumnIndex`] /// structures in a parquet file footer, as described in the Parquet [PageIndex /// documentation]. Each [`Index`] holds statistics about all the pages in a /// particular column chunk. @@ -966,7 +966,8 @@ pub struct ColumnIndexBuilder { /// Is the information in the builder valid? /// /// Set to `false` if any entry in the page doesn't have statistics for - /// some reason. This might happen if the page is entirely null, or + /// some reason, so statistics for that page won't be written to the file. + /// This might happen if the page is entirely null, or /// is a floating point column without any non-nan values /// e.g. valid: bool, From b5162a9f6736eb271b0c1c8a20ad90d3ffa74e5a Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 15 Jul 2024 14:29:33 -0400 Subject: [PATCH 5/5] Update parquet/src/file/page_index/index.rs --- parquet/src/file/page_index/index.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/file/page_index/index.rs b/parquet/src/file/page_index/index.rs index c1bad35d2b67..7eba949042f1 100644 --- a/parquet/src/file/page_index/index.rs +++ b/parquet/src/file/page_index/index.rs @@ -67,7 +67,7 @@ where #[allow(non_camel_case_types)] /// Statistics for data pages in a column chunk. /// -/// See [NativeIndex] for more information +/// See [`NativeIndex`] for more information pub enum Index { /// Sometimes reading page index from parquet file /// will only return pageLocations without min_max index,