-
Notifications
You must be signed in to change notification settings - Fork 839
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Minor: Improve parquet PageIndex documentation #6042
Changes from 1 commit
a143581
84596aa
4476adf
e1524c0
b5162a9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -60,7 +60,13 @@ use crate::schema::types::{ | |
/// column in the third row group of the parquet file. | ||
pub type ParquetColumnIndex = Vec<Vec<Index>>; | ||
|
||
/// [`PageLocation`] for each data page of each row group of each column. | ||
/// [`PageLocation`] for each data page of each row group of each column | ||
/// | ||
/// This structure is the parsed representation of the [`OffsetIndex`] from the | ||
/// Parquet file footer, as described in the Parquet [PageIndex documentation]. | ||
/// | ||
/// This structure is the parsed representation of the [`ColumnIndex`] in a parquet | ||
/// file footer. | ||
/// | ||
/// `offset_index[row_group_number][column_number][page_number]` holds | ||
/// the [`PageLocation`] corresponding to page `page_number` of column | ||
|
@@ -69,6 +75,8 @@ pub type ParquetColumnIndex = Vec<Vec<Index>>; | |
/// For example `offset_index[2][3][4]` holds the [`PageLocation`] for | ||
/// the fifth page of the forth column in the third row group of the | ||
/// parquet file. | ||
/// | ||
/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md | ||
pub type ParquetOffsetIndex = Vec<Vec<Vec<PageLocation>>>; | ||
|
||
/// Parsed metadata for a single Parquet file | ||
|
@@ -942,14 +950,19 @@ impl ColumnChunkMetaDataBuilder { | |
} | ||
} | ||
|
||
/// Builder for column index | ||
/// Builder for Parquet [`ColumnIndex`], part of the Parquet [PageIndex] | ||
/// | ||
/// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md | ||
pub struct ColumnIndexBuilder { | ||
null_pages: Vec<bool>, | ||
min_values: Vec<Vec<u8>>, | ||
max_values: Vec<Vec<u8>>, | ||
null_counts: Vec<i64>, | ||
boundary_order: BoundaryOrder, | ||
// If one page can't get build index, need to ignore all index in this column | ||
/// Is the information in the builder valid? | ||
/// | ||
/// Set to `false` if any entry in the page doesn't have statistics for | ||
/// some reason | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The reason can be document some example here, like apache/parquet-format#196 ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thank you -- this is an excellent idea. Done. |
||
valid: bool, | ||
} | ||
|
||
|
@@ -971,6 +984,7 @@ impl ColumnIndexBuilder { | |
} | ||
} | ||
|
||
/// Append statistics for the next page | ||
pub fn append( | ||
&mut self, | ||
null_page: bool, | ||
|
@@ -988,15 +1002,19 @@ impl ColumnIndexBuilder { | |
self.boundary_order = boundary_order; | ||
} | ||
|
||
/// Mark this column index as invalid | ||
pub fn to_invalid(&mut self) { | ||
self.valid = false; | ||
} | ||
|
||
/// Is the information in the builder valid? | ||
pub fn valid(&self) -> bool { | ||
self.valid | ||
} | ||
|
||
/// Build and get the thrift metadata of column index | ||
/// | ||
/// Note: callers should check [`Self::valid`] before calling this method | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Makes me wonder if There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree that would be a good improvement -- filed #6064 |
||
pub fn build_to_thrift(self) -> ColumnIndex { | ||
ColumnIndex::new( | ||
self.null_pages, | ||
|
@@ -1008,7 +1026,9 @@ impl ColumnIndexBuilder { | |
} | ||
} | ||
|
||
/// Builder for offset index | ||
/// Builder for offset index, part of the Parquet [PageIndex]. | ||
/// | ||
/// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md | ||
pub struct OffsetIndexBuilder { | ||
offset_array: Vec<i64>, | ||
compressed_page_size_array: Vec<i32>, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25,14 +25,17 @@ use crate::format::{BoundaryOrder, ColumnIndex}; | |
use crate::util::bit_util::from_le_slice; | ||
use std::fmt::Debug; | ||
|
||
/// PageIndex Statistics for one data page, as described in [Column Index]. | ||
/// Statistics for one data page | ||
/// | ||
/// This structure is the parsed representation of the [`ColumnIndex`] from the | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I always find the naming of this structure confusing (as the |
||
/// Parquet file footer, as described in the Parquet [PageIndex documentation]. | ||
/// | ||
/// One significant difference from the row group level | ||
/// [`Statistics`](crate::format::Statistics) is that page level | ||
/// statistics may not store actual column values as min and max | ||
/// (e.g. they may store truncated strings to save space) | ||
/// | ||
/// [Column Index]: https://github.com/apache/parquet-format/blob/master/PageIndex.md | ||
/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md | ||
#[derive(Debug, Clone, PartialEq, Eq, Hash)] | ||
pub struct PageIndex<T> { | ||
/// The minimum value, It is None when all values are null | ||
|
@@ -72,7 +75,7 @@ where | |
#[allow(non_camel_case_types)] | ||
/// Typed statistics for a data page in a column chunk. | ||
/// | ||
/// This structure is part of the "Page Index" and is optionally part of | ||
/// This structure is part of the "Page Index" and is stored in the | ||
/// [ColumnIndex] in the parquet file and can be used to skip decoding pages | ||
/// while reading the file data. | ||
pub enum Index { | ||
|
@@ -117,7 +120,7 @@ impl Index { | |
} | ||
} | ||
|
||
/// Stores the [`PageIndex`] for each page of a column | ||
/// Stores the values for [`PageIndex`] | ||
#[derive(Debug, Clone, PartialEq, Eq, Hash)] | ||
pub struct NativeIndex<T: ParquetValueType> { | ||
/// The indexes, one item per page | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this sentence maybe meant for the
ParquetColumnIndex
type above? Otherwise I'm getting more confused by the comment.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, you are correct. Sorry -- will fix