Skip to content

Commit

Permalink
Bug-fix in Filter and Limit statistics (#8094)
Browse files Browse the repository at this point in the history
* Bug fix and code simplification

* Remove limit stats changes

* Test added

* Reduce code diff
  • Loading branch information
berkaysynnada authored Nov 9, 2023
1 parent 43cc870 commit e54894c
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 5 deletions.
6 changes: 4 additions & 2 deletions datafusion/common/src/stats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -279,9 +279,11 @@ pub struct ColumnStatistics {
impl ColumnStatistics {
/// Column contains a single non null value (e.g constant).
pub fn is_singleton(&self) -> bool {
match (self.min_value.get_value(), self.max_value.get_value()) {
match (&self.min_value, &self.max_value) {
// Min and max values are the same and not infinity.
(Some(min), Some(max)) => !min.is_null() && !max.is_null() && (min == max),
(Precision::Exact(min), Precision::Exact(max)) => {
!min.is_null() && !max.is_null() && (min == max)
}
(_, _) => false,
}
}
Expand Down
6 changes: 5 additions & 1 deletion datafusion/core/src/datasource/statistics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,11 @@ pub async fn get_statistics_with_limit(
// files. This only applies when we know the number of rows. It also
// currently ignores tables that have no statistics regarding the
// number of rows.
if num_rows.get_value().unwrap_or(&usize::MIN) <= &limit.unwrap_or(usize::MAX) {
let conservative_num_rows = match num_rows {
Precision::Exact(nr) => nr,
_ => usize::MIN,
};
if conservative_num_rows <= limit.unwrap_or(usize::MAX) {
while let Some(current) = all_files.next().await {
let (file, file_stats) = current?;
result_files.push(file);
Expand Down
38 changes: 36 additions & 2 deletions datafusion/physical-plan/src/filter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -252,13 +252,25 @@ fn collect_new_statistics(
},
)| {
let closed_interval = interval.close_bounds();
let (min_value, max_value) =
if closed_interval.lower.value.eq(&closed_interval.upper.value) {
(
Precision::Exact(closed_interval.lower.value),
Precision::Exact(closed_interval.upper.value),
)
} else {
(
Precision::Inexact(closed_interval.lower.value),
Precision::Inexact(closed_interval.upper.value),
)
};
ColumnStatistics {
null_count: match input_column_stats[idx].null_count.get_value() {
Some(nc) => Precision::Inexact(*nc),
None => Precision::Absent,
},
max_value: Precision::Inexact(closed_interval.upper.value),
min_value: Precision::Inexact(closed_interval.lower.value),
max_value,
min_value,
distinct_count: match distinct_count.get_value() {
Some(dc) => Precision::Inexact(*dc),
None => Precision::Absent,
Expand Down Expand Up @@ -963,4 +975,26 @@ mod tests {

Ok(())
}

#[tokio::test]
async fn test_statistics_with_constant_column() -> Result<()> {
let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
let input = Arc::new(StatisticsExec::new(
Statistics::new_unknown(&schema),
schema,
));
// WHERE a = 10
let predicate = Arc::new(BinaryExpr::new(
Arc::new(Column::new("a", 0)),
Operator::Eq,
Arc::new(Literal::new(ScalarValue::Int32(Some(10)))),
));
let filter: Arc<dyn ExecutionPlan> =
Arc::new(FilterExec::try_new(predicate, input)?);
let filter_statistics = filter.statistics()?;
// First column is "a", and it is a column with only one value after the filter.
assert!(filter_statistics.column_statistics[0].is_singleton());

Ok(())
}
}

0 comments on commit e54894c

Please sign in to comment.