diff --git a/datafusion/common/src/stats.rs b/datafusion/common/src/stats.rs index fbf639a32182..2e799c92bea7 100644 --- a/datafusion/common/src/stats.rs +++ b/datafusion/common/src/stats.rs @@ -279,9 +279,11 @@ pub struct ColumnStatistics { impl ColumnStatistics { /// Column contains a single non null value (e.g constant). pub fn is_singleton(&self) -> bool { - match (self.min_value.get_value(), self.max_value.get_value()) { + match (&self.min_value, &self.max_value) { // Min and max values are the same and not infinity. - (Some(min), Some(max)) => !min.is_null() && !max.is_null() && (min == max), + (Precision::Exact(min), Precision::Exact(max)) => { + !min.is_null() && !max.is_null() && (min == max) + } (_, _) => false, } } diff --git a/datafusion/core/src/datasource/statistics.rs b/datafusion/core/src/datasource/statistics.rs index 3d8248dfdeb2..695e139517cf 100644 --- a/datafusion/core/src/datasource/statistics.rs +++ b/datafusion/core/src/datasource/statistics.rs @@ -70,7 +70,11 @@ pub async fn get_statistics_with_limit( // files. This only applies when we know the number of rows. It also // currently ignores tables that have no statistics regarding the // number of rows. - if num_rows.get_value().unwrap_or(&usize::MIN) <= &limit.unwrap_or(usize::MAX) { + let conservative_num_rows = match num_rows { + Precision::Exact(nr) => nr, + _ => usize::MIN, + }; + if conservative_num_rows <= limit.unwrap_or(usize::MAX) { while let Some(current) = all_files.next().await { let (file, file_stats) = current?; result_files.push(file); diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs index ce66d614721c..0c44b367e514 100644 --- a/datafusion/physical-plan/src/filter.rs +++ b/datafusion/physical-plan/src/filter.rs @@ -252,13 +252,25 @@ fn collect_new_statistics( }, )| { let closed_interval = interval.close_bounds(); + let (min_value, max_value) = + if closed_interval.lower.value.eq(&closed_interval.upper.value) { + ( + Precision::Exact(closed_interval.lower.value), + Precision::Exact(closed_interval.upper.value), + ) + } else { + ( + Precision::Inexact(closed_interval.lower.value), + Precision::Inexact(closed_interval.upper.value), + ) + }; ColumnStatistics { null_count: match input_column_stats[idx].null_count.get_value() { Some(nc) => Precision::Inexact(*nc), None => Precision::Absent, }, - max_value: Precision::Inexact(closed_interval.upper.value), - min_value: Precision::Inexact(closed_interval.lower.value), + max_value, + min_value, distinct_count: match distinct_count.get_value() { Some(dc) => Precision::Inexact(*dc), None => Precision::Absent, @@ -963,4 +975,26 @@ mod tests { Ok(()) } + + #[tokio::test] + async fn test_statistics_with_constant_column() -> Result<()> { + let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); + let input = Arc::new(StatisticsExec::new( + Statistics::new_unknown(&schema), + schema, + )); + // WHERE a = 10 + let predicate = Arc::new(BinaryExpr::new( + Arc::new(Column::new("a", 0)), + Operator::Eq, + Arc::new(Literal::new(ScalarValue::Int32(Some(10)))), + )); + let filter: Arc = + Arc::new(FilterExec::try_new(predicate, input)?); + let filter_statistics = filter.statistics()?; + // First column is "a", and it is a column with only one value after the filter. + assert!(filter_statistics.column_statistics[0].is_singleton()); + + Ok(()) + } }