This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new aca8c14478 Fix FilterExec converting Absent column stats to Exact(NULL) (#20391)
aca8c14478 is described below
commit aca8c144789b9d5649abd16a6f55ecdf65ff72db
Author: Filip Wojciechowski <[email protected]>
AuthorDate: Mon Mar 9 13:50:42 2026 -0700
Fix FilterExec converting Absent column stats to Exact(NULL) (#20391)
## Which issue does this PR close?
- Closes #20388.
## Rationale for this change
`collect_new_statistics` in `FilterExec` wraps NULL interval bounds in
`Precision::Exact`, converting what should be `Precision::Absent` column
statistics into `Precision::Exact(ScalarValue::Int32(None))`.
Downstream, `estimate_disjoint_inputs` treats these as real bounds and
incorrectly concludes join inputs are disjoint, forcing Partitioned join
mode and disabling dynamic filter pushdown for Parquet row group
pruning.
## What changes are included in this PR?
Single change to `collect_new_statistics` in `filter.rs`: check
`is_null()` on interval bounds before wrapping in `Precision`, mapping
NULL bounds back to `Absent`.
## Are these changes tested?
Yes — includes a regression test
(`test_filter_statistics_absent_columns_stay_absent`) that fails on
current main and passes with the fix.
## Are there any user-facing changes?
No API changes. Corrects statistics propagation for tables/views with
absent column statistics.
---------
Co-authored-by: Claude Opus 4.6 <[email protected]>
---
datafusion/physical-plan/src/filter.rs | 59 +++++++++++++++++++++++++++++++---
1 file changed, 54 insertions(+), 5 deletions(-)
diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs
index 581e833f8c..8370201c1c 100644
--- a/datafusion/physical-plan/src/filter.rs
+++ b/datafusion/physical-plan/src/filter.rs
@@ -764,6 +764,21 @@ impl EmbeddedProjection for FilterExec {
}
}
+/// Converts an interval bound to a [`Precision`] value. NULL bounds (which
+/// represent "unbounded" in the interval type) map to [`Precision::Absent`].
+fn interval_bound_to_precision(
+ bound: ScalarValue,
+ is_exact: bool,
+) -> Precision<ScalarValue> {
+ if bound.is_null() {
+ Precision::Absent
+ } else if is_exact {
+ Precision::Exact(bound)
+ } else {
+ Precision::Inexact(bound)
+ }
+}
+
/// This function ensures that all bounds in the `ExprBoundaries` vector are
/// converted to closed bounds. If a lower/upper bound is initially open, it
/// is adjusted by using the next/previous value for its data type to convert
@@ -796,11 +811,9 @@ fn collect_new_statistics(
};
};
let (lower, upper) = interval.into_bounds();
- let (min_value, max_value) = if lower.eq(&upper) {
- (Precision::Exact(lower), Precision::Exact(upper))
- } else {
- (Precision::Inexact(lower), Precision::Inexact(upper))
- };
+ let is_exact = !lower.is_null() && !upper.is_null() && lower == upper;
+ let min_value = interval_bound_to_precision(lower, is_exact);
+ let max_value = interval_bound_to_precision(upper, is_exact);
ColumnStatistics {
null_count: input_column_stats[idx].null_count.to_inexact(),
max_value,
@@ -2078,4 +2091,40 @@ mod tests {
Ok(())
}
+
+ /// Columns with Absent min/max statistics should remain Absent after
+ /// FilterExec.
+ #[tokio::test]
+ async fn test_filter_statistics_absent_columns_stay_absent() -> Result<()> {
+ let schema = Schema::new(vec![
+ Field::new("a", DataType::Int32, false),
+ Field::new("b", DataType::Int32, false),
+ ]);
+ let input = Arc::new(StatisticsExec::new(
+ Statistics {
+ num_rows: Precision::Inexact(1000),
+ total_byte_size: Precision::Absent,
+ column_statistics: vec![
+ ColumnStatistics::default(),
+ ColumnStatistics::default(),
+ ],
+ },
+ schema.clone(),
+ ));
+
+ let predicate = Arc::new(BinaryExpr::new(
+ Arc::new(Column::new("a", 0)),
+ Operator::Eq,
+ Arc::new(Literal::new(ScalarValue::Int32(Some(42)))),
+ ));
+ let filter: Arc<dyn ExecutionPlan> =
+ Arc::new(FilterExec::try_new(predicate, input)?);
+
+ let statistics = filter.partition_statistics(None)?;
+ let col_b_stats = &statistics.column_statistics[1];
+ assert_eq!(col_b_stats.min_value, Precision::Absent);
+ assert_eq!(col_b_stats.max_value, Precision::Absent);
+
+ Ok(())
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]