This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new aca8c14478 Fix FilterExec converting Absent column stats to Exact(NULL) (#20391)
aca8c14478 is described below

commit aca8c144789b9d5649abd16a6f55ecdf65ff72db
Author: Filip Wojciechowski <[email protected]>
AuthorDate: Mon Mar 9 13:50:42 2026 -0700

    Fix FilterExec converting Absent column stats to Exact(NULL) (#20391)
    
    ## Which issue does this PR close?
    
    - Closes #20388.
    
    ## Rationale for this change
    
    `collect_new_statistics` in `FilterExec` wraps NULL interval bounds in
    `Precision::Exact`, converting what should be `Precision::Absent` column
    statistics into `Precision::Exact(ScalarValue::Int32(None))`.
    Downstream, `estimate_disjoint_inputs` treats these as real bounds and
    incorrectly concludes join inputs are disjoint, forcing Partitioned join
    mode and disabling dynamic filter pushdown for Parquet row group
    pruning.
    
    ## What changes are included in this PR?
    
    Single change to `collect_new_statistics` in `filter.rs`: check
    `is_null()` on interval bounds before wrapping in `Precision`, mapping
    NULL bounds back to `Absent`.
    
    ## Are these changes tested?
    
    Yes — includes a regression test
    (`test_filter_statistics_absent_columns_stay_absent`) that fails on
    current main and passes with the fix.
    
    ## Are there any user-facing changes?
    
    No API changes. Corrects statistics propagation for tables/views with
    absent column statistics.
    
    ---------
    
    Co-authored-by: Claude Opus 4.6 <[email protected]>
---
 datafusion/physical-plan/src/filter.rs | 59 +++++++++++++++++++++++++++++++---
 1 file changed, 54 insertions(+), 5 deletions(-)

diff --git a/datafusion/physical-plan/src/filter.rs b/datafusion/physical-plan/src/filter.rs
index 581e833f8c..8370201c1c 100644
--- a/datafusion/physical-plan/src/filter.rs
+++ b/datafusion/physical-plan/src/filter.rs
@@ -764,6 +764,21 @@ impl EmbeddedProjection for FilterExec {
     }
 }
 
+/// Converts an interval bound to a [`Precision`] value. NULL bounds (which
+/// represent "unbounded" in the interval type) map to [`Precision::Absent`].
+fn interval_bound_to_precision(
+    bound: ScalarValue,
+    is_exact: bool,
+) -> Precision<ScalarValue> {
+    if bound.is_null() {
+        Precision::Absent
+    } else if is_exact {
+        Precision::Exact(bound)
+    } else {
+        Precision::Inexact(bound)
+    }
+}
+
 /// This function ensures that all bounds in the `ExprBoundaries` vector are
 /// converted to closed bounds. If a lower/upper bound is initially open, it
 /// is adjusted by using the next/previous value for its data type to convert
@@ -796,11 +811,9 @@ fn collect_new_statistics(
                     };
                 };
                 let (lower, upper) = interval.into_bounds();
-                let (min_value, max_value) = if lower.eq(&upper) {
-                    (Precision::Exact(lower), Precision::Exact(upper))
-                } else {
-                    (Precision::Inexact(lower), Precision::Inexact(upper))
-                };
+                let is_exact = !lower.is_null() && !upper.is_null() && lower == upper;
+                let min_value = interval_bound_to_precision(lower, is_exact);
+                let max_value = interval_bound_to_precision(upper, is_exact);
                 ColumnStatistics {
                     null_count: input_column_stats[idx].null_count.to_inexact(),
                     max_value,
@@ -2078,4 +2091,40 @@ mod tests {
 
         Ok(())
     }
+
+    /// Columns with Absent min/max statistics should remain Absent after
+    /// FilterExec.
+    #[tokio::test]
+    async fn test_filter_statistics_absent_columns_stay_absent() -> Result<()> {
+        let schema = Schema::new(vec![
+            Field::new("a", DataType::Int32, false),
+            Field::new("b", DataType::Int32, false),
+        ]);
+        let input = Arc::new(StatisticsExec::new(
+            Statistics {
+                num_rows: Precision::Inexact(1000),
+                total_byte_size: Precision::Absent,
+                column_statistics: vec![
+                    ColumnStatistics::default(),
+                    ColumnStatistics::default(),
+                ],
+            },
+            schema.clone(),
+        ));
+
+        let predicate = Arc::new(BinaryExpr::new(
+            Arc::new(Column::new("a", 0)),
+            Operator::Eq,
+            Arc::new(Literal::new(ScalarValue::Int32(Some(42)))),
+        ));
+        let filter: Arc<dyn ExecutionPlan> =
+            Arc::new(FilterExec::try_new(predicate, input)?);
+
+        let statistics = filter.partition_statistics(None)?;
+        let col_b_stats = &statistics.column_statistics[1];
+        assert_eq!(col_b_stats.min_value, Precision::Absent);
+        assert_eq!(col_b_stats.max_value, Precision::Absent);
+
+        Ok(())
+    }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to