Re: [PR] #17801 Improve nullability reporting of case expressions [datafusion]

via GitHub Tue, 11 Nov 2025 12:34:24 -0800


alamb commented on code in PR #17813:
URL: https://github.com/apache/datafusion/pull/17813#discussion_r2515692029



##########
datafusion/expr/src/predicate_bounds.rs:
##########
@@ -0,0 +1,1045 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{BinaryExpr, Expr, ExprSchemable};
+use arrow::datatypes::DataType;
+use bitflags::bitflags;
+use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion};
+use datafusion_common::{DataFusionError, ExprSchema, Result, ScalarValue};
+use datafusion_expr_common::interval_arithmetic::{Interval, NullableInterval};
+use datafusion_expr_common::operator::Operator;
+
+bitflags! {
+    /// A set representing the possible outcomes of a SQL boolean expression
+    #[derive(PartialEq, Eq, Clone, Debug)]
+    struct TernarySet: u8 {
+        const TRUE = 0b1;
+        const FALSE = 0b10;
+        const UNKNOWN = 0b100;
+    }
+}
+
+impl TernarySet {
+    /// Returns the set of possible values after applying the `is true` test 
on all
+    /// values in this set.
+    /// The resulting set can only contain 'TRUE' and/or 'FALSE', never 
'UNKNOWN'.
+    fn is_true(&self) -> Self {
+        let mut is_true = Self::empty();
+        if self.contains(Self::TRUE) {
+            is_true.toggle(Self::TRUE);
+        }
+        if self.intersects(Self::UNKNOWN | Self::FALSE) {
+            is_true.toggle(Self::FALSE);
+        }
+        is_true
+    }
+
+    /// Returns the set of possible values after applying the `is false` test 
on all
+    /// values in this set.
+    /// The resulting set can only contain 'TRUE' and/or 'FALSE', never 
'UNKNOWN'.
+    fn is_false(&self) -> Self {
+        let mut is_false = Self::empty();
+        if self.contains(Self::FALSE) {
+            is_false.toggle(Self::TRUE);
+        }
+        if self.intersects(Self::UNKNOWN | Self::TRUE) {
+            is_false.toggle(Self::FALSE);
+        }
+        is_false
+    }
+
+    /// Returns the set of possible values after applying the `is unknown` 
test on all
+    /// values in this set.
+    /// The resulting set can only contain 'TRUE' and/or 'FALSE', never 
'UNKNOWN'.
+    fn is_unknown(&self) -> Self {
+        let mut is_unknown = Self::empty();
+        if self.contains(Self::UNKNOWN) {
+            is_unknown.toggle(Self::TRUE);
+        }
+        if self.intersects(Self::TRUE | Self::FALSE) {
+            is_unknown.toggle(Self::FALSE);
+        }
+        is_unknown
+    }
+
+    /// Returns the set of possible values after applying SQL three-valued 
logical NOT
+    /// on each value in `value`.
+    ///
+    /// This method uses the following truth table.
+    ///
+    /// ```text
+    ///  A  | ¬A
+    /// ----|----
+    ///  F  |  T
+    ///  U  |  U
+    ///  T  |  F
+    /// ```
+    fn not(set: &Self) -> Self {
+        let mut not = Self::empty();
+        if set.contains(Self::TRUE) {
+            not.toggle(Self::FALSE);
+        }
+        if set.contains(Self::FALSE) {
+            not.toggle(Self::TRUE);
+        }
+        if set.contains(Self::UNKNOWN) {
+            not.toggle(Self::UNKNOWN);
+        }
+        not
+    }
+
+    /// Returns the set of possible values after applying SQL three-valued 
logical AND
+    /// on each combination of values from `lhs` and `rhs`.
+    ///
+    /// This method uses the following truth table.
+    ///
+    /// ```text
+    /// A ∧ B │ F U T
+    /// ──────┼──────
+    ///     F │ F F F
+    ///     U │ F U U
+    ///     T │ F U T
+    /// ```
+    fn and(lhs: &Self, rhs: &Self) -> Self {
+        if lhs.is_empty() || rhs.is_empty() {
+            return Self::empty();
+        }
+
+        let mut and = Self::empty();
+        if lhs.contains(Self::FALSE) || rhs.contains(Self::FALSE) {
+            and.toggle(Self::FALSE);
+        }
+
+        if (lhs.contains(Self::UNKNOWN) && rhs.intersects(Self::TRUE | 
Self::UNKNOWN))
+            || (rhs.contains(Self::UNKNOWN) && lhs.intersects(Self::TRUE | 
Self::UNKNOWN))
+        {
+            and.toggle(Self::UNKNOWN);
+        }
+
+        if lhs.contains(Self::TRUE) && rhs.contains(Self::TRUE) {
+            and.toggle(Self::TRUE);
+        }
+
+        and
+    }
+
+    /// Returns the set of possible values after applying SQL three-valued 
logical OR
+    /// on each combination of values from `lhs` and `rhs`.
+    ///
+    /// This method uses the following truth table.
+    ///
+    /// ```text
+    /// A ∨ B │ F U T
+    /// ──────┼──────
+    ///     F │ F U T
+    ///     U │ U U T
+    ///     T │ T T T
+    /// ```
+    fn or(lhs: &Self, rhs: &Self) -> Self {
+        let mut or = Self::empty();
+        if lhs.contains(Self::TRUE) || rhs.contains(Self::TRUE) {
+            or.toggle(Self::TRUE);
+        }
+
+        if (lhs.contains(Self::UNKNOWN) && rhs.intersects(Self::FALSE | 
Self::UNKNOWN))
+            || (rhs.contains(Self::UNKNOWN)
+                && lhs.intersects(Self::FALSE | Self::UNKNOWN))
+        {
+            or.toggle(Self::UNKNOWN);
+        }
+
+        if lhs.contains(Self::FALSE) && rhs.contains(Self::FALSE) {
+            or.toggle(Self::FALSE);
+        }
+
+        or
+    }
+}
+
+impl TryFrom<&ScalarValue> for TernarySet {
+    type Error = DataFusionError;
+
+    fn try_from(value: &ScalarValue) -> Result<Self> {
+        Ok(match value {
+            ScalarValue::Null => TernarySet::UNKNOWN,
+            ScalarValue::Boolean(b) => match b {
+                Some(true) => TernarySet::TRUE,
+                Some(false) => TernarySet::FALSE,
+                None => TernarySet::UNKNOWN,
+            },
+            _ => {
+                let b = value.cast_to(&DataType::Boolean)?;
+                Self::try_from(&b)?
+            }
+        })
+    }
+}
+
+/// Computes the output interval for the given boolean expression based on 
statically
+/// available information.
+///
+/// # Arguments
+///
+/// * `predicate` - The boolean expression to analyze
+/// * `is_null` - A callback function that provides additional nullability 
information for
+///   expressions. When called with an expression, it should return:
+///   - `Some(true)` if the expression is known to evaluate to NULL
+///   - `Some(false)` if the expression is known to NOT evaluate to NULL
+///   - `None` if the nullability cannot be determined
+///
+///   This callback allows the caller to provide context-specific knowledge 
about expression
+///   nullability that cannot be determined from the schema alone. For 
example, it can be used
+///   to indicate that a particular column reference is known to be NULL in a 
specific context,
+///   or that certain expressions will never be NULL based on runtime 
constraints.
+///
+/// * `input_schema` - Schema information for resolving expression types and 
nullability
+///
+/// # Return Value
+///
+/// The function returns a [NullableInterval] that describes the possible 
boolean values the
+/// predicate can evaluate to. The return value will be one of the following:
+///
+/// * `NullableInterval::NotNull { values: Interval::CERTAINLY_TRUE }` - The 
predicate will
+///   always evaluate to TRUE (never FALSE or NULL)
+///
+/// * `NullableInterval::NotNull { values: Interval::CERTAINLY_FALSE }` - The 
predicate will
+///   always evaluate to FALSE (never TRUE or NULL)
+///
+/// * `NullableInterval::NotNull { values: Interval::UNCERTAIN }` - The 
predicate will never
+///   evaluate to NULL, but may be either TRUE or FALSE
+///
+/// * `NullableInterval::Null { datatype: DataType::Boolean }` - The predicate 
will always
+///   evaluate to NULL (SQL UNKNOWN in three-valued logic)
+///
+/// * `NullableInterval::MaybeNull { values: Interval::CERTAINLY_TRUE }` - The 
predicate may
+///   evaluate to TRUE or NULL, but never FALSE
+///
+/// * `NullableInterval::MaybeNull { values: Interval::CERTAINLY_FALSE }` - 
The predicate may
+///   evaluate to FALSE or NULL, but never TRUE
+///
+/// * `NullableInterval::MaybeNull { values: Interval::UNCERTAIN }` - The 
predicate may
+///   evaluate to any of TRUE, FALSE, or NULL
+///
+pub(super) fn evaluate_bounds(
+    predicate: &Expr,
+    certainly_null_expr: Option<&Expr>,
+    input_schema: &dyn ExprSchema,
+) -> Result<NullableInterval> {
+    let evaluator = PredicateBoundsEvaluator {

Review Comment:
   Is your idea that you might be able to use the `NullableInterval` code 
instead of this (now that you have fixed the bugs?)



##########
datafusion/expr/src/expr_fn.rs:
##########
@@ -341,6 +341,11 @@ pub fn is_null(expr: Expr) -> Expr {
     Expr::IsNull(Box::new(expr))
 }
 
+/// Create is not null expression

Review Comment:
   this is a nice drive by cleanup



##########
datafusion/physical-expr/src/expressions/case.rs:
##########
@@ -1394,6 +1437,51 @@ impl PhysicalExpr for CaseExpr {
     }
 }
 
+/// Attempts to const evaluate the given `predicate`.
+/// Returns:
+/// - `Some(true)` if the predicate evaluates to a truthy value.
+/// - `Some(false)` if the predicate evaluates to a falsy value.
+/// - `None` if the predicate could not be evaluated.
+fn evaluate_predicate(predicate: &Arc<dyn PhysicalExpr>) -> 
Result<Option<bool>> {

Review Comment:
   as a follow on, I feel like this would be a more generally useful function 
as it is not specific to case.
   
   Maybe we could add it in expression utils or something: 
https://github.com/apache/datafusion/blob/149d3e9533acadfb92329f5445d7deb72f6f679c/datafusion/physical-expr/src/utils/mod.rs#L268-L267
   
   



##########
datafusion/expr/Cargo.toml:
##########
@@ -48,6 +48,7 @@ sql = ["sqlparser"]
 [dependencies]
 arrow = { workspace = true }
 async-trait = { workspace = true }
+bitflags = "2.0.0"

Review Comment:
   This is already a dependency of arrow-schema
   
    
https://github.com/apache/arrow-rs/blob/a0db1985c3a0f3190cfc5166b428933a28c740f9/arrow-schema/Cargo.toml#L43
   
   Thus while this is a new explicit dependency it is not a new-new dependency 
and thus I think it is fine to add



##########
datafusion/expr/src/expr_schema.rs:
##########
@@ -282,14 +282,53 @@ impl ExprSchemable for Expr {
             Expr::OuterReferenceColumn(field, _) => Ok(field.is_nullable()),
             Expr::Literal(value, _) => Ok(value.is_null()),
             Expr::Case(case) => {
-                // This expression is nullable if any of the input expressions 
are nullable
-                let then_nullable = case
+                let nullable_then = case
                     .when_then_expr
                     .iter()
-                    .map(|(_, t)| t.nullable(input_schema))
-                    .collect::<Result<Vec<_>>>()?;
-                if then_nullable.contains(&true) {
-                    Ok(true)
+                    .filter_map(|(w, t)| {

Review Comment:
   This code is clear and easy to follow. Very nice



##########
datafusion/physical-expr/src/expressions/case.rs:
##########
@@ -1282,15 +1283,57 @@ impl PhysicalExpr for CaseExpr {
     }
 
     fn nullable(&self, input_schema: &Schema) -> Result<bool> {
-        // this expression is nullable if any of the input expressions are 
nullable
-        let then_nullable = self
+        let nullable_then = self
             .body
             .when_then_expr
             .iter()
-            .map(|(_, t)| t.nullable(input_schema))
-            .collect::<Result<Vec<_>>>()?;
-        if then_nullable.contains(&true) {
-            Ok(true)
+            .filter_map(|(w, t)| {
+                let is_nullable = match t.nullable(input_schema) {
+                    // Pass on error determining nullability verbatim
+                    Err(e) => return Some(Err(e)),
+                    Ok(n) => n,
+                };
+
+                // Branches with a then expression that is not nullable do not 
impact the
+                // nullability of the case expression.
+                if !is_nullable {
+                    return None;
+                }
+
+                // For case-with-expression assume all 'then' expressions are 
reachable
+                if self.body.expr.is_some() {
+                    return Some(Ok(()));
+                }
+
+                // For branches with a nullable 'then' expression, try to 
determine

Review Comment:
   Also I found another related part of the code here: 
https://github.com/apache/datafusion/blob/488b024aa3d22a63fb159157ac165682a627d1f5/datafusion/optimizer/src/simplify_expressions/guarantees.rs#L30-L44
   
   This code seems to be simplifying the predicates based on nullability 
information. 
   
   I am not sure if we can reuse the other parts



##########
datafusion/core/src/physical_planner.rs:
##########
@@ -2533,6 +2536,33 @@ impl<'a> OptimizationInvariantChecker<'a> {
     }
 }
 
+/// Checks if the change from `old` schema to `new` is allowed or not.
+/// The current implementation only allows nullability of individual fields to 
change
+/// from 'nullable' to 'not nullable'.

Review Comment:
   I think adding rationale about **why** would be helpful here as it may not 
be immediately obvious
   
   ```suggestion
   /// from 'nullable' to 'not nullable' (that is that the physical expression 
knows
   /// more about its null-ness than its logical counterpart)
   
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] #17801 Improve nullability reporting of case expressions [datafusion]

Reply via email to