alamb commented on code in PR #17813: URL: https://github.com/apache/datafusion/pull/17813#discussion_r2515692029
########## datafusion/expr/src/predicate_bounds.rs: ########## @@ -0,0 +1,1045 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::{BinaryExpr, Expr, ExprSchemable}; +use arrow::datatypes::DataType; +use bitflags::bitflags; +use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion}; +use datafusion_common::{DataFusionError, ExprSchema, Result, ScalarValue}; +use datafusion_expr_common::interval_arithmetic::{Interval, NullableInterval}; +use datafusion_expr_common::operator::Operator; + +bitflags! { + /// A set representing the possible outcomes of a SQL boolean expression + #[derive(PartialEq, Eq, Clone, Debug)] + struct TernarySet: u8 { + const TRUE = 0b1; + const FALSE = 0b10; + const UNKNOWN = 0b100; + } +} + +impl TernarySet { + /// Returns the set of possible values after applying the `is true` test on all + /// values in this set. + /// The resulting set can only contain 'TRUE' and/or 'FALSE', never 'UNKNOWN'. + fn is_true(&self) -> Self { + let mut is_true = Self::empty(); + if self.contains(Self::TRUE) { + is_true.toggle(Self::TRUE); + } + if self.intersects(Self::UNKNOWN | Self::FALSE) { + is_true.toggle(Self::FALSE); + } + is_true + } + + /// Returns the set of possible values after applying the `is false` test on all + /// values in this set. + /// The resulting set can only contain 'TRUE' and/or 'FALSE', never 'UNKNOWN'. + fn is_false(&self) -> Self { + let mut is_false = Self::empty(); + if self.contains(Self::FALSE) { + is_false.toggle(Self::TRUE); + } + if self.intersects(Self::UNKNOWN | Self::TRUE) { + is_false.toggle(Self::FALSE); + } + is_false + } + + /// Returns the set of possible values after applying the `is unknown` test on all + /// values in this set. + /// The resulting set can only contain 'TRUE' and/or 'FALSE', never 'UNKNOWN'. + fn is_unknown(&self) -> Self { + let mut is_unknown = Self::empty(); + if self.contains(Self::UNKNOWN) { + is_unknown.toggle(Self::TRUE); + } + if self.intersects(Self::TRUE | Self::FALSE) { + is_unknown.toggle(Self::FALSE); + } + is_unknown + } + + /// Returns the set of possible values after applying SQL three-valued logical NOT + /// on each value in `value`. + /// + /// This method uses the following truth table. + /// + /// ```text + /// A | ¬A + /// ----|---- + /// F | T + /// U | U + /// T | F + /// ``` + fn not(set: &Self) -> Self { + let mut not = Self::empty(); + if set.contains(Self::TRUE) { + not.toggle(Self::FALSE); + } + if set.contains(Self::FALSE) { + not.toggle(Self::TRUE); + } + if set.contains(Self::UNKNOWN) { + not.toggle(Self::UNKNOWN); + } + not + } + + /// Returns the set of possible values after applying SQL three-valued logical AND + /// on each combination of values from `lhs` and `rhs`. + /// + /// This method uses the following truth table. + /// + /// ```text + /// A ∧ B │ F U T + /// ──────┼────── + /// F │ F F F + /// U │ F U U + /// T │ F U T + /// ``` + fn and(lhs: &Self, rhs: &Self) -> Self { + if lhs.is_empty() || rhs.is_empty() { + return Self::empty(); + } + + let mut and = Self::empty(); + if lhs.contains(Self::FALSE) || rhs.contains(Self::FALSE) { + and.toggle(Self::FALSE); + } + + if (lhs.contains(Self::UNKNOWN) && rhs.intersects(Self::TRUE | Self::UNKNOWN)) + || (rhs.contains(Self::UNKNOWN) && lhs.intersects(Self::TRUE | Self::UNKNOWN)) + { + and.toggle(Self::UNKNOWN); + } + + if lhs.contains(Self::TRUE) && rhs.contains(Self::TRUE) { + and.toggle(Self::TRUE); + } + + and + } + + /// Returns the set of possible values after applying SQL three-valued logical OR + /// on each combination of values from `lhs` and `rhs`. + /// + /// This method uses the following truth table. + /// + /// ```text + /// A ∨ B │ F U T + /// ──────┼────── + /// F │ F U T + /// U │ U U T + /// T │ T T T + /// ``` + fn or(lhs: &Self, rhs: &Self) -> Self { + let mut or = Self::empty(); + if lhs.contains(Self::TRUE) || rhs.contains(Self::TRUE) { + or.toggle(Self::TRUE); + } + + if (lhs.contains(Self::UNKNOWN) && rhs.intersects(Self::FALSE | Self::UNKNOWN)) + || (rhs.contains(Self::UNKNOWN) + && lhs.intersects(Self::FALSE | Self::UNKNOWN)) + { + or.toggle(Self::UNKNOWN); + } + + if lhs.contains(Self::FALSE) && rhs.contains(Self::FALSE) { + or.toggle(Self::FALSE); + } + + or + } +} + +impl TryFrom<&ScalarValue> for TernarySet { + type Error = DataFusionError; + + fn try_from(value: &ScalarValue) -> Result<Self> { + Ok(match value { + ScalarValue::Null => TernarySet::UNKNOWN, + ScalarValue::Boolean(b) => match b { + Some(true) => TernarySet::TRUE, + Some(false) => TernarySet::FALSE, + None => TernarySet::UNKNOWN, + }, + _ => { + let b = value.cast_to(&DataType::Boolean)?; + Self::try_from(&b)? + } + }) + } +} + +/// Computes the output interval for the given boolean expression based on statically +/// available information. +/// +/// # Arguments +/// +/// * `predicate` - The boolean expression to analyze +/// * `is_null` - A callback function that provides additional nullability information for +/// expressions. When called with an expression, it should return: +/// - `Some(true)` if the expression is known to evaluate to NULL +/// - `Some(false)` if the expression is known to NOT evaluate to NULL +/// - `None` if the nullability cannot be determined +/// +/// This callback allows the caller to provide context-specific knowledge about expression +/// nullability that cannot be determined from the schema alone. For example, it can be used +/// to indicate that a particular column reference is known to be NULL in a specific context, +/// or that certain expressions will never be NULL based on runtime constraints. +/// +/// * `input_schema` - Schema information for resolving expression types and nullability +/// +/// # Return Value +/// +/// The function returns a [NullableInterval] that describes the possible boolean values the +/// predicate can evaluate to. The return value will be one of the following: +/// +/// * `NullableInterval::NotNull { values: Interval::CERTAINLY_TRUE }` - The predicate will +/// always evaluate to TRUE (never FALSE or NULL) +/// +/// * `NullableInterval::NotNull { values: Interval::CERTAINLY_FALSE }` - The predicate will +/// always evaluate to FALSE (never TRUE or NULL) +/// +/// * `NullableInterval::NotNull { values: Interval::UNCERTAIN }` - The predicate will never +/// evaluate to NULL, but may be either TRUE or FALSE +/// +/// * `NullableInterval::Null { datatype: DataType::Boolean }` - The predicate will always +/// evaluate to NULL (SQL UNKNOWN in three-valued logic) +/// +/// * `NullableInterval::MaybeNull { values: Interval::CERTAINLY_TRUE }` - The predicate may +/// evaluate to TRUE or NULL, but never FALSE +/// +/// * `NullableInterval::MaybeNull { values: Interval::CERTAINLY_FALSE }` - The predicate may +/// evaluate to FALSE or NULL, but never TRUE +/// +/// * `NullableInterval::MaybeNull { values: Interval::UNCERTAIN }` - The predicate may +/// evaluate to any of TRUE, FALSE, or NULL +/// +pub(super) fn evaluate_bounds( + predicate: &Expr, + certainly_null_expr: Option<&Expr>, + input_schema: &dyn ExprSchema, +) -> Result<NullableInterval> { + let evaluator = PredicateBoundsEvaluator { Review Comment: Is your idea that you might be able to use the `NullableInterval` code instead of this (now that you have fixed the bugs?) ########## datafusion/expr/src/expr_fn.rs: ########## @@ -341,6 +341,11 @@ pub fn is_null(expr: Expr) -> Expr { Expr::IsNull(Box::new(expr)) } +/// Create is not null expression Review Comment: this is a nice drive by cleanup ########## datafusion/physical-expr/src/expressions/case.rs: ########## @@ -1394,6 +1437,51 @@ impl PhysicalExpr for CaseExpr { } } +/// Attempts to const evaluate the given `predicate`. +/// Returns: +/// - `Some(true)` if the predicate evaluates to a truthy value. +/// - `Some(false)` if the predicate evaluates to a falsy value. +/// - `None` if the predicate could not be evaluated. +fn evaluate_predicate(predicate: &Arc<dyn PhysicalExpr>) -> Result<Option<bool>> { Review Comment: as a follow on, I feel like this would be a more generally useful function as it is not specific to case. Maybe we could add it in expression utils or something: https://github.com/apache/datafusion/blob/149d3e9533acadfb92329f5445d7deb72f6f679c/datafusion/physical-expr/src/utils/mod.rs#L268-L267 ########## datafusion/expr/Cargo.toml: ########## @@ -48,6 +48,7 @@ sql = ["sqlparser"] [dependencies] arrow = { workspace = true } async-trait = { workspace = true } +bitflags = "2.0.0" Review Comment: This is already a dependency of arrow-schema https://github.com/apache/arrow-rs/blob/a0db1985c3a0f3190cfc5166b428933a28c740f9/arrow-schema/Cargo.toml#L43 Thus while this is a new explicit dependency it is not a new-new dependency and thus I think it is fine to add ########## datafusion/expr/src/expr_schema.rs: ########## @@ -282,14 +282,53 @@ impl ExprSchemable for Expr { Expr::OuterReferenceColumn(field, _) => Ok(field.is_nullable()), Expr::Literal(value, _) => Ok(value.is_null()), Expr::Case(case) => { - // This expression is nullable if any of the input expressions are nullable - let then_nullable = case + let nullable_then = case .when_then_expr .iter() - .map(|(_, t)| t.nullable(input_schema)) - .collect::<Result<Vec<_>>>()?; - if then_nullable.contains(&true) { - Ok(true) + .filter_map(|(w, t)| { Review Comment: This code is clear and easy to follow. Very nice ########## datafusion/physical-expr/src/expressions/case.rs: ########## @@ -1282,15 +1283,57 @@ impl PhysicalExpr for CaseExpr { } fn nullable(&self, input_schema: &Schema) -> Result<bool> { - // this expression is nullable if any of the input expressions are nullable - let then_nullable = self + let nullable_then = self .body .when_then_expr .iter() - .map(|(_, t)| t.nullable(input_schema)) - .collect::<Result<Vec<_>>>()?; - if then_nullable.contains(&true) { - Ok(true) + .filter_map(|(w, t)| { + let is_nullable = match t.nullable(input_schema) { + // Pass on error determining nullability verbatim + Err(e) => return Some(Err(e)), + Ok(n) => n, + }; + + // Branches with a then expression that is not nullable do not impact the + // nullability of the case expression. + if !is_nullable { + return None; + } + + // For case-with-expression assume all 'then' expressions are reachable + if self.body.expr.is_some() { + return Some(Ok(())); + } + + // For branches with a nullable 'then' expression, try to determine Review Comment: Also I found another related part of the code here: https://github.com/apache/datafusion/blob/488b024aa3d22a63fb159157ac165682a627d1f5/datafusion/optimizer/src/simplify_expressions/guarantees.rs#L30-L44 This code seems to be simplifying the predicates based on nullability information. I am not sure if we can reuse the other parts ########## datafusion/core/src/physical_planner.rs: ########## @@ -2533,6 +2536,33 @@ impl<'a> OptimizationInvariantChecker<'a> { } } +/// Checks if the change from `old` schema to `new` is allowed or not. +/// The current implementation only allows nullability of individual fields to change +/// from 'nullable' to 'not nullable'. Review Comment: I think adding rationale about **why** would be helpful here as it may not be immediately obvious ```suggestion /// from 'nullable' to 'not nullable' (that is that the physical expression knows /// more about its null-ness than its logical counterpart) ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
