This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new e566329707 Improve Canonicalize API (#8983)
e566329707 is described below
commit e566329707e910ce6f8a32822dbafef993a6faaa
Author: Andrew Lamb <[email protected]>
AuthorDate: Thu Feb 1 12:05:36 2024 -0500
Improve Canonicalize API (#8983)
* Improve Canonicalize API
* Update datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
Co-authored-by: Junhao Liu <[email protected]>
* Simplify the API
* fix comment
* add more flavor in comments
---------
Co-authored-by: Junhao Liu <[email protected]>
---
.../src/simplify_expressions/expr_simplifier.rs | 73 +++++++++++++++++++---
.../src/simplify_expressions/simplify_exprs.rs | 55 ++++++++--------
2 files changed, 91 insertions(+), 37 deletions(-)
diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
index 1c12289491..30140101df 100644
--- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
+++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
@@ -56,6 +56,9 @@ pub struct ExprSimplifier<S> {
/// Guarantees about the values of columns. This is provided by the user
/// in [ExprSimplifier::with_guarantees()].
guarantees: Vec<(Expr, NullableInterval)>,
+ /// Should expressions be canonicalized before simplification? Defaults to
+ /// true
+ canonicalize: bool,
}
pub const THRESHOLD_INLINE_INLIST: usize = 3;
@@ -70,6 +73,7 @@ impl<S: SimplifyInfo> ExprSimplifier<S> {
Self {
info,
guarantees: vec![],
+ canonicalize: true,
}
}
@@ -137,6 +141,12 @@ impl<S: SimplifyInfo> ExprSimplifier<S> {
let mut inlist_simplifier = InListSimplifier::new();
let mut guarantee_rewriter = GuaranteeRewriter::new(&self.guarantees);
+ let expr = if self.canonicalize {
+ expr.rewrite(&mut Canonicalizer::new())?
+ } else {
+ expr
+ };
+
// TODO iterate until no changes are made during rewrite
// (evaluating constants can enable new simplifications and
// simplifications can enable new constant evaluation)
@@ -151,10 +161,6 @@ impl<S: SimplifyInfo> ExprSimplifier<S> {
.rewrite(&mut simplifier)
}
- pub fn canonicalize(&self, expr: Expr) -> Result<Expr> {
- let mut canonicalizer = Canonicalizer::new();
- expr.rewrite(&mut canonicalizer)
- }
/// Apply type coercion to an [`Expr`] so that it can be
/// evaluated as a
[`PhysicalExpr`](datafusion_physical_expr::PhysicalExpr).
///
@@ -229,6 +235,60 @@ impl<S: SimplifyInfo> ExprSimplifier<S> {
self.guarantees = guarantees;
self
}
+
+ /// Should [`Canonicalizer`] be applied before simplification?
+ ///
+ /// If true (the default), the expression will be rewritten to canonical
+ /// form before simplification. This is useful to ensure that the
simplifier
+ /// can apply all possible simplifications.
+ ///
+ /// Some expressions, such as those in some Joins, can not be canonicalized
+ /// without changing their meaning. In these cases, canonicalization should
+ /// be disabled.
+ ///
+ /// ```rust
+ /// use arrow::datatypes::{DataType, Field, Schema};
+ /// use datafusion_expr::{col, lit, Expr};
+ /// use datafusion_expr::interval_arithmetic::{Interval, NullableInterval};
+ /// use datafusion_common::{Result, ScalarValue, ToDFSchema};
+ /// use datafusion_physical_expr::execution_props::ExecutionProps;
+ /// use datafusion_optimizer::simplify_expressions::{
+ /// ExprSimplifier, SimplifyContext};
+ ///
+ /// let schema = Schema::new(vec![
+ /// Field::new("a", DataType::Int64, false),
+ /// Field::new("b", DataType::Int64, false),
+ /// Field::new("c", DataType::Int64, false),
+ /// ])
+ /// .to_dfschema_ref().unwrap();
+ ///
+ /// // Create the simplifier
+ /// let props = ExecutionProps::new();
+ /// let context = SimplifyContext::new(&props)
+ /// .with_schema(schema);
+ /// let simplifier = ExprSimplifier::new(context);
+ ///
+ /// // Expression: a = c AND 1 = b
+ /// let expr = col("a").eq(col("c")).and(lit(1).eq(col("b")));
+ ///
+ /// // With canonicalization, the expression is rewritten to canonical form
+ /// // (though it is no simpler in this case):
+ /// let canonical = simplifier.simplify(expr.clone()).unwrap();
+ /// // Expression has been rewritten to: (c = a AND b = 1)
+ /// assert_eq!(canonical, col("c").eq(col("a")).and(col("b").eq(lit(1))));
+ ///
+ /// // If canonicalization is disabled, the expression is not changed
+ /// let non_canonicalized = simplifier
+ /// .with_canonicalize(false)
+ /// .simplify(expr.clone())
+ /// .unwrap();
+ ///
+ /// assert_eq!(non_canonicalized, expr);
+ /// ```
+ pub fn with_canonicalize(mut self, canonicalize: bool) -> Self {
+ self.canonicalize = canonicalize;
+ self
+ }
}
/// Canonicalize any BinaryExprs that are not in canonical form
@@ -236,7 +296,7 @@ impl<S: SimplifyInfo> ExprSimplifier<S> {
/// `<literal> <op> <col>` is rewritten to `<col> <op> <literal>`
///
/// `<col1> <op> <col2>` is rewritten so that the name of `col1` sorts higher
-/// than `col2` (`b > a` would be canonicalized to `a < b`)
+/// than `col2` (`a > b` would be canonicalized to `b < a`)
struct Canonicalizer {}
impl Canonicalizer {
@@ -2889,8 +2949,7 @@ mod tests {
let simplifier = ExprSimplifier::new(
SimplifyContext::new(&execution_props).with_schema(schema),
);
- let cano = simplifier.canonicalize(expr)?;
- simplifier.simplify(cano)
+ simplifier.simplify(expr)
}
fn simplify(expr: Expr) -> Expr {
diff --git a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs
b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs
index d68474dcde..f36cd8f838 100644
--- a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs
+++ b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs
@@ -85,44 +85,39 @@ impl SimplifyExpressions {
};
let info = SimplifyContext::new(execution_props).with_schema(schema);
- let simplifier = ExprSimplifier::new(info);
-
let new_inputs = plan
.inputs()
.iter()
.map(|input| Self::optimize_internal(input, execution_props))
.collect::<Result<Vec<_>>>()?;
- let expr = match plan {
- // Canonicalize step won't reorder expressions in a Join on clause.
- // The left and right expressions in a Join on clause are not
commutative,
- // since the order of the columns must match the order of the
children.
- LogicalPlan::Join(_) => {
- plan.expressions()
- .into_iter()
- .map(|e| {
- // TODO: unify with `rewrite_preserving_name`
- let original_name = e.name_for_alias()?;
- let new_e = simplifier.simplify(e)?;
- new_e.alias_if_changed(original_name)
- })
- .collect::<Result<Vec<_>>>()?
- }
- _ => {
- plan.expressions()
- .into_iter()
- .map(|e| {
- // TODO: unify with `rewrite_preserving_name`
- let original_name = e.name_for_alias()?;
- let cano_e = simplifier.canonicalize(e)?;
- let new_e = simplifier.simplify(cano_e)?;
- new_e.alias_if_changed(original_name)
- })
- .collect::<Result<Vec<_>>>()?
- }
+ let simplifier = ExprSimplifier::new(info);
+
+ // The left and right expressions in a Join on clause are not
+ // commutative, for reasons that are not entirely clear. Thus, do not
+ // reorder expressions in Join while simplifying.
+ //
+ // This is likely related to the fact that order of the columns must
+ // match the order of the children. see
+ // https://github.com/apache/arrow-datafusion/pull/8780 for more
details
+ let simplifier = if let LogicalPlan::Join(_) = plan {
+ simplifier.with_canonicalize(false)
+ } else {
+ simplifier
};
- plan.with_new_exprs(expr, new_inputs)
+ let exprs = plan
+ .expressions()
+ .into_iter()
+ .map(|e| {
+ // TODO: unify with `rewrite_preserving_name`
+ let original_name = e.name_for_alias()?;
+ let new_e = simplifier.simplify(e)?;
+ new_e.alias_if_changed(original_name)
+ })
+ .collect::<Result<Vec<_>>>()?;
+
+ plan.with_new_exprs(exprs, new_inputs)
}
}