alamb commented on code in PR #19722:
URL: https://github.com/apache/datafusion/pull/19722#discussion_r2718750458


##########
datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs:
##########
@@ -1969,12 +1972,101 @@ impl TreeNodeRewriter for Simplifier<'_> {
                 }))
             }
 
+            // =======================================
+            // preimage_in_comparison
+            // =======================================
+            //
+            // For case:
+            // date_part('YEAR', expr) op literal
+            //
+            // Background:
+            // Datasources such as Parquet can prune partitions using simple 
predicates,
+            // but they cannot do so for complex expressions.
+            // For a complex predicate like `date_part('YEAR', c1) < 2000`, 
pruning is not possible.
+            // After rewriting it to `c1 < 2000-01-01`, pruning becomes 
feasible.
+            // Rewrites use inclusive lower and exclusive upper bounds when
+            // translating an equality into a range.
+            // NOTE: we only consider immutable UDFs with literal RHS values 
and
+            // UDFs that provide both `preimage` and `column_expr`.
+            Expr::BinaryExpr(BinaryExpr { left, op, right }) => {
+                use datafusion_expr::Operator::*;
+                let is_preimage_op = matches!(
+                    op,
+                    Eq | NotEq
+                        | Lt
+                        | LtEq
+                        | Gt
+                        | GtEq
+                        | IsDistinctFrom
+                        | IsNotDistinctFrom
+                );
+                if !is_preimage_op {
+                    return Ok(Transformed::no(Expr::BinaryExpr(BinaryExpr {
+                        left,
+                        op,
+                        right,
+                    })));
+                }
+
+                if let (Some(interval), Some(col_expr)) =
+                    get_preimage(left.as_ref(), right.as_ref(), info)?
+                {
+                    rewrite_with_preimage(info, interval, op, 
Box::new(col_expr))?
+                } else if let Some(swapped) = op.swap() {
+                    if let (Some(interval), Some(col_expr)) =
+                        get_preimage(right.as_ref(), left.as_ref(), info)?
+                    {
+                        rewrite_with_preimage(
+                            info,
+                            interval,
+                            swapped,
+                            Box::new(col_expr),
+                        )?
+                    } else {
+                        Transformed::no(Expr::BinaryExpr(BinaryExpr { left, 
op, right }))
+                    }
+                } else {
+                    Transformed::no(Expr::BinaryExpr(BinaryExpr { left, op, 
right }))
+                }
+            }
+
             // no additional rewrites possible
             expr => Transformed::no(expr),
         })
     }
 }
 
+fn get_preimage(
+    left_expr: &Expr,
+    right_expr: &Expr,
+    info: &SimplifyContext,
+) -> Result<(Option<Interval>, Option<Expr>)> {
+    let Expr::ScalarFunction(ScalarFunction { func, args }) = left_expr else {
+        return Ok((None, None));
+    };
+    if !is_literal_or_literal_cast(right_expr) {

Review Comment:
   I was thinking of something like extracting the year from a computed value. For 
example, if we had a table with a base date and an interval, it seems like we 
could take a predicate such as:
   
   ```sql
   WHERE 2025 = date_part(YEAR, t.base_date + t.interval)
   ```
   
   and rewrite it to:
   ```sql
   WHERE (t.base_date + t.interval) >= '2025-01-01'
     AND (t.base_date + t.interval) < '2026-01-01'
   ```
   
   However, I agree there is a tradeoff in this case: the rewritten predicate 
might actually be worse (take longer to evaluate) 🤔 
   
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to