This is an automated email from the ASF dual-hosted git repository.

jayzhan pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 57a99e4d37 Coerce and simplify FixedSizeBinary equality to literal 
binary (#15726)
57a99e4d37 is described below

commit 57a99e4d373c68550e2d9938ad2c857e034333a7
Author: Leonardo Yvens <[email protected]>
AuthorDate: Fri Apr 18 02:16:02 2025 +0100

    Coerce and simplify FixedSizeBinary equality to literal binary (#15726)
    
    * coerce FixedSizeBinary to Binary
    
    * simplify FixedSizeBytes equality to literal
    
    * fix clippy
    
    * remove redundant ExprSimplifier case
    
    * Add explain test to make sure unwrapping is working correctly
    
    ---------
    
    Co-authored-by: Andrew Lamb <[email protected]>
---
 datafusion/expr-common/src/type_coercion/binary.rs |  4 +++
 .../src/simplify_expressions/expr_simplifier.rs    | 38 ++++++++++++++++++++++
 .../src/simplify_expressions/unwrap_cast.rs        | 29 +++++++++++++++++
 datafusion/sqllogictest/test_files/binary.slt      | 30 ++++++++++++++++-
 4 files changed, 100 insertions(+), 1 deletion(-)

diff --git a/datafusion/expr-common/src/type_coercion/binary.rs 
b/datafusion/expr-common/src/type_coercion/binary.rs
index fdc61cd665..fdee00f81b 100644
--- a/datafusion/expr-common/src/type_coercion/binary.rs
+++ b/datafusion/expr-common/src/type_coercion/binary.rs
@@ -1297,6 +1297,10 @@ fn binary_coercion(lhs_type: &DataType, rhs_type: 
&DataType) -> Option<DataType>
             Some(LargeBinary)
         }
         (Binary, Utf8) | (Utf8, Binary) => Some(Binary),
+
+        // Cast FixedSizeBinary to Binary
+        (FixedSizeBinary(_), Binary) | (Binary, FixedSizeBinary(_)) => 
Some(Binary),
+
         _ => None,
     }
 }
diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs 
b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
index 8e25bb7534..867363735c 100644
--- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
+++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
@@ -3310,6 +3310,15 @@ mod tests {
         simplifier.simplify(expr)
     }
 
+    fn coerce(expr: Expr) -> Expr {
+        let schema = expr_test_schema();
+        let execution_props = ExecutionProps::new();
+        let simplifier = ExprSimplifier::new(
+            
SimplifyContext::new(&execution_props).with_schema(Arc::clone(&schema)),
+        );
+        simplifier.coerce(expr, schema.as_ref()).unwrap()
+    }
+
     fn simplify(expr: Expr) -> Expr {
         try_simplify(expr).unwrap()
     }
@@ -3352,6 +3361,7 @@ mod tests {
                     Field::new("c2_non_null", DataType::Boolean, false),
                     Field::new("c3_non_null", DataType::Int64, false),
                     Field::new("c4_non_null", DataType::UInt32, false),
+                    Field::new("c5", DataType::FixedSizeBinary(3), true),
                 ]
                 .into(),
                 HashMap::new(),
@@ -4475,6 +4485,34 @@ mod tests {
         }
     }
 
+    #[test]
+    fn simplify_fixed_size_binary_eq_lit() {
+        let bytes = [1u8, 2, 3].as_slice();
+
+        // The expression starts simple.
+        let expr = col("c5").eq(lit(bytes));
+
+        // The type coercer introduces a cast.
+        let coerced = coerce(expr.clone());
+        let schema = expr_test_schema();
+        assert_eq!(
+            coerced,
+            col("c5")
+                .cast_to(&DataType::Binary, schema.as_ref())
+                .unwrap()
+                .eq(lit(bytes))
+        );
+
+        // The simplifier removes the cast.
+        assert_eq!(
+            simplify(coerced),
+            col("c5").eq(Expr::Literal(ScalarValue::FixedSizeBinary(
+                3,
+                Some(bytes.to_vec()),
+            )))
+        );
+    }
+
     fn if_not_null(expr: Expr, then: bool) -> Expr {
         Expr::Case(Case {
             expr: Some(expr.is_not_null().into()),
diff --git a/datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs 
b/datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs
index be71a8cd19..37116018cd 100644
--- a/datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs
+++ b/datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs
@@ -197,6 +197,7 @@ fn is_supported_type(data_type: &DataType) -> bool {
     is_supported_numeric_type(data_type)
         || is_supported_string_type(data_type)
         || is_supported_dictionary_type(data_type)
+        || is_supported_binary_type(data_type)
 }
 
 /// Returns true if unwrap_cast_in_comparison support this numeric type
@@ -230,6 +231,10 @@ fn is_supported_dictionary_type(data_type: &DataType) -> 
bool {
                     DataType::Dictionary(_, inner) if is_supported_type(inner))
 }
 
+fn is_supported_binary_type(data_type: &DataType) -> bool {
+    matches!(data_type, DataType::Binary | DataType::FixedSizeBinary(_))
+}
+
 ///// Tries to move a cast from an expression (such as column) to the literal 
other side of a comparison operator./
 ///
 /// Specifically, rewrites
@@ -292,6 +297,7 @@ pub(super) fn try_cast_literal_to_type(
     try_cast_numeric_literal(lit_value, target_type)
         .or_else(|| try_cast_string_literal(lit_value, target_type))
         .or_else(|| try_cast_dictionary(lit_value, target_type))
+        .or_else(|| try_cast_binary(lit_value, target_type))
 }
 
 /// Convert a numeric value from one numeric data type to another
@@ -501,6 +507,20 @@ fn cast_between_timestamp(from: &DataType, to: &DataType, 
value: i128) -> Option
     }
 }
 
+fn try_cast_binary(
+    lit_value: &ScalarValue,
+    target_type: &DataType,
+) -> Option<ScalarValue> {
+    match (lit_value, target_type) {
+        (ScalarValue::Binary(Some(v)), DataType::FixedSizeBinary(n))
+            if v.len() == *n as usize =>
+        {
+            Some(ScalarValue::FixedSizeBinary(*n, Some(v.clone())))
+        }
+        _ => None,
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -1450,4 +1470,13 @@ mod tests {
             )
         }
     }
+
+    #[test]
+    fn try_cast_to_fixed_size_binary() {
+        expect_cast(
+            ScalarValue::Binary(Some(vec![1, 2, 3])),
+            DataType::FixedSizeBinary(3),
+            ExpectedCast::Value(ScalarValue::FixedSizeBinary(3, Some(vec![1, 
2, 3]))),
+        )
+    }
 }
diff --git a/datafusion/sqllogictest/test_files/binary.slt 
b/datafusion/sqllogictest/test_files/binary.slt
index 5c5f9d510e..5ac13779ac 100644
--- a/datafusion/sqllogictest/test_files/binary.slt
+++ b/datafusion/sqllogictest/test_files/binary.slt
@@ -147,8 +147,36 @@ query error DataFusion error: Error during planning: 
Cannot infer common argumen
 SELECT column1, column1 = arrow_cast(X'0102', 'FixedSizeBinary(2)') FROM t
 
 # Comparison to different sized Binary
-query error DataFusion error: Error during planning: Cannot infer common 
argument type for comparison operation FixedSizeBinary\(3\) = Binary
+query ?B
 SELECT column1, column1 = X'0102' FROM t
+----
+000102 false
+003102 false
+NULL NULL
+ff0102 false
+000102 false
+
+query ?B
+SELECT column1, column1 = X'000102' FROM t
+----
+000102 true
+003102 false
+NULL NULL
+ff0102 false
+000102 true
+
+# Plan should not have a cast of the column (should have casted the literal
+# to FixedSizeBinary as that is much faster)
+
+query TT
+explain SELECT column1, column1 = X'000102' FROM t
+----
+logical_plan
+01)Projection: t.column1, t.column1 = FixedSizeBinary(3, "0,1,2") AS t.column1 
= Binary("0,1,2")
+02)--TableScan: t projection=[column1]
+physical_plan
+01)ProjectionExec: expr=[column1@0 as column1, column1@0 = 000102 as t.column1 
= Binary("0,1,2")]
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
 
 statement ok
 drop table t_source


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to