This is an automated email from the ASF dual-hosted git repository.
jayzhan pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 57a99e4d37 Coerce and simplify FixedSizeBinary equality to literal
binary (#15726)
57a99e4d37 is described below
commit 57a99e4d373c68550e2d9938ad2c857e034333a7
Author: Leonardo Yvens <[email protected]>
AuthorDate: Fri Apr 18 02:16:02 2025 +0100
Coerce and simplify FixedSizeBinary equality to literal binary (#15726)
* coerce FixedSizeBinary to Binary
* simplify FixedSizeBytes equality to literal
* fix clippy
* remove redundant ExprSimplifier case
* Add explain test to make sure unwrapping is working correctly
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
datafusion/expr-common/src/type_coercion/binary.rs | 4 +++
.../src/simplify_expressions/expr_simplifier.rs | 38 ++++++++++++++++++++++
.../src/simplify_expressions/unwrap_cast.rs | 29 +++++++++++++++++
datafusion/sqllogictest/test_files/binary.slt | 30 ++++++++++++++++-
4 files changed, 100 insertions(+), 1 deletion(-)
diff --git a/datafusion/expr-common/src/type_coercion/binary.rs
b/datafusion/expr-common/src/type_coercion/binary.rs
index fdc61cd665..fdee00f81b 100644
--- a/datafusion/expr-common/src/type_coercion/binary.rs
+++ b/datafusion/expr-common/src/type_coercion/binary.rs
@@ -1297,6 +1297,10 @@ fn binary_coercion(lhs_type: &DataType, rhs_type:
&DataType) -> Option<DataType>
Some(LargeBinary)
}
(Binary, Utf8) | (Utf8, Binary) => Some(Binary),
+
+ // Cast FixedSizeBinary to Binary
+ (FixedSizeBinary(_), Binary) | (Binary, FixedSizeBinary(_)) =>
Some(Binary),
+
_ => None,
}
}
diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
index 8e25bb7534..867363735c 100644
--- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
+++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
@@ -3310,6 +3310,15 @@ mod tests {
simplifier.simplify(expr)
}
+ fn coerce(expr: Expr) -> Expr {
+ let schema = expr_test_schema();
+ let execution_props = ExecutionProps::new();
+ let simplifier = ExprSimplifier::new(
+
SimplifyContext::new(&execution_props).with_schema(Arc::clone(&schema)),
+ );
+ simplifier.coerce(expr, schema.as_ref()).unwrap()
+ }
+
fn simplify(expr: Expr) -> Expr {
try_simplify(expr).unwrap()
}
@@ -3352,6 +3361,7 @@ mod tests {
Field::new("c2_non_null", DataType::Boolean, false),
Field::new("c3_non_null", DataType::Int64, false),
Field::new("c4_non_null", DataType::UInt32, false),
+ Field::new("c5", DataType::FixedSizeBinary(3), true),
]
.into(),
HashMap::new(),
@@ -4475,6 +4485,34 @@ mod tests {
}
}
+ #[test]
+ fn simplify_fixed_size_binary_eq_lit() {
+ let bytes = [1u8, 2, 3].as_slice();
+
+ // The expression starts simple.
+ let expr = col("c5").eq(lit(bytes));
+
+ // The type coercer introduces a cast.
+ let coerced = coerce(expr.clone());
+ let schema = expr_test_schema();
+ assert_eq!(
+ coerced,
+ col("c5")
+ .cast_to(&DataType::Binary, schema.as_ref())
+ .unwrap()
+ .eq(lit(bytes))
+ );
+
+ // The simplifier removes the cast.
+ assert_eq!(
+ simplify(coerced),
+ col("c5").eq(Expr::Literal(ScalarValue::FixedSizeBinary(
+ 3,
+ Some(bytes.to_vec()),
+ )))
+ );
+ }
+
fn if_not_null(expr: Expr, then: bool) -> Expr {
Expr::Case(Case {
expr: Some(expr.is_not_null().into()),
diff --git a/datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs
b/datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs
index be71a8cd19..37116018cd 100644
--- a/datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs
+++ b/datafusion/optimizer/src/simplify_expressions/unwrap_cast.rs
@@ -197,6 +197,7 @@ fn is_supported_type(data_type: &DataType) -> bool {
is_supported_numeric_type(data_type)
|| is_supported_string_type(data_type)
|| is_supported_dictionary_type(data_type)
+ || is_supported_binary_type(data_type)
}
/// Returns true if unwrap_cast_in_comparison support this numeric type
@@ -230,6 +231,10 @@ fn is_supported_dictionary_type(data_type: &DataType) ->
bool {
DataType::Dictionary(_, inner) if is_supported_type(inner))
}
+fn is_supported_binary_type(data_type: &DataType) -> bool {
+ matches!(data_type, DataType::Binary | DataType::FixedSizeBinary(_))
+}
+
///// Tries to move a cast from an expression (such as column) to the literal
other side of a comparison operator./
///
/// Specifically, rewrites
@@ -292,6 +297,7 @@ pub(super) fn try_cast_literal_to_type(
try_cast_numeric_literal(lit_value, target_type)
.or_else(|| try_cast_string_literal(lit_value, target_type))
.or_else(|| try_cast_dictionary(lit_value, target_type))
+ .or_else(|| try_cast_binary(lit_value, target_type))
}
/// Convert a numeric value from one numeric data type to another
@@ -501,6 +507,20 @@ fn cast_between_timestamp(from: &DataType, to: &DataType,
value: i128) -> Option
}
}
+fn try_cast_binary(
+ lit_value: &ScalarValue,
+ target_type: &DataType,
+) -> Option<ScalarValue> {
+ match (lit_value, target_type) {
+ (ScalarValue::Binary(Some(v)), DataType::FixedSizeBinary(n))
+ if v.len() == *n as usize =>
+ {
+ Some(ScalarValue::FixedSizeBinary(*n, Some(v.clone())))
+ }
+ _ => None,
+ }
+}
+
#[cfg(test)]
mod tests {
use super::*;
@@ -1450,4 +1470,13 @@ mod tests {
)
}
}
+
+ #[test]
+ fn try_cast_to_fixed_size_binary() {
+ expect_cast(
+ ScalarValue::Binary(Some(vec![1, 2, 3])),
+ DataType::FixedSizeBinary(3),
+ ExpectedCast::Value(ScalarValue::FixedSizeBinary(3, Some(vec![1,
2, 3]))),
+ )
+ }
}
diff --git a/datafusion/sqllogictest/test_files/binary.slt
b/datafusion/sqllogictest/test_files/binary.slt
index 5c5f9d510e..5ac13779ac 100644
--- a/datafusion/sqllogictest/test_files/binary.slt
+++ b/datafusion/sqllogictest/test_files/binary.slt
@@ -147,8 +147,36 @@ query error DataFusion error: Error during planning:
Cannot infer common argumen
SELECT column1, column1 = arrow_cast(X'0102', 'FixedSizeBinary(2)') FROM t
# Comparison to different sized Binary
-query error DataFusion error: Error during planning: Cannot infer common
argument type for comparison operation FixedSizeBinary\(3\) = Binary
+query ?B
SELECT column1, column1 = X'0102' FROM t
+----
+000102 false
+003102 false
+NULL NULL
+ff0102 false
+000102 false
+
+query ?B
+SELECT column1, column1 = X'000102' FROM t
+----
+000102 true
+003102 false
+NULL NULL
+ff0102 false
+000102 true
+
+# Plan should not have a cast of the column (should have casted the literal
+# to FixedSizeBinary as that is much faster)
+
+query TT
+explain SELECT column1, column1 = X'000102' FROM t
+----
+logical_plan
+01)Projection: t.column1, t.column1 = FixedSizeBinary(3, "0,1,2") AS t.column1
= Binary("0,1,2")
+02)--TableScan: t projection=[column1]
+physical_plan
+01)ProjectionExec: expr=[column1@0 as column1, column1@0 = 000102 as t.column1
= Binary("0,1,2")]
+02)--DataSourceExec: partitions=1, partition_sizes=[1]
statement ok
drop table t_source
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]