This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 7772453f20 Support binary argumnets (via coercion) for `like` and 
`ilike` and string functions (#7840)
7772453f20 is described below

commit 7772453f205ec721b65691d664b8ec9b5cd7fd03
Author: Andrew Lamb <[email protected]>
AuthorDate: Wed Oct 18 17:23:33 2023 -0400

    Support binary argumnets (via coercion) for `like` and `ilike` and string 
functions (#7840)
    
    fixup
---
 datafusion/expr/src/built_in_function.rs       |  23 ++++-
 datafusion/expr/src/type_coercion/binary.rs    | 113 ++++++++++++++++++++++---
 datafusion/expr/src/type_coercion/functions.rs |  11 ++-
 datafusion/sqllogictest/test_files/binary.slt  |  41 ++++++---
 4 files changed, 159 insertions(+), 29 deletions(-)

diff --git a/datafusion/expr/src/built_in_function.rs 
b/datafusion/expr/src/built_in_function.rs
index f1af6e829b..fffdf74af3 100644
--- a/datafusion/expr/src/built_in_function.rs
+++ b/datafusion/expr/src/built_in_function.rs
@@ -1523,16 +1523,29 @@ impl FromStr for BuiltinScalarFunction {
     }
 }
 
+/// Creates a function that returns the return type of a string function given
+/// the type of its first argument.
+///
+/// If the input type is `LargeUtf8` or `LargeBinary` the return type is
+/// `$largeUtf8Type`,
+///
+/// If the input type is `Utf8` or `Binary` the return type is `$utf8Type`,
 macro_rules! make_utf8_to_return_type {
     ($FUNC:ident, $largeUtf8Type:expr, $utf8Type:expr) => {
         fn $FUNC(arg_type: &DataType, name: &str) -> Result<DataType> {
             Ok(match arg_type {
-                DataType::LargeUtf8 => $largeUtf8Type,
-                DataType::Utf8 => $utf8Type,
+                DataType::LargeUtf8  => $largeUtf8Type,
+                // LargeBinary inputs are automatically coerced to Utf8
+                DataType::LargeBinary => $largeUtf8Type,
+                DataType::Utf8  => $utf8Type,
+                // Binary inputs are automatically coerced to Utf8
+                DataType::Binary => $utf8Type,
                 DataType::Null => DataType::Null,
                 DataType::Dictionary(_, value_type) => match **value_type {
-                    DataType::LargeUtf8 => $largeUtf8Type,
+                    DataType::LargeUtf8  => $largeUtf8Type,
+                    DataType::LargeBinary => $largeUtf8Type,
                     DataType::Utf8 => $utf8Type,
+                    DataType::Binary => $utf8Type,
                     DataType::Null => DataType::Null,
                     _ => {
                         return plan_err!(
@@ -1553,8 +1566,10 @@ macro_rules! make_utf8_to_return_type {
         }
     };
 }
-
+// `utf8_to_str_type`: returns either a Utf8 or LargeUtf8 based on the input 
type size.
 make_utf8_to_return_type!(utf8_to_str_type, DataType::LargeUtf8, 
DataType::Utf8);
+
+// `utf8_to_str_type`: returns either a Int32 or Int64 based on the input type 
size.
 make_utf8_to_return_type!(utf8_to_int_type, DataType::Int64, DataType::Int32);
 
 fn utf8_or_binary_to_binary_type(arg_type: &DataType, name: &str) -> 
Result<DataType> {
diff --git a/datafusion/expr/src/type_coercion/binary.rs 
b/datafusion/expr/src/type_coercion/binary.rs
index ba3c21a15d..f94ce61460 100644
--- a/datafusion/expr/src/type_coercion/binary.rs
+++ b/datafusion/expr/src/type_coercion/binary.rs
@@ -29,7 +29,11 @@ use datafusion_common::{plan_err, DataFusionError};
 
 use crate::Operator;
 
-/// The type signature of an instantiation of binary expression
+/// The type signature of an instantiation of binary operator expression such 
as
+/// `lhs + rhs`
+///
+/// Note this is different than [`crate::signature::Signature`] which
+/// describes the type signature of a function.
 struct Signature {
     /// The type to coerce the left argument to
     lhs: DataType,
@@ -648,8 +652,9 @@ fn string_concat_internal_coercion(
     }
 }
 
-/// Coercion rules for Strings: the type that both lhs and rhs can be
-/// casted to for the purpose of a string computation
+/// Coercion rules for string types (Utf8/LargeUtf8): If at least on argument 
is
+/// a string type and both arguments can be coerced into a string type, coerce
+/// to string type.
 fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> 
Option<DataType> {
     use arrow::datatypes::DataType::*;
     match (lhs_type, rhs_type) {
@@ -665,8 +670,30 @@ fn string_coercion(lhs_type: &DataType, rhs_type: 
&DataType) -> Option<DataType>
     }
 }
 
-/// Coercion rules for Binaries: the type that both lhs and rhs can be
-/// casted to for the purpose of a computation
+/// Coercion rules for binary (Binary/LargeBinary) to string (Utf8/LargeUtf8):
+/// If one argument is binary and the other is a string then coerce to string
+/// (e.g. for `like`)
+fn binary_to_string_coercion(
+    lhs_type: &DataType,
+    rhs_type: &DataType,
+) -> Option<DataType> {
+    use arrow::datatypes::DataType::*;
+    match (lhs_type, rhs_type) {
+        (Binary, Utf8) => Some(Utf8),
+        (Binary, LargeUtf8) => Some(LargeUtf8),
+        (LargeBinary, Utf8) => Some(LargeUtf8),
+        (LargeBinary, LargeUtf8) => Some(LargeUtf8),
+        (Utf8, Binary) => Some(Utf8),
+        (Utf8, LargeBinary) => Some(LargeUtf8),
+        (LargeUtf8, Binary) => Some(LargeUtf8),
+        (LargeUtf8, LargeBinary) => Some(LargeUtf8),
+        _ => None,
+    }
+}
+
+/// Coercion rules for binary types (Binary/LargeBinary): If at least on 
argument is
+/// a binary type and both arguments can be coerced into a binary type, coerce
+/// to binary type.
 fn binary_coercion(lhs_type: &DataType, rhs_type: &DataType) -> 
Option<DataType> {
     use arrow::datatypes::DataType::*;
     match (lhs_type, rhs_type) {
@@ -681,6 +708,7 @@ fn binary_coercion(lhs_type: &DataType, rhs_type: 
&DataType) -> Option<DataType>
 /// This is a union of string coercion rules and dictionary coercion rules
 pub fn like_coercion(lhs_type: &DataType, rhs_type: &DataType) -> 
Option<DataType> {
     string_coercion(lhs_type, rhs_type)
+        .or_else(|| binary_to_string_coercion(lhs_type, rhs_type))
         .or_else(|| dictionary_coercion(lhs_type, rhs_type, false))
         .or_else(|| null_coercion(lhs_type, rhs_type))
 }
@@ -763,7 +791,7 @@ fn temporal_coercion(lhs_type: &DataType, rhs_type: 
&DataType) -> Option<DataTyp
     }
 }
 
-/// coercion rules from NULL type. Since NULL can be casted to most of types 
in arrow,
+/// coercion rules from NULL type. Since NULL can be casted to any other type 
in arrow,
 /// either lhs or rhs is NULL, if NULL can be casted to type of the other 
side, the coercion is valid.
 fn null_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> 
{
     match (lhs_type, rhs_type) {
@@ -917,11 +945,33 @@ mod tests {
         );
     }
 
+    /// Test coercion rules for binary operators
+    ///
+    /// Applies coercion rules for `$LHS_TYPE $OP $RHS_TYPE` and asserts that 
the
+    /// the result type is `$RESULT_TYPE`
     macro_rules! test_coercion_binary_rule {
-        ($A_TYPE:expr, $B_TYPE:expr, $OP:expr, $C_TYPE:expr) => {{
-            let (lhs, rhs) = get_input_types(&$A_TYPE, &$OP, &$B_TYPE)?;
-            assert_eq!(lhs, $C_TYPE);
-            assert_eq!(rhs, $C_TYPE);
+        ($LHS_TYPE:expr, $RHS_TYPE:expr, $OP:expr, $RESULT_TYPE:expr) => {{
+            let (lhs, rhs) = get_input_types(&$LHS_TYPE, &$OP, &$RHS_TYPE)?;
+            assert_eq!(lhs, $RESULT_TYPE);
+            assert_eq!(rhs, $RESULT_TYPE);
+        }};
+    }
+
+    /// Test coercion rules for like
+    ///
+    /// Applies coercion rules for both
+    /// * `$LHS_TYPE LIKE $RHS_TYPE`
+    /// * `$RHS_TYPE LIKE $LHS_TYPE`
+    ///
+    /// And asserts the result type is `$RESULT_TYPE`
+    macro_rules! test_like_rule {
+        ($LHS_TYPE:expr, $RHS_TYPE:expr, $RESULT_TYPE:expr) => {{
+            println!("Coercing {} LIKE {}", $LHS_TYPE, $RHS_TYPE);
+            let result = like_coercion(&$LHS_TYPE, &$RHS_TYPE);
+            assert_eq!(result, $RESULT_TYPE);
+            // reverse the order
+            let result = like_coercion(&$RHS_TYPE, &$LHS_TYPE);
+            assert_eq!(result, $RESULT_TYPE);
         }};
     }
 
@@ -948,11 +998,46 @@ mod tests {
     }
 
     #[test]
-    fn test_type_coercion() -> Result<()> {
-        // test like coercion rule
-        let result = like_coercion(&DataType::Utf8, &DataType::Utf8);
-        assert_eq!(result, Some(DataType::Utf8));
+    fn test_like_coercion() {
+        // string coerce to strings
+        test_like_rule!(DataType::Utf8, DataType::Utf8, Some(DataType::Utf8));
+        test_like_rule!(
+            DataType::LargeUtf8,
+            DataType::Utf8,
+            Some(DataType::LargeUtf8)
+        );
+        test_like_rule!(
+            DataType::Utf8,
+            DataType::LargeUtf8,
+            Some(DataType::LargeUtf8)
+        );
+        test_like_rule!(
+            DataType::LargeUtf8,
+            DataType::LargeUtf8,
+            Some(DataType::LargeUtf8)
+        );
+
+        // Also coerce binary to strings
+        test_like_rule!(DataType::Binary, DataType::Utf8, 
Some(DataType::Utf8));
+        test_like_rule!(
+            DataType::LargeBinary,
+            DataType::Utf8,
+            Some(DataType::LargeUtf8)
+        );
+        test_like_rule!(
+            DataType::Binary,
+            DataType::LargeUtf8,
+            Some(DataType::LargeUtf8)
+        );
+        test_like_rule!(
+            DataType::LargeBinary,
+            DataType::LargeUtf8,
+            Some(DataType::LargeUtf8)
+        );
+    }
 
+    #[test]
+    fn test_type_coercion() -> Result<()> {
         test_coercion_binary_rule!(
             DataType::Utf8,
             DataType::Date32,
diff --git a/datafusion/expr/src/type_coercion/functions.rs 
b/datafusion/expr/src/type_coercion/functions.rs
index b387667ad1..d55ed0c721 100644
--- a/datafusion/expr/src/type_coercion/functions.rs
+++ b/datafusion/expr/src/type_coercion/functions.rs
@@ -46,6 +46,8 @@ pub fn data_types(
         return Ok(current_types.to_vec());
     }
 
+    // Try and coerce the argument types to match the signature, returning the
+    // coerced types from the first matching signature.
     for valid_types in valid_types {
         if let Some(types) = maybe_data_types(&valid_types, current_types) {
             return Ok(types);
@@ -60,6 +62,7 @@ pub fn data_types(
     )
 }
 
+/// Returns a Vec of all possible valid argument types for the given signature.
 fn get_valid_types(
     signature: &TypeSignature,
     current_types: &[DataType],
@@ -104,7 +107,12 @@ fn get_valid_types(
     Ok(valid_types)
 }
 
-/// Try to coerce current_types into valid_types.
+/// Try to coerce the current argument types to match the given `valid_types`.
+///
+/// For example, if a function `func` accepts arguments of  `(int64, int64)`,
+/// but was called with `(int32, int64)`, this function could match the
+/// valid_types by by coercing the first argument to `int64`, and would return
+/// `Some([int64, int64])`.
 fn maybe_data_types(
     valid_types: &[DataType],
     current_types: &[DataType],
@@ -220,6 +228,7 @@ fn coerced_from<'a>(
             Some(type_into.clone())
         }
         Interval(_) if matches!(type_from, Utf8 | LargeUtf8) => 
Some(type_into.clone()),
+        // Any type can be coerced into strings
         Utf8 | LargeUtf8 => Some(type_into.clone()),
         Null if can_cast_types(type_from, type_into) => 
Some(type_into.clone()),
 
diff --git a/datafusion/sqllogictest/test_files/binary.slt 
b/datafusion/sqllogictest/test_files/binary.slt
index 38f8a2e14f..0568ada3ad 100644
--- a/datafusion/sqllogictest/test_files/binary.slt
+++ b/datafusion/sqllogictest/test_files/binary.slt
@@ -217,30 +217,51 @@ SELECT largebinary FROM t ORDER BY largebinary;
 NULL
 
 # LIKE
-# https://github.com/apache/arrow-datafusion/issues/7342
-query error DataFusion error: type_coercion
-SELECT binary FROM t where binary LIKE '%F';
-
-query error DataFusion error: type_coercion
-SELECT largebinary FROM t where largebinary LIKE '%F';
+query ?
+SELECT binary FROM t where binary LIKE '%F%';
+----
+466f6f
+466f6f426172
 
+query ?
+SELECT largebinary FROM t where largebinary LIKE '%F%';
+----
+466f6f
+466f6f426172
 
 # character_length function
-# https://github.com/apache/arrow-datafusion/issues/7344
-query error DataFusion error: Error during planning: The "character_length" 
function can only accept strings, but got Binary\.
+query TITI
 SELECT
   cast(binary as varchar) as str,
   character_length(binary) as binary_len,
   cast(largebinary as varchar) as large_str,
   character_length(binary) as largebinary_len
 from t;
+----
+Foo 3 Foo 3
+NULL NULL NULL NULL
+Bar 3 Bar 3
+FooBar 6 FooBar 6
+
+query I
+SELECT character_length(X'20');
+----
+1
+
+# still errors on values that can not be coerced to utf8
+query error Encountered non UTF\-8 data: invalid utf\-8 sequence of 1 bytes 
from index 0
+SELECT character_length(X'c328');
 
 # regexp_replace
-# https://github.com/apache/arrow-datafusion/issues/7345
-query error DataFusion error: Error during planning: The "regexp_replace" 
function can only accept strings, but got Binary\.
+query TTTT
 SELECT
   cast(binary as varchar) as str,
   regexp_replace(binary, 'F', 'f') as binary_replaced,
   cast(largebinary as varchar) as large_str,
   regexp_replace(largebinary, 'F', 'f') as large_binary_replaced
 from t;
+----
+Foo foo Foo foo
+NULL NULL NULL NULL
+Bar Bar Bar Bar
+FooBar fooBar FooBar fooBar

Reply via email to