This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 7772453f20 Support binary argumnets (via coercion) for `like` and
`ilike` and string functions (#7840)
7772453f20 is described below
commit 7772453f205ec721b65691d664b8ec9b5cd7fd03
Author: Andrew Lamb <[email protected]>
AuthorDate: Wed Oct 18 17:23:33 2023 -0400
Support binary argumnets (via coercion) for `like` and `ilike` and string
functions (#7840)
fixup
---
datafusion/expr/src/built_in_function.rs | 23 ++++-
datafusion/expr/src/type_coercion/binary.rs | 113 ++++++++++++++++++++++---
datafusion/expr/src/type_coercion/functions.rs | 11 ++-
datafusion/sqllogictest/test_files/binary.slt | 41 ++++++---
4 files changed, 159 insertions(+), 29 deletions(-)
diff --git a/datafusion/expr/src/built_in_function.rs
b/datafusion/expr/src/built_in_function.rs
index f1af6e829b..fffdf74af3 100644
--- a/datafusion/expr/src/built_in_function.rs
+++ b/datafusion/expr/src/built_in_function.rs
@@ -1523,16 +1523,29 @@ impl FromStr for BuiltinScalarFunction {
}
}
+/// Creates a function that returns the return type of a string function given
+/// the type of its first argument.
+///
+/// If the input type is `LargeUtf8` or `LargeBinary` the return type is
+/// `$largeUtf8Type`,
+///
+/// If the input type is `Utf8` or `Binary` the return type is `$utf8Type`,
macro_rules! make_utf8_to_return_type {
($FUNC:ident, $largeUtf8Type:expr, $utf8Type:expr) => {
fn $FUNC(arg_type: &DataType, name: &str) -> Result<DataType> {
Ok(match arg_type {
- DataType::LargeUtf8 => $largeUtf8Type,
- DataType::Utf8 => $utf8Type,
+ DataType::LargeUtf8 => $largeUtf8Type,
+ // LargeBinary inputs are automatically coerced to Utf8
+ DataType::LargeBinary => $largeUtf8Type,
+ DataType::Utf8 => $utf8Type,
+ // Binary inputs are automatically coerced to Utf8
+ DataType::Binary => $utf8Type,
DataType::Null => DataType::Null,
DataType::Dictionary(_, value_type) => match **value_type {
- DataType::LargeUtf8 => $largeUtf8Type,
+ DataType::LargeUtf8 => $largeUtf8Type,
+ DataType::LargeBinary => $largeUtf8Type,
DataType::Utf8 => $utf8Type,
+ DataType::Binary => $utf8Type,
DataType::Null => DataType::Null,
_ => {
return plan_err!(
@@ -1553,8 +1566,10 @@ macro_rules! make_utf8_to_return_type {
}
};
}
-
+// `utf8_to_str_type`: returns either a Utf8 or LargeUtf8 based on the input
type size.
make_utf8_to_return_type!(utf8_to_str_type, DataType::LargeUtf8,
DataType::Utf8);
+
+// `utf8_to_str_type`: returns either a Int32 or Int64 based on the input type
size.
make_utf8_to_return_type!(utf8_to_int_type, DataType::Int64, DataType::Int32);
fn utf8_or_binary_to_binary_type(arg_type: &DataType, name: &str) ->
Result<DataType> {
diff --git a/datafusion/expr/src/type_coercion/binary.rs
b/datafusion/expr/src/type_coercion/binary.rs
index ba3c21a15d..f94ce61460 100644
--- a/datafusion/expr/src/type_coercion/binary.rs
+++ b/datafusion/expr/src/type_coercion/binary.rs
@@ -29,7 +29,11 @@ use datafusion_common::{plan_err, DataFusionError};
use crate::Operator;
-/// The type signature of an instantiation of binary expression
+/// The type signature of an instantiation of binary operator expression such
as
+/// `lhs + rhs`
+///
+/// Note this is different than [`crate::signature::Signature`] which
+/// describes the type signature of a function.
struct Signature {
/// The type to coerce the left argument to
lhs: DataType,
@@ -648,8 +652,9 @@ fn string_concat_internal_coercion(
}
}
-/// Coercion rules for Strings: the type that both lhs and rhs can be
-/// casted to for the purpose of a string computation
+/// Coercion rules for string types (Utf8/LargeUtf8): If at least on argument
is
+/// a string type and both arguments can be coerced into a string type, coerce
+/// to string type.
fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) ->
Option<DataType> {
use arrow::datatypes::DataType::*;
match (lhs_type, rhs_type) {
@@ -665,8 +670,30 @@ fn string_coercion(lhs_type: &DataType, rhs_type:
&DataType) -> Option<DataType>
}
}
-/// Coercion rules for Binaries: the type that both lhs and rhs can be
-/// casted to for the purpose of a computation
+/// Coercion rules for binary (Binary/LargeBinary) to string (Utf8/LargeUtf8):
+/// If one argument is binary and the other is a string then coerce to string
+/// (e.g. for `like`)
+fn binary_to_string_coercion(
+ lhs_type: &DataType,
+ rhs_type: &DataType,
+) -> Option<DataType> {
+ use arrow::datatypes::DataType::*;
+ match (lhs_type, rhs_type) {
+ (Binary, Utf8) => Some(Utf8),
+ (Binary, LargeUtf8) => Some(LargeUtf8),
+ (LargeBinary, Utf8) => Some(LargeUtf8),
+ (LargeBinary, LargeUtf8) => Some(LargeUtf8),
+ (Utf8, Binary) => Some(Utf8),
+ (Utf8, LargeBinary) => Some(LargeUtf8),
+ (LargeUtf8, Binary) => Some(LargeUtf8),
+ (LargeUtf8, LargeBinary) => Some(LargeUtf8),
+ _ => None,
+ }
+}
+
+/// Coercion rules for binary types (Binary/LargeBinary): If at least on
argument is
+/// a binary type and both arguments can be coerced into a binary type, coerce
+/// to binary type.
fn binary_coercion(lhs_type: &DataType, rhs_type: &DataType) ->
Option<DataType> {
use arrow::datatypes::DataType::*;
match (lhs_type, rhs_type) {
@@ -681,6 +708,7 @@ fn binary_coercion(lhs_type: &DataType, rhs_type:
&DataType) -> Option<DataType>
/// This is a union of string coercion rules and dictionary coercion rules
pub fn like_coercion(lhs_type: &DataType, rhs_type: &DataType) ->
Option<DataType> {
string_coercion(lhs_type, rhs_type)
+ .or_else(|| binary_to_string_coercion(lhs_type, rhs_type))
.or_else(|| dictionary_coercion(lhs_type, rhs_type, false))
.or_else(|| null_coercion(lhs_type, rhs_type))
}
@@ -763,7 +791,7 @@ fn temporal_coercion(lhs_type: &DataType, rhs_type:
&DataType) -> Option<DataTyp
}
}
-/// coercion rules from NULL type. Since NULL can be casted to most of types
in arrow,
+/// coercion rules from NULL type. Since NULL can be casted to any other type
in arrow,
/// either lhs or rhs is NULL, if NULL can be casted to type of the other
side, the coercion is valid.
fn null_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType>
{
match (lhs_type, rhs_type) {
@@ -917,11 +945,33 @@ mod tests {
);
}
+ /// Test coercion rules for binary operators
+ ///
+ /// Applies coercion rules for `$LHS_TYPE $OP $RHS_TYPE` and asserts that
the
+ /// the result type is `$RESULT_TYPE`
macro_rules! test_coercion_binary_rule {
- ($A_TYPE:expr, $B_TYPE:expr, $OP:expr, $C_TYPE:expr) => {{
- let (lhs, rhs) = get_input_types(&$A_TYPE, &$OP, &$B_TYPE)?;
- assert_eq!(lhs, $C_TYPE);
- assert_eq!(rhs, $C_TYPE);
+ ($LHS_TYPE:expr, $RHS_TYPE:expr, $OP:expr, $RESULT_TYPE:expr) => {{
+ let (lhs, rhs) = get_input_types(&$LHS_TYPE, &$OP, &$RHS_TYPE)?;
+ assert_eq!(lhs, $RESULT_TYPE);
+ assert_eq!(rhs, $RESULT_TYPE);
+ }};
+ }
+
+ /// Test coercion rules for like
+ ///
+ /// Applies coercion rules for both
+ /// * `$LHS_TYPE LIKE $RHS_TYPE`
+ /// * `$RHS_TYPE LIKE $LHS_TYPE`
+ ///
+ /// And asserts the result type is `$RESULT_TYPE`
+ macro_rules! test_like_rule {
+ ($LHS_TYPE:expr, $RHS_TYPE:expr, $RESULT_TYPE:expr) => {{
+ println!("Coercing {} LIKE {}", $LHS_TYPE, $RHS_TYPE);
+ let result = like_coercion(&$LHS_TYPE, &$RHS_TYPE);
+ assert_eq!(result, $RESULT_TYPE);
+ // reverse the order
+ let result = like_coercion(&$RHS_TYPE, &$LHS_TYPE);
+ assert_eq!(result, $RESULT_TYPE);
}};
}
@@ -948,11 +998,46 @@ mod tests {
}
#[test]
- fn test_type_coercion() -> Result<()> {
- // test like coercion rule
- let result = like_coercion(&DataType::Utf8, &DataType::Utf8);
- assert_eq!(result, Some(DataType::Utf8));
+ fn test_like_coercion() {
+ // string coerce to strings
+ test_like_rule!(DataType::Utf8, DataType::Utf8, Some(DataType::Utf8));
+ test_like_rule!(
+ DataType::LargeUtf8,
+ DataType::Utf8,
+ Some(DataType::LargeUtf8)
+ );
+ test_like_rule!(
+ DataType::Utf8,
+ DataType::LargeUtf8,
+ Some(DataType::LargeUtf8)
+ );
+ test_like_rule!(
+ DataType::LargeUtf8,
+ DataType::LargeUtf8,
+ Some(DataType::LargeUtf8)
+ );
+
+ // Also coerce binary to strings
+ test_like_rule!(DataType::Binary, DataType::Utf8,
Some(DataType::Utf8));
+ test_like_rule!(
+ DataType::LargeBinary,
+ DataType::Utf8,
+ Some(DataType::LargeUtf8)
+ );
+ test_like_rule!(
+ DataType::Binary,
+ DataType::LargeUtf8,
+ Some(DataType::LargeUtf8)
+ );
+ test_like_rule!(
+ DataType::LargeBinary,
+ DataType::LargeUtf8,
+ Some(DataType::LargeUtf8)
+ );
+ }
+ #[test]
+ fn test_type_coercion() -> Result<()> {
test_coercion_binary_rule!(
DataType::Utf8,
DataType::Date32,
diff --git a/datafusion/expr/src/type_coercion/functions.rs
b/datafusion/expr/src/type_coercion/functions.rs
index b387667ad1..d55ed0c721 100644
--- a/datafusion/expr/src/type_coercion/functions.rs
+++ b/datafusion/expr/src/type_coercion/functions.rs
@@ -46,6 +46,8 @@ pub fn data_types(
return Ok(current_types.to_vec());
}
+ // Try and coerce the argument types to match the signature, returning the
+ // coerced types from the first matching signature.
for valid_types in valid_types {
if let Some(types) = maybe_data_types(&valid_types, current_types) {
return Ok(types);
@@ -60,6 +62,7 @@ pub fn data_types(
)
}
+/// Returns a Vec of all possible valid argument types for the given signature.
fn get_valid_types(
signature: &TypeSignature,
current_types: &[DataType],
@@ -104,7 +107,12 @@ fn get_valid_types(
Ok(valid_types)
}
-/// Try to coerce current_types into valid_types.
+/// Try to coerce the current argument types to match the given `valid_types`.
+///
+/// For example, if a function `func` accepts arguments of `(int64, int64)`,
+/// but was called with `(int32, int64)`, this function could match the
+/// valid_types by by coercing the first argument to `int64`, and would return
+/// `Some([int64, int64])`.
fn maybe_data_types(
valid_types: &[DataType],
current_types: &[DataType],
@@ -220,6 +228,7 @@ fn coerced_from<'a>(
Some(type_into.clone())
}
Interval(_) if matches!(type_from, Utf8 | LargeUtf8) =>
Some(type_into.clone()),
+ // Any type can be coerced into strings
Utf8 | LargeUtf8 => Some(type_into.clone()),
Null if can_cast_types(type_from, type_into) =>
Some(type_into.clone()),
diff --git a/datafusion/sqllogictest/test_files/binary.slt
b/datafusion/sqllogictest/test_files/binary.slt
index 38f8a2e14f..0568ada3ad 100644
--- a/datafusion/sqllogictest/test_files/binary.slt
+++ b/datafusion/sqllogictest/test_files/binary.slt
@@ -217,30 +217,51 @@ SELECT largebinary FROM t ORDER BY largebinary;
NULL
# LIKE
-# https://github.com/apache/arrow-datafusion/issues/7342
-query error DataFusion error: type_coercion
-SELECT binary FROM t where binary LIKE '%F';
-
-query error DataFusion error: type_coercion
-SELECT largebinary FROM t where largebinary LIKE '%F';
+query ?
+SELECT binary FROM t where binary LIKE '%F%';
+----
+466f6f
+466f6f426172
+query ?
+SELECT largebinary FROM t where largebinary LIKE '%F%';
+----
+466f6f
+466f6f426172
# character_length function
-# https://github.com/apache/arrow-datafusion/issues/7344
-query error DataFusion error: Error during planning: The "character_length"
function can only accept strings, but got Binary\.
+query TITI
SELECT
cast(binary as varchar) as str,
character_length(binary) as binary_len,
cast(largebinary as varchar) as large_str,
character_length(binary) as largebinary_len
from t;
+----
+Foo 3 Foo 3
+NULL NULL NULL NULL
+Bar 3 Bar 3
+FooBar 6 FooBar 6
+
+query I
+SELECT character_length(X'20');
+----
+1
+
+# still errors on values that can not be coerced to utf8
+query error Encountered non UTF\-8 data: invalid utf\-8 sequence of 1 bytes
from index 0
+SELECT character_length(X'c328');
# regexp_replace
-# https://github.com/apache/arrow-datafusion/issues/7345
-query error DataFusion error: Error during planning: The "regexp_replace"
function can only accept strings, but got Binary\.
+query TTTT
SELECT
cast(binary as varchar) as str,
regexp_replace(binary, 'F', 'f') as binary_replaced,
cast(largebinary as varchar) as large_str,
regexp_replace(largebinary, 'F', 'f') as large_binary_replaced
from t;
+----
+Foo foo Foo foo
+NULL NULL NULL NULL
+Bar Bar Bar Bar
+FooBar fooBar FooBar fooBar