This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 2cca3710f3 Create unicode module in datafusion/functions/src/unicode
and unicode_expressions feature flag, move char_length function (#9825)
2cca3710f3 is described below
commit 2cca3710f3b31148ffe99d9e225c768c921a748b
Author: Bruce Ritchie <[email protected]>
AuthorDate: Thu Mar 28 13:02:27 2024 -0400
Create unicode module in datafusion/functions/src/unicode and
unicode_expressions feature flag, move char_length function (#9825)
* Fix to_timestamp benchmark
* Remove reference to simd and nightly build as simd is no longer an
available feature in DataFusion and building with nightly may not be a good
recommendation when getting started.
* Fixed missing trim() function.
* Create unicode module in datafusion/functions/src/unicode and
unicode_expressions feature flag, move char_length function
---
datafusion-cli/Cargo.lock | 1 +
datafusion/core/Cargo.toml | 1 +
.../core/tests/dataframe/dataframe_functions.rs | 1 +
datafusion/expr/src/built_in_function.rs | 14 +-
datafusion/expr/src/expr_fn.rs | 8 -
datafusion/functions/Cargo.toml | 4 +
datafusion/functions/src/lib.rs | 9 ++
datafusion/functions/src/string/ascii.rs | 2 +-
datafusion/functions/src/string/bit_length.rs | 4 +-
datafusion/functions/src/string/btrim.rs | 1 +
datafusion/functions/src/string/chr.rs | 2 +-
datafusion/functions/src/string/common.rs | 158 +-----------------
datafusion/functions/src/string/levenshtein.rs | 3 +-
datafusion/functions/src/string/lower.rs | 8 +-
datafusion/functions/src/string/ltrim.rs | 3 +-
datafusion/functions/src/string/octet_length.rs | 13 +-
datafusion/functions/src/string/overlay.rs | 2 +-
datafusion/functions/src/string/repeat.rs | 4 +-
datafusion/functions/src/string/replace.rs | 2 +-
datafusion/functions/src/string/rtrim.rs | 1 +
datafusion/functions/src/string/split_part.rs | 4 +-
datafusion/functions/src/string/starts_with.rs | 9 +-
datafusion/functions/src/string/to_hex.rs | 9 +-
datafusion/functions/src/string/upper.rs | 3 +-
.../functions/src/unicode/character_length.rs | 176 +++++++++++++++++++++
datafusion/functions/src/unicode/mod.rs | 55 +++++++
.../functions/src/{string/common.rs => utils.rs} | 162 +------------------
datafusion/physical-expr/src/functions.rs | 70 --------
.../physical-expr/src/unicode_expressions.rs | 23 ---
datafusion/proto/proto/datafusion.proto | 2 +-
datafusion/proto/src/generated/pbjson.rs | 3 -
datafusion/proto/src/generated/prost.rs | 4 +-
datafusion/proto/src/logical_plan/from_proto.rs | 8 +-
datafusion/proto/src/logical_plan/to_proto.rs | 1 -
datafusion/sql/Cargo.toml | 1 +
datafusion/sql/tests/sql_integration.rs | 15 +-
36 files changed, 310 insertions(+), 476 deletions(-)
diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock
index b5535a47e9..ba60c04cea 100644
--- a/datafusion-cli/Cargo.lock
+++ b/datafusion-cli/Cargo.lock
@@ -1273,6 +1273,7 @@ dependencies = [
"md-5",
"regex",
"sha2",
+ "unicode-segmentation",
"uuid",
]
diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml
index 1e5c0d748e..de03579975 100644
--- a/datafusion/core/Cargo.toml
+++ b/datafusion/core/Cargo.toml
@@ -70,6 +70,7 @@ unicode_expressions = [
"datafusion-physical-expr/unicode_expressions",
"datafusion-optimizer/unicode_expressions",
"datafusion-sql/unicode_expressions",
+ "datafusion-functions/unicode_expressions",
]
[dependencies]
diff --git a/datafusion/core/tests/dataframe/dataframe_functions.rs
b/datafusion/core/tests/dataframe/dataframe_functions.rs
index 6ebd64c9b6..4371cce856 100644
--- a/datafusion/core/tests/dataframe/dataframe_functions.rs
+++ b/datafusion/core/tests/dataframe/dataframe_functions.rs
@@ -37,6 +37,7 @@ use datafusion::assert_batches_eq;
use datafusion_common::DFSchema;
use datafusion_expr::expr::Alias;
use datafusion_expr::{approx_median, cast, ExprSchemable};
+use datafusion_functions::unicode::expr_fn::character_length;
fn test_schema() -> SchemaRef {
Arc::new(Schema::new(vec![
diff --git a/datafusion/expr/src/built_in_function.rs
b/datafusion/expr/src/built_in_function.rs
index bb0f79f8ec..eefbc131a2 100644
--- a/datafusion/expr/src/built_in_function.rs
+++ b/datafusion/expr/src/built_in_function.rs
@@ -103,8 +103,6 @@ pub enum BuiltinScalarFunction {
Cot,
// string functions
- /// character_length
- CharacterLength,
/// concat
Concat,
/// concat_ws
@@ -218,7 +216,6 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::Cbrt => Volatility::Immutable,
BuiltinScalarFunction::Cot => Volatility::Immutable,
BuiltinScalarFunction::Trunc => Volatility::Immutable,
- BuiltinScalarFunction::CharacterLength => Volatility::Immutable,
BuiltinScalarFunction::Concat => Volatility::Immutable,
BuiltinScalarFunction::ConcatWithSeparator =>
Volatility::Immutable,
BuiltinScalarFunction::EndsWith => Volatility::Immutable,
@@ -257,9 +254,6 @@ impl BuiltinScalarFunction {
// the return type of the built in function.
// Some built-in functions' return type depends on the incoming type.
match self {
- BuiltinScalarFunction::CharacterLength => {
- utf8_to_int_type(&input_expr_types[0], "character_length")
- }
BuiltinScalarFunction::Coalesce => {
// COALESCE has multiple args and they might get coerced, get
a preview of this
let coerced_types = data_types(input_expr_types,
&self.signature());
@@ -367,9 +361,7 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::Coalesce => {
Signature::variadic_equal(self.volatility())
}
- BuiltinScalarFunction::CharacterLength
- | BuiltinScalarFunction::InitCap
- | BuiltinScalarFunction::Reverse => {
+ BuiltinScalarFunction::InitCap | BuiltinScalarFunction::Reverse =>
{
Signature::uniform(1, vec![Utf8, LargeUtf8], self.volatility())
}
BuiltinScalarFunction::Lpad | BuiltinScalarFunction::Rpad => {
@@ -584,10 +576,6 @@ impl BuiltinScalarFunction {
// conditional functions
BuiltinScalarFunction::Coalesce => &["coalesce"],
- // string functions
- BuiltinScalarFunction::CharacterLength => {
- &["character_length", "char_length", "length"]
- }
BuiltinScalarFunction::Concat => &["concat"],
BuiltinScalarFunction::ConcatWithSeparator => &["concat_ws"],
BuiltinScalarFunction::EndsWith => &["ends_with"],
diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs
index 0ea946288e..6544647986 100644
--- a/datafusion/expr/src/expr_fn.rs
+++ b/datafusion/expr/src/expr_fn.rs
@@ -577,13 +577,6 @@ scalar_expr!(Power, power, base exponent, "`base` raised
to the power of `expone
scalar_expr!(Atan2, atan2, y x, "inverse tangent of a division given in the
argument");
scalar_expr!(Log, log, base x, "logarithm of a `x` for a particular `base`");
-// string functions
-scalar_expr!(
- CharacterLength,
- character_length,
- string,
- "the number of characters in the `string`"
-);
scalar_expr!(InitCap, initcap, string, "converts the first letter of each word
in `string` in uppercase and the remaining characters in lowercase");
scalar_expr!(Left, left, string n, "returns the first `n` characters in the
`string`");
scalar_expr!(Reverse, reverse, string, "reverses the `string`");
@@ -1032,7 +1025,6 @@ mod test {
test_scalar_expr!(Nanvl, nanvl, x, y);
test_scalar_expr!(Iszero, iszero, input);
- test_scalar_expr!(CharacterLength, character_length, string);
test_scalar_expr!(Gcd, gcd, arg_1, arg_2);
test_scalar_expr!(Lcm, lcm, arg_1, arg_2);
test_scalar_expr!(InitCap, initcap, string);
diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml
index 81050dfddf..0cab0276ff 100644
--- a/datafusion/functions/Cargo.toml
+++ b/datafusion/functions/Cargo.toml
@@ -43,6 +43,7 @@ default = [
"regex_expressions",
"crypto_expressions",
"string_expressions",
+ "unicode_expressions",
]
# enable encode/decode functions
encoding_expressions = ["base64", "hex"]
@@ -52,6 +53,8 @@ math_expressions = []
regex_expressions = ["regex"]
# enable string functions
string_expressions = []
+# enable unicode functions
+unicode_expressions = ["unicode-segmentation"]
[lib]
name = "datafusion_functions"
@@ -75,6 +78,7 @@ log = { workspace = true }
md-5 = { version = "^0.10.0", optional = true }
regex = { version = "1.8", optional = true }
sha2 = { version = "^0.10.1", optional = true }
+unicode-segmentation = { version = "^1.7.1", optional = true }
uuid = { version = "1.7", features = ["v4"] }
[dev-dependencies]
diff --git a/datafusion/functions/src/lib.rs b/datafusion/functions/src/lib.rs
index f469b343e1..2a00839dc5 100644
--- a/datafusion/functions/src/lib.rs
+++ b/datafusion/functions/src/lib.rs
@@ -124,6 +124,12 @@ make_stub_package!(regex, "regex_expressions");
pub mod crypto;
make_stub_package!(crypto, "crypto_expressions");
+#[cfg(feature = "unicode_expressions")]
+pub mod unicode;
+make_stub_package!(unicode, "unicode_expressions");
+
+mod utils;
+
/// Fluent-style API for creating `Expr`s
pub mod expr_fn {
#[cfg(feature = "core_expressions")]
@@ -140,6 +146,8 @@ pub mod expr_fn {
pub use super::regex::expr_fn::*;
#[cfg(feature = "string_expressions")]
pub use super::string::expr_fn::*;
+ #[cfg(feature = "unicode_expressions")]
+ pub use super::unicode::expr_fn::*;
}
/// Registers all enabled packages with a [`FunctionRegistry`]
@@ -151,6 +159,7 @@ pub fn register_all(registry: &mut dyn FunctionRegistry) ->
Result<()> {
.chain(math::functions())
.chain(regex::functions())
.chain(crypto::functions())
+ .chain(unicode::functions())
.chain(string::functions());
all_functions.try_for_each(|udf| {
diff --git a/datafusion/functions/src/string/ascii.rs
b/datafusion/functions/src/string/ascii.rs
index 5bd77833a9..9a07f4c19c 100644
--- a/datafusion/functions/src/string/ascii.rs
+++ b/datafusion/functions/src/string/ascii.rs
@@ -15,7 +15,7 @@
// specific language governing permissions and limitations
// under the License.
-use crate::string::common::make_scalar_function;
+use crate::utils::make_scalar_function;
use arrow::array::Int32Array;
use arrow::array::{ArrayRef, OffsetSizeTrait};
use arrow::datatypes::DataType;
diff --git a/datafusion/functions/src/string/bit_length.rs
b/datafusion/functions/src/string/bit_length.rs
index 9f61275158..6a200471d4 100644
--- a/datafusion/functions/src/string/bit_length.rs
+++ b/datafusion/functions/src/string/bit_length.rs
@@ -15,16 +15,16 @@
// specific language governing permissions and limitations
// under the License.
-use arrow::compute::kernels::length::bit_length;
use std::any::Any;
+use arrow::compute::kernels::length::bit_length;
use arrow::datatypes::DataType;
use datafusion_common::{exec_err, Result, ScalarValue};
use datafusion_expr::{ColumnarValue, Volatility};
use datafusion_expr::{ScalarUDFImpl, Signature};
-use crate::string::common::*;
+use crate::utils::utf8_to_int_type;
#[derive(Debug)]
pub(super) struct BitLengthFunc {
diff --git a/datafusion/functions/src/string/btrim.rs
b/datafusion/functions/src/string/btrim.rs
index de1c9cc69b..573a23d070 100644
--- a/datafusion/functions/src/string/btrim.rs
+++ b/datafusion/functions/src/string/btrim.rs
@@ -26,6 +26,7 @@ use datafusion_expr::{ColumnarValue, Volatility};
use datafusion_expr::{ScalarUDFImpl, Signature};
use crate::string::common::*;
+use crate::utils::{make_scalar_function, utf8_to_str_type};
/// Returns the longest string with leading and trailing characters removed.
If the characters are not specified, whitespace is removed.
/// btrim('xyxtrimyyx', 'xyz') = 'trim'
diff --git a/datafusion/functions/src/string/chr.rs
b/datafusion/functions/src/string/chr.rs
index df3b803ba6..d1f8dc398a 100644
--- a/datafusion/functions/src/string/chr.rs
+++ b/datafusion/functions/src/string/chr.rs
@@ -29,7 +29,7 @@ use datafusion_common::{exec_err, Result};
use datafusion_expr::{ColumnarValue, Volatility};
use datafusion_expr::{ScalarUDFImpl, Signature};
-use crate::string::common::*;
+use crate::utils::make_scalar_function;
/// Returns the character with the given code. chr(0) is disallowed because
text data types cannot store that character.
/// chr(65) = 'A'
diff --git a/datafusion/functions/src/string/common.rs
b/datafusion/functions/src/string/common.rs
index 339f4e6c1a..276aad121d 100644
--- a/datafusion/functions/src/string/common.rs
+++ b/datafusion/functions/src/string/common.rs
@@ -24,8 +24,7 @@ use arrow::datatypes::DataType;
use datafusion_common::cast::as_generic_string_array;
use datafusion_common::Result;
use datafusion_common::{exec_err, ScalarValue};
-use datafusion_expr::{ColumnarValue, ScalarFunctionImplementation};
-use datafusion_physical_expr::functions::Hint;
+use datafusion_expr::ColumnarValue;
pub(crate) enum TrimType {
Left,
@@ -98,52 +97,6 @@ pub(crate) fn general_trim<T: OffsetSizeTrait>(
}
}
-/// Creates a function to identify the optimal return type of a string
function given
-/// the type of its first argument.
-///
-/// If the input type is `LargeUtf8` or `LargeBinary` the return type is
-/// `$largeUtf8Type`,
-///
-/// If the input type is `Utf8` or `Binary` the return type is `$utf8Type`,
-macro_rules! get_optimal_return_type {
- ($FUNC:ident, $largeUtf8Type:expr, $utf8Type:expr) => {
- pub(crate) fn $FUNC(arg_type: &DataType, name: &str) ->
Result<DataType> {
- Ok(match arg_type {
- // LargeBinary inputs are automatically coerced to Utf8
- DataType::LargeUtf8 | DataType::LargeBinary => $largeUtf8Type,
- // Binary inputs are automatically coerced to Utf8
- DataType::Utf8 | DataType::Binary => $utf8Type,
- DataType::Null => DataType::Null,
- DataType::Dictionary(_, value_type) => match **value_type {
- DataType::LargeUtf8 | DataType::LargeBinary =>
$largeUtf8Type,
- DataType::Utf8 | DataType::Binary => $utf8Type,
- DataType::Null => DataType::Null,
- _ => {
- return datafusion_common::exec_err!(
- "The {} function can only accept strings, but got
{:?}.",
- name.to_uppercase(),
- **value_type
- );
- }
- },
- data_type => {
- return datafusion_common::exec_err!(
- "The {} function can only accept strings, but got
{:?}.",
- name.to_uppercase(),
- data_type
- );
- }
- })
- }
- };
-}
-
-// `utf8_to_str_type`: returns either a Utf8 or LargeUtf8 based on the input
type size.
-get_optimal_return_type!(utf8_to_str_type, DataType::LargeUtf8,
DataType::Utf8);
-
-// `utf8_to_int_type`: returns either a Int32 or Int64 based on the input type
size.
-get_optimal_return_type!(utf8_to_int_type, DataType::Int64, DataType::Int32);
-
/// applies a unary expression to `args[0]` that is expected to be
downcastable to
/// a `GenericStringArray` and returns a `GenericStringArray` (which may have
a different offset)
/// # Errors
@@ -221,112 +174,3 @@ where
},
}
}
-
-pub(super) fn make_scalar_function<F>(
- inner: F,
- hints: Vec<Hint>,
-) -> ScalarFunctionImplementation
-where
- F: Fn(&[ArrayRef]) -> Result<ArrayRef> + Sync + Send + 'static,
-{
- Arc::new(move |args: &[ColumnarValue]| {
- // first, identify if any of the arguments is an Array. If yes, store
its `len`,
- // as any scalar will need to be converted to an array of len `len`.
- let len = args
- .iter()
- .fold(Option::<usize>::None, |acc, arg| match arg {
- ColumnarValue::Scalar(_) => acc,
- ColumnarValue::Array(a) => Some(a.len()),
- });
-
- let is_scalar = len.is_none();
-
- let inferred_length = len.unwrap_or(1);
- let args = args
- .iter()
- .zip(hints.iter().chain(std::iter::repeat(&Hint::Pad)))
- .map(|(arg, hint)| {
- // Decide on the length to expand this scalar to depending
- // on the given hints.
- let expansion_len = match hint {
- Hint::AcceptsSingular => 1,
- Hint::Pad => inferred_length,
- };
- arg.clone().into_array(expansion_len)
- })
- .collect::<Result<Vec<_>>>()?;
-
- let result = (inner)(&args);
- if is_scalar {
- // If all inputs are scalar, keeps output as scalar
- let result = result.and_then(|arr|
ScalarValue::try_from_array(&arr, 0));
- result.map(ColumnarValue::Scalar)
- } else {
- result.map(ColumnarValue::Array)
- }
- })
-}
-
-#[cfg(test)]
-pub mod test {
- /// $FUNC ScalarUDFImpl to test
- /// $ARGS arguments (vec) to pass to function
- /// $EXPECTED a Result<ColumnarValue>
- /// $EXPECTED_TYPE is the expected value type
- /// $EXPECTED_DATA_TYPE is the expected result type
- /// $ARRAY_TYPE is the column type after function applied
- macro_rules! test_function {
- ($FUNC:expr, $ARGS:expr, $EXPECTED:expr, $EXPECTED_TYPE:ty,
$EXPECTED_DATA_TYPE:expr, $ARRAY_TYPE:ident) => {
- let expected: Result<Option<$EXPECTED_TYPE>> = $EXPECTED;
- let func = $FUNC;
-
- let type_array = $ARGS.iter().map(|arg|
arg.data_type()).collect::<Vec<_>>();
- let return_type = func.return_type(&type_array);
-
- match expected {
- Ok(expected) => {
- assert_eq!(return_type.is_ok(), true);
- assert_eq!(return_type.unwrap(), $EXPECTED_DATA_TYPE);
-
- let result = func.invoke($ARGS);
- assert_eq!(result.is_ok(), true);
-
- let len = $ARGS
- .iter()
- .fold(Option::<usize>::None, |acc, arg| match arg {
- ColumnarValue::Scalar(_) => acc,
- ColumnarValue::Array(a) => Some(a.len()),
- });
- let inferred_length = len.unwrap_or(1);
- let result =
result.unwrap().clone().into_array(inferred_length).expect("Failed to convert
to array");
- let result =
result.as_any().downcast_ref::<$ARRAY_TYPE>().expect("Failed to convert to
type");
-
- // value is correct
- match expected {
- Some(v) => assert_eq!(result.value(0), v),
- None => assert!(result.is_null(0)),
- };
- }
- Err(expected_error) => {
- if return_type.is_err() {
- match return_type {
- Ok(_) => assert!(false, "expected error"),
- Err(error) => {
datafusion_common::assert_contains!(expected_error.strip_backtrace(),
error.strip_backtrace()); }
- }
- }
- else {
- // invoke is expected error - cannot use .expect_err()
due to Debug not being implemented
- match func.invoke($ARGS) {
- Ok(_) => assert!(false, "expected error"),
- Err(error) => {
-
assert!(expected_error.strip_backtrace().starts_with(&error.strip_backtrace()));
- }
- }
- }
- }
- };
- };
- }
-
- pub(crate) use test_function;
-}
diff --git a/datafusion/functions/src/string/levenshtein.rs
b/datafusion/functions/src/string/levenshtein.rs
index b5de4b2894..8f497e73e3 100644
--- a/datafusion/functions/src/string/levenshtein.rs
+++ b/datafusion/functions/src/string/levenshtein.rs
@@ -21,6 +21,7 @@ use std::sync::Arc;
use arrow::array::{ArrayRef, Int32Array, Int64Array, OffsetSizeTrait};
use arrow::datatypes::DataType;
+use crate::utils::{make_scalar_function, utf8_to_int_type};
use datafusion_common::cast::as_generic_string_array;
use datafusion_common::utils::datafusion_strsim;
use datafusion_common::{exec_err, Result};
@@ -28,8 +29,6 @@ use datafusion_expr::ColumnarValue;
use datafusion_expr::TypeSignature::*;
use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
-use crate::string::common::{make_scalar_function, utf8_to_int_type};
-
#[derive(Debug)]
pub(super) struct LevenshteinFunc {
signature: Signature,
diff --git a/datafusion/functions/src/string/lower.rs
b/datafusion/functions/src/string/lower.rs
index 42bda04700..327772bd80 100644
--- a/datafusion/functions/src/string/lower.rs
+++ b/datafusion/functions/src/string/lower.rs
@@ -15,12 +15,16 @@
// specific language governing permissions and limitations
// under the License.
-use crate::string::common::{handle, utf8_to_str_type};
+use std::any::Any;
+
use arrow::datatypes::DataType;
+
use datafusion_common::Result;
use datafusion_expr::ColumnarValue;
use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
-use std::any::Any;
+
+use crate::string::common::handle;
+use crate::utils::utf8_to_str_type;
#[derive(Debug)]
pub(super) struct LowerFunc {
diff --git a/datafusion/functions/src/string/ltrim.rs
b/datafusion/functions/src/string/ltrim.rs
index 535ffb14f5..e6926e5bd5 100644
--- a/datafusion/functions/src/string/ltrim.rs
+++ b/datafusion/functions/src/string/ltrim.rs
@@ -15,9 +15,9 @@
// specific language governing permissions and limitations
// under the License.
-use arrow::array::{ArrayRef, OffsetSizeTrait};
use std::any::Any;
+use arrow::array::{ArrayRef, OffsetSizeTrait};
use arrow::datatypes::DataType;
use datafusion_common::{exec_err, Result};
@@ -26,6 +26,7 @@ use datafusion_expr::{ColumnarValue, Volatility};
use datafusion_expr::{ScalarUDFImpl, Signature};
use crate::string::common::*;
+use crate::utils::{make_scalar_function, utf8_to_str_type};
/// Returns the longest string with leading characters removed. If the
characters are not specified, whitespace is removed.
/// ltrim('zzzytest', 'xyz') = 'test'
diff --git a/datafusion/functions/src/string/octet_length.rs
b/datafusion/functions/src/string/octet_length.rs
index 36a62fbe4e..639bf6cb48 100644
--- a/datafusion/functions/src/string/octet_length.rs
+++ b/datafusion/functions/src/string/octet_length.rs
@@ -15,16 +15,16 @@
// specific language governing permissions and limitations
// under the License.
-use arrow::compute::kernels::length::length;
use std::any::Any;
+use arrow::compute::kernels::length::length;
use arrow::datatypes::DataType;
use datafusion_common::{exec_err, Result, ScalarValue};
use datafusion_expr::{ColumnarValue, Volatility};
use datafusion_expr::{ScalarUDFImpl, Signature};
-use crate::string::common::*;
+use crate::utils::utf8_to_int_type;
#[derive(Debug)]
pub(super) struct OctetLengthFunc {
@@ -86,14 +86,17 @@ impl ScalarUDFImpl for OctetLengthFunc {
#[cfg(test)]
mod tests {
- use crate::string::common::test::test_function;
- use crate::string::octet_length::OctetLengthFunc;
+ use std::sync::Arc;
+
use arrow::array::{Array, Int32Array, StringArray};
use arrow::datatypes::DataType::Int32;
+
use datafusion_common::ScalarValue;
use datafusion_common::{exec_err, Result};
use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
- use std::sync::Arc;
+
+ use crate::string::octet_length::OctetLengthFunc;
+ use crate::utils::test::test_function;
#[test]
fn test_functions() -> Result<()> {
diff --git a/datafusion/functions/src/string/overlay.rs
b/datafusion/functions/src/string/overlay.rs
index d7cc0da806..8b9cc03afc 100644
--- a/datafusion/functions/src/string/overlay.rs
+++ b/datafusion/functions/src/string/overlay.rs
@@ -27,7 +27,7 @@ use datafusion_expr::TypeSignature::*;
use datafusion_expr::{ColumnarValue, Volatility};
use datafusion_expr::{ScalarUDFImpl, Signature};
-use crate::string::common::*;
+use crate::utils::{make_scalar_function, utf8_to_str_type};
#[derive(Debug)]
pub(super) struct OverlayFunc {
diff --git a/datafusion/functions/src/string/repeat.rs
b/datafusion/functions/src/string/repeat.rs
index 83bc929cb9..f4319af0a5 100644
--- a/datafusion/functions/src/string/repeat.rs
+++ b/datafusion/functions/src/string/repeat.rs
@@ -27,7 +27,7 @@ use datafusion_expr::TypeSignature::*;
use datafusion_expr::{ColumnarValue, Volatility};
use datafusion_expr::{ScalarUDFImpl, Signature};
-use crate::string::common::*;
+use crate::utils::{make_scalar_function, utf8_to_str_type};
#[derive(Debug)]
pub(super) struct RepeatFunc {
@@ -99,8 +99,8 @@ mod tests {
use datafusion_common::ScalarValue;
use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
- use crate::string::common::test::test_function;
use crate::string::repeat::RepeatFunc;
+ use crate::utils::test::test_function;
#[test]
fn test_functions() -> Result<()> {
diff --git a/datafusion/functions/src/string/replace.rs
b/datafusion/functions/src/string/replace.rs
index e352442960..e869ac2054 100644
--- a/datafusion/functions/src/string/replace.rs
+++ b/datafusion/functions/src/string/replace.rs
@@ -27,7 +27,7 @@ use datafusion_expr::TypeSignature::*;
use datafusion_expr::{ColumnarValue, Volatility};
use datafusion_expr::{ScalarUDFImpl, Signature};
-use crate::string::common::*;
+use crate::utils::{make_scalar_function, utf8_to_str_type};
#[derive(Debug)]
pub(super) struct ReplaceFunc {
diff --git a/datafusion/functions/src/string/rtrim.rs
b/datafusion/functions/src/string/rtrim.rs
index 17d2f8234b..d04d15ce88 100644
--- a/datafusion/functions/src/string/rtrim.rs
+++ b/datafusion/functions/src/string/rtrim.rs
@@ -26,6 +26,7 @@ use datafusion_expr::{ColumnarValue, Volatility};
use datafusion_expr::{ScalarUDFImpl, Signature};
use crate::string::common::*;
+use crate::utils::{make_scalar_function, utf8_to_str_type};
/// Returns the longest string with trailing characters removed. If the
characters are not specified, whitespace is removed.
/// rtrim('testxxzx', 'xyz') = 'test'
diff --git a/datafusion/functions/src/string/split_part.rs
b/datafusion/functions/src/string/split_part.rs
index af201e90fc..0aa968a1ef 100644
--- a/datafusion/functions/src/string/split_part.rs
+++ b/datafusion/functions/src/string/split_part.rs
@@ -27,7 +27,7 @@ use datafusion_expr::TypeSignature::*;
use datafusion_expr::{ColumnarValue, Volatility};
use datafusion_expr::{ScalarUDFImpl, Signature};
-use crate::string::common::*;
+use crate::utils::{make_scalar_function, utf8_to_str_type};
#[derive(Debug)]
pub(super) struct SplitPartFunc {
@@ -117,8 +117,8 @@ mod tests {
use datafusion_common::{exec_err, Result};
use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
- use crate::string::common::test::test_function;
use crate::string::split_part::SplitPartFunc;
+ use crate::utils::test::test_function;
#[test]
fn test_functions() -> Result<()> {
diff --git a/datafusion/functions/src/string/starts_with.rs
b/datafusion/functions/src/string/starts_with.rs
index 4450b9d332..f1b03907f8 100644
--- a/datafusion/functions/src/string/starts_with.rs
+++ b/datafusion/functions/src/string/starts_with.rs
@@ -15,15 +15,18 @@
// specific language governing permissions and limitations
// under the License.
-use crate::string::common::make_scalar_function;
+use std::any::Any;
+use std::sync::Arc;
+
use arrow::array::{ArrayRef, OffsetSizeTrait};
use arrow::datatypes::DataType;
+
use datafusion_common::{cast::as_generic_string_array, internal_err, Result};
use datafusion_expr::ColumnarValue;
use datafusion_expr::TypeSignature::*;
use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
-use std::any::Any;
-use std::sync::Arc;
+
+use crate::utils::make_scalar_function;
/// Returns true if string starts with prefix.
/// starts_with('alphabet', 'alph') = 't'
diff --git a/datafusion/functions/src/string/to_hex.rs
b/datafusion/functions/src/string/to_hex.rs
index 1bdece3f7a..ab320c68d4 100644
--- a/datafusion/functions/src/string/to_hex.rs
+++ b/datafusion/functions/src/string/to_hex.rs
@@ -15,18 +15,21 @@
// specific language governing permissions and limitations
// under the License.
-use crate::string::common::make_scalar_function;
+use std::any::Any;
+use std::sync::Arc;
+
use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait};
use arrow::datatypes::{
ArrowNativeType, ArrowPrimitiveType, DataType, Int32Type, Int64Type,
};
+
use datafusion_common::cast::as_primitive_array;
use datafusion_common::Result;
use datafusion_common::{exec_err, plan_err};
use datafusion_expr::ColumnarValue;
use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
-use std::any::Any;
-use std::sync::Arc;
+
+use crate::utils::make_scalar_function;
/// Converts the number to its equivalent hexadecimal representation.
/// to_hex(2147483647) = '7fffffff'
diff --git a/datafusion/functions/src/string/upper.rs
b/datafusion/functions/src/string/upper.rs
index a0c910ebb2..066174abf2 100644
--- a/datafusion/functions/src/string/upper.rs
+++ b/datafusion/functions/src/string/upper.rs
@@ -15,7 +15,8 @@
// specific language governing permissions and limitations
// under the License.
-use crate::string::common::{handle, utf8_to_str_type};
+use crate::string::common::handle;
+use crate::utils::utf8_to_str_type;
use arrow::datatypes::DataType;
use datafusion_common::Result;
use datafusion_expr::ColumnarValue;
diff --git a/datafusion/functions/src/unicode/character_length.rs
b/datafusion/functions/src/unicode/character_length.rs
new file mode 100644
index 0000000000..51331bf9a5
--- /dev/null
+++ b/datafusion/functions/src/unicode/character_length.rs
@@ -0,0 +1,176 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::utils::{make_scalar_function, utf8_to_int_type};
+use arrow::array::{
+ ArrayRef, ArrowPrimitiveType, GenericStringArray, OffsetSizeTrait, PrimitiveArray,
+};
+use arrow::datatypes::{ArrowNativeType, DataType, Int32Type, Int64Type};
+use datafusion_common::cast::as_generic_string_array;
+use datafusion_common::exec_err;
+use datafusion_common::Result;
+use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility};
+use std::any::Any;
+use std::sync::Arc;
+
+#[derive(Debug)]
+pub(super) struct CharacterLengthFunc {
+ signature: Signature,
+ aliases: Vec<String>,
+}
+
+impl CharacterLengthFunc {
+ pub fn new() -> Self {
+ use DataType::*;
+ Self {
+ signature: Signature::uniform(
+ 1,
+ vec![Utf8, LargeUtf8],
+ Volatility::Immutable,
+ ),
+ aliases: vec![String::from("length"), String::from("char_length")],
+ }
+ }
+}
+
+impl ScalarUDFImpl for CharacterLengthFunc {
+ fn as_any(&self) -> &dyn Any {
+ self
+ }
+
+ fn name(&self) -> &str {
+ "character_length"
+ }
+
+ fn signature(&self) -> &Signature {
+ &self.signature
+ }
+
+ fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+ utf8_to_int_type(&arg_types[0], "character_length")
+ }
+
+ fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
+ match args[0].data_type() {
+ DataType::Utf8 => {
+ make_scalar_function(character_length::<Int32Type>, vec![])(args)
+ }
+ DataType::LargeUtf8 => {
+ make_scalar_function(character_length::<Int64Type>, vec![])(args)
+ }
+ other => {
+ exec_err!("Unsupported data type {other:?} for function character_length")
+ }
+ }
+ }
+
+ fn aliases(&self) -> &[String] {
+ &self.aliases
+ }
+}
+
+/// Returns number of characters in the string.
+/// character_length('josé') = 4
+/// The implementation counts UTF-8 code points to count the number of characters
+fn character_length<T: ArrowPrimitiveType>(args: &[ArrayRef]) -> Result<ArrayRef>
+where
+ T::Native: OffsetSizeTrait,
+{
+ let string_array: &GenericStringArray<T::Native> =
+ as_generic_string_array::<T::Native>(&args[0])?;
+
+ let result = string_array
+ .iter()
+ .map(|string| {
+ string.map(|string: &str| {
+ T::Native::from_usize(string.chars().count())
+ .expect("should not fail as string.chars will always return integer")
+ })
+ })
+ .collect::<PrimitiveArray<T>>();
+
+ Ok(Arc::new(result) as ArrayRef)
+}
+
+#[cfg(test)]
+mod tests {
+ use crate::unicode::character_length::CharacterLengthFunc;
+ use crate::utils::test::test_function;
+ use arrow::array::{Array, Int32Array};
+ use arrow::datatypes::DataType::Int32;
+ use datafusion_common::{Result, ScalarValue};
+ use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
+
+ #[test]
+ fn test_functions() -> Result<()> {
+ #[cfg(feature = "unicode_expressions")]
+ test_function!(
+ CharacterLengthFunc::new(),
+ &[ColumnarValue::Scalar(ScalarValue::Utf8(Some(
+ String::from("chars")
+ )))],
+ Ok(Some(5)),
+ i32,
+ Int32,
+ Int32Array
+ );
+ #[cfg(feature = "unicode_expressions")]
+ test_function!(
+ CharacterLengthFunc::new(),
+ &[ColumnarValue::Scalar(ScalarValue::Utf8(Some(
+ String::from("josé")
+ )))],
+ Ok(Some(4)),
+ i32,
+ Int32,
+ Int32Array
+ );
+ #[cfg(feature = "unicode_expressions")]
+ test_function!(
+ CharacterLengthFunc::new(),
+ &[ColumnarValue::Scalar(ScalarValue::Utf8(Some(
+ String::from("")
+ )))],
+ Ok(Some(0)),
+ i32,
+ Int32,
+ Int32Array
+ );
+ #[cfg(feature = "unicode_expressions")]
+ test_function!(
+ CharacterLengthFunc::new(),
+ &[ColumnarValue::Scalar(ScalarValue::Utf8(None))],
+ Ok(None),
+ i32,
+ Int32,
+ Int32Array
+ );
+ #[cfg(not(feature = "unicode_expressions"))]
+ test_function!(
+ CharacterLengthFunc::new(),
+ &[ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("josé"))))],
+ internal_err!(
+ "function character_length requires compilation with feature flag: unicode_expressions."
+ ),
+ i32,
+ Int32,
+ Int32Array
+ );
+
+ Ok(())
+ }
+}
diff --git a/datafusion/functions/src/unicode/mod.rs
b/datafusion/functions/src/unicode/mod.rs
new file mode 100644
index 0000000000..291de38439
--- /dev/null
+++ b/datafusion/functions/src/unicode/mod.rs
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! "unicode" DataFusion functions
+
+use std::sync::Arc;
+
+use datafusion_expr::ScalarUDF;
+
+mod character_length;
+
+// create UDFs
+make_udf_function!(
+ character_length::CharacterLengthFunc,
+ CHARACTER_LENGTH,
+ character_length
+);
+
+pub mod expr_fn {
+ use datafusion_expr::Expr;
+
+ #[doc = "the number of characters in the `string`"]
+ pub fn char_length(string: Expr) -> Expr {
+ character_length(string)
+ }
+
+ #[doc = "the number of characters in the `string`"]
+ pub fn character_length(string: Expr) -> Expr {
+ super::character_length().call(vec![string])
+ }
+
+ #[doc = "the number of characters in the `string`"]
+ pub fn length(string: Expr) -> Expr {
+ character_length(string)
+ }
+}
+
+/// Return a list of all functions in this package
+pub fn functions() -> Vec<Arc<ScalarUDF>> {
+ vec![character_length()]
+}
diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/utils.rs
similarity index 59%
copy from datafusion/functions/src/string/common.rs
copy to datafusion/functions/src/utils.rs
index 339f4e6c1a..f45deafdb3 100644
--- a/datafusion/functions/src/string/common.rs
+++ b/datafusion/functions/src/utils.rs
@@ -15,88 +15,12 @@
// specific language governing permissions and limitations
// under the License.
-use std::fmt::{Display, Formatter};
-use std::sync::Arc;
-
-use arrow::array::{Array, ArrayRef, GenericStringArray, OffsetSizeTrait};
+use arrow::array::ArrayRef;
use arrow::datatypes::DataType;
-
-use datafusion_common::cast::as_generic_string_array;
-use datafusion_common::Result;
-use datafusion_common::{exec_err, ScalarValue};
+use datafusion_common::{Result, ScalarValue};
use datafusion_expr::{ColumnarValue, ScalarFunctionImplementation};
use datafusion_physical_expr::functions::Hint;
-
-pub(crate) enum TrimType {
- Left,
- Right,
- Both,
-}
-
-impl Display for TrimType {
- fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
- match self {
- TrimType::Left => write!(f, "ltrim"),
- TrimType::Right => write!(f, "rtrim"),
- TrimType::Both => write!(f, "btrim"),
- }
- }
-}
-
-pub(crate) fn general_trim<T: OffsetSizeTrait>(
- args: &[ArrayRef],
- trim_type: TrimType,
-) -> Result<ArrayRef> {
- let func = match trim_type {
- TrimType::Left => |input, pattern: &str| {
- let pattern = pattern.chars().collect::<Vec<char>>();
- str::trim_start_matches::<&[char]>(input, pattern.as_ref())
- },
- TrimType::Right => |input, pattern: &str| {
- let pattern = pattern.chars().collect::<Vec<char>>();
- str::trim_end_matches::<&[char]>(input, pattern.as_ref())
- },
- TrimType::Both => |input, pattern: &str| {
- let pattern = pattern.chars().collect::<Vec<char>>();
- str::trim_end_matches::<&[char]>(
- str::trim_start_matches::<&[char]>(input, pattern.as_ref()),
- pattern.as_ref(),
- )
- },
- };
-
- let string_array = as_generic_string_array::<T>(&args[0])?;
-
- match args.len() {
- 1 => {
- let result = string_array
- .iter()
- .map(|string| string.map(|string: &str| func(string, " ")))
- .collect::<GenericStringArray<T>>();
-
- Ok(Arc::new(result) as ArrayRef)
- }
- 2 => {
- let characters_array = as_generic_string_array::<T>(&args[1])?;
-
- let result = string_array
- .iter()
- .zip(characters_array.iter())
- .map(|(string, characters)| match (string, characters) {
- (Some(string), Some(characters)) => Some(func(string, characters)),
- _ => None,
- })
- .collect::<GenericStringArray<T>>();
-
- Ok(Arc::new(result) as ArrayRef)
- }
- other => {
- exec_err!(
- "{trim_type} was called with {other} arguments. It requires at least 1 and at most 2."
- )
- }
- }
-}
+use std::sync::Arc;
/// Creates a function to identify the optimal return type of a string function given
/// the type of its first argument.
@@ -144,84 +68,6 @@ get_optimal_return_type!(utf8_to_str_type, DataType::LargeUtf8, DataType::Utf8);
// `utf8_to_int_type`: returns either a Int32 or Int64 based on the input type size.
get_optimal_return_type!(utf8_to_int_type, DataType::Int64, DataType::Int32);
-/// applies a unary expression to `args[0]` that is expected to be downcastable to
-/// a `GenericStringArray` and returns a `GenericStringArray` (which may have a different offset)
-/// # Errors
-/// This function errors when:
-/// * the number of arguments is not 1
-/// * the first argument is not castable to a `GenericStringArray`
-pub(crate) fn unary_string_function<'a, T, O, F, R>(
- args: &[&'a dyn Array],
- op: F,
- name: &str,
-) -> Result<GenericStringArray<O>>
-where
- R: AsRef<str>,
- O: OffsetSizeTrait,
- T: OffsetSizeTrait,
- F: Fn(&'a str) -> R,
-{
- if args.len() != 1 {
- return exec_err!(
- "{:?} args were supplied but {} takes exactly one argument",
- args.len(),
- name
- );
- }
-
- let string_array = as_generic_string_array::<T>(args[0])?;
-
- // first map is the iterator, second is for the `Option<_>`
- Ok(string_array.iter().map(|string| string.map(&op)).collect())
-}
-
-pub(crate) fn handle<'a, F, R>(
- args: &'a [ColumnarValue],
- op: F,
- name: &str,
-) -> Result<ColumnarValue>
-where
- R: AsRef<str>,
- F: Fn(&'a str) -> R,
-{
- match &args[0] {
- ColumnarValue::Array(a) => match a.data_type() {
- DataType::Utf8 => {
- Ok(ColumnarValue::Array(Arc::new(unary_string_function::<
- i32,
- i32,
- _,
- _,
- >(
- &[a.as_ref()], op, name
- )?)))
- }
- DataType::LargeUtf8 => {
- Ok(ColumnarValue::Array(Arc::new(unary_string_function::<
- i64,
- i64,
- _,
- _,
- >(
- &[a.as_ref()], op, name
- )?)))
- }
- other => exec_err!("Unsupported data type {other:?} for function {name}"),
- },
- ColumnarValue::Scalar(scalar) => match scalar {
- ScalarValue::Utf8(a) => {
- let result = a.as_ref().map(|x| (op)(x).as_ref().to_string());
- Ok(ColumnarValue::Scalar(ScalarValue::Utf8(result)))
- }
- ScalarValue::LargeUtf8(a) => {
- let result = a.as_ref().map(|x| (op)(x).as_ref().to_string());
- Ok(ColumnarValue::Scalar(ScalarValue::LargeUtf8(result)))
- }
- other => exec_err!("Unsupported data type {other:?} for function {name}"),
- },
- }
-}
-
pub(super) fn make_scalar_function<F>(
inner: F,
hints: Vec<Hint>,
@@ -254,7 +100,7 @@ where
};
arg.clone().into_array(expansion_len)
})
- .collect::<Result<Vec<_>>>()?;
+ .collect::<datafusion_common::Result<Vec<_>>>()?;
let result = (inner)(&args);
if is_scalar {
diff --git a/datafusion/physical-expr/src/functions.rs b/datafusion/physical-expr/src/functions.rs
index cd9bba63d6..9adc853634 100644
--- a/datafusion/physical-expr/src/functions.rs
+++ b/datafusion/physical-expr/src/functions.rs
@@ -254,29 +254,6 @@ pub fn create_physical_fun(
Arc::new(|args| make_scalar_function_inner(math_expressions::cot)(args))
}
// string functions
- BuiltinScalarFunction::CharacterLength => {
- Arc::new(|args| match args[0].data_type() {
- DataType::Utf8 => {
- let func = invoke_if_unicode_expressions_feature_flag!(
- character_length,
- Int32Type,
- "character_length"
- );
- make_scalar_function_inner(func)(args)
- }
- DataType::LargeUtf8 => {
- let func = invoke_if_unicode_expressions_feature_flag!(
- character_length,
- Int64Type,
- "character_length"
- );
- make_scalar_function_inner(func)(args)
- }
- other => exec_err!(
- "Unsupported data type {other:?} for function character_length"
- ),
- })
- }
BuiltinScalarFunction::Coalesce => Arc::new(conditional_expressions::coalesce),
BuiltinScalarFunction::Concat => Arc::new(string_expressions::concat),
BuiltinScalarFunction::ConcatWithSeparator => Arc::new(|args| {
@@ -595,53 +572,6 @@ mod tests {
#[test]
fn test_functions() -> Result<()> {
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- CharacterLength,
- &[lit("chars")],
- Ok(Some(5)),
- i32,
- Int32,
- Int32Array
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- CharacterLength,
- &[lit("josé")],
- Ok(Some(4)),
- i32,
- Int32,
- Int32Array
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- CharacterLength,
- &[lit("")],
- Ok(Some(0)),
- i32,
- Int32,
- Int32Array
- );
- #[cfg(feature = "unicode_expressions")]
- test_function!(
- CharacterLength,
- &[lit(ScalarValue::Utf8(None))],
- Ok(None),
- i32,
- Int32,
- Int32Array
- );
- #[cfg(not(feature = "unicode_expressions"))]
- test_function!(
- CharacterLength,
- &[lit("josé")],
- internal_err!(
- "function character_length requires compilation with feature flag: unicode_expressions."
- ),
- i32,
- Int32,
- Int32Array
- );
test_function!(
Concat,
&[lit("aa"), lit("bb"), lit("cc"),],
diff --git a/datafusion/physical-expr/src/unicode_expressions.rs b/datafusion/physical-expr/src/unicode_expressions.rs
index 8ec9e062d9..c7e4b7d7c4 100644
--- a/datafusion/physical-expr/src/unicode_expressions.rs
+++ b/datafusion/physical-expr/src/unicode_expressions.rs
@@ -36,29 +36,6 @@ use datafusion_common::{
exec_err, Result,
};
-/// Returns number of characters in the string.
-/// character_length('josé') = 4
-/// The implementation counts UTF-8 code points to count the number of characters
-pub fn character_length<T: ArrowPrimitiveType>(args: &[ArrayRef]) -> Result<ArrayRef>
-where
- T::Native: OffsetSizeTrait,
-{
- let string_array: &GenericStringArray<T::Native> =
- as_generic_string_array::<T::Native>(&args[0])?;
-
- let result = string_array
- .iter()
- .map(|string| {
- string.map(|string: &str| {
- T::Native::from_usize(string.chars().count())
- .expect("should not fail as string.chars will always return integer")
- })
- })
- .collect::<PrimitiveArray<T>>();
-
- Ok(Arc::new(result) as ArrayRef)
-}
-
/// Returns first n characters in the string, or when n is negative, returns all but last |n| characters.
/// left('abcde', 2) = 'ab'
/// The implementation uses UTF-8 code points as characters
diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto
index f405ecf976..766ca6633e 100644
--- a/datafusion/proto/proto/datafusion.proto
+++ b/datafusion/proto/proto/datafusion.proto
@@ -565,7 +565,7 @@ enum ScalarFunction {
// RegexpMatch = 21;
// 22 was BitLength
// 23 was Btrim
- CharacterLength = 24;
+ // 24 was CharacterLength
// 25 was Chr
Concat = 26;
ConcatWithSeparator = 27;
diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs
index 0d22ba5db7..f2814956ef 100644
--- a/datafusion/proto/src/generated/pbjson.rs
+++ b/datafusion/proto/src/generated/pbjson.rs
@@ -22928,7 +22928,6 @@ impl serde::Serialize for ScalarFunction {
Self::Sin => "Sin",
Self::Sqrt => "Sqrt",
Self::Trunc => "Trunc",
- Self::CharacterLength => "CharacterLength",
Self::Concat => "Concat",
Self::ConcatWithSeparator => "ConcatWithSeparator",
Self::InitCap => "InitCap",
@@ -22988,7 +22987,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction {
"Sin",
"Sqrt",
"Trunc",
- "CharacterLength",
"Concat",
"ConcatWithSeparator",
"InitCap",
@@ -23077,7 +23075,6 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction {
"Sin" => Ok(ScalarFunction::Sin),
"Sqrt" => Ok(ScalarFunction::Sqrt),
"Trunc" => Ok(ScalarFunction::Trunc),
- "CharacterLength" => Ok(ScalarFunction::CharacterLength),
"Concat" => Ok(ScalarFunction::Concat),
"ConcatWithSeparator" => Ok(ScalarFunction::ConcatWithSeparator),
"InitCap" => Ok(ScalarFunction::InitCap),
diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs
index 07c3fad153..ecc94fcdaf 100644
--- a/datafusion/proto/src/generated/prost.rs
+++ b/datafusion/proto/src/generated/prost.rs
@@ -2864,7 +2864,7 @@ pub enum ScalarFunction {
/// RegexpMatch = 21;
/// 22 was BitLength
/// 23 was Btrim
- CharacterLength = 24,
+ /// 24 was CharacterLength
/// 25 was Chr
Concat = 26,
ConcatWithSeparator = 27,
@@ -3001,7 +3001,6 @@ impl ScalarFunction {
ScalarFunction::Sin => "Sin",
ScalarFunction::Sqrt => "Sqrt",
ScalarFunction::Trunc => "Trunc",
- ScalarFunction::CharacterLength => "CharacterLength",
ScalarFunction::Concat => "Concat",
ScalarFunction::ConcatWithSeparator => "ConcatWithSeparator",
ScalarFunction::InitCap => "InitCap",
@@ -3055,7 +3054,6 @@ impl ScalarFunction {
"Sin" => Some(Self::Sin),
"Sqrt" => Some(Self::Sqrt),
"Trunc" => Some(Self::Trunc),
- "CharacterLength" => Some(Self::CharacterLength),
"Concat" => Some(Self::Concat),
"ConcatWithSeparator" => Some(Self::ConcatWithSeparator),
"InitCap" => Some(Self::InitCap),
diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs
index 4b9874bf8f..19edd71a3a 100644
--- a/datafusion/proto/src/logical_plan/from_proto.rs
+++ b/datafusion/proto/src/logical_plan/from_proto.rs
@@ -48,8 +48,8 @@ use datafusion_expr::expr::Unnest;
use datafusion_expr::expr::{Alias, Placeholder};
use datafusion_expr::window_frame::{check_window_frame, regularize_window_order_by};
use datafusion_expr::{
- acosh, asinh, atan, atan2, atanh, cbrt, ceil, character_length, coalesce,
- concat_expr, concat_ws_expr, cos, cosh, cot, degrees, ends_with, exp,
+ acosh, asinh, atan, atan2, atanh, cbrt, ceil, coalesce, concat_expr, concat_ws_expr,
+ cos, cosh, cot, degrees, ends_with, exp,
expr::{self, InList, Sort, WindowFunction},
factorial, find_in_set, floor, gcd, initcap, iszero, lcm, left, ln, log, log10, log2,
logical_plan::{PlanType, StringifiedPlan},
@@ -450,7 +450,6 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction {
ScalarFunction::Concat => Self::Concat,
ScalarFunction::Log2 => Self::Log2,
ScalarFunction::Signum => Self::Signum,
- ScalarFunction::CharacterLength => Self::CharacterLength,
ScalarFunction::ConcatWithSeparator => Self::ConcatWithSeparator,
ScalarFunction::EndsWith => Self::EndsWith,
ScalarFunction::InitCap => Self::InitCap,
@@ -1372,9 +1371,6 @@ pub fn parse_expr(
ScalarFunction::Signum => {
Ok(signum(parse_expr(&args[0], registry, codec)?))
}
- ScalarFunction::CharacterLength => {
- Ok(character_length(parse_expr(&args[0], registry, codec)?))
- }
ScalarFunction::InitCap => {
Ok(initcap(parse_expr(&args[0], registry, codec)?))
}
diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs
index 1335d511a0..11fc7362c7 100644
--- a/datafusion/proto/src/logical_plan/to_proto.rs
+++ b/datafusion/proto/src/logical_plan/to_proto.rs
@@ -1442,7 +1442,6 @@ impl TryFrom<&BuiltinScalarFunction> for protobuf::ScalarFunction {
BuiltinScalarFunction::Concat => Self::Concat,
BuiltinScalarFunction::Log2 => Self::Log2,
BuiltinScalarFunction::Signum => Self::Signum,
- BuiltinScalarFunction::CharacterLength => Self::CharacterLength,
BuiltinScalarFunction::ConcatWithSeparator => Self::ConcatWithSeparator,
BuiltinScalarFunction::EndsWith => Self::EndsWith,
BuiltinScalarFunction::InitCap => Self::InitCap,
diff --git a/datafusion/sql/Cargo.toml b/datafusion/sql/Cargo.toml
index ca2c1a240c..b9f6dc259e 100644
--- a/datafusion/sql/Cargo.toml
+++ b/datafusion/sql/Cargo.toml
@@ -49,6 +49,7 @@ strum = { version = "0.26.1", features = ["derive"] }
[dev-dependencies]
ctor = { workspace = true }
+datafusion-functions = { workspace = true, default-features = true }
env_logger = { workspace = true }
paste = "^1.0"
rstest = { workspace = true }
diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs
index 448a9c5420..101c31039c 100644
--- a/datafusion/sql/tests/sql_integration.rs
+++ b/datafusion/sql/tests/sql_integration.rs
@@ -38,6 +38,7 @@ use datafusion_sql::{
planner::{ContextProvider, ParserOptions, PlannerContext, SqlToRel},
};
+use datafusion_functions::unicode;
use rstest::rstest;
use sqlparser::dialect::{Dialect, GenericDialect, HiveDialect, MySqlDialect};
use sqlparser::parser::Parser;
@@ -88,7 +89,7 @@ fn parse_decimals() {
fn parse_ident_normalization() {
let test_data = [
(
- "SELECT LENGTH('str')",
+ "SELECT CHARACTER_LENGTH('str')",
"Ok(Projection: character_length(Utf8(\"str\"))\n EmptyRelation)",
false,
),
@@ -2688,6 +2689,7 @@ fn logical_plan_with_dialect_and_options(
options: ParserOptions,
) -> Result<LogicalPlan> {
let context = MockContextProvider::default()
+ .with_udf(unicode::character_length().as_ref().clone())
.with_udf(make_udf(
"nullif",
vec![DataType::Int32, DataType::Int32],
@@ -4508,26 +4510,27 @@ fn test_field_not_found_window_function() {
#[test]
fn test_parse_escaped_string_literal_value() {
- let sql = r"SELECT length('\r\n') AS len";
+ let sql = r"SELECT character_length('\r\n') AS len";
let expected = "Projection: character_length(Utf8(\"\\r\\n\")) AS len\
\n EmptyRelation";
quick_test(sql, expected);
- let sql = r"SELECT length(E'\r\n') AS len";
+ let sql = r"SELECT character_length(E'\r\n') AS len";
let expected = "Projection: character_length(Utf8(\"\r\n\")) AS len\
\n EmptyRelation";
quick_test(sql, expected);
- let sql = r"SELECT length(E'\445') AS len, E'\x4B' AS hex, E'\u0001' AS unicode";
+ let sql =
+ r"SELECT character_length(E'\445') AS len, E'\x4B' AS hex, E'\u0001' AS unicode";
let expected =
"Projection: character_length(Utf8(\"%\")) AS len, Utf8(\"\u{004b}\") AS hex, Utf8(\"\u{0001}\") AS unicode\
\n EmptyRelation";
quick_test(sql, expected);
- let sql = r"SELECT length(E'\000') AS len";
+ let sql = r"SELECT character_length(E'\000') AS len";
assert_eq!(
logical_plan(sql).unwrap_err().strip_backtrace(),
- "SQL error: TokenizerError(\"Unterminated encoded string literal at Line: 1, Column 15\")"
+ "SQL error: TokenizerError(\"Unterminated encoded string literal at Line: 1, Column 25\")"
)
}