alamb commented on code in PR #9971: URL: https://github.com/apache/arrow-datafusion/pull/9971#discussion_r1561567051
########## datafusion/functions/benches/lower.rs: ########## @@ -0,0 +1,77 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +extern crate criterion; + +use arrow::array::{ArrayRef, StringArray}; +use arrow::util::bench_util::create_string_array_with_len; +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use datafusion_expr::ColumnarValue; +use datafusion_functions::string; +use std::sync::Arc; + +fn create_args1(size: usize, str_len: usize) -> Vec<ColumnarValue> { + let array = Arc::new(create_string_array_with_len::<i32>(size, 0.2, str_len)); + vec![ColumnarValue::Array(array)] +} + +fn create_args2(size: usize) -> Vec<ColumnarValue> { Review Comment: adding some context here about why this is used (aka non ascii data, data where the upper/lower case values are not the same size in bytes) would probably help future readers ########## datafusion/functions/src/string/common.rs: ########## @@ -97,80 +101,145 @@ pub(crate) fn general_trim<T: OffsetSizeTrait>( } } -/// applies a unary expression to `args[0]` that is expected to be downcastable to -/// a `GenericStringArray` and returns a `GenericStringArray` (which may have a different offset) -/// # Errors -/// This function errors when: -/// * the number of arguments is not 1 -/// * the first argument is not castable to a `GenericStringArray` -pub(crate) fn unary_string_function<'a, T, O, F, R>( - args: &[&'a dyn Array], - op: F, - name: &str, -) -> Result<GenericStringArray<O>> -where - R: AsRef<str>, - O: OffsetSizeTrait, - T: OffsetSizeTrait, - F: Fn(&'a str) -> R, -{ - if args.len() != 1 { - return exec_err!( - "{:?} args were supplied but {} takes exactly one argument", - args.len(), - name - ); - } - - let string_array = as_generic_string_array::<T>(args[0])?; +pub(crate) fn to_lower(args: &[ColumnarValue], name: &str) -> Result<ColumnarValue> { + case_conversion(args, |string| string.to_lowercase(), name) +} - // first map is the iterator, second is for the `Option<_>` - Ok(string_array.iter().map(|string| string.map(&op)).collect()) +pub(crate) fn to_upper(args: &[ColumnarValue], name: &str) -> Result<ColumnarValue> { + case_conversion(args, |string| string.to_uppercase(), name) } -pub(crate) fn handle<'a, F, R>( +fn case_conversion<'a, F>( args: &'a [ColumnarValue], op: F, name: &str, ) -> Result<ColumnarValue> where - R: AsRef<str>, - F: Fn(&'a str) -> R, + F: Fn(&'a str) -> String, { match &args[0] { - ColumnarValue::Array(a) => match a.data_type() { - DataType::Utf8 => { - Ok(ColumnarValue::Array(Arc::new(unary_string_function::< - i32, - i32, - _, - _, - >( - &[a.as_ref()], op, name - )?))) - } - DataType::LargeUtf8 => { - Ok(ColumnarValue::Array(Arc::new(unary_string_function::< - i64, - i64, - _, - _, - >( - &[a.as_ref()], op, name - )?))) - } + ColumnarValue::Array(array) => match array.data_type() { + DataType::Utf8 => Ok(ColumnarValue::Array(case_conversion_array::<i32, _>( + array, op, + )?)), + DataType::LargeUtf8 => Ok(ColumnarValue::Array(case_conversion_array::< + i64, + _, + >(array, op)?)), other => exec_err!("Unsupported data type {other:?} for function {name}"), }, ColumnarValue::Scalar(scalar) => match scalar { ScalarValue::Utf8(a) => { - let result = a.as_ref().map(|x| (op)(x).as_ref().to_string()); + let result = a.as_ref().map(|x| op(x)); Ok(ColumnarValue::Scalar(ScalarValue::Utf8(result))) } ScalarValue::LargeUtf8(a) => { - let result = a.as_ref().map(|x| (op)(x).as_ref().to_string()); + let result = a.as_ref().map(|x| op(x)); Ok(ColumnarValue::Scalar(ScalarValue::LargeUtf8(result))) } other => exec_err!("Unsupported data type {other:?} for function {name}"), }, } } + +fn case_conversion_array<'a, O, F>(array: &'a ArrayRef, op: F) -> Result<ArrayRef> +where + O: OffsetSizeTrait, + F: Fn(&'a str) -> String, +{ + let string_array = as_generic_string_array::<O>(array)?; + let item_len = string_array.len(); + + // Find the first nonascii string at the beginning. + let find_the_first_nonascii = || { Review Comment: I agree it would be good to try the simpler approach. However, as long at this current implementation is well tested and shows performance improvements I think we could merge it as is and simplify the implementation in a follow on PR as well. If this is your preference @JasonLi-cn I will try and find some more time to review the implementation carefully. A simpler implementation has the benefit it is easier (and thus faster) to review. Some other random optimization thoughts: * We could and upper/lower values as a single string in one call, for example, detecting when the relevant value was a different length and doing a special path then * We could also special case when the string had nulls and when it didn't (which can make the inner loop simpler and allow a better chance for auto vectorization) -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
