This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new b73734f9b4 Improve substr() performance by avoiding using owned string
(#13688)
b73734f9b4 is described below
commit b73734f9b4277b421ec790436e641d6f40dd1560
Author: Zhang Li <[email protected]>
AuthorDate: Tue Dec 10 05:49:19 2024 +0800
Improve substr() performance by avoiding using owned string (#13688)
Co-authored-by: zhangli20 <[email protected]>
---
datafusion/functions/src/unicode/substr.rs | 77 ++++++++++++++++--------------
1 file changed, 40 insertions(+), 37 deletions(-)
diff --git a/datafusion/functions/src/unicode/substr.rs
b/datafusion/functions/src/unicode/substr.rs
index 141984cf26..687f77dbef 100644
--- a/datafusion/functions/src/unicode/substr.rs
+++ b/datafusion/functions/src/unicode/substr.rs
@@ -21,8 +21,8 @@ use std::sync::{Arc, OnceLock};
use crate::strings::{make_and_append_view, StringArrayType};
use crate::utils::{make_scalar_function, utf8_to_str_type};
use arrow::array::{
- Array, ArrayIter, ArrayRef, AsArray, GenericStringArray, Int64Array,
OffsetSizeTrait,
- StringViewArray,
+ Array, ArrayIter, ArrayRef, AsArray, GenericStringBuilder, Int64Array,
+ OffsetSizeTrait, StringViewArray,
};
use arrow::datatypes::DataType;
use arrow_buffer::{NullBufferBuilder, ScalarBuffer};
@@ -448,10 +448,9 @@ where
match args.len() {
1 => {
let iter = ArrayIter::new(string_array);
-
- let result = iter
- .zip(start_array.iter())
- .map(|(string, start)| match (string, start) {
+ let mut result_builder = GenericStringBuilder::<T>::new();
+ for (string, start) in iter.zip(start_array.iter()) {
+ match (string, start) {
(Some(string), Some(start)) => {
let (start, end) = get_true_start_end(
string,
@@ -460,47 +459,51 @@ where
enable_ascii_fast_path,
); // start, end is byte-based
let substr = &string[start..end];
- Some(substr.to_string())
+ result_builder.append_value(substr);
}
- _ => None,
- })
- .collect::<GenericStringArray<T>>();
- Ok(Arc::new(result) as ArrayRef)
+ _ => {
+ result_builder.append_null();
+ }
+ }
+ }
+ Ok(Arc::new(result_builder.finish()) as ArrayRef)
}
2 => {
let iter = ArrayIter::new(string_array);
let count_array = count_array_opt.unwrap();
+ let mut result_builder = GenericStringBuilder::<T>::new();
- let result = iter
- .zip(start_array.iter())
- .zip(count_array.iter())
- .map(|((string, start), count)| {
- match (string, start, count) {
- (Some(string), Some(start), Some(count)) => {
- if count < 0 {
- exec_err!(
+ for ((string, start), count) in
+ iter.zip(start_array.iter()).zip(count_array.iter())
+ {
+ match (string, start, count) {
+ (Some(string), Some(start), Some(count)) => {
+ if count < 0 {
+ return exec_err!(
"negative substring length not allowed:
substr(<str>, {start}, {count})"
- )
- } else {
- if start == i64::MIN {
- return exec_err!("negative overflow when
calculating skip value");
- }
- let (start, end) = get_true_start_end(
- string,
- start,
- Some(count as u64),
- enable_ascii_fast_path,
- ); // start, end is byte-based
- let substr = &string[start..end];
- Ok(Some(substr.to_string()))
+ );
+ } else {
+ if start == i64::MIN {
+ return exec_err!(
+ "negative overflow when calculating skip
value"
+ );
}
+ let (start, end) = get_true_start_end(
+ string,
+ start,
+ Some(count as u64),
+ enable_ascii_fast_path,
+ ); // start, end is byte-based
+ let substr = &string[start..end];
+ result_builder.append_value(substr);
}
- _ => Ok(None),
}
- })
- .collect::<Result<GenericStringArray<T>>>()?;
-
- Ok(Arc::new(result) as ArrayRef)
+ _ => {
+ result_builder.append_null();
+ }
+ }
+ }
+ Ok(Arc::new(result_builder.finish()) as ArrayRef)
}
other => {
exec_err!("substr was called with {other} arguments. It requires 2
or 3.")
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]