This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new b73734f9b4 Improve substr() performance by avoiding using owned string 
(#13688)
b73734f9b4 is described below

commit b73734f9b4277b421ec790436e641d6f40dd1560
Author: Zhang Li <[email protected]>
AuthorDate: Tue Dec 10 05:49:19 2024 +0800

    Improve substr() performance by avoiding using owned string (#13688)
    
    Co-authored-by: zhangli20 <[email protected]>
---
 datafusion/functions/src/unicode/substr.rs | 77 ++++++++++++++++--------------
 1 file changed, 40 insertions(+), 37 deletions(-)

diff --git a/datafusion/functions/src/unicode/substr.rs 
b/datafusion/functions/src/unicode/substr.rs
index 141984cf26..687f77dbef 100644
--- a/datafusion/functions/src/unicode/substr.rs
+++ b/datafusion/functions/src/unicode/substr.rs
@@ -21,8 +21,8 @@ use std::sync::{Arc, OnceLock};
 use crate::strings::{make_and_append_view, StringArrayType};
 use crate::utils::{make_scalar_function, utf8_to_str_type};
 use arrow::array::{
-    Array, ArrayIter, ArrayRef, AsArray, GenericStringArray, Int64Array, 
OffsetSizeTrait,
-    StringViewArray,
+    Array, ArrayIter, ArrayRef, AsArray, GenericStringBuilder, Int64Array,
+    OffsetSizeTrait, StringViewArray,
 };
 use arrow::datatypes::DataType;
 use arrow_buffer::{NullBufferBuilder, ScalarBuffer};
@@ -448,10 +448,9 @@ where
     match args.len() {
         1 => {
             let iter = ArrayIter::new(string_array);
-
-            let result = iter
-                .zip(start_array.iter())
-                .map(|(string, start)| match (string, start) {
+            let mut result_builder = GenericStringBuilder::<T>::new();
+            for (string, start) in iter.zip(start_array.iter()) {
+                match (string, start) {
                     (Some(string), Some(start)) => {
                         let (start, end) = get_true_start_end(
                             string,
@@ -460,47 +459,51 @@ where
                             enable_ascii_fast_path,
                         ); // start, end is byte-based
                         let substr = &string[start..end];
-                        Some(substr.to_string())
+                        result_builder.append_value(substr);
                     }
-                    _ => None,
-                })
-                .collect::<GenericStringArray<T>>();
-            Ok(Arc::new(result) as ArrayRef)
+                    _ => {
+                        result_builder.append_null();
+                    }
+                }
+            }
+            Ok(Arc::new(result_builder.finish()) as ArrayRef)
         }
         2 => {
             let iter = ArrayIter::new(string_array);
             let count_array = count_array_opt.unwrap();
+            let mut result_builder = GenericStringBuilder::<T>::new();
 
-            let result = iter
-                .zip(start_array.iter())
-                .zip(count_array.iter())
-                .map(|((string, start), count)| {
-                    match (string, start, count) {
-                        (Some(string), Some(start), Some(count)) => {
-                            if count < 0 {
-                                exec_err!(
+            for ((string, start), count) in
+                iter.zip(start_array.iter()).zip(count_array.iter())
+            {
+                match (string, start, count) {
+                    (Some(string), Some(start), Some(count)) => {
+                        if count < 0 {
+                            return exec_err!(
                                 "negative substring length not allowed: 
substr(<str>, {start}, {count})"
-                            )
-                            } else {
-                                if start == i64::MIN {
-                                    return exec_err!("negative overflow when 
calculating skip value");
-                                }
-                                let (start, end) = get_true_start_end(
-                                    string,
-                                    start,
-                                    Some(count as u64),
-                                    enable_ascii_fast_path,
-                                ); // start, end is byte-based
-                                let substr = &string[start..end];
-                                Ok(Some(substr.to_string()))
+                            );
+                        } else {
+                            if start == i64::MIN {
+                                return exec_err!(
+                                    "negative overflow when calculating skip 
value"
+                                );
                             }
+                            let (start, end) = get_true_start_end(
+                                string,
+                                start,
+                                Some(count as u64),
+                                enable_ascii_fast_path,
+                            ); // start, end is byte-based
+                            let substr = &string[start..end];
+                            result_builder.append_value(substr);
                         }
-                        _ => Ok(None),
                     }
-                })
-                .collect::<Result<GenericStringArray<T>>>()?;
-
-            Ok(Arc::new(result) as ArrayRef)
+                    _ => {
+                        result_builder.append_null();
+                    }
+                }
+            }
+            Ok(Arc::new(result_builder.finish()) as ArrayRef)
         }
         other => {
             exec_err!("substr was called with {other} arguments. It requires 2 
or 3.")


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to