This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 4f4e814ce3 perf: Optimize concat()/concat_ws() UDFs (#20317)
4f4e814ce3 is described below

commit 4f4e814ce331bc10430dfc6a034024826b101c37
Author: Neil Conway <[email protected]>
AuthorDate: Wed Feb 18 22:19:37 2026 -0500

    perf: Optimize concat()/concat_ws() UDFs (#20317)
    
    ## Which issue does this PR close?
    
    - Closes #20316.
    
    ## Rationale for this change
    
    Faster is better.
    
    ## What changes are included in this PR?
    
    This commit implements three optimizations:
    
    * In `StringViewArrayBuilder`, we recreated `block` after every call to
    `append_offset`. It is cheaper to instead clear and re-use `block`.
    
    * In `StringViewArrayBuilder::write()`, we re-validated that each string
    value is valid UTF-8. This was unnecessary work: `array.value(i)` already
    returns a `&str`, so the check can be skipped.
    
    * In the concat() UDF implementation, we miscalculated the initial size
    of the StringViewArrayBuilder buffer. This didn't lead to incorrect
    behavior but it resulted in unnecessarily needing to reallocate the
    buffer.
    
    ## Are these changes tested?
    
    Yes; no additional test cases warranted.
    
    ## Are there any user-facing changes?
    
    No.
---
 datafusion/functions/src/string/concat.rs    |  4 +++-
 datafusion/functions/src/string/concat_ws.rs |  2 ++
 datafusion/functions/src/strings.rs          | 23 +++++++----------------
 3 files changed, 12 insertions(+), 17 deletions(-)

diff --git a/datafusion/functions/src/string/concat.rs b/datafusion/functions/src/string/concat.rs
index c8da67c186..e674541253 100644
--- a/datafusion/functions/src/string/concat.rs
+++ b/datafusion/functions/src/string/concat.rs
@@ -204,7 +204,9 @@ impl ScalarUDFImpl for ConcatFunc {
                         DataType::Utf8View => {
                             let string_array = as_string_view_array(array)?;
 
-                            data_size += string_array.len();
+                            // This is an estimate; in particular, it will
+                            // undercount arrays of short strings (<= 12 bytes).
+                            data_size += string_array.total_buffer_bytes_used();
                             let column = if array.is_nullable() {
                                 ColumnarValueRef::NullableStringViewArray(string_array)
                             } else {
diff --git a/datafusion/functions/src/string/concat_ws.rs b/datafusion/functions/src/string/concat_ws.rs
index ee62c36c04..9d3b32eedf 100644
--- a/datafusion/functions/src/string/concat_ws.rs
+++ b/datafusion/functions/src/string/concat_ws.rs
@@ -247,6 +247,8 @@ impl ScalarUDFImpl for ConcatWsFunc {
                         DataType::Utf8View => {
                             let string_array = as_string_view_array(array)?;
 
+                            // This is an estimate; in particular, it will
+                            // undercount arrays of short strings (<= 12 bytes).
                             data_size += string_array.total_buffer_bytes_used();
                             let column = if array.is_nullable() {
                                 ColumnarValueRef::NullableStringViewArray(string_array)
diff --git a/datafusion/functions/src/strings.rs b/datafusion/functions/src/strings.rs
index a7be3ef792..cfddf57b09 100644
--- a/datafusion/functions/src/strings.rs
+++ b/datafusion/functions/src/strings.rs
@@ -152,43 +152,34 @@ impl StringViewArrayBuilder {
             }
             ColumnarValueRef::NullableArray(array) => {
                 if !CHECK_VALID || array.is_valid(i) {
-                    self.block.push_str(
-                        std::str::from_utf8(array.value(i).as_bytes()).unwrap(),
-                    );
+                    self.block.push_str(array.value(i));
                 }
             }
             ColumnarValueRef::NullableLargeStringArray(array) => {
                 if !CHECK_VALID || array.is_valid(i) {
-                    self.block.push_str(
-                        std::str::from_utf8(array.value(i).as_bytes()).unwrap(),
-                    );
+                    self.block.push_str(array.value(i));
                 }
             }
             ColumnarValueRef::NullableStringViewArray(array) => {
                 if !CHECK_VALID || array.is_valid(i) {
-                    self.block.push_str(
-                        std::str::from_utf8(array.value(i).as_bytes()).unwrap(),
-                    );
+                    self.block.push_str(array.value(i));
                 }
             }
             ColumnarValueRef::NonNullableArray(array) => {
-                self.block
-                    .push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap());
+                self.block.push_str(array.value(i));
             }
             ColumnarValueRef::NonNullableLargeStringArray(array) => {
-                self.block
-                    .push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap());
+                self.block.push_str(array.value(i));
             }
             ColumnarValueRef::NonNullableStringViewArray(array) => {
-                self.block
-                    .push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap());
+                self.block.push_str(array.value(i));
             }
         }
     }
 
     pub fn append_offset(&mut self) {
         self.builder.append_value(&self.block);
-        self.block = String::new();
+        self.block.clear();
     }
 
     pub fn finish(mut self) -> StringViewArray {


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to