This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 4f4e814ce3 perf: Optimize concat()/concat_ws() UDFs (#20317)
4f4e814ce3 is described below
commit 4f4e814ce331bc10430dfc6a034024826b101c37
Author: Neil Conway <[email protected]>
AuthorDate: Wed Feb 18 22:19:37 2026 -0500
perf: Optimize concat()/concat_ws() UDFs (#20317)
## Which issue does this PR close?
- Closes #20316.
## Rationale for this change
Faster is better.
## What changes are included in this PR?
This commit implements three optimizations:
* In `StringViewArrayBuilder`, `append_offset` replaced `block` with a
new `String` on every call, discarding its allocation. It is cheaper
to instead clear `block` and re-use its existing capacity.
* In `StringViewArrayBuilder::write()`, we re-validated that each string
value is valid UTF-8. `array.value(i)` already returns a string slice,
so this was unnecessary work and can be skipped.
* In the concat() UDF implementation, we miscalculated the initial size
of the StringViewArrayBuilder buffer for Utf8View inputs: we added the
array's `len()` rather than `total_buffer_bytes_used()`. This didn't
lead to incorrect behavior, but it resulted in unnecessarily needing to
reallocate the buffer.
## Are these changes tested?
Yes; no additional test cases warranted.
## Are there any user-facing changes?
No.
---
datafusion/functions/src/string/concat.rs | 4 +++-
datafusion/functions/src/string/concat_ws.rs | 2 ++
datafusion/functions/src/strings.rs | 23 +++++++----------------
3 files changed, 12 insertions(+), 17 deletions(-)
diff --git a/datafusion/functions/src/string/concat.rs b/datafusion/functions/src/string/concat.rs
index c8da67c186..e674541253 100644
--- a/datafusion/functions/src/string/concat.rs
+++ b/datafusion/functions/src/string/concat.rs
@@ -204,7 +204,9 @@ impl ScalarUDFImpl for ConcatFunc {
DataType::Utf8View => {
let string_array = as_string_view_array(array)?;
- data_size += string_array.len();
+ // This is an estimate; in particular, it will
+ // undercount arrays of short strings (<= 12 bytes).
+ data_size += string_array.total_buffer_bytes_used();
let column = if array.is_nullable() {
ColumnarValueRef::NullableStringViewArray(string_array)
} else {
diff --git a/datafusion/functions/src/string/concat_ws.rs b/datafusion/functions/src/string/concat_ws.rs
index ee62c36c04..9d3b32eedf 100644
--- a/datafusion/functions/src/string/concat_ws.rs
+++ b/datafusion/functions/src/string/concat_ws.rs
@@ -247,6 +247,8 @@ impl ScalarUDFImpl for ConcatWsFunc {
DataType::Utf8View => {
let string_array = as_string_view_array(array)?;
+ // This is an estimate; in particular, it will
+ // undercount arrays of short strings (<= 12 bytes).
data_size += string_array.total_buffer_bytes_used();
let column = if array.is_nullable() {
ColumnarValueRef::NullableStringViewArray(string_array)
diff --git a/datafusion/functions/src/strings.rs b/datafusion/functions/src/strings.rs
index a7be3ef792..cfddf57b09 100644
--- a/datafusion/functions/src/strings.rs
+++ b/datafusion/functions/src/strings.rs
@@ -152,43 +152,34 @@ impl StringViewArrayBuilder {
}
ColumnarValueRef::NullableArray(array) => {
if !CHECK_VALID || array.is_valid(i) {
- self.block.push_str(
- std::str::from_utf8(array.value(i).as_bytes()).unwrap(),
- );
+ self.block.push_str(array.value(i));
}
}
ColumnarValueRef::NullableLargeStringArray(array) => {
if !CHECK_VALID || array.is_valid(i) {
- self.block.push_str(
- std::str::from_utf8(array.value(i).as_bytes()).unwrap(),
- );
+ self.block.push_str(array.value(i));
}
}
ColumnarValueRef::NullableStringViewArray(array) => {
if !CHECK_VALID || array.is_valid(i) {
- self.block.push_str(
- std::str::from_utf8(array.value(i).as_bytes()).unwrap(),
- );
+ self.block.push_str(array.value(i));
}
}
ColumnarValueRef::NonNullableArray(array) => {
- self.block
- .push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap());
+ self.block.push_str(array.value(i));
}
ColumnarValueRef::NonNullableLargeStringArray(array) => {
- self.block
- .push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap());
+ self.block.push_str(array.value(i));
}
ColumnarValueRef::NonNullableStringViewArray(array) => {
- self.block
- .push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap());
+ self.block.push_str(array.value(i));
}
}
}
pub fn append_offset(&mut self) {
self.builder.append_value(&self.block);
- self.block = String::new();
+ self.block.clear();
}
pub fn finish(mut self) -> StringViewArray {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]