This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 12314c5564 perf: Optimize `array_to_string` to avoid a copy (#20639)
12314c5564 is described below
commit 12314c5564337bf556763b8a414502d72094ed1e
Author: Neil Conway <[email protected]>
AuthorDate: Mon Mar 2 17:06:34 2026 -0500
perf: Optimize `array_to_string` to avoid a copy (#20639)
## Which issue does this PR close?
N/A
## Rationale for this change
`array_to_string` used a temporary buffer to convert array elements to a
string, before then copying that buffer into a `StringBuilder` via
`append_value`. We can skip this copy by arranging to write directly
into the `StringBuilder`'s buffer, because `StringBuilder` implements `fmt::Write`.
This is a 12-18% performance improvement on the `array_to_string`
benchmarks.
## What changes are included in this PR?
* Optimization
* Change the signature of the `append` closure to return `Result`, which
avoids a lot of `unwrap` clutter
## Are these changes tested?
Yes -- covered by existing tests and benchmarks.
## Are there any user-facing changes?
No.
---
datafusion/functions-nested/src/string.rs | 80 +++++++++++++++----------------
1 file changed, 39 insertions(+), 41 deletions(-)
diff --git a/datafusion/functions-nested/src/string.rs
b/datafusion/functions-nested/src/string.rs
index 8aabc49309..ce72e878de 100644
--- a/datafusion/functions-nested/src/string.rs
+++ b/datafusion/functions-nested/src/string.rs
@@ -29,7 +29,7 @@ use datafusion_common::utils::ListCoercion;
use datafusion_common::{DataFusionError, Result, not_impl_err};
use std::any::Any;
-use std::fmt::Write;
+use std::fmt::{self, Write};
use crate::utils::make_scalar_function;
use arrow::array::{
@@ -324,7 +324,6 @@ fn generate_string_array<O: OffsetSizeTrait>(
null_strings: &[Option<&str>],
) -> Result<StringArray> {
let mut builder = StringBuilder::with_capacity(list_arr.len(), 0);
- let mut buf = String::new();
for ((arr, &delimiter), &null_string) in list_arr
.iter()
@@ -336,17 +335,16 @@ fn generate_string_array<O: OffsetSizeTrait>(
continue;
};
- buf.clear();
let mut first = true;
- compute_array_to_string(&mut buf, &arr, delimiter, null_string, &mut
first)?;
- builder.append_value(&buf);
+ compute_array_to_string(&mut builder, &arr, delimiter, null_string,
&mut first)?;
+ builder.append_value("");
}
Ok(builder.finish())
}
fn compute_array_to_string(
- buf: &mut String,
+ w: &mut impl Write,
arr: &ArrayRef,
delimiter: &str,
null_string: Option<&str>,
@@ -358,7 +356,7 @@ fn compute_array_to_string(
for i in 0..$list_array.len() {
if !$list_array.is_null(i) {
compute_array_to_string(
- buf,
+ w,
&$list_array.value(i),
delimiter,
null_string,
@@ -368,9 +366,9 @@ fn compute_array_to_string(
if *first {
*first = false;
} else {
- buf.push_str(delimiter);
+ w.write_str(delimiter)?;
}
- buf.push_str(ns);
+ w.write_str(ns)?;
}
}
};
@@ -399,71 +397,69 @@ fn compute_array_to_string(
DataFusionError::from(e)
.context("Casting dictionary to values in
compute_array_to_string")
})?;
- compute_array_to_string(buf, &values, delimiter, null_string,
first)
+ compute_array_to_string(w, &values, delimiter, null_string, first)
}
Null => Ok(()),
data_type => {
macro_rules! str_leaf {
($ARRAY_TYPE:ident) => {
write_leaf_to_string(
- buf,
+ w,
downcast_arg!(arr, $ARRAY_TYPE),
delimiter,
null_string,
first,
- |buf, x: &str| buf.push_str(x),
- )
+ |w, x: &str| w.write_str(x),
+ )?
};
}
macro_rules! bool_leaf {
($ARRAY_TYPE:ident) => {
write_leaf_to_string(
- buf,
+ w,
downcast_arg!(arr, $ARRAY_TYPE),
delimiter,
null_string,
first,
- |buf, x: bool| {
+ |w, x: bool| {
if x {
- buf.push_str("true");
+ w.write_str("true")
} else {
- buf.push_str("false");
+ w.write_str("false")
}
},
- )
+ )?
};
}
macro_rules! int_leaf {
($ARRAY_TYPE:ident) => {
write_leaf_to_string(
- buf,
+ w,
downcast_arg!(arr, $ARRAY_TYPE),
delimiter,
null_string,
first,
- |buf, x| {
+ |w, x| {
let mut itoa_buf = itoa::Buffer::new();
- buf.push_str(itoa_buf.format(x));
+ w.write_str(itoa_buf.format(x))
},
- )
+ )?
};
}
macro_rules! float_leaf {
($ARRAY_TYPE:ident) => {
write_leaf_to_string(
- buf,
+ w,
downcast_arg!(arr, $ARRAY_TYPE),
delimiter,
null_string,
first,
- |buf, x| {
- // TODO: Consider switching to a more efficient
- // floating point display library (e.g., ryu). This
- // might result in some differences in the output
- // format, however.
- write!(buf, "{}", x).unwrap();
- },
- )
+ // TODO: Consider switching to a more efficient
+ // floating point display library (e.g., ryu). This
+ // might result in some differences in the output
+ // format, however.
+ |w, x| write!(w, "{}", x),
+ )?
};
}
match data_type {
@@ -487,7 +483,7 @@ fn compute_array_to_string(
.context("Casting to string in array_to_string")
})?;
return compute_array_to_string(
- buf,
+ w,
&str_arr,
delimiter,
null_string,
@@ -506,17 +502,18 @@ fn compute_array_to_string(
}
/// Appends the string representation of each element in a leaf (non-list)
-/// array to `buf`, separated by `delimiter`. Null elements are rendered
+/// array to `w`, separated by `delimiter`. Null elements are rendered
/// using `null_string` if provided, or skipped otherwise. The `append`
-/// closure controls how each non-null element is written to the buffer.
-fn write_leaf_to_string<'a, A, T>(
- buf: &mut String,
+/// closure controls how each non-null element is written.
+fn write_leaf_to_string<'a, W: Write, A, T>(
+ w: &mut W,
arr: &'a A,
delimiter: &str,
null_string: Option<&str>,
first: &mut bool,
- append: impl Fn(&mut String, T),
-) where
+ append: impl Fn(&mut W, T) -> fmt::Result,
+) -> Result<()>
+where
&'a A: IntoIterator<Item = Option<T>>,
{
for x in arr {
@@ -528,14 +525,15 @@ fn write_leaf_to_string<'a, A, T>(
if *first {
*first = false;
} else {
- buf.push_str(delimiter);
+ w.write_str(delimiter)?;
}
match x {
- Some(x) => append(buf, x),
- None => buf.push_str(null_string.unwrap()),
+ Some(x) => append(w, x)?,
+ None => w.write_str(null_string.unwrap())?,
}
}
+ Ok(())
}
/// String_to_array SQL function
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]