This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new b6d46a6382 perf: Optimize `initcap()` (#20352)
b6d46a6382 is described below
commit b6d46a63824f003117297848d8d83b659ac2e759
Author: Neil Conway <[email protected]>
AuthorDate: Tue Feb 24 01:11:08 2026 -0500
perf: Optimize `initcap()` (#20352)
## Which issue does this PR close?
- Closes #20351.
## Rationale for this change
When all values in a `Utf8`/`LargeUtf8` array are ASCII, we can skip
using `GenericStringBuilder` and instead process the entire input buffer
in a single pass using byte-level operations. This also avoids
recomputing the offsets and nulls arrays. A similar optimization is
already used for lower() and upper().
Along the way, optimize `initcap_string()` for ASCII-only inputs. It
already had an ASCII-only fastpath but there was room for further
optimization, by iterating over bytes rather than characters.
## What changes are included in this PR?
* Cleanup benchmarks: we ran the scalar benchmark for different array
sizes, despite the fact that it is invariant to the array size
* Add benchmark for different string lengths
* Add benchmark for Unicode array input
* Optimize for ASCII-only inputs as described above
* Add test case for ASCII-only input that is a sliced array
* Add test case variants for `LargeStringArray`
## Are these changes tested?
Yes, plus an additional test added.
## Are there any user-facing changes?
No.
---
datafusion/functions/benches/initcap.rs | 108 ++++++++++++---
datafusion/functions/src/unicode/initcap.rs | 197 ++++++++++++++++++++++++++--
2 files changed, 276 insertions(+), 29 deletions(-)
diff --git a/datafusion/functions/benches/initcap.rs
b/datafusion/functions/benches/initcap.rs
index e68e41baa2..b5e653e413 100644
--- a/datafusion/functions/benches/initcap.rs
+++ b/datafusion/functions/benches/initcap.rs
@@ -15,7 +15,7 @@
// specific language governing permissions and limitations
// under the License.
-use arrow::array::OffsetSizeTrait;
+use arrow::array::{ArrayRef, OffsetSizeTrait, StringArray, StringViewBuilder};
use arrow::datatypes::{DataType, Field};
use arrow::util::bench_util::{
create_string_array_with_len, create_string_view_array_with_len,
@@ -47,52 +47,124 @@ fn create_args<O: OffsetSizeTrait>(
}
}
+/// Create a Utf8 array where every value contains non-ASCII Unicode text.
+fn create_unicode_utf8_args(size: usize) -> Vec<ColumnarValue> {
+ let array = Arc::new(StringArray::from_iter_values(std::iter::repeat_n(
+ "ñAnDÚ ÁrBOL ОлЕГ ÍslENsku",
+ size,
+ ))) as ArrayRef;
+ vec![ColumnarValue::Array(array)]
+}
+
+/// Create a Utf8View array where every value contains non-ASCII Unicode text.
+fn create_unicode_utf8view_args(size: usize) -> Vec<ColumnarValue> {
+ let mut builder = StringViewBuilder::with_capacity(size);
+ for _ in 0..size {
+ builder.append_value("ñAnDÚ ÁrBOL ОлЕГ ÍslENsku");
+ }
+ let array = Arc::new(builder.finish()) as ArrayRef;
+ vec![ColumnarValue::Array(array)]
+}
+
fn criterion_benchmark(c: &mut Criterion) {
let initcap = unicode::initcap();
let config_options = Arc::new(ConfigOptions::default());
- // Grouped benchmarks for array sizes - to compare with scalar performance
+ // Array benchmarks: vary both row count and string length
+ for size in [1024, 4096, 8192] {
+ for str_len in [16, 128] {
+ let mut group =
+ c.benchmark_group(format!("initcap size={size}
str_len={str_len}"));
+ group.sampling_mode(SamplingMode::Flat);
+ group.sample_size(10);
+ group.measurement_time(Duration::from_secs(10));
+
+ // Utf8
+ let array_args = create_args::<i32>(size, str_len, false);
+ let array_arg_fields = vec![Field::new("arg_0", DataType::Utf8,
true).into()];
+
+ group.bench_function("array_utf8", |b| {
+ b.iter(|| {
+ black_box(initcap.invoke_with_args(ScalarFunctionArgs {
+ args: array_args.clone(),
+ arg_fields: array_arg_fields.clone(),
+ number_rows: size,
+ return_field: Field::new("f", DataType::Utf8,
true).into(),
+ config_options: Arc::clone(&config_options),
+ }))
+ })
+ });
+
+ // Utf8View
+ let array_view_args = create_args::<i32>(size, str_len, true);
+ let array_view_arg_fields =
+ vec![Field::new("arg_0", DataType::Utf8View, true).into()];
+
+ group.bench_function("array_utf8view", |b| {
+ b.iter(|| {
+ black_box(initcap.invoke_with_args(ScalarFunctionArgs {
+ args: array_view_args.clone(),
+ arg_fields: array_view_arg_fields.clone(),
+ number_rows: size,
+ return_field: Field::new("f", DataType::Utf8View,
true).into(),
+ config_options: Arc::clone(&config_options),
+ }))
+ })
+ });
+
+ group.finish();
+ }
+ }
+
+ // Unicode array benchmarks
for size in [1024, 4096, 8192] {
- let mut group = c.benchmark_group(format!("initcap size={size}"));
+ let mut group = c.benchmark_group(format!("initcap unicode
size={size}"));
group.sampling_mode(SamplingMode::Flat);
group.sample_size(10);
group.measurement_time(Duration::from_secs(10));
- // Array benchmark - Utf8
- let array_args = create_args::<i32>(size, 16, false);
- let array_arg_fields = vec![Field::new("arg_0", DataType::Utf8,
true).into()];
- let batch_len = size;
+ let unicode_args = create_unicode_utf8_args(size);
+ let unicode_arg_fields = vec![Field::new("arg_0", DataType::Utf8,
true).into()];
group.bench_function("array_utf8", |b| {
b.iter(|| {
black_box(initcap.invoke_with_args(ScalarFunctionArgs {
- args: array_args.clone(),
- arg_fields: array_arg_fields.clone(),
- number_rows: batch_len,
+ args: unicode_args.clone(),
+ arg_fields: unicode_arg_fields.clone(),
+ number_rows: size,
return_field: Field::new("f", DataType::Utf8, true).into(),
config_options: Arc::clone(&config_options),
}))
})
});
- // Array benchmark - Utf8View
- let array_view_args = create_args::<i32>(size, 16, true);
- let array_view_arg_fields =
+ let unicode_view_args = create_unicode_utf8view_args(size);
+ let unicode_view_arg_fields =
vec![Field::new("arg_0", DataType::Utf8View, true).into()];
group.bench_function("array_utf8view", |b| {
b.iter(|| {
black_box(initcap.invoke_with_args(ScalarFunctionArgs {
- args: array_view_args.clone(),
- arg_fields: array_view_arg_fields.clone(),
- number_rows: batch_len,
+ args: unicode_view_args.clone(),
+ arg_fields: unicode_view_arg_fields.clone(),
+ number_rows: size,
return_field: Field::new("f", DataType::Utf8View,
true).into(),
config_options: Arc::clone(&config_options),
}))
})
});
- // Scalar benchmark - Utf8 (the optimization we added)
+ group.finish();
+ }
+
+ // Scalar benchmarks: independent of array size, run once
+ {
+ let mut group = c.benchmark_group("initcap scalar");
+ group.sampling_mode(SamplingMode::Flat);
+ group.sample_size(10);
+ group.measurement_time(Duration::from_secs(10));
+
+ // Utf8
let scalar_args = vec![ColumnarValue::Scalar(ScalarValue::Utf8(Some(
"hello world test string".to_string(),
)))];
@@ -110,7 +182,7 @@ fn criterion_benchmark(c: &mut Criterion) {
})
});
- // Scalar benchmark - Utf8View
+ // Utf8View
let scalar_view_args =
vec![ColumnarValue::Scalar(ScalarValue::Utf8View(Some(
"hello world test string".to_string(),
)))];
diff --git a/datafusion/functions/src/unicode/initcap.rs
b/datafusion/functions/src/unicode/initcap.rs
index e2fc913099..a0cae69c52 100644
--- a/datafusion/functions/src/unicode/initcap.rs
+++ b/datafusion/functions/src/unicode/initcap.rs
@@ -19,8 +19,10 @@ use std::any::Any;
use std::sync::Arc;
use arrow::array::{
- Array, ArrayRef, GenericStringBuilder, OffsetSizeTrait, StringViewBuilder,
+ Array, ArrayRef, GenericStringArray, GenericStringBuilder, OffsetSizeTrait,
+ StringViewBuilder,
};
+use arrow::buffer::{Buffer, OffsetBuffer};
use arrow::datatypes::DataType;
use crate::utils::{make_scalar_function, utf8_to_str_type};
@@ -148,8 +150,8 @@ impl ScalarUDFImpl for InitcapFunc {
}
}
-/// Converts the first letter of each word to upper case and the rest to lower
-/// case. Words are sequences of alphanumeric characters separated by
+/// Converts the first letter of each word to uppercase and the rest to
+/// lowercase. Words are sequences of alphanumeric characters separated by
/// non-alphanumeric characters.
///
/// Example:
@@ -159,6 +161,10 @@ impl ScalarUDFImpl for InitcapFunc {
fn initcap<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
let string_array = as_generic_string_array::<T>(&args[0])?;
+ if string_array.is_ascii() {
+ return Ok(initcap_ascii_array(string_array));
+ }
+
let mut builder = GenericStringBuilder::<T>::with_capacity(
string_array.len(),
string_array.value_data().len(),
@@ -176,12 +182,67 @@ fn initcap<T: OffsetSizeTrait>(args: &[ArrayRef]) ->
Result<ArrayRef> {
Ok(Arc::new(builder.finish()) as ArrayRef)
}
+/// Fast path for `Utf8` or `LargeUtf8` arrays that are ASCII-only. We can use
a
+/// single pass over the buffer and operate directly on bytes.
+fn initcap_ascii_array<T: OffsetSizeTrait>(
+ string_array: &GenericStringArray<T>,
+) -> ArrayRef {
+ let offsets = string_array.offsets();
+ let src = string_array.value_data();
+ let first_offset = offsets.first().unwrap().as_usize();
+ let last_offset = offsets.last().unwrap().as_usize();
+
+ // For sliced arrays, only convert the visible bytes, not the entire input
+ // buffer.
+ let mut out = Vec::with_capacity(last_offset - first_offset);
+
+ for window in offsets.windows(2) {
+ let start = window[0].as_usize();
+ let end = window[1].as_usize();
+
+ let mut prev_is_alnum = false;
+ for &b in &src[start..end] {
+ let converted = if prev_is_alnum {
+ b.to_ascii_lowercase()
+ } else {
+ b.to_ascii_uppercase()
+ };
+ out.push(converted);
+ prev_is_alnum = b.is_ascii_alphanumeric();
+ }
+ }
+
+ let values = Buffer::from_vec(out);
+ let out_offsets = if first_offset == 0 {
+ offsets.clone()
+ } else {
+ // For sliced arrays, we need to rebase the offsets to reflect that the
+ // output only contains the bytes in the visible slice.
+ let rebased_offsets = offsets
+ .iter()
+ .map(|offset| T::usize_as(offset.as_usize() - first_offset))
+ .collect::<Vec<_>>();
+ OffsetBuffer::<T>::new(rebased_offsets.into())
+ };
+
+ // SAFETY: ASCII case conversion preserves byte length, so the original
+ // string boundaries are preserved. `out_offsets` is either identical to
+ // the input offsets or a rebased version relative to the compacted values
+ // buffer.
+ Arc::new(unsafe {
+ GenericStringArray::<T>::new_unchecked(
+ out_offsets,
+ values,
+ string_array.nulls().cloned(),
+ )
+ })
+}
+
fn initcap_utf8view(args: &[ArrayRef]) -> Result<ArrayRef> {
let string_view_array = as_string_view_array(&args[0])?;
-
let mut builder =
StringViewBuilder::with_capacity(string_view_array.len());
-
let mut container = String::new();
+
string_view_array.iter().for_each(|str| match str {
Some(s) => {
initcap_string(s, &mut container);
@@ -198,13 +259,16 @@ fn initcap_string(input: &str, container: &mut String) {
let mut prev_is_alphanumeric = false;
if input.is_ascii() {
- for c in input.chars() {
+ container.reserve(input.len());
+ // SAFETY: each byte is ASCII, so the result is valid UTF-8.
+ let out = unsafe { container.as_mut_vec() };
+ for &b in input.as_bytes() {
if prev_is_alphanumeric {
- container.push(c.to_ascii_lowercase());
+ out.push(b.to_ascii_lowercase());
} else {
- container.push(c.to_ascii_uppercase());
- };
- prev_is_alphanumeric = c.is_ascii_alphanumeric();
+ out.push(b.to_ascii_uppercase());
+ }
+ prev_is_alphanumeric = b.is_ascii_alphanumeric();
}
} else {
for c in input.chars() {
@@ -222,10 +286,11 @@ fn initcap_string(input: &str, container: &mut String) {
mod tests {
use crate::unicode::initcap::InitcapFunc;
use crate::utils::test::test_function;
- use arrow::array::{Array, StringArray, StringViewArray};
+ use arrow::array::{Array, ArrayRef, LargeStringArray, StringArray,
StringViewArray};
use arrow::datatypes::DataType::{Utf8, Utf8View};
use datafusion_common::{Result, ScalarValue};
use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
+ use std::sync::Arc;
#[test]
fn test_functions() -> Result<()> {
@@ -329,4 +394,114 @@ mod tests {
Ok(())
}
+
+ #[test]
+ fn test_initcap_ascii_array() -> Result<()> {
+ let array = StringArray::from(vec![
+ Some("hello world"),
+ None,
+ Some("foo-bar_baz/baX"),
+ Some(""),
+ Some("123 abc 456DEF"),
+ Some("ALL CAPS"),
+ Some("already correct"),
+ ]);
+ let args: Vec<ArrayRef> = vec![Arc::new(array)];
+ let result = super::initcap::<i32>(&args)?;
+ let result = result.as_any().downcast_ref::<StringArray>().unwrap();
+
+ assert_eq!(result.len(), 7);
+ assert_eq!(result.value(0), "Hello World");
+ assert!(result.is_null(1));
+ assert_eq!(result.value(2), "Foo-Bar_Baz/Bax");
+ assert_eq!(result.value(3), "");
+ assert_eq!(result.value(4), "123 Abc 456def");
+ assert_eq!(result.value(5), "All Caps");
+ assert_eq!(result.value(6), "Already Correct");
+ Ok(())
+ }
+
+ #[test]
+ fn test_initcap_ascii_large_array() -> Result<()> {
+ let array = LargeStringArray::from(vec![
+ Some("hello world"),
+ None,
+ Some("foo-bar_baz/baX"),
+ Some(""),
+ Some("123 abc 456DEF"),
+ Some("ALL CAPS"),
+ Some("already correct"),
+ ]);
+ let args: Vec<ArrayRef> = vec![Arc::new(array)];
+ let result = super::initcap::<i64>(&args)?;
+ let result =
result.as_any().downcast_ref::<LargeStringArray>().unwrap();
+
+ assert_eq!(result.len(), 7);
+ assert_eq!(result.value(0), "Hello World");
+ assert!(result.is_null(1));
+ assert_eq!(result.value(2), "Foo-Bar_Baz/Bax");
+ assert_eq!(result.value(3), "");
+ assert_eq!(result.value(4), "123 Abc 456def");
+ assert_eq!(result.value(5), "All Caps");
+ assert_eq!(result.value(6), "Already Correct");
+ Ok(())
+ }
+
+ /// Test that initcap works correctly on a sliced ASCII StringArray.
+ #[test]
+ fn test_initcap_sliced_ascii_array() -> Result<()> {
+ let array = StringArray::from(vec![
+ Some("hello world"),
+ Some("foo bar"),
+ Some("baz qux"),
+ ]);
+ // Slice to get only the last two elements. The resulting array's
+ // offsets are [11, 18, 25] (non-zero start), but value_data still
+ // contains the full original buffer.
+ let sliced = array.slice(1, 2);
+ let args: Vec<ArrayRef> = vec![Arc::new(sliced)];
+ let result = super::initcap::<i32>(&args)?;
+ let result = result.as_any().downcast_ref::<StringArray>().unwrap();
+
+ assert_eq!(result.len(), 2);
+ assert_eq!(result.value(0), "Foo Bar");
+ assert_eq!(result.value(1), "Baz Qux");
+
+ // The output values buffer should be compact
+ assert_eq!(*result.offsets().first().unwrap(), 0);
+ assert_eq!(
+ result.value_data().len(),
+ *result.offsets().last().unwrap() as usize
+ );
+ Ok(())
+ }
+
+ /// Test that initcap works correctly on a sliced ASCII LargeStringArray.
+ #[test]
+ fn test_initcap_sliced_ascii_large_array() -> Result<()> {
+ let array = LargeStringArray::from(vec![
+ Some("hello world"),
+ Some("foo bar"),
+ Some("baz qux"),
+ ]);
+ // Slice to get only the last two elements. The resulting array's
+ // offsets are [11, 18, 25] (non-zero start), but value_data still
+ // contains the full original buffer.
+ let sliced = array.slice(1, 2);
+ let args: Vec<ArrayRef> = vec![Arc::new(sliced)];
+ let result = super::initcap::<i64>(&args)?;
+ let result =
result.as_any().downcast_ref::<LargeStringArray>().unwrap();
+
+ assert_eq!(result.len(), 2);
+ assert_eq!(result.value(0), "Foo Bar");
+ assert_eq!(result.value(1), "Baz Qux");
+
+ // The output values buffer should be compact
+ assert_eq!(*result.offsets().first().unwrap(), 0);
+ assert_eq!(
+ result.value_data().len(),
+ *result.offsets().last().unwrap() as usize
+ );
+ Ok(())
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]