This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new c3f080774c perf: Optimize translate() UDF for scalar inputs (#20305)
c3f080774c is described below
commit c3f080774cd5dd588b5250317ac3b0fc393c0647
Author: Neil Conway <[email protected]>
AuthorDate: Thu Feb 19 02:03:19 2026 -0500
perf: Optimize translate() UDF for scalar inputs (#20305)
## Which issue does this PR close?
- Closes #20302.
## Rationale for this change
`translate()` is commonly invoked with constant values for its second
and third arguments. We can take advantage of that to significantly
optimize its performance by precomputing the translation lookup table,
rather than recomputing it for every row. For ASCII-only inputs, we can
further replace the hashmap lookup table with a fixed-size array that
maps ASCII byte values directly.
For scalar ASCII inputs, this yields roughly a 10x performance
improvement. For scalar UTF8 inputs, the improvement is closer to 50%,
with smaller gains for long strings.
Along the way, add support for `translate()` on `LargeUtf8` input, add an
SLT test for it, and improve the docs.
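For illustration, here is a minimal standalone sketch of the ASCII fast path
described above. The helper names `build_table` and `translate_ascii` are
illustrative only; the actual implementation lives in
`build_ascii_translate_table` and `translate_with_map` in the diff below.

```rust
// Illustrative sketch only -- not the code in this patch. It shows the idea of
// precomputing a 128-entry byte table from constant `from`/`to` arguments and
// then translating each row with plain byte lookups.

/// Sentinel: the byte has no `to` counterpart and is dropped from the output.
const DELETE: u8 = 0xFF;

fn build_table(from: &str, to: &str) -> Option<[u8; 128]> {
    if !from.is_ascii() || !to.is_ascii() {
        return None; // non-ASCII mapping: caller falls back to the grapheme path
    }
    let mut table = [0u8; 128];
    for i in 0..128u8 {
        table[i as usize] = i; // identity mapping by default
    }
    let to_bytes = to.as_bytes();
    let mut seen = [false; 128];
    for (i, b) in from.bytes().enumerate() {
        if !seen[b as usize] {
            seen[b as usize] = true; // first occurrence determines the mapping
            table[b as usize] = *to_bytes.get(i).unwrap_or(&DELETE);
        }
    }
    Some(table)
}

// Translate one ASCII row; non-ASCII rows would take the grapheme-based path.
fn translate_ascii(s: &str, table: &[u8; 128]) -> String {
    debug_assert!(s.is_ascii());
    s.bytes()
        .map(|b| table[b as usize])
        .filter(|&b| b != DELETE)
        .map(|b| b as char)
        .collect()
}

fn main() {
    let table = build_table("143", "ax").expect("ASCII arguments");
    // Matches the SLT test below: translate('12345', '143', 'ax') => 'a2x5'
    assert_eq!(translate_ascii("12345", &table), "a2x5");
}
```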
## What changes are included in this PR?
* Add a benchmark for scalar/constant input to translate
* Add a missing test case
* Improve translate() docs
* Support translate() on LargeUtf8 input
* Optimize translate() for scalar inputs by precomputing lookup hashmap
* Optimize translate() for ASCII inputs by precomputing ASCII byte-wise
lookup table
## Are these changes tested?
Yes. Added an extra test case and ran benchmarks.
## Are there any user-facing changes?
No.
---------
Co-authored-by: Martin Grigorov <[email protected]>
Co-authored-by: Jeffrey Vo <[email protected]>
---
datafusion/functions/benches/translate.rs | 48 ++++--
datafusion/functions/src/unicode/translate.rs | 186 ++++++++++++++++++++++-
datafusion/sqllogictest/test_files/functions.slt | 5 +
docs/source/user-guide/sql/scalar_functions.md | 8 +-
4 files changed, 221 insertions(+), 26 deletions(-)
diff --git a/datafusion/functions/benches/translate.rs b/datafusion/functions/benches/translate.rs
index f63faacd39..d0568ba0f5 100644
--- a/datafusion/functions/benches/translate.rs
+++ b/datafusion/functions/benches/translate.rs
@@ -19,17 +19,19 @@ use arrow::array::OffsetSizeTrait;
use arrow::datatypes::{DataType, Field};
use arrow::util::bench_util::create_string_array_with_len;
use criterion::{Criterion, SamplingMode, criterion_group, criterion_main};
-use datafusion_common::DataFusionError;
use datafusion_common::config::ConfigOptions;
+use datafusion_common::{DataFusionError, ScalarValue};
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
use datafusion_functions::unicode;
use std::hint::black_box;
use std::sync::Arc;
use std::time::Duration;
-fn create_args<O: OffsetSizeTrait>(size: usize, str_len: usize) -> Vec<ColumnarValue> {
+fn create_args_array_from_to<O: OffsetSizeTrait>(
+ size: usize,
+ str_len: usize,
+) -> Vec<ColumnarValue> {
    let string_array = Arc::new(create_string_array_with_len::<O>(size, 0.1, str_len));
- // Create simple from/to strings for translation
let from_array = Arc::new(create_string_array_with_len::<O>(size, 0.1, 3));
let to_array = Arc::new(create_string_array_with_len::<O>(size, 0.1, 2));
@@ -40,6 +42,19 @@ fn create_args<O: OffsetSizeTrait>(size: usize, str_len: usize) -> Vec<ColumnarV
]
}
+fn create_args_scalar_from_to<O: OffsetSizeTrait>(
+ size: usize,
+ str_len: usize,
+) -> Vec<ColumnarValue> {
+    let string_array = Arc::new(create_string_array_with_len::<O>(size, 0.1, str_len));
+
+ vec![
+ ColumnarValue::Array(string_array),
+ ColumnarValue::Scalar(ScalarValue::from("aeiou")),
+ ColumnarValue::Scalar(ScalarValue::from("AEIOU")),
+ ]
+}
+
fn invoke_translate_with_args(
args: Vec<ColumnarValue>,
number_rows: usize,
@@ -67,17 +82,22 @@ fn criterion_benchmark(c: &mut Criterion) {
group.sample_size(10);
group.measurement_time(Duration::from_secs(10));
- for str_len in [8, 32] {
- let args = create_args::<i32>(size, str_len);
- group.bench_function(
- format!("translate_string [size={size}, str_len={str_len}]"),
- |b| {
- b.iter(|| {
- let args_cloned = args.clone();
-                    black_box(invoke_translate_with_args(args_cloned, size))
- })
- },
- );
+ for str_len in [8, 32, 128, 1024] {
+ let args = create_args_array_from_to::<i32>(size, str_len);
+            group.bench_function(format!("array_from_to [str_len={str_len}]"), |b| {
+ b.iter(|| {
+ let args_cloned = args.clone();
+ black_box(invoke_translate_with_args(args_cloned, size))
+ })
+ });
+
+ let args = create_args_scalar_from_to::<i32>(size, str_len);
+            group.bench_function(format!("scalar_from_to [str_len={str_len}]"), |b| {
+ b.iter(|| {
+ let args_cloned = args.clone();
+ black_box(invoke_translate_with_args(args_cloned, size))
+ })
+ });
}
group.finish();
diff --git a/datafusion/functions/src/unicode/translate.rs b/datafusion/functions/src/unicode/translate.rs
index f97c0ed5c2..e86eaf8111 100644
--- a/datafusion/functions/src/unicode/translate.rs
+++ b/datafusion/functions/src/unicode/translate.rs
@@ -35,8 +35,8 @@ use datafusion_macros::user_doc;
#[user_doc(
doc_section(label = "String Functions"),
-    description = "Translates characters in a string to specified translation characters.",
- syntax_example = "translate(str, chars, translation)",
+ description = "Performs character-wise substitution based on a mapping.",
+ syntax_example = "translate(str, from, to)",
sql_example = r#"```sql
> select translate('twice', 'wic', 'her');
+--------------------------------------------------+
@@ -46,10 +46,10 @@ use datafusion_macros::user_doc;
+--------------------------------------------------+
```"#,
standard_argument(name = "str", prefix = "String"),
- argument(name = "chars", description = "Characters to translate."),
+ argument(name = "from", description = "The characters to be replaced."),
argument(
- name = "translation",
-        description = "Translation characters. Translation characters replace only characters at the same position in the **chars** string."
+ name = "to",
+        description = "The characters to replace them with. Each character in **from** that is found in **str** is replaced by the character at the same index in **to**. Any characters in **from** that don't have a corresponding character in **to** are removed. If a character appears more than once in **from**, the first occurrence determines the mapping."
)
)]
#[derive(Debug, PartialEq, Eq, Hash)]
@@ -71,6 +71,7 @@ impl TranslateFunc {
vec![
Exact(vec![Utf8View, Utf8, Utf8]),
Exact(vec![Utf8, Utf8, Utf8]),
+ Exact(vec![LargeUtf8, Utf8, Utf8]),
],
Volatility::Immutable,
),
@@ -99,6 +100,61 @@ impl ScalarUDFImpl for TranslateFunc {
&self,
args: datafusion_expr::ScalarFunctionArgs,
) -> Result<ColumnarValue> {
+ // When from and to are scalars, pre-build the translation map once
+ if let (Some(from_str), Some(to_str)) = (
+ try_as_scalar_str(&args.args[1]),
+ try_as_scalar_str(&args.args[2]),
+ ) {
+ let to_graphemes: Vec<&str> = to_str.graphemes(true).collect();
+
+ let mut from_map: HashMap<&str, usize> = HashMap::new();
+ for (index, c) in from_str.graphemes(true).enumerate() {
+ // Ignore characters that already exist in from_map
+ from_map.entry(c).or_insert(index);
+ }
+
+ let ascii_table = build_ascii_translate_table(from_str, to_str);
+
+            let string_array = args.args[0].to_array_of_size(args.number_rows)?;
+
+ let result = match string_array.data_type() {
+ DataType::Utf8View => {
+ let arr = string_array.as_string_view();
+ translate_with_map::<i32, _>(
+ arr,
+ &from_map,
+ &to_graphemes,
+ ascii_table.as_ref(),
+ )
+ }
+ DataType::Utf8 => {
+ let arr = string_array.as_string::<i32>();
+ translate_with_map::<i32, _>(
+ arr,
+ &from_map,
+ &to_graphemes,
+ ascii_table.as_ref(),
+ )
+ }
+ DataType::LargeUtf8 => {
+ let arr = string_array.as_string::<i64>();
+ translate_with_map::<i64, _>(
+ arr,
+ &from_map,
+ &to_graphemes,
+ ascii_table.as_ref(),
+ )
+ }
+ other => {
+ return exec_err!(
+                        "Unsupported data type {other:?} for function translate"
+ );
+ }
+ }?;
+
+ return Ok(ColumnarValue::Array(result));
+ }
+
make_scalar_function(invoke_translate, vec![])(&args.args)
}
@@ -107,6 +163,14 @@ impl ScalarUDFImpl for TranslateFunc {
}
}
+/// If `cv` is a non-null scalar string, return its value.
+fn try_as_scalar_str(cv: &ColumnarValue) -> Option<&str> {
+ match cv {
+ ColumnarValue::Scalar(s) => s.try_as_str().flatten(),
+ _ => None,
+ }
+}
+
fn invoke_translate(args: &[ArrayRef]) -> Result<ArrayRef> {
match args[0].data_type() {
DataType::Utf8View => {
@@ -123,8 +187,8 @@ fn invoke_translate(args: &[ArrayRef]) -> Result<ArrayRef> {
}
DataType::LargeUtf8 => {
let string_array = args[0].as_string::<i64>();
- let from_array = args[1].as_string::<i64>();
- let to_array = args[2].as_string::<i64>();
+ let from_array = args[1].as_string::<i32>();
+ let to_array = args[2].as_string::<i32>();
translate::<i64, _, _>(string_array, from_array, to_array)
}
other => {
@@ -170,7 +234,7 @@ where
// Build from_map using reusable buffer
from_graphemes.extend(from.graphemes(true));
for (index, c) in from_graphemes.iter().enumerate() {
-            // Ignore characters that already exist in from_map, else insert
+ // Ignore characters that already exist in from_map
from_map.entry(*c).or_insert(index);
}
@@ -199,6 +263,97 @@ where
Ok(Arc::new(result) as ArrayRef)
}
+/// Sentinel value in the ASCII translate table indicating the character should
+/// be deleted (the `from` character has no corresponding `to` character). Any
+/// value > 127 works since valid ASCII is 0–127.
+const ASCII_DELETE: u8 = 0xFF;
+
+/// If `from` and `to` are both ASCII, build a fixed-size lookup table for
+/// translation. Each entry maps an input byte to its replacement byte, or to
+/// [`ASCII_DELETE`] if the character should be removed. Returns `None` if
+/// either string contains non-ASCII characters.
+fn build_ascii_translate_table(from: &str, to: &str) -> Option<[u8; 128]> {
+ if !from.is_ascii() || !to.is_ascii() {
+ return None;
+ }
+ let mut table = [0u8; 128];
+ for i in 0..128u8 {
+ table[i as usize] = i;
+ }
+ let to_bytes = to.as_bytes();
+ let mut seen = [false; 128];
+ for (i, from_byte) in from.bytes().enumerate() {
+ let idx = from_byte as usize;
+ if !seen[idx] {
+ seen[idx] = true;
+ if i < to_bytes.len() {
+ table[idx] = to_bytes[i];
+ } else {
+ table[idx] = ASCII_DELETE;
+ }
+ }
+ }
+ Some(table)
+}
+
+/// Optimized translate for constant `from` and `to` arguments: uses a pre-built
+/// translation map instead of rebuilding it for every row. When an ASCII byte
+/// lookup table is provided, ASCII input rows use the lookup table; non-ASCII
+/// inputs fall back to using the map.
+fn translate_with_map<'a, T: OffsetSizeTrait, V>(
+ string_array: V,
+ from_map: &HashMap<&str, usize>,
+ to_graphemes: &[&str],
+ ascii_table: Option<&[u8; 128]>,
+) -> Result<ArrayRef>
+where
+ V: ArrayAccessor<Item = &'a str>,
+{
+ let mut result_graphemes: Vec<&str> = Vec::new();
+ let mut ascii_buf: Vec<u8> = Vec::new();
+
+ let result = ArrayIter::new(string_array)
+ .map(|string| {
+ string.map(|s| {
+ // Fast path: byte-level table lookup for ASCII strings
+ if let Some(table) = ascii_table
+ && s.is_ascii()
+ {
+ ascii_buf.clear();
+ for &b in s.as_bytes() {
+ let mapped = table[b as usize];
+ if mapped != ASCII_DELETE {
+ ascii_buf.push(mapped);
+ }
+ }
+ // SAFETY: all bytes are ASCII, hence valid UTF-8.
+ return unsafe {
+ std::str::from_utf8_unchecked(&ascii_buf).to_owned()
+ };
+ }
+
+ // Slow path: grapheme-based translation
+ result_graphemes.clear();
+
+ for c in s.graphemes(true) {
+ match from_map.get(c) {
+ Some(n) => {
+ if let Some(replacement) = to_graphemes.get(*n) {
+ result_graphemes.push(*replacement);
+ }
+ }
+ None => result_graphemes.push(c),
+ }
+ }
+
+ result_graphemes.concat()
+ })
+ })
+ .collect::<GenericStringArray<T>>();
+
+ Ok(Arc::new(result) as ArrayRef)
+}
+
#[cfg(test)]
mod tests {
use arrow::array::{Array, StringArray};
@@ -284,6 +439,21 @@ mod tests {
Utf8,
StringArray
);
+ // Non-ASCII input with ASCII scalar from/to: exercises the
+ // grapheme fallback within translate_with_map.
+ test_function!(
+ TranslateFunc::new(),
+ vec![
+ ColumnarValue::Scalar(ScalarValue::from("café")),
+ ColumnarValue::Scalar(ScalarValue::from("ae")),
+ ColumnarValue::Scalar(ScalarValue::from("AE"))
+ ],
+ Ok(Some("cAfé")),
+ &str,
+ Utf8,
+ StringArray
+ );
+
#[cfg(not(feature = "unicode_expressions"))]
test_function!(
TranslateFunc::new(),
diff --git a/datafusion/sqllogictest/test_files/functions.slt b/datafusion/sqllogictest/test_files/functions.slt
index 6c87d618c7..35a32897d0 100644
--- a/datafusion/sqllogictest/test_files/functions.slt
+++ b/datafusion/sqllogictest/test_files/functions.slt
@@ -239,6 +239,11 @@ SELECT translate('12345', '143', NULL)
----
NULL
+query T
+SELECT translate(arrow_cast('12345', 'LargeUtf8'), '143', 'ax')
+----
+a2x5
+
statement ok
CREATE TABLE test(
c1 VARCHAR
diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md
index e09c4cb7cb..02b4b55fe6 100644
--- a/docs/source/user-guide/sql/scalar_functions.md
+++ b/docs/source/user-guide/sql/scalar_functions.md
@@ -2068,17 +2068,17 @@ to_hex(int)
### `translate`
-Translates characters in a string to specified translation characters.
+Performs character-wise substitution based on a mapping.
```sql
-translate(str, chars, translation)
+translate(str, from, to)
```
#### Arguments
- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators.
-- **chars**: Characters to translate.
-- **translation**: Translation characters. Translation characters replace only characters at the same position in the **chars** string.
+- **from**: The characters to be replaced.
+- **to**: The characters to replace them with. Each character in **from** that is found in **str** is replaced by the character at the same index in **to**. Any characters in **from** that don't have a corresponding character in **to** are removed. If a character appears more than once in **from**, the first occurrence determines the mapping.
#### Example
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]