This is an automated email from the ASF dual-hosted git repository.
weijun pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new b09c09a727 Speedup `to_hex` (~2x faster) (#14686)
b09c09a727 is described below
commit b09c09a7278ea8b96ec0e50f55149e0bbb7f62f9
Author: Simon Vandel Sillesen <[email protected]>
AuthorDate: Mon Feb 17 12:26:45 2025 +0100
Speedup `to_hex` (~2x faster) (#14686)
* add bench
* speed up by using write!
---
datafusion/functions/Cargo.toml | 5 ++++
datafusion/functions/benches/to_hex.rs | 47 +++++++++++++++++++++++++++++++
datafusion/functions/src/string/to_hex.rs | 39 +++++++++++++++----------
3 files changed, 76 insertions(+), 15 deletions(-)
diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml
index a7e11bd85a..c77e58f0c0 100644
--- a/datafusion/functions/Cargo.toml
+++ b/datafusion/functions/Cargo.toml
@@ -113,6 +113,11 @@ harness = false
name = "uuid"
required-features = ["string_expressions"]
+[[bench]]
+harness = false
+name = "to_hex"
+required-features = ["string_expressions"]
+
[[bench]]
harness = false
name = "regx"
diff --git a/datafusion/functions/benches/to_hex.rs b/datafusion/functions/benches/to_hex.rs
new file mode 100644
index 0000000000..ce3767cc48
--- /dev/null
+++ b/datafusion/functions/benches/to_hex.rs
@@ -0,0 +1,47 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+extern crate criterion;
+
+use arrow::{
+ datatypes::{Int32Type, Int64Type},
+ util::bench_util::create_primitive_array,
+};
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use datafusion_expr::ColumnarValue;
+use datafusion_functions::string;
+use std::sync::Arc;
+
+fn criterion_benchmark(c: &mut Criterion) {
+ let hex = string::to_hex();
+ let size = 1024;
+ let i32_array = Arc::new(create_primitive_array::<Int32Type>(size, 0.2));
+ let batch_len = i32_array.len();
+ let i32_args = vec![ColumnarValue::Array(i32_array)];
+ c.bench_function(&format!("to_hex i32 array: {}", size), |b| {
+ b.iter(|| black_box(hex.invoke_batch(&i32_args, batch_len).unwrap()))
+ });
+ let i64_array = Arc::new(create_primitive_array::<Int64Type>(size, 0.2));
+ let batch_len = i64_array.len();
+ let i64_args = vec![ColumnarValue::Array(i64_array)];
+ c.bench_function(&format!("to_hex i64 array: {}", size), |b| {
+ b.iter(|| black_box(hex.invoke_batch(&i64_args, batch_len).unwrap()))
+ });
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/functions/src/string/to_hex.rs b/datafusion/functions/src/string/to_hex.rs
index 64654ef6ef..5c7c92cc34 100644
--- a/datafusion/functions/src/string/to_hex.rs
+++ b/datafusion/functions/src/string/to_hex.rs
@@ -16,9 +16,10 @@
// under the License.
use std::any::Any;
+use std::fmt::Write;
use std::sync::Arc;
-use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait};
+use arrow::array::{ArrayRef, GenericStringBuilder, OffsetSizeTrait};
use arrow::datatypes::{
ArrowNativeType, ArrowPrimitiveType, DataType, Int32Type, Int64Type,
};
@@ -40,22 +41,30 @@ where
{
let integer_array = as_primitive_array::<T>(&args[0])?;
- let result = integer_array
- .iter()
- .map(|integer| {
- if let Some(value) = integer {
- if let Some(value_usize) = value.to_usize() {
- Ok(Some(format!("{value_usize:x}")))
- } else if let Some(value_isize) = value.to_isize() {
- Ok(Some(format!("{value_isize:x}")))
- } else {
- exec_err!("Unsupported data type {integer:?} for function to_hex")
- }
+ let mut result = GenericStringBuilder::<i32>::with_capacity(
+ integer_array.len(),
+ // * 8 to convert to bits, / 4 bits per hex char
+ integer_array.len() * (T::Native::get_byte_width() * 8 / 4),
+ );
+
+ for integer in integer_array {
+ if let Some(value) = integer {
+ if let Some(value_usize) = value.to_usize() {
+ write!(result, "{value_usize:x}")?;
+ } else if let Some(value_isize) = value.to_isize() {
+ write!(result, "{value_isize:x}")?;
} else {
- Ok(None)
+ return exec_err!(
+ "Unsupported data type {integer:?} for function to_hex"
+ );
}
- })
- .collect::<Result<GenericStringArray<i32>>>()?;
+ result.append_value("");
+ } else {
+ result.append_null();
+ }
+ }
+
+ let result = result.finish();
Ok(Arc::new(result) as ArrayRef)
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]