This is an automated email from the ASF dual-hosted git repository.

weijun pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new b09c09a727 Speedup `to_hex` (~2x faster) (#14686)
b09c09a727 is described below

commit b09c09a7278ea8b96ec0e50f55149e0bbb7f62f9
Author: Simon Vandel Sillesen <[email protected]>
AuthorDate: Mon Feb 17 12:26:45 2025 +0100

    Speedup `to_hex` (~2x faster) (#14686)
    
    * add bench
    
    * speed up by using write!
---
 datafusion/functions/Cargo.toml           |  5 ++++
 datafusion/functions/benches/to_hex.rs    | 47 +++++++++++++++++++++++++++++++
 datafusion/functions/src/string/to_hex.rs | 39 +++++++++++++++----------
 3 files changed, 76 insertions(+), 15 deletions(-)

diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml
index a7e11bd85a..c77e58f0c0 100644
--- a/datafusion/functions/Cargo.toml
+++ b/datafusion/functions/Cargo.toml
@@ -113,6 +113,11 @@ harness = false
 name = "uuid"
 required-features = ["string_expressions"]
 
+[[bench]]
+harness = false
+name = "to_hex"
+required-features = ["string_expressions"]
+
 [[bench]]
 harness = false
 name = "regx"
diff --git a/datafusion/functions/benches/to_hex.rs b/datafusion/functions/benches/to_hex.rs
new file mode 100644
index 0000000000..ce3767cc48
--- /dev/null
+++ b/datafusion/functions/benches/to_hex.rs
@@ -0,0 +1,47 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+extern crate criterion;
+
+use arrow::{
+    datatypes::{Int32Type, Int64Type},
+    util::bench_util::create_primitive_array,
+};
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use datafusion_expr::ColumnarValue;
+use datafusion_functions::string;
+use std::sync::Arc;
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let hex = string::to_hex();
+    let size = 1024;
+    let i32_array = Arc::new(create_primitive_array::<Int32Type>(size, 0.2));
+    let batch_len = i32_array.len();
+    let i32_args = vec![ColumnarValue::Array(i32_array)];
+    c.bench_function(&format!("to_hex i32 array: {}", size), |b| {
+        b.iter(|| black_box(hex.invoke_batch(&i32_args, batch_len).unwrap()))
+    });
+    let i64_array = Arc::new(create_primitive_array::<Int64Type>(size, 0.2));
+    let batch_len = i64_array.len();
+    let i64_args = vec![ColumnarValue::Array(i64_array)];
+    c.bench_function(&format!("to_hex i64 array: {}", size), |b| {
+        b.iter(|| black_box(hex.invoke_batch(&i64_args, batch_len).unwrap()))
+    });
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/functions/src/string/to_hex.rs b/datafusion/functions/src/string/to_hex.rs
index 64654ef6ef..5c7c92cc34 100644
--- a/datafusion/functions/src/string/to_hex.rs
+++ b/datafusion/functions/src/string/to_hex.rs
@@ -16,9 +16,10 @@
 // under the License.
 
 use std::any::Any;
+use std::fmt::Write;
 use std::sync::Arc;
 
-use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait};
+use arrow::array::{ArrayRef, GenericStringBuilder, OffsetSizeTrait};
 use arrow::datatypes::{
     ArrowNativeType, ArrowPrimitiveType, DataType, Int32Type, Int64Type,
 };
@@ -40,22 +41,30 @@ where
 {
     let integer_array = as_primitive_array::<T>(&args[0])?;
 
-    let result = integer_array
-        .iter()
-        .map(|integer| {
-            if let Some(value) = integer {
-                if let Some(value_usize) = value.to_usize() {
-                    Ok(Some(format!("{value_usize:x}")))
-                } else if let Some(value_isize) = value.to_isize() {
-                    Ok(Some(format!("{value_isize:x}")))
-                } else {
-                    exec_err!("Unsupported data type {integer:?} for function to_hex")
-                }
+    let mut result = GenericStringBuilder::<i32>::with_capacity(
+        integer_array.len(),
+        // * 8 to convert to bits, / 4 bits per hex char
+        integer_array.len() * (T::Native::get_byte_width() * 8 / 4),
+    );
+
+    for integer in integer_array {
+        if let Some(value) = integer {
+            if let Some(value_usize) = value.to_usize() {
+                write!(result, "{value_usize:x}")?;
+            } else if let Some(value_isize) = value.to_isize() {
+                write!(result, "{value_isize:x}")?;
             } else {
-                Ok(None)
+                return exec_err!(
+                    "Unsupported data type {integer:?} for function to_hex"
+                );
             }
-        })
-        .collect::<Result<GenericStringArray<i32>>>()?;
+            result.append_value("");
+        } else {
+            result.append_null();
+        }
+    }
+
+    let result = result.finish();
 
     Ok(Arc::new(result) as ArrayRef)
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to