This is an automated email from the ASF dual-hosted git repository.
comphead pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-comet.git
The following commit(s) were added to refs/heads/main by this push:
new 619f368dc chore: use datafusion impl of hex function (#2915)
619f368dc is described below
commit 619f368dc4d437be8ef0d9605d1c4d152932ad92
Author: Kazantsev Maksim <[email protected]>
AuthorDate: Tue Dec 16 08:02:07 2025 -0800
chore: use datafusion impl of hex function (#2915)
* impl map_from_entries
* Use datafusion impl of hex function
* Revert "impl map_from_entries"
This reverts commit 768b3e90f261c7aea58bdb98dc698b90deeeae34.
---------
Co-authored-by: Kazantsev Maksim <[email protected]>
---
native/core/src/execution/jni_api.rs | 2 +
native/spark-expr/src/comet_scalar_funcs.rs | 8 +-
native/spark-expr/src/lib.rs | 4 +-
native/spark-expr/src/math_funcs/hex.rs | 291 ----------------------------
native/spark-expr/src/math_funcs/mod.rs | 2 -
5 files changed, 6 insertions(+), 301 deletions(-)
diff --git a/native/core/src/execution/jni_api.rs b/native/core/src/execution/jni_api.rs
index a24d99305..aaed1fc74 100644
--- a/native/core/src/execution/jni_api.rs
+++ b/native/core/src/execution/jni_api.rs
@@ -47,6 +47,7 @@ use datafusion_spark::function::datetime::date_sub::SparkDateSub;
use datafusion_spark::function::hash::sha1::SparkSha1;
use datafusion_spark::function::hash::sha2::SparkSha2;
use datafusion_spark::function::math::expm1::SparkExpm1;
+use datafusion_spark::function::math::hex::SparkHex;
use datafusion_spark::function::string::char::CharFunc;
use datafusion_spark::function::string::concat::SparkConcat;
use futures::poll;
@@ -337,6 +338,7 @@ fn register_datafusion_spark_function(session_ctx: &SessionContext) {
session_ctx.register_udf(ScalarUDF::new_from_impl(SparkSha1::default()));
session_ctx.register_udf(ScalarUDF::new_from_impl(SparkConcat::default()));
session_ctx.register_udf(ScalarUDF::new_from_impl(SparkBitwiseNot::default()));
+ session_ctx.register_udf(ScalarUDF::new_from_impl(SparkHex::default()));
}
/// Prepares arrow arrays for output.
diff --git a/native/spark-expr/src/comet_scalar_funcs.rs b/native/spark-expr/src/comet_scalar_funcs.rs
index e9f1caa8a..56b12a9e4 100644
--- a/native/spark-expr/src/comet_scalar_funcs.rs
+++ b/native/spark-expr/src/comet_scalar_funcs.rs
@@ -21,8 +21,8 @@ use crate::math_funcs::checked_arithmetic::{checked_add, checked_div, checked_mu
use crate::math_funcs::modulo_expr::spark_modulo;
use crate::{
spark_array_repeat, spark_ceil, spark_decimal_div, spark_decimal_integral_div, spark_floor,
- spark_hex, spark_isnan, spark_lpad, spark_make_decimal, spark_read_side_padding, spark_round,
- spark_rpad, spark_unhex, spark_unscaled_value, EvalMode, SparkBitwiseCount, SparkDateTrunc,
+ spark_isnan, spark_lpad, spark_make_decimal, spark_read_side_padding, spark_round, spark_rpad,
+ spark_unhex, spark_unscaled_value, EvalMode, SparkBitwiseCount, SparkDateTrunc,
SparkStringSpace,
};
use arrow::datatypes::DataType;
@@ -130,10 +130,6 @@ pub fn create_comet_physical_fun_with_eval_mode(
"make_decimal" => {
make_comet_scalar_udf!("make_decimal", spark_make_decimal, data_type)
}
- "hex" => {
- let func = Arc::new(spark_hex);
- make_comet_scalar_udf!("hex", func, without data_type)
- }
"unhex" => {
let func = Arc::new(spark_unhex);
make_comet_scalar_udf!("unhex", func, without data_type)
diff --git a/native/spark-expr/src/lib.rs b/native/spark-expr/src/lib.rs
index 932fcbe53..2903061d6 100644
--- a/native/spark-expr/src/lib.rs
+++ b/native/spark-expr/src/lib.rs
@@ -74,8 +74,8 @@ pub use hash_funcs::*;
pub use json_funcs::ToJson;
pub use math_funcs::{
create_modulo_expr, create_negate_expr, spark_ceil, spark_decimal_div,
- spark_decimal_integral_div, spark_floor, spark_hex, spark_make_decimal, spark_round,
- spark_unhex, spark_unscaled_value, CheckOverflow, NegativeExpr, NormalizeNaNAndZero,
+ spark_decimal_integral_div, spark_floor, spark_make_decimal, spark_round, spark_unhex,
+ spark_unscaled_value, CheckOverflow, NegativeExpr, NormalizeNaNAndZero,
+ spark_unscaled_value, CheckOverflow, NegativeExpr, NormalizeNaNAndZero,
};
pub use string_funcs::*;
diff --git a/native/spark-expr/src/math_funcs/hex.rs b/native/spark-expr/src/math_funcs/hex.rs
deleted file mode 100644
index 5f476a673..000000000
--- a/native/spark-expr/src/math_funcs/hex.rs
+++ /dev/null
@@ -1,291 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use std::sync::Arc;
-
-use arrow::array::StringArray;
-use arrow::datatypes::DataType;
-use arrow::{
- array::{as_dictionary_array, as_largestring_array, as_string_array},
- datatypes::Int32Type,
-};
-use datafusion::common::{
- cast::{as_binary_array, as_fixed_size_binary_array, as_int64_array},
- exec_err, DataFusionError,
-};
-use datafusion::logical_expr::ColumnarValue;
-use std::fmt::Write;
-
-fn hex_int64(num: i64) -> String {
- format!("{num:X}")
-}
-
-#[inline(always)]
-fn hex_encode<T: AsRef<[u8]>>(data: T, lower_case: bool) -> String {
- let mut s = String::with_capacity(data.as_ref().len() * 2);
- if lower_case {
- for b in data.as_ref() {
- // Writing to a string never errors, so we can unwrap here.
- write!(&mut s, "{b:02x}").unwrap();
- }
- } else {
- for b in data.as_ref() {
- // Writing to a string never errors, so we can unwrap here.
- write!(&mut s, "{b:02X}").unwrap();
- }
- }
- s
-}
-
-#[inline(always)]
-fn hex_bytes<T: AsRef<[u8]>>(bytes: T) -> Result<String, std::fmt::Error> {
- let hex_string = hex_encode(bytes, false);
- Ok(hex_string)
-}
-
-/// Spark-compatible `hex` function
-pub fn spark_hex(args: &[ColumnarValue]) -> Result<ColumnarValue, DataFusionError> {
- if args.len() != 1 {
- return Err(DataFusionError::Internal(
- "hex expects exactly one argument".to_string(),
- ));
- }
-
- match &args[0] {
- ColumnarValue::Array(array) => match array.data_type() {
- DataType::Int64 => {
- let array = as_int64_array(array)?;
-
- let hexed_array: StringArray = array.iter().map(|v| v.map(hex_int64)).collect();
-
- Ok(ColumnarValue::Array(Arc::new(hexed_array)))
- }
- DataType::Utf8 => {
- let array = as_string_array(array);
-
- let hexed: StringArray = array
- .iter()
- .map(|v| v.map(hex_bytes).transpose())
- .collect::<Result<_, _>>()?;
-
- Ok(ColumnarValue::Array(Arc::new(hexed)))
- }
- DataType::LargeUtf8 => {
- let array = as_largestring_array(array);
-
- let hexed: StringArray = array
- .iter()
- .map(|v| v.map(hex_bytes).transpose())
- .collect::<Result<_, _>>()?;
-
- Ok(ColumnarValue::Array(Arc::new(hexed)))
- }
- DataType::Binary => {
- let array = as_binary_array(array)?;
-
- let hexed: StringArray = array
- .iter()
- .map(|v| v.map(hex_bytes).transpose())
- .collect::<Result<_, _>>()?;
-
- Ok(ColumnarValue::Array(Arc::new(hexed)))
- }
- DataType::FixedSizeBinary(_) => {
- let array = as_fixed_size_binary_array(array)?;
-
- let hexed: StringArray = array
- .iter()
- .map(|v| v.map(hex_bytes).transpose())
- .collect::<Result<_, _>>()?;
-
- Ok(ColumnarValue::Array(Arc::new(hexed)))
- }
- DataType::Dictionary(_, value_type) => {
- let dict = as_dictionary_array::<Int32Type>(&array);
-
- let values = match **value_type {
- DataType::Int64 => as_int64_array(dict.values())?
- .iter()
- .map(|v| v.map(hex_int64))
- .collect::<Vec<_>>(),
- DataType::Utf8 => as_string_array(dict.values())
- .iter()
- .map(|v| v.map(hex_bytes).transpose())
- .collect::<Result<_, _>>()?,
- DataType::Binary => as_binary_array(dict.values())?
- .iter()
- .map(|v| v.map(hex_bytes).transpose())
- .collect::<Result<_, _>>()?,
- _ => exec_err!(
- "hex got an unexpected argument type: {:?}",
- array.data_type()
- )?,
- };
-
- let new_values: Vec<Option<String>> = dict
- .keys()
- .iter()
- .map(|key| key.map(|k| values[k as usize].clone()).unwrap_or(None))
- .collect();
-
- let string_array_values = StringArray::from(new_values);
-
- Ok(ColumnarValue::Array(Arc::new(string_array_values)))
- }
- _ => exec_err!(
- "hex got an unexpected argument type: {:?}",
- array.data_type()
- ),
- },
- _ => exec_err!("native hex does not support scalar values at this time"),
- }
-}
-
-#[cfg(test)]
-mod test {
- use std::sync::Arc;
-
- use arrow::array::{Int64Array, StringArray};
- use arrow::{
- array::{
- as_string_array, BinaryDictionaryBuilder, PrimitiveDictionaryBuilder, StringBuilder,
- StringDictionaryBuilder,
- },
- datatypes::{Int32Type, Int64Type},
- };
- use datafusion::logical_expr::ColumnarValue;
-
- #[test]
- fn test_dictionary_hex_utf8() {
- let mut input_builder = StringDictionaryBuilder::<Int32Type>::new();
- input_builder.append_value("hi");
- input_builder.append_value("bye");
- input_builder.append_null();
- input_builder.append_value("rust");
- let input = input_builder.finish();
-
- let mut string_builder = StringBuilder::new();
- string_builder.append_value("6869");
- string_builder.append_value("627965");
- string_builder.append_null();
- string_builder.append_value("72757374");
- let expected = string_builder.finish();
-
- let columnar_value = ColumnarValue::Array(Arc::new(input));
- let result = super::spark_hex(&[columnar_value]).unwrap();
-
- let result = match result {
- ColumnarValue::Array(array) => array,
- _ => panic!("Expected array"),
- };
-
- let result = as_string_array(&result);
-
- assert_eq!(result, &expected);
- }
-
- #[test]
- fn test_dictionary_hex_int64() {
- let mut input_builder = PrimitiveDictionaryBuilder::<Int32Type, Int64Type>::new();
- input_builder.append_value(1);
- input_builder.append_value(2);
- input_builder.append_null();
- input_builder.append_value(3);
- let input = input_builder.finish();
-
- let mut string_builder = StringBuilder::new();
- string_builder.append_value("1");
- string_builder.append_value("2");
- string_builder.append_null();
- string_builder.append_value("3");
- let expected = string_builder.finish();
-
- let columnar_value = ColumnarValue::Array(Arc::new(input));
- let result = super::spark_hex(&[columnar_value]).unwrap();
-
- let result = match result {
- ColumnarValue::Array(array) => array,
- _ => panic!("Expected array"),
- };
-
- let result = as_string_array(&result);
-
- assert_eq!(result, &expected);
- }
-
- #[test]
- fn test_dictionary_hex_binary() {
- let mut input_builder = BinaryDictionaryBuilder::<Int32Type>::new();
- input_builder.append_value("1");
- input_builder.append_value("j");
- input_builder.append_null();
- input_builder.append_value("3");
- let input = input_builder.finish();
-
- let mut expected_builder = StringBuilder::new();
- expected_builder.append_value("31");
- expected_builder.append_value("6A");
- expected_builder.append_null();
- expected_builder.append_value("33");
- let expected = expected_builder.finish();
-
- let columnar_value = ColumnarValue::Array(Arc::new(input));
- let result = super::spark_hex(&[columnar_value]).unwrap();
-
- let result = match result {
- ColumnarValue::Array(array) => array,
- _ => panic!("Expected array"),
- };
-
- let result = as_string_array(&result);
-
- assert_eq!(result, &expected);
- }
-
- #[test]
- fn test_hex_int64() {
- let num = 1234;
- let hexed = super::hex_int64(num);
- assert_eq!(hexed, "4D2".to_string());
-
- let num = -1;
- let hexed = super::hex_int64(num);
- assert_eq!(hexed, "FFFFFFFFFFFFFFFF".to_string());
- }
-
- #[test]
- fn test_spark_hex_int64() {
- let int_array = Int64Array::from(vec![Some(1), Some(2), None, Some(3)]);
- let columnar_value = ColumnarValue::Array(Arc::new(int_array));
-
- let result = super::spark_hex(&[columnar_value]).unwrap();
- let result = match result {
- ColumnarValue::Array(array) => array,
- _ => panic!("Expected array"),
- };
-
- let string_array = as_string_array(&result);
- let expected_array = StringArray::from(vec![
- Some("1".to_string()),
- Some("2".to_string()),
- None,
- Some("3".to_string()),
- ]);
-
- assert_eq!(string_array, &expected_array);
- }
-}
diff --git a/native/spark-expr/src/math_funcs/mod.rs b/native/spark-expr/src/math_funcs/mod.rs
index 7df87eb9f..35c1dc650 100644
--- a/native/spark-expr/src/math_funcs/mod.rs
+++ b/native/spark-expr/src/math_funcs/mod.rs
@@ -20,7 +20,6 @@ mod ceil;
pub(crate) mod checked_arithmetic;
mod div;
mod floor;
-pub(crate) mod hex;
pub mod internal;
pub mod modulo_expr;
mod negative;
@@ -32,7 +31,6 @@ pub use ceil::spark_ceil;
pub use div::spark_decimal_div;
pub use div::spark_decimal_integral_div;
pub use floor::spark_floor;
-pub use hex::spark_hex;
pub use internal::*;
pub use modulo_expr::create_modulo_expr;
pub use negative::{create_negate_expr, NegativeExpr};
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]