Jefffrey commented on code in PR #20044:
URL: https://github.com/apache/datafusion/pull/20044#discussion_r2736557584
##########
datafusion/spark/src/function/math/hex.rs:
##########
@@ -101,63 +97,81 @@ impl ScalarUDFImpl for SparkHex {
})
}
- fn invoke_with_args(
- &self,
- args: ScalarFunctionArgs,
- ) -> datafusion_common::Result<ColumnarValue> {
- spark_hex(&args.args)
- }
-
- fn aliases(&self) -> &[String] {
- &self.aliases
+ fn invoke_with_args(&self, args: ScalarFunctionArgs) ->
Result<ColumnarValue> {
+ make_scalar_function(compute_hex,
vec![Hint::AcceptsSingular])(&args.args)
Review Comment:
Using `Hint::AcceptsSingular` makes our code simpler when handling scalars; we
could use `ColumnarValue` directly, but for now we can make use of
`make_scalar_function`.
##########
datafusion/spark/src/function/math/hex.rs:
##########
@@ -74,7 +71,6 @@ impl SparkHex {
Self {
signature: Signature::one_of(variants, Volatility::Immutable),
- aliases: vec![],
Review Comment:
No aliases, remove this
##########
datafusion/spark/src/function/math/hex.rs:
##########
@@ -101,63 +97,81 @@ impl ScalarUDFImpl for SparkHex {
})
}
- fn invoke_with_args(
- &self,
- args: ScalarFunctionArgs,
- ) -> datafusion_common::Result<ColumnarValue> {
- spark_hex(&args.args)
- }
-
- fn aliases(&self) -> &[String] {
- &self.aliases
+ fn invoke_with_args(&self, args: ScalarFunctionArgs) ->
Result<ColumnarValue> {
+ make_scalar_function(compute_hex,
vec![Hint::AcceptsSingular])(&args.args)
}
}
-/// Hex encoding lookup tables for fast byte-to-hex conversion
-const HEX_CHARS_LOWER: &[u8; 16] = b"0123456789abcdef";
-const HEX_CHARS_UPPER: &[u8; 16] = b"0123456789ABCDEF";
-
-#[inline]
-fn hex_int64(num: i64, buffer: &mut [u8; 16]) -> &[u8] {
- if num == 0 {
- return b"0";
+fn compute_hex(args: &[ArrayRef]) -> Result<ArrayRef> {
+ let [array] = take_function_args("hex", args)?;
+ downcast_dictionary_array! {
+ array => {
+ let values = hex_values(array.values())?;
+ Ok(Arc::new(array.with_values(values)))
+ },
+ _ => {
+ hex_values(array)
+ }
}
+}
- let mut n = num as u64;
- let mut i = 16;
- while n != 0 {
- i -= 1;
- buffer[i] = HEX_CHARS_UPPER[(n & 0xF) as usize];
- n >>= 4;
+fn hex_values(array: &ArrayRef) -> Result<ArrayRef> {
+ match array.data_type() {
+ DataType::Int64 => {
+ let array = as_int64_array(array)?;
+ hex_encode_int64(array.iter())
+ }
+ DataType::Utf8 => {
+ let array = as_string_array(array)?;
+ hex_encode_bytes(array.iter())
+ }
+ DataType::Utf8View => {
+ let array = as_string_view_array(array)?;
+ hex_encode_bytes(array.iter())
+ }
+ DataType::LargeUtf8 => {
+ let array = as_large_string_array(array)?;
+ hex_encode_bytes(array.iter())
+ }
+ DataType::Binary => {
+ let array = as_binary_array(array)?;
+ hex_encode_bytes(array.iter())
+ }
+ DataType::LargeBinary => {
+ let array = as_large_binary_array(array)?;
+ hex_encode_bytes(array.iter())
+ }
+ DataType::BinaryView => {
+ let array = as_binary_view_array(array)?;
+ hex_encode_bytes(array.iter())
+ }
+ DataType::FixedSizeBinary(_) => {
+ let array = as_fixed_size_binary_array(array)?;
+ hex_encode_bytes(array.iter())
+ }
+ dt => internal_err!("Unexpected data type for hex: {dt}"),
}
- &buffer[i..]
}
+/// Hex encoding lookup tables for fast byte-to-hex conversion
+const HEX_CHARS_UPPER: &[u8; 16] = b"0123456789ABCDEF";
+
/// Generic hex encoding for byte array types
-fn hex_encode_bytes<'a, I, T>(
- iter: I,
- lowercase: bool,
- len: usize,
-) -> Result<ArrayRef, DataFusionError>
+fn hex_encode_bytes<'a, I, T>(iter: I) -> Result<ArrayRef>
where
- I: Iterator<Item = Option<T>>,
+ I: ExactSizeIterator<Item = Option<T>>,
Review Comment:
Using `ExactSizeIterator` lets us get the length from the iterator, instead
of passing it separately
##########
datafusion/spark/src/function/math/hex.rs:
##########
@@ -101,63 +97,81 @@ impl ScalarUDFImpl for SparkHex {
})
}
- fn invoke_with_args(
- &self,
- args: ScalarFunctionArgs,
- ) -> datafusion_common::Result<ColumnarValue> {
- spark_hex(&args.args)
- }
-
- fn aliases(&self) -> &[String] {
- &self.aliases
+ fn invoke_with_args(&self, args: ScalarFunctionArgs) ->
Result<ColumnarValue> {
+ make_scalar_function(compute_hex,
vec![Hint::AcceptsSingular])(&args.args)
}
}
-/// Hex encoding lookup tables for fast byte-to-hex conversion
-const HEX_CHARS_LOWER: &[u8; 16] = b"0123456789abcdef";
-const HEX_CHARS_UPPER: &[u8; 16] = b"0123456789ABCDEF";
-
-#[inline]
-fn hex_int64(num: i64, buffer: &mut [u8; 16]) -> &[u8] {
- if num == 0 {
- return b"0";
+fn compute_hex(args: &[ArrayRef]) -> Result<ArrayRef> {
+ let [array] = take_function_args("hex", args)?;
+ downcast_dictionary_array! {
+ array => {
+ let values = hex_values(array.values())?;
+ Ok(Arc::new(array.with_values(values)))
+ },
+ _ => {
+ hex_values(array)
+ }
}
+}
- let mut n = num as u64;
- let mut i = 16;
- while n != 0 {
- i -= 1;
- buffer[i] = HEX_CHARS_UPPER[(n & 0xF) as usize];
- n >>= 4;
+fn hex_values(array: &ArrayRef) -> Result<ArrayRef> {
Review Comment:
Deduplicating this match arm (previously the dictionary branch duplicated the
same logic).
##########
datafusion/sqllogictest/test_files/spark/math/hex.slt:
##########
@@ -83,3 +83,25 @@ query T
SELECT arrow_typeof(hex(dict_col)) FROM t_dict_binary LIMIT 1;
----
Dictionary(Int32, Utf8)
+
+query TT
+WITH values AS (
+ SELECT arrow_cast(column1, 'Dictionary(Int32, Utf8)') AS column1
+ FROM VALUES ('hi'), ('bye'), (NULL), ('rust')
+) SELECT hex(column1), arrow_typeof(hex(column1)) FROM values
+----
+6869 Utf8
+627965 Utf8
+NULL Utf8
+72757374 Utf8
+
+query TT
+WITH values AS (
+ SELECT arrow_cast(column1, 'Dictionary(Int32, Int64)') AS column1
+ FROM VALUES (1), (2), (NULL), (3)
+) SELECT hex(column1), arrow_typeof(hex(column1)) FROM values
+----
+1 Utf8
Review Comment:
Ideally these would stay dictionary, but there's a known issue with passing
dictionary types via the coercible API:
- https://github.com/apache/datafusion/issues/19458
##########
datafusion/spark/src/function/math/hex.rs:
##########
@@ -194,297 +223,40 @@ fn hex_encode_int64(
Ok(Arc::new(builder.finish()))
}
-/// Spark-compatible `hex` function
-pub fn spark_hex(args: &[ColumnarValue]) -> Result<ColumnarValue,
DataFusionError> {
- compute_hex(args, false)
-}
-
-/// Spark-compatible `sha2` function
-pub fn spark_sha2_hex(args: &[ColumnarValue]) -> Result<ColumnarValue,
DataFusionError> {
- compute_hex(args, true)
-}
-
-pub fn compute_hex(
- args: &[ColumnarValue],
- lowercase: bool,
-) -> Result<ColumnarValue, DataFusionError> {
- let input = match take_function_args("hex", args)? {
- [ColumnarValue::Scalar(value)] =>
ColumnarValue::Array(value.to_array()?),
- [ColumnarValue::Array(arr)] => ColumnarValue::Array(Arc::clone(arr)),
- };
-
- match &input {
- ColumnarValue::Array(array) => match array.data_type() {
- DataType::Int64 => {
- let array = as_int64_array(array)?;
- Ok(ColumnarValue::Array(hex_encode_int64(
- array.iter(),
- array.len(),
- )?))
- }
- DataType::Utf8 => {
- let array = as_string_array(array);
- Ok(ColumnarValue::Array(hex_encode_bytes(
- array.iter(),
- lowercase,
- array.len(),
- )?))
- }
- DataType::Utf8View => {
- let array = as_string_view_array(array)?;
- Ok(ColumnarValue::Array(hex_encode_bytes(
- array.iter(),
- lowercase,
- array.len(),
- )?))
- }
- DataType::LargeUtf8 => {
- let array = as_largestring_array(array);
- Ok(ColumnarValue::Array(hex_encode_bytes(
- array.iter(),
- lowercase,
- array.len(),
- )?))
- }
- DataType::Binary => {
- let array = as_binary_array(array)?;
- Ok(ColumnarValue::Array(hex_encode_bytes(
- array.iter(),
- lowercase,
- array.len(),
- )?))
- }
- DataType::LargeBinary => {
- let array = as_large_binary_array(array)?;
- Ok(ColumnarValue::Array(hex_encode_bytes(
- array.iter(),
- lowercase,
- array.len(),
- )?))
- }
- DataType::FixedSizeBinary(_) => {
- let array = as_fixed_size_binary_array(array)?;
- Ok(ColumnarValue::Array(hex_encode_bytes(
- array.iter(),
- lowercase,
- array.len(),
- )?))
- }
- DataType::Dictionary(key_type, _) => {
- if **key_type != DataType::Int32 {
- return exec_err!(
- "hex only supports Int32 dictionary keys, get: {}",
- key_type
- );
- }
-
- let dict = as_dictionary_array::<Int32Type>(&array);
- let dict_values = dict.values();
-
- let encoded_values = match dict_values.data_type() {
- DataType::Int64 => {
- let arr = as_int64_array(dict_values)?;
- hex_encode_int64(arr.iter(), arr.len())?
- }
- DataType::Utf8 => {
- let arr = as_string_array(dict_values);
- hex_encode_bytes(arr.iter(), lowercase, arr.len())?
- }
- DataType::LargeUtf8 => {
- let arr = as_largestring_array(dict_values);
- hex_encode_bytes(arr.iter(), lowercase, arr.len())?
- }
- DataType::Utf8View => {
- let arr = as_string_view_array(dict_values)?;
- hex_encode_bytes(arr.iter(), lowercase, arr.len())?
- }
- DataType::Binary => {
- let arr = as_binary_array(dict_values)?;
- hex_encode_bytes(arr.iter(), lowercase, arr.len())?
- }
- DataType::LargeBinary => {
- let arr = as_large_binary_array(dict_values)?;
- hex_encode_bytes(arr.iter(), lowercase, arr.len())?
- }
- DataType::FixedSizeBinary(_) => {
- let arr = as_fixed_size_binary_array(dict_values)?;
- hex_encode_bytes(arr.iter(), lowercase, arr.len())?
- }
- _ => {
- return exec_err!(
- "hex got an unexpected argument type: {}",
- dict_values.data_type()
- );
- }
- };
-
- let new_dict = dict.with_values(encoded_values);
- Ok(ColumnarValue::Array(Arc::new(new_dict)))
- }
- _ => exec_err!("hex got an unexpected argument type: {}",
array.data_type()),
- },
- _ => exec_err!("native hex does not support scalar values at this
time"),
- }
-}
-
#[cfg(test)]
mod test {
use std::str::from_utf8_unchecked;
use std::sync::Arc;
use arrow::array::{DictionaryArray, Int32Array, Int64Array, StringArray};
- use arrow::{
- array::{
- BinaryDictionaryBuilder, PrimitiveDictionaryBuilder,
StringDictionaryBuilder,
- as_string_array,
- },
- datatypes::{Int32Type, Int64Type},
- };
- use datafusion_common::cast::as_dictionary_array;
- use datafusion_expr::ColumnarValue;
-
- #[test]
- fn test_dictionary_hex_utf8() {
Review Comment:
Moving tests to SLTs
##########
datafusion/spark/src/function/math/hex.rs:
##########
@@ -101,63 +97,81 @@ impl ScalarUDFImpl for SparkHex {
})
}
- fn invoke_with_args(
- &self,
- args: ScalarFunctionArgs,
- ) -> datafusion_common::Result<ColumnarValue> {
- spark_hex(&args.args)
- }
-
- fn aliases(&self) -> &[String] {
- &self.aliases
+ fn invoke_with_args(&self, args: ScalarFunctionArgs) ->
Result<ColumnarValue> {
+ make_scalar_function(compute_hex,
vec![Hint::AcceptsSingular])(&args.args)
}
}
-/// Hex encoding lookup tables for fast byte-to-hex conversion
-const HEX_CHARS_LOWER: &[u8; 16] = b"0123456789abcdef";
-const HEX_CHARS_UPPER: &[u8; 16] = b"0123456789ABCDEF";
-
-#[inline]
-fn hex_int64(num: i64, buffer: &mut [u8; 16]) -> &[u8] {
- if num == 0 {
- return b"0";
+fn compute_hex(args: &[ArrayRef]) -> Result<ArrayRef> {
+ let [array] = take_function_args("hex", args)?;
+ downcast_dictionary_array! {
+ array => {
+ let values = hex_values(array.values())?;
+ Ok(Arc::new(array.with_values(values)))
+ },
+ _ => {
+ hex_values(array)
+ }
}
+}
- let mut n = num as u64;
- let mut i = 16;
- while n != 0 {
- i -= 1;
- buffer[i] = HEX_CHARS_UPPER[(n & 0xF) as usize];
- n >>= 4;
+fn hex_values(array: &ArrayRef) -> Result<ArrayRef> {
+ match array.data_type() {
+ DataType::Int64 => {
+ let array = as_int64_array(array)?;
+ hex_encode_int64(array.iter())
+ }
+ DataType::Utf8 => {
+ let array = as_string_array(array)?;
+ hex_encode_bytes(array.iter())
+ }
+ DataType::Utf8View => {
+ let array = as_string_view_array(array)?;
+ hex_encode_bytes(array.iter())
+ }
+ DataType::LargeUtf8 => {
+ let array = as_large_string_array(array)?;
+ hex_encode_bytes(array.iter())
+ }
+ DataType::Binary => {
+ let array = as_binary_array(array)?;
+ hex_encode_bytes(array.iter())
+ }
+ DataType::LargeBinary => {
+ let array = as_large_binary_array(array)?;
+ hex_encode_bytes(array.iter())
+ }
+ DataType::BinaryView => {
+ let array = as_binary_view_array(array)?;
+ hex_encode_bytes(array.iter())
+ }
+ DataType::FixedSizeBinary(_) => {
+ let array = as_fixed_size_binary_array(array)?;
+ hex_encode_bytes(array.iter())
+ }
+ dt => internal_err!("Unexpected data type for hex: {dt}"),
}
- &buffer[i..]
}
+/// Hex encoding lookup tables for fast byte-to-hex conversion
+const HEX_CHARS_UPPER: &[u8; 16] = b"0123456789ABCDEF";
Review Comment:
The lowercase variant was only meant for sha2; remove that code here, as sha2
wasn't using it from this module.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]