houqp commented on a change in pull request #1090:
URL: https://github.com/apache/arrow-datafusion/pull/1090#discussion_r725518578
##########
File path: datafusion/src/physical_plan/crypto_expressions.rs
##########
@@ -16,183 +16,283 @@
// under the License.
//! Crypto expressions
-use std::sync::Arc;
-
-use md5::Md5;
-use sha2::{
- digest::Output as SHA2DigestOutput, Digest as SHA2Digest, Sha224, Sha256,
Sha384,
- Sha512,
-};
-
+use super::ColumnarValue;
use crate::{
error::{DataFusionError, Result},
scalar::ScalarValue,
};
use arrow::{
- array::{Array, BinaryArray, GenericStringArray, StringOffsetSizeTrait},
+ array::{
+ Array, ArrayRef, BinaryArray, GenericStringArray, StringArray,
+ StringOffsetSizeTrait,
+ },
datatypes::DataType,
};
+use md5::Md5;
+use sha2::{Digest as SHA2Digest, Sha224, Sha256, Sha384, Sha512};
+use std::any::type_name;
+use std::sync::Arc;
+use std::{fmt, str::FromStr};
-use super::{string_expressions::unary_string_function, ColumnarValue};
-
-/// Computes the md5 of a string.
-fn md5_process(input: &str) -> String {
- let mut digest = Md5::default();
- digest.update(&input);
-
- let mut result = String::new();
-
- for byte in &digest.finalize() {
- result.push_str(&format!("{:02x}", byte));
- }
-
- result
+/// digest algorithm
+#[derive(Debug, Copy, Clone)]
+enum DigestAlgorithm {
+ Md5,
+ Sha224,
+ Sha256,
+ Sha384,
+ Sha512,
}
-// It's not possible to return &[u8], because trait in trait without short
lifetime
-fn sha_process<D: SHA2Digest + Default>(input: &str) -> SHA2DigestOutput<D> {
- let mut digest = D::default();
- digest.update(&input);
-
- digest.finalize()
+macro_rules! downcast_string_arg {
+ ($ARG:expr, $NAME:expr, $T:ident) => {{
+ $ARG.as_any()
+ .downcast_ref::<GenericStringArray<T>>()
+ .ok_or_else(|| {
+ DataFusionError::Internal(format!(
+ "could not cast {} to {}",
+ $NAME,
+ type_name::<GenericStringArray<T>>()
+ ))
+ })?
+ }};
}
-/// # Errors
-/// This function errors when:
-/// * the number of arguments is not 1
-/// * the first argument is not castable to a `GenericStringArray`
-fn unary_binary_function<T, R, F>(
- args: &[&dyn Array],
- op: F,
- name: &str,
-) -> Result<BinaryArray>
-where
- R: AsRef<[u8]>,
- T: StringOffsetSizeTrait,
- F: Fn(&str) -> R,
-{
- if args.len() != 1 {
- return Err(DataFusionError::Internal(format!(
- "{:?} args were supplied but {} takes exactly one argument",
- args.len(),
- name,
- )));
- }
-
- let array = args[0]
- .as_any()
- .downcast_ref::<GenericStringArray<T>>()
- .ok_or_else(|| {
- DataFusionError::Internal("failed to downcast to
string".to_string())
- })?;
-
- // first map is the iterator, second is for the `Option<_>`
- Ok(array.iter().map(|x| x.map(|x| op(x))).collect())
-}
-
-fn handle<F, R>(args: &[ColumnarValue], op: F, name: &str) ->
Result<ColumnarValue>
-where
- R: AsRef<[u8]>,
- F: Fn(&str) -> R,
-{
- match &args[0] {
+fn digest_process(
+ value: &ColumnarValue,
+ digest_algorithm: DigestAlgorithm,
+ return_utf8: bool,
Review comment:
Given that we only have md5 as the special case for returning value as
hex, I think the code would be simpler if we isolate the return hex logic into
the old `pub fn md5(args: &[ColumnarValue]) -> Result<ColumnarValue> `
function. Then we can avoid this `return_utf8` flag passing and checking for
all other functions. Efficiency wise, it'a also better to have the return type
specified using the type system statically instead of switching behaviors at
runtime.
##########
File path: datafusion/src/physical_plan/crypto_expressions.rs
##########
@@ -16,183 +16,283 @@
// under the License.
//! Crypto expressions
-use std::sync::Arc;
-
-use md5::Md5;
-use sha2::{
- digest::Output as SHA2DigestOutput, Digest as SHA2Digest, Sha224, Sha256,
Sha384,
- Sha512,
-};
-
+use super::ColumnarValue;
use crate::{
error::{DataFusionError, Result},
scalar::ScalarValue,
};
use arrow::{
- array::{Array, BinaryArray, GenericStringArray, StringOffsetSizeTrait},
+ array::{
+ Array, ArrayRef, BinaryArray, GenericStringArray, StringArray,
+ StringOffsetSizeTrait,
+ },
datatypes::DataType,
};
+use md5::Md5;
+use sha2::{Digest as SHA2Digest, Sha224, Sha256, Sha384, Sha512};
+use std::any::type_name;
+use std::sync::Arc;
+use std::{fmt, str::FromStr};
-use super::{string_expressions::unary_string_function, ColumnarValue};
-
-/// Computes the md5 of a string.
-fn md5_process(input: &str) -> String {
- let mut digest = Md5::default();
- digest.update(&input);
-
- let mut result = String::new();
-
- for byte in &digest.finalize() {
- result.push_str(&format!("{:02x}", byte));
- }
-
- result
+/// digest algorithm
+#[derive(Debug, Copy, Clone)]
+enum DigestAlgorithm {
+ Md5,
+ Sha224,
+ Sha256,
+ Sha384,
+ Sha512,
}
-// It's not possible to return &[u8], because trait in trait without short
lifetime
-fn sha_process<D: SHA2Digest + Default>(input: &str) -> SHA2DigestOutput<D> {
- let mut digest = D::default();
- digest.update(&input);
-
- digest.finalize()
+macro_rules! downcast_string_arg {
Review comment:
looks like this macro is only used once in `digest_array`, if so, it
would be better to manually expand it in `digest_array` or turn it into an
inline function. macros are usually harder to maintain and debug.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]