Jefffrey commented on code in PR #20385:
URL: https://github.com/apache/datafusion/pull/20385#discussion_r2832621121


##########
datafusion/functions-nested/src/array_has.rs:
##########
@@ -38,6 +38,7 @@ use crate::make_array::make_array_udf;
 use crate::utils::make_scalar_function;
 
 use std::any::Any;
+use std::collections::HashSet;

Review Comment:
   Do we get performance gains if we use `hashbrown` instead?
   
   - Related issue: https://github.com/apache/datafusion/issues/19869



##########
datafusion/functions-nested/src/array_has.rs:
##########
@@ -612,7 +792,19 @@ impl ScalarUDFImpl for ArrayHasAny {
         &self,
         args: datafusion_expr::ScalarFunctionArgs,
     ) -> Result<ColumnarValue> {
-        make_scalar_function(array_has_any_inner)(&args.args)
+        let [first_arg, second_arg] = take_function_args(self.name(), 
&args.args)?;
+
+        // array_has_any is symmetric: if either argument is scalar, build a
+        // HashSet from it and probe with the rows of the other argument.
+        match (&first_arg, &second_arg) {
+            (_, ColumnarValue::Scalar(scalar)) => {
+                array_has_any_with_scalar(first_arg, scalar)
+            }
+            (ColumnarValue::Scalar(scalar), _) => {
+                array_has_any_with_scalar(second_arg, scalar)
+            }

Review Comment:
   ```suggestion
               (cv, ColumnarValue::Scalar(scalar)) | 
(ColumnarValue::Scalar(scalar), cv) => {
                   array_has_any_with_scalar(cv, scalar)
               }
   ```



##########
datafusion/functions-nested/src/array_has.rs:
##########
@@ -476,6 +483,179 @@ fn array_has_any_inner(args: &[ArrayRef]) -> 
Result<ArrayRef> {
     array_has_all_and_any_inner(args, ComparisonType::Any)
 }
 
+/// Fast path for `array_has_any` when exactly one argument is a scalar.
+fn array_has_any_with_scalar(
+    columnar_arg: &ColumnarValue,
+    scalar_arg: &ScalarValue,
+) -> Result<ColumnarValue> {
+    if scalar_arg.is_null() {
+        return Ok(ColumnarValue::Scalar(ScalarValue::Boolean(None)));
+    }
+
+    // Convert the scalar to a 1-element ListArray, then extract the inner 
values
+    let scalar_array = scalar_arg.to_array_of_size(1)?;
+    let scalar_list: ArrayWrapper = scalar_array.as_ref().try_into()?;
+    let scalar_values = scalar_list.values();
+
+    // If scalar list is empty, result is always false
+    if scalar_values.is_empty() {
+        return match columnar_arg {
+            ColumnarValue::Array(arr) => {
+                let result = 
BooleanArray::from(BooleanBuffer::new_unset(arr.len()));
+                Ok(ColumnarValue::Array(Arc::new(result)))
+            }
+            ColumnarValue::Scalar(_) => {
+                Ok(ColumnarValue::Scalar(ScalarValue::Boolean(Some(false))))
+            }
+        };
+    }
+
+    match scalar_values.data_type() {
+        DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => {
+            array_has_any_with_scalar_string(columnar_arg, scalar_values)
+        }
+        _ => array_has_any_with_scalar_general(columnar_arg, scalar_values),
+    }
+}
+
+/// When the scalar argument has more elements than this, the scalar fast path
+/// builds a HashSet for O(1) lookups. At or below this threshold, it falls
+/// back to a linear scan, since hashing every columnar element is more
+/// expensive than a linear scan over a short array.
+const SCALAR_SMALL_THRESHOLD: usize = 8;
+
+/// String-specialized scalar fast path for `array_has_any`.
+fn array_has_any_with_scalar_string(
+    columnar_arg: &ColumnarValue,
+    scalar_values: &ArrayRef,
+) -> Result<ColumnarValue> {
+    let scalar_strings = string_array_to_vec(scalar_values.as_ref());
+    let has_null_scalar = scalar_strings.iter().any(|s| s.is_none());

Review Comment:
   ```suggestion
       let has_null_scalar = scalar_values.null_count() > 0;
   ```



##########
datafusion/functions-nested/src/array_has.rs:
##########
@@ -476,6 +483,179 @@ fn array_has_any_inner(args: &[ArrayRef]) -> 
Result<ArrayRef> {
     array_has_all_and_any_inner(args, ComparisonType::Any)
 }
 
+/// Fast path for `array_has_any` when exactly one argument is a scalar.
+fn array_has_any_with_scalar(
+    columnar_arg: &ColumnarValue,
+    scalar_arg: &ScalarValue,
+) -> Result<ColumnarValue> {
+    if scalar_arg.is_null() {
+        return Ok(ColumnarValue::Scalar(ScalarValue::Boolean(None)));
+    }
+
+    // Convert the scalar to a 1-element ListArray, then extract the inner 
values
+    let scalar_array = scalar_arg.to_array_of_size(1)?;
+    let scalar_list: ArrayWrapper = scalar_array.as_ref().try_into()?;
+    let scalar_values = scalar_list.values();
+
+    // If scalar list is empty, result is always false
+    if scalar_values.is_empty() {
+        return match columnar_arg {
+            ColumnarValue::Array(arr) => {
+                let result = 
BooleanArray::from(BooleanBuffer::new_unset(arr.len()));
+                Ok(ColumnarValue::Array(Arc::new(result)))
+            }
+            ColumnarValue::Scalar(_) => {
+                Ok(ColumnarValue::Scalar(ScalarValue::Boolean(Some(false))))
+            }
+        };
+    }
+
+    match scalar_values.data_type() {
+        DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => {
+            array_has_any_with_scalar_string(columnar_arg, scalar_values)
+        }
+        _ => array_has_any_with_scalar_general(columnar_arg, scalar_values),
+    }
+}
+
+/// When the scalar argument has more elements than this, the scalar fast path
+/// builds a HashSet for O(1) lookups. At or below this threshold, it falls
+/// back to a linear scan, since hashing every columnar element is more
+/// expensive than a linear scan over a short array.
+const SCALAR_SMALL_THRESHOLD: usize = 8;
+
+/// String-specialized scalar fast path for `array_has_any`.
+fn array_has_any_with_scalar_string(
+    columnar_arg: &ColumnarValue,
+    scalar_values: &ArrayRef,
+) -> Result<ColumnarValue> {
+    let scalar_strings = string_array_to_vec(scalar_values.as_ref());

Review Comment:
   I wonder if we should defer creating this vec, since if we take the hashing 
path we effectively allocate a vec to allocate a hashset, which seems redundant



##########
datafusion/functions-nested/src/array_has.rs:
##########
@@ -476,6 +483,179 @@ fn array_has_any_inner(args: &[ArrayRef]) -> 
Result<ArrayRef> {
     array_has_all_and_any_inner(args, ComparisonType::Any)
 }
 
+/// Fast path for `array_has_any` when exactly one argument is a scalar.
+fn array_has_any_with_scalar(
+    columnar_arg: &ColumnarValue,
+    scalar_arg: &ScalarValue,
+) -> Result<ColumnarValue> {
+    if scalar_arg.is_null() {
+        return Ok(ColumnarValue::Scalar(ScalarValue::Boolean(None)));
+    }
+
+    // Convert the scalar to a 1-element ListArray, then extract the inner 
values
+    let scalar_array = scalar_arg.to_array_of_size(1)?;
+    let scalar_list: ArrayWrapper = scalar_array.as_ref().try_into()?;
+    let scalar_values = scalar_list.values();

Review Comment:
   Is there an edge case here where sliced arrays aren't properly considered?



##########
datafusion/functions-nested/src/array_has.rs:
##########
@@ -476,6 +483,179 @@ fn array_has_any_inner(args: &[ArrayRef]) -> 
Result<ArrayRef> {
     array_has_all_and_any_inner(args, ComparisonType::Any)
 }
 
+/// Fast path for `array_has_any` when exactly one argument is a scalar.
+fn array_has_any_with_scalar(
+    columnar_arg: &ColumnarValue,
+    scalar_arg: &ScalarValue,
+) -> Result<ColumnarValue> {
+    if scalar_arg.is_null() {
+        return Ok(ColumnarValue::Scalar(ScalarValue::Boolean(None)));
+    }
+
+    // Convert the scalar to a 1-element ListArray, then extract the inner 
values
+    let scalar_array = scalar_arg.to_array_of_size(1)?;
+    let scalar_list: ArrayWrapper = scalar_array.as_ref().try_into()?;
+    let scalar_values = scalar_list.values();
+
+    // If scalar list is empty, result is always false
+    if scalar_values.is_empty() {
+        return match columnar_arg {
+            ColumnarValue::Array(arr) => {
+                let result = 
BooleanArray::from(BooleanBuffer::new_unset(arr.len()));

Review Comment:
   We can return scalar regardless here



##########
datafusion/functions-nested/src/array_has.rs:
##########
@@ -476,6 +483,179 @@ fn array_has_any_inner(args: &[ArrayRef]) -> 
Result<ArrayRef> {
     array_has_all_and_any_inner(args, ComparisonType::Any)
 }
 
+/// Fast path for `array_has_any` when exactly one argument is a scalar.
+fn array_has_any_with_scalar(
+    columnar_arg: &ColumnarValue,
+    scalar_arg: &ScalarValue,
+) -> Result<ColumnarValue> {
+    if scalar_arg.is_null() {
+        return Ok(ColumnarValue::Scalar(ScalarValue::Boolean(None)));
+    }
+
+    // Convert the scalar to a 1-element ListArray, then extract the inner 
values
+    let scalar_array = scalar_arg.to_array_of_size(1)?;
+    let scalar_list: ArrayWrapper = scalar_array.as_ref().try_into()?;
+    let scalar_values = scalar_list.values();
+
+    // If scalar list is empty, result is always false
+    if scalar_values.is_empty() {
+        return match columnar_arg {
+            ColumnarValue::Array(arr) => {
+                let result = 
BooleanArray::from(BooleanBuffer::new_unset(arr.len()));
+                Ok(ColumnarValue::Array(Arc::new(result)))
+            }
+            ColumnarValue::Scalar(_) => {
+                Ok(ColumnarValue::Scalar(ScalarValue::Boolean(Some(false))))
+            }
+        };
+    }
+
+    match scalar_values.data_type() {
+        DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => {
+            array_has_any_with_scalar_string(columnar_arg, scalar_values)
+        }
+        _ => array_has_any_with_scalar_general(columnar_arg, scalar_values),
+    }
+}
+
+/// When the scalar argument has more elements than this, the scalar fast path
+/// builds a HashSet for O(1) lookups. At or below this threshold, it falls
+/// back to a linear scan, since hashing every columnar element is more
+/// expensive than a linear scan over a short array.
+const SCALAR_SMALL_THRESHOLD: usize = 8;
+
+/// String-specialized scalar fast path for `array_has_any`.
+fn array_has_any_with_scalar_string(
+    columnar_arg: &ColumnarValue,
+    scalar_values: &ArrayRef,
+) -> Result<ColumnarValue> {
+    let scalar_strings = string_array_to_vec(scalar_values.as_ref());
+    let has_null_scalar = scalar_strings.iter().any(|s| s.is_none());
+
+    let (col_arr, is_scalar_output) = match columnar_arg {
+        ColumnarValue::Array(arr) => (Arc::clone(arr), false),
+        ColumnarValue::Scalar(s) => (s.to_array_of_size(1)?, true),
+    };
+
+    let col_list: ArrayWrapper = col_arr.as_ref().try_into()?;
+    let all_col_strings = string_array_to_vec(col_list.values().as_ref());

Review Comment:
   I feel we could avoid this allocation of `all_col_strings`, perhaps making 
use of something like 
[`StringArrayType`](https://docs.rs/arrow/latest/arrow/array/trait.StringArrayType.html)
 to make it generic across the string types



##########
datafusion/functions-nested/src/array_has.rs:
##########
@@ -476,6 +483,179 @@ fn array_has_any_inner(args: &[ArrayRef]) -> 
Result<ArrayRef> {
     array_has_all_and_any_inner(args, ComparisonType::Any)
 }
 
+/// Fast path for `array_has_any` when exactly one argument is a scalar.
+fn array_has_any_with_scalar(
+    columnar_arg: &ColumnarValue,
+    scalar_arg: &ScalarValue,
+) -> Result<ColumnarValue> {
+    if scalar_arg.is_null() {
+        return Ok(ColumnarValue::Scalar(ScalarValue::Boolean(None)));
+    }
+
+    // Convert the scalar to a 1-element ListArray, then extract the inner 
values
+    let scalar_array = scalar_arg.to_array_of_size(1)?;
+    let scalar_list: ArrayWrapper = scalar_array.as_ref().try_into()?;
+    let scalar_values = scalar_list.values();
+
+    // If scalar list is empty, result is always false
+    if scalar_values.is_empty() {
+        return match columnar_arg {
+            ColumnarValue::Array(arr) => {
+                let result = 
BooleanArray::from(BooleanBuffer::new_unset(arr.len()));
+                Ok(ColumnarValue::Array(Arc::new(result)))
+            }
+            ColumnarValue::Scalar(_) => {
+                Ok(ColumnarValue::Scalar(ScalarValue::Boolean(Some(false))))
+            }
+        };
+    }
+
+    match scalar_values.data_type() {
+        DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => {
+            array_has_any_with_scalar_string(columnar_arg, scalar_values)
+        }
+        _ => array_has_any_with_scalar_general(columnar_arg, scalar_values),
+    }
+}
+
+/// When the scalar argument has more elements than this, the scalar fast path
+/// builds a HashSet for O(1) lookups. At or below this threshold, it falls
+/// back to a linear scan, since hashing every columnar element is more
+/// expensive than a linear scan over a short array.
+const SCALAR_SMALL_THRESHOLD: usize = 8;
+
+/// String-specialized scalar fast path for `array_has_any`.
+fn array_has_any_with_scalar_string(
+    columnar_arg: &ColumnarValue,
+    scalar_values: &ArrayRef,
+) -> Result<ColumnarValue> {
+    let scalar_strings = string_array_to_vec(scalar_values.as_ref());
+    let has_null_scalar = scalar_strings.iter().any(|s| s.is_none());
+
+    let (col_arr, is_scalar_output) = match columnar_arg {
+        ColumnarValue::Array(arr) => (Arc::clone(arr), false),
+        ColumnarValue::Scalar(s) => (s.to_array_of_size(1)?, true),
+    };
+
+    let col_list: ArrayWrapper = col_arr.as_ref().try_into()?;
+    let all_col_strings = string_array_to_vec(col_list.values().as_ref());
+    let col_offsets: Vec<usize> = col_list.offsets().collect();

Review Comment:
   We could probably avoid this collect of offsets if we take advantage of 
peeking the iter



##########
datafusion/functions-nested/src/array_has.rs:
##########
@@ -476,6 +483,179 @@ fn array_has_any_inner(args: &[ArrayRef]) -> 
Result<ArrayRef> {
     array_has_all_and_any_inner(args, ComparisonType::Any)
 }
 
+/// Fast path for `array_has_any` when exactly one argument is a scalar.
+fn array_has_any_with_scalar(
+    columnar_arg: &ColumnarValue,
+    scalar_arg: &ScalarValue,
+) -> Result<ColumnarValue> {
+    if scalar_arg.is_null() {
+        return Ok(ColumnarValue::Scalar(ScalarValue::Boolean(None)));
+    }
+
+    // Convert the scalar to a 1-element ListArray, then extract the inner 
values
+    let scalar_array = scalar_arg.to_array_of_size(1)?;
+    let scalar_list: ArrayWrapper = scalar_array.as_ref().try_into()?;
+    let scalar_values = scalar_list.values();
+
+    // If scalar list is empty, result is always false
+    if scalar_values.is_empty() {
+        return match columnar_arg {
+            ColumnarValue::Array(arr) => {
+                let result = 
BooleanArray::from(BooleanBuffer::new_unset(arr.len()));
+                Ok(ColumnarValue::Array(Arc::new(result)))
+            }
+            ColumnarValue::Scalar(_) => {
+                Ok(ColumnarValue::Scalar(ScalarValue::Boolean(Some(false))))
+            }
+        };
+    }
+
+    match scalar_values.data_type() {
+        DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => {
+            array_has_any_with_scalar_string(columnar_arg, scalar_values)
+        }
+        _ => array_has_any_with_scalar_general(columnar_arg, scalar_values),
+    }
+}
+
+/// When the scalar argument has more elements than this, the scalar fast path
+/// builds a HashSet for O(1) lookups. At or below this threshold, it falls
+/// back to a linear scan, since hashing every columnar element is more
+/// expensive than a linear scan over a short array.
+const SCALAR_SMALL_THRESHOLD: usize = 8;
+
+/// String-specialized scalar fast path for `array_has_any`.
+fn array_has_any_with_scalar_string(
+    columnar_arg: &ColumnarValue,
+    scalar_values: &ArrayRef,
+) -> Result<ColumnarValue> {
+    let scalar_strings = string_array_to_vec(scalar_values.as_ref());
+    let has_null_scalar = scalar_strings.iter().any(|s| s.is_none());
+
+    let (col_arr, is_scalar_output) = match columnar_arg {
+        ColumnarValue::Array(arr) => (Arc::clone(arr), false),
+        ColumnarValue::Scalar(s) => (s.to_array_of_size(1)?, true),
+    };
+
+    let col_list: ArrayWrapper = col_arr.as_ref().try_into()?;
+    let all_col_strings = string_array_to_vec(col_list.values().as_ref());
+    let col_offsets: Vec<usize> = col_list.offsets().collect();
+    let col_nulls = col_list.nulls();
+
+    let mut builder = BooleanArray::builder(col_list.len());
+
+    if scalar_strings.len() > SCALAR_SMALL_THRESHOLD {
+        // Large scalar: build HashSet for O(1) lookups
+        let scalar_set: HashSet<&str> =
+            scalar_strings.iter().copied().flatten().collect();
+
+        for i in 0..col_list.len() {
+            if col_nulls.is_some_and(|v| !v.is_valid(i)) {

Review Comment:
   ```suggestion
               if col_nulls.is_some_and(|v| v.is_null(i)) {
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to