neilconway commented on code in PR #20385:
URL: https://github.com/apache/datafusion/pull/20385#discussion_r2833570925
##########
datafusion/functions-nested/src/array_has.rs:
##########
@@ -476,6 +483,179 @@ fn array_has_any_inner(args: &[ArrayRef]) ->
Result<ArrayRef> {
array_has_all_and_any_inner(args, ComparisonType::Any)
}
+/// Fast path for `array_has_any` when exactly one argument is a scalar.
+fn array_has_any_with_scalar(
+ columnar_arg: &ColumnarValue,
+ scalar_arg: &ScalarValue,
+) -> Result<ColumnarValue> {
+ if scalar_arg.is_null() {
+ return Ok(ColumnarValue::Scalar(ScalarValue::Boolean(None)));
+ }
+
+ // Convert the scalar to a 1-element ListArray, then extract the inner
values
+ let scalar_array = scalar_arg.to_array_of_size(1)?;
+ let scalar_list: ArrayWrapper = scalar_array.as_ref().try_into()?;
+ let scalar_values = scalar_list.values();
+
+ // If scalar list is empty, result is always false
+ if scalar_values.is_empty() {
+ return match columnar_arg {
+ ColumnarValue::Array(arr) => {
+ let result =
BooleanArray::from(BooleanBuffer::new_unset(arr.len()));
+ Ok(ColumnarValue::Array(Arc::new(result)))
+ }
+ ColumnarValue::Scalar(_) => {
+ Ok(ColumnarValue::Scalar(ScalarValue::Boolean(Some(false))))
+ }
+ };
+ }
+
+ match scalar_values.data_type() {
+ DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => {
+ array_has_any_with_scalar_string(columnar_arg, scalar_values)
+ }
+ _ => array_has_any_with_scalar_general(columnar_arg, scalar_values),
+ }
+}
+
+/// When the scalar argument has more elements than this, the scalar fast path
+/// builds a HashSet for O(1) lookups. At or below this threshold, it falls
+/// back to a linear scan, since hashing every columnar element is more
+/// expensive than a linear scan over a short array.
+const SCALAR_SMALL_THRESHOLD: usize = 8;
+
+/// String-specialized scalar fast path for `array_has_any`.
+fn array_has_any_with_scalar_string(
+ columnar_arg: &ColumnarValue,
+ scalar_values: &ArrayRef,
+) -> Result<ColumnarValue> {
+ let scalar_strings = string_array_to_vec(scalar_values.as_ref());
+ let has_null_scalar = scalar_strings.iter().any(|s| s.is_none());
+
+ let (col_arr, is_scalar_output) = match columnar_arg {
+ ColumnarValue::Array(arr) => (Arc::clone(arr), false),
+ ColumnarValue::Scalar(s) => (s.to_array_of_size(1)?, true),
+ };
+
+ let col_list: ArrayWrapper = col_arr.as_ref().try_into()?;
+ let all_col_strings = string_array_to_vec(col_list.values().as_ref());
+ let col_offsets: Vec<usize> = col_list.offsets().collect();
Review Comment:
Interestingly, this turned out to be much slower in every case except the single-element one:
```
array_has_any_scalar/string_no_match/1
time: [97.474 µs 98.334 µs 99.302 µs]
change: [−15.448% −14.766% −14.081%] (p = 0.00 < 0.05)
Performance has improved.
array_has_any_scalar/string_no_match/10
time: [298.73 µs 317.71 µs 343.54 µs]
change: [+32.141% +60.866% +85.741%] (p = 0.00 < 0.05)
Performance has regressed.
Found 20 outliers among 100 measurements (20.00%)
3 (3.00%) low mild
17 (17.00%) high severe
array_has_any_scalar/string_no_match/100
time: [437.14 µs 455.34 µs 480.03 µs]
change: [+22.766% +39.379% +59.120%] (p = 0.00 < 0.05)
Performance has regressed.
Found 18 outliers among 100 measurements (18.00%)
1 (1.00%) low mild
17 (17.00%) high severe
array_has_any_scalar/string_no_match/1000
time: [332.13 µs 351.25 µs 376.77 µs]
change: [+28.480% +54.273% +78.992%] (p = 0.00 < 0.05)
Performance has regressed.
```
I didn't dig into why; maybe the dynamic dispatch introduced by the iterator
adds a bunch of overhead? I'll leave this as-is for now.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]