Jefffrey commented on code in PR #22390:
URL: https://github.com/apache/datafusion/pull/22390#discussion_r3292176194
##########
datafusion/functions-nested/src/remove.rs:
##########
@@ -357,6 +381,47 @@ fn array_remove_internal(
}
}
+/// Dispatches scalar-needle array removal by list offset type.
+///
+/// `needle` must be a length-1 array containing the scalar element to remove.
+fn array_remove_dispatch_scalar(
+ array: &ArrayRef,
+ needle: &ArrayRef,
+ arr_n: &[i64],
+) -> Result<ArrayRef> {
+ match array.data_type() {
+ DataType::List(_) => {
+ let list_array = array.as_list::<i32>();
+ general_remove_with_scalar::<i32>(list_array, needle, arr_n)
+ }
+ DataType::LargeList(_) => {
+ let list_array = array.as_list::<i64>();
+ general_remove_with_scalar::<i64>(list_array, needle, arr_n)
+ }
+ array_type => exec_err!(
+ "array_remove/array_remove_n/array_remove_all does not support
type '{array_type}'."
+ ),
+ }
+}
+
+/// Removes elements matching a scalar needle from a list array.
+///
+/// Uses a bulk `distinct` comparison for non-null, non-nested scalar elements,
+/// falling back to the per-row `general_remove` path for null or nested types.
+fn remove_with_scalar_needle(
Review Comment:
I find this function a little confusing, since the code paths can then be
like:
```
invoke with args -> remove_with_scalar_needle -> array_remove_internal
(fallback)
```
Perhaps we can hoist this null/nested check earlier to remove need for this
function? Or the benefits of centralizing the check outweighs it?
##########
datafusion/functions-nested/src/remove.rs:
##########
@@ -214,7 +229,23 @@ impl ScalarUDFImpl for ArrayRemoveN {
}
fn invoke_with_args(&self, args: ScalarFunctionArgs) ->
Result<ColumnarValue> {
- make_scalar_function(array_remove_n_inner)(&args.args)
+ let [list_arg, element_arg, max_arg] =
+ take_function_args(self.name(), &args.args)?;
+ let num_rows = args.number_rows;
+ let list_array = list_arg.to_array(num_rows)?;
+ let max_array = max_arg.to_array(num_rows)?;
+ let arr_n = as_int64_array(&max_array)?.values().to_vec();
Review Comment:
We're ignoring nulls in `max_arg` here, though I think this may be an
existing issue
##########
datafusion/functions-nested/src/remove.rs:
##########
@@ -468,6 +533,114 @@ fn general_remove<OffsetSize: OffsetSizeTrait>(
)?))
}
+/// For each element of `list_array[i]`, removed up to `arr_n[i]` occurrences
+/// of `needle[0]` (scalar element broadcasted).
+///
+/// This is a specialized version of `general_remove` for scalar elements that
+/// uses bulk comparison for better performance.
+fn general_remove_with_scalar<OffsetSize: OffsetSizeTrait>(
+ list_array: &GenericListArray<OffsetSize>,
+ needle: &ArrayRef,
+ arr_n: &[i64],
Review Comment:
Similar question to that of array_replace PR, in if we should just handle
when needle & max are scalars only
##########
datafusion/functions-nested/src/remove.rs:
##########
@@ -468,6 +533,114 @@ fn general_remove<OffsetSize: OffsetSizeTrait>(
)?))
}
+/// For each element of `list_array[i]`, removed up to `arr_n[i]` occurrences
+/// of `needle[0]` (scalar element broadcasted).
+///
+/// This is a specialized version of `general_remove` for scalar elements that
+/// uses bulk comparison for better performance.
+fn general_remove_with_scalar<OffsetSize: OffsetSizeTrait>(
+ list_array: &GenericListArray<OffsetSize>,
+ needle: &ArrayRef,
+ arr_n: &[i64],
+) -> Result<ArrayRef> {
+ let list_field = match list_array.data_type() {
+ DataType::List(field) | DataType::LargeList(field) => field,
+ _ => {
+ return exec_err!(
+ "Expected List or LargeList data type, got {:?}",
+ list_array.data_type()
+ );
+ }
+ };
+
+ let list_offsets = list_array.offsets();
+ let first_offset = list_offsets[0].to_usize().unwrap();
+ let last_offset = list_offsets[list_offsets.len() - 1].to_usize().unwrap();
+ let values_range_len = last_offset - first_offset;
+ let values_slice = list_array.values().slice(first_offset,
values_range_len);
+ let original_data = values_slice.to_data();
+ let mut offsets = Vec::<OffsetSize>::with_capacity(list_array.len() + 1);
Review Comment:
Use
[`OffsetBufferBuilder`](https://docs.rs/arrow/latest/arrow/array/struct.OffsetBufferBuilder.html)
here
##########
datafusion/functions-nested/src/remove.rs:
##########
@@ -468,6 +533,114 @@ fn general_remove<OffsetSize: OffsetSizeTrait>(
)?))
}
+/// For each element of `list_array[i]`, removed up to `arr_n[i]` occurrences
+/// of `needle[0]` (scalar element broadcasted).
+///
+/// This is a specialized version of `general_remove` for scalar elements that
+/// uses bulk comparison for better performance.
+fn general_remove_with_scalar<OffsetSize: OffsetSizeTrait>(
+ list_array: &GenericListArray<OffsetSize>,
+ needle: &ArrayRef,
+ arr_n: &[i64],
+) -> Result<ArrayRef> {
+ let list_field = match list_array.data_type() {
+ DataType::List(field) | DataType::LargeList(field) => field,
+ _ => {
+ return exec_err!(
+ "Expected List or LargeList data type, got {:?}",
+ list_array.data_type()
+ );
+ }
+ };
+
+ let list_offsets = list_array.offsets();
+ let first_offset = list_offsets[0].to_usize().unwrap();
+ let last_offset = list_offsets[list_offsets.len() - 1].to_usize().unwrap();
+ let values_range_len = last_offset - first_offset;
+ let values_slice = list_array.values().slice(first_offset,
values_range_len);
+ let original_data = values_slice.to_data();
+ let mut offsets = Vec::<OffsetSize>::with_capacity(list_array.len() + 1);
+ offsets.push(OffsetSize::zero());
+
+ let mut mutable = MutableArrayData::with_capacities(
Review Comment:
I wonder if an approach using `take` kernel could provide even more
performance gains?
##########
datafusion/functions-nested/src/remove.rs:
##########
@@ -468,6 +533,114 @@ fn general_remove<OffsetSize: OffsetSizeTrait>(
)?))
}
+/// For each element of `list_array[i]`, removed up to `arr_n[i]` occurrences
+/// of `needle[0]` (scalar element broadcasted).
+///
+/// This is a specialized version of `general_remove` for scalar elements that
+/// uses bulk comparison for better performance.
+fn general_remove_with_scalar<OffsetSize: OffsetSizeTrait>(
+ list_array: &GenericListArray<OffsetSize>,
+ needle: &ArrayRef,
+ arr_n: &[i64],
+) -> Result<ArrayRef> {
+ let list_field = match list_array.data_type() {
+ DataType::List(field) | DataType::LargeList(field) => field,
+ _ => {
+ return exec_err!(
+ "Expected List or LargeList data type, got {:?}",
+ list_array.data_type()
+ );
+ }
+ };
+
+ let list_offsets = list_array.offsets();
+ let first_offset = list_offsets[0].to_usize().unwrap();
+ let last_offset = list_offsets[list_offsets.len() - 1].to_usize().unwrap();
+ let values_range_len = last_offset - first_offset;
+ let values_slice = list_array.values().slice(first_offset,
values_range_len);
+ let original_data = values_slice.to_data();
+ let mut offsets = Vec::<OffsetSize>::with_capacity(list_array.len() + 1);
+ offsets.push(OffsetSize::zero());
+
+ let mut mutable = MutableArrayData::with_capacities(
+ vec![&original_data],
+ false,
+ Capacities::Array(original_data.len()),
+ );
+ let nulls = list_array.nulls().cloned();
+ let keep_mask =
+ arrow_ord::cmp::distinct(&values_slice,
&Scalar::new(Arc::clone(needle)))?;
+ let remove_bits = match keep_mask.nulls() {
Review Comment:
> `distinct` is similar to `neq`, only differing in null handling. In
particular, two
operands are considered DISTINCT if they have a different value or if one of
them is NULL
and the other isn’t. The result of `distinct` is never NULL.
https://docs.rs/arrow/latest/arrow/compute/kernels/cmp/fn.distinct.html
I don't think we should have handling around the null buffer of `keep_mask`
according to the documentation
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]