gstvg commented on code in PR #9970:
URL: https://github.com/apache/arrow-rs/pull/9970#discussion_r3244938928


##########
arrow-select/src/cleanup_non_empty_nulls.rs:
##########
@@ -0,0 +1,873 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Rewrite variable-length arrays so that null entries point to empty offset
+//! ranges, while preserving the original null mask.
+//!
+//! Some variable-length array types (`Binary`, `Utf8`, `List`, `Map`, ...) can
+//! legally hold null entries whose offsets still reference real bytes/values
+//! in the underlying buffer. Iterating the child values exposes that data,
+//! which is rarely what callers want. This module provides:
+//!
+//! * [`has_non_empty_nulls`] - cheap check for whether the array contains
+//!   any such null entries.
+//! * [`cleanup_non_empty_nulls`] - produces an equivalent array where every
+//!   null entry has a zero-length offset range.
+
+use std::sync::Arc;
+
+use arrow_array::{
+    Array, ArrayRef, GenericByteArray, GenericListArray, MapArray, 
OffsetSizeTrait, UInt32Array,
+    cast::AsArray, make_array, new_null_array, types::ByteArrayType,
+};
+use arrow_buffer::{Buffer, OffsetBuffer};
+use arrow_schema::{ArrowError, DataType};
+
+/// Return true if there are no nulls pointing to non-empty values.
+///
+/// This will return true if [`cleanup_non_empty_nulls`] will do anything.
+///
+/// This should be called before [`cleanup_non_empty_nulls`] to avoid 
unnecessary work.
+pub fn has_non_empty_nulls(array: &dyn Array) -> Result<bool, ArrowError> {
+    Ok(match array.data_type() {
+        DataType::Binary => array.as_binary::<i32>().has_non_empty_nulls(),
+        DataType::LargeBinary => 
array.as_binary::<i64>().has_non_empty_nulls(),
+        DataType::Utf8 => array.as_string::<i32>().has_non_empty_nulls(),
+        DataType::LargeUtf8 => array.as_string::<i64>().has_non_empty_nulls(),
+        DataType::List(_) => array.as_list::<i32>().has_non_empty_nulls(),
+        DataType::LargeList(_) => array.as_list::<i64>().has_non_empty_nulls(),
+        DataType::Map(_, _) => array.as_map().has_non_empty_nulls(),
+        dt => {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "data type {dt:?} is not supported"
+            )));
+        }
+    })
+}
+
+/// Create a new list/map/bytes array with the same nulls as the original 
list/map/bytes array but all the nulls
+/// are pointing to an empty list/map/byte slice.
+///
+/// Users should first check if [`has_non_empty_nulls`] returns true before 
calling this method
+/// to avoid unnecessary work.
+///
+/// This is useful when wanting to go over the list/map/bytes values
+/// and not wanting to deal with values that are not used by the 
list/map/bytes.
+pub fn cleanup_non_empty_nulls(array: ArrayRef) -> Result<ArrayRef, 
ArrowError> {
+    match array.data_type() {
+        DataType::Binary => array.as_binary::<i32>().cleanup_non_empty_nulls(),
+        DataType::LargeBinary => 
array.as_binary::<i64>().cleanup_non_empty_nulls(),
+        DataType::Utf8 => array.as_string::<i32>().cleanup_non_empty_nulls(),
+        DataType::LargeUtf8 => 
array.as_string::<i64>().cleanup_non_empty_nulls(),
+        DataType::List(_) => array.as_list::<i32>().cleanup_non_empty_nulls(),
+        DataType::LargeList(_) => 
array.as_list::<i64>().cleanup_non_empty_nulls(),
+        DataType::Map(_, _) => array.as_map().cleanup_non_empty_nulls(),
+        dt => Err(ArrowError::InvalidArgumentError(format!(
+            "data type {dt:?} is not supported"
+        ))),
+    }
+}
+
+/// Helper trait to make it easier to implement the cleanup nulls for variable 
length with offsets
+trait VariableLengthArrayExt<OffsetSize: OffsetSizeTrait>: Array + Clone + 
Sized + 'static {
+    /// Get the offsets of the variable length array.
+    fn get_offsets(&self) -> &OffsetBuffer<OffsetSize>;
+
+    fn has_non_empty_nulls(&self) -> bool {
+        self.get_offsets().has_non_empty_nulls(self.nulls())
+    }
+
+    /// Create a new list/map/bytes array with the same nulls as the original 
list/map/bytes array but all the nulls
+    /// are pointing to an empty list/map/byte slice.
+    ///
+    /// Users should first check if [`Self::has_non_empty_nulls`] returns true 
before calling this method
+    /// to avoid unnecessary work.
+    ///
+    /// This is useful when wanting to go over the list/map/bytes values
+    /// and not wanting to deal with values that are not used by the 
list/map/bytes.
+    fn cleanup_non_empty_nulls(&self) -> Result<ArrayRef, ArrowError> {
+        let Some(nulls) = self.nulls().filter(|n| n.null_count() > 0) else {
+            // If no nulls, return as is
+            return Ok(Arc::new(self.clone()));
+        };
+
+        // Find an empty value so we can use the `take` kernel
+        let index_of_empty_value_to_reuse: Option<u32> = self
+            .get_offsets()
+            .lengths()
+            .position(|length| length == 0)
+            .map(|index| index as u32);
+
+        if let Some(index_of_empty_value) = index_of_empty_value_to_reuse {
+            let buffer = {
+                let iter = nulls.iter().enumerate().map(|(i, is_valid)| {
+                    if is_valid {
+                        i as u32
+                    } else {
+                        index_of_empty_value
+                    }
+                });
+
+                // SAFETY: upper bound is trusted because `iter` is just map 
over `nulls`
+                unsafe { Buffer::from_trusted_len_iter(iter) }
+            };
+
+            let cleanup_array = crate::take::take(

Review Comment:
   non-blocking: for lists/maps with big chunks of valid or empty sublists, is 
possible that using `MutableArrayData` directly be faster, since we can copy 
the given chunk in a single `memcpy` for some data types, and perform less  
[dynamic 
dispatches](https://github.com/apache/arrow-rs/blob/2e8e0c750b930c5bc3138d434c6007ffb7c22e61/arrow-select/src/take.rs#L649)
 compared to `take_list`?
   
   Now for string/binary I'm not sure since is 
[static](https://github.com/apache/arrow-rs/blob/2e8e0c750b930c5bc3138d434c6007ffb7c22e61/arrow-select/src/take.rs#L538)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to