alamb commented on a change in pull request #9093:
URL: https://github.com/apache/arrow/pull/9093#discussion_r551885031



##########
File path: rust/datafusion/tests/sql.rs
##########
@@ -132,6 +132,7 @@ async fn parquet_single_nan_schema() {
 }
 
 #[tokio::test]
+#[ignore = "Test ignored, will be enabled as part of the nested Parquet 
reader"]

Review comment:
       What happened to this test?  It looks like it used to pass and now it 
doesn't?

##########
File path: rust/arrow/src/array/equal/boolean.rs
##########
@@ -15,35 +15,60 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::array::ArrayData;
+use crate::array::{data::count_nulls, ArrayData};
+use crate::buffer::Buffer;
+use crate::util::bit_util::get_bit;
 
 use super::utils::equal_bits;
 
 pub(super) fn boolean_equal(
     lhs: &ArrayData,
     rhs: &ArrayData,
+    lhs_nulls: Option<&Buffer>,
+    rhs_nulls: Option<&Buffer>,
     lhs_start: usize,
     rhs_start: usize,
     len: usize,
 ) -> bool {
     let lhs_values = lhs.buffers()[0].as_slice();
     let rhs_values = rhs.buffers()[0].as_slice();
 
-    // TODO: we can do this more efficiently if all values are not-null
-    (0..len).all(|i| {
-        let lhs_pos = lhs_start + i;
-        let rhs_pos = rhs_start + i;
-        let lhs_is_null = lhs.is_null(lhs_pos);
-        let rhs_is_null = rhs.is_null(rhs_pos);
-
-        lhs_is_null
-            || (lhs_is_null == rhs_is_null)
-                && equal_bits(
-                    lhs_values,
-                    rhs_values,
-                    lhs_pos + lhs.offset(),
-                    rhs_pos + rhs.offset(),
-                    1,
-                )
-    })
+    let lhs_null_count = count_nulls(lhs_nulls, lhs_start, len);
+    let rhs_null_count = count_nulls(rhs_nulls, rhs_start, len);
+
+    if lhs_null_count == 0 && rhs_null_count == 0 {
+        (0..len).all(|i| {
+            let lhs_pos = lhs_start + i;
+            let rhs_pos = rhs_start + i;
+
+            equal_bits(
+                lhs_values,
+                rhs_values,
+                lhs_pos + lhs.offset(),
+                rhs_pos + rhs.offset(),
+                1,
+            )
+        })
+    } else {
+        // get a ref of the null buffer bytes, to use in testing for nullness
+        let lhs_null_bytes = lhs_nulls.as_ref().unwrap().as_slice();

Review comment:
       Is it possible for `lhs_nulls == Some(..)` but `rhs_nulls == None` (and 
visa versa?) Given they are optional arguments I wasn't sure if they would 
always both be either `None` or `Some`

##########
File path: rust/arrow/src/array/equal/structure.rs
##########
@@ -37,39 +37,20 @@ fn equal_values(
     rhs_start: usize,
     len: usize,
 ) -> bool {
-    let mut temp_lhs: Option<Buffer> = None;
-    let mut temp_rhs: Option<Buffer> = None;
-
     lhs.child_data()
         .iter()
         .zip(rhs.child_data())
         .all(|(lhs_values, rhs_values)| {
             // merge the null data
-            let lhs_merged_nulls = match (lhs_nulls, lhs_values.null_buffer()) 
{

Review comment:
       I think the use of `temp_lhs` and `temp_rhs` here is to avoid the 
`lhs_nulls.cloned()` and `rhs_nulls.cloned()` calls below. 

##########
File path: rust/datafusion/tests/sql.rs
##########
@@ -132,6 +132,7 @@ async fn parquet_single_nan_schema() {
 }
 
 #[tokio::test]
+#[ignore = "Test ignored, will be enabled as part of the nested Parquet 
reader"]

Review comment:
       When I ran this test locally, it fails with a seemingly non-sensical 
error
   
   ```
   failures:
   
   ---- parquet_list_columns stdout ----
   thread 'parquet_list_columns' panicked at 'assertion failed: `(left == 
right)`
     left: `PrimitiveArray<Int64>
   [
     null,
     1,
   ]`,
    right: `PrimitiveArray<Int64>
   [
     null,
     1,
   ]`', datafusion/tests/sql.rs:204:5
   ```

##########
File path: rust/arrow/src/array/equal/utils.rs
##########
@@ -76,3 +80,185 @@ pub(super) fn equal_len(
 ) -> bool {
     lhs_values[lhs_start..(lhs_start + len)] == 
rhs_values[rhs_start..(rhs_start + len)]
 }
+
+/// Computes the logical validity bitmap of the array data using the
+/// parent's array data. The parent should be a list or struct, else
+/// the logical bitmap of the array is returned unaltered.
+///
+/// Parent data is passed along with the parent's logical bitmap, as
+/// nested arrays could have a logical bitmap different to the physical
+/// one on the `ArrayData`.
+pub(super) fn child_logical_null_buffer(
+    parent_data: &ArrayData,
+    logical_null_buffer: Option<Buffer>,

Review comment:
       I think if you changed 
   
   ```
       let parent_bitmap = 
logical_null_buffer.map(Bitmap::from).unwrap_or_else(|| {
   ```
   
   to 
   
   ```
       let parent_bitmap = 
logical_null_buffer.cloned().map(Bitmap::from).unwrap_or_else(|| {
   
   ```
   
   Then the signature could take an `Option<&Buffer>` and the code is cleaner 
(fewer calls to `.cloned()` outside this function). 
   
   But I don't think it has any runtime effect

##########
File path: rust/arrow/src/array/equal/list.rs
##########
@@ -71,45 +79,94 @@ fn offset_value_equal<T: OffsetSizeTrait>(
 pub(super) fn list_equal<T: OffsetSizeTrait>(
     lhs: &ArrayData,
     rhs: &ArrayData,
+    lhs_nulls: Option<&Buffer>,
+    rhs_nulls: Option<&Buffer>,
     lhs_start: usize,
     rhs_start: usize,
     len: usize,
 ) -> bool {
     let lhs_offsets = lhs.buffer::<T>(0);
     let rhs_offsets = rhs.buffer::<T>(0);
 
+    // There is an edge-case where a n-length list that has 0 children, 
results in panics.
+    // For example; an array with offsets [0, 0, 0, 0, 0] has 4 slots, but 
will have

Review comment:
       I probably am mis understanding but `[0, 0, 0, 0, 0]` has 5 entries but 
this comment says "4 slots"

##########
File path: rust/arrow/src/array/equal/utils.rs
##########
@@ -76,3 +80,185 @@ pub(super) fn equal_len(
 ) -> bool {
     lhs_values[lhs_start..(lhs_start + len)] == 
rhs_values[rhs_start..(rhs_start + len)]
 }
+
+/// Computes the logical validity bitmap of the array data using the
+/// parent's array data. The parent should be a list or struct, else
+/// the logical bitmap of the array is returned unaltered.
+///
+/// Parent data is passed along with the parent's logical bitmap, as
+/// nested arrays could have a logical bitmap different to the physical
+/// one on the `ArrayData`.
+pub(super) fn child_logical_null_buffer(
+    parent_data: &ArrayData,
+    logical_null_buffer: Option<Buffer>,
+    child_data: &ArrayData,
+) -> Option<Buffer> {
+    let parent_len = parent_data.len();
+    let parent_bitmap = 
logical_null_buffer.map(Bitmap::from).unwrap_or_else(|| {
+        let ceil = bit_util::ceil(parent_len, 8);
+        Bitmap::from(Buffer::from(vec![0b11111111; ceil]))
+    });
+    let self_null_bitmap = child_data.null_bitmap().clone().unwrap_or_else(|| {
+        let ceil = bit_util::ceil(child_data.len(), 8);
+        Bitmap::from(Buffer::from(vec![0b11111111; ceil]))
+    });
+    match parent_data.data_type() {
+        DataType::List(_) => Some(logical_list_bitmap::<i32>(
+            parent_data,
+            parent_bitmap,
+            self_null_bitmap,
+        )),
+        DataType::LargeList(_) => Some(logical_list_bitmap::<i64>(
+            parent_data,
+            parent_bitmap,
+            self_null_bitmap,
+        )),
+        DataType::FixedSizeList(_, len) => {
+            let len = *len as usize;
+            let array_offset = parent_data.offset();
+            let bitmap_len = bit_util::ceil(parent_len * len, 8);
+            let mut buffer =
+                MutableBuffer::new(bitmap_len).with_bitset(bitmap_len, false);
+            let mut null_slice = buffer.as_slice_mut();
+            (array_offset..parent_len + array_offset).for_each(|index| {
+                let start = index * len;
+                let end = start + len;
+                let mask = parent_bitmap.is_set(index);
+                (start..end).for_each(|child_index| {
+                    if mask && self_null_bitmap.is_set(child_index) {
+                        bit_util::set_bit(&mut null_slice, child_index);
+                    }
+                });
+            });
+            Some(buffer.into())
+        }
+        DataType::Struct(_) => {
+            // Arrow implementations are free to pad data, which can result in 
null buffers not
+            // having the same length.
+            // Rust bitwise comparisons will return an error if left AND right 
is performed on
+            // buffers of different length.
+            // This might be a valid case during integration testing, where we 
read Arrow arrays
+            // from IPC data, which has padding.
+            //
+            // We first perform a bitwise comparison, and if there is an 
error, we revert to a
+            // slower method that indexes into the buffers one-by-one.
+            let result = &parent_bitmap & &self_null_bitmap;
+            if let Ok(bitmap) = result {
+                return Some(bitmap.bits);
+            }
+            // slow path
+            let array_offset = parent_data.offset();
+            let mut buffer = MutableBuffer::new_null(parent_len);
+            let mut null_slice = buffer.as_slice_mut();
+            (0..parent_len).for_each(|index| {
+                if parent_bitmap.is_set(index + array_offset)
+                    && self_null_bitmap.is_set(index + array_offset)
+                {
+                    bit_util::set_bit(&mut null_slice, index);
+                }
+            });
+            Some(buffer.into())
+        }
+        DataType::Union(_) => {
+            unimplemented!("Logical equality not yet implemented for union 
arrays")
+        }
+        DataType::Dictionary(_, _) => {
+            unimplemented!("Logical equality not yet implemented for nested 
dictionaries")
+        }
+        data_type => {
+            panic!("Data type {:?} is not a supported nested type", data_type)
+        }
+    }
+}
+
+// Calculate a list child's logical bitmap/buffer

Review comment:
       I don't fully understand how lists work  in arrow, but I will take your 
word for it that it does the right thing and that the tests are accurate. 

##########
File path: rust/arrow/src/array/equal/list.rs
##########
@@ -71,45 +79,94 @@ fn offset_value_equal<T: OffsetSizeTrait>(
 pub(super) fn list_equal<T: OffsetSizeTrait>(
     lhs: &ArrayData,
     rhs: &ArrayData,
+    lhs_nulls: Option<&Buffer>,
+    rhs_nulls: Option<&Buffer>,
     lhs_start: usize,
     rhs_start: usize,
     len: usize,
 ) -> bool {
     let lhs_offsets = lhs.buffer::<T>(0);
     let rhs_offsets = rhs.buffer::<T>(0);
 
+    // There is an edge-case where a n-length list that has 0 children, 
results in panics.
+    // For example; an array with offsets [0, 0, 0, 0, 0] has 4 slots, but 
will have
+    // no valid children.
+    // Under logical equality, the child null bitmap will be an empty buffer, 
as there are
+    // no child values. This causes panics when trying to count set bits.
+    //
+    // We caught this by chance from an accidental test-case, but due to the 
nature of this
+    // crash only occuring on list equality checks, we are adding a check 
here, instead of
+    // on the buffer/bitmap utilities, as a length check would incur a penalty 
for almost all
+    // other use-cases.
+    //
+    // The solution is to check the number of child values from offsets, and 
return `true` if
+    // they = 0. Empty arrays are equal, so this is correct.
+    //
+    // It's unlikely that one would create a n-length list array with no 
values, where n > 0,
+    // however, one is more likely to slice into a list array and get a region 
that has 0
+    // child values.
+    // The test that triggered this behaviour had [4, 4] as a slice of 1 value 
slot.
+    let lhs_child_length = lhs_offsets.get(len).unwrap().to_usize().unwrap()

Review comment:
       I don't fully understand the need for this check given that 
`count_nulls` seems to handle a buffer of `None` by returning zero.  
   
   When this code is commented out, however, I see the panic of
   
   ```
   ---- array::equal::tests::test_list_offsets stdout ----
   thread 'array::equal::tests::test_list_offsets' panicked at 'assertion 
failed: ceil(offset + len, 8) <= buffer.len() * 8', 
arrow/src/util/bit_chunk_iterator.rs:33:9
   ```
   
   So given that it is covered by tests, 👍 

##########
File path: rust/arrow/src/array/equal/utils.rs
##########
@@ -76,3 +80,185 @@ pub(super) fn equal_len(
 ) -> bool {
     lhs_values[lhs_start..(lhs_start + len)] == 
rhs_values[rhs_start..(rhs_start + len)]
 }
+
+/// Computes the logical validity bitmap of the array data using the
+/// parent's array data. The parent should be a list or struct, else
+/// the logical bitmap of the array is returned unaltered.
+///
+/// Parent data is passed along with the parent's logical bitmap, as
+/// nested arrays could have a logical bitmap different to the physical
+/// one on the `ArrayData`.
+pub(super) fn child_logical_null_buffer(
+    parent_data: &ArrayData,
+    logical_null_buffer: Option<Buffer>,
+    child_data: &ArrayData,
+) -> Option<Buffer> {
+    let parent_len = parent_data.len();
+    let parent_bitmap = 
logical_null_buffer.map(Bitmap::from).unwrap_or_else(|| {
+        let ceil = bit_util::ceil(parent_len, 8);
+        Bitmap::from(Buffer::from(vec![0b11111111; ceil]))
+    });
+    let self_null_bitmap = child_data.null_bitmap().clone().unwrap_or_else(|| {
+        let ceil = bit_util::ceil(child_data.len(), 8);
+        Bitmap::from(Buffer::from(vec![0b11111111; ceil]))
+    });
+    match parent_data.data_type() {
+        DataType::List(_) => Some(logical_list_bitmap::<i32>(
+            parent_data,
+            parent_bitmap,
+            self_null_bitmap,
+        )),
+        DataType::LargeList(_) => Some(logical_list_bitmap::<i64>(
+            parent_data,
+            parent_bitmap,
+            self_null_bitmap,
+        )),
+        DataType::FixedSizeList(_, len) => {
+            let len = *len as usize;
+            let array_offset = parent_data.offset();
+            let bitmap_len = bit_util::ceil(parent_len * len, 8);
+            let mut buffer =
+                MutableBuffer::new(bitmap_len).with_bitset(bitmap_len, false);
+            let mut null_slice = buffer.as_slice_mut();
+            (array_offset..parent_len + array_offset).for_each(|index| {
+                let start = index * len;
+                let end = start + len;
+                let mask = parent_bitmap.is_set(index);
+                (start..end).for_each(|child_index| {
+                    if mask && self_null_bitmap.is_set(child_index) {
+                        bit_util::set_bit(&mut null_slice, child_index);
+                    }
+                });
+            });
+            Some(buffer.into())
+        }
+        DataType::Struct(_) => {
+            // Arrow implementations are free to pad data, which can result in 
null buffers not
+            // having the same length.
+            // Rust bitwise comparisons will return an error if left AND right 
is performed on
+            // buffers of different length.
+            // This might be a valid case during integration testing, where we 
read Arrow arrays
+            // from IPC data, which has padding.
+            //
+            // We first perform a bitwise comparison, and if there is an 
error, we revert to a

Review comment:
       👍  for comments.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to