nevi-me commented on a change in pull request #8541: URL: https://github.com/apache/arrow/pull/8541#discussion_r516989790
########## File path: rust/arrow/src/array/equal/variable_size.rs ########## @@ -0,0 +1,90 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::array::{ArrayData, OffsetSizeTrait}; + +use super::utils::equal_len; + +fn offset_value_equal<T: OffsetSizeTrait>( + lhs_values: &[u8], + rhs_values: &[u8], + lhs_offsets: &[T], + rhs_offsets: &[T], + lhs_pos: usize, + rhs_pos: usize, + len: usize, +) -> bool { + let lhs_start = lhs_offsets[lhs_pos].to_usize().unwrap(); + let rhs_start = rhs_offsets[rhs_pos].to_usize().unwrap(); + let lhs_len = lhs_offsets[lhs_pos + len] - lhs_offsets[lhs_pos]; + let rhs_len = rhs_offsets[rhs_pos + len] - rhs_offsets[rhs_pos]; + + lhs_len == rhs_len + && equal_len( + lhs_values, + rhs_values, + lhs_start, + rhs_start, + lhs_len.to_usize().unwrap(), + ) +} + +pub fn variable_sized_equal<T: OffsetSizeTrait>( Review comment: May you please add a doc, as it's a public function ########## File path: rust/arrow/src/array/equal/null.rs ########## @@ -0,0 +1,31 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::array::ArrayData; + +#[inline] +pub fn null_equal( + _lhs: &ArrayData, + _rhs: &ArrayData, + _lhs_start: usize, + _rhs_start: usize, + _len: usize, +) -> bool { + // a null buffer's range is always true, as every element is by definition equal (to null). + // We only need to compare data_types + true Review comment: We also need to compare the lengths so we can achieve logical equality. `NullArray::new(3) != NullArray::new(4)`. If I write an IPC file with a single array for both those 2 arrays, it'll be unequal because we still encode lengths in the array's data. ########## File path: rust/arrow/src/array/data.rs ########## @@ -26,6 +26,15 @@ use crate::buffer::Buffer; use crate::datatypes::DataType; use crate::util::bit_util; +fn count_nulls(null_bit_buffer: Option<&Buffer>, offset: usize, len: usize) -> usize { Review comment: Perhaps we could inline this? I haven't peeked at what happens with the assembly of these small functions, but I've read somewhere in the past that it might be beneficial to always inline small functions. ########## File path: rust/arrow/src/array/data.rs ########## @@ -207,6 +210,56 @@ impl ArrayData { size } + + /// Creates a zero-copy slice of itself. This creates a new [ArrayData] + /// with a different offset, len and a shifted null bitmap. + /// + /// # Panics + /// + /// Panics if `offset + length > self.len()`. + pub fn slice(&self, offset: usize, length: usize) -> ArrayData { + assert!((offset + length) <= self.len()); + + let mut new_data = self.clone(); + + new_data.len = length; + new_data.offset = offset + self.offset; + + new_data.null_count = + count_nulls(new_data.null_buffer(), new_data.offset, new_data.len); + + new_data + } + + /// Returns the buffer `buffer` as a slice of type `T`. When the expected buffer is bit-packed, + /// the slice is not offset. + #[inline] + pub(super) fn buffer<T>(&self, buffer: usize) -> &[T] { + let values = unsafe { self.buffers[buffer].data().align_to::<T>() }; + if values.0.len() != 0 || values.2.len() != 0 { + panic!("The buffer is not byte-aligned with its interpretation") + }; + if self.data_type.width(buffer).unwrap() == 1 { + // bitpacked buffers are offset differently and it is the consumers that need to worry about. + &values.1 + } else { + &values.1[self.offset..] + } + } + + /// Returns the buffer `buffer` as a slice of type `T`. When the expected buffer is bit-packed, + /// the slice is not offset. + #[inline] + pub fn buffer_data(&self, buffer: usize) -> &[u8] { + let values = self.buffers[buffer].data(); + let width = self.data_type.width(buffer).unwrap(); + if width == 1 { + // bitpacked buffers are offset differently and it is the consumers that need to worry about. + &values + } else { + &values[self.offset * (width / 8)..] + } + } } impl PartialEq for ArrayData { Review comment: My changes here muddy the water. While reviewing this, I started to clean this up. I'll try submit a PR on top of this, removing the asserts, and completing the implementation. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org