This is an automated email from the ASF dual-hosted git repository. alamb pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push: new 713bd6d Truncate bitmask on split (#1183) 713bd6d is described below commit 713bd6dca996417f1deccc5b130eeb4e53c8d89a Author: Raphael Taylor-Davies <1781103+tustv...@users.noreply.github.com> AuthorDate: Mon Jan 17 15:51:21 2022 +0000 Truncate bitmask on split (#1183) * Truncate bitmask on split * Fix BooleanBufferBuilder::resize * Format --- arrow/src/array/builder.rs | 32 ++++++++++++++ .../src/arrow/record_reader/definition_levels.rs | 51 +++++++++++++++++----- 2 files changed, 72 insertions(+), 11 deletions(-) diff --git a/arrow/src/array/builder.rs b/arrow/src/array/builder.rs index 85c013c..18deac3 100644 --- a/arrow/src/array/builder.rs +++ b/arrow/src/array/builder.rs @@ -367,6 +367,15 @@ impl BooleanBufferBuilder { } } + /// Resizes the buffer, either truncating its contents (with no change in capacity), or + /// growing it (potentially reallocating it) and writing `false` in the newly available bits. + #[inline] + pub fn resize(&mut self, len: usize) { + let len_bytes = bit_util::ceil(len, 8); + self.buffer.resize(len_bytes, 0); + self.len = len; + } + #[inline] pub fn append(&mut self, v: bool) { self.advance(1); @@ -2932,6 +2941,29 @@ mod tests { } #[test] + fn test_boolean_array_builder_resize() { + let mut builder = BooleanBufferBuilder::new(20); + builder.append_n(4, true); + builder.append_n(7, false); + builder.append_n(2, true); + builder.resize(20); + + assert_eq!(builder.len, 20); + assert_eq!( + builder.buffer.as_slice(), + &[0b00001111, 0b00011000, 0b00000000] + ); + + builder.resize(5); + assert_eq!(builder.len, 5); + assert_eq!(builder.buffer.as_slice(), &[0b00001111]); + + builder.append_n(4, true); + assert_eq!(builder.len, 9); + assert_eq!(builder.buffer.as_slice(), &[0b11101111, 0b00000001]); + } + + #[test] fn test_boolean_builder_increases_buffer_len() { // 00000010 01001000 let buf = Buffer::from([72_u8, 2_u8]); diff --git a/parquet/src/arrow/record_reader/definition_levels.rs b/parquet/src/arrow/record_reader/definition_levels.rs index c38505a..750e118 100644 --- a/parquet/src/arrow/record_reader/definition_levels.rs +++ b/parquet/src/arrow/record_reader/definition_levels.rs @@ -105,25 +105,25 @@ impl DefinitionLevelBuffer { /// Split `len` levels out of `self` pub fn split_bitmask(&mut self, len: usize) -> Bitmap { - let builder = match &mut self.inner { + let old_builder = match &mut self.inner { BufferInner::Full { nulls, .. } => nulls, BufferInner::Mask { nulls } => nulls, }; - let old_len = builder.len(); - let num_left_values = old_len - len; - let new_bitmap_builder = + // Compute the number of values left behind + let num_left_values = old_builder.len() - len; + let mut new_builder = BooleanBufferBuilder::new(MIN_BATCH_SIZE.max(num_left_values)); - let old_bitmap = std::mem::replace(builder, new_bitmap_builder).finish(); - let old_bitmap = Bitmap::from(old_bitmap); + // Copy across remaining values + new_builder.append_packed_range(len..old_builder.len(), old_builder.as_slice()); - for i in len..old_len { - builder.append(old_bitmap.is_set(i)); - } + // Truncate buffer + old_builder.resize(len); - self.len = builder.len(); - old_bitmap + // Swap into self + self.len = new_builder.len(); + Bitmap::from(std::mem::replace(old_builder, new_builder).finish()) } /// Returns an iterator of the valid positions in `range` in descending order @@ -391,8 +391,11 @@ fn iter_set_bits_rev(bytes: &[u8]) -> impl Iterator<Item = usize> + '_ { #[cfg(test)] mod tests { use super::*; + use std::sync::Arc; + use crate::basic::Type as PhysicalType; use crate::encodings::rle::RleEncoder; + use crate::schema::types::{ColumnDescriptor, ColumnPath, Type}; use rand::{thread_rng, Rng, RngCore}; #[test] @@ -462,4 +465,30 @@ mod tests { assert_eq!(actual, expected); } } + + #[test] + fn test_split_off() { + let t = Type::primitive_type_builder("col", PhysicalType::INT32) + .build() + .unwrap(); + + let descriptor = Arc::new(ColumnDescriptor::new( + Arc::new(t), + 1, + 0, + ColumnPath::new(vec![]), + )); + + let mut buffer = DefinitionLevelBuffer::new(&descriptor, true); + match &mut buffer.inner { + BufferInner::Mask { nulls } => nulls.append_n(100, false), + _ => unreachable!(), + }; + + let bitmap = buffer.split_bitmask(19); + + // Should have split off 19 records leaving, 81 behind + assert_eq!(bitmap.len(), 3 * 8); // Note: bitmask only tracks bytes not bits + assert_eq!(buffer.nulls().len(), 81); + } }