tustvold commented on a change in pull request #1041:
URL: https://github.com/apache/arrow-rs/pull/1041#discussion_r777130081
##########
File path: parquet/src/arrow/record_reader/definition_levels.rs
##########
@@ -0,0 +1,82 @@
+use arrow::array::BooleanBufferBuilder;
+use arrow::bitmap::Bitmap;
+use arrow::buffer::Buffer;
+use std::ops::Range;
+
+use crate::column::reader::decoder::ColumnLevelDecoderImpl;
+use crate::schema::types::ColumnDescPtr;
+
+use super::{
+ buffer::{RecordBuffer, TypedBuffer},
+ MIN_BATCH_SIZE,
+};
+
+pub struct DefinitionLevelBuffer {
+ buffer: TypedBuffer<i16>,
+ builder: BooleanBufferBuilder,
+ max_level: i16,
+}
+
+impl RecordBuffer for DefinitionLevelBuffer {
+ type Output = Buffer;
+ type Writer = [i16];
+
+ fn split(&mut self, len: usize) -> Self::Output {
+ self.buffer.split(len)
+ }
+
+ fn writer(&mut self, batch_size: usize) -> &mut Self::Writer {
+ assert_eq!(self.buffer.len(), self.builder.len());
+ self.buffer.writer(batch_size)
+ }
+
+ fn commit(&mut self, len: usize) {
+ self.buffer.commit(len);
+ let buf = self.buffer.as_slice();
+
+ let range = self.builder.len()..len;
+ self.builder.reserve(range.end - range.start);
+ for i in &buf[range] {
+ self.builder.append(*i == self.max_level)
+ }
+ }
+}
+
+impl DefinitionLevelBuffer {
+ pub fn new(desc: &ColumnDescPtr) -> Self {
+ Self {
+ buffer: TypedBuffer::new(),
+ builder: BooleanBufferBuilder::new(0),
+ max_level: desc.max_def_level(),
+ }
+ }
+
+ /// Split `len` levels out of `self`
+ pub fn split_bitmask(&mut self, len: usize) -> Bitmap {
+ let old_len = self.builder.len();
+ let num_left_values = old_len - len;
+ let new_bitmap_builder =
+ BooleanBufferBuilder::new(MIN_BATCH_SIZE.max(num_left_values));
+
+ let old_bitmap =
+ std::mem::replace(&mut self.builder, new_bitmap_builder).finish();
+ let old_bitmap = Bitmap::from(old_bitmap);
+
+ for i in len..old_len {
+ self.builder.append(old_bitmap.is_set(i));
+ }
+
+ old_bitmap
+ }
+
+ pub fn valid_position_iter(
+ &self,
+ range: Range<usize>,
+ ) -> impl Iterator<Item = usize> + '_ {
+ let max_def_level = self.max_level;
+ let slice = self.buffer.as_slice();
+ range.rev().filter(move |x| slice[*x] == max_def_level)
Review comment:
Currently, `BooleanBufferBuilder` has no way to append the contents of other
`BooleanBuffer`s. #1039 adds this, but I'd rather not make this PR depend on it.
Additionally, the cost of the memory allocation and copy may outweigh the
gains from SIMD.
Given this, I'm going to leave this as is, especially as #1054 will remove
this code from the decode path for files without nested nullability.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]