tustvold commented on code in PR #2999: URL: https://github.com/apache/arrow-rs/pull/2999#discussion_r1014596963
########## parquet/src/column/reader/decoder.rs: ########## @@ -323,41 +368,153 @@ impl DefinitionLevelDecoder for ColumnLevelDecoderImpl { ) -> Result<(usize, usize)> { let mut level_skip = 0; let mut value_skip = 0; - match self.decoder.as_mut().unwrap() { - LevelDecoderInner::Packed(reader, bit_width) => { - for _ in 0..num_levels { - // Values are delimited by max_def_level - if max_def_level - == reader - .get_value::<i16>(*bit_width as usize) - .expect("Not enough values in Packed ColumnLevelDecoderImpl.") - { - value_skip += 1; - } - level_skip += 1; - } - } - LevelDecoderInner::Rle(reader) => { - for _ in 0..num_levels { - if let Some(level) = reader - .get::<i16>() - .expect("Not enough values in Rle ColumnLevelDecoderImpl.") - { - // Values are delimited by max_def_level - if level == max_def_level { - value_skip += 1; - } - } - level_skip += 1; + while level_skip < num_levels { + let remaining_levels = num_levels - level_skip; + + if self.buffer.is_empty() { + // Only read number of needed values + self.read_to_buffer(remaining_levels.min(SKIP_BUFFER_SIZE))?; + if self.buffer.is_empty() { + // Reached end of page + break; } } + let to_read = self.buffer.len().min(remaining_levels); + + level_skip += to_read; + value_skip += self.buffer[..to_read] + .iter() + .filter(|x| **x == max_def_level) + .count(); + + self.split_off_buffer(to_read) } + Ok((value_skip, level_skip)) } } impl RepetitionLevelDecoder for ColumnLevelDecoderImpl { - fn skip_rep_levels(&mut self, _num_records: usize) -> Result<(usize, usize)> { - Err(nyi_err!("https://github.com/apache/arrow-rs/issues/1792")) + fn skip_rep_levels(&mut self, num_records: usize) -> Result<(usize, usize)> { + let mut level_skip = 0; + let mut record_skip = 0; + + loop { + if self.buffer.is_empty() { + // Read SKIP_BUFFER_SIZE as we don't know how many to read + self.read_to_buffer(SKIP_BUFFER_SIZE)?; + if self.buffer.is_empty() { + // Reached end of page + break; + } + } + + let mut to_skip = 0; + while to_skip < self.buffer.len() && record_skip != num_records { + if self.buffer[to_skip] == 0 { + record_skip += 1; + } + to_skip += 1; + } + + // Find end of record + while to_skip < self.buffer.len() && self.buffer[to_skip] != 0 { + to_skip += 1; + } + + level_skip += to_skip; + if to_skip >= self.buffer.len() { Review Comment: Yeah, the hope is that >= helps LLVM elide bound checks that follow. Not sure it makes any difference, just habit 😅 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org