yordan-pavlov commented on a change in pull request #692:
URL: https://github.com/apache/arrow-rs/pull/692#discussion_r688979972
##########
File path: parquet/src/arrow/arrow_array_reader.rs
##########
@@ -733,19 +757,27 @@ impl<I: Iterator<Item = Box<dyn ValueDecoder>>>
ValueDecoder
&mut self,
num_values: usize,
read_bytes: &mut dyn FnMut(&[u8], usize),
- ) -> Result<usize> {
+ ) -> Result<(usize, usize)> {
let mut values_to_read = num_values;
+ let mut child_values_to_read = num_values;
while values_to_read > 0 {
let value_decoder = match self.current_decoder.as_mut() {
Some(d) => d,
// no more decoders
None => break,
};
while values_to_read > 0 {
- let values_read =
+ let (values_read, child_values_read) =
value_decoder.read_value_bytes(values_to_read,
read_bytes)?;
+ if values_read != child_values_read {
+ child_values_to_read = child_values_read;
Review comment:
wouldn't `child_values_read` need to be accumulated / summed up across
loop iterations (instead of only taking the value from the last iteration) ?
##########
File path: parquet/src/arrow/arrow_writer.rs
##########
@@ -1728,4 +1728,34 @@ mod tests {
let stats = column.statistics().unwrap();
assert_eq!(stats.null_count(), 2);
}
+
+ #[test]
+ // This writes a list in row groups of 1 record, to check that the reader
will
+ // correctly read the right number of values across row group boundaries
+ // to fill the required number of list slots.
+ // See https://github.com/apache/arrow-rs/issues/518, as this test fixes
that.
+ fn list_single_column_string() {
+ let a_values = StringArray::from(vec![
+ "one", "two", "three", "four", "five", "six", "seven", "eight",
"nine", "ten",
Review comment:
something to keep in mind is that the new `ArrowArrayReader` is
currently only used for strings, which limits the effect of this fix;
`ArrowArrayReader` could be used / "turned on" for primitive types as well, but
it's currently slower for primitive types compared to the old
`PrimitiveArrayReader`
##########
File path: parquet/src/arrow/arrow_array_reader.rs
##########
@@ -777,31 +829,118 @@ impl ValueDecoder for LevelValueDecoder {
&mut self,
num_values: usize,
read_bytes: &mut dyn FnMut(&[u8], usize),
- ) -> Result<usize> {
+ ) -> Result<(usize, usize)> {
let value_size = std::mem::size_of::<i16>();
let mut total_values_read = 0;
- while total_values_read < num_values {
- let values_to_read = std::cmp::min(
- num_values - total_values_read,
- self.level_value_buffer.len(),
- );
- let values_read = match self
- .level_decoder
- .get(&mut self.level_value_buffer[..values_to_read])
- {
- Ok(values_read) => values_read,
- Err(e) => return Err(e),
- };
- if values_read > 0 {
- let level_value_bytes =
- &self.level_value_buffer.to_byte_slice()[..values_read *
value_size];
- read_bytes(level_value_bytes, values_read);
- total_values_read += values_read;
- } else {
- break;
+
+ // When reading repetition levels, num_values will likely be less than
the array values
Review comment:
with these changes, how does this ArrowArrayReader work with the
`ListArrayReader`?
##########
File path: parquet/src/arrow/arrow_array_reader.rs
##########
@@ -777,31 +829,118 @@ impl ValueDecoder for LevelValueDecoder {
&mut self,
num_values: usize,
read_bytes: &mut dyn FnMut(&[u8], usize),
- ) -> Result<usize> {
+ ) -> Result<(usize, usize)> {
let value_size = std::mem::size_of::<i16>();
let mut total_values_read = 0;
- while total_values_read < num_values {
- let values_to_read = std::cmp::min(
- num_values - total_values_read,
- self.level_value_buffer.len(),
- );
- let values_read = match self
- .level_decoder
- .get(&mut self.level_value_buffer[..values_to_read])
- {
- Ok(values_read) => values_read,
- Err(e) => return Err(e),
- };
- if values_read > 0 {
- let level_value_bytes =
- &self.level_value_buffer.to_byte_slice()[..values_read *
value_size];
- read_bytes(level_value_bytes, values_read);
- total_values_read += values_read;
- } else {
- break;
+
+ // When reading repetition levels, num_values will likely be less than
the array values
+ // needed, e.g. a list array with [[0, 1], [2, 3]] has 2 values, but
needs 4 level values
+ // to be read. We have to count the number of records read by checking
where repetition = 0 to denote
+ // the start of a list slot.
+ // Thus we separate the logic for repetitions and definitions,
+ // as we do not need to do this for them.
+ // In the example above, we could have `num_values` being 1 because we
want to only read
+ // one value, but we will return that we need to read 2 values to fill
that 1 list slot.
+ match self.level_type {
Review comment:
this match operation can be avoided if the `LevelValueDecoder` is split
into separate decoders for repetition and definition levels
##########
File path: parquet/src/arrow/arrow_array_reader.rs
##########
@@ -754,20 +786,40 @@ impl<I: Iterator<Item = Box<dyn ValueDecoder>>>
ValueDecoder
}
}
- Ok(num_values - values_to_read)
+ Ok((num_values - values_to_read, child_values_to_read))
Review comment:
it appears the `child_values_to_read` return value is not actually used
anywhere; is that on purpose?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]