zhuqi-lucas commented on code in PR #7537: URL: https://github.com/apache/arrow-rs/pull/7537#discussion_r2104577693
########## parquet/src/arrow/arrow_reader/mod.rs: ########## @@ -808,54 +808,45 @@ impl ParquetRecordBatchReader { /// Returns `Result<Option<..>>` rather than `Option<Result<..>>` to /// simplify error handling with `?` fn next_inner(&mut self) -> Result<Option<RecordBatch>> { + let mut end_of_stream = false; let mut read_records = 0; let batch_size = self.batch_size(); - match self.read_plan.selection_mut() { - Some(selection) => { - while read_records < batch_size && !selection.is_empty() { - let front = selection.pop_front().unwrap(); - if front.skip { - let skipped = self.array_reader.skip_records(front.row_count)?; - - if skipped != front.row_count { - return Err(general_err!( - "failed to skip rows, expected {}, got {}", - front.row_count, - skipped - )); - } - continue; - } + while read_records < batch_size { + let Some(front) = self.read_plan.next() else { + end_of_stream = true; + break; + }; - //Currently, when RowSelectors with row_count = 0 are included then its interpreted as end of reader. - //Fix is to skip such entries. See https://github.com/apache/arrow-rs/issues/2669 - if front.row_count == 0 { - continue; - } + if front.skip { + let skipped = self.array_reader.skip_records(front.row_count)?; - // try to read record - let need_read = batch_size - read_records; - let to_read = match front.row_count.checked_sub(need_read) { - Some(remaining) if remaining != 0 => { - // if page row count less than batch_size we must set batch size to page row count. - // add check avoid dead loop - selection.push_front(RowSelector::select(remaining)); - need_read - } - _ => front.row_count, - }; - match self.array_reader.read_records(to_read)? { - 0 => break, - rec => read_records += rec, - }; + if skipped != front.row_count { + return Err(general_err!( + "Internal Error: failed to skip rows, expected {}, got {}", + front.row_count, + skipped + )); + } + } else { + let read = self.array_reader.read_records(front.row_count)?; Review Comment: > I am not quite sure what you mean here -- if `read < row_count` I think that means the array is exhausted and the row group is done. > > What sort of fast path would it be? This right @alamb , sorry for that i am not describing it correctly, i mean do we need to break early for it. It looks like not needed. ########## parquet/src/arrow/arrow_reader/mod.rs: ########## @@ -808,54 +808,45 @@ impl ParquetRecordBatchReader { /// Returns `Result<Option<..>>` rather than `Option<Result<..>>` to /// simplify error handling with `?` fn next_inner(&mut self) -> Result<Option<RecordBatch>> { + let mut end_of_stream = false; let mut read_records = 0; let batch_size = self.batch_size(); - match self.read_plan.selection_mut() { - Some(selection) => { - while read_records < batch_size && !selection.is_empty() { - let front = selection.pop_front().unwrap(); - if front.skip { - let skipped = self.array_reader.skip_records(front.row_count)?; - - if skipped != front.row_count { - return Err(general_err!( - "failed to skip rows, expected {}, got {}", - front.row_count, - skipped - )); - } - continue; - } + while read_records < batch_size { + let Some(front) = self.read_plan.next() else { + end_of_stream = true; + break; + }; - //Currently, when RowSelectors with row_count = 0 are included then its interpreted as end of reader. - //Fix is to skip such entries. See https://github.com/apache/arrow-rs/issues/2669 - if front.row_count == 0 { - continue; - } + if front.skip { + let skipped = self.array_reader.skip_records(front.row_count)?; - // try to read record - let need_read = batch_size - read_records; - let to_read = match front.row_count.checked_sub(need_read) { - Some(remaining) if remaining != 0 => { - // if page row count less than batch_size we must set batch size to page row count. - // add check avoid dead loop - selection.push_front(RowSelector::select(remaining)); - need_read - } - _ => front.row_count, - }; - match self.array_reader.read_records(to_read)? { - 0 => break, - rec => read_records += rec, - }; + if skipped != front.row_count { + return Err(general_err!( + "Internal Error: failed to skip rows, expected {}, got {}", + front.row_count, + skipped + )); + } + } else { + let read = self.array_reader.read_records(front.row_count)?; Review Comment: > I am not quite sure what you mean here -- if `read < row_count` I think that means the array is exhausted and the row group is done. > > What sort of fast path would it be? That's right @alamb , sorry for that i am not describing it correctly, i mean do we need to break early for it. It looks like not needed. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org