zhuqi-lucas commented on code in PR #7537:
URL: https://github.com/apache/arrow-rs/pull/7537#discussion_r2104577693


##########
parquet/src/arrow/arrow_reader/mod.rs:
##########
@@ -808,54 +808,45 @@ impl ParquetRecordBatchReader {
     /// Returns `Result<Option<..>>` rather than `Option<Result<..>>` to
     /// simplify error handling with `?`
     fn next_inner(&mut self) -> Result<Option<RecordBatch>> {
+        let mut end_of_stream = false;
         let mut read_records = 0;
         let batch_size = self.batch_size();
-        match self.read_plan.selection_mut() {
-            Some(selection) => {
-                while read_records < batch_size && !selection.is_empty() {
-                    let front = selection.pop_front().unwrap();
-                    if front.skip {
-                        let skipped = 
self.array_reader.skip_records(front.row_count)?;
-
-                        if skipped != front.row_count {
-                            return Err(general_err!(
-                                "failed to skip rows, expected {}, got {}",
-                                front.row_count,
-                                skipped
-                            ));
-                        }
-                        continue;
-                    }
+        while read_records < batch_size {
+            let Some(front) = self.read_plan.next() else {
+                end_of_stream = true;
+                break;
+            };
 
-                    //Currently, when RowSelectors with row_count = 0 are 
included then its interpreted as end of reader.
-                    //Fix is to skip such entries. See 
https://github.com/apache/arrow-rs/issues/2669
-                    if front.row_count == 0 {
-                        continue;
-                    }
+            if front.skip {
+                let skipped = self.array_reader.skip_records(front.row_count)?;
 
-                    // try to read record
-                    let need_read = batch_size - read_records;
-                    let to_read = match front.row_count.checked_sub(need_read) 
{
-                        Some(remaining) if remaining != 0 => {
-                            // if page row count less than batch_size we must 
set batch size to page row count.
-                            // add check avoid dead loop
-                            
selection.push_front(RowSelector::select(remaining));
-                            need_read
-                        }
-                        _ => front.row_count,
-                    };
-                    match self.array_reader.read_records(to_read)? {
-                        0 => break,
-                        rec => read_records += rec,
-                    };
+                if skipped != front.row_count {
+                    return Err(general_err!(
+                        "Internal Error: failed to skip rows, expected {}, got 
{}",
+                        front.row_count,
+                        skipped
+                    ));
+                }
+            } else {
+                let read = self.array_reader.read_records(front.row_count)?;

Review Comment:
   > I am not quite sure what you mean here -- if `read < row_count` I think 
that means the array is exhausted and the row group is done.
   > 
   > What sort of fast path would it be?
   
   This right @alamb , sorry for that i am not describing it correctly, i mean 
do we need to break early for it. It looks like not needed.
   
   



##########
parquet/src/arrow/arrow_reader/mod.rs:
##########
@@ -808,54 +808,45 @@ impl ParquetRecordBatchReader {
     /// Returns `Result<Option<..>>` rather than `Option<Result<..>>` to
     /// simplify error handling with `?`
     fn next_inner(&mut self) -> Result<Option<RecordBatch>> {
+        let mut end_of_stream = false;
         let mut read_records = 0;
         let batch_size = self.batch_size();
-        match self.read_plan.selection_mut() {
-            Some(selection) => {
-                while read_records < batch_size && !selection.is_empty() {
-                    let front = selection.pop_front().unwrap();
-                    if front.skip {
-                        let skipped = 
self.array_reader.skip_records(front.row_count)?;
-
-                        if skipped != front.row_count {
-                            return Err(general_err!(
-                                "failed to skip rows, expected {}, got {}",
-                                front.row_count,
-                                skipped
-                            ));
-                        }
-                        continue;
-                    }
+        while read_records < batch_size {
+            let Some(front) = self.read_plan.next() else {
+                end_of_stream = true;
+                break;
+            };
 
-                    //Currently, when RowSelectors with row_count = 0 are 
included then its interpreted as end of reader.
-                    //Fix is to skip such entries. See 
https://github.com/apache/arrow-rs/issues/2669
-                    if front.row_count == 0 {
-                        continue;
-                    }
+            if front.skip {
+                let skipped = self.array_reader.skip_records(front.row_count)?;
 
-                    // try to read record
-                    let need_read = batch_size - read_records;
-                    let to_read = match front.row_count.checked_sub(need_read) 
{
-                        Some(remaining) if remaining != 0 => {
-                            // if page row count less than batch_size we must 
set batch size to page row count.
-                            // add check avoid dead loop
-                            
selection.push_front(RowSelector::select(remaining));
-                            need_read
-                        }
-                        _ => front.row_count,
-                    };
-                    match self.array_reader.read_records(to_read)? {
-                        0 => break,
-                        rec => read_records += rec,
-                    };
+                if skipped != front.row_count {
+                    return Err(general_err!(
+                        "Internal Error: failed to skip rows, expected {}, got 
{}",
+                        front.row_count,
+                        skipped
+                    ));
+                }
+            } else {
+                let read = self.array_reader.read_records(front.row_count)?;

Review Comment:
   > I am not quite sure what you mean here -- if `read < row_count` I think 
that means the array is exhausted and the row group is done.
   > 
   > What sort of fast path would it be?
   
   That's right @alamb , sorry for that i am not describing it correctly, i 
mean do we need to break early for it. It looks like not needed.
   
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to