Re: [PR] feat(parquet): add ParquetPushDecoder::peek_next_row_group() [arrow-rs]

via GitHub Thu, 18 Jun 2026 05:33:00 -0700


Copilot commented on code in PR #10158:
URL: https://github.com/apache/arrow-rs/pull/10158#discussion_r3435772847



##########
parquet/src/arrow/push_decoder/mod.rs:
##########
@@ -1743,6 +1776,93 @@ mod test {
         expect_finished(decoder.try_decode());
     }
 
+    /// `peek_next_row_group` reports the index of the row group the
+    /// next `try_next_reader` call will hand back, matching the
+    /// frontier's internal skip logic.
+    #[test]
+    fn test_peek_next_row_group_basic() {
+        let mut decoder = 
ParquetPushDecoderBuilder::try_new_decoder(test_file_parquet_metadata())
+            .unwrap()
+            .build()
+            .unwrap();
+
+        // Two row groups (0, 1). At boundary before any read, peek should
+        // see RG 0.
+        assert_eq!(decoder.peek_next_row_group(), Some(0));
+        assert!(decoder.is_at_row_group_boundary());
+
+        let ranges = expect_needs_data(decoder.try_next_reader());
+        push_ranges_to_decoder(&mut decoder, ranges);
+        let reader = expect_data(decoder.try_next_reader());
+        // Once the reader for RG 0 has been handed off, the decoder is
+        // back at a boundary waiting for RG 1 — peek must reflect that
+        // (the active reader lives outside the decoder).
+        assert!(decoder.is_at_row_group_boundary());
+        assert_eq!(decoder.peek_next_row_group(), Some(1));
+
+        // Drain RG 0's reader and consume RG 1.
+        for batch in reader {
+            let _ = batch.unwrap();
+        }
+        let ranges = expect_needs_data(decoder.try_next_reader());
+        push_ranges_to_decoder(&mut decoder, ranges);
+        let reader = expect_data(decoder.try_next_reader());
+        for batch in reader {
+            let _ = batch.unwrap();
+        }
+
+        // No row groups left.
+        assert_eq!(decoder.peek_next_row_group(), None);
+    }
+
+    /// `peek_next_row_group` honors `with_row_groups` — restricting the
+    /// scan to a single row group means peek reports only that one and
+    /// then `None`.
+    #[test]
+    fn test_peek_next_row_group_respects_with_row_groups() {
+        let decoder = 
ParquetPushDecoderBuilder::try_new_decoder(test_file_parquet_metadata())
+            .unwrap()
+            .with_row_groups(vec![1])
+            .build()
+            .unwrap();
+
+        assert_eq!(decoder.peek_next_row_group(), Some(1));
+    }
+
+    /// When a row-selection segment leaves the next row group with zero
+    /// selected rows, `peek_next_row_group` mirrors
+    /// `next_readable_row_group`'s skip: it returns the *following*
+    /// row group instead of the empty one.
+    #[test]

Review Comment:
   The new API documents that peeking simulates offset/limit budget skipping, 
but the added tests don't cover budget-driven row-group skipping. Adding a 
small test for `with_offset` (or `with_limit`) would help ensure 
`peek_next_row_group` stays aligned with `next_readable_row_group`'s budget 
logic.



##########
parquet/src/arrow/push_decoder/remaining.rs:
##########
@@ -93,6 +93,62 @@ impl RowGroupFrontier {
         self.budget = budget;
     }
 
+    /// Peek at the next row-group index `next_readable_row_group` would
+    /// hand out, without mutating any state. Returns `None` if every
+    /// remaining row group would be skipped under the current
+    /// selection/budget, or if the queue is empty.
+    ///
+    /// Mirrors the structure of `next_readable_row_group` but only walks
+    /// borrowed state — used by 
[`super::ParquetPushDecoder::peek_next_row_group`]
+    /// to let adaptive callers (e.g. dynamic row-group pruners or per-RG
+    /// `RowFilter` toggles) keep their per-RG state in lock-step with
+    /// the reader the decoder is about to emit.
+    fn peek_next_row_group(&self) -> Option<usize> {
+        // Short-circuit: budget exhausted or selection drained ⇒ same
+        // outcome as `next_readable_row_group`'s early return.
+        if self.budget.is_exhausted()
+            || self
+                .selection
+                .as_ref()
+                .is_some_and(|selection| selection.row_count() == 0)
+        {
+            return None;
+        }
+
+        // We may have to walk past row groups whose split selection is
+        // empty. Cloning the selection lets us run the same `split_off`
+        // logic without disturbing the real one.
+        let mut selection = self.selection.clone();
+        let mut budget = self.budget;
+        for &row_group_idx in &self.row_groups {
+            let row_count = self.row_group_num_rows(row_group_idx).ok()?;
+            let selected_rows = match selection.as_mut() {

Review Comment:
   `peek_next_row_group` currently converts `row_group_num_rows` errors into 
`None` via `.ok()?`. This can mask real metadata problems (e.g. row-count 
overflow on 32-bit targets), making the decoder look "finished" even though 
`try_next_reader` would return `Err`. Consider returning `Result<Option<usize>, 
ParquetError>` (and plumbing it through 
`RemainingRowGroups`/`ParquetPushDecoder`) so peek and read paths report errors 
consistently.



##########
parquet/src/arrow/push_decoder/remaining.rs:
##########
@@ -93,6 +93,62 @@ impl RowGroupFrontier {
         self.budget = budget;
     }
 
+    /// Peek at the next row-group index `next_readable_row_group` would
+    /// hand out, without mutating any state. Returns `None` if every
+    /// remaining row group would be skipped under the current
+    /// selection/budget, or if the queue is empty.
+    ///
+    /// Mirrors the structure of `next_readable_row_group` but only walks
+    /// borrowed state — used by 
[`super::ParquetPushDecoder::peek_next_row_group`]
+    /// to let adaptive callers (e.g. dynamic row-group pruners or per-RG
+    /// `RowFilter` toggles) keep their per-RG state in lock-step with
+    /// the reader the decoder is about to emit.
+    fn peek_next_row_group(&self) -> Option<usize> {
+        // Short-circuit: budget exhausted or selection drained ⇒ same
+        // outcome as `next_readable_row_group`'s early return.
+        if self.budget.is_exhausted()
+            || self
+                .selection
+                .as_ref()
+                .is_some_and(|selection| selection.row_count() == 0)
+        {
+            return None;
+        }
+
+        // We may have to walk past row groups whose split selection is
+        // empty. Cloning the selection lets us run the same `split_off`
+        // logic without disturbing the real one.
+        let mut selection = self.selection.clone();
+        let mut budget = self.budget;
+        for &row_group_idx in &self.row_groups {
+            let row_count = self.row_group_num_rows(row_group_idx).ok()?;
+            let selected_rows = match selection.as_mut() {
+                Some(remaining) => {
+                    let rg_segment = remaining.split_off(row_count);
+                    rg_segment.row_count()
+                }
+                None => row_count,
+            };
+            if selected_rows == 0 {
+                // Same skip path as `next_readable_row_group`: row
+                // selection drained for this RG, move on.
+                continue;
+            }
+            if self.has_predicates {
+                // Predicates → always read, regardless of budget.
+                return Some(row_group_idx);
+            }

Review Comment:
   The comment "Predicates → always read, regardless of budget" is slightly 
misleading since the function still returns `None` when the budget is 
exhausted. Rewording this makes the intended meaning ("budget-based row-group 
skipping is disabled when predicates are present") clearer.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] feat(parquet): add ParquetPushDecoder::peek_next_row_group() [arrow-rs]

Reply via email to