Copilot commented on code in PR #9504:
URL: https://github.com/apache/arrow-rs/pull/9504#discussion_r2884471138
##########
parquet/src/arrow/array_reader/byte_array.rs:
##########
@@ -489,22 +498,33 @@ impl ByteArrayDecoderDeltaLength {
output.values.reserve(total_bytes);
let mut current_offset = self.data_offset;
+ let mut read = 0;
for length in src_lengths {
- let end_offset = current_offset + *length as usize;
+ let value_len = *length as usize;
+ let end_offset = current_offset + value_len;
+
+ // Stop early rather than overflow the 32-bit offset buffer.
+ // length_offset / data_offset are only advanced by what was
+ // actually consumed, so the next read() resumes correctly.
+ if output.would_overflow(value_len) {
+ break;
+ }
Review Comment:
`output.values.reserve(total_bytes)` happens before any `would_overflow`
checks. For large `total_bytes` (the exact scenario this PR targets), this can
attempt to reserve multi‑GiB memory and potentially abort/OOM even though the
decoder will stop early. Consider capping the reservation to the remaining
representable offset range (e.g., `min(total_bytes, max_appendable_bytes)`) or
deferring reservation until after you’ve determined how many values will
actually be appended in this call.
##########
parquet/src/arrow/array_reader/byte_array.rs:
##########
@@ -397,17 +397,26 @@ impl ByteArrayDecoderPlain {
return Err(ParquetError::EOF("eof decoding byte array".into()));
}
+ // Stop early rather than overflow the 32-bit offset buffer.
+ // self.offset is NOT advanced, so the next read() call resumes
+ // from this value in the following batch.
+ if output.would_overflow(len as usize) {
+ break;
+ }
+
output.try_push(&buf[start_offset..end_offset], self.validate_utf8)?;
self.offset = end_offset;
read += 1;
}
- self.max_remaining_values -= to_read;
+ // Use actual reads, not the requested amount, so max_remaining_values
+ // stays correct when we stop early.
+ self.max_remaining_values -= read;
if self.validate_utf8 {
- output.check_valid_utf8(initial_values_length)?;
+ output.check_valid_utf8(initial_values_length)?
Review Comment:
The `if self.validate_utf8 { ... }` block before `Ok(read)` lost its trailing
semicolon. Note this still compiles — a block-form `if` whose value is `()` is
a valid expression statement in Rust without a `;` — but the dropped semicolon
looks accidental; restore it for consistency with the surrounding code.
```suggestion
output.check_valid_utf8(initial_values_length)?;
```
##########
parquet/src/arrow/array_reader/byte_array.rs:
##########
@@ -489,22 +498,33 @@ impl ByteArrayDecoderDeltaLength {
output.values.reserve(total_bytes);
let mut current_offset = self.data_offset;
+ let mut read = 0;
for length in src_lengths {
- let end_offset = current_offset + *length as usize;
+ let value_len = *length as usize;
+ let end_offset = current_offset + value_len;
+
+ // Stop early rather than overflow the 32-bit offset buffer.
+ // length_offset / data_offset are only advanced by what was
+ // actually consumed, so the next read() resumes correctly.
+ if output.would_overflow(value_len) {
+ break;
+ }
+
output.try_push(
&self.data.as_ref()[current_offset..end_offset],
self.validate_utf8,
)?;
current_offset = end_offset;
+ read += 1;
}
self.data_offset = current_offset;
- self.length_offset += to_read;
+ self.length_offset += read;
if self.validate_utf8 {
- output.check_valid_utf8(initial_values_length)?;
+ output.check_valid_utf8(initial_values_length)?
Review Comment:
The `if self.validate_utf8 { ... }` block before `Ok(read)` lost its trailing
semicolon. This still compiles (a trailing block-form `if` evaluating to `()`
is a valid statement without `;`), but the change appears unintentional; add
the `;` back to keep the statement form consistent.
```suggestion
output.check_valid_utf8(initial_values_length)?;
```
##########
parquet/src/arrow/array_reader/byte_array.rs:
##########
@@ -580,9 +622,45 @@ impl ByteArrayDecoderDictionary {
return Ok(0);
}
- self.decoder.read(len, |keys| {
-         output.extend_from_dictionary(keys, dict.offsets.as_slice(), dict.values.as_slice())
- })
+ let dict_offsets = dict.offsets.as_slice();
+ let dict_values = dict.values.as_slice();
+ let mut total_read = 0;
+
+ // Process one key at a time so we can stop cleanly when the output
+ // buffer would overflow. When the closure returns Err the
+ // DictIndexDecoder does NOT advance its position, so the same key will
+ // be retried in the next batch – no values are lost or skipped.
+ while total_read < len {
+ let mut overflow = false;
+ let n = self.decoder.read(1, |keys| {
+ let key = keys[0] as usize;
+ if key + 1 >= dict_offsets.len() {
+ return Err(general_err!(
+ "dictionary key beyond bounds of dictionary: 0..{}",
+ dict_offsets.len().saturating_sub(1)
+ ));
+ }
+ let start = dict_offsets[key].as_usize();
+ let end = dict_offsets[key + 1].as_usize();
+ let value = &dict_values[start..end];
+
+ if output.would_overflow(value.len()) {
+ overflow = true;
+                 return Err(general_err!("index overflow decoding byte array"));
+ }
+             // Dictionary values were validated at dictionary-page decode time.
+ output.try_push(value, false)
+ });
Review Comment:
This dictionary path now calls `self.decoder.read(1, ...)` in a loop, which
adds substantial per-value overhead compared to decoding indices in batches. If
possible, consider decoding a larger chunk of keys at a time and stopping
without losing already-consumed keys (e.g., by tracking how many keys in the
provided slice were successfully processed and advancing the decoder by that
amount), or otherwise document/justify the performance tradeoff since
dictionary pages are often hot paths.
##########
parquet/src/arrow/array_reader/byte_array.rs:
##########
@@ -580,9 +622,45 @@ impl ByteArrayDecoderDictionary {
return Ok(0);
}
- self.decoder.read(len, |keys| {
-         output.extend_from_dictionary(keys, dict.offsets.as_slice(), dict.values.as_slice())
- })
+ let dict_offsets = dict.offsets.as_slice();
+ let dict_values = dict.values.as_slice();
+ let mut total_read = 0;
+
+ // Process one key at a time so we can stop cleanly when the output
+ // buffer would overflow. When the closure returns Err the
+ // DictIndexDecoder does NOT advance its position, so the same key will
+ // be retried in the next batch – no values are lost or skipped.
+ while total_read < len {
+ let mut overflow = false;
+ let n = self.decoder.read(1, |keys| {
+ let key = keys[0] as usize;
+ if key + 1 >= dict_offsets.len() {
+ return Err(general_err!(
+ "dictionary key beyond bounds of dictionary: 0..{}",
+ dict_offsets.len().saturating_sub(1)
+ ));
+ }
Review Comment:
`let key = keys[0] as usize;` can wrap negative dictionary indices to huge
`usize` values, and `key + 1` can overflow (e.g. key == -1) causing a panic in
debug builds. Convert with `usize::try_from(keys[0])` (returning an error on
negatives) and avoid `key + 1` overflow by comparing `key >=
dict_offsets.len().saturating_sub(1)` or using `checked_add(1)`.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]