Copilot commented on code in PR #9504:
URL: https://github.com/apache/arrow-rs/pull/9504#discussion_r2884471138
##########
parquet/src/arrow/array_reader/byte_array.rs:
##########
@@ -489,22 +498,33 @@ impl ByteArrayDecoderDeltaLength {
output.values.reserve(total_bytes);
let mut current_offset = self.data_offset;
+ let mut read = 0;
for length in src_lengths {
- let end_offset = current_offset + *length as usize;
+ let value_len = *length as usize;
+ let end_offset = current_offset + value_len;
+
+ // Stop early rather than overflow the 32-bit offset buffer.
+ // length_offset / data_offset are only advanced by what was
+ // actually consumed, so the next read() resumes correctly.
+ if output.would_overflow(value_len) {
+ break;
+ }
Review Comment:
`output.values.reserve(total_bytes)` happens before any `would_overflow`
checks. For large `total_bytes` (the exact scenario this PR targets), this can
attempt to reserve multi‑GiB memory and potentially abort/OOM even though the
decoder will stop early. Consider capping the reservation to the remaining
representable offset range (e.g., `min(total_bytes, max_appendable_bytes)`) or
deferring reservation until after you’ve determined how many values will
actually be appended in this call.
##########
parquet/src/arrow/array_reader/byte_array.rs:
##########
@@ -397,17 +397,26 @@ impl ByteArrayDecoderPlain {
return Err(ParquetError::EOF("eof decoding byte array".into()));
}
+ // Stop early rather than overflow the 32-bit offset buffer.
+ // self.offset is NOT advanced, so the next read() call resumes
+ // from this value in the following batch.
+ if output.would_overflow(len as usize) {
+ break;
+ }
+
output.try_push(&buf[start_offset..end_offset], self.validate_utf8)?;
self.offset = end_offset;
read += 1;
}
- self.max_remaining_values -= to_read;
+ // Use actual reads, not the requested amount, so max_remaining_values
+ // stays correct when we stop early.
+ self.max_remaining_values -= read;
if self.validate_utf8 {
- output.check_valid_utf8(initial_values_length)?;
+ output.check_valid_utf8(initial_values_length)?
Review Comment:
The `if self.validate_utf8 { ... }` block before `Ok(read)` lost its trailing
semicolon. Note this still compiles — a block-form `if` whose value is `()` is
a valid expression statement in Rust without a `;` — but the dropped semicolon
looks accidental; restore it for consistency with the surrounding code.
```suggestion
output.check_valid_utf8(initial_values_length)?;
```
##########
parquet/src/arrow/array_reader/byte_array.rs:
##########
@@ -489,22 +498,33 @@ impl ByteArrayDecoderDeltaLength {
output.values.reserve(total_bytes);
let mut current_offset = self.data_offset;
+ let mut read = 0;
for length in src_lengths {
- let end_offset = current_offset + *length as usize;
+ let value_len = *length as usize;
+ let end_offset = current_offset + value_len;
+
+ // Stop early rather than overflow the 32-bit offset buffer.
+ // length_offset / data_offset are only advanced by what was
+ // actually consumed, so the next read() resumes correctly.
+ if output.would_overflow(value_len) {
+ break;
+ }
+
output.try_push(
&self.data.as_ref()[current_offset..end_offset],
self.validate_utf8,
)?;
current_offset = end_offset;
+ read += 1;
}
self.data_offset = current_offset;
- self.length_offset += to_read;
+ self.length_offset += read;
if self.validate_utf8 {
- output.check_valid_utf8(initial_values_length)?;
+ output.check_valid_utf8(initial_values_length)?
Review Comment:
The `if self.validate_utf8 { ... }` block before `Ok(read)` lost its trailing
semicolon. This still compiles (a trailing block-form `if` evaluating to `()`
is a valid statement without `;`), but the change appears unintentional; add
the `;` back to keep the statement form consistent.
```suggestion
output.check_valid_utf8(initial_values_length)?;
```
##########
parquet/src/arrow/array_reader/byte_array.rs:
##########
@@ -580,9 +622,45 @@ impl ByteArrayDecoderDictionary {
return Ok(0);
}
- self.decoder.read(len, |keys| {
-         output.extend_from_dictionary(keys, dict.offsets.as_slice(), dict.values.as_slice())
- })
+ let dict_offsets = dict.offsets.as_slice();
+ let dict_values = dict.values.as_slice();
+ let mut total_read = 0;
+
+ // Process one key at a time so we can stop cleanly when the output
+ // buffer would overflow. When the closure returns Err the
+ // DictIndexDecoder does NOT advance its position, so the same key will
+ // be retried in the next batch – no values are lost or skipped.
+ while total_read < len {
+ let mut overflow = false;
+ let n = self.decoder.read(1, |keys| {
+ let key = keys[0] as usize;
+ if key + 1 >= dict_offsets.len() {
+ return Err(general_err!(
+ "dictionary key beyond bounds of dictionary: 0..{}",
+ dict_offsets.len().saturating_sub(1)
+ ));
+ }
+ let start = dict_offsets[key].as_usize();
+ let end = dict_offsets[key + 1].as_usize();
+ let value = &dict_values[start..end];
+
+ if output.would_overflow(value.len()) {
+ overflow = true;
+                 return Err(general_err!("index overflow decoding byte array"));
+ }
+             // Dictionary values were validated at dictionary-page decode time.
+ output.try_push(value, false)
+ });
Review Comment:
This dictionary path now calls `self.decoder.read(1, ...)` in a loop, which
adds substantial per-value overhead compared to decoding indices in batches. If
possible, consider decoding a larger chunk of keys at a time and stopping
without losing already-consumed keys (e.g., by tracking how many keys in the
provided slice were successfully processed and advancing the decoder by that
amount), or otherwise document/justify the performance tradeoff since
dictionary pages are often hot paths.
##########
parquet/src/arrow/array_reader/byte_array.rs:
##########
@@ -580,9 +622,45 @@ impl ByteArrayDecoderDictionary {
return Ok(0);
}
- self.decoder.read(len, |keys| {
-         output.extend_from_dictionary(keys, dict.offsets.as_slice(), dict.values.as_slice())
- })
+ let dict_offsets = dict.offsets.as_slice();
+ let dict_values = dict.values.as_slice();
+ let mut total_read = 0;
+
+ // Process one key at a time so we can stop cleanly when the output
+ // buffer would overflow. When the closure returns Err the
+ // DictIndexDecoder does NOT advance its position, so the same key will
+ // be retried in the next batch – no values are lost or skipped.
+ while total_read < len {
+ let mut overflow = false;
+ let n = self.decoder.read(1, |keys| {
+ let key = keys[0] as usize;
+ if key + 1 >= dict_offsets.len() {
+ return Err(general_err!(
+ "dictionary key beyond bounds of dictionary: 0..{}",
+ dict_offsets.len().saturating_sub(1)
+ ));
+ }
Review Comment:
`let key = keys[0] as usize;` can wrap negative dictionary indices to huge
`usize` values, and `key + 1` can overflow (e.g. key == -1) causing a panic in
debug builds. Convert with `usize::try_from(keys[0])` (returning an error on
negatives) and avoid `key + 1` overflow by comparing `key >=
dict_offsets.len().saturating_sub(1)` or using `checked_add(1)`.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]