This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 41ccf2d3fc perf(parquet): Defer fixed length byte array buffer alloc
and skip zero-batch init (#9756)
41ccf2d3fc is described below
commit 41ccf2d3fcc715335c06fd5a583b5ddebcc7246f
Author: Lanqing Yang <[email protected]>
AuthorDate: Wed Apr 22 11:12:11 2026 -0700
perf(parquet): Defer fixed length byte array buffer alloc and skip
zero-batch init (#9756)
- follow ups from https://github.com/apache/arrow-rs/pull/9093
- FixedLenByteArrayBuffer: preserve the value-count hint in
`with_capacity` and defer the buffer allocation to the first
`ValueDecoder::read`, when `byte_length` is known. This lets the buffer
be sized exactly once (`values_capacity * byte_length`) instead of
growing incrementally from `Vec::new()`.
- RecordReader::read_one_batch: short-circuit with `Ok(0)` when
`batch_size == 0` to avoid the lazy buffer init on an end-of-stream
read.
Signed-off-by: lyang24 <[email protected]>
---
.../src/arrow/array_reader/fixed_len_byte_array.rs | 24 ++++++++++++++++++----
parquet/src/arrow/record_reader/mod.rs | 3 +++
2 files changed, 23 insertions(+), 4 deletions(-)
diff --git a/parquet/src/arrow/array_reader/fixed_len_byte_array.rs b/parquet/src/arrow/array_reader/fixed_len_byte_array.rs
index d562c88cb8..f7e83510cf 100644
--- a/parquet/src/arrow/array_reader/fixed_len_byte_array.rs
+++ b/parquet/src/arrow/array_reader/fixed_len_byte_array.rs
@@ -265,6 +265,9 @@ struct FixedLenByteArrayBuffer {
buffer: Vec<u8>,
/// The length of each element in bytes
byte_length: Option<usize>,
+ /// Preserved value-count hint used to allocate `buffer` once `byte_length`
+ /// becomes known on the first decode.
+ values_capacity: Option<usize>,
}
#[inline]
@@ -291,12 +294,13 @@ fn move_values<F>(
}
impl ValuesBuffer for FixedLenByteArrayBuffer {
- fn with_capacity(_capacity: usize) -> Self {
- // byte_length is not known at trait level, so we return a default buffer
- // The decoder will pre-allocate when it knows both capacity and byte_length
+ fn with_capacity(capacity: usize) -> Self {
+ // `byte_length` is not known initially, so preserve the value-count
+ // hint so the first decode can allocate the exact byte capacity.
Self {
buffer: Vec::new(),
byte_length: None,
+ values_capacity: Some(capacity),
}
}
@@ -419,7 +423,19 @@ impl ColumnValueDecoder for ValueDecoder {
fn read(&mut self, out: &mut Self::Buffer, num_values: usize) -> Result<usize> {
match out.byte_length {
Some(x) => assert_eq!(x, self.byte_length),
- None => out.byte_length = Some(self.byte_length),
+ None => {
+ out.byte_length = Some(self.byte_length);
+ // TODO: collapse to a let-chain once MSRV ≥ 1.88
+ // (`if out.buffer.is_empty() && let Some(cap) = out.values_capacity.take()`)
+ if out.buffer.is_empty() {
+ if let Some(values_capacity) = out.values_capacity.take() {
+ // now that the byte length per output element is known,
+ // allocate the actual needed space.
+ let byte_capacity = values_capacity.saturating_mul(self.byte_length);
+ out.buffer = Vec::with_capacity(byte_capacity);
+ }
+ }
+ }
}
match self.decoder.as_mut().unwrap() {
diff --git a/parquet/src/arrow/record_reader/mod.rs b/parquet/src/arrow/record_reader/mod.rs
index a33b489c62..d2d1326239 100644
--- a/parquet/src/arrow/record_reader/mod.rs
+++ b/parquet/src/arrow/record_reader/mod.rs
@@ -219,6 +219,9 @@ where
/// Try to read one batch of data returning the number of records read
fn read_one_batch(&mut self, batch_size: usize) -> Result<usize> {
+ if batch_size == 0 {
+ return Ok(0);
+ }
// Update capacity hint to the largest batch size seen
if batch_size > self.capacity_hint {
self.capacity_hint = batch_size;