(arrow-rs) branch main updated: perf(parquet): Defer fixed length byte array buffer alloc and skip zero-batch init (#9756)

alamb Wed, 22 Apr 2026 11:12:31 -0700

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git



The following commit(s) were added to refs/heads/main by this push:
     new 41ccf2d3fc perf(parquet): Defer fixed length byte array buffer alloc and skip zero-batch init (#9756)
41ccf2d3fc is described below

commit 41ccf2d3fcc715335c06fd5a583b5ddebcc7246f
Author: Lanqing Yang <[email protected]>
AuthorDate: Wed Apr 22 11:12:11 2026 -0700

    perf(parquet): Defer fixed length byte array buffer alloc and skip zero-batch init (#9756)
    
    - follow ups from https://github.com/apache/arrow-rs/pull/9093
    
    - FixedLenByteArrayBuffer: preserve the value-count hint in
    `with_capacity` and defer the buffer allocation to the first
    `ValueDecoder::read`, when `byte_length` is known. This lets the buffer
    be sized exactly once (`values_capacity * byte_length`) instead of
    growing incrementally from `Vec::new()`.
    - RecordReader::read_one_batch: short-circuit with `Ok(0)` when
    `batch_size == 0` to avoid the lazy buffer init on an end-of-stream
    read.
    
    Signed-off-by: lyang24 <[email protected]>
---
 .../src/arrow/array_reader/fixed_len_byte_array.rs | 24 ++++++++++++++++++----
 parquet/src/arrow/record_reader/mod.rs             |  3 +++
 2 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/parquet/src/arrow/array_reader/fixed_len_byte_array.rs b/parquet/src/arrow/array_reader/fixed_len_byte_array.rs
index d562c88cb8..f7e83510cf 100644
--- a/parquet/src/arrow/array_reader/fixed_len_byte_array.rs
+++ b/parquet/src/arrow/array_reader/fixed_len_byte_array.rs
@@ -265,6 +265,9 @@ struct FixedLenByteArrayBuffer {
     buffer: Vec<u8>,
     /// The length of each element in bytes
     byte_length: Option<usize>,
+    /// Preserved value-count hint used to allocate `buffer` once `byte_length`
+    /// becomes known on the first decode.
+    values_capacity: Option<usize>,
 }
 
 #[inline]
@@ -291,12 +294,13 @@ fn move_values<F>(
 }
 
 impl ValuesBuffer for FixedLenByteArrayBuffer {
-    fn with_capacity(_capacity: usize) -> Self {
-        // byte_length is not known at trait level, so we return a default buffer
-        // The decoder will pre-allocate when it knows both capacity and byte_length
+    fn with_capacity(capacity: usize) -> Self {
+        // `byte_length` is not known initially, so preserve the value-count
+        // hint so the first decode can allocate the exact byte capacity.
         Self {
             buffer: Vec::new(),
             byte_length: None,
+            values_capacity: Some(capacity),
         }
     }
 
@@ -419,7 +423,19 @@ impl ColumnValueDecoder for ValueDecoder {
     fn read(&mut self, out: &mut Self::Buffer, num_values: usize) -> Result<usize> {
         match out.byte_length {
             Some(x) => assert_eq!(x, self.byte_length),
-            None => out.byte_length = Some(self.byte_length),
+            None => {
+                out.byte_length = Some(self.byte_length);
+                // TODO: collapse to a let-chain once MSRV ≥ 1.88
+                // (`if out.buffer.is_empty() && let Some(cap) = out.values_capacity.take()`)
+                if out.buffer.is_empty() {
+                    if let Some(values_capacity) = out.values_capacity.take() {
+                        // now that the byte length per output element is known,
+                        // allocate the actual needed space.
+                        let byte_capacity = values_capacity.saturating_mul(self.byte_length);
+                        out.buffer = Vec::with_capacity(byte_capacity);
+                    }
+                }
+            }
         }
 
         match self.decoder.as_mut().unwrap() {
diff --git a/parquet/src/arrow/record_reader/mod.rs b/parquet/src/arrow/record_reader/mod.rs
index a33b489c62..d2d1326239 100644
--- a/parquet/src/arrow/record_reader/mod.rs
+++ b/parquet/src/arrow/record_reader/mod.rs
@@ -219,6 +219,9 @@ where
 
     /// Try to read one batch of data returning the number of records read
     fn read_one_batch(&mut self, batch_size: usize) -> Result<usize> {
+        if batch_size == 0 {
+            return Ok(0);
+        }
         // Update capacity hint to the largest batch size seen
         if batch_size > self.capacity_hint {
             self.capacity_hint = batch_size;

(arrow-rs) branch main updated: perf(parquet): Defer fixed length byte array buffer alloc and skip zero-batch init (#9756)

Reply via email to