(arrow-rs) branch main updated: bench(parquet): add short and large string `arrow_writer` benchmarks (#10021)

etseidl Tue, 26 May 2026 16:40:41 -0700

This is an automated email from the ASF dual-hosted git repository.

etseidl pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git



The following commit(s) were added to refs/heads/main by this push:
     new bbbe8a60b9 bench(parquet): add short and large string `arrow_writer` benchmarks (#10021)
bbbe8a60b9 is described below

commit bbbe8a60b950d70b0f59991ee2099eb17f65ceb8
Author: Adrian Garcia Badaracco <[email protected]>
AuthorDate: Tue May 26 18:40:23 2026 -0500

    bench(parquet): add short and large string `arrow_writer` benchmarks (#10021)
    
    # Which issue does this PR close?
    
    Split out of #9972 per [this review
    comment](https://github.com/apache/arrow-rs/pull/9972#discussion_r3307256819).
    
    # Rationale for this change
    
    #9972 makes the parquet writer's mini-batch sizing byte-budget aware so
    large variable-width values don't produce oversized data pages. To
    measure that change against a stable baseline — and in particular to see
    the difference in the large-string case — these benchmarks belong on
    `main` first.
    
    # What changes are included in this PR?
    
    Adds two BYTE_ARRAY write cases to the `arrow_writer` criterion bench:
    
    - **`short_string_non_null`** — 1M fixed-width 8-byte strings. The
    small-value hot path, where byte-budget-based sub-batch sizing should
    always resolve to the full chunk (no granular splitting, no regression).
    - **`large_string_non_null`** — 1024 × 256 KiB strings (256 MiB total).
    The large-value case: with the default 1 MiB page byte limit each value
    needs its own page, and a `write_batch_size` of 1024 would otherwise
    buffer all 256 MiB before the post-write size check runs.
    
    No library code changes — benchmarks only.
    
    # Are there any user-facing changes?
    
    No.
    
    🤖 Generated with [Claude Code](https://claude.com/claude-code)
    
    Co-authored-by: Claude Opus 4.7 (1M context) <[email protected]>
---
 parquet/benches/arrow_writer.rs | 35 ++++++++++++++++++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/parquet/benches/arrow_writer.rs b/parquet/benches/arrow_writer.rs
index 9b22bb04b3..3636cb0402 100644
--- a/parquet/benches/arrow_writer.rs
+++ b/parquet/benches/arrow_writer.rs
@@ -32,7 +32,7 @@ use std::sync::Arc;
 use arrow::datatypes::*;
 use arrow::util::bench_util::{create_f16_array, create_f32_array, 
create_f64_array};
 use arrow::{record_batch::RecordBatch, util::data_gen::*};
-use arrow_array::RecordBatchOptions;
+use arrow_array::{RecordBatchOptions, StringArray};
 use parquet::errors::Result;
 use parquet::file::properties::{CdcOptions, WriterProperties, WriterVersion};
 
@@ -100,6 +100,29 @@ fn create_string_bench_batch(
     )?)
 }
 
+/// 1 M short, fixed-width 8-byte strings. Exercises the BYTE_ARRAY hot path
+/// for the case where individual values are small enough that the byte-budget
+/// based sub-batch sizing in `write_batch_internal` should always resolve to
+/// the full chunk (no granular splitting, no regression vs. current behavior).
+fn create_short_string_bench_batch(size: usize) -> Result<RecordBatch> {
+    let array = Arc::new(StringArray::from_iter_values(
+        (0..size).map(|i| format!("{i:08}")),
+    )) as _;
+    Ok(RecordBatch::try_from_iter([("col", array)])?)
+}
+
+/// `size` rows of `value_size`-byte strings. Exercises the BYTE_ARRAY path
+/// where individual values are large enough that batching the default
+/// `write_batch_size` of them would blow the page byte limit by orders of
+/// magnitude — the case the page-size fix targets.
+fn create_large_string_bench_batch(size: usize, value_size: usize) -> 
Result<RecordBatch> {
+    let value = "x".repeat(value_size);
+    let array = Arc::new(StringArray::from_iter_values(
+        (0..size).map(|_| value.as_str()),
+    )) as _;
+    Ok(RecordBatch::try_from_iter([("col", array)])?)
+}
+
 fn create_string_and_binary_view_bench_batch(
     size: usize,
     null_density: f32,
@@ -392,6 +415,16 @@ fn create_batches() -> Vec<(&'static str, RecordBatch)> {
     let batch = create_string_bench_batch(BATCH_SIZE, 0.25, 0.75).unwrap();
     batches.push(("string", batch));
 
+    let batch = create_short_string_bench_batch(BATCH_SIZE).unwrap();
+    batches.push(("short_string_non_null", batch));
+
+    // 1024 rows × 256 KiB = 256 MiB total. With the default 1 MiB page byte
+    // limit, this is the case where the page-size fix kicks in: each value
+    // needs its own page, and `write_batch_size = 1024` would otherwise
+    // buffer all 256 MiB before the post-write check runs.
+    let batch = create_large_string_bench_batch(1024, 256 * 1024).unwrap();
+    batches.push(("large_string_non_null", batch));
+
     let batch = create_string_and_binary_view_bench_batch(BATCH_SIZE, 0.25, 
0.75).unwrap();
     batches.push(("string_and_binary_view", batch));

(arrow-rs) branch main updated: bench(parquet): add short and large string `arrow_writer` benchmarks (#10021)

Reply via email to