This is an automated email from the ASF dual-hosted git repository.
etseidl pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new bbbe8a60b9 bench(parquet): add short and large string `arrow_writer`
benchmarks (#10021)
bbbe8a60b9 is described below
commit bbbe8a60b950d70b0f59991ee2099eb17f65ceb8
Author: Adrian Garcia Badaracco <[email protected]>
AuthorDate: Tue May 26 18:40:23 2026 -0500
bench(parquet): add short and large string `arrow_writer` benchmarks
(#10021)
# Which issue does this PR close?
Split out of #9972 per [this review
comment](https://github.com/apache/arrow-rs/pull/9972#discussion_r3307256819).
# Rationale for this change
#9972 makes the parquet writer's mini-batch sizing byte-budget aware so
large variable-width values don't produce oversized data pages. To
measure that change against a stable baseline — and in particular to see
the difference in the large-string case — these benchmarks belong on
`main` first.
# What changes are included in this PR?
Adds two BYTE_ARRAY write cases to the `arrow_writer` criterion bench:
- **`short_string_non_null`** — 1M fixed-width 8-byte strings. The
small-value hot path, where byte-budget-based sub-batch sizing should
always resolve to the full chunk (no granular splitting, no regression).
- **`large_string_non_null`** — 1024 × 256 KiB strings (256 MiB total).
The large-value case: with the default 1 MiB page byte limit each value
needs its own page, and a `write_batch_size` of 1024 would otherwise
buffer all 256 MiB before the post-write size check runs.
No library code changes — benchmarks only.
# Are there any user-facing changes?
No.
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-authored-by: Claude Opus 4.7 (1M context) <[email protected]>
---
parquet/benches/arrow_writer.rs | 35 ++++++++++++++++++++++++++++++++++-
1 file changed, 34 insertions(+), 1 deletion(-)
diff --git a/parquet/benches/arrow_writer.rs b/parquet/benches/arrow_writer.rs
index 9b22bb04b3..3636cb0402 100644
--- a/parquet/benches/arrow_writer.rs
+++ b/parquet/benches/arrow_writer.rs
@@ -32,7 +32,7 @@ use std::sync::Arc;
use arrow::datatypes::*;
use arrow::util::bench_util::{create_f16_array, create_f32_array,
create_f64_array};
use arrow::{record_batch::RecordBatch, util::data_gen::*};
-use arrow_array::RecordBatchOptions;
+use arrow_array::{RecordBatchOptions, StringArray};
use parquet::errors::Result;
use parquet::file::properties::{CdcOptions, WriterProperties, WriterVersion};
@@ -100,6 +100,29 @@ fn create_string_bench_batch(
)?)
}
+/// `size` short strings in one column "col": row index zero-padded to 8 digits
+/// (8 bytes for indices below 10^8). Exercises the BYTE_ARRAY hot path where
+/// values are small enough that byte-budget based sub-batch sizing in
+/// `write_batch_internal` should always use the full chunk (no splitting).
+fn create_short_string_bench_batch(size: usize) -> Result<RecordBatch> {
+ let array = Arc::new(StringArray::from_iter_values(
+ (0..size).map(|i| format!("{i:08}")),
+ )) as _;
+ Ok(RecordBatch::try_from_iter([("col", array)])?)
+}
+
+/// `size` identical rows, each `value_size` bytes of ASCII "x", in one column
+/// "col". Exercises the BYTE_ARRAY path where individual values are so large
+/// that batching the default `write_batch_size` of them would exceed the page
+/// byte limit by orders of magnitude — the case the page-size fix targets.
+fn create_large_string_bench_batch(size: usize, value_size: usize) ->
Result<RecordBatch> {
+ let value = "x".repeat(value_size);
+ let array = Arc::new(StringArray::from_iter_values(
+ (0..size).map(|_| value.as_str()),
+ )) as _;
+ Ok(RecordBatch::try_from_iter([("col", array)])?)
+}
+
fn create_string_and_binary_view_bench_batch(
size: usize,
null_density: f32,
@@ -392,6 +415,16 @@ fn create_batches() -> Vec<(&'static str, RecordBatch)> {
let batch = create_string_bench_batch(BATCH_SIZE, 0.25, 0.75).unwrap();
batches.push(("string", batch));
+ let batch = create_short_string_bench_batch(BATCH_SIZE).unwrap();
+ batches.push(("short_string_non_null", batch));
+
+ // 1024 rows × 256 KiB = 256 MiB total. With the default 1 MiB page byte
+ // limit, this is the case where the page-size fix kicks in: each value
+ // needs its own page, and `write_batch_size = 1024` would otherwise
+ // buffer all 256 MiB before the post-write check runs.
+ let batch = create_large_string_bench_batch(1024, 256 * 1024).unwrap();
+ batches.push(("large_string_non_null", batch));
+
let batch = create_string_and_binary_view_bench_batch(BATCH_SIZE, 0.25,
0.75).unwrap();
batches.push(("string_and_binary_view", batch));