This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 06c3bd06c8 feat(parquet): add wide-schema writer overhead benchmark (#9723)
06c3bd06c8 is described below
commit 06c3bd06c8891b533168002b5ee84ae6b007235a
Author: Hippolyte Barraud <[email protected]>
AuthorDate: Wed Apr 15 16:56:00 2026 -0400
feat(parquet): add wide-schema writer overhead benchmark (#9723)
# Which issue does this PR close?
- Contributes to #9722
# Rationale for this change
Existing writer benchmarks use narrow schemas (5–10 columns) and
primarily measure data encoding throughput. They don't capture the
per-column structural overhead that dominates at high column cardinality
(thousands to hundreds of thousands of columns), such as column-writer
allocation and metadata assembly.
# What changes are included in this PR?
This commit adds benchmarks to fill that gap by writing a single-row
batch through `ArrowWriter` with 1k/5k/10k flat `Float32` columns and
per-column `WriterProperties` entries, isolating the cost of the writer
infrastructure itself.
Baseline results (Apple M1 Max):
```
writer_overhead/1000_cols/per_column_props 3.72 ms
writer_overhead/5000_cols/per_column_props 54.96 ms
writer_overhead/10000_cols/per_column_props 220.73 ms
```
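For anyone reproducing these numbers: the suite should be runnable from
the repository root with `cargo bench -p parquet --bench writer_overhead`
(assuming `arrow` is among the parquet crate's default features;
otherwise add `--features arrow`, which the `[[bench]]` entry requires).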
# Are these changes tested?
N/A
# Are there any user-facing changes?
N/A
Signed-off-by: Hippolyte Barraud <[email protected]>
---
parquet/Cargo.toml | 5 +++
parquet/benches/writer_overhead.rs | 86 ++++++++++++++++++++++++++++++++++++++
2 files changed, 91 insertions(+)
diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml
index 314960adf1..9f4d2a33df 100644
--- a/parquet/Cargo.toml
+++ b/parquet/Cargo.toml
@@ -226,6 +226,11 @@ name = "push_decoder"
required-features = ["arrow"]
harness = false
+[[bench]]
+name = "writer_overhead"
+required-features = ["arrow"]
+harness = false
+
[[bench]]
name = "arrow_reader"
required-features = ["arrow", "test_common", "experimental"]
diff --git a/parquet/benches/writer_overhead.rs b/parquet/benches/writer_overhead.rs
new file mode 100644
index 0000000000..fc4f616eb2
--- /dev/null
+++ b/parquet/benches/writer_overhead.rs
@@ -0,0 +1,86 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Benchmarks for writer per-column overhead at high column cardinality.
+//!
+//! These benchmarks measure the structural cost of creating, writing, and
+//! closing a parquet file with many columns while keeping actual data
+//! encoding negligible (1 row per column). This isolates overhead such as
+//! `WriterProperties` per-column lookups, `GenericColumnWriter` allocation,
+//! and metadata assembly.
+
+use criterion::{Criterion, criterion_group, criterion_main};
+use std::hint::black_box;
+use std::io::Empty;
+use std::sync::Arc;
+
+use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
+use arrow_array::{Float32Array, RecordBatch};
+use parquet::arrow::ArrowWriter;
+use parquet::basic::Compression;
+use parquet::file::properties::WriterProperties;
+use parquet::schema::types::ColumnPath;
+
+const COLUMN_COUNTS: &[usize] = &[1_000, 5_000, 10_000];
+
+fn make_wide_schema(num_columns: usize) -> SchemaRef {
+ let fields: Vec<Field> = (0..num_columns)
+ .map(|i| Field::new(format!("c{i}"), DataType::Float32, false))
+ .collect();
+ Arc::new(Schema::new(fields))
+}
+
+fn make_single_row_batch(schema: &SchemaRef) -> RecordBatch {
+ let columns: Vec<Arc<dyn arrow_array::Array>> = (0..schema.fields().len())
+ .map(|_| Arc::new(Float32Array::from(vec![0.0f32])) as _)
+ .collect();
+ RecordBatch::try_new(schema.clone(), columns).unwrap()
+}
+
+/// Build WriterProperties with a per-column property set for every column,
+/// populating the internal HashMap so that per-column lookups are exercised.
+fn make_per_column_props(schema: &SchemaRef) -> WriterProperties {
+ let mut builder = WriterProperties::builder().set_dictionary_enabled(false);
+ for field in schema.fields() {
+ builder = builder.set_column_compression(
+ ColumnPath::from(field.name().as_str()),
+ Compression::UNCOMPRESSED,
+ );
+ }
+ builder.build()
+}
+
+fn bench_writer_overhead(c: &mut Criterion) {
+ for &num_cols in COLUMN_COUNTS {
+ let schema = make_wide_schema(num_cols);
+ let batch = make_single_row_batch(&schema);
+ let props = make_per_column_props(&schema);
+
+ c.bench_function(&format!("writer_overhead/{num_cols}_cols/per_column_props"), |b| {
+ b.iter(|| {
+ let mut writer =
+ ArrowWriter::try_new(Empty::default(), schema.clone(), Some(props.clone()))
+ .unwrap();
+ writer.write(black_box(&batch)).unwrap();
+ black_box(writer.close()).unwrap();
+ });
+ });
+ }
+}
+
+criterion_group!(benches, bench_writer_overhead);
+criterion_main!(benches);