This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 06c3bd06c8 feat(parquet): add wide-schema writer overhead benchmark (#9723)
06c3bd06c8 is described below

commit 06c3bd06c8891b533168002b5ee84ae6b007235a
Author: Hippolyte Barraud <[email protected]>
AuthorDate: Wed Apr 15 16:56:00 2026 -0400

    feat(parquet): add wide-schema writer overhead benchmark (#9723)
    
    # Which issue does this PR close?
    
    - Contributes to #9722
    
    # Rationale for this change
    
    Existing writer benchmarks use narrow schemas (5–10 columns) and
    primarily measure data encoding throughput. They don't capture the
    per-column structural overhead that dominates at high column
    cardinality (thousands to hundreds of thousands of columns), such as
    per-column `WriterProperties` lookups, column writer allocation, and
    metadata assembly.
    
    # What changes are included in this PR?
    
    This commit adds benchmarks to fill that gap by writing a single-row
    batch through `ArrowWriter` with 1k/5k/10k flat `Float32` columns and
    per-column `WriterProperties` entries, isolating the cost of the writer
    infrastructure itself.
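
    For reference, the core of the measured operation is sketched below as
    a minimal standalone program. This is a simplification, not the
    benchmark itself: `std::io::sink()` stands in for the benchmark's
    `Empty` sink, default properties replace the per-column ones, and
    three columns stand in for the 1k/5k/10k used in the benchmark.

    ```rust
    use std::sync::Arc;

    use arrow::datatypes::{DataType, Field, Schema};
    use arrow_array::{ArrayRef, Float32Array, RecordBatch};
    use parquet::arrow::ArrowWriter;

    fn main() -> Result<(), Box<dyn std::error::Error>> {
        // Wide, flat schema; the benchmark scales this to thousands of columns.
        let fields: Vec<Field> = (0..3)
            .map(|i| Field::new(format!("c{i}"), DataType::Float32, false))
            .collect();
        let schema = Arc::new(Schema::new(fields));

        // A single row per column keeps the actual encoding work negligible.
        let columns: Vec<ArrayRef> = (0..3)
            .map(|_| Arc::new(Float32Array::from(vec![0.0f32])) as ArrayRef)
            .collect();
        let batch = RecordBatch::try_new(schema.clone(), columns)?;

        // The written bytes are discarded, so the remaining cost is writer
        // construction, per-column bookkeeping, and metadata assembly.
        let mut writer = ArrowWriter::try_new(std::io::sink(), schema, None)?;
        writer.write(&batch)?;
        writer.close()?;
        Ok(())
    }
    ```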
    
    Baseline results (Apple M1 Max):
    
    ```
      writer_overhead/1000_cols/per_column_props      3.72 ms
      writer_overhead/5000_cols/per_column_props     54.96 ms
      writer_overhead/10000_cols/per_column_props   220.73 ms
    ```
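
    The numbers above should be reproducible locally with something like
    `cargo bench -p parquet --bench writer_overhead --features arrow`,
    though the exact invocation may vary.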
    
    # Are these changes tested?
    
    N/A
    
    # Are there any user-facing changes?
    
    N/A
    
    Signed-off-by: Hippolyte Barraud <[email protected]>
---
 parquet/Cargo.toml                 |  5 +++
 parquet/benches/writer_overhead.rs | 86 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 91 insertions(+)

diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml
index 314960adf1..9f4d2a33df 100644
--- a/parquet/Cargo.toml
+++ b/parquet/Cargo.toml
@@ -226,6 +226,11 @@ name = "push_decoder"
 required-features = ["arrow"]
 harness = false
 
+[[bench]]
+name = "writer_overhead"
+required-features = ["arrow"]
+harness = false
+
 [[bench]]
 name = "arrow_reader"
 required-features = ["arrow", "test_common", "experimental"]
diff --git a/parquet/benches/writer_overhead.rs b/parquet/benches/writer_overhead.rs
new file mode 100644
index 0000000000..fc4f616eb2
--- /dev/null
+++ b/parquet/benches/writer_overhead.rs
@@ -0,0 +1,86 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Benchmarks for writer per-column overhead at high column cardinality.
+//!
+//! These benchmarks measure the structural cost of creating, writing, and
+//! closing a parquet file with many columns while keeping actual data
+//! encoding negligible (1 row per column). This isolates overhead such as
+//! `WriterProperties` per-column lookups, `GenericColumnWriter` allocation,
+//! and metadata assembly.
+
+use criterion::{Criterion, criterion_group, criterion_main};
+use std::hint::black_box;
+use std::io::Empty;
+use std::sync::Arc;
+
+use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
+use arrow_array::{Float32Array, RecordBatch};
+use parquet::arrow::ArrowWriter;
+use parquet::basic::Compression;
+use parquet::file::properties::WriterProperties;
+use parquet::schema::types::ColumnPath;
+
+const COLUMN_COUNTS: &[usize] = &[1_000, 5_000, 10_000];
+
+fn make_wide_schema(num_columns: usize) -> SchemaRef {
+    let fields: Vec<Field> = (0..num_columns)
+        .map(|i| Field::new(format!("c{i}"), DataType::Float32, false))
+        .collect();
+    Arc::new(Schema::new(fields))
+}
+
+fn make_single_row_batch(schema: &SchemaRef) -> RecordBatch {
+    let columns: Vec<Arc<dyn arrow_array::Array>> = (0..schema.fields().len())
+        .map(|_| Arc::new(Float32Array::from(vec![0.0f32])) as _)
+        .collect();
+    RecordBatch::try_new(schema.clone(), columns).unwrap()
+}
+
+/// Build WriterProperties with a per-column property set for every column,
+/// populating the internal HashMap so that per-column lookups are exercised.
+fn make_per_column_props(schema: &SchemaRef) -> WriterProperties {
+    let mut builder = WriterProperties::builder().set_dictionary_enabled(false);
+    for field in schema.fields() {
+        builder = builder.set_column_compression(
+            ColumnPath::from(field.name().as_str()),
+            Compression::UNCOMPRESSED,
+        );
+    }
+    builder.build()
+}
+
+fn bench_writer_overhead(c: &mut Criterion) {
+    for &num_cols in COLUMN_COUNTS {
+        let schema = make_wide_schema(num_cols);
+        let batch = make_single_row_batch(&schema);
+        let props = make_per_column_props(&schema);
+
+        c.bench_function(&format!("writer_overhead/{num_cols}_cols"), |b| {
+            b.iter(|| {
+                let mut writer =
+                    ArrowWriter::try_new(Empty::default(), schema.clone(), Some(props.clone()))
+                        .unwrap();
+                writer.write(black_box(&batch)).unwrap();
+                black_box(writer.close()).unwrap();
+            });
+        });
+    }
+}
+
+criterion_group!(benches, bench_writer_overhead);
+criterion_main!(benches);
