(arrow-rs) branch main updated: benchmarks for writing REE arrays to parquet (#9936)

jeffreyvo Wed, 27 May 2026 00:37:58 -0700

This is an automated email from the ASF dual-hosted git repository.

Jefffrey pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git



The following commit(s) were added to refs/heads/main by this push:
     new 40ebcf03a1 benchmarks for writing REE arrays to parquet (#9936)
40ebcf03a1 is described below

commit 40ebcf03a12cd5934d4b9cf6ccd6e970eb658366
Author: RIchard Baah <[email protected]>
AuthorDate: Wed May 27 03:37:42 2026 -0400

    benchmarks for writing REE arrays to parquet (#9936)
    
    # Which issue does this PR close?
    
    <!--
    We generally require a GitHub issue to be filed for all bug fixes and
    enhancements and this helps us generate change logs for our releases.
    You can link an issue to this PR using the GitHub syntax.
    -->
    
    - Closes #9935.
    
    # Rationale for this change
    There is currently no way to tell which approach to writing REE
    columns to Parquet is more performant. This PR aims to solve that.
    <!--
    Why are you proposing this change? If this is already explained clearly
    in the issue then this section is not needed.
    Explaining clearly why changes are proposed helps reviewers understand
    your changes and offer better suggestions for fixes.
    -->
    
    # What changes are included in this PR?
    Added a `create_ree_bench_batch()` function that builds record
    batches of REE data — it plugs into the existing benchmark structure.
    
    For controlling the shape of the generated REE arrays, I currently have
    two constants, `MIN_RUN` and `MAX_RUN`, that bound the run length. The
    intent is to let benchmarks cover long uniform runs as well as shorter /
    more sparse data, rather than only one shape.
    
    An alternative would be a small params struct with defaults that callers
    can override — happy to switch to that if it's preferred, but that would
    require changing other call sites.
    <!--
    There is no need to duplicate the description in the issue here but it
    is sometimes worth providing a summary of the individual changes in this
    PR.
    -->
    
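    # Are these changes tested?
    Yes: `test_create_run_end_encoded_array` in `arrow/src/util/data_gen.rs`
    checks the generated array's length, its final run end, and that the
    number of values matches the number of runs.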
    <!--
    We typically require tests for all PRs in order to:
    1. Prevent the code from being accidentally broken by subsequent changes
    2. Serve as another way to document the expected behavior of the code
    
    If tests are not included in your PR, please explain why (for example,
    are they covered by existing tests)?
    
    If this PR claims a performance improvement, please include evidence
    such as benchmark results.
    -->
    
    # Are there any user-facing changes?
    no
    <!--
    If there are user-facing changes then we may require documentation to be
    updated before approving the PR.
    
    If there are any breaking changes to public APIs, please call them out.
    -->
---
 arrow/src/util/data_gen.rs      | 83 +++++++++++++++++++++++++++++++++++++++++
 parquet/benches/arrow_writer.rs | 30 +++++++++++++++
 2 files changed, 113 insertions(+)

diff --git a/arrow/src/util/data_gen.rs b/arrow/src/util/data_gen.rs
index e54ab34994..a5a0647aa8 100644
--- a/arrow/src/util/data_gen.rs
+++ b/arrow/src/util/data_gen.rs
@@ -178,6 +178,9 @@ pub fn create_random_array(
         Map(_, _) => create_random_map_array(field, size, null_density, 
true_density)?,
         Decimal128(_, _) => create_random_decimal_array(field, size, 
null_density)?,
         Decimal256(_, _) => create_random_decimal_array(field, size, 
null_density)?,
+        RunEndEncoded(index, value) => {
+            create_random_run_end_encoded_array(index, value, size, 
null_density, true_density)?
+        }
         other => {
             return Err(ArrowError::NotYetImplemented(format!(
                 "Generating random arrays not yet implemented for {other:?}"
@@ -230,6 +233,62 @@ fn create_random_decimal_array(field: &Field, size: usize, 
null_density: f32) ->
         ))),
     }
 }
+#[inline]
+fn create_random_run_end_encoded_array(
+    index: &Field,
+    value: &Field,
+    size: usize,
+    null_density: f32,
+    true_density: f32,
+) -> Result<ArrayRef> {
+    const MIN_RUN: usize = 8;
+    const MAX_RUN: usize = 32;
+
+    let mut rng = seedable_rng();
+    let mut run_lengths: Vec<usize> = Vec::new();
+    let mut remaining = size;
+    while remaining > 0 {
+        let len = rng.random_range(MIN_RUN..=MAX_RUN).min(remaining);
+        run_lengths.push(len);
+        remaining -= len;
+    }
+    let num_runs = run_lengths.len();
+
+    let mut cumulative: i64 = 0;
+    let run_ends_i64: Vec<i64> = run_lengths
+        .iter()
+        .map(|&l| {
+            cumulative += l as i64;
+            cumulative
+        })
+        .collect();
+
+    let values = create_random_array(value, num_runs, null_density, 
true_density)?;
+
+    match index.data_type() {
+        DataType::Int16 => {
+            let run_ends: Int16Array = run_ends_i64.iter().map(|&v| v as 
i16).collect();
+            Ok(Arc::new(RunArray::<Int16Type>::try_new(
+                &run_ends, &values,
+            )?))
+        }
+        DataType::Int32 => {
+            let run_ends: Int32Array = run_ends_i64.iter().map(|&v| v as 
i32).collect();
+            Ok(Arc::new(RunArray::<Int32Type>::try_new(
+                &run_ends, &values,
+            )?))
+        }
+        DataType::Int64 => {
+            let run_ends: Int64Array = run_ends_i64.iter().copied().collect();
+            Ok(Arc::new(RunArray::<Int64Type>::try_new(
+                &run_ends, &values,
+            )?))
+        }
+        other => Err(ArrowError::InvalidArgumentError(format!(
+            "Unsupported run-ends type for REE: {other:?}"
+        ))),
+    }
+}
 
 #[inline]
 fn create_random_list_array(
@@ -648,6 +707,30 @@ mod tests {
         assert_eq!(col_d_y.null_count(), 0);
     }
 
+    #[test]
+    fn test_create_run_end_encoded_array() {
+        let size = 1000;
+        let ree_field = Field::new(
+            "ree",
+            DataType::RunEndEncoded(
+                Arc::new(Field::new("run_ends", DataType::Int32, false)),
+                Arc::new(Field::new("values", DataType::Utf8, true)),
+            ),
+            false,
+        );
+
+        let array = create_random_array(&ree_field, size, 0.25, 0.0).unwrap();
+        assert_eq!(array.len(), size);
+
+        let ree = array.as_run::<Int32Type>();
+        let run_ends = ree.run_ends().values();
+        let num_runs = run_ends.len();
+
+        assert_eq!(*run_ends.last().unwrap() as usize, size);
+
+        assert_eq!(ree.values().len(), num_runs);
+    }
+
     #[test]
     fn test_create_list_array_nested_nullability() {
         let list_field = Field::new_list(
diff --git a/parquet/benches/arrow_writer.rs b/parquet/benches/arrow_writer.rs
index 3636cb0402..80d3e7144b 100644
--- a/parquet/benches/arrow_writer.rs
+++ b/parquet/benches/arrow_writer.rs
@@ -159,6 +159,30 @@ fn create_string_dictionary_bench_batch(
         true_density,
     )?)
 }
+// Benchmark call sites are commented out until implementation of RunEndEncoded is complete. See 
https://github.com/apache/arrow-rs/pull/9936#discussion_r3242936421
+#[allow(dead_code)]
+fn create_ree_bench_batch(
+    value_dt: DataType,
+    size: usize,
+    null_density: f32,
+    true_density: f32,
+) -> Result<RecordBatch> {
+    let fields = vec![Field::new(
+        "_1",
+        DataType::RunEndEncoded(
+            Arc::new(Field::new("run_ends", DataType::Int32, false)),
+            Arc::new(Field::new("values", value_dt, true)),
+        ),
+        true,
+    )];
+    let schema = Schema::new(fields);
+    Ok(create_random_batch(
+        Arc::new(schema),
+        size,
+        null_density,
+        true_density,
+    )?)
+}
 
 fn create_string_bench_batch_non_null(
     size: usize,
@@ -434,6 +458,12 @@ fn create_batches() -> Vec<(&'static str, RecordBatch)> {
     let batch = create_string_bench_batch_non_null(BATCH_SIZE, 0.25, 
0.75).unwrap();
     batches.push(("string_non_null", batch));
 
+    //let batch = create_ree_bench_batch(DataType::Utf8, BATCH_SIZE, 0.25, 
0.75).unwrap();
+    //batches.push(("string_ree", batch));
+
+    //let batch = create_ree_bench_batch(DataType::Int32, BATCH_SIZE, 0.25, 
0.75).unwrap();
+    //batches.push(("int32_ree", batch));
+
     let batch = create_float_bench_batch_with_nans(BATCH_SIZE, 0.5).unwrap();
     batches.push(("float_with_nans", batch));

(arrow-rs) branch main updated: benchmarks for writing REE arrays to parquet (#9936)

Reply via email to