This is an automated email from the ASF dual-hosted git repository.
Jefffrey pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 40ebcf03a1 benchmarks for writing REE arrays to parquet (#9936)
40ebcf03a1 is described below
commit 40ebcf03a12cd5934d4b9cf6ccd6e970eb658366
Author: RIchard Baah <[email protected]>
AuthorDate: Wed May 27 03:37:42 2026 -0400
benchmarks for writing REE arrays to parquet (#9936)
# Which issue does this PR close?
<!--
We generally require a GitHub issue to be filed for all bug fixes and
enhancements and this helps us generate change logs for our releases.
You can link an issue to this PR using the GitHub syntax.
-->
- Closes #9935.
# Rationale for this change
There is currently no way to tell which approach to writing REE columns
to Parquet is more performant. This PR adds benchmarks to answer that.
<!--
Why are you proposing this change? If this is already explained clearly
in the issue then this section is not needed.
Explaining clearly why changes are proposed helps reviewers understand
your changes and offer better suggestions for fixes.
-->
# What changes are included in this PR?
Added a `create_ree_bench_batch()` function that builds record batches
of REE data — it plugs into the existing benchmark structure. It relies
on new `RunEndEncoded` support added to `create_random_array()`.
For controlling the shape of the generated REE arrays, I currently have
two constants, `MIN_RUN` and `MAX_RUN`, that bound the run length. The
intent is to let benchmarks cover long uniform runs as well as shorter /
more sparse data, rather than only one shape.
An alternative would be a small params struct with defaults that callers
can override — happy to switch to that if it's preferred, but that would
require changing other call sites.
<!--
There is no need to duplicate the description in the issue here but it
is sometimes worth providing a summary of the individual changes in this
PR.
-->
# Are these changes tested?
Yes — a unit test, `test_create_run_end_encoded_array`, covers the new generator.
<!--
We typically require tests for all PRs in order to:
1. Prevent the code from being accidentally broken by subsequent changes
2. Serve as another way to document the expected behavior of the code
If tests are not included in your PR, please explain why (for example,
are they covered by existing tests)?
If this PR claims a performance improvement, please include evidence
such as benchmark results.
-->
# Are there any user-facing changes?
no
<!--
If there are user-facing changes then we may require documentation to be
updated before approving the PR.
If there are any breaking changes to public APIs, please call them out.
-->
---
arrow/src/util/data_gen.rs | 83 +++++++++++++++++++++++++++++++++++++++++
parquet/benches/arrow_writer.rs | 30 +++++++++++++++
2 files changed, 113 insertions(+)
diff --git a/arrow/src/util/data_gen.rs b/arrow/src/util/data_gen.rs
index e54ab34994..a5a0647aa8 100644
--- a/arrow/src/util/data_gen.rs
+++ b/arrow/src/util/data_gen.rs
@@ -178,6 +178,9 @@ pub fn create_random_array(
Map(_, _) => create_random_map_array(field, size, null_density,
true_density)?,
Decimal128(_, _) => create_random_decimal_array(field, size,
null_density)?,
Decimal256(_, _) => create_random_decimal_array(field, size,
null_density)?,
+ RunEndEncoded(index, value) => {
+ create_random_run_end_encoded_array(index, value, size,
null_density, true_density)?
+ }
other => {
return Err(ArrowError::NotYetImplemented(format!(
"Generating random arrays not yet implemented for {other:?}"
@@ -230,6 +233,62 @@ fn create_random_decimal_array(field: &Field, size: usize,
null_density: f32) ->
))),
}
}
+/// Builds a random run-end encoded array with `size` logical elements.
+///
+/// The logical length is split into consecutive runs whose lengths are drawn
+/// from `MIN_RUN..=MAX_RUN`; the final run is clipped so that the last run end
+/// equals `size`. One value per run is generated by calling
+/// [`create_random_array`] on the `value` field, forwarding `null_density`
+/// and `true_density` unchanged.
+///
+/// # Errors
+///
+/// Returns [`ArrowError::InvalidArgumentError`] if the `index` field's data
+/// type is not `Int16`, `Int32` or `Int64`, or if a run end does not fit in
+/// that type. Errors from generating the values array or from
+/// `RunArray::try_new` are propagated.
+#[inline]
+fn create_random_run_end_encoded_array(
+    index: &Field,
+    value: &Field,
+    size: usize,
+    null_density: f32,
+    true_density: f32,
+) -> Result<ArrayRef> {
+    const MIN_RUN: usize = 8;
+    const MAX_RUN: usize = 32;
+
+    let mut rng = seedable_rng();
+
+    // Accumulate the (cumulative) run ends directly. Every run except possibly
+    // the last spans at least MIN_RUN elements, which bounds the number of runs.
+    let mut run_ends: Vec<usize> = Vec::with_capacity(size / MIN_RUN + 1);
+    let mut end = 0;
+    while end < size {
+        let len = rng.random_range(MIN_RUN..=MAX_RUN).min(size - end);
+        end += len;
+        run_ends.push(end);
+    }
+
+    // One physical value per run.
+    let values = create_random_array(value, run_ends.len(), null_density, true_density)?;
+
+    let index_type = index.data_type();
+    // Checked narrowing: an `as` cast would silently wrap once `size` exceeds
+    // the range of the run-ends type.
+    let overflow = |end: usize| {
+        ArrowError::InvalidArgumentError(format!(
+            "Run end {end} does not fit in run-ends type {index_type:?}"
+        ))
+    };
+
+    match index_type {
+        DataType::Int16 => {
+            let run_ends = run_ends
+                .iter()
+                .map(|&end| i16::try_from(end).map_err(|_| overflow(end)))
+                .collect::<Result<Int16Array>>()?;
+            Ok(Arc::new(RunArray::<Int16Type>::try_new(
+                &run_ends, &values,
+            )?))
+        }
+        DataType::Int32 => {
+            let run_ends = run_ends
+                .iter()
+                .map(|&end| i32::try_from(end).map_err(|_| overflow(end)))
+                .collect::<Result<Int32Array>>()?;
+            Ok(Arc::new(RunArray::<Int32Type>::try_new(
+                &run_ends, &values,
+            )?))
+        }
+        DataType::Int64 => {
+            let run_ends = run_ends
+                .iter()
+                .map(|&end| i64::try_from(end).map_err(|_| overflow(end)))
+                .collect::<Result<Int64Array>>()?;
+            Ok(Arc::new(RunArray::<Int64Type>::try_new(
+                &run_ends, &values,
+            )?))
+        }
+        other => Err(ArrowError::InvalidArgumentError(format!(
+            "Unsupported run-ends type for REE: {other:?}"
+        ))),
+    }
+}
#[inline]
fn create_random_list_array(
@@ -648,6 +707,30 @@ mod tests {
assert_eq!(col_d_y.null_count(), 0);
}
+    /// Checks that a randomly generated `RunEndEncoded` column has the requested
+    /// logical length, that its last run end equals that length, and that it
+    /// carries exactly one value per run.
+    #[test]
+    fn test_create_run_end_encoded_array() {
+        let size = 1000;
+
+        // Int32 run ends over nullable Utf8 values.
+        let run_ends_field = Arc::new(Field::new("run_ends", DataType::Int32, false));
+        let values_field = Arc::new(Field::new("values", DataType::Utf8, true));
+        let ree_field = Field::new(
+            "ree",
+            DataType::RunEndEncoded(run_ends_field, values_field),
+            false,
+        );
+
+        let array = create_random_array(&ree_field, size, 0.25, 0.0).unwrap();
+        assert_eq!(array.len(), size);
+
+        let ree = array.as_run::<Int32Type>();
+        let run_ends = ree.run_ends().values();
+
+        // The final run end is the logical length of the array.
+        let last_end = run_ends.last().copied().unwrap();
+        assert_eq!(last_end as usize, size);
+
+        // Exactly one physical value per run.
+        assert_eq!(ree.values().len(), run_ends.len());
+    }
+
#[test]
fn test_create_list_array_nested_nullability() {
let list_field = Field::new_list(
diff --git a/parquet/benches/arrow_writer.rs b/parquet/benches/arrow_writer.rs
index 3636cb0402..80d3e7144b 100644
--- a/parquet/benches/arrow_writer.rs
+++ b/parquet/benches/arrow_writer.rs
@@ -159,6 +159,30 @@ fn create_string_dictionary_bench_batch(
true_density,
)?)
}
+// Builds a single-column record batch whose column "_1" is a nullable
+// RunEndEncoded array with Int32 run ends and nullable values of type `value_dt`.
+//
+// The call sites in `create_batches` are commented out until the implementation
+// of RunEndEncoded is complete, hence the `dead_code` allowance. See
+// https://github.com/apache/arrow-rs/pull/9936#discussion_r3242936421
+#[allow(dead_code)]
+fn create_ree_bench_batch(
+    value_dt: DataType,
+    size: usize,
+    null_density: f32,
+    true_density: f32,
+) -> Result<RecordBatch> {
+    let run_ends = Arc::new(Field::new("run_ends", DataType::Int32, false));
+    let values = Arc::new(Field::new("values", value_dt, true));
+    let column = Field::new("_1", DataType::RunEndEncoded(run_ends, values), true);
+
+    let schema = Arc::new(Schema::new(vec![column]));
+    let batch = create_random_batch(schema, size, null_density, true_density)?;
+    Ok(batch)
+}
fn create_string_bench_batch_non_null(
size: usize,
@@ -434,6 +458,12 @@ fn create_batches() -> Vec<(&'static str, RecordBatch)> {
let batch = create_string_bench_batch_non_null(BATCH_SIZE, 0.25,
0.75).unwrap();
batches.push(("string_non_null", batch));
+ //let batch = create_ree_bench_batch(DataType::Utf8, BATCH_SIZE, 0.25,
0.75).unwrap();
+ //batches.push(("string_ree", batch));
+
+ //let batch = create_ree_bench_batch(DataType::Int32, BATCH_SIZE, 0.25,
0.75).unwrap();
+ //batches.push(("int32_ree", batch));
+
let batch = create_float_bench_batch_with_nans(BATCH_SIZE, 0.5).unwrap();
batches.push(("float_with_nans", batch));