This is an automated email from the ASF dual-hosted git repository. alamb pushed a commit to branch alamb/prune-cast-benchmarks in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
commit 55fa12a0f7c918cba0605a08e4c5ce4e7c24224f Author: Andrew Lamb <[email protected]> AuthorDate: Wed Apr 22 10:41:47 2026 -0400 Prune redundant benchmarks in cast_kernels --- arrow/benches/cast_kernels.rs | 157 ++++++++++++++++++++++++++++-------------- arrow/src/util/bench_util.rs | 23 +++++++ 2 files changed, 129 insertions(+), 51 deletions(-) diff --git a/arrow/benches/cast_kernels.rs b/arrow/benches/cast_kernels.rs index 040c118a1e..ac0df48a01 100644 --- a/arrow/benches/cast_kernels.rs +++ b/arrow/benches/cast_kernels.rs @@ -83,36 +83,6 @@ fn build_utf8_date_time_array(size: usize, with_nulls: bool) -> ArrayRef { Arc::new(builder.finish()) } -fn build_decimal32_array(size: usize, precision: u8, scale: i8) -> ArrayRef { - let mut rng = seedable_rng(); - let mut builder = Decimal32Builder::with_capacity(size); - - for _ in 0..size { - builder.append_value(rng.random_range::<i32, _>(0..1000000)); - } - Arc::new( - builder - .finish() - .with_precision_and_scale(precision, scale) - .unwrap(), - ) -} - -fn build_decimal64_array(size: usize, precision: u8, scale: i8) -> ArrayRef { - let mut rng = seedable_rng(); - let mut builder = Decimal64Builder::with_capacity(size); - - for _ in 0..size { - builder.append_value(rng.random_range::<i64, _>(0..1000000000)); - } - Arc::new( - builder - .finish() - .with_precision_and_scale(precision, scale) - .unwrap(), - ) -} - fn build_decimal128_array(size: usize, precision: u8, scale: i8) -> ArrayRef { let mut rng = seedable_rng(); let mut builder = Decimal128Builder::with_capacity(size); @@ -157,6 +127,53 @@ fn build_string_array(size: usize) -> ArrayRef { Arc::new(builder.finish()) } +fn build_string_float_array(size: usize, null_density: f32) -> ArrayRef { + let mut builder = StringBuilder::new(); + + let mut rng = seedable_rng(); + + for _ in 0..size { + if rng.random::<f32>() < null_density { + builder.append_null() + } else { + builder.append_value( + rng.random_range(-999_999_999f32..999_999_999f32) + .to_string(), + ) + } + } + Arc::new(builder.finish()) +} + +macro_rules! build_array_with_samples { + ($builder: ident, $size: ident, $null_density: expr, $samples: ident) => {{ + let mut rng = seedable_rng(); + for i in 0..$size { + if rng.random::<f32>() < $null_density { + $builder.append_null(); + } else { + $builder.append_value($samples[i % $samples.len()]) + } + } + Arc::new($builder.finish()) + }}; +} + +fn build_float64_array_for_cast_to_decimal(size: usize, null_density: f32) -> ArrayRef { + Arc::new(create_primitive_array_range::<Float64Type>( + size, + null_density, + -999_999_999f64..999_999_999f64, + )) +} + +fn build_float64_array_invalid_items(size: usize, null_density: f32) -> ArrayRef { + let mut builder = Float64Builder::with_capacity(size); + let invalid_values = [f64::NAN, f64::INFINITY, f64::NEG_INFINITY]; + + build_array_with_samples!(builder, size, null_density, invalid_values) +} + fn build_dict_array(size: usize) -> ArrayRef { let values = StringArray::from_iter([ Some("small"), @@ -170,7 +187,7 @@ fn build_dict_array(size: usize) -> ArrayRef { // cast array from specified primitive array type to desired data type fn cast_array(array: &ArrayRef, to_type: DataType) { - hint::black_box(cast(array, &to_type).unwrap()); + hint::black_box(cast(hint::black_box(array), hint::black_box(&to_type)).unwrap()); } fn add_benchmark(c: &mut Criterion) { @@ -189,10 +206,8 @@ fn add_benchmark(c: &mut Criterion) { let utf8_date_array = build_utf8_date_array(512, true); let utf8_date_time_array = build_utf8_date_time_array(512, true); - let decimal32_array = build_decimal32_array(512, 9, 3); - let decimal64_array = build_decimal64_array(512, 10, 3); - let decimal128_array = build_decimal128_array(512, 10, 3); - let decimal256_array = build_decimal256_array(512, 50, 3); + let decimal128_array = build_decimal128_array(8_000, 10, 3); + let decimal256_array = build_decimal256_array(8_000, 50, 3); let string_array = build_string_array(512); let wide_string_array = cast(&string_array, &DataType::LargeUtf8).unwrap(); @@ -200,6 +215,10 @@ fn add_benchmark(c: &mut Criterion) { let string_view_array = cast(&dict_array, &DataType::Utf8View).unwrap(); let binary_view_array = cast(&string_view_array, &DataType::BinaryView).unwrap(); + let string_float_array_normal = build_string_float_array(5_000, 0.1); + let float64_array_cast_to_decimal = build_float64_array_for_cast_to_decimal(8_000, 0.1); + let invalid_float64_array_to_decimal = build_float64_array_invalid_items(8_000, 0.1); + c.bench_function("cast int32 to int32 512", |b| { b.iter(|| cast_array(&i32_array, DataType::Int32)) }); @@ -280,22 +299,6 @@ fn add_benchmark(c: &mut Criterion) { b.iter(|| cast_array(&utf8_date_time_array, DataType::Date64)) }); - c.bench_function("cast decimal32 to decimal32 512", |b| { - b.iter(|| cast_array(&decimal32_array, DataType::Decimal32(9, 4))) - }); - c.bench_function("cast decimal32 to decimal32 512 lower precision", |b| { - b.iter(|| cast_array(&decimal32_array, DataType::Decimal32(6, 5))) - }); - c.bench_function("cast decimal32 to decimal64 512", |b| { - b.iter(|| cast_array(&decimal32_array, DataType::Decimal64(11, 5))) - }); - c.bench_function("cast decimal64 to decimal32 512", |b| { - b.iter(|| cast_array(&decimal64_array, DataType::Decimal32(9, 2))) - }); - c.bench_function("cast decimal64 to decimal64 512", |b| { - b.iter(|| cast_array(&decimal64_array, DataType::Decimal64(12, 4))) - }); - c.bench_function("cast decimal128 to decimal128 512", |b| { b.iter(|| cast_array(&decimal128_array, DataType::Decimal128(30, 5))) }); @@ -360,6 +363,58 @@ fn add_benchmark(c: &mut Criterion) { b.iter(|| cast_array(&binary_view_array, DataType::Utf8View)) }); + macro_rules! benchmark_cast { + ($name: expr, $input_array: ident, $target_type: expr) => { + c.bench_function(stringify!($name), |b| { + b.iter(|| cast_array(&$input_array, $target_type)) + }); + }; + } + + // cast string with normal items to decimals + benchmark_cast!( + "cast string to decimal128(38, 3)", + string_float_array_normal, + DataType::Decimal128(38, 3) + ); + + // cast float64 to decimals + benchmark_cast!( + "cast float64 to decimal128(32, 3)", + float64_array_cast_to_decimal, + DataType::Decimal128(32, 3) + ); + + // cast invalid float64 to decimals + benchmark_cast!( + "cast invalid float64 to to decimal128(32, 3)", + invalid_float64_array_to_decimal, + DataType::Decimal128(32, 3) + ); + + // cast decimals to float/integers + benchmark_cast!( + "cast decimal128 to float64", + decimal128_array, + DataType::Float64 + ); + benchmark_cast!( + "cast decimal128 to int64", + decimal128_array, + DataType::Int64 + ); + + benchmark_cast!( + "cast decimal256 to float64", + decimal256_array, + DataType::Float64 + ); + benchmark_cast!( + "cast decimal256 to int64", + decimal256_array, + DataType::Int64 + ); + c.bench_function("cast string single run to ree<int32>", |b| { let source_array = StringArray::from(vec!["a"; 8192]); let array_ref = Arc::new(source_array) as ArrayRef; diff --git a/arrow/src/util/bench_util.rs b/arrow/src/util/bench_util.rs index aba95ba4aa..7a5d854e4b 100644 --- a/arrow/src/util/bench_util.rs +++ b/arrow/src/util/bench_util.rs @@ -51,6 +51,29 @@ where .collect() } +/// Creates a random (but fixed-seeded) array of a given size and null density with a specific range +pub fn create_primitive_array_range<T>( + size: usize, + null_density: f32, + range: Range<T::Native>, +) -> PrimitiveArray<T> +where + T: ArrowPrimitiveType, + T::Native: SampleUniform, +{ + let mut rng = seedable_rng(); + + (0..size) + .map(|_| { + if rng.random::<f32>() < null_density { + None + } else { + Some(rng.random_range(range.clone())) + } + }) + .collect() +} + /// Creates a [`PrimitiveArray`] of a given `size` and `null_density` /// filling it with random numbers generated using the provided `seed`. pub fn create_primitive_array_with_seed<T>(
