This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new bb9a4a7ea1 bench: increase in_list benchmark coverage (#19443)
bb9a4a7ea1 is described below
commit bb9a4a7ea19896fab7da6554cf5dd103e50d8e74
Author: Geoffrey Claude <[email protected]>
AuthorDate: Mon Dec 22 20:35:19 2025 +0100
bench: increase in_list benchmark coverage (#19443)
## Which issue does this PR close?
- Related to #19241
## Rationale for this change
This PR adds benchmarks and tests to ground upcoming `in_list`
optimizations:
1. **Realistic Data Patterns**: Adds mixed-length string benchmarks to
accurately measure the `StringView` two-stage lookup (prefix check +
validation) performance across variable lengths.
2. **Type Coverage**: Adds baseline tests for temporal and decimal types
to ensure correctness before they are migrated to specialized evaluation
paths.
## What changes are included in this PR?
- **Mixed-Length Benchmarks**: Scenarios for `StringArray` and
`StringViewArray` with variable lengths, match rates, and null
densities.
- **Extended Tests**: Coverage for esoteric types (Temporal, Duration,
Interval, Decimal256) in `physical-expr`.
## Are these changes tested?
Yes, via new unit tests and benchmark verification.
## Are there any user-facing changes?
No.
---
datafusion/physical-expr/benches/in_list.rs | 77 ++++++
.../physical-expr/src/expressions/in_list.rs | 302 ++++++++++++++++++++-
2 files changed, 369 insertions(+), 10 deletions(-)
diff --git a/datafusion/physical-expr/benches/in_list.rs
b/datafusion/physical-expr/benches/in_list.rs
index 9fe15febe5..954715d0e5 100644
--- a/datafusion/physical-expr/benches/in_list.rs
+++ b/datafusion/physical-expr/benches/in_list.rs
@@ -54,6 +54,10 @@ const NULL_PERCENTS: [f64; 2] = [0., 0.2];
const STRING_LENGTHS: [usize; 3] = [3, 12, 100];
const ARRAY_LENGTH: usize = 8192;
+/// Candidate string lengths (in bytes) for the mixed-length benchmarks.
+/// Half of the entries are short (≤12 bytes), half are long (>12 bytes).
const MIXED_STRING_LENGTHS: &[usize] = &[3, 6, 9, 12, 16, 20, 25, 30];
+
/// Returns a friendly type name for the array type.
fn array_type_name<A: 'static>() -> &'static str {
let id = TypeId::of::<A>();
@@ -150,6 +154,71 @@ fn bench_numeric_type<T, A>(
}
}
+/// Returns a random string whose length is picked at random from `MIXED_STRING_LENGTHS`.
+fn random_mixed_length_string(rng: &mut StdRng) -> String {
+ let len = *MIXED_STRING_LENGTHS.choose(rng).unwrap(); // slice is a non-empty const, so `choose` yields Some
+ random_string(rng, len)
+}
+
+/// Benchmarks `IN` list evaluation over mixed-length random strings.
+///
+/// Registers one benchmark per combination of:
+/// - IN list length (each entry of `IN_LIST_LENGTHS`), filled with mixed-length random strings
+/// - match rate (0%, 25%, 75%): probability that a non-null value is copied from the IN list
+/// - null rate (each entry of `NULL_PERCENTS`): probability that a value is null
+/// `make_scalar` converts each IN list string into the `ScalarValue` variant under test.
+fn bench_realistic_mixed_strings<A>(
+ c: &mut Criterion,
+ rng: &mut StdRng,
+ make_scalar: fn(String) -> ScalarValue,
+) where
+ A: Array + FromIterator<Option<String>> + 'static,
+{
+ for in_list_length in IN_LIST_LENGTHS {
+ for match_percent in [0.0, 0.25, 0.75] {
+ for null_percent in NULL_PERCENTS {
+ // Build the IN list from freshly generated mixed-length random strings
+ let in_list_strings: Vec<String> = (0..in_list_length)
+ .map(|_| random_mixed_length_string(rng))
+ .collect();
+
+ let in_list: Vec<_> = in_list_strings
+ .iter()
+ .map(|s| make_scalar(s.clone()))
+ .collect();
+
+ // Build the ARRAY_LENGTH-element values array with the requested null and match rates
+ let values: A = (0..ARRAY_LENGTH)
+ .map(|_| {
+ if !rng.random_bool(1.0 - null_percent) { // true with probability `null_percent`
+ None
+ } else if rng.random_bool(match_percent) {
+ // Copy an entry of the IN list (guaranteed to match)
+ Some(in_list_strings.choose(rng).unwrap().clone())
+ } else {
+ // Generate a new random string (unlikely to match)
+ Some(random_mixed_length_string(rng))
+ }
+ })
+ .collect();
+
+ do_bench(
+ c,
+ &format!(
+ "in_list/{}/mixed/list={}/match={}%/nulls={}%",
+ array_type_name::<A>(),
+ in_list_length,
+ (match_percent * 100.0) as u32,
+ (null_percent * 100.0) as u32
+ ),
+ Arc::new(values),
+ &in_list,
+ );
+ }
+ }
+ }
+}
+
/// Entry point: registers in_list benchmarks for string and numeric array
types.
fn criterion_benchmark(c: &mut Criterion) {
let mut rng = StdRng::seed_from_u64(120320);
@@ -158,6 +227,14 @@ fn criterion_benchmark(c: &mut Criterion) {
bench_string_type::<StringArray>(c, &mut rng, |s|
ScalarValue::Utf8(Some(s)));
bench_string_type::<StringViewArray>(c, &mut rng, |s|
ScalarValue::Utf8View(Some(s)));
+ // Realistic mixed-length string benchmarks (TPC-H style)
+ bench_realistic_mixed_strings::<StringArray>(c, &mut rng, |s| {
+ ScalarValue::Utf8(Some(s))
+ });
+ bench_realistic_mixed_strings::<StringViewArray>(c, &mut rng, |s| {
+ ScalarValue::Utf8View(Some(s))
+ });
+
// Benchmarks for numeric types
bench_numeric_type::<u8, UInt8Array>(
c,
diff --git a/datafusion/physical-expr/src/expressions/in_list.rs
b/datafusion/physical-expr/src/expressions/in_list.rs
index b6b67c85c4..5c2f1adcd0 100644
--- a/datafusion/physical-expr/src/expressions/in_list.rs
+++ b/datafusion/physical-expr/src/expressions/in_list.rs
@@ -898,6 +898,7 @@ mod tests {
use super::*;
use crate::expressions::{col, lit, try_cast};
use arrow::buffer::NullBuffer;
+ use arrow::datatypes::{IntervalDayTime, IntervalMonthDayNano, i256};
use datafusion_common::plan_err;
use datafusion_expr::type_coercion::binary::comparison_coercion;
use datafusion_physical_expr_common::physical_expr::fmt_sql;
@@ -1335,16 +1336,59 @@ mod tests {
/// Test data: 0 (in list), 2000 (not in list), [1000, 3000] (other list
values)
#[test]
fn in_list_timestamp_types() -> Result<()> {
- run_test_cases(vec![InListPrimitiveTestCase {
- name: "timestamp_nanosecond",
- value_in: ScalarValue::TimestampNanosecond(Some(0), None),
- value_not_in: ScalarValue::TimestampNanosecond(Some(2000), None),
- other_list_values: vec![
- ScalarValue::TimestampNanosecond(Some(1000), None),
- ScalarValue::TimestampNanosecond(Some(3000), None),
- ],
- null_value: Some(ScalarValue::TimestampNanosecond(None, None)),
- }])
+ run_test_cases(vec![
+ InListPrimitiveTestCase {
+ name: "timestamp_nanosecond",
+ value_in: ScalarValue::TimestampNanosecond(Some(0), None),
+ value_not_in: ScalarValue::TimestampNanosecond(Some(2000),
None),
+ other_list_values: vec![
+ ScalarValue::TimestampNanosecond(Some(1000), None),
+ ScalarValue::TimestampNanosecond(Some(3000), None),
+ ],
+ null_value: Some(ScalarValue::TimestampNanosecond(None, None)),
+ },
+ InListPrimitiveTestCase {
+ name: "timestamp_millisecond_with_tz",
+ value_in: ScalarValue::TimestampMillisecond(
+ Some(1500000),
+ Some("+05:00".into()),
+ ),
+ value_not_in: ScalarValue::TimestampMillisecond(
+ Some(2500000),
+ Some("+05:00".into()),
+ ),
+ other_list_values: vec![ScalarValue::TimestampMillisecond(
+ Some(3500000),
+ Some("+05:00".into()),
+ )],
+ null_value: Some(ScalarValue::TimestampMillisecond(
+ None,
+ Some("+05:00".into()),
+ )),
+ },
+ InListPrimitiveTestCase {
+ name: "timestamp_millisecond_mixed_tz",
+ value_in: ScalarValue::TimestampMillisecond(
+ Some(1500000),
+ Some("+05:00".into()),
+ ),
+ value_not_in: ScalarValue::TimestampMillisecond(
+ Some(2500000),
+ Some("+05:00".into()),
+ ),
+ other_list_values: vec![
+ ScalarValue::TimestampMillisecond(
+ Some(3500000),
+ Some("+01:00".into()),
+ ),
+ ScalarValue::TimestampMillisecond(Some(4500000),
Some("UTC".into())),
+ ],
+ null_value: Some(ScalarValue::TimestampMillisecond(
+ None,
+ Some("+05:00".into()),
+ )),
+ },
+ ])
}
#[test]
@@ -3225,4 +3269,242 @@ mod tests {
Ok(())
}
+
+ #[test]
+ fn test_in_list_esoteric_types() -> Result<()> {
+ // Test esoteric/less common types to validate the transform and
mapping flow.
+ // These types are reinterpreted to base primitive types (e.g.,
Timestamp -> UInt64,
+ // Interval -> Decimal128, Float16 -> UInt16). We just need to verify
basic
+ // functionality works - no need for comprehensive null handling tests.
+
+ // Helper: builds a one-column batch from `in_array` and checks that IN over `list_values` yields [Some(true), Some(false)]
+ let test_type = |data_type: DataType,
+ in_array: ArrayRef,
+ list_values: Vec<ScalarValue>|
+ -> Result<()> {
+ let schema = Schema::new(vec![Field::new("a", data_type.clone(),
false)]);
+ let col_a = col("a", &schema)?;
+ let batch = RecordBatch::try_new(Arc::new(schema.clone()),
vec![in_array])?;
+
+ let list = list_values.into_iter().map(lit).collect();
+ in_list!(
+ batch,
+ list,
+ &false,
+ vec![Some(true), Some(false)],
+ col_a,
+ &schema
+ );
+ Ok(())
+ };
+
+ // Timestamp types without timezone: second, millisecond and microsecond units
+ test_type(
+ DataType::Timestamp(TimeUnit::Second, None),
+ Arc::new(TimestampSecondArray::from(vec![Some(1000), Some(2000)])),
+ vec![
+ ScalarValue::TimestampSecond(Some(1000), None),
+ ScalarValue::TimestampSecond(Some(1500), None),
+ ],
+ )?;
+
+ test_type(
+ DataType::Timestamp(TimeUnit::Millisecond, None),
+ Arc::new(TimestampMillisecondArray::from(vec![
+ Some(1000000),
+ Some(2000000),
+ ])),
+ vec![
+ ScalarValue::TimestampMillisecond(Some(1000000), None),
+ ScalarValue::TimestampMillisecond(Some(1500000), None),
+ ],
+ )?;
+
+ test_type(
+ DataType::Timestamp(TimeUnit::Microsecond, None),
+ Arc::new(TimestampMicrosecondArray::from(vec![
+ Some(1000000000),
+ Some(2000000000),
+ ])),
+ vec![
+ ScalarValue::TimestampMicrosecond(Some(1000000000), None),
+ ScalarValue::TimestampMicrosecond(Some(1500000000), None),
+ ],
+ )?;
+
+ // Time32 and Time64 (map to Int32 -> UInt32 and Int64 -> UInt64
respectively)
+ test_type(
+ DataType::Time32(TimeUnit::Second),
+ Arc::new(Time32SecondArray::from(vec![Some(3600), Some(7200)])),
+ vec![
+ ScalarValue::Time32Second(Some(3600)),
+ ScalarValue::Time32Second(Some(5400)),
+ ],
+ )?;
+
+ test_type(
+ DataType::Time32(TimeUnit::Millisecond),
+ Arc::new(Time32MillisecondArray::from(vec![
+ Some(3600000),
+ Some(7200000),
+ ])),
+ vec![
+ ScalarValue::Time32Millisecond(Some(3600000)),
+ ScalarValue::Time32Millisecond(Some(5400000)),
+ ],
+ )?;
+
+ test_type(
+ DataType::Time64(TimeUnit::Microsecond),
+ Arc::new(Time64MicrosecondArray::from(vec![
+ Some(3600000000),
+ Some(7200000000),
+ ])),
+ vec![
+ ScalarValue::Time64Microsecond(Some(3600000000)),
+ ScalarValue::Time64Microsecond(Some(5400000000)),
+ ],
+ )?;
+
+ test_type(
+ DataType::Time64(TimeUnit::Nanosecond),
+ Arc::new(Time64NanosecondArray::from(vec![
+ Some(3600000000000),
+ Some(7200000000000),
+ ])),
+ vec![
+ ScalarValue::Time64Nanosecond(Some(3600000000000)),
+ ScalarValue::Time64Nanosecond(Some(5400000000000)),
+ ],
+ )?;
+
+ // Duration types, all four time units (map to Int64 -> UInt64)
+ test_type(
+ DataType::Duration(TimeUnit::Second),
+ Arc::new(DurationSecondArray::from(vec![Some(86400),
Some(172800)])),
+ vec![
+ ScalarValue::DurationSecond(Some(86400)),
+ ScalarValue::DurationSecond(Some(129600)),
+ ],
+ )?;
+
+ test_type(
+ DataType::Duration(TimeUnit::Millisecond),
+ Arc::new(DurationMillisecondArray::from(vec![
+ Some(86400000),
+ Some(172800000),
+ ])),
+ vec![
+ ScalarValue::DurationMillisecond(Some(86400000)),
+ ScalarValue::DurationMillisecond(Some(129600000)),
+ ],
+ )?;
+
+ test_type(
+ DataType::Duration(TimeUnit::Microsecond),
+ Arc::new(DurationMicrosecondArray::from(vec![
+ Some(86400000000),
+ Some(172800000000),
+ ])),
+ vec![
+ ScalarValue::DurationMicrosecond(Some(86400000000)),
+ ScalarValue::DurationMicrosecond(Some(129600000000)),
+ ],
+ )?;
+
+ test_type(
+ DataType::Duration(TimeUnit::Nanosecond),
+ Arc::new(DurationNanosecondArray::from(vec![
+ Some(86400000000000),
+ Some(172800000000000),
+ ])),
+ vec![
+ ScalarValue::DurationNanosecond(Some(86400000000000)),
+ ScalarValue::DurationNanosecond(Some(129600000000000)),
+ ],
+ )?;
+
+ // Interval types. NOTE(review): was "map to 16-byte Decimal128Type", but presumably only MonthDayNano is 16 bytes (YearMonth 4, DayTime 8) - confirm
+ test_type(
+ DataType::Interval(IntervalUnit::YearMonth),
+ Arc::new(IntervalYearMonthArray::from(vec![Some(12), Some(24)])),
+ vec![
+ ScalarValue::IntervalYearMonth(Some(12)),
+ ScalarValue::IntervalYearMonth(Some(18)),
+ ],
+ )?;
+
+ test_type(
+ DataType::Interval(IntervalUnit::DayTime),
+ Arc::new(IntervalDayTimeArray::from(vec![
+ Some(IntervalDayTime {
+ days: 1,
+ milliseconds: 0,
+ }),
+ Some(IntervalDayTime {
+ days: 2,
+ milliseconds: 0,
+ }),
+ ])),
+ vec![
+ ScalarValue::IntervalDayTime(Some(IntervalDayTime {
+ days: 1,
+ milliseconds: 0,
+ })),
+ ScalarValue::IntervalDayTime(Some(IntervalDayTime {
+ days: 1,
+ milliseconds: 500,
+ })),
+ ],
+ )?;
+
+ test_type(
+ DataType::Interval(IntervalUnit::MonthDayNano),
+ Arc::new(IntervalMonthDayNanoArray::from(vec![
+ Some(IntervalMonthDayNano {
+ months: 1,
+ days: 0,
+ nanoseconds: 0,
+ }),
+ Some(IntervalMonthDayNano {
+ months: 2,
+ days: 0,
+ nanoseconds: 0,
+ }),
+ ])),
+ vec![
+ ScalarValue::IntervalMonthDayNano(Some(IntervalMonthDayNano {
+ months: 1,
+ days: 0,
+ nanoseconds: 0,
+ })),
+ ScalarValue::IntervalMonthDayNano(Some(IntervalMonthDayNano {
+ months: 1,
+ days: 15,
+ nanoseconds: 0,
+ })),
+ ],
+ )?;
+
+ // Decimal256. NOTE(review): was "maps to Decimal128Type for 16-byte width", but i256 values are 32 bytes wide - confirm the intended mapping
+ // with_precision_and_scale() sets the array's precision/scale to the same values used in the Decimal256 column type below
+ let precision = 38;
+ let scale = 10;
+ test_type(
+ DataType::Decimal256(precision, scale),
+ Arc::new(
+ Decimal256Array::from(vec![
+ Some(i256::from(12345)),
+ Some(i256::from(67890)),
+ ])
+ .with_precision_and_scale(precision, scale)?,
+ ),
+ vec![
+ ScalarValue::Decimal256(Some(i256::from(12345)), precision,
scale),
+ ScalarValue::Decimal256(Some(i256::from(54321)), precision,
scale),
+ ],
+ )?;
+
+ Ok(())
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]