(datafusion) branch main updated: bench: increase in_list benchmark coverage (#19443)

github-bot Mon, 22 Dec 2025 11:35:48 -0800

This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git



The following commit(s) were added to refs/heads/main by this push:
     new bb9a4a7ea1 bench: increase in_list benchmark coverage (#19443)
bb9a4a7ea1 is described below

commit bb9a4a7ea19896fab7da6554cf5dd103e50d8e74
Author: Geoffrey Claude <[email protected]>
AuthorDate: Mon Dec 22 20:35:19 2025 +0100

    bench: increase in_list benchmark coverage (#19443)
    
    ## Which issue does this PR close?
    
    - Related to #19241
    
    ## Rationale for this change
    
    This PR adds benchmarks and tests to ground upcoming `in_list`
    optimizations:
    
    1. **Realistic Data Patterns**: Adds mixed-length string benchmarks to
    accurately measure the `StringView` two-stage lookup (prefix check +
    validation) performance across variable lengths.
    
    2. **Type Coverage**: Adds baseline tests for temporal and decimal types
    to ensure correctness before they are migrated to specialized evaluation
    paths.
    
    ## What changes are included in this PR?
    
    - **Mixed-Length Benchmarks**: Scenarios for `StringArray` and
    `StringViewArray` with variable lengths, match rates, and null
    densities.
    
    - **Extended Tests**: Coverage for esoteric types (Temporal, Duration,
    Interval, Decimal256) in `physical-expr`.
    
    ## Are these changes tested?
    
    Yes, via new unit tests and benchmark verification.
    
    ## Are there any user-facing changes?
    
    No.
---
 datafusion/physical-expr/benches/in_list.rs        |  77 ++++++
 .../physical-expr/src/expressions/in_list.rs       | 302 ++++++++++++++++++++-
 2 files changed, 369 insertions(+), 10 deletions(-)

diff --git a/datafusion/physical-expr/benches/in_list.rs b/datafusion/physical-expr/benches/in_list.rs
index 9fe15febe5..954715d0e5 100644
--- a/datafusion/physical-expr/benches/in_list.rs
+++ b/datafusion/physical-expr/benches/in_list.rs
@@ -54,6 +54,10 @@ const NULL_PERCENTS: [f64; 2] = [0., 0.2];
 const STRING_LENGTHS: [usize; 3] = [3, 12, 100];
 const ARRAY_LENGTH: usize = 8192;
 
+/// Mixed string lengths for realistic benchmarks.
+/// ~50% short (≤12 bytes), ~50% long (>12 bytes).
+const MIXED_STRING_LENGTHS: &[usize] = &[3, 6, 9, 12, 16, 20, 25, 30];
+
 /// Returns a friendly type name for the array type.
 fn array_type_name<A: 'static>() -> &'static str {
     let id = TypeId::of::<A>();
@@ -150,6 +154,71 @@ fn bench_numeric_type<T, A>(
     }
 }
 
+/// Generates a random string whose length is drawn at random from `MIXED_STRING_LENGTHS`.
+fn random_mixed_length_string(rng: &mut StdRng) -> String {
+    let len = *MIXED_STRING_LENGTHS.choose(rng).unwrap(); // non-empty const: never None
+    random_string(rng, len)
+}
+
+/// Benchmarks `IN` list evaluation over mixed-length strings for array type `A`.
+///
+/// One `do_bench` call is made per combination of:
+/// - IN list size (each entry of `IN_LIST_LENGTHS`), filled with mixed-length random strings
+/// - Match rate (0%, 25%, 75%): chance that a non-null value is copied from the IN list
+/// - Null rate (each entry of `NULL_PERCENTS`): chance that a value is null
+/// - Non-matching values are fresh random strings, so prefixes vary and hits are unlikely
+fn bench_realistic_mixed_strings<A>(
+    c: &mut Criterion,
+    rng: &mut StdRng,
+    make_scalar: fn(String) -> ScalarValue,
+) where
+    A: Array + FromIterator<Option<String>> + 'static,
+{
+    for in_list_length in IN_LIST_LENGTHS {
+        for match_percent in [0.0, 0.25, 0.75] {
+            for null_percent in NULL_PERCENTS {
+                // Keep the raw strings so matching array values can be copied from them
+                let in_list_strings: Vec<String> = (0..in_list_length)
+                    .map(|_| random_mixed_length_string(rng))
+                    .collect();
+
+                let in_list: Vec<_> = in_list_strings
+                    .iter()
+                    .map(|s| make_scalar(s.clone()))
+                    .collect();
+
+                // Null check comes first, so `match_percent` applies to non-null values only
+                let values: A = (0..ARRAY_LENGTH)
+                    .map(|_| {
+                        if !rng.random_bool(1.0 - null_percent) {
+                            None
+                        } else if rng.random_bool(match_percent) {
+                            // Pick from IN list (will match)
+                            Some(in_list_strings.choose(rng).unwrap().clone())
+                        } else {
+                            // Generate new random string (unlikely to match)
+                            Some(random_mixed_length_string(rng))
+                        }
+                    })
+                    .collect();
+
+                do_bench(
+                    c,
+                    &format!(
+                        "in_list/{}/mixed/list={}/match={}%/nulls={}%",
+                        array_type_name::<A>(),
+                        in_list_length,
+                        (match_percent * 100.0) as u32,
+                        (null_percent * 100.0) as u32
+                    ),
+                    Arc::new(values),
+                    &in_list,
+                );
+            }
+        }
+    }
+}
+
 /// Entry point: registers in_list benchmarks for string and numeric array types.
 fn criterion_benchmark(c: &mut Criterion) {
     let mut rng = StdRng::seed_from_u64(120320);
@@ -158,6 +227,14 @@ fn criterion_benchmark(c: &mut Criterion) {
     bench_string_type::<StringArray>(c, &mut rng, |s| ScalarValue::Utf8(Some(s)));
     bench_string_type::<StringViewArray>(c, &mut rng, |s| ScalarValue::Utf8View(Some(s)));
 
+    // Realistic mixed-length string benchmarks (TPC-H style)
+    bench_realistic_mixed_strings::<StringArray>(c, &mut rng, |s| {
+        ScalarValue::Utf8(Some(s))
+    });
+    bench_realistic_mixed_strings::<StringViewArray>(c, &mut rng, |s| {
+        ScalarValue::Utf8View(Some(s))
+    });
+
     // Benchmarks for numeric types
     bench_numeric_type::<u8, UInt8Array>(
         c,
diff --git a/datafusion/physical-expr/src/expressions/in_list.rs b/datafusion/physical-expr/src/expressions/in_list.rs
index b6b67c85c4..5c2f1adcd0 100644
--- a/datafusion/physical-expr/src/expressions/in_list.rs
+++ b/datafusion/physical-expr/src/expressions/in_list.rs
@@ -898,6 +898,7 @@ mod tests {
     use super::*;
     use crate::expressions::{col, lit, try_cast};
     use arrow::buffer::NullBuffer;
+    use arrow::datatypes::{IntervalDayTime, IntervalMonthDayNano, i256};
     use datafusion_common::plan_err;
     use datafusion_expr::type_coercion::binary::comparison_coercion;
     use datafusion_physical_expr_common::physical_expr::fmt_sql;
@@ -1335,16 +1336,59 @@ mod tests {
     /// Test data: 0 (in list), 2000 (not in list), [1000, 3000] (other list values)
     #[test]
     fn in_list_timestamp_types() -> Result<()> {
-        run_test_cases(vec![InListPrimitiveTestCase {
-            name: "timestamp_nanosecond",
-            value_in: ScalarValue::TimestampNanosecond(Some(0), None),
-            value_not_in: ScalarValue::TimestampNanosecond(Some(2000), None),
-            other_list_values: vec![
-                ScalarValue::TimestampNanosecond(Some(1000), None),
-                ScalarValue::TimestampNanosecond(Some(3000), None),
-            ],
-            null_value: Some(ScalarValue::TimestampNanosecond(None, None)),
-        }])
+        run_test_cases(vec![
+            InListPrimitiveTestCase {
+                name: "timestamp_nanosecond",
+                value_in: ScalarValue::TimestampNanosecond(Some(0), None),
+                value_not_in: ScalarValue::TimestampNanosecond(Some(2000), None),
+                other_list_values: vec![
+                    ScalarValue::TimestampNanosecond(Some(1000), None),
+                    ScalarValue::TimestampNanosecond(Some(3000), None),
+                ],
+                null_value: Some(ScalarValue::TimestampNanosecond(None, None)),
+            },
+            InListPrimitiveTestCase {
+                name: "timestamp_millisecond_with_tz",
+                value_in: ScalarValue::TimestampMillisecond(
+                    Some(1500000),
+                    Some("+05:00".into()),
+                ),
+                value_not_in: ScalarValue::TimestampMillisecond(
+                    Some(2500000),
+                    Some("+05:00".into()),
+                ),
+                other_list_values: vec![ScalarValue::TimestampMillisecond(
+                    Some(3500000),
+                    Some("+05:00".into()),
+                )],
+                null_value: Some(ScalarValue::TimestampMillisecond(
+                    None,
+                    Some("+05:00".into()),
+                )),
+            },
+            InListPrimitiveTestCase {
+                name: "timestamp_millisecond_mixed_tz",
+                value_in: ScalarValue::TimestampMillisecond(
+                    Some(1500000),
+                    Some("+05:00".into()),
+                ),
+                value_not_in: ScalarValue::TimestampMillisecond(
+                    Some(2500000),
+                    Some("+05:00".into()),
+                ),
+                other_list_values: vec![
+                    ScalarValue::TimestampMillisecond(
+                        Some(3500000),
+                        Some("+01:00".into()),
+                    ),
+                    ScalarValue::TimestampMillisecond(Some(4500000), Some("UTC".into())),
+                ],
+                null_value: Some(ScalarValue::TimestampMillisecond(
+                    None,
+                    Some("+05:00".into()),
+                )),
+            },
+        ])
     }
 
     #[test]
@@ -3225,4 +3269,242 @@ mod tests {
 
         Ok(())
     }
+
+    #[test]
+    fn test_in_list_esoteric_types() -> Result<()> {
+        // Test esoteric/less common types to validate the transform and 
mapping flow.
+        // These types are reinterpreted to base primitive types (e.g., 
Timestamp -> UInt64,
+        // Interval -> Decimal128, Float16 -> UInt16). We just need to verify 
basic
+        // functionality works - no need for comprehensive null handling tests.
+
+        // Helper: simple IN test that expects [Some(true), Some(false)]
+        let test_type = |data_type: DataType,
+                         in_array: ArrayRef,
+                         list_values: Vec<ScalarValue>|
+         -> Result<()> {
+            let schema = Schema::new(vec![Field::new("a", data_type.clone(), 
false)]);
+            let col_a = col("a", &schema)?;
+            let batch = RecordBatch::try_new(Arc::new(schema.clone()), 
vec![in_array])?;
+
+            let list = list_values.into_iter().map(lit).collect();
+            in_list!(
+                batch,
+                list,
+                &false,
+                vec![Some(true), Some(false)],
+                col_a,
+                &schema
+            );
+            Ok(())
+        };
+
+        // Timestamp types (all units map to Int64 -> UInt64)
+        test_type(
+            DataType::Timestamp(TimeUnit::Second, None),
+            Arc::new(TimestampSecondArray::from(vec![Some(1000), Some(2000)])),
+            vec![
+                ScalarValue::TimestampSecond(Some(1000), None),
+                ScalarValue::TimestampSecond(Some(1500), None),
+            ],
+        )?;
+
+        test_type(
+            DataType::Timestamp(TimeUnit::Millisecond, None),
+            Arc::new(TimestampMillisecondArray::from(vec![
+                Some(1000000),
+                Some(2000000),
+            ])),
+            vec![
+                ScalarValue::TimestampMillisecond(Some(1000000), None),
+                ScalarValue::TimestampMillisecond(Some(1500000), None),
+            ],
+        )?;
+
+        test_type(
+            DataType::Timestamp(TimeUnit::Microsecond, None),
+            Arc::new(TimestampMicrosecondArray::from(vec![
+                Some(1000000000),
+                Some(2000000000),
+            ])),
+            vec![
+                ScalarValue::TimestampMicrosecond(Some(1000000000), None),
+                ScalarValue::TimestampMicrosecond(Some(1500000000), None),
+            ],
+        )?;
+
+        // Time32 and Time64 (map to Int32 -> UInt32 and Int64 -> UInt64 
respectively)
+        test_type(
+            DataType::Time32(TimeUnit::Second),
+            Arc::new(Time32SecondArray::from(vec![Some(3600), Some(7200)])),
+            vec![
+                ScalarValue::Time32Second(Some(3600)),
+                ScalarValue::Time32Second(Some(5400)),
+            ],
+        )?;
+
+        test_type(
+            DataType::Time32(TimeUnit::Millisecond),
+            Arc::new(Time32MillisecondArray::from(vec![
+                Some(3600000),
+                Some(7200000),
+            ])),
+            vec![
+                ScalarValue::Time32Millisecond(Some(3600000)),
+                ScalarValue::Time32Millisecond(Some(5400000)),
+            ],
+        )?;
+
+        test_type(
+            DataType::Time64(TimeUnit::Microsecond),
+            Arc::new(Time64MicrosecondArray::from(vec![
+                Some(3600000000),
+                Some(7200000000),
+            ])),
+            vec![
+                ScalarValue::Time64Microsecond(Some(3600000000)),
+                ScalarValue::Time64Microsecond(Some(5400000000)),
+            ],
+        )?;
+
+        test_type(
+            DataType::Time64(TimeUnit::Nanosecond),
+            Arc::new(Time64NanosecondArray::from(vec![
+                Some(3600000000000),
+                Some(7200000000000),
+            ])),
+            vec![
+                ScalarValue::Time64Nanosecond(Some(3600000000000)),
+                ScalarValue::Time64Nanosecond(Some(5400000000000)),
+            ],
+        )?;
+
+        // Duration types (map to Int64 -> UInt64)
+        test_type(
+            DataType::Duration(TimeUnit::Second),
+            Arc::new(DurationSecondArray::from(vec![Some(86400), 
Some(172800)])),
+            vec![
+                ScalarValue::DurationSecond(Some(86400)),
+                ScalarValue::DurationSecond(Some(129600)),
+            ],
+        )?;
+
+        test_type(
+            DataType::Duration(TimeUnit::Millisecond),
+            Arc::new(DurationMillisecondArray::from(vec![
+                Some(86400000),
+                Some(172800000),
+            ])),
+            vec![
+                ScalarValue::DurationMillisecond(Some(86400000)),
+                ScalarValue::DurationMillisecond(Some(129600000)),
+            ],
+        )?;
+
+        test_type(
+            DataType::Duration(TimeUnit::Microsecond),
+            Arc::new(DurationMicrosecondArray::from(vec![
+                Some(86400000000),
+                Some(172800000000),
+            ])),
+            vec![
+                ScalarValue::DurationMicrosecond(Some(86400000000)),
+                ScalarValue::DurationMicrosecond(Some(129600000000)),
+            ],
+        )?;
+
+        test_type(
+            DataType::Duration(TimeUnit::Nanosecond),
+            Arc::new(DurationNanosecondArray::from(vec![
+                Some(86400000000000),
+                Some(172800000000000),
+            ])),
+            vec![
+                ScalarValue::DurationNanosecond(Some(86400000000000)),
+                ScalarValue::DurationNanosecond(Some(129600000000000)),
+            ],
+        )?;
+
+        // Interval types (map to 16-byte Decimal128Type)
+        test_type(
+            DataType::Interval(IntervalUnit::YearMonth),
+            Arc::new(IntervalYearMonthArray::from(vec![Some(12), Some(24)])),
+            vec![
+                ScalarValue::IntervalYearMonth(Some(12)),
+                ScalarValue::IntervalYearMonth(Some(18)),
+            ],
+        )?;
+
+        test_type(
+            DataType::Interval(IntervalUnit::DayTime),
+            Arc::new(IntervalDayTimeArray::from(vec![
+                Some(IntervalDayTime {
+                    days: 1,
+                    milliseconds: 0,
+                }),
+                Some(IntervalDayTime {
+                    days: 2,
+                    milliseconds: 0,
+                }),
+            ])),
+            vec![
+                ScalarValue::IntervalDayTime(Some(IntervalDayTime {
+                    days: 1,
+                    milliseconds: 0,
+                })),
+                ScalarValue::IntervalDayTime(Some(IntervalDayTime {
+                    days: 1,
+                    milliseconds: 500,
+                })),
+            ],
+        )?;
+
+        test_type(
+            DataType::Interval(IntervalUnit::MonthDayNano),
+            Arc::new(IntervalMonthDayNanoArray::from(vec![
+                Some(IntervalMonthDayNano {
+                    months: 1,
+                    days: 0,
+                    nanoseconds: 0,
+                }),
+                Some(IntervalMonthDayNano {
+                    months: 2,
+                    days: 0,
+                    nanoseconds: 0,
+                }),
+            ])),
+            vec![
+                ScalarValue::IntervalMonthDayNano(Some(IntervalMonthDayNano {
+                    months: 1,
+                    days: 0,
+                    nanoseconds: 0,
+                })),
+                ScalarValue::IntervalMonthDayNano(Some(IntervalMonthDayNano {
+                    months: 1,
+                    days: 15,
+                    nanoseconds: 0,
+                })),
+            ],
+        )?;
+
+        // Decimal256 (maps to Decimal128Type for 16-byte width)
+        // Need to use with_precision_and_scale() to set the metadata
+        let precision = 38;
+        let scale = 10;
+        test_type(
+            DataType::Decimal256(precision, scale),
+            Arc::new(
+                Decimal256Array::from(vec![
+                    Some(i256::from(12345)),
+                    Some(i256::from(67890)),
+                ])
+                .with_precision_and_scale(precision, scale)?,
+            ),
+            vec![
+                ScalarValue::Decimal256(Some(i256::from(12345)), precision, 
scale),
+                ScalarValue::Decimal256(Some(i256::from(54321)), precision, 
scale),
+            ],
+        )?;
+
+        Ok(())
+    }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(datafusion) branch main updated: bench: increase in_list benchmark coverage (#19443)

Reply via email to