Re: [PR] Add extra case_when benchmarks [datafusion]

via GitHub Fri, 17 Oct 2025 13:53:50 -0700


pepijnve commented on code in PR #18097:
URL: https://github.com/apache/datafusion/pull/18097#discussion_r2438676107



##########
datafusion/physical-expr/benches/case_when.rs:
##########
@@ -54,69 +53,148 @@ fn criterion_benchmark(c: &mut Criterion) {
     let c1 = Arc::new(c1.finish());
     let c2 = Arc::new(c2.finish());
     let c3 = Arc::new(c3.finish());
-    let schema = Schema::new(vec![
-        Field::new("c1", DataType::Int32, true),
-        Field::new("c2", DataType::Utf8, true),
-        Field::new("c3", DataType::Utf8, true),
-    ]);
-    let batch = RecordBatch::try_new(Arc::new(schema), vec![c1, c2, 
c3]).unwrap();
-
-    // use same predicate for all benchmarks
-    let predicate = Arc::new(BinaryExpr::new(
-        make_col("c1", 0),
-        Operator::LtEq,
-        make_lit_i32(500),
-    ));
+    let mut columns: Vec<ArrayRef> = vec![c1, c2, c3];
+    for _ in 3..column_count {
+        columns.push(Arc::new(Int32Array::from_value(0, row_count)));
+    }
 
-    // CASE WHEN c1 <= 500 THEN 1 ELSE 0 END
-    c.bench_function("case_when: scalar or scalar", |b| {
-        let expr = Arc::new(
-            CaseExpr::try_new(
-                None,
-                vec![(predicate.clone(), make_lit_i32(1))],
-                Some(make_lit_i32(0)),
+    let fields = columns
+        .iter()
+        .enumerate()
+        .map(|(i, c)| {
+            Field::new(
+                format!("c{}", i + 1),
+                c.data_type().clone(),
+                c.is_nullable(),
             )
-            .unwrap(),
-        );
-        b.iter(|| black_box(expr.evaluate(black_box(&batch)).unwrap()))
-    });
+        })
+        .collect::<Vec<_>>();
 
-    // CASE WHEN c1 <= 500 THEN c2 [ELSE NULL] END
-    c.bench_function("case_when: column or null", |b| {
-        let expr = Arc::new(
-            CaseExpr::try_new(None, vec![(predicate.clone(), make_col("c2", 
1))], None)
+    let schema = Arc::new(Schema::new(fields));
+    RecordBatch::try_new(Arc::clone(&schema), columns).unwrap()
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    run_benchmarks(c, &make_batch(8192, 3));
+    run_benchmarks(c, &make_batch(8192, 50));
+    run_benchmarks(c, &make_batch(8192, 100));
+}
+
+fn run_benchmarks(c: &mut Criterion, batch: &RecordBatch) {
+    let c1 = col("c1", &batch.schema()).unwrap();
+    let c2 = col("c2", &batch.schema()).unwrap();
+    let c3 = col("c3", &batch.schema()).unwrap();
+
+    // No expression, when/then/else, literal values
+    c.bench_function(
+        format!(
+            "case_when {}x{}: CASE WHEN c1 <= 500 THEN 1 ELSE 0 END",
+            batch.num_rows(),
+            batch.num_columns()
+        )
+        .as_str(),
+        |b| {
+            let expr = Arc::new(
+                case(
+                    None,
+                    vec![(make_x_cmp_y(&c1, Operator::LtEq, 500), lit(1))],
+                    Some(lit(0)),
+                )
                 .unwrap(),
-        );
-        b.iter(|| black_box(expr.evaluate(black_box(&batch)).unwrap()))
-    });
+            );
+            b.iter(|| black_box(expr.evaluate(black_box(batch)).unwrap()))
+        },
+    );
+
+    // No expression, when/then/else, column reference values
+    c.bench_function(
+        format!(
+            "case_when {}x{}: CASE WHEN c1 <= 500 THEN c2 ELSE c3 END",
+            batch.num_rows(),
+            batch.num_columns()
+        )
+        .as_str(),
+        |b| {
+            let expr = Arc::new(
+                case(
+                    None,
+                    vec![(make_x_cmp_y(&c1, Operator::LtEq, 500), 
Arc::clone(&c2))],
+                    Some(Arc::clone(&c3)),
+                )
+                .unwrap(),
+            );
+            b.iter(|| black_box(expr.evaluate(black_box(batch)).unwrap()))
+        },
+    );
 
-    // CASE WHEN c1 <= 500 THEN c2 ELSE c3 END
-    c.bench_function("case_when: expr or expr", |b| {
+    // No expression, when/then, implicit else
+    c.bench_function(
+        format!(
+            "case_when {}x{}: CASE WHEN c1 <= 500 THEN c2 [ELSE NULL] END",
+            batch.num_rows(),
+            batch.num_columns()
+        )
+        .as_str(),
+        |b| {
+            let expr = Arc::new(
+                case(
+                    None,
+                    vec![(make_x_cmp_y(&c1, Operator::LtEq, 500), 
Arc::clone(&c2))],
+                    None,
+                )
+                .unwrap(),
+            );
+            b.iter(|| black_box(expr.evaluate(black_box(batch)).unwrap()))
+        },
+    );
+
+    // With expression, two when/then branches
+    c.bench_function(
+        format!(
+            "case_when {}x{}: CASE c1 WHEN 1 THEN c2 WHEN 2 THEN c3 END",
+            batch.num_rows(),
+            batch.num_columns()
+        )
+        .as_str(),
+        |b| {
+            let expr = Arc::new(
+                case(
+                    Some(Arc::clone(&c1)),
+                    vec![(lit(1), Arc::clone(&c2)), (lit(2), Arc::clone(&c3))],
+                    None,
+                )
+                .unwrap(),
+            );
+            b.iter(|| black_box(expr.evaluate(black_box(batch)).unwrap()))
+        },
+    );
+
+    // Many when/then branches where all are effectively reachable
+    c.bench_function(format!("case_when {}x{}: CASE WHEN c1 == 0 THEN 0 WHEN 
c1 == 1 THEN 1 ... WHEN c1 == n THEN n ELSE n + 1 END", batch.num_rows(), 
batch.num_columns()).as_str(), |b| {
+        let when_thens = (0..batch.num_rows() as i32).map(|i| 
(make_x_cmp_y(&c1, Operator::Eq, i), lit(i))).collect();

Review Comment:
   Intentionally so. This is a torture-test benchmark to really stress the code.
   
   The first 'all reachable' one is really a worst-case scenario test. 
It is intended to measure improvements in the processing that is done in 
each loop iteration (filtering, scattering, etc.).
   
   The second 'few reachable' one is intended to measure the short-circuiting 
behaviour.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] Add extra case_when benchmarks [datafusion]

Reply via email to