This is an automated email from the ASF dual-hosted git repository.
agrove pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-comet.git
The following commit(s) were added to refs/heads/main by this push:
new af3bd81e1 perf: Improve criterion benchmarks for cast string to int (#3049)
af3bd81e1 is described below
commit af3bd81e196b8af54ccccbafc03b6f9bdce26821
Author: Andy Grove <[email protected]>
AuthorDate: Wed Jan 7 09:10:51 2026 -0700
perf: Improve criterion benchmarks for cast string to int (#3049)
---
native/spark-expr/benches/cast_from_string.rs | 98 +++++++++++++++++++++------
1 file changed, 76 insertions(+), 22 deletions(-)
diff --git a/native/spark-expr/benches/cast_from_string.rs b/native/spark-expr/benches/cast_from_string.rs
index 990cdec21..a09afae6e 100644
--- a/native/spark-expr/benches/cast_from_string.rs
+++ b/native/spark-expr/benches/cast_from_string.rs
@@ -23,45 +23,99 @@ use datafusion_comet_spark_expr::{Cast, EvalMode, SparkCastOptions};
use std::sync::Arc;
fn criterion_benchmark(c: &mut Criterion) {
- let batch = create_utf8_batch();
+ let small_int_batch = create_small_int_string_batch();
+ let int_batch = create_int_string_batch();
+ let decimal_batch = create_decimal_string_batch();
let expr = Arc::new(Column::new("a", 0));
+
+ for (mode, mode_name) in [
+ (EvalMode::Legacy, "legacy"),
+ (EvalMode::Ansi, "ansi"),
+ (EvalMode::Try, "try"),
+ ] {
+ let spark_cast_options = SparkCastOptions::new(mode, "", false);
+ let cast_to_i8 = Cast::new(expr.clone(), DataType::Int8,
spark_cast_options.clone());
+ let cast_to_i16 = Cast::new(expr.clone(), DataType::Int16,
spark_cast_options.clone());
+ let cast_to_i32 = Cast::new(expr.clone(), DataType::Int32,
spark_cast_options.clone());
+ let cast_to_i64 = Cast::new(expr.clone(), DataType::Int64,
spark_cast_options);
+
+ let mut group = c.benchmark_group(format!("cast_string_to_int/{}",
mode_name));
+ group.bench_function("i8", |b| {
+ b.iter(|| cast_to_i8.evaluate(&small_int_batch).unwrap());
+ });
+ group.bench_function("i16", |b| {
+ b.iter(|| cast_to_i16.evaluate(&small_int_batch).unwrap());
+ });
+ group.bench_function("i32", |b| {
+ b.iter(|| cast_to_i32.evaluate(&int_batch).unwrap());
+ });
+ group.bench_function("i64", |b| {
+ b.iter(|| cast_to_i64.evaluate(&int_batch).unwrap());
+ });
+ group.finish();
+ }
+
+ // Benchmark decimal truncation (Legacy mode only)
let spark_cast_options = SparkCastOptions::new(EvalMode::Legacy, "",
false);
- let cast_string_to_i8 = Cast::new(expr.clone(), DataType::Int8,
spark_cast_options.clone());
- let cast_string_to_i16 = Cast::new(expr.clone(), DataType::Int16,
spark_cast_options.clone());
- let cast_string_to_i32 = Cast::new(expr.clone(), DataType::Int32,
spark_cast_options.clone());
- let cast_string_to_i64 = Cast::new(expr, DataType::Int64,
spark_cast_options);
+ let cast_to_i32 = Cast::new(expr.clone(), DataType::Int32,
spark_cast_options.clone());
+ let cast_to_i64 = Cast::new(expr.clone(), DataType::Int64,
spark_cast_options);
- let mut group = c.benchmark_group("cast_string_to_int");
- group.bench_function("cast_string_to_i8", |b| {
- b.iter(|| cast_string_to_i8.evaluate(&batch).unwrap());
+ let mut group = c.benchmark_group("cast_string_to_int/legacy_decimals");
+ group.bench_function("i32", |b| {
+ b.iter(|| cast_to_i32.evaluate(&decimal_batch).unwrap());
});
- group.bench_function("cast_string_to_i16", |b| {
- b.iter(|| cast_string_to_i16.evaluate(&batch).unwrap());
- });
- group.bench_function("cast_string_to_i32", |b| {
- b.iter(|| cast_string_to_i32.evaluate(&batch).unwrap());
- });
- group.bench_function("cast_string_to_i64", |b| {
- b.iter(|| cast_string_to_i64.evaluate(&batch).unwrap());
+ group.bench_function("i64", |b| {
+ b.iter(|| cast_to_i64.evaluate(&decimal_batch).unwrap());
});
+ group.finish();
}
-// Create UTF8 batch with strings representing ints, floats, nulls
-fn create_utf8_batch() -> RecordBatch {
+/// Create batch with small integer strings that fit in i8 range (for i8/i16 benchmarks)
+fn create_small_int_string_batch() -> RecordBatch {
let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Utf8,
true)]));
let mut b = StringBuilder::new();
for i in 0..1000 {
if i % 10 == 0 {
b.append_null();
- } else if i % 2 == 0 {
- b.append_value(format!("{}", rand::random::<f64>()));
} else {
- b.append_value(format!("{}", rand::random::<i64>()));
+ b.append_value(format!("{}", rand::random::<i8>()));
}
}
let array = b.finish();
+ RecordBatch::try_new(schema, vec![Arc::new(array)]).unwrap()
+}
- RecordBatch::try_new(schema.clone(), vec![Arc::new(array)]).unwrap()
+/// Create batch with valid integer strings (works for all eval modes)
+fn create_int_string_batch() -> RecordBatch {
+ let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Utf8,
true)]));
+ let mut b = StringBuilder::new();
+ for i in 0..1000 {
+ if i % 10 == 0 {
+ b.append_null();
+ } else {
+ b.append_value(format!("{}", rand::random::<i32>()));
+ }
+ }
+ let array = b.finish();
+ RecordBatch::try_new(schema, vec![Arc::new(array)]).unwrap()
+}
+
+/// Create batch with decimal strings (for Legacy mode decimal truncation)
+fn create_decimal_string_batch() -> RecordBatch {
+ let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Utf8,
true)]));
+ let mut b = StringBuilder::new();
+ for i in 0..1000 {
+ if i % 10 == 0 {
+ b.append_null();
+ } else {
+ // Generate integers with decimal portions to test truncation
+ let int_part: i32 = rand::random();
+ let dec_part: u32 = rand::random::<u32>() % 1000;
+ b.append_value(format!("{}.{}", int_part, dec_part));
+ }
+ }
+ let array = b.finish();
+ RecordBatch::try_new(schema, vec![Arc::new(array)]).unwrap()
}
fn config() -> Criterion {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]