kumarUjjawal commented on code in PR #22143:
URL: https://github.com/apache/datafusion/pull/22143#discussion_r3239992184
##########
datafusion/functions-nested/benches/array_expression.rs:
##########
@@ -15,33 +15,143 @@
// specific language governing permissions and limitations
// under the License.
-use criterion::{Criterion, criterion_group, criterion_main};
-use datafusion_expr::lit;
-use datafusion_functions_nested::expr_fn::{array_replace_all, make_array};
+use arrow::array::{ArrayRef, Int64Builder, ListArray};
+use arrow::datatypes::{DataType, Field};
+use criterion::{
+ BenchmarkGroup, Criterion, criterion_group, criterion_main,
measurement::WallTime,
+};
+use datafusion_common::ScalarValue;
+use datafusion_common::config::ConfigOptions;
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDF};
+use datafusion_functions_nested::replace::{
+ array_replace_all_udf, array_replace_n_udf, array_replace_udf,
+};
+use rand::seq::IndexedRandom;
+use rand::{Rng, SeedableRng, rngs::StdRng};
use std::hint::black_box;
+use std::sync::Arc;
+
+// (num_rows, list_size) — tuned so benches finish in a few seconds. At
+// NEEDLE_DENSITY = 0.1, list_size = 100/500 reliably exercise the
+// match-and-replace path (and the counter cutoff for `array_replace_n`);
+// list_size = 10 exercises the null-row and no-match early-return paths.
+const SIZES: &[(usize, usize)] = &[(4_000, 10), (10_000, 100), (10_000, 500)];
+const SEED: u64 = 42;
+const HAYSTACK_NULL_DENSITY: f64 = 0.1;
+const NEEDLE_DENSITY: f64 = 0.1;
+const NEEDLE: i64 = 0;
+const REPLACEMENT: i64 = -1;
fn criterion_benchmark(c: &mut Criterion) {
- // Construct large arrays for benchmarking
+ let list_data_type =
+ DataType::List(Arc::new(Field::new_list_field(DataType::Int64, true)));
+ let return_field: Arc<Field> =
+ Field::new("result", list_data_type.clone(), true).into();
+ let config_options = Arc::new(ConfigOptions::default());
- let array_len = 100_000;
+ let array_field: Arc<Field> = Field::new("array", list_data_type,
true).into();
+ let from_field: Arc<Field> = Field::new("from", DataType::Int64,
true).into();
+ let to_field: Arc<Field> = Field::new("to", DataType::Int64, true).into();
+ let max_field: Arc<Field> = Field::new("max", DataType::Int64,
true).into();
- let array = (0..array_len).map(|_| lit(2_i64)).collect::<Vec<_>>();
- let list_array = make_array(vec![make_array(array); 3]);
- let from_array = make_array(vec![lit(2_i64); 3]);
- let to_array = make_array(vec![lit(-2_i64); 3]);
+ let three_arg_fields =
+ vec![array_field.clone(), from_field.clone(), to_field.clone()];
+ let four_arg_fields = vec![array_field, from_field, to_field, max_field];
- // Benchmark array functions
+ let mut group = c.benchmark_group("array_replace_int64");
Review Comment:
Sounds good, I removed the duplicate `array_expression`
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]