This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 5abef41413 Improve benchmark for ltrim (#12513)
5abef41413 is described below
commit 5abef4141390dc7e8b89f4c82f7a72c21ab1a340
Author: kamille <[email protected]>
AuthorDate: Wed Sep 18 22:17:31 2024 +0800
Improve benchmark for ltrim (#12513)
* complete benchmark for ltrim.
* improve benchmarks.
* remove unused param.
* fix bench.
* refactor to remove repeated codes.
* fix clippy.
* Update datafusion/functions/benches/ltrim.rs
Co-authored-by: Andrew Lamb <[email protected]>
* improve codes and add more comments.
* fix clippy.
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
datafusion/functions/benches/ltrim.rs | 223 +++++++++++++++++++++++++++++++---
1 file changed, 206 insertions(+), 17 deletions(-)
diff --git a/datafusion/functions/benches/ltrim.rs b/datafusion/functions/benches/ltrim.rs
index 01acb9de33..b3fa5ef4fd 100644
--- a/datafusion/functions/benches/ltrim.rs
+++ b/datafusion/functions/benches/ltrim.rs
@@ -17,32 +17,221 @@
extern crate criterion;
-use arrow::array::{ArrayRef, StringArray};
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use arrow::array::{ArrayRef, LargeStringArray, StringArray, StringViewArray};
+use criterion::{
+ black_box, criterion_group, criterion_main, measurement::Measurement, BenchmarkGroup,
+ Criterion, SamplingMode,
+};
use datafusion_common::ScalarValue;
-use datafusion_expr::ColumnarValue;
+use datafusion_expr::{ColumnarValue, ScalarUDF};
use datafusion_functions::string;
-use std::sync::Arc;
+use rand::{distributions::Alphanumeric, rngs::StdRng, Rng, SeedableRng};
+use std::{fmt, sync::Arc};
-fn create_args(size: usize, characters: &str) -> Vec<ColumnarValue> {
- let iter =
- std::iter::repeat(format!("{}datafusion{}", characters, characters)).take(size);
- let array = Arc::new(StringArray::from_iter_values(iter)) as ArrayRef;
+pub fn seedable_rng() -> StdRng {
+ StdRng::seed_from_u64(42)
+}
+
+#[derive(Clone, Copy)]
+pub enum StringArrayType {
+ Utf8View,
+ Utf8,
+ LargeUtf8,
+}
+
+impl fmt::Display for StringArrayType {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ match self {
+ StringArrayType::Utf8View => f.write_str("string_view"),
+ StringArrayType::Utf8 => f.write_str("string"),
+ StringArrayType::LargeUtf8 => f.write_str("large_string"),
+ }
+ }
+}
+
+/// returns an array of strings, and `characters` as a ScalarValue
+pub fn create_string_array_and_characters(
+ size: usize,
+ characters: &str,
+ trimmed: &str,
+ remaining_len: usize,
+ string_array_type: StringArrayType,
+) -> (ArrayRef, ScalarValue) {
+ let rng = &mut seedable_rng();
+
+ // Create `size` rows:
+ // - 10% rows will be `None`
+ // - Other 90% will be strings with same `remaining_len` lengths
+ // We will build the string array on it later.
+ let string_iter = (0..size).map(|_| {
+ if rng.gen::<f32>() < 0.1 {
+ None
+ } else {
+ let mut value = trimmed.as_bytes().to_vec();
+ let generated = rng.sample_iter(&Alphanumeric).take(remaining_len);
+ value.extend(generated);
+ Some(String::from_utf8(value).unwrap())
+ }
+ });
+
+ // Build the target `string array` and `characters` according to `string_array_type`
+ match string_array_type {
+ StringArrayType::Utf8View => (
+ Arc::new(string_iter.collect::<StringViewArray>()),
+ ScalarValue::Utf8View(Some(characters.to_string())),
+ ),
+ StringArrayType::Utf8 => (
+ Arc::new(string_iter.collect::<StringArray>()),
+ ScalarValue::Utf8(Some(characters.to_string())),
+ ),
+ StringArrayType::LargeUtf8 => (
+ Arc::new(string_iter.collect::<LargeStringArray>()),
+ ScalarValue::LargeUtf8(Some(characters.to_string())),
+ ),
+ }
+}
+
+/// Create args for the ltrim benchmark
+/// Inputs:
+/// - size: rows num of the test array
+/// - characters: the characters we need to trim
+/// - trimmed: the part in the testing string that will be trimmed
+/// - remaining_len: the len of the remaining part of testing string after trimming
+/// - string_array_type: the method used to store the testing strings
+///
+/// Outputs:
+/// - testing string array
+/// - trimmed characters
+///
+fn create_args(
+ size: usize,
+ characters: &str,
+ trimmed: &str,
+ remaining_len: usize,
+ string_array_type: StringArrayType,
+) -> Vec<ColumnarValue> {
+ let (string_array, pattern) = create_string_array_and_characters(
+ size,
+ characters,
+ trimmed,
+ remaining_len,
+ string_array_type,
+ );
vec![
- ColumnarValue::Array(array),
- ColumnarValue::Scalar(ScalarValue::Utf8(Some(characters.to_string()))),
+ ColumnarValue::Array(string_array),
+ ColumnarValue::Scalar(pattern),
]
}
+#[allow(clippy::too_many_arguments)]
+fn run_with_string_type<M: Measurement>(
+ group: &mut BenchmarkGroup<'_, M>,
+ ltrim: &ScalarUDF,
+ size: usize,
+ len: usize,
+ characters: &str,
+ trimmed: &str,
+ remaining_len: usize,
+ string_type: StringArrayType,
+) {
+ let args = create_args(size, characters, trimmed, remaining_len, string_type);
+ group.bench_function(
+ format!(
+ "{string_type} [size={size}, len_before={len}, len_after={remaining_len}]",
+ ),
+ |b| b.iter(|| black_box(ltrim.invoke(&args))),
+ );
+}
+
+#[allow(clippy::too_many_arguments)]
+fn run_one_group(
+ c: &mut Criterion,
+ group_name: &str,
+ ltrim: &ScalarUDF,
+ string_types: &[StringArrayType],
+ size: usize,
+ len: usize,
+ characters: &str,
+ trimmed: &str,
+ remaining_len: usize,
+) {
+ let mut group = c.benchmark_group(group_name);
+ group.sampling_mode(SamplingMode::Flat);
+ group.sample_size(10);
+
+ for string_type in string_types {
+ run_with_string_type(
+ &mut group,
+ ltrim,
+ size,
+ len,
+ characters,
+ trimmed,
+ remaining_len,
+ *string_type,
+ );
+ }
+
+ group.finish();
+}
+
fn criterion_benchmark(c: &mut Criterion) {
let ltrim = string::ltrim();
- for char in ["\"", "Header:"] {
- for size in [1024, 4096, 8192] {
- let args = create_args(size, char);
- c.bench_function(&format!("ltrim {}: {}", char, size), |b| {
- b.iter(|| black_box(ltrim.invoke(&args)))
- });
- }
+ let characters = ",!()";
+
+ let string_types = [
+ StringArrayType::Utf8View,
+ StringArrayType::Utf8,
+ StringArrayType::LargeUtf8,
+ ];
+ for size in [1024, 4096, 8192] {
+ // len=12, trimmed_len=4, len_after_ltrim=8
+ let len = 12;
+ let trimmed = characters;
+ let remaining_len = len - trimmed.len();
+ run_one_group(
+ c,
+ "INPUT LEN <= 12",
+ &ltrim,
+ &string_types,
+ size,
+ len,
+ characters,
+ trimmed,
+ remaining_len,
+ );
+
+ // len=64, trimmed_len=4, len_after_ltrim=60
+ let len = 64;
+ let trimmed = characters;
+ let remaining_len = len - trimmed.len();
+ run_one_group(
+ c,
+ "INPUT LEN > 12, OUTPUT LEN > 12",
+ &ltrim,
+ &string_types,
+ size,
+ len,
+ characters,
+ trimmed,
+ remaining_len,
+ );
+
+ // len=64, trimmed_len=56, len_after_ltrim=8
+ let len = 64;
+ let trimmed = characters.repeat(15);
+ let remaining_len = len - trimmed.len();
+ run_one_group(
+ c,
+ "INPUT LEN > 12, OUTPUT LEN <= 12",
+ &ltrim,
+ &string_types,
+ size,
+ len,
+ characters,
+ &trimmed,
+ remaining_len,
+ );
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]