neilconway commented on code in PR #20278:
URL: https://github.com/apache/datafusion/pull/20278#discussion_r2818058622
##########
datafusion/functions/benches/pad.rs:
##########
@@ -30,6 +33,51 @@ use std::hint::black_box;
use std::sync::Arc;
use std::time::Duration;
+const UNICODE_STRINGS: &[&str] = &[
+ "Ñandú",
+ "Íslensku",
+ "Þjóðarinnar",
+ "Ελληνική",
+ "Иванович",
+ "データフュージョン",
+ "José García",
+ "Ölçü bïrïmï",
+ "Ÿéšṱëṟḏàÿ",
+ "Ährenstraße",
+];
+
+fn create_unicode_string_array<O: OffsetSizeTrait>(
+ size: usize,
+ null_density: f32,
+) -> arrow::array::GenericStringArray<O> {
+ let mut rng = rand::rng();
+ let mut builder = GenericStringBuilder::<O>::new();
+ for i in 0..size {
+ if rng.random::<f32>() < null_density {
+ builder.append_null();
+ } else {
+ builder.append_value(UNICODE_STRINGS[i % UNICODE_STRINGS.len()]);
+ }
+ }
+ builder.finish()
+}
+
+fn create_unicode_string_view_array(
Review Comment:
I agree it's quite similar, but the different return types and builder types
would make this a bit complicated to refactor, IMO.
There is a LOT of near-duplicate string-handling code throughout the UDF
implementations, unfortunately...
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]