neilconway commented on code in PR #21131:
URL: https://github.com/apache/datafusion/pull/21131#discussion_r2997978790
##########
datafusion/functions-nested/src/string.rs:
##########
@@ -261,6 +331,201 @@ impl ScalarUDFImpl for StringToArray {
}
}
+/// Pushes a single split fragment into `builder`.
+///
+/// If `null_value` is `Some` and equal to `value`, a NULL entry is appended
+/// in place of the string. In every other case (including `null_value` being
+/// `None`) `value` itself is appended.
+#[inline(always)]
+fn append_part(
+    builder: &mut impl StringArrayBuilderType,
+    value: &str,
+    null_value: Option<&str>,
+) {
+    match null_value {
+        // The fragment is exactly the configured null marker.
+        Some(marker) if marker == value => {
+            builder.append_null();
+        }
+        _ => {
+            builder.append_value(value);
+        }
+    }
+}
+
+/// Fast path of `string_to_array` for the common case where the delimiter
+/// and the null marker are scalars, i.e. identical for every row.
+///
+/// One list is produced per row of `string_array`:
+/// - a NULL input row yields a NULL list;
+/// - `delimiter == Some("")`: a non-empty string becomes a single-element
+///   list, an empty string becomes an empty list;
+/// - `delimiter == Some(d)`: the string is cut at every match of `d`; an
+///   empty string becomes an empty list;
+/// - `delimiter == None`: the string is split into individual characters.
+///
+/// Every element is written through `append_part`, so a part equal to
+/// `null_value` is stored as NULL.
+fn string_to_array_scalar_args<'a, StringArrType, StringBuilderType>(
+    string_array: &StringArrType,
+    delimiter: Option<&str>,
+    null_value: Option<&str>,
+    string_builder: StringBuilderType,
+) -> Result<ArrayRef>
+where
+    StringArrType: StringArrayType<'a>,
+    StringBuilderType: StringArrayBuilderType,
+{
+    let mut list_builder = ListBuilder::new(string_builder);
+    let row_count = string_array.len();
+
+    match delimiter {
+        Some("") => {
+            // Empty delimiter: the whole (non-empty) string is the only
+            // element. Empty strings produce an empty array (PostgreSQL
+            // compat).
+            for row in 0..row_count {
+                if string_array.is_null(row) {
+                    list_builder.append(false);
+                } else {
+                    let text = string_array.value(row);
+                    if !text.is_empty() {
+                        append_part(list_builder.values(), text, null_value);
+                    }
+                    list_builder.append(true);
+                }
+            }
+        }
+        Some(separator) => {
+            // Split by hand with `memmem::Finder` instead of `str::split`:
+            // the search pattern is compiled once here and then reused for
+            // every row.
+            let separator_len = separator.len();
+            let finder = memchr::memmem::Finder::new(separator.as_bytes());
+
+            for row in 0..row_count {
+                if string_array.is_null(row) {
+                    list_builder.append(false);
+                    continue;
+                }
+
+                let text = string_array.value(row);
+                if text.is_empty() {
+                    // Empty input: an empty (non-NULL) list.
+                    list_builder.append(true);
+                    continue;
+                }
+
+                // Byte offset at which the part currently being collected
+                // begins.
+                let mut part_start = 0;
+                for hit in finder.find_iter(text.as_bytes()) {
+                    append_part(
+                        list_builder.values(),
+                        &text[part_start..hit],
+                        null_value,
+                    );
+                    part_start = hit + separator_len;
+                }
+                // Whatever follows the last match, or the entire string
+                // when the separator never occurred.
+                append_part(list_builder.values(), &text[part_start..], null_value);
+                list_builder.append(true);
+            }
+        }
+        None => {
+            // NULL delimiter: split into individual characters.
+            for row in 0..row_count {
+                if string_array.is_null(row) {
+                    list_builder.append(false);
+                } else {
+                    let text = string_array.value(row);
+                    let glyphs = text
+                        .char_indices()
+                        .map(|(at, ch)| &text[at..at + ch.len_utf8()]);
+                    for glyph in glyphs {
+                        append_part(list_builder.values(), glyph, null_value);
+                    }
+                    list_builder.append(true);
+                }
+            }
+        }
+    }
+
+    Ok(Arc::new(list_builder.finish()) as ArrayRef)
+}
+
+/// General path of `string_to_array`, taken when the delimiter and/or the
+/// null marker is an array column rather than a scalar.
+///
+/// `args[0]` is the string column, `args[1]` the delimiter column and
+/// `args[2]`, when present, the null-marker column. The data type of
+/// `args[0]` (`Utf8`, `Utf8View` or `LargeUtf8`) selects the matching string
+/// builder; any other type produces an execution error.
+fn string_to_array_fallback(args: &[ArrayRef]) -> Result<ArrayRef> {
+    let input = &args[0];
+    let null_markers = args.get(2);
+
+    match input.data_type() {
+        Utf8 => {
+            let strings = input.as_string::<i32>();
+            let values = StringBuilder::with_capacity(
+                strings.len(),
+                strings.get_buffer_memory_size(),
+            );
+            string_to_array_column_args(&strings, &args[1], null_markers, values)
+        }
+        Utf8View => {
+            let strings = input.as_string_view();
+            let values = StringViewBuilder::with_capacity(strings.len());
+            string_to_array_column_args(&strings, &args[1], null_markers, values)
+        }
+        LargeUtf8 => {
+            let strings = input.as_string::<i64>();
+            let values = LargeStringBuilder::with_capacity(
+                strings.len(),
+                strings.get_buffer_memory_size(),
+            );
+            string_to_array_column_args(&strings, &args[1], null_markers, values)
+        }
+        other => exec_err!("unsupported type for string_to_array function as {other:?}"),
+    }
+}
+
+/// Row-by-row `string_to_array` where the delimiter, and optionally the
+/// null marker, is read from an array for each row.
+///
+/// For every row of `string_array`:
+/// - a NULL string yields a NULL list;
+/// - a NULL delimiter splits the string into individual characters;
+/// - an empty string with a non-NULL delimiter yields an empty list;
+/// - an empty delimiter yields a single-element list holding the string;
+/// - otherwise the string is split on the delimiter with `str::split`.
+///
+/// Every element is written through `append_part`, so a part equal to the
+/// row's null marker is stored as NULL.
+fn string_to_array_column_args<'a, StringArrType, StringBuilderType>(
+    string_array: &StringArrType,
+    delimiter_array: &ArrayRef,
+    null_value_array: Option<&ArrayRef>,
+    string_builder: StringBuilderType,
+) -> Result<ArrayRef>
+where
+    StringArrType: StringArrayType<'a>,
+    StringBuilderType: StringArrayBuilderType,
+{
+    let mut list_builder = ListBuilder::new(string_builder);
+
+    for row in 0..string_array.len() {
+        if string_array.is_null(row) {
+            list_builder.append(false);
+            continue;
+        }
+
+        let text = string_array.value(row);
+        let separator = get_str_value(delimiter_array, row);
+        let null_marker =
+            null_value_array.and_then(|markers| get_str_value(markers, row));
+
+        match separator {
+            // NULL delimiter: one element per character.
+            None => {
+                let glyphs = text
+                    .char_indices()
+                    .map(|(at, ch)| &text[at..at + ch.len_utf8()]);
+                for glyph in glyphs {
+                    append_part(list_builder.values(), glyph, null_marker);
+                }
+            }
+            // Any non-NULL delimiter on an empty string: no elements.
+            Some(_) if text.is_empty() => {}
+            // Empty delimiter: the whole string is the only element.
+            Some("") => {
+                append_part(list_builder.values(), text, null_marker);
+            }
+            Some(separator) => {
+                text.split(separator).for_each(|piece| {
+                    append_part(list_builder.values(), piece, null_marker);
+                });
+            }
+        }
+
+        list_builder.append(true);
+    }
+
+    Ok(Arc::new(list_builder.finish()) as ArrayRef)
+}
+
+/// Reads slot `i` of `array` as a string slice, whichever string
+/// representation the array uses.
+///
+/// Returns `None` when the slot is NULL. `Utf8`, `LargeUtf8` and `Utf8View`
+/// arrays are supported; any other data type trips a `debug_assert!` and
+/// otherwise also yields `None`.
+fn get_str_value(array: &ArrayRef, i: usize) -> Option<&str> {
+    if array.is_null(i) {
+        None
+    } else {
+        let text = match array.data_type() {
+            Utf8 => array.as_string::<i32>().value(i),
+            LargeUtf8 => array.as_string::<i64>().value(i),
+            Utf8View => array.as_string_view().value(i),
+            other => {
+                debug_assert!(false, "unexpected type in get_str_value: {other:?}");
+                return None;
+            }
+        };
+        Some(text)
+    }
+}
Review Comment:
Thanks for the suggestion! However, `GenericStringArray` doesn't handle
`StringViewArray`; the current approach was overall the cleanest way I could
think of, but let me know if you know of a better way to do this.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]