theirix commented on code in PR #19980:
URL: https://github.com/apache/datafusion/pull/19980#discussion_r2725523328
##########
datafusion/functions/src/unicode/left.rs:
##########
@@ -121,61 +125,175 @@ impl ScalarUDFImpl for LeftFunc {
/// Returns first n characters in the string, or when n is negative, returns
all but last |n| characters.
/// left('abcde', 2) = 'ab'
+/// left('abcde', -2) = 'ab'
/// The implementation uses UTF-8 code points as characters
fn left<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
let n_array = as_int64_array(&args[1])?;
if args[0].data_type() == &DataType::Utf8View {
- let string_array = as_string_view_array(&args[0])?;
- left_impl::<T, _>(string_array, n_array)
+ let string_view_array = as_string_view_array(&args[0])?;
+ left_impl_view(string_view_array, n_array)
} else {
let string_array = as_generic_string_array::<T>(&args[0])?;
left_impl::<T, _>(string_array, n_array)
}
}
+/// `left` implementation for strings
fn left_impl<'a, T: OffsetSizeTrait, V: ArrayAccessor<Item = &'a str>>(
string_array: V,
n_array: &Int64Array,
) -> Result<ArrayRef> {
let iter = ArrayIter::new(string_array);
- let mut chars_buf = Vec::new();
let result = iter
.zip(n_array.iter())
.map(|(string, n)| match (string, n) {
- (Some(string), Some(n)) => match n.cmp(&0) {
- Ordering::Less => {
- // Collect chars once and reuse for both count and take
- chars_buf.clear();
- chars_buf.extend(string.chars());
- let len = chars_buf.len() as i64;
-
- // For negative n, take (len + n) chars if n > -len
(avoiding abs() which panics on i64::MIN)
- Some(if n > -len {
- chars_buf
- .iter()
- .take((len + n) as usize)
- .collect::<String>()
- } else {
- "".to_string()
- })
- }
- Ordering::Equal => Some("".to_string()),
- Ordering::Greater => {
- Some(string.chars().take(n as usize).collect::<String>())
- }
- },
+ (Some(string), Some(n)) => {
+ let byte_length = left_byte_length(string, n);
+ // Extract first `byte_length` bytes from a byte-indexed slice
+ Some(&string[0..byte_length])
+ }
_ => None,
})
.collect::<GenericStringArray<T>>();
Ok(Arc::new(result) as ArrayRef)
}
+/// `left` implementation for StringViewArray
+fn left_impl_view(
+ string_view_array: &StringViewArray,
+ n_array: &Int64Array,
+) -> Result<ArrayRef> {
+ let len = n_array.len();
+
+ let (views, buffers, _nulls) = string_view_array.clone().into_parts();
+
+ if string_view_array.len() != n_array.len() {
+ return exec_err!(
+ "Expected same shape of arrays, given {} and {}",
+ string_view_array.len(),
+ n_array.len()
+ );
+ }
+
+ // Every string in StringViewArray has one corresponding view in `views`
+ if views.len() != string_view_array.len() {
+ return exec_err!(
+ "StringViewArray views length {} does not match array length {}",
+ views.len(),
+ string_view_array.len()
+ );
+ }
+ let mut null_buffer_builder = NullBufferBuilder::new(len);
+ let mut new_views = Vec::with_capacity(len);
+
+ for idx in 0..len {
+ let view = views[idx];
+
+ if string_view_array.is_valid(idx) && n_array.is_valid(idx) {
+ let string: &str = string_view_array.value(idx);
+ let n = n_array.value(idx);
+
+ let byte_length = left_byte_length(string, n);
+ let new_length: u32 = byte_length.try_into().map_err(|_| {
+ exec_datafusion_err!("String is larger than 32-bit limit at
index {idx}")
+ })?;
+ let byte_view = ByteView::from(view);
+ // Construct a new view
+ let new_view = shrink_string_view_array_view(string, new_length,
byte_view)?;
+ new_views.push(new_view);
+ null_buffer_builder.append_non_null();
+ } else {
+ new_views.push(view);
+ // Emit null
+ null_buffer_builder.append_null();
+ }
+ }
+ // Buffers are unchanged
+ // Nulls are rebuilt from scratch
+ let nulls = null_buffer_builder.finish();
+ let result = StringViewArray::try_new(ScalarBuffer::from(new_views),
buffers, nulls)?;
+ Ok(Arc::new(result) as ArrayRef)
+}
+
+/// Calculate the byte length of the substring of `n` chars from string
`string`
+fn left_byte_length(string: &str, n: i64) -> usize {
+ match n.cmp(&0) {
+ Ordering::Less => {
+ let mut indices = string.char_indices();
+ // Find the first byte of a character past last `-n` characters
+ let mut end_idx: usize = usize::MAX;
+ for _ in n..0 {
+ if let Some((i, _)) = indices.next_back() {
+ end_idx = i;
+ } else {
+ end_idx = 0;
+ break;
+ }
+ }
+ end_idx
+ }
+ Ordering::Equal => 0,
+ Ordering::Greater => {
+ if let Some((end_idx, end_char)) =
+ string.char_indices().take(n as usize).last()
Review Comment:
It wouldn't work, because `take(n).last()` allows working with `n` larger
than a string by providing the last character, while `nth(n)` just returns
none. Same for the negative case.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]