theirix commented on code in PR #19980:
URL: https://github.com/apache/datafusion/pull/19980#discussion_r2725486396
##########
datafusion/functions/src/unicode/left.rs:
##########
@@ -121,61 +125,175 @@ impl ScalarUDFImpl for LeftFunc {
/// Returns first n characters in the string, or when n is negative, returns
all but last |n| characters.
/// left('abcde', 2) = 'ab'
+/// left('abcde', -2) = 'ab'
/// The implementation uses UTF-8 code points as characters
fn left<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
let n_array = as_int64_array(&args[1])?;
if args[0].data_type() == &DataType::Utf8View {
- let string_array = as_string_view_array(&args[0])?;
- left_impl::<T, _>(string_array, n_array)
+ let string_view_array = as_string_view_array(&args[0])?;
+ left_impl_view(string_view_array, n_array)
} else {
let string_array = as_generic_string_array::<T>(&args[0])?;
left_impl::<T, _>(string_array, n_array)
}
}
+/// `left` implementation for strings
fn left_impl<'a, T: OffsetSizeTrait, V: ArrayAccessor<Item = &'a str>>(
string_array: V,
n_array: &Int64Array,
) -> Result<ArrayRef> {
let iter = ArrayIter::new(string_array);
- let mut chars_buf = Vec::new();
let result = iter
.zip(n_array.iter())
.map(|(string, n)| match (string, n) {
- (Some(string), Some(n)) => match n.cmp(&0) {
- Ordering::Less => {
- // Collect chars once and reuse for both count and take
- chars_buf.clear();
- chars_buf.extend(string.chars());
- let len = chars_buf.len() as i64;
-
- // For negative n, take (len + n) chars if n > -len
(avoiding abs() which panics on i64::MIN)
- Some(if n > -len {
- chars_buf
- .iter()
- .take((len + n) as usize)
- .collect::<String>()
- } else {
- "".to_string()
- })
- }
- Ordering::Equal => Some("".to_string()),
- Ordering::Greater => {
- Some(string.chars().take(n as usize).collect::<String>())
- }
- },
+ (Some(string), Some(n)) => {
+ let byte_length = left_byte_length(string, n);
+ // Extract first `byte_length` bytes from a byte-indexed slice
+ Some(&string[0..byte_length])
+ }
_ => None,
})
.collect::<GenericStringArray<T>>();
Ok(Arc::new(result) as ArrayRef)
}
+/// `left` implementation for StringViewArray
+fn left_impl_view(
+ string_view_array: &StringViewArray,
+ n_array: &Int64Array,
+) -> Result<ArrayRef> {
+ let len = n_array.len();
+
+ let (views, buffers, _nulls) = string_view_array.clone().into_parts();
+
+ if string_view_array.len() != n_array.len() {
+ return exec_err!(
+ "Expected same shape of arrays, given {} and {}",
+ string_view_array.len(),
+ n_array.len()
+ );
+ }
+
+ // Every string in StringViewArray has one corresponding view in `views`
+ if views.len() != string_view_array.len() {
+ return exec_err!(
+ "StringViewArray views length {} does not match array length {}",
+ views.len(),
+ string_view_array.len()
+ );
+ }
+ let mut null_buffer_builder = NullBufferBuilder::new(len);
+ let mut new_views = Vec::with_capacity(len);
+
+ for idx in 0..len {
+ let view = views[idx];
+
+ if string_view_array.is_valid(idx) && n_array.is_valid(idx) {
+ let string: &str = string_view_array.value(idx);
+ let n = n_array.value(idx);
+
+ let byte_length = left_byte_length(string, n);
+ let new_length: u32 = byte_length.try_into().map_err(|_| {
+ exec_datafusion_err!("String is larger than 32-bit limit at
index {idx}")
Review Comment:
After a second look - no, it cannot, since we get an input string from
`StringViewArray` too. Simplified this part to avoid unnecessary error handling.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]