Jefffrey commented on code in PR #19980:
URL: https://github.com/apache/datafusion/pull/19980#discussion_r2724757548
##########
datafusion/functions/src/unicode/left.rs:
##########
@@ -121,61 +125,175 @@ impl ScalarUDFImpl for LeftFunc {
/// Returns first n characters in the string, or when n is negative, returns
all but last |n| characters.
/// left('abcde', 2) = 'ab'
+/// left('abcde', -2) = 'ab'
/// The implementation uses UTF-8 code points as characters
fn left<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
let n_array = as_int64_array(&args[1])?;
if args[0].data_type() == &DataType::Utf8View {
- let string_array = as_string_view_array(&args[0])?;
- left_impl::<T, _>(string_array, n_array)
+ let string_view_array = as_string_view_array(&args[0])?;
+ left_impl_view(string_view_array, n_array)
} else {
let string_array = as_generic_string_array::<T>(&args[0])?;
left_impl::<T, _>(string_array, n_array)
}
}
+/// `left` implementation for strings
fn left_impl<'a, T: OffsetSizeTrait, V: ArrayAccessor<Item = &'a str>>(
string_array: V,
n_array: &Int64Array,
) -> Result<ArrayRef> {
let iter = ArrayIter::new(string_array);
- let mut chars_buf = Vec::new();
let result = iter
.zip(n_array.iter())
.map(|(string, n)| match (string, n) {
- (Some(string), Some(n)) => match n.cmp(&0) {
- Ordering::Less => {
- // Collect chars once and reuse for both count and take
- chars_buf.clear();
- chars_buf.extend(string.chars());
- let len = chars_buf.len() as i64;
-
- // For negative n, take (len + n) chars if n > -len
(avoiding abs() which panics on i64::MIN)
- Some(if n > -len {
- chars_buf
- .iter()
- .take((len + n) as usize)
- .collect::<String>()
- } else {
- "".to_string()
- })
- }
- Ordering::Equal => Some("".to_string()),
- Ordering::Greater => {
- Some(string.chars().take(n as usize).collect::<String>())
- }
- },
+ (Some(string), Some(n)) => {
+ let byte_length = left_byte_length(string, n);
+ // Extract first `byte_length` bytes from a byte-indexed slice
+ Some(&string[0..byte_length])
+ }
_ => None,
})
.collect::<GenericStringArray<T>>();
Ok(Arc::new(result) as ArrayRef)
}
+/// `left` implementation for StringViewArray
+fn left_impl_view(
+ string_view_array: &StringViewArray,
+ n_array: &Int64Array,
+) -> Result<ArrayRef> {
+ let len = n_array.len();
+
+ let (views, buffers, _nulls) = string_view_array.clone().into_parts();
+
+ if string_view_array.len() != n_array.len() {
+ return exec_err!(
+ "Expected same shape of arrays, given {} and {}",
+ string_view_array.len(),
+ n_array.len()
+ );
+ }
+
+ // Every string in StringViewArray has one corresponding view in `views`
+ if views.len() != string_view_array.len() {
+ return exec_err!(
+ "StringViewArray views length {} does not match array length {}",
+ views.len(),
+ string_view_array.len()
+ );
+ }
+ let mut null_buffer_builder = NullBufferBuilder::new(len);
+ let mut new_views = Vec::with_capacity(len);
+
+ for idx in 0..len {
+ let view = views[idx];
+
+ if string_view_array.is_valid(idx) && n_array.is_valid(idx) {
+ let string: &str = string_view_array.value(idx);
+ let n = n_array.value(idx);
+
+ let byte_length = left_byte_length(string, n);
+ let new_length: u32 = byte_length.try_into().map_err(|_| {
+ exec_datafusion_err!("String is larger than 32-bit limit at
index {idx}")
+ })?;
+ let byte_view = ByteView::from(view);
+ // Construct a new view
+ let new_view = shrink_string_view_array_view(string, new_length,
byte_view)?;
+ new_views.push(new_view);
+ null_buffer_builder.append_non_null();
+ } else {
+ new_views.push(view);
+ // Emit null
+ null_buffer_builder.append_null();
+ }
+ }
+ // Buffers are unchanged
+ // Nulls are rebuilt from scratch
+ let nulls = null_buffer_builder.finish();
+ let result = StringViewArray::try_new(ScalarBuffer::from(new_views),
buffers, nulls)?;
+ Ok(Arc::new(result) as ArrayRef)
+}
+
+/// Calculate the byte length of the substring of `n` chars from string
`string`
+fn left_byte_length(string: &str, n: i64) -> usize {
+ match n.cmp(&0) {
+ Ordering::Less => {
+ let mut indices = string.char_indices();
+ // Find the first byte of a character past last `-n` characters
+ let mut end_idx: usize = usize::MAX;
+ for _ in n..0 {
+ if let Some((i, _)) = indices.next_back() {
Review Comment:
Can we use `nth_back` here?
##########
datafusion/functions/src/unicode/left.rs:
##########
@@ -121,61 +125,175 @@ impl ScalarUDFImpl for LeftFunc {
/// Returns first n characters in the string, or when n is negative, returns
all but last |n| characters.
/// left('abcde', 2) = 'ab'
+/// left('abcde', -2) = 'ab'
/// The implementation uses UTF-8 code points as characters
fn left<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
let n_array = as_int64_array(&args[1])?;
if args[0].data_type() == &DataType::Utf8View {
- let string_array = as_string_view_array(&args[0])?;
- left_impl::<T, _>(string_array, n_array)
+ let string_view_array = as_string_view_array(&args[0])?;
+ left_impl_view(string_view_array, n_array)
} else {
let string_array = as_generic_string_array::<T>(&args[0])?;
left_impl::<T, _>(string_array, n_array)
}
}
+/// `left` implementation for strings
fn left_impl<'a, T: OffsetSizeTrait, V: ArrayAccessor<Item = &'a str>>(
string_array: V,
n_array: &Int64Array,
) -> Result<ArrayRef> {
let iter = ArrayIter::new(string_array);
- let mut chars_buf = Vec::new();
let result = iter
.zip(n_array.iter())
.map(|(string, n)| match (string, n) {
- (Some(string), Some(n)) => match n.cmp(&0) {
- Ordering::Less => {
- // Collect chars once and reuse for both count and take
- chars_buf.clear();
- chars_buf.extend(string.chars());
- let len = chars_buf.len() as i64;
-
- // For negative n, take (len + n) chars if n > -len
(avoiding abs() which panics on i64::MIN)
- Some(if n > -len {
- chars_buf
- .iter()
- .take((len + n) as usize)
- .collect::<String>()
- } else {
- "".to_string()
- })
- }
- Ordering::Equal => Some("".to_string()),
- Ordering::Greater => {
- Some(string.chars().take(n as usize).collect::<String>())
- }
- },
+ (Some(string), Some(n)) => {
+ let byte_length = left_byte_length(string, n);
+ // Extract first `byte_length` bytes from a byte-indexed slice
+ Some(&string[0..byte_length])
+ }
_ => None,
})
.collect::<GenericStringArray<T>>();
Ok(Arc::new(result) as ArrayRef)
}
+/// `left` implementation for StringViewArray
+fn left_impl_view(
+ string_view_array: &StringViewArray,
+ n_array: &Int64Array,
+) -> Result<ArrayRef> {
+ let len = n_array.len();
+
+ let (views, buffers, _nulls) = string_view_array.clone().into_parts();
+
+ if string_view_array.len() != n_array.len() {
+ return exec_err!(
+ "Expected same shape of arrays, given {} and {}",
+ string_view_array.len(),
+ n_array.len()
+ );
+ }
+
+ // Every string in StringViewArray has one corresponding view in `views`
+ if views.len() != string_view_array.len() {
+ return exec_err!(
+ "StringViewArray views length {} does not match array length {}",
+ views.len(),
+ string_view_array.len()
+ );
+ }
+ let mut null_buffer_builder = NullBufferBuilder::new(len);
+ let mut new_views = Vec::with_capacity(len);
+
+ for idx in 0..len {
+ let view = views[idx];
+
+ if string_view_array.is_valid(idx) && n_array.is_valid(idx) {
+ let string: &str = string_view_array.value(idx);
+ let n = n_array.value(idx);
+
+ let byte_length = left_byte_length(string, n);
+ let new_length: u32 = byte_length.try_into().map_err(|_| {
+ exec_datafusion_err!("String is larger than 32-bit limit at
index {idx}")
+ })?;
+ let byte_view = ByteView::from(view);
+ // Construct a new view
+ let new_view = shrink_string_view_array_view(string, new_length,
byte_view)?;
+ new_views.push(new_view);
+ null_buffer_builder.append_non_null();
+ } else {
+ new_views.push(view);
+ // Emit null
+ null_buffer_builder.append_null();
+ }
+ }
+ // Buffers are unchanged
+ // Nulls are rebuilt from scratch
+ let nulls = null_buffer_builder.finish();
+ let result = StringViewArray::try_new(ScalarBuffer::from(new_views),
buffers, nulls)?;
+ Ok(Arc::new(result) as ArrayRef)
+}
+
+/// Calculate the byte length of the substring of `n` chars from string
`string`
+fn left_byte_length(string: &str, n: i64) -> usize {
+ match n.cmp(&0) {
+ Ordering::Less => {
+ let mut indices = string.char_indices();
+ // Find the first byte of a character past last `-n` characters
+ let mut end_idx: usize = usize::MAX;
+ for _ in n..0 {
+ if let Some((i, _)) = indices.next_back() {
+ end_idx = i;
+ } else {
+ end_idx = 0;
+ break;
+ }
+ }
+ end_idx
+ }
+ Ordering::Equal => 0,
+ Ordering::Greater => {
+ if let Some((end_idx, end_char)) =
+ string.char_indices().take(n as usize).last()
+ {
+ // Include length of the last character
+ end_idx + end_char.len_utf8()
+ } else {
+ // String is empty
+ 0
+ }
+ }
+ }
+}
+
+/// Construct a new StringViewArray view from existing view `byte_view` and
new length `len`.
+/// Prefix is taken from the original string `string`.
+/// Handles both inline and non-inline views, referencing the same buffers.
+fn shrink_string_view_array_view(
+ string: &str,
+ len: u32,
+ byte_view: ByteView,
+) -> Result<u128> {
+ if len > byte_view.length {
+ return exec_err!(
+ "Original length {} must be greater than new length {}",
+ byte_view.length,
+ len
+ );
+ }
+ // Acquire bytes view to string (no allocations)
+ let bytes = string.as_bytes();
+
+ if len <= 12 {
+ // Inline view
+ // Construct manually since ByteView cannot work with inline views
+ let mut view_buffer = [0u8; 16];
+ // 4 bytes: length
+ view_buffer[0..4].copy_from_slice(&len.to_le_bytes());
+ // 12 bytes: the whole zero-padded string
+ view_buffer[4..4 + len as usize].copy_from_slice(&bytes[..len as
usize]);
+ Ok(u128::from_le_bytes(view_buffer))
+ } else {
+ // Non-inline view.
+ // Use ByteView constructor to reference existing buffers
+ // 4 bytes: string prefix
+ let mut prefix = [0u8; 4];
+ prefix.copy_from_slice(&bytes[..4]);
+
+ let new_byte_view = ByteView::new(len, &prefix)
Review Comment:
```suggestion
let new_byte_view = ByteView::new(len, &bytes[..4])
```
##########
datafusion/functions/src/unicode/left.rs:
##########
@@ -121,61 +125,175 @@ impl ScalarUDFImpl for LeftFunc {
/// Returns first n characters in the string, or when n is negative, returns
all but last |n| characters.
/// left('abcde', 2) = 'ab'
+/// left('abcde', -2) = 'ab'
/// The implementation uses UTF-8 code points as characters
fn left<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
let n_array = as_int64_array(&args[1])?;
if args[0].data_type() == &DataType::Utf8View {
- let string_array = as_string_view_array(&args[0])?;
- left_impl::<T, _>(string_array, n_array)
+ let string_view_array = as_string_view_array(&args[0])?;
+ left_impl_view(string_view_array, n_array)
} else {
let string_array = as_generic_string_array::<T>(&args[0])?;
left_impl::<T, _>(string_array, n_array)
}
}
+/// `left` implementation for strings
fn left_impl<'a, T: OffsetSizeTrait, V: ArrayAccessor<Item = &'a str>>(
string_array: V,
n_array: &Int64Array,
) -> Result<ArrayRef> {
let iter = ArrayIter::new(string_array);
- let mut chars_buf = Vec::new();
let result = iter
.zip(n_array.iter())
.map(|(string, n)| match (string, n) {
- (Some(string), Some(n)) => match n.cmp(&0) {
- Ordering::Less => {
- // Collect chars once and reuse for both count and take
- chars_buf.clear();
- chars_buf.extend(string.chars());
- let len = chars_buf.len() as i64;
-
- // For negative n, take (len + n) chars if n > -len
(avoiding abs() which panics on i64::MIN)
- Some(if n > -len {
- chars_buf
- .iter()
- .take((len + n) as usize)
- .collect::<String>()
- } else {
- "".to_string()
- })
- }
- Ordering::Equal => Some("".to_string()),
- Ordering::Greater => {
- Some(string.chars().take(n as usize).collect::<String>())
- }
- },
+ (Some(string), Some(n)) => {
+ let byte_length = left_byte_length(string, n);
+ // Extract first `byte_length` bytes from a byte-indexed slice
+ Some(&string[0..byte_length])
+ }
_ => None,
})
.collect::<GenericStringArray<T>>();
Ok(Arc::new(result) as ArrayRef)
}
+/// `left` implementation for StringViewArray
+fn left_impl_view(
+ string_view_array: &StringViewArray,
+ n_array: &Int64Array,
+) -> Result<ArrayRef> {
+ let len = n_array.len();
+
+ let (views, buffers, _nulls) = string_view_array.clone().into_parts();
+
+ if string_view_array.len() != n_array.len() {
+ return exec_err!(
+ "Expected same shape of arrays, given {} and {}",
+ string_view_array.len(),
+ n_array.len()
+ );
+ }
+
+ // Every string in StringViewArray has one corresponding view in `views`
+ if views.len() != string_view_array.len() {
+ return exec_err!(
+ "StringViewArray views length {} does not match array length {}",
+ views.len(),
+ string_view_array.len()
+ );
+ }
Review Comment:
Personally I'm of the mind to omit these length checks;
`make_scalar_function` should ensure the arrays are of same size, and checking
`views` len against `string_view_array` len seems redundant considering it came
directly from it
##########
datafusion/functions/src/unicode/left.rs:
##########
@@ -121,61 +125,175 @@ impl ScalarUDFImpl for LeftFunc {
/// Returns first n characters in the string, or when n is negative, returns
all but last |n| characters.
/// left('abcde', 2) = 'ab'
+/// left('abcde', -2) = 'ab'
/// The implementation uses UTF-8 code points as characters
fn left<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
let n_array = as_int64_array(&args[1])?;
if args[0].data_type() == &DataType::Utf8View {
- let string_array = as_string_view_array(&args[0])?;
- left_impl::<T, _>(string_array, n_array)
+ let string_view_array = as_string_view_array(&args[0])?;
+ left_impl_view(string_view_array, n_array)
} else {
let string_array = as_generic_string_array::<T>(&args[0])?;
left_impl::<T, _>(string_array, n_array)
}
}
+/// `left` implementation for strings
fn left_impl<'a, T: OffsetSizeTrait, V: ArrayAccessor<Item = &'a str>>(
string_array: V,
n_array: &Int64Array,
) -> Result<ArrayRef> {
let iter = ArrayIter::new(string_array);
- let mut chars_buf = Vec::new();
let result = iter
.zip(n_array.iter())
.map(|(string, n)| match (string, n) {
- (Some(string), Some(n)) => match n.cmp(&0) {
- Ordering::Less => {
- // Collect chars once and reuse for both count and take
- chars_buf.clear();
- chars_buf.extend(string.chars());
- let len = chars_buf.len() as i64;
-
- // For negative n, take (len + n) chars if n > -len
(avoiding abs() which panics on i64::MIN)
- Some(if n > -len {
- chars_buf
- .iter()
- .take((len + n) as usize)
- .collect::<String>()
- } else {
- "".to_string()
- })
- }
- Ordering::Equal => Some("".to_string()),
- Ordering::Greater => {
- Some(string.chars().take(n as usize).collect::<String>())
- }
- },
+ (Some(string), Some(n)) => {
+ let byte_length = left_byte_length(string, n);
+ // Extract first `byte_length` bytes from a byte-indexed slice
+ Some(&string[0..byte_length])
+ }
_ => None,
})
.collect::<GenericStringArray<T>>();
Ok(Arc::new(result) as ArrayRef)
}
+/// `left` implementation for StringViewArray
+fn left_impl_view(
+ string_view_array: &StringViewArray,
+ n_array: &Int64Array,
+) -> Result<ArrayRef> {
+ let len = n_array.len();
+
+ let (views, buffers, _nulls) = string_view_array.clone().into_parts();
+
+ if string_view_array.len() != n_array.len() {
+ return exec_err!(
+ "Expected same shape of arrays, given {} and {}",
+ string_view_array.len(),
+ n_array.len()
+ );
+ }
+
+ // Every string in StringViewArray has one corresponding view in `views`
+ if views.len() != string_view_array.len() {
+ return exec_err!(
+ "StringViewArray views length {} does not match array length {}",
+ views.len(),
+ string_view_array.len()
+ );
+ }
+ let mut null_buffer_builder = NullBufferBuilder::new(len);
+ let mut new_views = Vec::with_capacity(len);
+
+ for idx in 0..len {
+ let view = views[idx];
+
+ if string_view_array.is_valid(idx) && n_array.is_valid(idx) {
+ let string: &str = string_view_array.value(idx);
+ let n = n_array.value(idx);
+
+ let byte_length = left_byte_length(string, n);
+ let new_length: u32 = byte_length.try_into().map_err(|_| {
+ exec_datafusion_err!("String is larger than 32-bit limit at
index {idx}")
+ })?;
+ let byte_view = ByteView::from(view);
+ // Construct a new view
+ let new_view = shrink_string_view_array_view(string, new_length,
byte_view)?;
+ new_views.push(new_view);
+ null_buffer_builder.append_non_null();
+ } else {
+ new_views.push(view);
+ // Emit null
+ null_buffer_builder.append_null();
+ }
+ }
+ // Buffers are unchanged
+ // Nulls are rebuilt from scratch
+ let nulls = null_buffer_builder.finish();
+ let result = StringViewArray::try_new(ScalarBuffer::from(new_views),
buffers, nulls)?;
+ Ok(Arc::new(result) as ArrayRef)
+}
+
+/// Calculate the byte length of the substring of `n` chars from string
`string`
+fn left_byte_length(string: &str, n: i64) -> usize {
+ match n.cmp(&0) {
+ Ordering::Less => {
+ let mut indices = string.char_indices();
+ // Find the first byte of a character past last `-n` characters
+ let mut end_idx: usize = usize::MAX;
+ for _ in n..0 {
+ if let Some((i, _)) = indices.next_back() {
+ end_idx = i;
+ } else {
+ end_idx = 0;
+ break;
+ }
+ }
+ end_idx
+ }
+ Ordering::Equal => 0,
+ Ordering::Greater => {
+ if let Some((end_idx, end_char)) =
+ string.char_indices().take(n as usize).last()
Review Comment:
Should we use `nth` here?
##########
datafusion/functions/src/unicode/left.rs:
##########
@@ -94,7 +94,11 @@ impl ScalarUDFImpl for LeftFunc {
}
fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
- utf8_to_str_type(&arg_types[0], "left")
+ if arg_types[0] == DataType::Utf8View {
+ Ok(DataType::Utf8View)
+ } else {
+ utf8_to_str_type(&arg_types[0], "left")
+ }
Review Comment:
```suggestion
Ok(arg_types[0].clone())
```
Since we're just maintaining the input string type
##########
datafusion/functions/src/unicode/left.rs:
##########
@@ -121,61 +125,175 @@ impl ScalarUDFImpl for LeftFunc {
/// Returns first n characters in the string, or when n is negative, returns
all but last |n| characters.
/// left('abcde', 2) = 'ab'
+/// left('abcde', -2) = 'ab'
/// The implementation uses UTF-8 code points as characters
fn left<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
let n_array = as_int64_array(&args[1])?;
if args[0].data_type() == &DataType::Utf8View {
- let string_array = as_string_view_array(&args[0])?;
- left_impl::<T, _>(string_array, n_array)
+ let string_view_array = as_string_view_array(&args[0])?;
+ left_impl_view(string_view_array, n_array)
} else {
let string_array = as_generic_string_array::<T>(&args[0])?;
left_impl::<T, _>(string_array, n_array)
}
}
+/// `left` implementation for strings
fn left_impl<'a, T: OffsetSizeTrait, V: ArrayAccessor<Item = &'a str>>(
string_array: V,
n_array: &Int64Array,
) -> Result<ArrayRef> {
let iter = ArrayIter::new(string_array);
- let mut chars_buf = Vec::new();
let result = iter
.zip(n_array.iter())
.map(|(string, n)| match (string, n) {
- (Some(string), Some(n)) => match n.cmp(&0) {
- Ordering::Less => {
- // Collect chars once and reuse for both count and take
- chars_buf.clear();
- chars_buf.extend(string.chars());
- let len = chars_buf.len() as i64;
-
- // For negative n, take (len + n) chars if n > -len
(avoiding abs() which panics on i64::MIN)
- Some(if n > -len {
- chars_buf
- .iter()
- .take((len + n) as usize)
- .collect::<String>()
- } else {
- "".to_string()
- })
- }
- Ordering::Equal => Some("".to_string()),
- Ordering::Greater => {
- Some(string.chars().take(n as usize).collect::<String>())
- }
- },
+ (Some(string), Some(n)) => {
+ let byte_length = left_byte_length(string, n);
+ // Extract first `byte_length` bytes from a byte-indexed slice
+ Some(&string[0..byte_length])
+ }
_ => None,
})
.collect::<GenericStringArray<T>>();
Ok(Arc::new(result) as ArrayRef)
}
+/// `left` implementation for StringViewArray
+fn left_impl_view(
+ string_view_array: &StringViewArray,
+ n_array: &Int64Array,
+) -> Result<ArrayRef> {
+ let len = n_array.len();
+
+ let (views, buffers, _nulls) = string_view_array.clone().into_parts();
Review Comment:
I feel we don't need this clone & into_parts; `views` we can retrieve via
[`StringArrayView::views`](https://docs.rs/arrow/latest/arrow/array/type.StringViewArray.html#method.views)
since we don't actually need ownership of it. Only buffers we'd need ownership
and we can just `to_vec` from
[`StringArrayView::data_buffers`](https://docs.rs/arrow/latest/arrow/array/type.StringViewArray.html#method.data_buffers)
##########
datafusion/functions/src/unicode/left.rs:
##########
@@ -121,61 +125,175 @@ impl ScalarUDFImpl for LeftFunc {
/// Returns first n characters in the string, or when n is negative, returns
all but last |n| characters.
/// left('abcde', 2) = 'ab'
+/// left('abcde', -2) = 'ab'
/// The implementation uses UTF-8 code points as characters
fn left<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
let n_array = as_int64_array(&args[1])?;
if args[0].data_type() == &DataType::Utf8View {
- let string_array = as_string_view_array(&args[0])?;
- left_impl::<T, _>(string_array, n_array)
+ let string_view_array = as_string_view_array(&args[0])?;
Review Comment:
We should look into refactoring `invoke_with_args` above if we do this,
since it also checked for utf8view type in order to call `left` with the
correct generic (which is now unused for view implementation)
##########
datafusion/functions/src/unicode/left.rs:
##########
@@ -121,61 +125,175 @@ impl ScalarUDFImpl for LeftFunc {
/// Returns first n characters in the string, or when n is negative, returns
all but last |n| characters.
/// left('abcde', 2) = 'ab'
+/// left('abcde', -2) = 'ab'
/// The implementation uses UTF-8 code points as characters
fn left<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
let n_array = as_int64_array(&args[1])?;
if args[0].data_type() == &DataType::Utf8View {
- let string_array = as_string_view_array(&args[0])?;
- left_impl::<T, _>(string_array, n_array)
+ let string_view_array = as_string_view_array(&args[0])?;
+ left_impl_view(string_view_array, n_array)
} else {
let string_array = as_generic_string_array::<T>(&args[0])?;
left_impl::<T, _>(string_array, n_array)
}
}
+/// `left` implementation for strings
fn left_impl<'a, T: OffsetSizeTrait, V: ArrayAccessor<Item = &'a str>>(
string_array: V,
n_array: &Int64Array,
) -> Result<ArrayRef> {
let iter = ArrayIter::new(string_array);
- let mut chars_buf = Vec::new();
let result = iter
.zip(n_array.iter())
.map(|(string, n)| match (string, n) {
- (Some(string), Some(n)) => match n.cmp(&0) {
- Ordering::Less => {
- // Collect chars once and reuse for both count and take
- chars_buf.clear();
- chars_buf.extend(string.chars());
- let len = chars_buf.len() as i64;
-
- // For negative n, take (len + n) chars if n > -len
(avoiding abs() which panics on i64::MIN)
- Some(if n > -len {
- chars_buf
- .iter()
- .take((len + n) as usize)
- .collect::<String>()
- } else {
- "".to_string()
- })
- }
- Ordering::Equal => Some("".to_string()),
- Ordering::Greater => {
- Some(string.chars().take(n as usize).collect::<String>())
- }
- },
+ (Some(string), Some(n)) => {
+ let byte_length = left_byte_length(string, n);
+ // Extract first `byte_length` bytes from a byte-indexed slice
+ Some(&string[0..byte_length])
+ }
_ => None,
})
.collect::<GenericStringArray<T>>();
Ok(Arc::new(result) as ArrayRef)
}
+/// `left` implementation for StringViewArray
+fn left_impl_view(
+ string_view_array: &StringViewArray,
+ n_array: &Int64Array,
+) -> Result<ArrayRef> {
+ let len = n_array.len();
+
+ let (views, buffers, _nulls) = string_view_array.clone().into_parts();
+
+ if string_view_array.len() != n_array.len() {
+ return exec_err!(
+ "Expected same shape of arrays, given {} and {}",
+ string_view_array.len(),
+ n_array.len()
+ );
+ }
+
+ // Every string in StringViewArray has one corresponding view in `views`
+ if views.len() != string_view_array.len() {
+ return exec_err!(
+ "StringViewArray views length {} does not match array length {}",
+ views.len(),
+ string_view_array.len()
+ );
+ }
+ let mut null_buffer_builder = NullBufferBuilder::new(len);
+ let mut new_views = Vec::with_capacity(len);
+
+ for idx in 0..len {
+ let view = views[idx];
+
+ if string_view_array.is_valid(idx) && n_array.is_valid(idx) {
+ let string: &str = string_view_array.value(idx);
+ let n = n_array.value(idx);
+
+ let byte_length = left_byte_length(string, n);
+ let new_length: u32 = byte_length.try_into().map_err(|_| {
+ exec_datafusion_err!("String is larger than 32-bit limit at
index {idx}")
Review Comment:
Is this error case ever possible?
##########
datafusion/functions/src/unicode/left.rs:
##########
@@ -121,61 +125,175 @@ impl ScalarUDFImpl for LeftFunc {
/// Returns first n characters in the string, or when n is negative, returns
all but last |n| characters.
/// left('abcde', 2) = 'ab'
+/// left('abcde', -2) = 'ab'
/// The implementation uses UTF-8 code points as characters
fn left<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
let n_array = as_int64_array(&args[1])?;
if args[0].data_type() == &DataType::Utf8View {
- let string_array = as_string_view_array(&args[0])?;
- left_impl::<T, _>(string_array, n_array)
+ let string_view_array = as_string_view_array(&args[0])?;
+ left_impl_view(string_view_array, n_array)
} else {
let string_array = as_generic_string_array::<T>(&args[0])?;
left_impl::<T, _>(string_array, n_array)
}
}
+/// `left` implementation for strings
fn left_impl<'a, T: OffsetSizeTrait, V: ArrayAccessor<Item = &'a str>>(
string_array: V,
n_array: &Int64Array,
) -> Result<ArrayRef> {
let iter = ArrayIter::new(string_array);
- let mut chars_buf = Vec::new();
let result = iter
.zip(n_array.iter())
.map(|(string, n)| match (string, n) {
- (Some(string), Some(n)) => match n.cmp(&0) {
- Ordering::Less => {
- // Collect chars once and reuse for both count and take
- chars_buf.clear();
- chars_buf.extend(string.chars());
- let len = chars_buf.len() as i64;
-
- // For negative n, take (len + n) chars if n > -len
(avoiding abs() which panics on i64::MIN)
- Some(if n > -len {
- chars_buf
- .iter()
- .take((len + n) as usize)
- .collect::<String>()
- } else {
- "".to_string()
- })
- }
- Ordering::Equal => Some("".to_string()),
- Ordering::Greater => {
- Some(string.chars().take(n as usize).collect::<String>())
- }
- },
+ (Some(string), Some(n)) => {
+ let byte_length = left_byte_length(string, n);
+ // Extract first `byte_length` bytes from a byte-indexed slice
+ Some(&string[0..byte_length])
+ }
_ => None,
})
.collect::<GenericStringArray<T>>();
Ok(Arc::new(result) as ArrayRef)
}
+/// `left` implementation for StringViewArray
+fn left_impl_view(
+ string_view_array: &StringViewArray,
+ n_array: &Int64Array,
+) -> Result<ArrayRef> {
+ let len = n_array.len();
+
+ let (views, buffers, _nulls) = string_view_array.clone().into_parts();
+
+ if string_view_array.len() != n_array.len() {
+ return exec_err!(
+ "Expected same shape of arrays, given {} and {}",
+ string_view_array.len(),
+ n_array.len()
+ );
+ }
+
+ // Every string in StringViewArray has one corresponding view in `views`
+ if views.len() != string_view_array.len() {
+ return exec_err!(
+ "StringViewArray views length {} does not match array length {}",
+ views.len(),
+ string_view_array.len()
+ );
+ }
+ let mut null_buffer_builder = NullBufferBuilder::new(len);
Review Comment:
We can use `NullBuffer::union` to construct the nulls instead of a builder
-
https://docs.rs/arrow/latest/arrow/buffer/struct.NullBuffer.html#method.union
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]