tustvold commented on code in PR #3607:
URL: https://github.com/apache/arrow-rs/pull/3607#discussion_r1089182373
##########
arrow-cast/src/cast.rs:
##########
@@ -3436,6 +3392,77 @@ fn cast_list_inner<OffsetSize: OffsetSizeTrait>(
Ok(Arc::new(list) as ArrayRef)
}
+/// Helper function to cast from `GenericBinaryArray` to `GenericStringArray`.
This function performs
+/// UTF8 validation during casting. For invalid UTF8 value, it could be Null
or returning `Err` depending
+/// `CastOptions`.
+fn cast_binary_to_generic_string<I, O>(
+ array: &dyn Array,
+ cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError>
+where
+ I: OffsetSizeTrait + ToPrimitive,
+ O: OffsetSizeTrait + NumCast,
+{
+ let array = array
+ .as_any()
+ .downcast_ref::<GenericByteArray<GenericBinaryType<I>>>()
+ .unwrap();
+
+ if !cast_options.safe {
+ let offsets = array.value_offsets();
+ let values = array.value_data();
+
+ // We only need to validate that all values are valid UTF-8
+ let validated = std::str::from_utf8(values)
+ .map_err(|_| ArrowError::CastError("Invalid UTF-8
sequence".to_string()))?;
+
+ let mut offset_builder = BufferBuilder::<O>::new(offsets.len());
+ offsets
+ .iter()
+ .try_for_each::<_, Result<_, ArrowError>>(|offset| {
+ if !validated.is_char_boundary(offset.as_usize()) {
+ return Err(ArrowError::CastError(
+ "Invalid UTF-8 sequence".to_string(),
+ ));
+ }
+
+ let offset = <O as NumCast>::from(*offset).ok_or_else(|| {
+ ArrowError::ComputeError(format!(
+ "{}Binary array too large to cast to {}String array",
+ I::PREFIX,
+ O::PREFIX
+ ))
+ })?;
+ offset_builder.append(offset);
+ Ok(())
+ })?;
+
+ let offset_buffer = offset_builder.finish();
+
+ let builder = ArrayData::builder(GenericStringArray::<O>::DATA_TYPE)
+ .offset(array.offset())
Review Comment:
I think this is incorrect: we compute a new offset buffer above, so this
`.offset(array.offset())` call should be removed.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]