tustvold commented on code in PR #3607:
URL: https://github.com/apache/arrow-rs/pull/3607#discussion_r1088664717


##########
arrow-cast/src/cast.rs:
##########
@@ -3436,6 +3392,77 @@ fn cast_list_inner<OffsetSize: OffsetSizeTrait>(
     Ok(Arc::new(list) as ArrayRef)
 }
 
+/// Helper function to cast from `GenericBinaryArray` to `GenericStringArray`. 
This function performs
+/// UTF8 validation during casting. For invalid UTF8 value, it could be Null 
or returning `Err` depending
+/// `CastOptions`.
+fn cast_binary_to_generic_string<I, O>(
+    array: &dyn Array,
+    cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError>
+where
+    I: OffsetSizeTrait + ToPrimitive,
+    O: OffsetSizeTrait + NumCast,
+{
+    let array = array
+        .as_any()
+        .downcast_ref::<GenericByteArray<GenericBinaryType<I>>>()
+        .unwrap();
+
+    if !cast_options.safe {
+        let offsets = array.value_offsets();
+        let values = array.value_data();
+
+        // We only need to validate that all values are valid UTF-8
+        let validated = std::str::from_utf8(values)
+            .map_err(|_| ArrowError::CastError("Invalid UTF-8 
sequence".to_string()))?;
+
+        let mut offset_builder = BufferBuilder::<O>::new(offsets.len());
+        offsets
+            .iter()
+            .try_for_each::<_, Result<_, ArrowError>>(|offset| {
+                if !validated.is_char_boundary(offset.as_usize()) {
+                    return Err(ArrowError::CastError(
+                        "Invalid UTF-8 sequence".to_string(),
+                    ));
+                }
+
+                let offset = <O as NumCast>::from(*offset).ok_or_else(|| {
+                    ArrowError::ComputeError(format!(
+                        "{}Binary array too large to cast to {}String array",
+                        I::PREFIX,
+                        O::PREFIX
+                    ))
+                })?;
+                offset_builder.append(offset);
+                Ok(())
+            })?;
+
+        let offset_buffer = offset_builder.finish();
+
+        let builder = ArrayData::builder(GenericStringArray::<O>::DATA_TYPE)

Review Comment:
   You could copy across the null count as well.



##########
arrow-cast/src/cast.rs:
##########
@@ -3436,6 +3392,77 @@ fn cast_list_inner<OffsetSize: OffsetSizeTrait>(
     Ok(Arc::new(list) as ArrayRef)
 }
 
+/// Helper function to cast from `GenericBinaryArray` to `GenericStringArray`. 
This function performs
+/// UTF8 validation during casting. For invalid UTF8 value, it could be Null 
or returning `Err` depending
+/// `CastOptions`.
+fn cast_binary_to_generic_string<I, O>(
+    array: &dyn Array,
+    cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError>
+where
+    I: OffsetSizeTrait + ToPrimitive,
+    O: OffsetSizeTrait + NumCast,
+{
+    let array = array
+        .as_any()
+        .downcast_ref::<GenericByteArray<GenericBinaryType<I>>>()
+        .unwrap();
+
+    if !cast_options.safe {
+        let offsets = array.value_offsets();
+        let values = array.value_data();
+
+        // We only need to validate that all values are valid UTF-8

Review Comment:
   It seems a shame to duplicate this logic, but I guess it is hard to avoid
   while keeping this type signature.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to