This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 3adccb9ae7 Respect `CastOptions.safe` when casting `BinaryView` → 
`Utf8View` (return `null` for invalid UTF‑8) (#8415)
3adccb9ae7 is described below

commit 3adccb9ae7a1c7e84fb006230e6ad1f6baf22c8c
Author: kosiew <[email protected]>
AuthorDate: Fri Sep 26 00:14:49 2025 +0800

    Respect `CastOptions.safe` when casting `BinaryView` → `Utf8View` (return 
`null` for invalid UTF‑8) (#8415)
    
    # Which issue does this PR close?
    
    Closes #8403.
    
    ---
    
    # Rationale for this change
    
    Casting from `BinaryView` to `Utf8View` currently attempts a direct
    conversion using `to_string_view()` which returns an error if any value
    contains invalid UTF‑8. This behavior is inconsistent with other binary
    array types in Arrow, which honor `CastOptions.safe = true` by replacing
    invalid UTF‑8 sequences with `NULL` values rather than failing the
    entire cast operation.
    
    This PR makes `BinaryView`'s casting behavior consistent with other
    binary types and with user expectations: when `CastOptions.safe` is
    `true`, any value containing invalid UTF‑8 becomes `NULL` in the
    resulting `StringViewArray`; when `CastOptions.safe` is `false`, the
    cast retains the existing failure behavior.
    
    ---
    
    # What changes are included in this PR?
    
    * Change `cast_with_options` to delegate the `BinaryView -> Utf8View`
    branch to a new helper function `cast_binary_view_to_string_view(array,
    cast_options)` instead of directly calling `to_string_view()` and
    erroring.
    
    * Add `extend_valid_utf8` helper to centralize the logic of mapping
    `Option<&[u8]>` to `Option<&str>` (using
    `std::str::from_utf8(...).ok()`), and reuse it for both
    `GenericStringBuilder` and `StringViewBuilder` flows.
    
    * Implement `cast_binary_view_to_string_view` which:
    
      * Attempts `array.clone().to_string_view()` (fast, zero-copy path) and
        returns it when `Ok`.
      * On `Err`, checks `cast_options.safe`:
    
        * If `true`, builds a `StringViewArray` by filtering invalid UTF‑8 to
          `NULL` using `extend_valid_utf8` and returns that array.
        * If `false`, propagates the original error (existing behavior).
    
    * Add a unit test `test_binary_view_to_string_view_with_invalid_utf8`
    covering both `safe=false` (expect error) and `safe=true` (expect `NULL`
    where invalid UTF‑8 occurred).
    
    Files changed (high level):
    
    * `arrow-cast/src/cast/mod.rs`: route `BinaryView -> Utf8View` case to
    the new helper.
    * `arrow-cast/src/cast/string.rs`: add `extend_valid_utf8` and
    `cast_binary_view_to_string_view`, and use `extend_valid_utf8` from an
    existing cast path.
    
    ---
    
    # Are there any user-facing changes?
    
    Yes — this changes the observable behavior of casting `BinaryView` to
    `Utf8View`:
    
    * With `CastOptions.safe = true` (the safe mode), invalid UTF‑8 in
    `BinaryView` elements will be converted to `NULL` in the resulting
    `Utf8View` array instead of causing the entire cast to fail.
    * With `CastOptions.safe = false`, invalid UTF‑8 still causes the
    cast to fail as before.
    
    This is a bug fix aligning `BinaryView` with the semantics of other
    binary types and with documented expectations for `CastOptions.safe`.
    
    No public API surface is changed beyond the fixed behavior; the new
    helpers are crate-private.
    
    ---------
    
    Co-authored-by: Andrew Lamb <[email protected]>
---
 arrow-cast/src/cast/mod.rs    | 36 +++++++++++++++++++++++++++++++++---
 arrow-cast/src/cast/string.rs | 33 ++++++++++++++++++++++++++++-----
 2 files changed, 61 insertions(+), 8 deletions(-)

diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs
index 71de8f9f18..2034b30cb3 100644
--- a/arrow-cast/src/cast/mod.rs
+++ b/arrow-cast/src/cast/mod.rs
@@ -1422,9 +1422,7 @@ pub fn cast_with_options(
             let binary_arr = cast_view_to_byte::<BinaryViewType, 
GenericBinaryType<i64>>(array)?;
             cast_binary_to_string::<i64>(&binary_arr, cast_options)
         }
-        (BinaryView, Utf8View) => {
-            Ok(Arc::new(array.as_binary_view().clone().to_string_view()?) as 
ArrayRef)
-        }
+        (BinaryView, Utf8View) => cast_binary_view_to_string_view(array, 
cast_options),
         (BinaryView, _) => Err(ArrowError::CastError(format!(
             "Casting from {from_type} to {to_type} not supported",
         ))),
@@ -6388,6 +6386,38 @@ mod tests {
         assert_eq!(string_view_array.as_ref(), &expect_string_view_array);
     }
 
+    #[test]
+    fn test_binary_view_to_string_view_with_invalid_utf8() {
+        let binary_view_array = BinaryViewArray::from_iter(vec![
+            Some("valid".as_bytes()),
+            Some(&[0xff]),
+            Some("utf8".as_bytes()),
+            None,
+        ]);
+
+        let strict_options = CastOptions {
+            safe: false,
+            ..Default::default()
+        };
+
+        assert!(
+            cast_with_options(&binary_view_array, &DataType::Utf8View, 
&strict_options).is_err()
+        );
+
+        let safe_options = CastOptions {
+            safe: true,
+            ..Default::default()
+        };
+
+        let string_view_array =
+            cast_with_options(&binary_view_array, &DataType::Utf8View, 
&safe_options).unwrap();
+        assert_eq!(string_view_array.data_type(), &DataType::Utf8View);
+
+        let values: Vec<_> = 
string_view_array.as_string_view().iter().collect();
+
+        assert_eq!(values, vec![Some("valid"), None, Some("utf8"), None]);
+    }
+
     #[test]
     fn test_string_to_view() {
         _test_string_to_view::<i32>();
diff --git a/arrow-cast/src/cast/string.rs b/arrow-cast/src/cast/string.rs
index 7cc42450f4..77696ae0d8 100644
--- a/arrow-cast/src/cast/string.rs
+++ b/arrow-cast/src/cast/string.rs
@@ -338,6 +338,14 @@ where
 
 /// A specified helper to cast from `GenericBinaryArray` to 
`GenericStringArray` when they have same
 /// offset size so re-encoding offset is unnecessary.
+fn extend_valid_utf8<'a, B, I>(builder: &mut B, iter: I)
+where
+    B: Extend<Option<&'a str>>,
+    I: Iterator<Item = Option<&'a [u8]>>,
+{
+    builder.extend(iter.map(|value| value.and_then(|bytes| 
std::str::from_utf8(bytes).ok())));
+}
+
 pub(crate) fn cast_binary_to_string<O: OffsetSizeTrait>(
     array: &dyn Array,
     cast_options: &CastOptions,
@@ -355,11 +363,7 @@ pub(crate) fn cast_binary_to_string<O: OffsetSizeTrait>(
                 let mut builder =
                     GenericStringBuilder::<O>::with_capacity(array.len(), 
array.value_data().len());
 
-                let iter = array
-                    .iter()
-                    .map(|v| v.and_then(|v| std::str::from_utf8(v).ok()));
-
-                builder.extend(iter);
+                extend_valid_utf8(&mut builder, array.iter());
                 Ok(Arc::new(builder.finish()))
             }
             false => Err(e),
@@ -367,6 +371,25 @@ pub(crate) fn cast_binary_to_string<O: OffsetSizeTrait>(
     }
 }
 
+pub(crate) fn cast_binary_view_to_string_view(
+    array: &dyn Array,
+    cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError> {
+    let array = array.as_binary_view();
+
+    match array.clone().to_string_view() {
+        Ok(result) => Ok(Arc::new(result)),
+        Err(error) => match cast_options.safe {
+            true => {
+                let mut builder = 
StringViewBuilder::with_capacity(array.len());
+                extend_valid_utf8(&mut builder, array.iter());
+                Ok(Arc::new(builder.finish()))
+            }
+            false => Err(error),
+        },
+    }
+}
+
 /// Casts string to boolean
 fn cast_string_to_boolean<'a, StrArray>(
     array: &StrArray,

Reply via email to