This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 3adccb9ae7 Respect `CastOptions.safe` when casting `BinaryView` →
`Utf8View` (return `null` for invalid UTF‑8) (#8415)
3adccb9ae7 is described below
commit 3adccb9ae7a1c7e84fb006230e6ad1f6baf22c8c
Author: kosiew <[email protected]>
AuthorDate: Fri Sep 26 00:14:49 2025 +0800
Respect `CastOptions.safe` when casting `BinaryView` → `Utf8View` (return
`null` for invalid UTF‑8) (#8415)
# Which issue does this PR close?
Closes #8403.
---
# Rationale for this change
Casting from `BinaryView` to `Utf8View` currently attempts a direct
conversion using `to_string_view()`, which returns an error if any value
contains invalid UTF‑8. This behavior is inconsistent with other binary
array types in Arrow, which honor `CastOptions.safe = true` by replacing
each value that contains invalid UTF‑8 with `NULL` rather than failing
the entire cast operation.
This PR makes `BinaryView`'s casting behavior consistent with other
binary types and with user expectations: when `CastOptions.safe` is
`true`, values containing invalid UTF‑8 become `NULL` in the resulting
`StringViewArray`; when `CastOptions.safe` is `false`, the cast retains
the existing failure behavior.
---
# What changes are included in this PR?
* Change `cast_with_options` to delegate the `BinaryView -> Utf8View`
branch to a new helper function `cast_binary_view_to_string_view(array,
cast_options)` instead of directly calling `to_string_view()` and
erroring.
* Add `extend_valid_utf8` helper to centralize the logic of mapping
`Option<&[u8]>` to `Option<&str>` (using
`std::str::from_utf8(...).ok()`), and reuse it for both
`GenericStringBuilder` and `StringViewBuilder` flows.
* Implement `cast_binary_view_to_string_view` which:
* Attempts `array.clone().to_string_view()` (fast, zero-copy path) and
returns it when `Ok`.
* On `Err`, checks `cast_options.safe`:
* If `true`, builds a `StringViewArray` in which every value that is not
valid UTF‑8 becomes `NULL` (via `extend_valid_utf8`) and returns that array.
* If `false`, propagates the original error (existing behavior).
* Add a unit test `test_binary_view_to_string_view_with_invalid_utf8`
covering both `safe=false` (expect error) and `safe=true` (expect `NULL`
where invalid UTF‑8 occurred).
Files changed (high level):
* `arrow-cast/src/cast/mod.rs`: route `BinaryView -> Utf8View` case to
the new helper.
* `arrow-cast/src/cast/string.rs`: add `extend_valid_utf8` and
`cast_binary_view_to_string_view`, and use `extend_valid_utf8` from an
existing cast path.
---
# Are there any user-facing changes?
Yes — this changes the observable behavior of casting `BinaryView` to
`Utf8View`:
* With `CastOptions.safe = true` (the safe mode), invalid UTF‑8 in
`BinaryView` elements will be converted to `NULL` in the resulting
`Utf8View` array instead of causing the entire cast to fail.
* With `CastOptions.safe = false`, invalid UTF‑8 still causes the
cast to fail, as before.
This is a bug fix aligning `BinaryView` with the semantics of other
binary types and with documented expectations for `CastOptions.safe`.
No public API surface is changed beyond the fixed behavior; the new
helpers are crate-private.
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
arrow-cast/src/cast/mod.rs | 36 +++++++++++++++++++++++++++++++++---
arrow-cast/src/cast/string.rs | 33 ++++++++++++++++++++++++++++-----
2 files changed, 61 insertions(+), 8 deletions(-)
diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs
index 71de8f9f18..2034b30cb3 100644
--- a/arrow-cast/src/cast/mod.rs
+++ b/arrow-cast/src/cast/mod.rs
@@ -1422,9 +1422,7 @@ pub fn cast_with_options(
let binary_arr = cast_view_to_byte::<BinaryViewType,
GenericBinaryType<i64>>(array)?;
cast_binary_to_string::<i64>(&binary_arr, cast_options)
}
- (BinaryView, Utf8View) => {
- Ok(Arc::new(array.as_binary_view().clone().to_string_view()?) as
ArrayRef)
- }
+ (BinaryView, Utf8View) => cast_binary_view_to_string_view(array,
cast_options),
(BinaryView, _) => Err(ArrowError::CastError(format!(
"Casting from {from_type} to {to_type} not supported",
))),
@@ -6388,6 +6386,38 @@ mod tests {
assert_eq!(string_view_array.as_ref(), &expect_string_view_array);
}
+ #[test]
+ fn test_binary_view_to_string_view_with_invalid_utf8() {
+ let binary_view_array = BinaryViewArray::from_iter(vec![
+ Some("valid".as_bytes()),
+ Some(&[0xff]),
+ Some("utf8".as_bytes()),
+ None,
+ ]);
+
+ let strict_options = CastOptions {
+ safe: false,
+ ..Default::default()
+ };
+
+ assert!(
+ cast_with_options(&binary_view_array, &DataType::Utf8View,
&strict_options).is_err()
+ );
+
+ let safe_options = CastOptions {
+ safe: true,
+ ..Default::default()
+ };
+
+ let string_view_array =
+ cast_with_options(&binary_view_array, &DataType::Utf8View,
&safe_options).unwrap();
+ assert_eq!(string_view_array.data_type(), &DataType::Utf8View);
+
+ let values: Vec<_> =
string_view_array.as_string_view().iter().collect();
+
+ assert_eq!(values, vec![Some("valid"), None, Some("utf8"), None]);
+ }
+
#[test]
fn test_string_to_view() {
_test_string_to_view::<i32>();
diff --git a/arrow-cast/src/cast/string.rs b/arrow-cast/src/cast/string.rs
index 7cc42450f4..77696ae0d8 100644
--- a/arrow-cast/src/cast/string.rs
+++ b/arrow-cast/src/cast/string.rs
@@ -338,6 +338,14 @@ where
/// A specified helper to cast from `GenericBinaryArray` to
`GenericStringArray` when they have same
/// offset size so re-encoding offset is unnecessary.
+fn extend_valid_utf8<'a, B, I>(builder: &mut B, iter: I)
+where
+ B: Extend<Option<&'a str>>,
+ I: Iterator<Item = Option<&'a [u8]>>,
+{
+ builder.extend(iter.map(|value| value.and_then(|bytes|
std::str::from_utf8(bytes).ok())));
+}
+
pub(crate) fn cast_binary_to_string<O: OffsetSizeTrait>(
array: &dyn Array,
cast_options: &CastOptions,
@@ -355,11 +363,7 @@ pub(crate) fn cast_binary_to_string<O: OffsetSizeTrait>(
let mut builder =
GenericStringBuilder::<O>::with_capacity(array.len(),
array.value_data().len());
- let iter = array
- .iter()
- .map(|v| v.and_then(|v| std::str::from_utf8(v).ok()));
-
- builder.extend(iter);
+ extend_valid_utf8(&mut builder, array.iter());
Ok(Arc::new(builder.finish()))
}
false => Err(e),
@@ -367,6 +371,25 @@ pub(crate) fn cast_binary_to_string<O: OffsetSizeTrait>(
}
}
+pub(crate) fn cast_binary_view_to_string_view(
+ array: &dyn Array,
+ cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError> {
+ let array = array.as_binary_view();
+
+ match array.clone().to_string_view() {
+ Ok(result) => Ok(Arc::new(result)),
+ Err(error) => match cast_options.safe {
+ true => {
+ let mut builder =
StringViewBuilder::with_capacity(array.len());
+ extend_valid_utf8(&mut builder, array.iter());
+ Ok(Arc::new(builder.finish()))
+ }
+ false => Err(error),
+ },
+ }
+}
+
/// Casts string to boolean
fn cast_string_to_boolean<'a, StrArray>(
array: &StrArray,