This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new c884f1d093 feat(arrow-array): add GenericByteViewArray::total_bytes_len (#9641)
c884f1d093 is described below
commit c884f1d093276f91c72d957c70df3535c2467ea4
Author: Harrison Crosse <[email protected]>
AuthorDate: Mon Apr 6 15:34:09 2026 -0400
feat(arrow-array): add GenericByteViewArray::total_bytes_len (#9641)
# Which issue does this PR close?
- Closes #9435
# Rationale for this change
`total_buffer_bytes_used()` only counts non-inlined strings (> 12
bytes), so it returns 0 for arrays of short strings. This makes it
unsuitable as a capacity hint when pre-allocating output buffers (e.g.
in DataFusion's `concat()`/`concat_ws()`).
# What changes are included in this PR?
Adds `total_bytes_len()` to `GenericByteViewArray`, which sums byte
lengths of all non-null values including inlined strings.
# Are there any user-facing changes?
New public method on `GenericByteViewArray` (and by extension
`StringViewArray` / `BinaryViewArray`).
---
arrow-array/src/array/byte_view_array.rs | 68 ++++++++++++++++++++++++++++++++
1 file changed, 68 insertions(+)
diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs
index a4a319df64..be6e799221 100644
--- a/arrow-array/src/array/byte_view_array.rs
+++ b/arrow-array/src/array/byte_view_array.rs
@@ -670,6 +670,37 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
}
}
+ /// Returns the total number of bytes of all non-null values in this array.
+ ///
+ /// Unlike [`Self::total_buffer_bytes_used`], this method includes inlined values
+ /// (those with length ≤ [`MAX_INLINE_VIEW_LEN`]), making it suitable as a
+ /// capacity hint when pre-allocating output buffers.
+ ///
+ /// Null values are excluded from the sum.
+ ///
+ /// # Example
+ ///
+ /// ```rust
+ /// # use arrow_array::StringViewArray;
+ /// let array = StringViewArray::from_iter(vec![
+ /// Some("hello"), // 5 bytes, inlined
+ /// None, // excluded
+ /// Some("large payload over 12 bytes"), // 27 bytes, non-inlined
+ /// ]);
+ /// assert_eq!(array.total_bytes_len(), 5 + 27);
+ /// ```
+ pub fn total_bytes_len(&self) -> usize {
+ match self.nulls() { // a value's byte length is read from its view via the `as u32` cast
+ None => self.views().iter().map(|v| (*v as u32) as usize).sum(), // `nulls()` is `None`: count every view
+ Some(nulls) => self
+ .views()
+ .iter()
+ .zip(nulls.iter()) // pair each view with its validity flag
+ .map(|(v, is_valid)| if is_valid { (*v as u32) as usize } else { 0 }) // invalid (null) slots add 0
+ .sum(),
+ }
+ }
+
/// Returns the total number of bytes used by all non inlined views in all
/// buffers.
///
@@ -1809,4 +1840,41 @@ mod tests {
assert!(from_utf8(array.value(2)).is_ok());
array
}
+
+ #[test]
+ fn test_total_bytes_len() {
+ // Inlined values: "hello"=5, "world"=5, "lulu"=4 → 14 bytes in total
+ // Non-inlined value: "large payload over 12 bytes"=27 bytes
+ // Null entry: must not contribute to the total
+ let mut builder = StringViewBuilder::new();
+ builder.append_value("hello");
+ builder.append_value("world");
+ builder.append_value("lulu");
+ builder.append_null(); // the slot that must be excluded from the sum
+ builder.append_value("large payload over 12 bytes");
+ let array = builder.finish();
+ assert_eq!(array.total_bytes_len(), 5 + 5 + 4 + 27); // 14 inlined + 27 non-inlined
+ }
+
+ #[test]
+ fn test_total_bytes_len_empty() {
+ let array = StringViewArray::from_iter::<Vec<Option<&str>>>(vec![]); // built from an empty vec
+ assert_eq!(array.total_bytes_len(), 0); // nothing to sum
+ }
+
+ #[test]
+ fn test_total_bytes_len_all_nulls() {
+ let array = StringViewArray::new_null(5); // per the test name, every slot is expected to be null
+ assert_eq!(array.total_bytes_len(), 0); // no value may be counted
+ }
+
+ #[test]
+ fn test_total_bytes_len_binary_view() {
+ let array = BinaryViewArray::from_iter(vec![
+ Some(b"hi".as_ref()), // 2 bytes
+ None, // excluded from the sum
+ Some(b"large payload over 12 bytes".as_ref()), // 27 bytes
+ ]);
+ assert_eq!(array.total_bytes_len(), 2 + 27); // same accounting as the string-view test, on binary data
+ }
}