This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new c884f1d093 feat(arrow-array): add GenericByteViewArray::total_bytes_len (#9641)
c884f1d093 is described below

commit c884f1d093276f91c72d957c70df3535c2467ea4
Author: Harrison Crosse <[email protected]>
AuthorDate: Mon Apr 6 15:34:09 2026 -0400

    feat(arrow-array): add GenericByteViewArray::total_bytes_len (#9641)
    
    # Which issue does this PR close?
    
    - Closes #9435
    
    # Rationale for this change
    
    `total_buffer_bytes_used()` only counts non-inlined strings (> 12
    bytes), so it returns 0 for arrays of short strings. This makes it
    unsuitable as a capacity hint when pre-allocating output buffers (e.g.
    in DataFusion's `concat()`/`concat_ws()`).
    
    # What changes are included in this PR?
    
    Adds `total_bytes_len()` to `GenericByteViewArray`, which sums byte
    lengths of all non-null values including inlined strings.
    
    # Are there any user-facing changes?
    
    New public method on `GenericByteViewArray` (and by extension
    `StringViewArray` / `BinaryViewArray`).
---
 arrow-array/src/array/byte_view_array.rs | 68 ++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs
index a4a319df64..be6e799221 100644
--- a/arrow-array/src/array/byte_view_array.rs
+++ b/arrow-array/src/array/byte_view_array.rs
@@ -670,6 +670,37 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
         }
     }
 
+    /// Returns the total number of bytes of all non-null values in this array.
+    ///
+    /// Unlike [`Self::total_buffer_bytes_used`], this method includes inlined strings
+    /// (those with length ≤ [`MAX_INLINE_VIEW_LEN`]), making it suitable as a
+    /// capacity hint when pre-allocating output buffers.
+    ///
+    /// Null values are excluded from the sum.
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// # use arrow_array::StringViewArray;
+    /// let array = StringViewArray::from_iter(vec![
+    ///     Some("hello"),   // 5 bytes, inlined
+    ///     None,            // excluded
+    ///     Some("large payload over 12 bytes"),  // 27 bytes, non-inlined
+    /// ]);
+    /// assert_eq!(array.total_bytes_len(), 5 + 27);
+    /// ```
+    pub fn total_bytes_len(&self) -> usize {
+        match self.nulls() {
+            None => self.views().iter().map(|v| (*v as u32) as usize).sum(),
+            Some(nulls) => self
+                .views()
+                .iter()
+                .zip(nulls.iter())
+                .map(|(v, is_valid)| if is_valid { (*v as u32) as usize } else { 0 })
+                .sum(),
+        }
+    }
+
     /// Returns the total number of bytes used by all non inlined views in all
     /// buffers.
     ///
@@ -1809,4 +1840,41 @@ mod tests {
         assert!(from_utf8(array.value(2)).is_ok());
         array
     }
+
+    #[test]
+    fn test_total_bytes_len() {
+        // inlined: "hello"=5, "world"=5, "lulu"=4 → 14
+        // non-inlined: "large payload over 12 bytes"=27
+        // null: should not count
+        let mut builder = StringViewBuilder::new();
+        builder.append_value("hello");
+        builder.append_value("world");
+        builder.append_value("lulu");
+        builder.append_null();
+        builder.append_value("large payload over 12 bytes");
+        let array = builder.finish();
+        assert_eq!(array.total_bytes_len(), 5 + 5 + 4 + 27);
+    }
+
+    #[test]
+    fn test_total_bytes_len_empty() {
+        let array = StringViewArray::from_iter::<Vec<Option<&str>>>(vec![]);
+        assert_eq!(array.total_bytes_len(), 0);
+    }
+
+    #[test]
+    fn test_total_bytes_len_all_nulls() {
+        let array = StringViewArray::new_null(5);
+        assert_eq!(array.total_bytes_len(), 0);
+    }
+
+    #[test]
+    fn test_total_bytes_len_binary_view() {
+        let array = BinaryViewArray::from_iter(vec![
+            Some(b"hi".as_ref()),
+            None,
+            Some(b"large payload over 12 bytes".as_ref()),
+        ]);
+        assert_eq!(array.total_bytes_len(), 2 + 27);
+    }
 }

Reply via email to