This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 9acc9fa0b8 Minor API adjustments for StringViewBuilder (#6047)
9acc9fa0b8 is described below

commit 9acc9fa0b84755c5caffb8acf806fa4fd21928bb
Author: Xiangpeng Hao <haoxiangpeng...@gmail.com>
AuthorDate: Mon Jul 15 05:52:37 2024 -0400

    Minor API adjustments for StringViewBuilder (#6047)
    
    * minor update
    
    * add memory accounting
    
    * Update arrow-buffer/src/builder/null.rs
    
    Co-authored-by: Andrew Lamb <and...@nerdnetworks.org>
    
    * Update arrow-array/src/builder/generic_bytes_view_builder.rs
    
    Co-authored-by: Andrew Lamb <and...@nerdnetworks.org>
    
    * update comments
    
    ---------
    
    Co-authored-by: Andrew Lamb <and...@nerdnetworks.org>
---
 arrow-array/src/array/byte_view_array.rs              |  6 ++++--
 arrow-array/src/builder/generic_bytes_view_builder.rs | 16 +++++++++++++++-
 arrow-buffer/src/builder/null.rs                      |  8 ++++++++
 3 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/arrow-array/src/array/byte_view_array.rs b/arrow-array/src/array/byte_view_array.rs
index a00bf7271b..5ce150a7a3 100644
--- a/arrow-array/src/array/byte_view_array.rs
+++ b/arrow-array/src/array/byte_view_array.rs
@@ -324,9 +324,11 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
     /// Note that it will copy the array regardless of whether the original array is compact.
     /// Use with caution as this can be an expensive operation, only use it when you are sure that the view
     /// array is significantly smaller than when it is originally created, e.g., after filtering or slicing.
+    ///
+    /// Note: this function does not attempt to canonicalize / deduplicate values. For this
+    /// feature see  [`GenericByteViewBuilder::with_deduplicate_strings`].
     pub fn gc(&self) -> Self {
-        let mut builder =
-            GenericByteViewBuilder::<T>::with_capacity(self.len()).with_deduplicate_strings();
+        let mut builder = GenericByteViewBuilder::<T>::with_capacity(self.len());
 
         for v in self.iter() {
             builder.append_option(v);
diff --git a/arrow-array/src/builder/generic_bytes_view_builder.rs b/arrow-array/src/builder/generic_bytes_view_builder.rs
index dda5535456..587255cc6b 100644
--- a/arrow-array/src/builder/generic_bytes_view_builder.rs
+++ b/arrow-array/src/builder/generic_bytes_view_builder.rs
@@ -201,7 +201,8 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
 
     /// Returns the value at the given index
     /// Useful if we want to know what value has been inserted to the builder
-    fn get_value(&self, index: usize) -> &[u8] {
+    /// The index has to be smaller than `self.len()`, otherwise it will panic
+    pub fn get_value(&self, index: usize) -> &[u8] {
         let view = self.views_builder.as_slice().get(index).unwrap();
         let len = *view as u32;
         if len <= 12 {
@@ -337,6 +338,19 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
     pub fn validity_slice(&self) -> Option<&[u8]> {
         self.null_buffer_builder.as_slice()
     }
+
+    /// Return the allocated size of this builder in bytes, useful for memory accounting.
+    pub fn allocated_size(&self) -> usize {
+        let views = self.views_builder.capacity() * std::mem::size_of::<u128>();
+        let null = self.null_buffer_builder.allocated_size();
+        let buffer_size = self.completed.iter().map(|b| b.capacity()).sum::<usize>();
+        let in_progress = self.in_progress.capacity();
+        let tracker = match &self.string_tracker {
+            Some((ht, _)) => ht.capacity() * std::mem::size_of::<usize>(),
+            None => 0,
+        };
+        buffer_size + in_progress + tracker + views + null
+    }
 }
 
 impl<T: ByteViewType + ?Sized> Default for GenericByteViewBuilder<T> {
diff --git a/arrow-buffer/src/builder/null.rs b/arrow-buffer/src/builder/null.rs
index 55b3303c9e..a1cea6ef2c 100644
--- a/arrow-buffer/src/builder/null.rs
+++ b/arrow-buffer/src/builder/null.rs
@@ -161,6 +161,14 @@ impl NullBufferBuilder {
     pub fn as_slice_mut(&mut self) -> Option<&mut [u8]> {
         self.bitmap_builder.as_mut().map(|b| b.as_slice_mut())
     }
+
+    /// Return the allocated size of this builder, in bytes, useful for memory accounting.
+    pub fn allocated_size(&self) -> usize {
+        self.bitmap_builder
+            .as_ref()
+            .map(|b| b.capacity())
+            .unwrap_or(0)
+    }
 }
 
 impl NullBufferBuilder {

Reply via email to