RinChanNOWWW commented on code in PR #5707:
URL: https://github.com/apache/arrow-rs/pull/5707#discussion_r1589867704


##########
arrow-array/src/array/byte_view_array.rs:
##########
@@ -265,6 +266,115 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
             phantom: Default::default(),
         }
     }
+
+    /// Returns whether the buffers are compact
+    pub(self) fn compact_check(&self) -> Vec<bool> {
+        let mut checkers: Vec<_> = self
+            .buffers
+            .iter()
+            .map(|b| CompactChecker::new(b.len()))
+            .collect();
+
+        for (i, view) in self.views.iter().enumerate() {
+            let view = ByteView::from(*view);
+            if self.is_null(i) || view.length <= 12 {
+                continue;
+            }
+            checkers[view.buffer_index as usize]
+                .accumulate(view.offset as usize, view.length as usize);
+        }
+        checkers.into_iter().map(|c| c.finish()).collect()
+    }
+
+    /// Returns a buffer compact version of this array
+    ///
+    /// The original array will *not* be modified
+    ///
+    /// # Garbage Collection
+    ///
+    /// Before GC:
+    /// ```text
+    ///                                        ┌──────┐                 
+    ///                                        │......│                 
+    ///                                        │......│                 
+    /// ┌────────────────────┐       ┌ ─ ─ ─ ▶ │Data1 │   Large buffer  
+    /// │       View 1       │─ ─ ─ ─          │......│  with data that
+    /// ├────────────────────┤                 │......│ is not referred
+    /// │       View 2       │─ ─ ─ ─ ─ ─ ─ ─▶ │Data2 │ to by View 1 or
+    /// └────────────────────┘                 │......│      View 2     
+    ///                                        │......│                 
+    ///    2 views, refer to                   │......│                 
+    ///   small portions of a                  └──────┘                 
+    ///      large buffer                                               
+    /// ```
+    ///                                                                
+    /// After GC:
+    ///
+    /// ```text
+    /// ┌────────────────────┐                 ┌─────┐    After gc, only
+    /// │       View 1       │─ ─ ─ ─ ─ ─ ─ ─▶ │Data1│     data that is  
+    /// ├────────────────────┤       ┌ ─ ─ ─ ▶ │Data2│    pointed to by  
+    /// │       View 2       │─ ─ ─ ─          └─────┘     the views is  
+    /// └────────────────────┘                                 left      
+    ///                                                                  
+    ///                                                                  
+    ///         2 views                                                  
+    /// ```
+    /// This method will compact the data buffers to only include the data
+    /// that is pointed to by the views,
+    /// and return a new array with the compacted data buffers.
+    /// The original array will be left as is.
+    pub fn gc(&self) -> Self {
+        let check_result = self.compact_check();
+
+        if check_result.iter().all(|x| *x) {
+            return self.clone();
+        }
+
+        let mut new_views = Vec::with_capacity(self.views.len());
+        let mut new_bufs: Vec<Vec<u8>> = vec![vec![]; self.buffers.len()];

Review Comment:
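   The number of buffers may shrink after `gc`, so `new_bufs` should not be
   pre-sized to `self.buffers.len()`. Instead, each new buffer should be filled
   up to `block_size` before the next one is started.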
   
   See `GenericByteViewBuilder::append_value`.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to