alamb commented on code in PR #7650:
URL: https://github.com/apache/arrow-rs/pull/7650#discussion_r2143592404


##########
arrow-select/src/coalesce.rs:
##########
@@ -222,122 +250,333 @@ impl BatchCoalescer {
     }
 }
 
-/// Heuristically compact `StringViewArray`s to reduce memory usage, if needed
-///
-/// Decides when to consolidate the StringView into a new buffer to reduce
-/// memory usage and improve string locality for better performance.
-///
-/// This differs from `StringViewArray::gc` because:
-/// 1. It may not compact the array depending on a heuristic.
-/// 2. It uses a precise block size to reduce the number of buffers to track.
-///
-/// # Heuristic
+/// Return a new `InProgressArray` for the given data type
+fn create_in_progress_array(data_type: &DataType, batch_size: usize) -> 
Box<dyn InProgressArray> {
+    match data_type {
+        DataType::Utf8View => 
Box::new(InProgressStringViewArray::new(batch_size)),
+        _ => Box::new(GenericInProgressArray::new()),
+    }
+}
+
+/// Incrementally builds in progress arrays
 ///
-/// If the average size of each view is larger than 32 bytes, we compact the 
array.
+/// There are different specialized implementations of this trait for different
+/// array types (e.g., [`StringViewArray`], [`UInt32Array`], etc.).
 ///
-/// `StringViewArray` include pointers to buffer that hold the underlying data.
-/// One of the great benefits of `StringViewArray` is that many operations
-/// (e.g., `filter`) can be done without copying the underlying data.
+/// This is a subset of the ArrayBuilder APIs, but specialized for
+/// the incremental usecase
+trait InProgressArray: std::fmt::Debug {
+    /// Push a new array to the in-progress array
+    fn push_array(&mut self, array: ArrayRef);
+
+    /// Finish the currently in-progress array and clear state for the next
+    fn finish(&mut self) -> Result<ArrayRef, ArrowError>;
+}
+
+/// Fallback implementation for [`InProgressArray`]
 ///
-/// However, after a while (e.g., after `FilterExec` or `HashJoinExec`) the
-/// `StringViewArray` may only refer to a small portion of the buffer,
-/// significantly increasing memory usage.
-fn gc_string_view_batch(batch: RecordBatch) -> RecordBatch {

Review Comment:
   The logic of `gc_string_view_batch` has been incorporated into 
`InProgressStringViewArray`, and thus saves a (third!!!)  copy of the strings 
for some StringViewArrays



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to