alamb commented on code in PR #11587:
URL: https://github.com/apache/datafusion/pull/11587#discussion_r1691747642


##########
datafusion/physical-plan/src/coalesce_batches.rs:
##########
@@ -369,4 +443,100 @@ mod tests {
         )
         .unwrap()
     }
+
+    #[test]
+    fn test_gc_string_view_batch_small_no_compact() {
+        // view with only short strings (no buffers) --> no need to compact
+        let array = StringViewTest {
+            rows: 1000,
+            strings: vec![Some("a"), Some("b"), Some("c")],
+        }
+        .build();
+
+        let gc_array = do_gc(array.clone());
+        compare_string_array_values(&array, &gc_array);
+        assert_eq!(array.data_buffers().len(), 0);
+        assert_eq!(array.data_buffers().len(), gc_array.data_buffers().len()); 
// no compaction
+    }
+
+    #[test]
+    fn test_gc_string_view_batch_large_no_compact() {
+        // view with large strings (has buffers) but full --> no need to 
compact
+        let array = StringViewTest {
+            rows: 1000,
+            strings: vec![Some("This string is longer than 12 bytes")],
+        }
+        .build();
+
+        let gc_array = do_gc(array.clone());
+        compare_string_array_values(&array, &gc_array);
+        assert_eq!(array.data_buffers().len(), 5);
+        // TODO this is failing now (it always compacts)

Review Comment:
   ```suggestion
   ```



##########
datafusion/physical-plan/src/coalesce_batches.rs:
##########
@@ -290,13 +294,83 @@ pub fn concat_batches(
     arrow::compute::concat_batches(schema, batches)
 }
 
+/// Heuristically compact [`StringViewArray`]s to reduce memory usage, if 
needed
+///
+/// This function decides when to consolidate the StringView into a new buffer
+/// to reduce memory usage and improve string locality for better performance.
+///
+/// This differs from [`StringViewArray::gc`] because:
+/// 1. It may not compact the array depending on a heuristic.
+/// 2. It uses a larger default block size (2MB) to reduce the number of 
buffers to track.
+///
+/// # Heuristic
+///
+/// If the average size of each view is larger than 32 bytes, we compact the 
array.
+///
+/// `StringViewArray` include pointers to buffer that hold the underlying data.
+/// One of the great benefits of `StringViewArray` is that many operations
+/// (e.g., `filter`) can be done without copying the underlying data.
+///
+/// However, after a while (e.g., after `FilterExec` or `HashJoinExec`) the
+/// `StringViewArray` may only refer to a small portion of the buffer,
+/// significantly increasing memory usage.
+fn gc_string_view_batch(batch: &RecordBatch) -> RecordBatch {
+    let new_columns: Vec<ArrayRef> = batch
+        .columns()
+        .iter()
+        .map(|c| {
+            // Try to re-create the `StringViewArray` to prevent holding the 
underlying buffer too long.
+            let Some(s) = c.as_string_view_opt() else {
+                return Arc::clone(c);
+            };
+            let ideal_buffer_size: usize = s
+                .views()
+                .iter()
+                .map(|v| {
+                    let len = (*v as u32) as usize;
+                    if len > 12 {
+                        len
+                    } else {
+                        0
+                    }
+                })
+                .sum();
+            let actual_buffer_size = s.get_buffer_memory_size();
+
+            // Re-creating the array copies data and can be time consuming.
+            // We only do it if the array is sparse
+            if actual_buffer_size > (ideal_buffer_size * 2) {
+                // We set the block size to `ideal_buffer_size` so that the 
new StringViewArray only has one buffer, which accelerate later concat_batches.
+                // See https://github.com/apache/arrow-rs/issues/6094 for more 
details.
+                let mut builder = StringViewBuilder::with_capacity(s.len())
+                    .with_block_size(ideal_buffer_size as u32);
+
+                for v in s.iter() {
+                    builder.append_option(v);
+                }
+
+                let gc_string = builder.finish();
+
+                debug_assert!(gc_string.data_buffers().len() <= 1); // buffer 
count can be 0 if the `ideal_buffer_size` is 0

Review Comment:
   testing for the win!



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org

Reply via email to