alamb commented on code in PR #10020:
URL: https://github.com/apache/arrow-rs/pull/10020#discussion_r3357786218
##########
parquet/tests/arrow_writer.rs:
##########
@@ -48,3 +57,329 @@ fn test_delta_bit_pack_type() {
let mut writer = ArrowWriter::try_new(&mut buffer, record_batch.schema(),
Some(props)).unwrap();
let _ = writer.write(&record_batch);
}
+
+// ---------------------------------------------------------------------------
+// Heap-memory regression test for the writer's page buffering.
+//
+// This proves the headline invariant of the pluggable [`PageStore`]: while a
+// row group is being written, the heap used to buffer completed pages grows
+// with the row group size for the default in-memory store, but stays bounded
+// (≈ a few pages per leaf column) once a spilling backend is plugged in.
+//
+// Peak heap is measured with a thread-local tracking allocator (the same
+// pattern used by `parquet/benches/arrow_reader_peak_memory.rs`), so the test
+// needs no external profiling dependency. Tracking is thread-local, so the
+// measured peak reflects only allocations made on the measuring thread; the
+// default `ArrowWriter` is single-threaded, so the writer's buffering all
lands
+// there. Each measurement resets the peak to the current live baseline and
+// reports the delta, so the threads of unrelated tests in this binary do not
+// perturb it.
+//
+// [`PageStore`]: parquet::arrow::arrow_writer::PageStore
+// ---------------------------------------------------------------------------
+
+thread_local! {
+ static LIVE_BYTES: Cell<usize> = const { Cell::new(0) };
+ static PEAK_BYTES: Cell<usize> = const { Cell::new(0) };
+}
+
+struct TrackingAllocator {
+ inner: System,
+}
+
+#[global_allocator]
+static GLOBAL: TrackingAllocator = TrackingAllocator { inner: System };
Review Comment:
FWIW this affects all tests in this test binary, but that seems fine to me
##########
parquet/src/column/writer/mod.rs:
##########
@@ -1271,7 +1282,14 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a,
E> {
};
// Check if we need to buffer data page or flush it to the sink
directly.
- if self.encoder.has_dictionary() {
+ //
+ // For dictionary-encoded columns the dictionary page must be written
+ // first, but it is not final until all values are seen, so completed
+ // data pages are normally buffered here until `close`. A page writer
+ // that defers final layout (the Arrow path) instead orders pages
itself
+ // at flush, so we stream the data pages straight through and never let
+ // them accumulate in memory.
+ if self.encoder.has_dictionary() &&
!self.page_writer.defers_dictionary_ordering() {
self.data_pages.push_back(compressed_page);
Review Comment:
- tracking in https://github.com/apache/arrow-rs/issues/10062
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]