This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 73ceb1d7f2 Expose ColumnCloseResult on ArrowColumnChunk (#9773)
73ceb1d7f2 is described below
commit 73ceb1d7f2af40e5dd33fdf114f86a35393b74f5
Author: Leonardo Yvens <[email protected]>
AuthorDate: Wed Apr 22 11:33:02 2026 -0300
Expose ColumnCloseResult on ArrowColumnChunk (#9773)
Adds `close()` and `close_mut()` accessors on `ArrowColumnChunk` so
callers can inspect and mutate the `ColumnCloseResult` produced by
`ArrowColumnWriter::close()` before appending the chunk to a row group.
My motivation is to decide dynamically whether to omit the page index.
Should not be controversial given `ColumnCloseResult` is already public.
# Which issue does this PR close?
- Closes https://github.com/apache/arrow-rs/issues/9774.
# What changes are included in this PR?
Adds the `close()` and `close_mut()` accessors to `ArrowColumnChunk`.
# Are these changes tested?
A unit test is included, which also illustrates a potential use case.
# Are there any user-facing changes?
Yes, the new accessors are public.
---
parquet/src/arrow/arrow_writer/mod.rs | 70 +++++++++++++++++++++++++++++++++++
1 file changed, 70 insertions(+)
diff --git a/parquet/src/arrow/arrow_writer/mod.rs
b/parquet/src/arrow/arrow_writer/mod.rs
index 8422263b1f..641b81257a 100644
--- a/parquet/src/arrow/arrow_writer/mod.rs
+++ b/parquet/src/arrow/arrow_writer/mod.rs
@@ -765,6 +765,26 @@ impl std::fmt::Debug for ArrowColumnChunk {
}
impl ArrowColumnChunk {
+ /// Returns a shared reference to the [`ColumnCloseResult`] produced when the chunk was closed.
+ ///
+ /// Exposes encoding information, collected statistics, and the optional
+ ///
[`ColumnIndexMetaData`](crate::file::page_index::column_index::ColumnIndexMetaData)
+ /// /
[`OffsetIndexMetaData`](crate::file::page_index::offset_index::OffsetIndexMetaData)
+ /// gathered for the column chunk. See [`Self::close_mut`] to modify it.
+ pub fn close(&self) -> &ColumnCloseResult {
+ &self.close
+ }
+
+ /// Returns a mutable reference to the [`ColumnCloseResult`] held by this chunk.
+ ///
+ /// This allows callers to adjust the close result before the chunk is
+ /// appended to a row group — for example, clearing `column_index` or
+ /// `bloom_filter` based on a dynamic rule that inspects the encodings and
+ /// collected page statistics. Use [`Self::close`] for read-only access.
+ pub fn close_mut(&mut self) -> &mut ColumnCloseResult {
+ &mut self.close
+ }
+
/// Calls [`SerializedRowGroupWriter::append_column`] with this column's
data
pub fn append_to_row_group<W: Write + Send>(
self,
@@ -5066,4 +5086,54 @@ mod tests {
let total_rows: i64 = sizes.iter().sum();
assert_eq!(total_rows, 100, "Total rows should be preserved");
}
+
+ #[test]
+ fn arrow_column_chunk_close_mut_drops_column_index() {
+ use crate::arrow::ArrowSchemaConverter;
+ use crate::file::writer::SerializedFileWriter;
+
+ let schema = Arc::new(Schema::new(vec![Field::new("i",
DataType::Int32, false)]));
+ let props = Arc::new(
+ WriterProperties::builder()
+ .set_statistics_enabled(EnabledStatistics::Page)
+ .build(),
+ );
+ let parquet_schema = ArrowSchemaConverter::new()
+ .with_coerce_types(props.coerce_types())
+ .convert(&schema)
+ .unwrap();
+
+ let mut buf = Vec::with_capacity(1024);
+ let mut writer =
+ SerializedFileWriter::new(&mut buf,
parquet_schema.root_schema_ptr(), props.clone())
+ .unwrap();
+
+ let factory = ArrowRowGroupWriterFactory::new(&writer,
Arc::clone(&schema));
+ let mut col_writers = factory.create_column_writers(0).unwrap();
+ let arr: ArrayRef = Arc::new(Int32Array::from_iter_values(0..64));
+ for leaves in compute_leaves(schema.field(0), &arr).unwrap() {
+ col_writers[0].write(&leaves).unwrap();
+ }
+ let mut chunk = col_writers.pop().unwrap().close().unwrap();
+
+ // Read-only accessor: with page-level statistics enabled, a column index is expected.
+ assert!(
+ chunk.close().column_index.is_some(),
+ "EnabledStatistics::Page should produce a column_index"
+ );
+
+ // Mutable accessor: clear the column index in place and confirm it is gone before
append.
+ chunk.close_mut().column_index = None;
+ assert!(chunk.close().column_index.is_none());
+
+ let mut rg = writer.next_row_group().unwrap();
+ chunk.append_to_row_group(&mut rg).unwrap();
+ rg.close().unwrap();
+ let file_meta = writer.close().unwrap();
+
+ // With column_index cleared, the written file metadata reports no column
+ // index range for the chunk (row group 0, column 0).
+ let cc = file_meta.row_group(0).column(0);
+ assert!(cc.column_index_range().is_none());
+ }
}