etseidl commented on code in PR #6466:
URL: https://github.com/apache/arrow-rs/pull/6466#discussion_r1778957747
##########
parquet/src/file/metadata/mod.rs:
##########
@@ -290,6 +298,117 @@ impl ParquetMetaData {
}
}
+/// A builder for creating / manipulating [`ParquetMetaData`]
+///
+/// # Example creating a new [`ParquetMetaData`]
+///
+///```no_run
+/// # use parquet::file::metadata::{FileMetaData, ParquetMetaData,
ParquetMetaDataBuilder, RowGroupMetaData, RowGroupMetaDataBuilder};
+/// # fn get_file_metadata() -> FileMetaData { unimplemented!(); }
+/// // Create a new builder given the file metadata
+/// let file_metadata = get_file_metadata();
+/// // Create a row group
+/// let row_group = RowGroupMetaData::builder(file_metadata.schema_descr_ptr())
+/// .set_num_rows(100)
+/// // ... (A real row group needs more than just the number of rows)
+/// .build()
+/// .unwrap();
+/// // Create the final metadata
+/// let metadata: ParquetMetaData = ParquetMetaDataBuilder::new(file_metadata)
+/// .add_row_group(row_group)
+/// .build();
+/// ```
+///
+/// # Example modifying an existing [`ParquetMetaData`]
+/// ```no_run
+/// # use parquet::file::metadata::ParquetMetaData;
+/// # fn load_metadata() -> ParquetMetaData { unimplemented!(); }
+/// // Modify the metadata so only the last RowGroup remains
+/// let metadata: ParquetMetaData = load_metadata();
+/// let mut builder = metadata.into_builder();
+///
+/// // Take existing row groups to modify
+/// let mut row_groups = builder.take_row_groups();
+/// let last_row_group = row_groups.pop().unwrap();
+///
+/// let metadata = builder
+/// .add_row_group(last_row_group)
+/// .build();
+/// ```
+pub struct ParquetMetaDataBuilder(ParquetMetaData);
+
+impl ParquetMetaDataBuilder {
+ /// Create a new builder from a file metadata, with no row groups
+ pub fn new(file_meta_data: FileMetaData) -> Self {
+ Self(ParquetMetaData::new(file_meta_data, vec![]))
+ }
+
+ /// Create a new builder from an exising ParquetMetaData
+ pub fn new_from_metadata(metadata: ParquetMetaData) -> Self {
+ Self(metadata)
+ }
+
+ /// Adds a row group to the metadata
+ pub fn add_row_group(mut self, row_group: RowGroupMetaData) -> Self {
+ self.0.row_groups.push(row_group);
+ self
+ }
+
+ /// Sets all the row groups to the specified list
+ pub fn set_row_groups(mut self, row_groups: Vec<RowGroupMetaData>) -> Self
{
+ self.0.row_groups = row_groups;
+ self
+ }
+
+ /// Takes ownership of the row groups in this builder, and clears the list
+ /// of row groups.
+ ///
+ /// This can be used for more efficient creation of a new ParquetMetaData
+ /// from an existing one.
+ pub fn take_row_groups(&mut self) -> Vec<RowGroupMetaData> {
+ std::mem::take(&mut self.0.row_groups)
+ }
+
+ /// Return a reference to the current row groups
+ pub fn row_groups(&self) -> &[RowGroupMetaData] {
Review Comment:
Do we want an analogue of this for the page index structures?
##########
parquet/src/file/serialized_reader.rs:
##########
@@ -204,41 +206,30 @@ impl<R: 'static + ChunkReader> SerializedFileReader<R> {
}
}
if keep {
- filtered_row_groups.push(rg_meta);
+ metadata_builder = metadata_builder.add_row_group(rg_meta);
}
}
if options.enable_page_index {
let mut columns_indexes = vec![];
let mut offset_indexes = vec![];
- for rg in &mut filtered_row_groups {
+ for rg in metadata_builder.row_groups().iter() {
Review Comment:
I think we can build the metadata here (with the filtered row groups), pass
it into `ParquetMetaDataReader` and then load the page indexes into the
metadata. Let me give that a try.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]