alamb commented on code in PR #6466:
URL: https://github.com/apache/arrow-rs/pull/6466#discussion_r1779462861


##########
parquet/src/file/metadata/mod.rs:
##########
@@ -290,6 +298,117 @@ impl ParquetMetaData {
     }
 }
 
+/// A builder for creating / manipulating [`ParquetMetaData`]
+///
+/// # Example creating a new [`ParquetMetaData`]
+///
+///```no_run
+/// # use parquet::file::metadata::{FileMetaData, ParquetMetaData, 
ParquetMetaDataBuilder, RowGroupMetaData, RowGroupMetaDataBuilder};
+/// # fn get_file_metadata() -> FileMetaData { unimplemented!(); }
+/// // Create a new builder given the file metadata
+/// let file_metadata = get_file_metadata();
+/// // Create a row group
+/// let row_group = RowGroupMetaData::builder(file_metadata.schema_descr_ptr())
+///    .set_num_rows(100)
+///    // ... (A real row group needs more than just the number of rows)
+///    .build()
+///    .unwrap();
+/// // Create the final metadata
+/// let metadata: ParquetMetaData = ParquetMetaDataBuilder::new(file_metadata)
+///   .add_row_group(row_group)
+///   .build();
+/// ```
+///
+/// # Example modifying an existing [`ParquetMetaData`]
+/// ```no_run
+/// # use parquet::file::metadata::ParquetMetaData;
+/// # fn load_metadata() -> ParquetMetaData { unimplemented!(); }
+/// // Modify the metadata so only the last RowGroup remains
+/// let metadata: ParquetMetaData = load_metadata();
+/// let mut builder = metadata.into_builder();
+///
+/// // Take existing row groups to modify
+/// let mut row_groups = builder.take_row_groups();
+/// let last_row_group = row_groups.pop().unwrap();
+///
+/// let metadata = builder
+///   .add_row_group(last_row_group)
+///   .build();
+/// ```
+pub struct ParquetMetaDataBuilder(ParquetMetaData);
+
+impl ParquetMetaDataBuilder {
+    /// Create a new builder from a file metadata, with no row groups
+    pub fn new(file_meta_data: FileMetaData) -> Self {
+        Self(ParquetMetaData::new(file_meta_data, vec![]))
+    }
+
+    /// Create a new builder from an exising ParquetMetaData
+    pub fn new_from_metadata(metadata: ParquetMetaData) -> Self {
+        Self(metadata)
+    }
+
+    /// Adds a row group to the metadata
+    pub fn add_row_group(mut self, row_group: RowGroupMetaData) -> Self {
+        self.0.row_groups.push(row_group);
+        self
+    }
+
+    /// Sets all the row groups to the specified list
+    pub fn set_row_groups(mut self, row_groups: Vec<RowGroupMetaData>) -> Self 
{
+        self.0.row_groups = row_groups;
+        self
+    }
+
+    /// Takes ownership of the row groups in this builder, and clears the list
+    /// of row groups.
+    ///
+    /// This can be used for more efficient creation of a new ParquetMetaData
+    /// from an existing one.
+    pub fn take_row_groups(&mut self) -> Vec<RowGroupMetaData> {
+        std::mem::take(&mut self.0.row_groups)
+    }
+
+    /// Return a reference to the current row groups
+    pub fn row_groups(&self) -> &[RowGroupMetaData] {

Review Comment:
   Yes, this is a good idea. Added in 5e324f6914



##########
parquet/src/file/serialized_reader.rs:
##########
@@ -204,41 +206,30 @@ impl<R: 'static + ChunkReader> SerializedFileReader<R> {
                 }
             }
             if keep {
-                filtered_row_groups.push(rg_meta);
+                metadata_builder = metadata_builder.add_row_group(rg_meta);
             }
         }
 
         if options.enable_page_index {
             let mut columns_indexes = vec![];
             let mut offset_indexes = vec![];
 
-            for rg in &mut filtered_row_groups {
+            for rg in metadata_builder.row_groups().iter() {

Review Comment:
   I don't quite follow why this is needed. What scenario does it help (I can 
write a test to cover it)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to