[arrow-rs] branch master updated: Convert parquet metadata back to builders (#4265)

tustvold Tue, 23 May 2023 10:15:14 -0700

This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git



The following commit(s) were added to refs/heads/master by this push:
     new 3adca539a Convert parquet metadata back to builders (#4265)
3adca539a is described below

commit 3adca539ad9e1b27892a5ef38ac2780aff4c0bff
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Tue May 23 18:15:00 2023 +0100

    Convert parquet metadata back to builders (#4265)
---
 parquet/src/file/metadata.rs | 139 ++++++++++++++-----------------------------
 1 file changed, 46 insertions(+), 93 deletions(-)

diff --git a/parquet/src/file/metadata.rs b/parquet/src/file/metadata.rs
index c2961aa76..40f6cf312 100644
--- a/parquet/src/file/metadata.rs
+++ b/parquet/src/file/metadata.rs
@@ -365,78 +365,69 @@ impl RowGroupMetaData {
             ordinal: None,
         }
     }
+
+    /// Converts this [`RowGroupMetaData`] into a [`RowGroupMetaDataBuilder`]
+    pub fn into_builder(self) -> RowGroupMetaDataBuilder {
+        RowGroupMetaDataBuilder(self)
+    }
 }
 
 /// Builder for row group metadata.
-pub struct RowGroupMetaDataBuilder {
-    columns: Vec<ColumnChunkMetaData>,
-    schema_descr: SchemaDescPtr,
-    num_rows: i64,
-    sorting_columns: Option<Vec<SortingColumn>>,
-    total_byte_size: i64,
-}
+pub struct RowGroupMetaDataBuilder(RowGroupMetaData);
 
 impl RowGroupMetaDataBuilder {
     /// Creates new builder from schema descriptor.
     fn new(schema_descr: SchemaDescPtr) -> Self {
-        Self {
+        Self(RowGroupMetaData {
             columns: Vec::with_capacity(schema_descr.num_columns()),
             schema_descr,
             num_rows: 0,
             sorting_columns: None,
             total_byte_size: 0,
-        }
+        })
     }
 
     /// Sets number of rows in this row group.
     pub fn set_num_rows(mut self, value: i64) -> Self {
-        self.num_rows = value;
+        self.0.num_rows = value;
         self
     }
 
     /// Sets the sorting order for columns
     pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
-        self.sorting_columns = value;
+        self.0.sorting_columns = value;
         self
     }
 
     /// Sets total size in bytes for this row group.
     pub fn set_total_byte_size(mut self, value: i64) -> Self {
-        self.total_byte_size = value;
+        self.0.total_byte_size = value;
         self
     }
 
     /// Sets column metadata for this row group.
     pub fn set_column_metadata(mut self, value: Vec<ColumnChunkMetaData>) -> Self {
-        self.columns = value;
+        self.0.columns = value;
         self
     }
 
     /// Builds row group metadata.
     pub fn build(self) -> Result<RowGroupMetaData> {
-        if self.schema_descr.num_columns() != self.columns.len() {
+        if self.0.schema_descr.num_columns() != self.0.columns.len() {
             return Err(general_err!(
                 "Column length mismatch: {} != {}",
-                self.schema_descr.num_columns(),
-                self.columns.len()
+                self.0.schema_descr.num_columns(),
+                self.0.columns.len()
             ));
         }
 
-        Ok(RowGroupMetaData {
-            columns: self.columns,
-            num_rows: self.num_rows,
-            sorting_columns: self.sorting_columns,
-            total_byte_size: self.total_byte_size,
-            schema_descr: self.schema_descr,
-        })
+        Ok(self.0)
     }
 }
 
 /// Metadata for a column chunk.
 #[derive(Debug, Clone, PartialEq)]
 pub struct ColumnChunkMetaData {
-    column_type: Type,
-    column_path: ColumnPath,
     column_descr: ColumnDescPtr,
     encodings: Vec<Encoding>,
     file_path: Option<String>,
@@ -479,12 +470,12 @@ impl ColumnChunkMetaData {
 
     /// Type of this column. Must be primitive.
     pub fn column_type(&self) -> Type {
-        self.column_type
+        self.column_descr.physical_type()
     }
 
     /// Path (or identifier) of this column.
     pub fn column_path(&self) -> &ColumnPath {
-        &self.column_path
+        self.column_descr.path()
     }
 
     /// Descriptor for this column.
@@ -609,7 +600,6 @@ impl ColumnChunkMetaData {
         }
         let mut col_metadata: ColumnMetaData = cc.meta_data.unwrap();
         let column_type = Type::try_from(col_metadata.type_)?;
-        let column_path = ColumnPath::new(col_metadata.path_in_schema);
         let encodings = col_metadata
             .encodings
             .drain(0..)
@@ -641,8 +631,6 @@ impl ColumnChunkMetaData {
         let column_index_length = cc.column_index_length;
 
         let result = ColumnChunkMetaData {
-            column_type,
-            column_path,
             column_descr,
             encodings,
             file_path,
@@ -685,9 +673,9 @@ impl ColumnChunkMetaData {
     /// Method to convert to Thrift `ColumnMetaData`
     pub fn to_column_metadata_thrift(&self) -> ColumnMetaData {
         ColumnMetaData {
-            type_: self.column_type.into(),
+            type_: self.column_type().into(),
             encodings: self.encodings().iter().map(|&v| v.into()).collect(),
-            path_in_schema: Vec::from(self.column_path.as_ref()),
+            path_in_schema: self.column_path().as_ref().to_vec(),
             codec: self.compression.into(),
             num_values: self.num_values,
             total_uncompressed_size: self.total_uncompressed_size,
@@ -704,34 +692,20 @@ impl ColumnChunkMetaData {
             bloom_filter_offset: self.bloom_filter_offset,
         }
     }
+
+    /// Converts this [`ColumnChunkMetaData`] into a [`ColumnChunkMetaDataBuilder`]
+    pub fn into_builder(self) -> ColumnChunkMetaDataBuilder {
+        ColumnChunkMetaDataBuilder(self)
+    }
 }
 
 /// Builder for column chunk metadata.
-pub struct ColumnChunkMetaDataBuilder {
-    column_descr: ColumnDescPtr,
-    encodings: Vec<Encoding>,
-    file_path: Option<String>,
-    file_offset: i64,
-    num_values: i64,
-    compression: Compression,
-    total_compressed_size: i64,
-    total_uncompressed_size: i64,
-    data_page_offset: i64,
-    index_page_offset: Option<i64>,
-    dictionary_page_offset: Option<i64>,
-    statistics: Option<Statistics>,
-    encoding_stats: Option<Vec<PageEncodingStats>>,
-    bloom_filter_offset: Option<i64>,
-    offset_index_offset: Option<i64>,
-    offset_index_length: Option<i32>,
-    column_index_offset: Option<i64>,
-    column_index_length: Option<i32>,
-}
+pub struct ColumnChunkMetaDataBuilder(ColumnChunkMetaData);
 
 impl ColumnChunkMetaDataBuilder {
     /// Creates new column chunk metadata builder.
     fn new(column_descr: ColumnDescPtr) -> Self {
-        Self {
+        Self(ColumnChunkMetaData {
             column_descr,
             encodings: Vec::new(),
             file_path: None,
@@ -750,135 +724,114 @@ impl ColumnChunkMetaDataBuilder {
             offset_index_length: None,
             column_index_offset: None,
             column_index_length: None,
-        }
+        })
     }
 
     /// Sets list of encodings for this column chunk.
     pub fn set_encodings(mut self, encodings: Vec<Encoding>) -> Self {
-        self.encodings = encodings;
+        self.0.encodings = encodings;
         self
     }
 
     /// Sets optional file path for this column chunk.
     pub fn set_file_path(mut self, value: String) -> Self {
-        self.file_path = Some(value);
+        self.0.file_path = Some(value);
         self
     }
 
     /// Sets file offset in bytes.
     pub fn set_file_offset(mut self, value: i64) -> Self {
-        self.file_offset = value;
+        self.0.file_offset = value;
         self
     }
 
     /// Sets number of values.
     pub fn set_num_values(mut self, value: i64) -> Self {
-        self.num_values = value;
+        self.0.num_values = value;
         self
     }
 
     /// Sets compression.
     pub fn set_compression(mut self, value: Compression) -> Self {
-        self.compression = value;
+        self.0.compression = value;
         self
     }
 
     /// Sets total compressed size in bytes.
     pub fn set_total_compressed_size(mut self, value: i64) -> Self {
-        self.total_compressed_size = value;
+        self.0.total_compressed_size = value;
         self
     }
 
     /// Sets total uncompressed size in bytes.
     pub fn set_total_uncompressed_size(mut self, value: i64) -> Self {
-        self.total_uncompressed_size = value;
+        self.0.total_uncompressed_size = value;
         self
     }
 
     /// Sets data page offset in bytes.
     pub fn set_data_page_offset(mut self, value: i64) -> Self {
-        self.data_page_offset = value;
+        self.0.data_page_offset = value;
         self
     }
 
     /// Sets optional dictionary page ofset in bytes.
     pub fn set_dictionary_page_offset(mut self, value: Option<i64>) -> Self {
-        self.dictionary_page_offset = value;
+        self.0.dictionary_page_offset = value;
         self
     }
 
     /// Sets optional index page offset in bytes.
     pub fn set_index_page_offset(mut self, value: Option<i64>) -> Self {
-        self.index_page_offset = value;
+        self.0.index_page_offset = value;
         self
     }
 
     /// Sets statistics for this column chunk.
     pub fn set_statistics(mut self, value: Statistics) -> Self {
-        self.statistics = Some(value);
+        self.0.statistics = Some(value);
         self
     }
 
     /// Sets page encoding stats for this column chunk.
     pub fn set_page_encoding_stats(mut self, value: Vec<PageEncodingStats>) -> Self {
-        self.encoding_stats = Some(value);
+        self.0.encoding_stats = Some(value);
         self
     }
 
     /// Sets optional bloom filter offset in bytes.
     pub fn set_bloom_filter_offset(mut self, value: Option<i64>) -> Self {
-        self.bloom_filter_offset = value;
+        self.0.bloom_filter_offset = value;
         self
     }
 
     /// Sets optional offset index offset in bytes.
     pub fn set_offset_index_offset(mut self, value: Option<i64>) -> Self {
-        self.offset_index_offset = value;
+        self.0.offset_index_offset = value;
         self
     }
 
     /// Sets optional offset index length in bytes.
     pub fn set_offset_index_length(mut self, value: Option<i32>) -> Self {
-        self.offset_index_length = value;
+        self.0.offset_index_length = value;
         self
     }
 
     /// Sets optional column index offset in bytes.
     pub fn set_column_index_offset(mut self, value: Option<i64>) -> Self {
-        self.column_index_offset = value;
+        self.0.column_index_offset = value;
         self
     }
 
     /// Sets optional column index length in bytes.
     pub fn set_column_index_length(mut self, value: Option<i32>) -> Self {
-        self.column_index_length = value;
+        self.0.column_index_length = value;
         self
     }
 
     /// Builds column chunk metadata.
     pub fn build(self) -> Result<ColumnChunkMetaData> {
-        Ok(ColumnChunkMetaData {
-            column_type: self.column_descr.physical_type(),
-            column_path: self.column_descr.path().clone(),
-            column_descr: self.column_descr,
-            encodings: self.encodings,
-            file_path: self.file_path,
-            file_offset: self.file_offset,
-            num_values: self.num_values,
-            compression: self.compression,
-            total_compressed_size: self.total_compressed_size,
-            total_uncompressed_size: self.total_uncompressed_size,
-            data_page_offset: self.data_page_offset,
-            index_page_offset: self.index_page_offset,
-            dictionary_page_offset: self.dictionary_page_offset,
-            statistics: self.statistics,
-            encoding_stats: self.encoding_stats,
-            bloom_filter_offset: self.bloom_filter_offset,
-            offset_index_offset: self.offset_index_offset,
-            offset_index_length: self.offset_index_length,
-            column_index_offset: self.column_index_offset,
-            column_index_length: self.column_index_length,
-        })
+        Ok(self.0)
     }
 }

[arrow-rs] branch master updated: Convert parquet metadata back to builders (#4265)

Reply via email to