alamb commented on code in PR #6000:
URL: https://github.com/apache/arrow-rs/pull/6000#discussion_r1665470090


##########
parquet/src/file/footer.rs:
##########
@@ -256,4 +313,68 @@ mod tests {
 
         parse_column_orders(t_column_orders, &schema_descr);
     }
+
+    #[test]
+    fn test_metadata_encode_roundtrip() {
+        let a: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), Some(2), 
None]));
+        let b: ArrayRef = Arc::new(StringArray::from(vec!["a", "b", "c"]));
+
+        let batch = RecordBatch::try_from_iter(vec![("a", a), ("b", 
b)]).expect("valid conversion");
+
+        let mut buf = Vec::new();
+        let writer_options = WriterProperties::builder()
+            
.set_statistics_enabled(crate::file::properties::EnabledStatistics::Page)
+            .build();
+        {
+            let mut writer =
+                ArrowWriter::try_new(&mut buf, batch.schema(), 
Some(writer_options)).unwrap();
+            writer.write(&batch).unwrap();
+            writer.finish().unwrap();
+        }
+        let data = Bytes::from(buf);
+        let options = ArrowReaderOptions::new()
+            // Load the page index when reading metadata to cache
+            // so it is available to interpret row selections
+            .with_page_index(true);
+        let reader = 
ParquetRecordBatchReaderBuilder::try_new_with_options(data, options).unwrap();
+        let metadata = reader.metadata().clone();
+
+        let encoded = encode_metadata(&metadata).unwrap();
+        let decoded = decode_metadata(&encoded).unwrap();
+        assert_eq!(
+            metadata.file_metadata().version(),
+            decoded.file_metadata().version()
+        );
+        assert_eq!(
+            metadata.file_metadata().num_rows(),
+            decoded.file_metadata().num_rows()
+        );
+        assert_eq!(
+            metadata.file_metadata().created_by(),
+            decoded.file_metadata().created_by()
+        );
+        assert_eq!(
+            metadata.file_metadata().key_value_metadata(),
+            decoded.file_metadata().key_value_metadata()
+        );
+        assert_eq!(
+            metadata.file_metadata().schema(),
+            decoded.file_metadata().schema()
+        );
+        assert_eq!(
+            metadata.file_metadata().column_orders(),
+            decoded.file_metadata().column_orders()
+        );
+        assert_eq!(metadata.row_groups().len(), decoded.row_groups().len());
+        for (a, b) in metadata
+            .row_groups()
+            .iter()
+            .zip(decoded.row_groups().iter())
+        {
+            assert_eq!(a, b);
+        }
+        // TODO: add encoding and decoding of column and offset indexes (aka 
page indexes)

Review Comment:
   I agree that encoding/decoding of these structures doesn't have to be 
present in the initial PR. However, given that they are stored out of line / 
slightly differently than the other structures, I think it would be good to 
ensure we could encode them using this same API.



##########
parquet/src/file/footer.rs:
##########
@@ -112,6 +109,38 @@ pub fn decode_metadata(buf: &[u8]) -> 
Result<ParquetMetaData> {
     Ok(ParquetMetaData::new(file_metadata, row_groups))
 }
 
+/// Encodes the [`ParquetMetaData`] to bytes.
+/// The format of the returned bytes is the Thrift compact binary protocol, as
+/// specified by the [Parquet Spec].
+///
+/// [Parquet Spec]: https://github.com/apache/parquet-format#metadata
+pub fn encode_metadata(metadata: &ParquetMetaData) -> Result<Vec<u8>> {

Review Comment:
   Is it possible to switch the existing writers to use this API as well? Not 
only would that avoid code duplication, it would ensure the API is general 
enough
   
   For example, I wonder if it would make sense for this function signature to 
be more like
   
   ```rust
   /// Writes the metadata to the target `std::io::Write`, returning the number 
of bytes written
   pub fn encode_metadata<W: Write>(metadata: &ParquetMetaData, writer: &mut W) -> 
Result<usize> {
   ...
   }
   ```
   
   That would allow writing into a `Vec` but also allow writing into various 
other targets and perhaps avoid buffering



##########
parquet/src/file/footer.rs:
##########
@@ -256,4 +313,68 @@ mod tests {
 
         parse_column_orders(t_column_orders, &schema_descr);
     }
+
+    #[test]
+    fn test_metadata_encode_roundtrip() {
+        let a: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), Some(2), 
None]));
+        let b: ArrayRef = Arc::new(StringArray::from(vec!["a", "b", "c"]));
+
+        let batch = RecordBatch::try_from_iter(vec![("a", a), ("b", 
b)]).expect("valid conversion");
+
+        let mut buf = Vec::new();
+        let writer_options = WriterProperties::builder()
+            
.set_statistics_enabled(crate::file::properties::EnabledStatistics::Page)
+            .build();
+        {
+            let mut writer =
+                ArrowWriter::try_new(&mut buf, batch.schema(), 
Some(writer_options)).unwrap();
+            writer.write(&batch).unwrap();
+            writer.finish().unwrap();
+        }
+        let data = Bytes::from(buf);
+        let options = ArrowReaderOptions::new()
+            // Load the page index when reading metadata to cache
+            // so it is available to interpret row selections
+            .with_page_index(true);
+        let reader = 
ParquetRecordBatchReaderBuilder::try_new_with_options(data, options).unwrap();
+        let metadata = reader.metadata().clone();
+
+        let encoded = encode_metadata(&metadata).unwrap();
+        let decoded = decode_metadata(&encoded).unwrap();
+        assert_eq!(

Review Comment:
   Can you simply assert that `metadata` == `decoded` (i.e. that the round-tripped 
metadata equals the original), rather than comparing each field individually?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to