alamb commented on code in PR #6000:
URL: https://github.com/apache/arrow-rs/pull/6000#discussion_r1665470090
##########
parquet/src/file/footer.rs:
##########
@@ -256,4 +313,68 @@ mod tests {
parse_column_orders(t_column_orders, &schema_descr);
}
+
+    #[test]
+    fn test_metadata_encode_roundtrip() {
+        let a: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), Some(2), None]));
+        let b: ArrayRef = Arc::new(StringArray::from(vec!["a", "b", "c"]));
+
+        let batch = RecordBatch::try_from_iter(vec![("a", a), ("b", b)]).expect("valid conversion");
+
+        let mut buf = Vec::new();
+        let writer_options = WriterProperties::builder()
+            .set_statistics_enabled(crate::file::properties::EnabledStatistics::Page)
+            .build();
+        {
+            let mut writer =
+                ArrowWriter::try_new(&mut buf, batch.schema(), Some(writer_options)).unwrap();
+            writer.write(&batch).unwrap();
+            writer.finish().unwrap();
+        }
+        let data = Bytes::from(buf);
+        let options = ArrowReaderOptions::new()
+            // Load the page index when reading metadata to cache
+            // so it is available to interpret row selections
+            .with_page_index(true);
+        let reader = ParquetRecordBatchReaderBuilder::try_new_with_options(data, options).unwrap();
+        let metadata = reader.metadata().clone();
+
+        let encoded = encode_metadata(&metadata).unwrap();
+        let decoded = decode_metadata(&encoded).unwrap();
+        assert_eq!(
+            metadata.file_metadata().version(),
+            decoded.file_metadata().version()
+        );
+        assert_eq!(
+            metadata.file_metadata().num_rows(),
+            decoded.file_metadata().num_rows()
+        );
+        assert_eq!(
+            metadata.file_metadata().created_by(),
+            decoded.file_metadata().created_by()
+        );
+        assert_eq!(
+            metadata.file_metadata().key_value_metadata(),
+            decoded.file_metadata().key_value_metadata()
+        );
+        assert_eq!(
+            metadata.file_metadata().schema(),
+            decoded.file_metadata().schema()
+        );
+        assert_eq!(
+            metadata.file_metadata().column_orders(),
+            decoded.file_metadata().column_orders()
+        );
+        assert_eq!(metadata.row_groups().len(), decoded.row_groups().len());
+        for (a, b) in metadata
+            .row_groups()
+            .iter()
+            .zip(decoded.row_groups().iter())
+        {
+            assert_eq!(a, b);
+        }
+        // TODO: add encoding and decoding of column and offset indexes (aka page indexes)
Review Comment:
I agree that encoding/decoding of these structures doesn't have to be present in the initial PR. However, given that they are stored out of line / slightly differently than the other structures, I think it would be good to ensure we could encode them using this same API.
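For illustration only, here is a minimal sketch of what that could look like, reusing the same thrift machinery the footer already uses (`ColumnIndex` is the thrift-generated type in `parquet::format`; the helper name itself is hypothetical, not part of this PR):

```rust
use thrift::protocol::TCompactOutputProtocol;

use crate::errors::Result;
use crate::format::ColumnIndex;

/// Hypothetical helper: serialize a single thrift `ColumnIndex` with the
/// same compact protocol used by `encode_metadata`
fn encode_column_index(index: &ColumnIndex) -> Result<Vec<u8>> {
    let mut buf = Vec::new();
    {
        // TCompactOutputProtocol writes the thrift compact binary encoding
        // into any `std::io::Write`, including a `Vec<u8>`
        let mut protocol = TCompactOutputProtocol::new(&mut buf);
        index.write_to_out_protocol(&mut protocol)?;
    }
    Ok(buf)
}
```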
##########
parquet/src/file/footer.rs:
##########
@@ -112,6 +109,38 @@ pub fn decode_metadata(buf: &[u8]) -> Result<ParquetMetaData> {
Ok(ParquetMetaData::new(file_metadata, row_groups))
}
+/// Encodes the [`ParquetMetaData`] to bytes.
+/// The format of the returned bytes is the Thrift compact binary protocol, as
+/// specified by the [Parquet Spec].
+///
+/// [Parquet Spec]: https://github.com/apache/parquet-format#metadata
+pub fn encode_metadata(metadata: &ParquetMetaData) -> Result<Vec<u8>> {
Review Comment:
Is it possible to switch the existing writers to use this API as well? Not only would that avoid code duplication, it would also ensure the API is general enough.

For example, I wonder if it would make sense for this function signature to be more like

```rust
/// Write the metadata to the target `std::io::Write`, returning the number of bytes written
pub fn encode_metadata<W: Write>(metadata: &ParquetMetaData, writer: &mut W) -> Result<usize> {
    ...
}
```

That would allow writing into a `Vec` but also allow writing into various other targets, and perhaps avoid buffering.
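To make the idea concrete, a hypothetical usage of that signature (the file path is illustrative):

```rust
// Writing into an in-memory buffer, as the current API effectively does
let mut buf = Vec::new();
let written = encode_metadata(&metadata, &mut buf)?;
assert_eq!(written, buf.len());

// Streaming directly into a file, avoiding the intermediate Vec
let mut file = std::fs::File::create("metadata.bin")?;
encode_metadata(&metadata, &mut file)?;
```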
##########
parquet/src/file/footer.rs:
##########
@@ -256,4 +313,68 @@ mod tests {
parse_column_orders(t_column_orders, &schema_descr);
}
+
+    #[test]
+    fn test_metadata_encode_roundtrip() {
+        let a: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), Some(2), None]));
+        let b: ArrayRef = Arc::new(StringArray::from(vec!["a", "b", "c"]));
+
+        let batch = RecordBatch::try_from_iter(vec![("a", a), ("b", b)]).expect("valid conversion");
+
+        let mut buf = Vec::new();
+        let writer_options = WriterProperties::builder()
+            .set_statistics_enabled(crate::file::properties::EnabledStatistics::Page)
+            .build();
+        {
+            let mut writer =
+                ArrowWriter::try_new(&mut buf, batch.schema(), Some(writer_options)).unwrap();
+            writer.write(&batch).unwrap();
+            writer.finish().unwrap();
+        }
+        let data = Bytes::from(buf);
+        let options = ArrowReaderOptions::new()
+            // Load the page index when reading metadata to cache
+            // so it is available to interpret row selections
+            .with_page_index(true);
+        let reader = ParquetRecordBatchReaderBuilder::try_new_with_options(data, options).unwrap();
+        let metadata = reader.metadata().clone();
+
+        let encoded = encode_metadata(&metadata).unwrap();
+        let decoded = decode_metadata(&encoded).unwrap();
+        assert_eq!(
Review Comment:
Can you simply assert that `metadata` == `decoded`?
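For illustration, assuming `ParquetMetaData` implements `PartialEq` (an assumption, not verified against this PR), the field-by-field checks could collapse to a single comparison. Note that until the page indexes round-trip as well, the decoded value will be missing its column/offset indexes, so the comparison may need to account for that:

```rust
// A minimal sketch, assuming ParquetMetaData: PartialEq
let encoded = encode_metadata(&metadata).unwrap();
let decoded = decode_metadata(&encoded).unwrap();
assert_eq!(metadata, decoded);
```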