This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 7fa78b79b5 Support multiple GZip members in parquet page (#4951)
7fa78b79b5 is described below
commit 7fa78b79b5e3ba028b32b20096dbe4a6f17c82bd
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Wed Nov 15 13:55:21 2023 +0000
Support multiple GZip members in parquet page (#4951)
---
parquet-testing | 2 +-
parquet/src/compression.rs | 2 +-
parquet/src/file/serialized_reader.rs | 25 +++++++++++++++++++++++++
3 files changed, 27 insertions(+), 2 deletions(-)
diff --git a/parquet-testing b/parquet-testing
index 506afff9b6..89b685a64c 160000
--- a/parquet-testing
+++ b/parquet-testing
@@ -1 +1 @@
-Subproject commit 506afff9b6957ffe10d08470d467867d43e1bb91
+Subproject commit 89b685a64c3117b3023d8684af1f41400841db71
diff --git a/parquet/src/compression.rs b/parquet/src/compression.rs
index 9e0eee0e3e..a9a1afbbf2 100644
--- a/parquet/src/compression.rs
+++ b/parquet/src/compression.rs
@@ -255,7 +255,7 @@ mod gzip_codec {
output_buf: &mut Vec<u8>,
_uncompress_size: Option<usize>,
) -> Result<usize> {
- let mut decoder = read::GzDecoder::new(input_buf);
+ let mut decoder = read::MultiGzDecoder::new(input_buf);
decoder.read_to_end(output_buf).map_err(|e| e.into())
}
diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs
index 43e169cd08..fbb172d3b3 100644
--- a/parquet/src/file/serialized_reader.rs
+++ b/parquet/src/file/serialized_reader.rs
@@ -775,6 +775,7 @@ mod tests {
use crate::format::BoundaryOrder;
use crate::basic::{self, ColumnOrder};
+ use crate::column::reader::ColumnReader;
use crate::data_type::private::ParquetValueType;
use crate::data_type::{AsBytes, FixedLenByteArrayType};
use crate::file::page_index::index::{Index, NativeIndex};
@@ -1730,4 +1731,28 @@ mod tests {
_ => unreachable!(),
}
}
+
+ #[test]
+ fn test_multi_gz() {
+ let file = get_test_file("concatenated_gzip_members.parquet");
+ let reader = SerializedFileReader::new(file).unwrap();
+ let row_group_reader = reader.get_row_group(0).unwrap();
+ match row_group_reader.get_column_reader(0).unwrap() {
+ ColumnReader::Int64ColumnReader(mut reader) => {
+ let mut buffer = [0; 1024];
+ let mut def_levels = [0; 1024];
+ let (num_records, num_values, num_levels) = reader
+ .read_records(1024, Some(&mut def_levels), None, &mut buffer)
+ .unwrap();
+
+ assert_eq!(num_records, 513);
+ assert_eq!(num_values, 513);
+ assert_eq!(num_levels, 513);
+
+ let expected: Vec<i64> = (1..514).collect();
+ assert_eq!(&buffer[..513], &expected);
+ }
+ _ => unreachable!(),
+ }
+ }
}