This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 7fa78b79b5 Support multiple GZip members in parquet page (#4951)
7fa78b79b5 is described below

commit 7fa78b79b5e3ba028b32b20096dbe4a6f17c82bd
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Wed Nov 15 13:55:21 2023 +0000

    Support multiple GZip members in parquet page (#4951)
---
 parquet-testing                       |  2 +-
 parquet/src/compression.rs            |  2 +-
 parquet/src/file/serialized_reader.rs | 25 +++++++++++++++++++++++++
 3 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/parquet-testing b/parquet-testing
index 506afff9b6..89b685a64c 160000
--- a/parquet-testing
+++ b/parquet-testing
@@ -1 +1 @@
-Subproject commit 506afff9b6957ffe10d08470d467867d43e1bb91
+Subproject commit 89b685a64c3117b3023d8684af1f41400841db71
diff --git a/parquet/src/compression.rs b/parquet/src/compression.rs
index 9e0eee0e3e..a9a1afbbf2 100644
--- a/parquet/src/compression.rs
+++ b/parquet/src/compression.rs
@@ -255,7 +255,7 @@ mod gzip_codec {
             output_buf: &mut Vec<u8>,
             _uncompress_size: Option<usize>,
         ) -> Result<usize> {
-            let mut decoder = read::GzDecoder::new(input_buf);
+            let mut decoder = read::MultiGzDecoder::new(input_buf);
             decoder.read_to_end(output_buf).map_err(|e| e.into())
         }
 
diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs
index 43e169cd08..fbb172d3b3 100644
--- a/parquet/src/file/serialized_reader.rs
+++ b/parquet/src/file/serialized_reader.rs
@@ -775,6 +775,7 @@ mod tests {
     use crate::format::BoundaryOrder;
 
     use crate::basic::{self, ColumnOrder};
+    use crate::column::reader::ColumnReader;
     use crate::data_type::private::ParquetValueType;
     use crate::data_type::{AsBytes, FixedLenByteArrayType};
     use crate::file::page_index::index::{Index, NativeIndex};
@@ -1730,4 +1731,28 @@ mod tests {
             _ => unreachable!(),
         }
     }
+
+    #[test]
+    fn test_multi_gz() {
+        let file = get_test_file("concatenated_gzip_members.parquet");
+        let reader = SerializedFileReader::new(file).unwrap();
+        let row_group_reader = reader.get_row_group(0).unwrap();
+        match row_group_reader.get_column_reader(0).unwrap() {
+            ColumnReader::Int64ColumnReader(mut reader) => {
+                let mut buffer = [0; 1024];
+                let mut def_levels = [0; 1024];
+                let (num_records, num_values, num_levels) = reader
+                    .read_records(1024, Some(&mut def_levels), None, &mut buffer)
+                    .unwrap();
+
+                assert_eq!(num_records, 513);
+                assert_eq!(num_values, 513);
+                assert_eq!(num_levels, 513);
+
+                let expected: Vec<i64> = (1..514).collect();
+                assert_eq!(&buffer[..513], &expected);
+            }
+            _ => unreachable!(),
+        }
+    }
 }

Reply via email to