alamb commented on code in PR #7389:
URL: https://github.com/apache/arrow-rs/pull/7389#discussion_r2033402532


##########
parquet/src/file/serialized_reader.rs:
##########
@@ -452,21 +452,20 @@ pub(crate) fn decode_page(
     // maximum page header size and abort if that is exceeded.
     let buffer = match decompressor {
         Some(decompressor) if can_decompress => {
-            let uncompressed_size = page_header.uncompressed_page_size as 
usize;
-            let mut decompressed = Vec::with_capacity(uncompressed_size);
-            let compressed = &buffer.as_ref()[offset..];
+            let uncompressed_page_size = page_header.uncompressed_page_size as 
usize;

Review Comment:
   If we are going to be messing with this code anyways, maybe we can do a 
checked conversion from signed to unsigned as well:
   
   ```suggestion
               let uncompressed_page_size = 
usize::try_from(page_header.uncompressed_page_size)?;
   ```
   



##########
parquet/src/file/serialized_reader.rs:
##########
@@ -1321,6 +1320,107 @@ mod tests {
         assert_eq!(page_count, 2);
     }
 
+    #[test]
+    fn test_file_reader_empty_datapage_v2() {
+        let test_file = 
get_test_file("datapage_v2_empty_datapage.snappy.parquet");

Review Comment:
   While we are adding tests, can you also add a test that 
`page_v2_empty_compressed.parquet` (added in 
https://github.com/apache/parquet-testing/pull/71)  works too ? It doesn't seem 
to be used yet 
https://github.com/search?q=repo%3Aapache%2Farrow-rs%20page_v2_empty_compressed.parquet&type=code
   
   I know you say the current reader works ok with that file, but it would be 
nice to add additional automated coverage



##########
parquet/src/file/serialized_reader.rs:
##########
@@ -1321,6 +1320,107 @@ mod tests {
         assert_eq!(page_count, 2);
     }
 
+    #[test]
+    fn test_file_reader_empty_datapage_v2() {
+        let test_file = 
get_test_file("datapage_v2_empty_datapage.snappy.parquet");
+        let reader_result = SerializedFileReader::new(test_file);
+        assert!(reader_result.is_ok());
+        let reader = reader_result.unwrap();
+
+        // Test contents in Parquet metadata
+        let metadata = reader.metadata();
+        assert_eq!(metadata.num_row_groups(), 1);
+
+        // Test contents in file metadata
+        let file_metadata = metadata.file_metadata();
+        assert!(file_metadata.created_by().is_some());
+        assert_eq!(
+            file_metadata.created_by().unwrap(),
+            "parquet-mr version 1.13.1 (build 
db4183109d5b734ec5930d870cdae161e408ddba)"
+        );
+        assert!(file_metadata.key_value_metadata().is_some());
+        assert_eq!(
+            file_metadata.key_value_metadata().to_owned().unwrap().len(),
+            2
+        );
+
+        assert_eq!(file_metadata.num_rows(), 1);
+        assert_eq!(file_metadata.version(), 1);
+        let expected_order = 
ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED);
+        assert_eq!(
+            file_metadata.column_orders(),
+            Some(vec![expected_order].as_ref())
+        );
+
+        let row_group_metadata = metadata.row_group(0);
+
+        // Check each column order
+        for i in 0..row_group_metadata.num_columns() {
+            assert_eq!(file_metadata.column_order(i), expected_order);
+        }
+
+        // Test row group reader
+        let row_group_reader_result = reader.get_row_group(0);
+        assert!(row_group_reader_result.is_ok());
+        let row_group_reader: Box<dyn RowGroupReader> = 
row_group_reader_result.unwrap();
+        assert_eq!(
+            row_group_reader.num_columns(),
+            row_group_metadata.num_columns()
+        );
+        assert_eq!(
+            row_group_reader.metadata().total_byte_size(),
+            row_group_metadata.total_byte_size()
+        );
+
+        // Test page readers
+        let page_reader_0_result = row_group_reader.get_column_page_reader(0);
+        assert!(page_reader_0_result.is_ok());
+        let mut page_reader_0: Box<dyn PageReader> = 
page_reader_0_result.unwrap();
+        let mut page_count = 0;
+        while let Ok(Some(page)) = page_reader_0.get_next_page() {
+            let is_expected_page = match page {
+                Page::DictionaryPage {
+                    buf,
+                    num_values,
+                    encoding,
+                    is_sorted,
+                } => {
+                    assert_eq!(buf.len(), 7);
+                    assert_eq!(num_values, 1);
+                    assert_eq!(encoding, Encoding::PLAIN);
+                    assert!(!is_sorted);
+                    true
+                }
+                Page::DataPageV2 {
+                    buf,
+                    num_values,
+                    encoding,
+                    num_nulls,
+                    num_rows,
+                    def_levels_byte_len,
+                    rep_levels_byte_len,
+                    is_compressed,
+                    statistics,
+                } => {
+                    assert_eq!(buf.len(), 2);
+                    assert_eq!(num_values, 1);
+                    assert_eq!(encoding, Encoding::PLAIN);
+                    assert_eq!(num_nulls, 1);
+                    assert_eq!(num_rows, 1);
+                    assert_eq!(def_levels_byte_len, 2);
+                    assert_eq!(rep_levels_byte_len, 0);
+                    assert!(is_compressed);
+                    assert!(statistics.is_none());
+                    true
+                }
+                _ => false,
+            };
+            assert!(is_expected_page);
+            page_count += 1;
+        }
+        assert_eq!(page_count, 1);

Review Comment:
   I double checked that the added test fails as follows without the code 
changes in this PR
   ```
   assertion `left == right` failed
     left: 0
    right: 1
   
   Left:  0
   Right: 1
   <Click to see difference>
   
   thread 'file::serialized_reader::tests::test_file_reader_empty_datapage_v2' 
panicked at parquet/src/file/serialized_reader.rs:1422:9:
   assertion `left == right` failed
     left: 0
    right: 1
   stack backtrace:
   ```



##########
parquet/src/file/serialized_reader.rs:
##########
@@ -452,21 +452,20 @@ pub(crate) fn decode_page(
     // maximum page header size and abort if that is exceeded.
     let buffer = match decompressor {
         Some(decompressor) if can_decompress => {
-            let uncompressed_size = page_header.uncompressed_page_size as 
usize;
-            let mut decompressed = Vec::with_capacity(uncompressed_size);
-            let compressed = &buffer.as_ref()[offset..];
+            let uncompressed_page_size = page_header.uncompressed_page_size as 
usize;
+            let decompressed_size = uncompressed_page_size - offset;
+            let mut decompressed = Vec::with_capacity(uncompressed_page_size);
             decompressed.extend_from_slice(&buffer.as_ref()[..offset]);
-            decompressor.decompress(
-                compressed,
-                &mut decompressed,
-                Some(uncompressed_size - offset),
-            )?;
+            if decompressed_size != 0 {

Review Comment:
   > I'd prefer
   
   FWIW since `decompressed_size` is an unsigned integer I think they are 
equivalent you should do whatever you prefer / makes the most sense to you



##########
parquet/src/file/serialized_reader.rs:
##########
@@ -452,21 +452,20 @@ pub(crate) fn decode_page(
     // maximum page header size and abort if that is exceeded.
     let buffer = match decompressor {
         Some(decompressor) if can_decompress => {
-            let uncompressed_size = page_header.uncompressed_page_size as 
usize;
-            let mut decompressed = Vec::with_capacity(uncompressed_size);
-            let compressed = &buffer.as_ref()[offset..];
+            let uncompressed_page_size = page_header.uncompressed_page_size as 
usize;
+            let decompressed_size = uncompressed_page_size - offset;
+            let mut decompressed = Vec::with_capacity(uncompressed_page_size);
             decompressed.extend_from_slice(&buffer.as_ref()[..offset]);
-            decompressor.decompress(
-                compressed,
-                &mut decompressed,
-                Some(uncompressed_size - offset),
-            )?;
+            if decompressed_size != 0 {

Review Comment:
   Yes please! Otherwise we may be susceptible to some sort of DOS / parquet 
bomb with malformed input
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to