alamb commented on code in PR #7389:
URL: https://github.com/apache/arrow-rs/pull/7389#discussion_r2033402532
##########
parquet/src/file/serialized_reader.rs:
##########
@@ -452,21 +452,20 @@ pub(crate) fn decode_page(
// maximum page header size and abort if that is exceeded.
let buffer = match decompressor {
Some(decompressor) if can_decompress => {
- let uncompressed_size = page_header.uncompressed_page_size as
usize;
- let mut decompressed = Vec::with_capacity(uncompressed_size);
- let compressed = &buffer.as_ref()[offset..];
+ let uncompressed_page_size = page_header.uncompressed_page_size as
usize;
Review Comment:
If we are going to be messing with this code anyways, maybe we can do a
checked conversion from signed to unsigned as well:
```suggestion
let uncompressed_page_size =
usize::try_from(page_header.uncompressed_page_size)?;
```
##########
parquet/src/file/serialized_reader.rs:
##########
@@ -1321,6 +1320,107 @@ mod tests {
assert_eq!(page_count, 2);
}
+ #[test]
+ fn test_file_reader_empty_datapage_v2() {
+ let test_file =
get_test_file("datapage_v2_empty_datapage.snappy.parquet");
Review Comment:
While we are adding tests, can you also add a test that
`page_v2_empty_compressed.parquet` (added in
https://github.com/apache/parquet-testing/pull/71) works too ? It doesn't seem
to be used yet
https://github.com/search?q=repo%3Aapache%2Farrow-rs%20page_v2_empty_compressed.parquet&type=code
I know you say the current reader works ok with that file, but it would be
nice to add additional automated coverage
##########
parquet/src/file/serialized_reader.rs:
##########
@@ -1321,6 +1320,107 @@ mod tests {
assert_eq!(page_count, 2);
}
+ #[test]
+ fn test_file_reader_empty_datapage_v2() {
+ let test_file =
get_test_file("datapage_v2_empty_datapage.snappy.parquet");
+ let reader_result = SerializedFileReader::new(test_file);
+ assert!(reader_result.is_ok());
+ let reader = reader_result.unwrap();
+
+ // Test contents in Parquet metadata
+ let metadata = reader.metadata();
+ assert_eq!(metadata.num_row_groups(), 1);
+
+ // Test contents in file metadata
+ let file_metadata = metadata.file_metadata();
+ assert!(file_metadata.created_by().is_some());
+ assert_eq!(
+ file_metadata.created_by().unwrap(),
+ "parquet-mr version 1.13.1 (build
db4183109d5b734ec5930d870cdae161e408ddba)"
+ );
+ assert!(file_metadata.key_value_metadata().is_some());
+ assert_eq!(
+ file_metadata.key_value_metadata().to_owned().unwrap().len(),
+ 2
+ );
+
+ assert_eq!(file_metadata.num_rows(), 1);
+ assert_eq!(file_metadata.version(), 1);
+ let expected_order =
ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::SIGNED);
+ assert_eq!(
+ file_metadata.column_orders(),
+ Some(vec![expected_order].as_ref())
+ );
+
+ let row_group_metadata = metadata.row_group(0);
+
+ // Check each column order
+ for i in 0..row_group_metadata.num_columns() {
+ assert_eq!(file_metadata.column_order(i), expected_order);
+ }
+
+ // Test row group reader
+ let row_group_reader_result = reader.get_row_group(0);
+ assert!(row_group_reader_result.is_ok());
+ let row_group_reader: Box<dyn RowGroupReader> =
row_group_reader_result.unwrap();
+ assert_eq!(
+ row_group_reader.num_columns(),
+ row_group_metadata.num_columns()
+ );
+ assert_eq!(
+ row_group_reader.metadata().total_byte_size(),
+ row_group_metadata.total_byte_size()
+ );
+
+ // Test page readers
+ let page_reader_0_result = row_group_reader.get_column_page_reader(0);
+ assert!(page_reader_0_result.is_ok());
+ let mut page_reader_0: Box<dyn PageReader> =
page_reader_0_result.unwrap();
+ let mut page_count = 0;
+ while let Ok(Some(page)) = page_reader_0.get_next_page() {
+ let is_expected_page = match page {
+ Page::DictionaryPage {
+ buf,
+ num_values,
+ encoding,
+ is_sorted,
+ } => {
+ assert_eq!(buf.len(), 7);
+ assert_eq!(num_values, 1);
+ assert_eq!(encoding, Encoding::PLAIN);
+ assert!(!is_sorted);
+ true
+ }
+ Page::DataPageV2 {
+ buf,
+ num_values,
+ encoding,
+ num_nulls,
+ num_rows,
+ def_levels_byte_len,
+ rep_levels_byte_len,
+ is_compressed,
+ statistics,
+ } => {
+ assert_eq!(buf.len(), 2);
+ assert_eq!(num_values, 1);
+ assert_eq!(encoding, Encoding::PLAIN);
+ assert_eq!(num_nulls, 1);
+ assert_eq!(num_rows, 1);
+ assert_eq!(def_levels_byte_len, 2);
+ assert_eq!(rep_levels_byte_len, 0);
+ assert!(is_compressed);
+ assert!(statistics.is_none());
+ true
+ }
+ _ => false,
+ };
+ assert!(is_expected_page);
+ page_count += 1;
+ }
+ assert_eq!(page_count, 1);
Review Comment:
I double checked that the added test fails as follows without the code
changes in this PR
```
assertion `left == right` failed
left: 0
right: 1
Left: 0
Right: 1
<Click to see difference>
thread 'file::serialized_reader::tests::test_file_reader_empty_datapage_v2'
panicked at parquet/src/file/serialized_reader.rs:1422:9:
assertion `left == right` failed
left: 0
right: 1
stack backtrace:
```
##########
parquet/src/file/serialized_reader.rs:
##########
@@ -452,21 +452,20 @@ pub(crate) fn decode_page(
// maximum page header size and abort if that is exceeded.
let buffer = match decompressor {
Some(decompressor) if can_decompress => {
- let uncompressed_size = page_header.uncompressed_page_size as
usize;
- let mut decompressed = Vec::with_capacity(uncompressed_size);
- let compressed = &buffer.as_ref()[offset..];
+ let uncompressed_page_size = page_header.uncompressed_page_size as
usize;
+ let decompressed_size = uncompressed_page_size - offset;
+ let mut decompressed = Vec::with_capacity(uncompressed_page_size);
decompressed.extend_from_slice(&buffer.as_ref()[..offset]);
- decompressor.decompress(
- compressed,
- &mut decompressed,
- Some(uncompressed_size - offset),
- )?;
+ if decompressed_size != 0 {
Review Comment:
> I'd prefer
FWIW since `decompressed_size` is an unsigned integer I think they are
equivalent you should do whatever you prefer / makes the most sense to you
##########
parquet/src/file/serialized_reader.rs:
##########
@@ -452,21 +452,20 @@ pub(crate) fn decode_page(
// maximum page header size and abort if that is exceeded.
let buffer = match decompressor {
Some(decompressor) if can_decompress => {
- let uncompressed_size = page_header.uncompressed_page_size as
usize;
- let mut decompressed = Vec::with_capacity(uncompressed_size);
- let compressed = &buffer.as_ref()[offset..];
+ let uncompressed_page_size = page_header.uncompressed_page_size as
usize;
+ let decompressed_size = uncompressed_page_size - offset;
+ let mut decompressed = Vec::with_capacity(uncompressed_page_size);
decompressed.extend_from_slice(&buffer.as_ref()[..offset]);
- decompressor.decompress(
- compressed,
- &mut decompressed,
- Some(uncompressed_size - offset),
- )?;
+ if decompressed_size != 0 {
Review Comment:
Yes please! Otherwise we may be susceptible to some sort of DOS / parquet
bomb with malformed input
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]