This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 4712daba56 GH-38326: [C++][Parquet] check the decompressed page size 
same as size in page header (#38327)
4712daba56 is described below

commit 4712daba56a282137bc72e05f10a3db152ee2534
Author: mwish <[email protected]>
AuthorDate: Fri Oct 20 00:29:52 2023 +0800

    GH-38326: [C++][Parquet] check the decompressed page size same as size in 
page header (#38327)
    
    
    
    ### Rationale for this change
    
    As mentioned in issue, currently we only decompress the page without 
checking the decompress size.
    This patch add a checkings.
    
    ### What changes are included in this PR?
    
    Throw exception when size not matches in 
`SerializedPageReader::DecompressIfNeeded`
    
    ### Are these changes tested?
    
    Yes.
    
    ### Are there any user-facing changes?
    
    Non-conforming files may throw an exception while they would silently 
return invalid results before.
    
    * Closes: #38326
    
    Authored-by: mwish <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/parquet/column_reader.cc         | 15 +++++++++---
 cpp/src/parquet/column_reader_test.cc    |  4 +--
 cpp/src/parquet/file_deserialize_test.cc | 42 ++++++++++++++++++++++++++++++++
 3 files changed, 55 insertions(+), 6 deletions(-)

diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc
index e09369effb..ecc48811e4 100644
--- a/cpp/src/parquet/column_reader.cc
+++ b/cpp/src/parquet/column_reader.cc
@@ -594,10 +594,17 @@ std::shared_ptr<Buffer> 
SerializedPageReader::DecompressIfNeeded(
   }
 
   // Decompress the values
-  PARQUET_THROW_NOT_OK(decompressor_->Decompress(
-      compressed_len - levels_byte_len, page_buffer->data() + levels_byte_len,
-      uncompressed_len - levels_byte_len,
-      decompression_buffer_->mutable_data() + levels_byte_len));
+  PARQUET_ASSIGN_OR_THROW(
+      auto decompressed_len,
+      decompressor_->Decompress(compressed_len - levels_byte_len,
+                                page_buffer->data() + levels_byte_len,
+                                uncompressed_len - levels_byte_len,
+                                decompression_buffer_->mutable_data() + 
levels_byte_len));
+  if (decompressed_len != uncompressed_len - levels_byte_len) {
+    throw ParquetException("Page didn't decompress to expected size, expected: 
" +
+                           std::to_string(uncompressed_len - levels_byte_len) +
+                           ", but got:" + std::to_string(decompressed_len));
+  }
 
   return decompression_buffer_;
 }
diff --git a/cpp/src/parquet/column_reader_test.cc 
b/cpp/src/parquet/column_reader_test.cc
index b3b75a7bb5..bed7e06786 100644
--- a/cpp/src/parquet/column_reader_test.cc
+++ b/cpp/src/parquet/column_reader_test.cc
@@ -1587,8 +1587,8 @@ class ByteArrayRecordReaderTest : public 
::testing::TestWithParam<bool> {
 };
 
 // Tests reading and skipping a ByteArray field.
-// The binary readers only differ in DeocdeDense and DecodeSpaced functions, so
-// testing optional is sufficient in excercising those code paths.
+// The binary readers only differ in DecodeDense and DecodeSpaced functions, so
+// testing optional is sufficient in exercising those code paths.
 TEST_P(ByteArrayRecordReaderTest, ReadAndSkipOptional) {
   MakeRecordReader(/*levels_per_page=*/90, /*num_pages=*/1);
 
diff --git a/cpp/src/parquet/file_deserialize_test.cc 
b/cpp/src/parquet/file_deserialize_test.cc
index b2a918915e..4377e714a2 100644
--- a/cpp/src/parquet/file_deserialize_test.cc
+++ b/cpp/src/parquet/file_deserialize_test.cc
@@ -879,6 +879,48 @@ TEST_F(TestPageSerde, DataPageV2CrcCheckNonExistent) {
                          /* write_data_page_v2 */ true);
 }
 
+TEST_F(TestPageSerde, BadCompressedPageSize) {
+  // GH-38326: an exception should be raised if a compressed data page
+  // decompresses to a smaller size than declared in the data page header.
+  auto codec_types = GetSupportedCodecTypes();
+  const int data_page_bytes = 8192;
+  const int32_t num_rows = 32;  // dummy value
+  data_page_header_.num_values = num_rows;
+  std::vector<uint8_t> faux_data;
+  // A well-compressible piece of data
+  faux_data.resize(data_page_bytes, 1);
+  for (auto codec_type : codec_types) {
+    auto codec = GetCodec(codec_type);
+    std::vector<uint8_t> buffer;
+    const uint8_t* data = faux_data.data();
+    int data_size = static_cast<int>(faux_data.size());
+
+    int64_t max_compressed_size = codec->MaxCompressedLen(data_size, data);
+    buffer.resize(max_compressed_size);
+
+    int64_t actual_size;
+    ASSERT_OK_AND_ASSIGN(
+        actual_size, codec->Compress(data_size, data, max_compressed_size, 
&buffer[0]));
+    // Write a data page header declaring a larger decompressed size than 
actual
+    ASSERT_NO_FATAL_FAILURE(WriteDataPageHeader(data_page_bytes, data_size + 1,
+                                                
static_cast<int32_t>(actual_size)));
+    ASSERT_OK(out_stream_->Write(buffer.data(), actual_size));
+    InitSerializedPageReader(num_rows, codec_type);
+
+    EXPECT_THROW_THAT(
+        [&]() { page_reader_->NextPage(); }, ParquetException,
+        ::testing::AnyOf(
+            ::testing::Property(
+                &ParquetException::what,
+                ::testing::HasSubstr("Page didn't decompress to expected 
size")),
+            // Some decompressor, like zstd, might be able to detect the error
+            // before checking the page size.
+            ::testing::Property(&ParquetException::what,
+                                ::testing::HasSubstr("IOError"))));
+    ResetStream();
+  }
+}
+
 // ----------------------------------------------------------------------
 // File structure tests
 

Reply via email to