mapleFU commented on code in PR #38327:
URL: https://github.com/apache/arrow/pull/38327#discussion_r1364270024
##########
cpp/src/parquet/file_deserialize_test.cc:
##########
@@ -879,6 +879,45 @@ TEST_F(TestPageSerde, DataPageV2CrcCheckNonExistent) {
/* write_data_page_v2 */ true);
}
+
+TEST_F(TestPageSerde, BadCompressedPageSize) {
+  auto codec_types = GetSupportedCodecTypes();
+  const int data_page_bytes = 8192;
+  const int32_t num_rows = 32;  // dummy value
+  data_page_header_.num_values = num_rows;
+  std::vector<uint8_t> faux_data;
+  // Resize and fill with 1s so the data compresses well.
+  faux_data.resize(data_page_bytes, 1);
+  for (auto codec_type : codec_types) {
+    auto codec = GetCodec(codec_type);
+    std::vector<uint8_t> buffer;
+    const uint8_t* data = faux_data.data();
+    int data_size = static_cast<int>(faux_data.size());
+
+    int64_t max_compressed_size = codec->MaxCompressedLen(data_size, data);
+    buffer.resize(max_compressed_size);
+
+    int64_t actual_size;
+    ASSERT_OK_AND_ASSIGN(
+        actual_size,
+        codec->Compress(data_size, data, max_compressed_size, &buffer[0]));
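+    // The header below claims an uncompressed size of data_size + 1, one byte
+    // more than the payload actually decompresses to, so the page reader
+    // should reject the page.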
+    ASSERT_NO_FATAL_FAILURE(WriteDataPageHeader(data_page_bytes, data_size + 1,
+                                                static_cast<int32_t>(actual_size)));
+    ASSERT_OK(out_stream_->Write(buffer.data(), actual_size));
+    InitSerializedPageReader(num_rows, codec_type);
+
+    EXPECT_THROW_THAT(
+        [&]() { page_reader_->NextPage(); }, ParquetException,
+        ::testing::AnyOf(
+            ::testing::Property(
+                &ParquetException::what,
+                ::testing::HasSubstr("Page didn't decompress to expected size")),
+            // Some decompressors, like zstd, might detect the error
+            // before checking the page size.
Review Comment:
ZSTD would throw an `IOError: Corrupt ...` here, before the decompressed-size check runs, so the matcher has to accept that message as well.
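
For reference, a minimal sketch of how the second `AnyOf` branch could accept that error, assuming zstd's failure is rethrown as a `ParquetException` whose message contains "Corrupt" (the exact substring is an assumption based on the IOError above, not something verified against the codec):

    EXPECT_THROW_THAT(
        [&]() { page_reader_->NextPage(); }, ParquetException,
        ::testing::AnyOf(
            ::testing::Property(
                &ParquetException::what,
                ::testing::HasSubstr("Page didn't decompress to expected size")),
            // Assumption: the zstd IOError surfaces with "Corrupt" in the message.
            ::testing::Property(&ParquetException::what,
                                ::testing::HasSubstr("Corrupt"))));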