This is an automated email from the ASF dual-hosted git repository. maplefu pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push: new 85779a44e9 GH-45185: [C++][Parquet] Raise an error for invalid repetition levels when delimiting records (#45186) 85779a44e9 is described below commit 85779a44e9d1804a0167c46971682beb4d1766e5 Author: Adam Reeve <adre...@gmail.com> AuthorDate: Fri Mar 28 21:21:51 2025 +1300 GH-45185: [C++][Parquet] Raise an error for invalid repetition levels when delimiting records (#45186) ### Rationale for this change See #45185. Invalid repetition levels would previously only cause a fatal error in debug builds. ### What changes are included in this PR? Converts an existing `ARROW_DCHECK_EQ` of the repetition level with a check that will raise an exception in release builds too. ### Are these changes tested? Yes, using a new example file (https://github.com/apache/parquet-testing/pull/67) ### Are there any user-facing changes? Yes, reading columns with invalid repetition levels as Arrow arrays will now raise an exception. * GitHub Issue: #45185 Lead-authored-by: Adam Reeve <adre...@gmail.com> Co-authored-by: mwish <maplewish...@gmail.com> Signed-off-by: mwish <maplewish...@gmail.com> --- cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 16 +++++++++++++++- cpp/src/parquet/column_reader.cc | 7 ++++++- cpp/submodules/parquet-testing | 2 +- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 0cc5f948c7..3b071bbbea 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -3978,7 +3978,8 @@ TEST(TestImpalaConversion, ArrowTimestampToImpalaTimestamp) { } void TryReadDataFile(const std::string& path, - ::arrow::StatusCode expected_code = ::arrow::StatusCode::OK) { + ::arrow::StatusCode expected_code = ::arrow::StatusCode::OK, + const std::string& expected_message = "") { auto pool = ::arrow::default_memory_pool(); std::unique_ptr<FileReader> arrow_reader; @@ -3992,6 +3993,12 @@ void TryReadDataFile(const std::string& path, ASSERT_EQ(s.code(), expected_code) << "Expected reading file to return " << arrow::Status::CodeAsString(expected_code) << ", but got " << s.ToString(); + + if (!expected_message.empty()) { + ASSERT_EQ(s.message().find(expected_message), 0) + << "Expected an error message beginning with '" << expected_message + << "', but got '" << s.message() << "'"; + } } TEST(TestArrowReaderAdHoc, Int96BadMemoryAccess) { @@ -4005,6 +4012,13 @@ TEST(TestArrowReaderAdHoc, CorruptedSchema) { TryReadDataFile(path, ::arrow::StatusCode::IOError); } +TEST(TestArrowReaderAdHoc, InvalidRepetitionLevels) { + // GH-45185 - Repetition levels start with 1 instead of 0 + auto path = test::get_data_file("ARROW-GH-45185.parquet", /*is_good=*/false); + TryReadDataFile(path, ::arrow::StatusCode::IOError, + "The repetition level at the start of a record must be 0 but got 1"); +} + TEST(TestArrowReaderAdHoc, LARGE_MEMORY_TEST(LargeStringColumn)) { // ARROW-3762 ::arrow::StringBuilder builder; diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 8d4169b034..3268701bed 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -1611,7 +1611,12 @@ class TypedRecordReader : public TypedColumnReaderImpl<DType>, // another record start or exhausting the ColumnChunk int64_t level = levels_position_; if (at_record_start_) { - ARROW_DCHECK_EQ(0, rep_levels[levels_position_]); + if (ARROW_PREDICT_FALSE(rep_levels[levels_position_] != 0)) { + std::stringstream ss; + ss << "The repetition level at the start of a record must be 0 but got " + << rep_levels[levels_position_]; + throw ParquetException(ss.str()); + } ++levels_position_; // We have decided to consume the level at this position; therefore we // must advance until we find another record boundary diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing index c7cf1374cf..f4d7ed772a 160000 --- a/cpp/submodules/parquet-testing +++ b/cpp/submodules/parquet-testing @@ -1 +1 @@ -Subproject commit c7cf1374cf284c0c73024cd1437becea75558bf8 +Subproject commit f4d7ed772a62a95111db50fbcad2460833e8c882