This is an automated email from the ASF dual-hosted git repository.

maplefu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 85779a44e9 GH-45185: [C++][Parquet] Raise an error for invalid 
repetition levels when delimiting records (#45186)
85779a44e9 is described below

commit 85779a44e9d1804a0167c46971682beb4d1766e5
Author: Adam Reeve <adre...@gmail.com>
AuthorDate: Fri Mar 28 21:21:51 2025 +1300

    GH-45185: [C++][Parquet] Raise an error for invalid repetition levels when 
delimiting records (#45186)
    
    ### Rationale for this change
    
    See #45185. Invalid repetition levels would previously only cause a fatal 
error in debug builds.
    
    ### What changes are included in this PR?
    
    Converts an existing `ARROW_DCHECK_EQ` of the repetition level with a check 
that will raise an exception in release builds too.
    
    ### Are these changes tested?
    
    Yes, using a new example file 
(https://github.com/apache/parquet-testing/pull/67)
    
    ### Are there any user-facing changes?
    
    Yes, reading columns with invalid repetition levels as Arrow arrays will 
now raise an exception.
    * GitHub Issue: #45185
    
    Lead-authored-by: Adam Reeve <adre...@gmail.com>
    Co-authored-by: mwish <maplewish...@gmail.com>
    Signed-off-by: mwish <maplewish...@gmail.com>
---
 cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 16 +++++++++++++++-
 cpp/src/parquet/column_reader.cc                  |  7 ++++++-
 cpp/submodules/parquet-testing                    |  2 +-
 3 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc 
b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
index 0cc5f948c7..3b071bbbea 100644
--- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
+++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc
@@ -3978,7 +3978,8 @@ TEST(TestImpalaConversion, 
ArrowTimestampToImpalaTimestamp) {
 }
 
 void TryReadDataFile(const std::string& path,
-                     ::arrow::StatusCode expected_code = 
::arrow::StatusCode::OK) {
+                     ::arrow::StatusCode expected_code = 
::arrow::StatusCode::OK,
+                     const std::string& expected_message = "") {
   auto pool = ::arrow::default_memory_pool();
 
   std::unique_ptr<FileReader> arrow_reader;
@@ -3992,6 +3993,12 @@ void TryReadDataFile(const std::string& path,
   ASSERT_EQ(s.code(), expected_code)
       << "Expected reading file to return " << 
arrow::Status::CodeAsString(expected_code)
       << ", but got " << s.ToString();
+
+  if (!expected_message.empty()) {
+    ASSERT_EQ(s.message().find(expected_message), 0)
+        << "Expected an error message beginning with '" << expected_message
+        << "', but got '" << s.message() << "'";
+  }
 }
 
 TEST(TestArrowReaderAdHoc, Int96BadMemoryAccess) {
@@ -4005,6 +4012,13 @@ TEST(TestArrowReaderAdHoc, CorruptedSchema) {
   TryReadDataFile(path, ::arrow::StatusCode::IOError);
 }
 
+TEST(TestArrowReaderAdHoc, InvalidRepetitionLevels) {
+  // GH-45185 - Repetition levels start with 1 instead of 0
+  auto path = test::get_data_file("ARROW-GH-45185.parquet", /*is_good=*/false);
+  TryReadDataFile(path, ::arrow::StatusCode::IOError,
+                  "The repetition level at the start of a record must be 0 but 
got 1");
+}
+
 TEST(TestArrowReaderAdHoc, LARGE_MEMORY_TEST(LargeStringColumn)) {
   // ARROW-3762
   ::arrow::StringBuilder builder;
diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc
index 8d4169b034..3268701bed 100644
--- a/cpp/src/parquet/column_reader.cc
+++ b/cpp/src/parquet/column_reader.cc
@@ -1611,7 +1611,12 @@ class TypedRecordReader : public 
TypedColumnReaderImpl<DType>,
     // another record start or exhausting the ColumnChunk
     int64_t level = levels_position_;
     if (at_record_start_) {
-      ARROW_DCHECK_EQ(0, rep_levels[levels_position_]);
+      if (ARROW_PREDICT_FALSE(rep_levels[levels_position_] != 0)) {
+        std::stringstream ss;
+        ss << "The repetition level at the start of a record must be 0 but got 
"
+           << rep_levels[levels_position_];
+        throw ParquetException(ss.str());
+      }
       ++levels_position_;
       // We have decided to consume the level at this position; therefore we
       // must advance until we find another record boundary
diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing
index c7cf1374cf..f4d7ed772a 160000
--- a/cpp/submodules/parquet-testing
+++ b/cpp/submodules/parquet-testing
@@ -1 +1 @@
-Subproject commit c7cf1374cf284c0c73024cd1437becea75558bf8
+Subproject commit f4d7ed772a62a95111db50fbcad2460833e8c882

Reply via email to