This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 3faa8e172f GH-49626: [C++][Parquet] Fix encoding fuzzing failure 
(#49627)
3faa8e172f is described below

commit 3faa8e172f6277ea21cd667500f3d5b1ac36e0e2
Author: Antoine Pitrou <[email protected]>
AuthorDate: Wed Apr 1 11:35:34 2026 +0200

    GH-49626: [C++][Parquet] Fix encoding fuzzing failure (#49627)
    
    ### Rationale for this change
    
    The DecodeArrow decoder functions are expected to always return the number 
of requested values or error out, but the DELTA_BINARY_PACKED would not always 
satisfy this guarantee.
    
    Issue found by OSS-Fuzz: https://issues.oss-fuzz.com/issues/492457225
    
    ### What changes are included in this PR?
    
    1. Make sure `DecodeArrow` functions for DELTA_BINARY_PACKED error out if 
there are not enough values
    2. Improve `DecodeArrow` docstrings to clarify the meaning of the 
parameters and return value
    
    ### Are these changes tested?
    
    Only by fuzz regression file.
    
    ### Are there any user-facing changes?
    
    No.
    * GitHub Issue: #49626
    
    Authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 ci/scripts/cpp_test.sh                          | 15 ++++++++-------
 cpp/src/parquet/arrow/fuzz_encoding_internal.cc |  9 +++++++--
 cpp/src/parquet/decoder.cc                      |  6 ++++++
 cpp/src/parquet/encoding.h                      | 16 ++++++++++++++--
 testing                                         |  2 +-
 5 files changed, 36 insertions(+), 12 deletions(-)

diff --git a/ci/scripts/cpp_test.sh b/ci/scripts/cpp_test.sh
index ac4cb06100..241addbfeb 100755
--- a/ci/scripts/cpp_test.sh
+++ b/ci/scripts/cpp_test.sh
@@ -214,16 +214,17 @@ if [ "${ARROW_FUZZING}" == "ON" ]; then
     fi
 
     # 3. Run fuzz targets on regression files from arrow-testing
+    fuzz_target_options="-rss_limit_mb=2560"  # same as on OSS-Fuzz
     pushd "${ARROW_TEST_DATA}"
-    "${binary_output_dir}/arrow-ipc-stream-fuzz" arrow-ipc-stream/crash-*
-    "${binary_output_dir}/arrow-ipc-stream-fuzz" arrow-ipc-stream/*-testcase-*
-    "${binary_output_dir}/arrow-ipc-file-fuzz" arrow-ipc-file/*-testcase-*
-    "${binary_output_dir}/arrow-ipc-tensor-stream-fuzz" 
arrow-ipc-tensor-stream/*-testcase-*
+    "${binary_output_dir}/arrow-ipc-stream-fuzz" ${fuzz_target_options} 
arrow-ipc-stream/crash-*
+    "${binary_output_dir}/arrow-ipc-stream-fuzz" ${fuzz_target_options} 
arrow-ipc-stream/*-testcase-*
+    "${binary_output_dir}/arrow-ipc-file-fuzz" ${fuzz_target_options} 
arrow-ipc-file/*-testcase-*
+    "${binary_output_dir}/arrow-ipc-tensor-stream-fuzz" ${fuzz_target_options} 
arrow-ipc-tensor-stream/*-testcase-*
     if [ "${ARROW_PARQUET}" == "ON" ]; then
-      "${binary_output_dir}/parquet-arrow-fuzz" parquet/fuzzing/*-testcase-*
-      "${binary_output_dir}/parquet-encoding-fuzz" 
parquet/encoding-fuzzing/*-testcase-*
+      "${binary_output_dir}/parquet-arrow-fuzz" ${fuzz_target_options} 
parquet/fuzzing/*-testcase-*
+      "${binary_output_dir}/parquet-encoding-fuzz" ${fuzz_target_options} 
parquet/encoding-fuzzing/*-testcase-*
     fi
-    "${binary_output_dir}/arrow-csv-fuzz" csv/fuzzing/*-testcase-*
+    "${binary_output_dir}/arrow-csv-fuzz" ${fuzz_target_options} 
csv/fuzzing/*-testcase-*
     popd
 fi
 
diff --git a/cpp/src/parquet/arrow/fuzz_encoding_internal.cc 
b/cpp/src/parquet/arrow/fuzz_encoding_internal.cc
index c2dfb39eef..0694f550b1 100644
--- a/cpp/src/parquet/arrow/fuzz_encoding_internal.cc
+++ b/cpp/src/parquet/arrow/fuzz_encoding_internal.cc
@@ -236,6 +236,7 @@ struct TypedFuzzEncoding {
     auto decoder = MakeDecoder(encoding);
     decoder->SetData(num_values_, encoded_data.data(),
                      static_cast<int>(encoded_data.size()));
+    std::shared_ptr<Array> output;
 
     if constexpr (kType == Type::BYTE_ARRAY) {
       Accumulator acc;
@@ -244,14 +245,18 @@ struct TypedFuzzEncoding {
       decoder->DecodeArrowNonNull(num_values_, &acc);
       END_PARQUET_CATCH_EXCEPTIONS
       ARROW_CHECK_EQ(acc.chunks.size(), 0);
-      return acc.builder->Finish();
+      ARROW_ASSIGN_OR_RAISE(output, acc.builder->Finish());
     } else {
       Accumulator builder(arrow_type, pool());
       BEGIN_PARQUET_CATCH_EXCEPTIONS
       decoder->DecodeArrowNonNull(num_values_, &builder);
       END_PARQUET_CATCH_EXCEPTIONS
-      return builder.Finish();
+      ARROW_ASSIGN_OR_RAISE(output, builder.Finish());
     }
+    // DecodeArrow* methods should emit as many values as requested, or error 
out
+    // if there are not enough remaining values to decode.
+    ARROW_DCHECK_EQ(output->length(), num_values_) << "Read less values than 
expected";
+    return output;
   }
 
   Status Fuzz() {
diff --git a/cpp/src/parquet/decoder.cc b/cpp/src/parquet/decoder.cc
index df7a9becad..629b9ba0c7 100644
--- a/cpp/src/parquet/decoder.cc
+++ b/cpp/src/parquet/decoder.cc
@@ -1484,6 +1484,9 @@ class DeltaBitPackDecoder : public 
TypedDecoderImpl<DType> {
     }
     std::vector<T> values(num_values);
     int decoded_count = GetInternal(values.data(), num_values);
+    if (decoded_count < num_values) {
+      ParquetException::EofException("Not enough values in data page");
+    }
     PARQUET_THROW_NOT_OK(out->AppendValues(values.data(), decoded_count));
     return decoded_count;
   }
@@ -1497,6 +1500,9 @@ class DeltaBitPackDecoder : public 
TypedDecoderImpl<DType> {
     }
     std::vector<T> values(num_values);
     int decoded_count = GetInternal(values.data(), num_values);
+    if (decoded_count < num_values) {
+      ParquetException::EofException("Not enough values in data page");
+    }
     PARQUET_THROW_NOT_OK(out->Reserve(decoded_count));
     for (int i = 0; i < decoded_count; ++i) {
       PARQUET_THROW_NOT_OK(out->Append(values[i]));
diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h
index 12ee889679..e3de4f2aa6 100644
--- a/cpp/src/parquet/encoding.h
+++ b/cpp/src/parquet/encoding.h
@@ -292,20 +292,32 @@ class TypedDecoder : virtual public Decoder {
 
   /// \brief Decode into an ArrayBuilder or other accumulator
   ///
+  /// \param[in] num_values number of values to decode, including null slots
+  /// \param[in] null_count number of null slots
+  /// \param[in] valid_bits validity bitmap
+  /// \param[in] valid_bits_offset bit offset to start into the validity bitmap
+  /// \param[in] out accumulator to decode into
+  ///
   /// This function assumes the definition levels were already decoded
   /// as a validity bitmap in the given `valid_bits`.  `null_count`
   /// is the number of 0s in `valid_bits`.
+  /// `valid_bits` must at least `valid_bits_offset + num_values` bits.
   /// As a space optimization, it is allowed for `valid_bits` to be null
   /// if `null_count` is zero.
+  /// This function throws a ParquetException if there are less than
+  /// `num_values - null_count` values left to decode.
   ///
-  /// \return number of values decoded
+  /// \return The number of non-null values decoded
   virtual int DecodeArrow(int num_values, int null_count, const uint8_t* 
valid_bits,
                           int64_t valid_bits_offset,
                           typename EncodingTraits<DType>::Accumulator* out) = 
0;
 
   /// \brief Decode into an ArrayBuilder or other accumulator ignoring nulls
   ///
-  /// \return number of values decoded
+  /// \param[in] num_values number of values to decode
+  /// \param[in] out accumulator to decode into
+  ///
+  /// \return The number of values decoded
   int DecodeArrowNonNull(int num_values,
                          typename EncodingTraits<DType>::Accumulator* out) {
     return DecodeArrow(num_values, 0, /*valid_bits=*/NULLPTR, 0, out);
diff --git a/testing b/testing
index 015d80bd20..a871ddc17a 160000
--- a/testing
+++ b/testing
@@ -1 +1 @@
-Subproject commit 015d80bd2012a0629b8b269bb97dec4e96f28b6b
+Subproject commit a871ddc17a4dd936b7aa43898d59f86a11c3a2b5

Reply via email to