This is an automated email from the ASF dual-hosted git repository.

pitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 07a0aa3ebc GH-49805: [C++][Parquet] Avoid unbounded temporary allocation in DeltaBitPackDecoder::DecodeArrow (#49806)
07a0aa3ebc is described below

commit 07a0aa3ebc6cb83ecf80accd7fcce180c2579053
Author: Antoine Pitrou <[email protected]>
AuthorDate: Mon Apr 20 18:15:42 2026 +0200

    GH-49805: [C++][Parquet] Avoid unbounded temporary allocation in DeltaBitPackDecoder::DecodeArrow (#49806)
    
    ### Rationale for this change
    
    DeltaBitPackDecoder::DecodeArrow allocates a temporary std::vector for the entire range of decoded values. This is sub-optimal, and generates out-of-memory failures in the Parquet encoding fuzzer.
    
    This issue was found by OSS-Fuzz: https://issues.oss-fuzz.com/issues/489941774
    
    ### What changes are included in this PR?
    
    We should instead use a fixed-size temporary buffer.
    
    ### Are these changes tested?
    
    Yes, by existing CI tests.
    
    ### Are there any user-facing changes?
    
    No.
    
    * GitHub Issue: #49805
    
    Authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/parquet/decoder.cc | 64 +++++++++++++++++++++++++++++-----------------
 1 file changed, 41 insertions(+), 23 deletions(-)

diff --git a/cpp/src/parquet/decoder.cc b/cpp/src/parquet/decoder.cc
index 629b9ba0c7..18fae84dc5 100644
--- a/cpp/src/parquet/decoder.cc
+++ b/cpp/src/parquet/decoder.cc
@@ -18,6 +18,7 @@
 #include "parquet/encoding.h"
 
 #include <algorithm>
+#include <array>
 #include <cstdint>
 #include <cstdlib>
 #include <cstring>
@@ -1478,41 +1479,58 @@ class DeltaBitPackDecoder : public TypedDecoderImpl<DType> {
   int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
                   int64_t valid_bits_offset,
                   typename EncodingTraits<DType>::Accumulator* out) override {
-    if (null_count != 0) {
-      // TODO(ARROW-34660): implement DecodeArrow with null slots.
-      ParquetException::NYI("Delta bit pack DecodeArrow with null slots");
-    }
-    std::vector<T> values(num_values);
-    int decoded_count = GetInternal(values.data(), num_values);
-    if (decoded_count < num_values) {
-      ParquetException::EofException("Not enough values in data page");
-    }
-    PARQUET_THROW_NOT_OK(out->AppendValues(values.data(), decoded_count));
-    return decoded_count;
+    auto reserve = [out](int64_t num_values) {
+      PARQUET_THROW_NOT_OK(out->Reserve(num_values));
+    };
+    auto append_values = [out](T* values, int64_t num_values) {
+      PARQUET_THROW_NOT_OK(out->AppendValues(values, num_values));
+    };
+    return DecodeArrowInternal(num_values, null_count, valid_bits, valid_bits_offset,
+                               reserve, append_values);
   }
 
   int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
                   int64_t valid_bits_offset,
                   typename EncodingTraits<DType>::DictAccumulator* out) override {
+    auto reserve = [out](int64_t num_values) {
+      PARQUET_THROW_NOT_OK(out->Reserve(num_values));
+    };
+    auto append_values = [out](T* values, int64_t num_values) {
+      for (int i = 0; i < num_values; ++i) {
+        PARQUET_THROW_NOT_OK(out->Append(values[i]));
+      }
+    };
+    return DecodeArrowInternal(num_values, null_count, valid_bits, valid_bits_offset,
+                               reserve, append_values);
+  }
+
+ private:
+  static constexpr int kMaxDeltaBitWidth = static_cast<int>(sizeof(T) * 8);
+
+  template <typename ReserveFunc, typename AppendValuesFunc>
+  int DecodeArrowInternal(int num_values, int null_count, const uint8_t* valid_bits,
+                          int64_t valid_bits_offset, ReserveFunc&& reserve_func,
+                          AppendValuesFunc&& append_values_func) {
     if (null_count != 0) {
       // TODO(ARROW-34660): implement DecodeArrow with null slots.
       ParquetException::NYI("Delta bit pack DecodeArrow with null slots");
     }
-    std::vector<T> values(num_values);
-    int decoded_count = GetInternal(values.data(), num_values);
-    if (decoded_count < num_values) {
-      ParquetException::EofException("Not enough values in data page");
-    }
-    PARQUET_THROW_NOT_OK(out->Reserve(decoded_count));
-    for (int i = 0; i < decoded_count; ++i) {
-      PARQUET_THROW_NOT_OK(out->Append(values[i]));
+    reserve_func(num_values);
+    constexpr int kBatchSize = 1024;
+    std::array<T, kBatchSize> values;
+    int offset = 0;
+    while (offset < num_values) {
+      const int batch_size = std::min(num_values - offset, kBatchSize);
+      int decoded_count = GetInternal(values.data(), batch_size);
+      if (decoded_count != batch_size) {
+        ParquetException::EofException("Not enough values in data page");
+      }
+      append_values_func(values.data(), batch_size);
+      offset += batch_size;
     }
-    return decoded_count;
+    return num_values;
   }
 
- private:
-  static constexpr int kMaxDeltaBitWidth = static_cast<int>(sizeof(T) * 8);
-
   void InitHeader() {
     if (!decoder_->GetVlqInt(&values_per_block_) ||
         !decoder_->GetVlqInt(&mini_blocks_per_block_) ||

Reply via email to