This is an automated email from the ASF dual-hosted git repository.
pitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 07a0aa3ebc GH-49805: [C++][Parquet] Avoid unbounded temporary
allocation in DeltaBitPackDecoder::DecodeArrow (#49806)
07a0aa3ebc is described below
commit 07a0aa3ebc6cb83ecf80accd7fcce180c2579053
Author: Antoine Pitrou <[email protected]>
AuthorDate: Mon Apr 20 18:15:42 2026 +0200
GH-49805: [C++][Parquet] Avoid unbounded temporary allocation in
DeltaBitPackDecoder::DecodeArrow (#49806)
### Rationale for this change
DeltaBitPackDecoder::DecodeArrow allocates a temporary std::vector sized to
the entire requested number of values. This is sub-optimal, and causes
out-of-memory failures in the Parquet encoding fuzzer.
This issue was found by OSS-Fuzz:
https://issues.oss-fuzz.com/issues/489941774
### What changes are included in this PR?
Decode in batches through a fixed-size temporary buffer (1024 values) instead,
appending each batch to the output builder.
### Are these changes tested?
Yes, by existing CI tests.
### Are there any user-facing changes?
No.
* GitHub Issue: #49805
Authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/parquet/decoder.cc | 64 +++++++++++++++++++++++++++++-----------------
1 file changed, 41 insertions(+), 23 deletions(-)
diff --git a/cpp/src/parquet/decoder.cc b/cpp/src/parquet/decoder.cc
index 629b9ba0c7..18fae84dc5 100644
--- a/cpp/src/parquet/decoder.cc
+++ b/cpp/src/parquet/decoder.cc
@@ -18,6 +18,7 @@
#include "parquet/encoding.h"
#include <algorithm>
+#include <array>
#include <cstdint>
#include <cstdlib>
#include <cstring>
@@ -1478,41 +1479,58 @@ class DeltaBitPackDecoder : public
TypedDecoderImpl<DType> {
int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
int64_t valid_bits_offset,
typename EncodingTraits<DType>::Accumulator* out) override {
- if (null_count != 0) {
- // TODO(ARROW-34660): implement DecodeArrow with null slots.
- ParquetException::NYI("Delta bit pack DecodeArrow with null slots");
- }
- std::vector<T> values(num_values);
- int decoded_count = GetInternal(values.data(), num_values);
- if (decoded_count < num_values) {
- ParquetException::EofException("Not enough values in data page");
- }
- PARQUET_THROW_NOT_OK(out->AppendValues(values.data(), decoded_count));
- return decoded_count;
+ auto reserve = [out](int64_t num_values) {
+ PARQUET_THROW_NOT_OK(out->Reserve(num_values));
+ };
+ auto append_values = [out](T* values, int64_t num_values) {
+ PARQUET_THROW_NOT_OK(out->AppendValues(values, num_values));
+ };
+ return DecodeArrowInternal(num_values, null_count, valid_bits,
valid_bits_offset,
+ reserve, append_values);
}
int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
int64_t valid_bits_offset,
typename EncodingTraits<DType>::DictAccumulator* out)
override {
+ auto reserve = [out](int64_t num_values) {
+ PARQUET_THROW_NOT_OK(out->Reserve(num_values));
+ };
+ auto append_values = [out](T* values, int64_t num_values) {
+ for (int i = 0; i < num_values; ++i) {
+ PARQUET_THROW_NOT_OK(out->Append(values[i]));
+ }
+ };
+ return DecodeArrowInternal(num_values, null_count, valid_bits,
valid_bits_offset,
+ reserve, append_values);
+ }
+
+ private:
+ static constexpr int kMaxDeltaBitWidth = static_cast<int>(sizeof(T) * 8);
+
+ template <typename ReserveFunc, typename AppendValuesFunc>
+ int DecodeArrowInternal(int num_values, int null_count, const uint8_t*
valid_bits,
+ int64_t valid_bits_offset, ReserveFunc&&
reserve_func,
+ AppendValuesFunc&& append_values_func) {
if (null_count != 0) {
// TODO(ARROW-34660): implement DecodeArrow with null slots.
ParquetException::NYI("Delta bit pack DecodeArrow with null slots");
}
- std::vector<T> values(num_values);
- int decoded_count = GetInternal(values.data(), num_values);
- if (decoded_count < num_values) {
- ParquetException::EofException("Not enough values in data page");
- }
- PARQUET_THROW_NOT_OK(out->Reserve(decoded_count));
- for (int i = 0; i < decoded_count; ++i) {
- PARQUET_THROW_NOT_OK(out->Append(values[i]));
+ reserve_func(num_values);
+ constexpr int kBatchSize = 1024;
+ std::array<T, kBatchSize> values;
+ int offset = 0;
+ while (offset < num_values) {
+ const int batch_size = std::min(num_values - offset, kBatchSize);
+ int decoded_count = GetInternal(values.data(), batch_size);
+ if (decoded_count != batch_size) {
+ ParquetException::EofException("Not enough values in data page");
+ }
+ append_values_func(values.data(), batch_size);
+ offset += batch_size;
}
- return decoded_count;
+ return num_values;
}
- private:
- static constexpr int kMaxDeltaBitWidth = static_cast<int>(sizeof(T) * 8);
-
void InitHeader() {
if (!decoder_->GetVlqInt(&values_per_block_) ||
!decoder_->GetVlqInt(&mini_blocks_per_block_) ||