This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 3faa8e172f GH-49626: [C++][Parquet] Fix encoding fuzzing failure
(#49627)
3faa8e172f is described below
commit 3faa8e172f6277ea21cd667500f3d5b1ac36e0e2
Author: Antoine Pitrou <[email protected]>
AuthorDate: Wed Apr 1 11:35:34 2026 +0200
GH-49626: [C++][Parquet] Fix encoding fuzzing failure (#49627)
### Rationale for this change
The DecodeArrow decoder functions are expected to always return the number
of requested values or error out, but the DELTA_BINARY_PACKED would not always
satisfy this guarantee.
Issue found by OSS-Fuzz: https://issues.oss-fuzz.com/issues/492457225
### What changes are included in this PR?
1. Make sure `DecodeArrow` functions for DELTA_BINARY_PACKED error out if
there are not enough values
2. Improve `DecodeArrow` docstrings to clarify the meaning of the
parameters and return value
### Are these changes tested?
Only by fuzz regression file.
### Are there any user-facing changes?
No.
* GitHub Issue: #49626
Authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
ci/scripts/cpp_test.sh | 15 ++++++++-------
cpp/src/parquet/arrow/fuzz_encoding_internal.cc | 9 +++++++--
cpp/src/parquet/decoder.cc | 6 ++++++
cpp/src/parquet/encoding.h | 16 ++++++++++++++--
testing | 2 +-
5 files changed, 36 insertions(+), 12 deletions(-)
diff --git a/ci/scripts/cpp_test.sh b/ci/scripts/cpp_test.sh
index ac4cb06100..241addbfeb 100755
--- a/ci/scripts/cpp_test.sh
+++ b/ci/scripts/cpp_test.sh
@@ -214,16 +214,17 @@ if [ "${ARROW_FUZZING}" == "ON" ]; then
fi
# 3. Run fuzz targets on regression files from arrow-testing
+ fuzz_target_options="-rss_limit_mb=2560" # same as on OSS-Fuzz
pushd "${ARROW_TEST_DATA}"
- "${binary_output_dir}/arrow-ipc-stream-fuzz" arrow-ipc-stream/crash-*
- "${binary_output_dir}/arrow-ipc-stream-fuzz" arrow-ipc-stream/*-testcase-*
- "${binary_output_dir}/arrow-ipc-file-fuzz" arrow-ipc-file/*-testcase-*
- "${binary_output_dir}/arrow-ipc-tensor-stream-fuzz"
arrow-ipc-tensor-stream/*-testcase-*
+ "${binary_output_dir}/arrow-ipc-stream-fuzz" ${fuzz_target_options}
arrow-ipc-stream/crash-*
+ "${binary_output_dir}/arrow-ipc-stream-fuzz" ${fuzz_target_options}
arrow-ipc-stream/*-testcase-*
+ "${binary_output_dir}/arrow-ipc-file-fuzz" ${fuzz_target_options}
arrow-ipc-file/*-testcase-*
+ "${binary_output_dir}/arrow-ipc-tensor-stream-fuzz" ${fuzz_target_options}
arrow-ipc-tensor-stream/*-testcase-*
if [ "${ARROW_PARQUET}" == "ON" ]; then
- "${binary_output_dir}/parquet-arrow-fuzz" parquet/fuzzing/*-testcase-*
- "${binary_output_dir}/parquet-encoding-fuzz"
parquet/encoding-fuzzing/*-testcase-*
+ "${binary_output_dir}/parquet-arrow-fuzz" ${fuzz_target_options}
parquet/fuzzing/*-testcase-*
+ "${binary_output_dir}/parquet-encoding-fuzz" ${fuzz_target_options}
parquet/encoding-fuzzing/*-testcase-*
fi
- "${binary_output_dir}/arrow-csv-fuzz" csv/fuzzing/*-testcase-*
+ "${binary_output_dir}/arrow-csv-fuzz" ${fuzz_target_options}
csv/fuzzing/*-testcase-*
popd
fi
diff --git a/cpp/src/parquet/arrow/fuzz_encoding_internal.cc
b/cpp/src/parquet/arrow/fuzz_encoding_internal.cc
index c2dfb39eef..0694f550b1 100644
--- a/cpp/src/parquet/arrow/fuzz_encoding_internal.cc
+++ b/cpp/src/parquet/arrow/fuzz_encoding_internal.cc
@@ -236,6 +236,7 @@ struct TypedFuzzEncoding {
auto decoder = MakeDecoder(encoding);
decoder->SetData(num_values_, encoded_data.data(),
static_cast<int>(encoded_data.size()));
+ std::shared_ptr<Array> output;
if constexpr (kType == Type::BYTE_ARRAY) {
Accumulator acc;
@@ -244,14 +245,18 @@ struct TypedFuzzEncoding {
decoder->DecodeArrowNonNull(num_values_, &acc);
END_PARQUET_CATCH_EXCEPTIONS
ARROW_CHECK_EQ(acc.chunks.size(), 0);
- return acc.builder->Finish();
+ ARROW_ASSIGN_OR_RAISE(output, acc.builder->Finish());
} else {
Accumulator builder(arrow_type, pool());
BEGIN_PARQUET_CATCH_EXCEPTIONS
decoder->DecodeArrowNonNull(num_values_, &builder);
END_PARQUET_CATCH_EXCEPTIONS
- return builder.Finish();
+ ARROW_ASSIGN_OR_RAISE(output, builder.Finish());
}
+ // DecodeArrow* methods should emit as many values as requested, or error
out
+ // if there are not enough remaining values to decode.
+ ARROW_DCHECK_EQ(output->length(), num_values_) << "Read less values than
expected";
+ return output;
}
Status Fuzz() {
diff --git a/cpp/src/parquet/decoder.cc b/cpp/src/parquet/decoder.cc
index df7a9becad..629b9ba0c7 100644
--- a/cpp/src/parquet/decoder.cc
+++ b/cpp/src/parquet/decoder.cc
@@ -1484,6 +1484,9 @@ class DeltaBitPackDecoder : public
TypedDecoderImpl<DType> {
}
std::vector<T> values(num_values);
int decoded_count = GetInternal(values.data(), num_values);
+ if (decoded_count < num_values) {
+ ParquetException::EofException("Not enough values in data page");
+ }
PARQUET_THROW_NOT_OK(out->AppendValues(values.data(), decoded_count));
return decoded_count;
}
@@ -1497,6 +1500,9 @@ class DeltaBitPackDecoder : public
TypedDecoderImpl<DType> {
}
std::vector<T> values(num_values);
int decoded_count = GetInternal(values.data(), num_values);
+ if (decoded_count < num_values) {
+ ParquetException::EofException("Not enough values in data page");
+ }
PARQUET_THROW_NOT_OK(out->Reserve(decoded_count));
for (int i = 0; i < decoded_count; ++i) {
PARQUET_THROW_NOT_OK(out->Append(values[i]));
diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h
index 12ee889679..e3de4f2aa6 100644
--- a/cpp/src/parquet/encoding.h
+++ b/cpp/src/parquet/encoding.h
@@ -292,20 +292,32 @@ class TypedDecoder : virtual public Decoder {
/// \brief Decode into an ArrayBuilder or other accumulator
///
+ /// \param[in] num_values number of values to decode, including null slots
+ /// \param[in] null_count number of null slots
+ /// \param[in] valid_bits validity bitmap
+ /// \param[in] valid_bits_offset bit offset to start into the validity bitmap
+ /// \param[in] out accumulator to decode into
+ ///
/// This function assumes the definition levels were already decoded
/// as a validity bitmap in the given `valid_bits`. `null_count`
/// is the number of 0s in `valid_bits`.
+ /// `valid_bits` must at least `valid_bits_offset + num_values` bits.
/// As a space optimization, it is allowed for `valid_bits` to be null
/// if `null_count` is zero.
+ /// This function throws a ParquetException if there are less than
+ /// `num_values - null_count` values left to decode.
///
- /// \return number of values decoded
+ /// \return The number of non-null values decoded
virtual int DecodeArrow(int num_values, int null_count, const uint8_t*
valid_bits,
int64_t valid_bits_offset,
typename EncodingTraits<DType>::Accumulator* out) =
0;
/// \brief Decode into an ArrayBuilder or other accumulator ignoring nulls
///
- /// \return number of values decoded
+ /// \param[in] num_values number of values to decode
+ /// \param[in] out accumulator to decode into
+ ///
+ /// \return The number of values decoded
int DecodeArrowNonNull(int num_values,
typename EncodingTraits<DType>::Accumulator* out) {
return DecodeArrow(num_values, 0, /*valid_bits=*/NULLPTR, 0, out);
diff --git a/testing b/testing
index 015d80bd20..a871ddc17a 160000
--- a/testing
+++ b/testing
@@ -1 +1 @@
-Subproject commit 015d80bd2012a0629b8b269bb97dec4e96f28b6b
+Subproject commit a871ddc17a4dd936b7aa43898d59f86a11c3a2b5