This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 59e0ba6f66 GH-49299: [C++][Parquet] Integer overflow in Parquet dict decoding (#49300)
59e0ba6f66 is described below
commit 59e0ba6f6644d53480e35f8e7b7f365c51c9ef59
Author: Antoine Pitrou <[email protected]>
AuthorDate: Wed Feb 18 08:58:30 2026 +0100
GH-49299: [C++][Parquet] Integer overflow in Parquet dict decoding (#49300)
### Rationale for this change
Computing the byte size of a buffer of decoded dictionary values in Parquet
could lead to integer overflow on a 32-bit multiplication. This does not seem
easily exploitable due to another size check in the PLAIN decoder (we only
support PLAIN-encoded dictionary values).
### What changes are included in this PR?
Do byte size computations in 64-bit integer arithmetic, by widening the
operands to `int64_t` before multiplying or accumulating, to avoid any
overflow issues.
### Are these changes tested?
No.
### Are there any user-facing changes?
No.
* GitHub Issue: #49299
Authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/parquet/decoder.cc | 15 ++++++++-------
1 file changed, 8 insertions(+), 7 deletions(-)
diff --git a/cpp/src/parquet/decoder.cc b/cpp/src/parquet/decoder.cc
index 3ce2323d29..5d32d39e5f 100644
--- a/cpp/src/parquet/decoder.cc
+++ b/cpp/src/parquet/decoder.cc
@@ -1000,8 +1000,9 @@ class DictDecoderImpl : public TypedDecoderImpl<Type>,
public DictDecoder<Type>
inline void DecodeDict(TypedDecoder<Type>* dictionary) {
dictionary_length_ = static_cast<int32_t>(dictionary->values_left());
- PARQUET_THROW_NOT_OK(dictionary_->Resize(dictionary_length_ * sizeof(T),
- /*shrink_to_fit=*/false));
+ PARQUET_THROW_NOT_OK(
+ dictionary_->Resize(static_cast<int64_t>(dictionary_length_) *
sizeof(T),
+ /*shrink_to_fit=*/false));
dictionary->Decode(dictionary_->mutable_data_as<T>(), dictionary_length_);
}
@@ -1044,15 +1045,15 @@ void
DictDecoderImpl<ByteArrayType>::SetDict(TypedDecoder<ByteArrayType>* dictio
auto* dict_values = dictionary_->mutable_data_as<ByteArray>();
- int total_size = 0;
+ int64_t total_size = 0;
for (int i = 0; i < dictionary_length_; ++i) {
total_size += dict_values[i].len;
}
PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size,
/*shrink_to_fit=*/false));
- PARQUET_THROW_NOT_OK(
- byte_array_offsets_->Resize((dictionary_length_ + 1) * sizeof(int32_t),
- /*shrink_to_fit=*/false));
+ PARQUET_THROW_NOT_OK(byte_array_offsets_->Resize(
+ (static_cast<int64_t>(dictionary_length_) + 1) * sizeof(int32_t),
+ /*shrink_to_fit=*/false));
int32_t offset = 0;
uint8_t* bytes_data = byte_array_data_->mutable_data();
@@ -1073,7 +1074,7 @@ inline void
DictDecoderImpl<FLBAType>::SetDict(TypedDecoder<FLBAType>* dictionar
auto* dict_values = dictionary_->mutable_data_as<FLBA>();
int fixed_len = this->type_length_;
- int total_size = dictionary_length_ * fixed_len;
+ int64_t total_size = static_cast<int64_t>(dictionary_length_) * fixed_len;
PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size,
/*shrink_to_fit=*/false));