(arrow) branch main updated: GH-49299: [C++][Parquet] Integer overflow in Parquet dict decoding (#49300)

apitrou Tue, 17 Feb 2026 23:58:49 -0800

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git



The following commit(s) were added to refs/heads/main by this push:
     new 59e0ba6f66 GH-49299: [C++][Parquet] Integer overflow in Parquet dict decoding (#49300)
59e0ba6f66 is described below

commit 59e0ba6f6644d53480e35f8e7b7f365c51c9ef59
Author: Antoine Pitrou <[email protected]>
AuthorDate: Wed Feb 18 08:58:30 2026 +0100

    GH-49299: [C++][Parquet] Integer overflow in Parquet dict decoding (#49300)
    
    ### Rationale for this change
    
    Computing the byte size of a buffer of decoded dictionary values in Parquet could lead to integer overflow on a 32-bit multiplication. This does not seem easily exploitable due to another size check in the PLAIN decoder (we only support PLAIN-encoded dictionary values).
    
    ### What changes are included in this PR?
    
    Do byte size computations in the 64-bit signed integer domain to avoid any overflow issues.
    
    ### Are these changes tested?
    
    No.
    
    ### Are there any user-facing changes?
    
    No.
    
    * GitHub Issue: #49299
    
    Authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/parquet/decoder.cc | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/cpp/src/parquet/decoder.cc b/cpp/src/parquet/decoder.cc
index 3ce2323d29..5d32d39e5f 100644
--- a/cpp/src/parquet/decoder.cc
+++ b/cpp/src/parquet/decoder.cc
@@ -1000,8 +1000,9 @@ class DictDecoderImpl : public TypedDecoderImpl<Type>, public DictDecoder<Type>
 
   inline void DecodeDict(TypedDecoder<Type>* dictionary) {
     dictionary_length_ = static_cast<int32_t>(dictionary->values_left());
-    PARQUET_THROW_NOT_OK(dictionary_->Resize(dictionary_length_ * sizeof(T),
-                                             /*shrink_to_fit=*/false));
+    PARQUET_THROW_NOT_OK(
+        dictionary_->Resize(static_cast<int64_t>(dictionary_length_) * sizeof(T),
+                            /*shrink_to_fit=*/false));
     dictionary->Decode(dictionary_->mutable_data_as<T>(), dictionary_length_);
   }
 
@@ -1044,15 +1045,15 @@ void DictDecoderImpl<ByteArrayType>::SetDict(TypedDecoder<ByteArrayType>* dictio
 
   auto* dict_values = dictionary_->mutable_data_as<ByteArray>();
 
-  int total_size = 0;
+  int64_t total_size = 0;
   for (int i = 0; i < dictionary_length_; ++i) {
     total_size += dict_values[i].len;
   }
   PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size,
                                                 /*shrink_to_fit=*/false));
-  PARQUET_THROW_NOT_OK(
-      byte_array_offsets_->Resize((dictionary_length_ + 1) * sizeof(int32_t),
-                                  /*shrink_to_fit=*/false));
+  PARQUET_THROW_NOT_OK(byte_array_offsets_->Resize(
+      (static_cast<int64_t>(dictionary_length_) + 1) * sizeof(int32_t),
+      /*shrink_to_fit=*/false));
 
   int32_t offset = 0;
   uint8_t* bytes_data = byte_array_data_->mutable_data();
@@ -1073,7 +1074,7 @@ inline void DictDecoderImpl<FLBAType>::SetDict(TypedDecoder<FLBAType>* dictionar
   auto* dict_values = dictionary_->mutable_data_as<FLBA>();
 
   int fixed_len = this->type_length_;
-  int total_size = dictionary_length_ * fixed_len;
+  int64_t total_size = static_cast<int64_t>(dictionary_length_) * fixed_len;
 
   PARQUET_THROW_NOT_OK(byte_array_data_->Resize(total_size,
                                                 /*shrink_to_fit=*/false));

(arrow) branch main updated: GH-49299: [C++][Parquet] Integer overflow in Parquet dict decoding (#49300)

Reply via email to