This is an automated email from the ASF dual-hosted git repository.

pitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new a2594dff60 GH-50162: [C++][Parquet] Avoid int32 overflow in 
BitPackedRunDecoder::GetBatch offset (#50089)
a2594dff60 is described below

commit a2594dff60d25d3c1b91f28a086fbfc326220e19
Author: metsw24-max <[email protected]>
AuthorDate: Tue Jun 16 14:03:18 2026 +0530

    GH-50162: [C++][Parquet] Avoid int32 overflow in 
BitPackedRunDecoder::GetBatch offset (#50089)
    
    **int32 overflow in the bit-packed run decoder offset**
    GetBatch computes the byte position as `values_read_ * value_bit_width` 
using 32-bit int arithmetic. For a large bit-packed run (this decodes untrusted 
Parquet RLE/bit-packed dictionary indices and levels, with value width up to 
64), the product exceeds INT32_MAX and wraps negative, so bytes_fully_read goes 
negative and unread_data ends up pointing before the buffer, causing an 
out-of-bounds read in unpack. The raw_data_size computation just above already 
widens to int64 before the same multiply, so I matched that here.
    * GitHub Issue: #50162
    
    Authored-by: metsw24-max <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/arrow/util/rle_encoding_internal.h | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/cpp/src/arrow/util/rle_encoding_internal.h 
b/cpp/src/arrow/util/rle_encoding_internal.h
index 161a926f5d..7b4f211480 100644
--- a/cpp/src/arrow/util/rle_encoding_internal.h
+++ b/cpp/src/arrow/util/rle_encoding_internal.h
@@ -371,15 +371,19 @@ class BitPackedRunDecoder {
   /// left.
   [[nodiscard]] rle_size_t GetBatch(value_type* out, rle_size_t batch_size,
                                     rle_size_t value_bit_width) {
-    const int bits_read = values_read_ * value_bit_width;
-    const int bytes_fully_read = bits_read / 8;
+    const int64_t bits_read = static_cast<int64_t>(values_read_) * 
value_bit_width;
+    const int64_t bytes_fully_read = bits_read / 8;
+    // The parser only creates runs whose full payload fits in max_read_bytes_ 
(see
+    // BitPackedRun), so the max_read_bytes difference below is in [0, 
max_read_bytes_]
+    // and fits an int. A negative (unbounded) max_read_bytes_ stays negative.
+    ARROW_DCHECK(max_read_bytes_ < 0 || bytes_fully_read <= max_read_bytes_);
     const uint8_t* unread_data = data_ + bytes_fully_read;
 
     const ::arrow::internal::UnpackOptions opts{
         /* .batch_size= */ std::min(batch_size, remaining()),
         /* .bit_width= */ value_bit_width,
-        /* .bit_offset= */ bits_read % 8,
-        /* .max_read_bytes= */ max_read_bytes_ - bytes_fully_read,
+        /* .bit_offset= */ static_cast<int>(bits_read % 8),
+        /* .max_read_bytes= */ static_cast<int>(max_read_bytes_ - 
bytes_fully_read),
     };
 
     if constexpr (std::is_same_v<T, bool>) {

Reply via email to