This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 587ce1d  ARROW-14792: [C++] Fix crash when reading DELTA_BYTE_ARRAY 
Parquet file
587ce1d is described below

commit 587ce1dc5fcf4172591bdfdcb8882d47507d7daa
Author: Antoine Pitrou <anto...@python.org>
AuthorDate: Wed Nov 24 10:26:20 2021 +0100

    ARROW-14792: [C++] Fix crash when reading DELTA_BYTE_ARRAY Parquet file
    
    Should fix the following issues (found by OSS-Fuzz):
    
    * https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=41221
    * https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=41222
    
    Closes #11756 from pitrou/ARROW-14792-delta-byte-array-fuzz
    
    Authored-by: Antoine Pitrou <anto...@python.org>
    Signed-off-by: Antoine Pitrou <anto...@python.org>
---
 cpp/src/parquet/encoding.cc | 43 +++++++++++++++++++++++--------------------
 testing                     |  2 +-
 2 files changed, 24 insertions(+), 21 deletions(-)

diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc
index 783e868..57a7186 100644
--- a/cpp/src/parquet/encoding.cc
+++ b/cpp/src/parquet/encoding.cc
@@ -2424,8 +2424,11 @@ class DeltaByteArrayDecoder : public DecoderImpl,
         reinterpret_cast<const int32_t*>(buffered_prefix_length_->data()) +
         prefix_len_offset_;
     for (int i = 0; i < max_values; ++i) {
-      if (AddWithOverflow(data_size, prefix_len_ptr[i], &data_size) ||
-          AddWithOverflow(data_size, buffer[i].len, &data_size)) {
+      if (ARROW_PREDICT_FALSE(prefix_len_ptr[i] < 0)) {
+        throw ParquetException("negative prefix length in DELTA_BYTE_ARRAY");
+      }
+      if (ARROW_PREDICT_FALSE(AddWithOverflow(data_size, prefix_len_ptr[i], 
&data_size) ||
+                              AddWithOverflow(data_size, buffer[i].len, 
&data_size))) {
         throw ParquetException("excess expansion in DELTA_BYTE_ARRAY");
       }
     }
@@ -2435,7 +2438,7 @@ class DeltaByteArrayDecoder : public DecoderImpl,
     uint8_t* data_ptr = buffered_data_->mutable_data();
     for (int i = 0; i < max_values; ++i) {
       if (ARROW_PREDICT_FALSE(static_cast<size_t>(prefix_len_ptr[i]) > 
prefix.length())) {
-        throw ParquetException("prefix length too large");
+        throw ParquetException("prefix length too large in DELTA_BYTE_ARRAY");
       }
       memcpy(data_ptr, prefix.data(), prefix_len_ptr[i]);
       // buffer[i] currently points to the string suffix
@@ -2461,31 +2464,31 @@ class DeltaByteArrayDecoder : public DecoderImpl,
                           typename EncodingTraits<ByteArrayType>::Accumulator* 
out,
                           int* out_num_values) {
     ArrowBinaryHelper helper(out);
-    ::arrow::internal::BitmapReader bit_reader(valid_bits, valid_bits_offset, 
num_values);
 
     std::vector<ByteArray> values(num_values);
-    int num_valid_values = GetInternal(values.data(), num_values - null_count);
+    const int num_valid_values = GetInternal(values.data(), num_values - 
null_count);
     DCHECK_EQ(num_values - null_count, num_valid_values);
 
     auto values_ptr = reinterpret_cast<const ByteArray*>(values.data());
     int value_idx = 0;
 
-    for (int i = 0; i < num_values; ++i) {
-      bool is_valid = bit_reader.IsSet();
-      bit_reader.Next();
+    RETURN_NOT_OK(VisitNullBitmapInline(
+        valid_bits, valid_bits_offset, num_values, null_count,
+        [&]() {
+          const auto& val = values_ptr[value_idx];
+          if (ARROW_PREDICT_FALSE(!helper.CanFit(val.len))) {
+            RETURN_NOT_OK(helper.PushChunk());
+          }
+          RETURN_NOT_OK(helper.Append(val.ptr, static_cast<int32_t>(val.len)));
+          ++value_idx;
+          return Status::OK();
+        },
+        [&]() {
+          RETURN_NOT_OK(helper.AppendNull());
+          --null_count;
+          return Status::OK();
+        }));
 
-      if (is_valid) {
-        const auto& val = values_ptr[value_idx];
-        if (ARROW_PREDICT_FALSE(!helper.CanFit(val.len))) {
-          RETURN_NOT_OK(helper.PushChunk());
-        }
-        RETURN_NOT_OK(helper.Append(val.ptr, static_cast<int32_t>(val.len)));
-        ++value_idx;
-      } else {
-        RETURN_NOT_OK(helper.AppendNull());
-        --null_count;
-      }
-    }
     DCHECK_EQ(null_count, 0);
     *out_num_values = num_valid_values;
     return Status::OK();
diff --git a/testing b/testing
index 93ef4a7..d6c7b9d 160000
--- a/testing
+++ b/testing
@@ -1 +1 @@
-Subproject commit 93ef4a7bbf8cc629fa1f82bf38bb6e89cda91d40
+Subproject commit d6c7b9d670f3cc3af4a27e043749300b9d27addf

Reply via email to