Re: [PR] GH-48308: [C++][Parquet] Fix potential crash when reading invalid Parquet data [arrow]

via GitHub Tue, 02 Dec 2025 17:53:33 -0800


mapleFU commented on code in PR #48309:
URL: https://github.com/apache/arrow/pull/48309#discussion_r2583356838



##########
cpp/src/parquet/decoder.cc:
##########
@@ -147,6 +139,20 @@ struct ArrowBinaryHelper<ByteArrayType, 
::arrow::BinaryType> {
     return Status::OK();
   }
 
+  Status ReserveInitialChunkData(std::optional<int64_t> 
estimated_remaining_data_length) {
+    RETURN_NOT_OK(builder_->Reserve(entries_remaining_));
+    if (estimated_remaining_data_length.has_value()) {
+      int64_t required_capacity =
+          std::min(*estimated_remaining_data_length, chunk_space_remaining_);
+      // Ensure we'll be allocating a non-zero-sized data buffer, to avoid 
encountering
+      // a null pointer
+      constexpr int64_t kMinReserveCapacity = 64;

Review Comment:
   64 here is just a not too small or not too large number, to avoid `builder_` 
return an NULLPTR?



##########
cpp/src/parquet/decoder.cc:
##########
@@ -754,43 +760,47 @@ class PlainByteArrayDecoder : public 
PlainDecoder<ByteArrayType> {
                           int64_t valid_bits_offset,
                           typename EncodingTraits<ByteArrayType>::Accumulator* 
out,
                           int* out_values_decoded) {
-    // We're going to decode up to `num_values - null_count` PLAIN values,
+    // We're going to decode `num_values - null_count` PLAIN values,
     // and each value has a 4-byte length header that doesn't count for the
     // Arrow binary data length.
-    int64_t estimated_data_length =
-        std::max<int64_t>(0, len_ - 4 * (num_values - null_count));
+    int64_t estimated_data_length = len_ - 4 * (num_values - null_count);
+    if (ARROW_PREDICT_FALSE(estimated_data_length < 0)) {
+      return Status::Invalid("Invalid or truncated PLAIN-encoded BYTE_ARRAY 
data");
+    }
 
     auto visit_binary_helper = [&](auto* helper) {
       int values_decoded = 0;
 
-      RETURN_NOT_OK(VisitBitRuns(
-          valid_bits, valid_bits_offset, num_values,
-          [&](int64_t position, int64_t run_length, bool is_valid) {
-            if (is_valid) {
-              for (int64_t i = 0; i < run_length; ++i) {
-                if (ARROW_PREDICT_FALSE(len_ < 4)) {
-                  return Status::Invalid(
-                      "Invalid or truncated PLAIN-encoded BYTE_ARRAY data");
-                }
-                auto value_len = SafeLoadAs<int32_t>(data_);
-                if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > len_ - 
4)) {
-                  return Status::Invalid(
-                      "Invalid or truncated PLAIN-encoded BYTE_ARRAY data");
-                }
-                RETURN_NOT_OK(
-                    helper->AppendValue(data_ + 4, value_len, 
estimated_data_length));
-                auto increment = value_len + 4;
-                data_ += increment;
-                len_ -= increment;
-                estimated_data_length -= value_len;
-                DCHECK_GE(estimated_data_length, 0);
-              }
-              values_decoded += static_cast<int>(run_length);
-              return Status::OK();
-            } else {
-              return helper->AppendNulls(run_length);
+      auto visit_run = [&](int64_t position, int64_t run_length, bool 
is_valid) {
+        if (is_valid) {
+          for (int64_t i = 0; i < run_length; ++i) {
+            // We ensure `len_` is sufficient thanks to:
+            //   1. the initial `estimated_data_length` check above,
+            //   2. the running `value_len > estimated_data_length` check 
below.
+            // This precondition follows from those two checks.
+            DCHECK_GE(len_, 4);
+            auto value_len = SafeLoadAs<int32_t>(data_);
+            if (ARROW_PREDICT_FALSE(value_len < 0 || value_len > 
estimated_data_length)) {

Review Comment:
   Would you mind add this to comment?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] GH-48308: [C++][Parquet] Fix potential crash when reading invalid Parquet data [arrow]

Reply via email to