This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new c5a2f799cf GH-49759: [C++][Integration] Harden BinaryView JSON parsing 
with runtime validation (#49758)
c5a2f799cf is described below

commit c5a2f799cfe907731c46273cf678812848f8ee67
Author: metsw24-max <[email protected]>
AuthorDate: Sat Apr 18 18:15:54 2026 +0530

    GH-49759: [C++][Integration] Harden BinaryView JSON parsing with runtime 
validation (#49758)
    
    ### Rationale for this change
    Binary view JSON parsing relied on debug-only checks for key invariants. In 
release builds, malformed JSON inputs could bypass these checks and potentially 
reach unsafe memory handling paths.
    
    ### What changes are included in this PR?
    This PR hardens binary view parsing in `json_internal.cc` by introducing 
explicit runtime validation for:
    1. Non-negative SIZE
    2. Exact INLINED length checks
    3. Non-negative BUFFER_INDEX and OFFSET
    4. PREFIX_HEX length validation
    5. BUFFER_INDEX bounds checks
    6. OFFSET + SIZE range validation against the referenced data buffer
    
    Additionally, this PR adds regression tests in `json_integration_test.cc`:
    1. BinaryViewRejectsNegativeSize
    2. BinaryViewRejectsInlineLengthMismatch
    3. BinaryViewRejectsOutOfBoundsReference
    
    ### Are these changes tested?
    Yes.
    - Added 3 targeted regression tests (all passing)
    - Verified existing `TestJsonFileReadWrite.*` suite (all passing)
    
    ### Are there any user-facing changes?
    No API changes.
    
    Malformed binary-view JSON inputs now return clear `Invalid` errors instead 
of relying on debug-only assertions.
    
    * GitHub Issue: #49759
    
    Authored-by: metsw24-max <[email protected]>
    Signed-off-by: Sutou Kouhei <[email protected]>
---
 cpp/src/arrow/integration/json_integration_test.cc | 45 ++++++++++++++++
 cpp/src/arrow/integration/json_internal.cc         | 62 +++++++++++++++++-----
 2 files changed, 95 insertions(+), 12 deletions(-)

diff --git a/cpp/src/arrow/integration/json_integration_test.cc 
b/cpp/src/arrow/integration/json_integration_test.cc
index 0e84ea6124..23105e8ca1 100644
--- a/cpp/src/arrow/integration/json_integration_test.cc
+++ b/cpp/src/arrow/integration/json_integration_test.cc
@@ -1112,6 +1112,51 @@ TEST(TestJsonFileReadWrite, JsonExample6) {
   AssertArraysEqual(*batch->column(0), *expected_array);
 }
 
+static void AssertInvalidBinaryViewJson(const std::string& json_array) {
+  rj::Document d;
+  // Pass explicit size to avoid ASAN issues with SIMD loads in RapidJson.
+  d.Parse(json_array.data(), json_array.size());
+  ASSERT_FALSE(d.HasParseError());
+
+  ASSERT_RAISES(Invalid,
+                json::ReadArray(default_memory_pool(), d, field("f", 
binary_view())));
+}
+
+TEST(TestJsonFileReadWrite, BinaryViewRejectsNegativeSize) {
+  AssertInvalidBinaryViewJson(R"({
+    "name": "f",
+    "count": 1,
+    "VALIDITY": [1],
+    "VIEWS": [{"SIZE": -1, "INLINED": ""}],
+    "VARIADIC_DATA_BUFFERS": []
+  })");
+}
+
+TEST(TestJsonFileReadWrite, BinaryViewRejectsInlineLengthMismatch) {
+  AssertInvalidBinaryViewJson(R"({
+    "name": "f",
+    "count": 1,
+    "VALIDITY": [1],
+    "VIEWS": [{"SIZE": 4, "INLINED": "x"}],
+    "VARIADIC_DATA_BUFFERS": []
+  })");
+}
+
+TEST(TestJsonFileReadWrite, BinaryViewRejectsOutOfBoundsReference) {
+  AssertInvalidBinaryViewJson(R"({
+    "name": "f",
+    "count": 1,
+    "VALIDITY": [1],
+    "VIEWS": [{
+      "SIZE": 20,
+      "PREFIX_HEX": "00010203",
+      "BUFFER_INDEX": 0,
+      "OFFSET": 0
+    }],
+    "VARIADIC_DATA_BUFFERS": ["0001020304050607"]
+  })");
+}
+
 class TestJsonRoundTrip : public ::testing::TestWithParam<MakeRecordBatch*> {
  public:
   void SetUp() {}
diff --git a/cpp/src/arrow/integration/json_internal.cc 
b/cpp/src/arrow/integration/json_internal.cc
index 2dc6b36118..1e57be51b0 100644
--- a/cpp/src/arrow/integration/json_internal.cc
+++ b/cpp/src/arrow/integration/json_internal.cc
@@ -19,6 +19,7 @@
 
 #include <cstdint>
 #include <cstdlib>
+#include <cstring>
 #include <iomanip>
 #include <iostream>
 #include <memory>
@@ -1497,8 +1498,11 @@ class ArrayReader {
 
       auto json_size = json_view_obj.FindMember("SIZE");
       RETURN_NOT_INT("SIZE", json_size, json_view_obj);
-      DCHECK_GE(json_size->value.GetInt64(), 0);
-      auto size = static_cast<int32_t>(json_size->value.GetInt64());
+      auto size = json_size->value.GetInt();
+      if (size < 0) {
+        return Status::Invalid("Invalid binary view SIZE: ", size,
+                               ". Expected a non-negative value");
+      }
 
       if (size <= BinaryViewType::kInlineSize) {
         auto json_inlined = json_view_obj.FindMember("INLINED");
@@ -1506,11 +1510,19 @@ class ArrayReader {
         out_view.inlined = {size, {}};
 
         if constexpr (ViewType::is_utf8) {
-          DCHECK_LE(json_inlined->value.GetStringLength(), 
BinaryViewType::kInlineSize);
+          if (json_inlined->value.GetStringLength() != 
static_cast<rj::SizeType>(size)) {
+            return Status::Invalid("Invalid binary view INLINED length: ",
+                                   json_inlined->value.GetStringLength(),
+                                   ". Expected exactly ", size, " bytes");
+          }
           memcpy(&out_view.inlined.data, json_inlined->value.GetString(), 
size);
         } else {
-          DCHECK_LE(json_inlined->value.GetStringLength(),
-                    BinaryViewType::kInlineSize * 2);
+          if (json_inlined->value.GetStringLength() !=
+              static_cast<rj::SizeType>(size * 2)) {
+            return Status::Invalid("Invalid binary view INLINED hex length: ",
+                                   json_inlined->value.GetStringLength(),
+                                   ". Expected exactly ", size * 2, " 
characters");
+          }
           ARROW_ASSIGN_OR_RAISE(auto inlined, 
GetStringView(json_inlined->value));
           RETURN_NOT_OK(ParseHexValues(inlined, out_view.inlined.data.data()));
         }
@@ -1524,20 +1536,46 @@ class ArrayReader {
       RETURN_NOT_INT("BUFFER_INDEX", json_buffer_index, json_view_obj);
       RETURN_NOT_INT("OFFSET", json_offset, json_view_obj);
 
+      const auto buffer_index = json_buffer_index->value.GetInt();
+      const auto offset = json_offset->value.GetInt();
+      if (buffer_index < 0) {
+        return Status::Invalid("Invalid binary view BUFFER_INDEX: ", 
buffer_index,
+                               ". Expected a non-negative value");
+      }
+      if (offset < 0) {
+        return Status::Invalid("Invalid binary view OFFSET: ", offset,
+                               ". Expected a non-negative value");
+      }
+      if (json_prefix->value.GetStringLength() != BinaryViewType::kPrefixSize 
* 2) {
+        return Status::Invalid("Invalid binary view PREFIX_HEX length: ",
+                               json_prefix->value.GetStringLength(),
+                               ". Expected exactly ", 
BinaryViewType::kPrefixSize * 2,
+                               " characters");
+      }
+
+      const int64_t num_variadic_buffers = 
static_cast<int64_t>(buffers.size()) - 2;
+      if (buffer_index >= num_variadic_buffers) {
+        return Status::Invalid("Invalid binary view BUFFER_INDEX: ", 
buffer_index,
+                               ". Expected < ", num_variadic_buffers);
+      }
+
+      const int64_t data_buffer_size = buffers[buffer_index + 2]->size();
+      if (static_cast<int64_t>(offset) > data_buffer_size ||
+          static_cast<int64_t>(size) > data_buffer_size - 
static_cast<int64_t>(offset)) {
+        return Status::Invalid("Invalid binary view range [offset=", offset,
+                               ", size=", size, "] for data buffer of size ",
+                               data_buffer_size);
+      }
+
       out_view.ref = {
           size,
           {},
-          static_cast<int32_t>(json_buffer_index->value.GetInt64()),
-          static_cast<int32_t>(json_offset->value.GetInt64()),
+          buffer_index,
+          offset,
       };
 
-      DCHECK_EQ(json_prefix->value.GetStringLength(), 
BinaryViewType::kPrefixSize * 2);
       ARROW_ASSIGN_OR_RAISE(auto prefix, GetStringView(json_prefix->value));
       RETURN_NOT_OK(ParseHexValues(prefix, out_view.ref.prefix.data()));
-
-      DCHECK_LE(static_cast<size_t>(out_view.ref.buffer_index), buffers.size() 
- 2);
-      DCHECK_LE(static_cast<int64_t>(out_view.ref.offset) + out_view.size(),
-                buffers[out_view.ref.buffer_index + 2]->size());
     }
 
     data_ = ArrayData::Make(type_, length_, std::move(buffers), null_count);

Reply via email to