This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new c5a2f799cf GH-49759: [C++][Integration] Harden BinaryView JSON parsing
with runtime validation (#49758)
c5a2f799cf is described below
commit c5a2f799cfe907731c46273cf678812848f8ee67
Author: metsw24-max <[email protected]>
AuthorDate: Sat Apr 18 18:15:54 2026 +0530
GH-49759: [C++][Integration] Harden BinaryView JSON parsing with runtime
validation (#49758)
### Rationale for this change
Binary view JSON parsing relied on debug-only checks for key invariants. In
release builds, malformed JSON inputs could bypass these checks and potentially
reach unsafe memory handling paths.
### What changes are included in this PR?
This PR hardens binary view parsing in `json_internal.cc` by introducing
explicit runtime validation for:
1. Non-negative SIZE
2. Exact INLINED length checks
3. Non-negative BUFFER_INDEX and OFFSET
4. PREFIX_HEX length validation
5. BUFFER_INDEX bounds checks
6. OFFSET + SIZE range validation against the referenced data buffer
Additionally, this PR adds regression tests in `json_integration_test.cc`:
1. BinaryViewRejectsNegativeSize
2. BinaryViewRejectsInlineLengthMismatch
3. BinaryViewRejectsOutOfBoundsReference
### Are these changes tested?
Yes.
- Added 3 targeted regression tests (all passing)
- Verified existing `TestJsonFileReadWrite.*` suite (all passing)
### Are there any user-facing changes?
No API changes.
Malformed binary-view JSON inputs now return clear `Invalid` errors instead
of relying on debug-only assertions.
* GitHub Issue: #49759
Authored-by: metsw24-max <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
---
cpp/src/arrow/integration/json_integration_test.cc | 45 ++++++++++++++++
cpp/src/arrow/integration/json_internal.cc | 62 +++++++++++++++++-----
2 files changed, 95 insertions(+), 12 deletions(-)
diff --git a/cpp/src/arrow/integration/json_integration_test.cc
b/cpp/src/arrow/integration/json_integration_test.cc
index 0e84ea6124..23105e8ca1 100644
--- a/cpp/src/arrow/integration/json_integration_test.cc
+++ b/cpp/src/arrow/integration/json_integration_test.cc
@@ -1112,6 +1112,51 @@ TEST(TestJsonFileReadWrite, JsonExample6) {
AssertArraysEqual(*batch->column(0), *expected_array);
}
+static void AssertInvalidBinaryViewJson(const std::string& json_array) {
+ rj::Document d;
+ // Pass explicit size to avoid ASAN issues with SIMD loads in RapidJson.
+ d.Parse(json_array.data(), json_array.size());
+ ASSERT_FALSE(d.HasParseError());
+
+ ASSERT_RAISES(Invalid,
+ json::ReadArray(default_memory_pool(), d, field("f",
binary_view())));
+}
+
+TEST(TestJsonFileReadWrite, BinaryViewRejectsNegativeSize) {
+ AssertInvalidBinaryViewJson(R"({
+ "name": "f",
+ "count": 1,
+ "VALIDITY": [1],
+ "VIEWS": [{"SIZE": -1, "INLINED": ""}],
+ "VARIADIC_DATA_BUFFERS": []
+ })");
+}
+
+TEST(TestJsonFileReadWrite, BinaryViewRejectsInlineLengthMismatch) {
+ AssertInvalidBinaryViewJson(R"({
+ "name": "f",
+ "count": 1,
+ "VALIDITY": [1],
+ "VIEWS": [{"SIZE": 4, "INLINED": "x"}],
+ "VARIADIC_DATA_BUFFERS": []
+ })");
+}
+
+TEST(TestJsonFileReadWrite, BinaryViewRejectsOutOfBoundsReference) {
+ AssertInvalidBinaryViewJson(R"({
+ "name": "f",
+ "count": 1,
+ "VALIDITY": [1],
+ "VIEWS": [{
+ "SIZE": 20,
+ "PREFIX_HEX": "00010203",
+ "BUFFER_INDEX": 0,
+ "OFFSET": 0
+ }],
+ "VARIADIC_DATA_BUFFERS": ["0001020304050607"]
+ })");
+}
+
class TestJsonRoundTrip : public ::testing::TestWithParam<MakeRecordBatch*> {
public:
void SetUp() {}
diff --git a/cpp/src/arrow/integration/json_internal.cc
b/cpp/src/arrow/integration/json_internal.cc
index 2dc6b36118..1e57be51b0 100644
--- a/cpp/src/arrow/integration/json_internal.cc
+++ b/cpp/src/arrow/integration/json_internal.cc
@@ -19,6 +19,7 @@
#include <cstdint>
#include <cstdlib>
+#include <cstring>
#include <iomanip>
#include <iostream>
#include <memory>
@@ -1497,8 +1498,11 @@ class ArrayReader {
auto json_size = json_view_obj.FindMember("SIZE");
RETURN_NOT_INT("SIZE", json_size, json_view_obj);
- DCHECK_GE(json_size->value.GetInt64(), 0);
- auto size = static_cast<int32_t>(json_size->value.GetInt64());
+ auto size = json_size->value.GetInt();
+ if (size < 0) {
+ return Status::Invalid("Invalid binary view SIZE: ", size,
+ ". Expected a non-negative value");
+ }
if (size <= BinaryViewType::kInlineSize) {
auto json_inlined = json_view_obj.FindMember("INLINED");
@@ -1506,11 +1510,19 @@ class ArrayReader {
out_view.inlined = {size, {}};
if constexpr (ViewType::is_utf8) {
- DCHECK_LE(json_inlined->value.GetStringLength(),
BinaryViewType::kInlineSize);
+ if (json_inlined->value.GetStringLength() !=
static_cast<rj::SizeType>(size)) {
+ return Status::Invalid("Invalid binary view INLINED length: ",
+ json_inlined->value.GetStringLength(),
+ ". Expected exactly ", size, " bytes");
+ }
memcpy(&out_view.inlined.data, json_inlined->value.GetString(),
size);
} else {
- DCHECK_LE(json_inlined->value.GetStringLength(),
- BinaryViewType::kInlineSize * 2);
+ if (json_inlined->value.GetStringLength() !=
+ static_cast<rj::SizeType>(size * 2)) {
+ return Status::Invalid("Invalid binary view INLINED hex length: ",
+ json_inlined->value.GetStringLength(),
+ ". Expected exactly ", size * 2, "
characters");
+ }
ARROW_ASSIGN_OR_RAISE(auto inlined,
GetStringView(json_inlined->value));
RETURN_NOT_OK(ParseHexValues(inlined, out_view.inlined.data.data()));
}
@@ -1524,20 +1536,46 @@ class ArrayReader {
RETURN_NOT_INT("BUFFER_INDEX", json_buffer_index, json_view_obj);
RETURN_NOT_INT("OFFSET", json_offset, json_view_obj);
+ const auto buffer_index = json_buffer_index->value.GetInt();
+ const auto offset = json_offset->value.GetInt();
+ if (buffer_index < 0) {
+ return Status::Invalid("Invalid binary view BUFFER_INDEX: ",
buffer_index,
+ ". Expected a non-negative value");
+ }
+ if (offset < 0) {
+ return Status::Invalid("Invalid binary view OFFSET: ", offset,
+ ". Expected a non-negative value");
+ }
+ if (json_prefix->value.GetStringLength() != BinaryViewType::kPrefixSize
* 2) {
+ return Status::Invalid("Invalid binary view PREFIX_HEX length: ",
+ json_prefix->value.GetStringLength(),
+ ". Expected exactly ",
BinaryViewType::kPrefixSize * 2,
+ " characters");
+ }
+
+ const int64_t num_variadic_buffers =
static_cast<int64_t>(buffers.size()) - 2;
+ if (buffer_index >= num_variadic_buffers) {
+ return Status::Invalid("Invalid binary view BUFFER_INDEX: ",
buffer_index,
+ ". Expected < ", num_variadic_buffers);
+ }
+
+ const int64_t data_buffer_size = buffers[buffer_index + 2]->size();
+ if (static_cast<int64_t>(offset) > data_buffer_size ||
+ static_cast<int64_t>(size) > data_buffer_size -
static_cast<int64_t>(offset)) {
+ return Status::Invalid("Invalid binary view range [offset=", offset,
+ ", size=", size, "] for data buffer of size ",
+ data_buffer_size);
+ }
+
out_view.ref = {
size,
{},
- static_cast<int32_t>(json_buffer_index->value.GetInt64()),
- static_cast<int32_t>(json_offset->value.GetInt64()),
+ buffer_index,
+ offset,
};
- DCHECK_EQ(json_prefix->value.GetStringLength(),
BinaryViewType::kPrefixSize * 2);
ARROW_ASSIGN_OR_RAISE(auto prefix, GetStringView(json_prefix->value));
RETURN_NOT_OK(ParseHexValues(prefix, out_view.ref.prefix.data()));
-
- DCHECK_LE(static_cast<size_t>(out_view.ref.buffer_index), buffers.size()
- 2);
- DCHECK_LE(static_cast<int64_t>(out_view.ref.offset) + out_view.size(),
- buffers[out_view.ref.buffer_index + 2]->size());
}
data_ = ArrayData::Make(type_, length_, std::move(buffers), null_count);