This is an automated email from the ASF dual-hosted git repository.
mrhhsg pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new e8c06f265a2 [fix](be) Escape JSONB path member control characters
(#63517)
e8c06f265a2 is described below
commit e8c06f265a24ed405bddf3646854491f221ce71d
Author: Jerry Hu <[email protected]>
AuthorDate: Wed Jun 10 15:32:33 2026 +0800
[fix](be) Escape JSONB path member control characters (#63517)
### What problem does this PR solve?
Issue Number: None
Problem Summary: JSONB path serialization emitted raw control characters
in member names. As a result, paths returned by JSON functions or shown
in diagnostics could contain raw newlines/tabs and fail to round-trip
through the path parser. This change escapes JSONB path member names with
JSON-compatible sequences, decodes those sequences while parsing path
members, and resets the per-leg escape parser state so that escaped nested
members followed by additional legs round-trip correctly.
### Release note
None
### Check List (For Author)
- Test:
- Unit Test: `./run-be-ut.sh --run
--filter=JsonbDocumentTest.jsonb_path_member_to_string_escapes_control_characters
-j 16`
- Unit Test: standalone JsonbPath nested escaped-member repro compiled
and ran successfully
- Build check: `DORIS_HOME=$PWD ninja -C be/ut_build_ASAN
test/CMakeFiles/doris_be_test.dir/core/jsonb/jsonb_document_test.cpp.o`
- Format check: `build-support/clang-format.sh`;
`build-support/check-format.sh`; `git diff --check`
- Static Analysis: attempted `build-support/run-clang-tidy.sh --base
d7d516ff7b60e2ff6971e7d18cd805478595b5c2 --build-dir be/ut_build_ASAN`,
blocked by existing clang-tidy/toolchain diagnostics outside the change
- Full `doris_be_test` link: attempted locally, but stopped because this
worktree triggered a large 1955-target rebuild
- Behavior changed: Yes. JSONB path member serialization now escapes
control characters instead of emitting raw control bytes, and escaped
nested path members followed by additional legs now parse and round-trip
correctly.
- Does this need documentation: No
---
be/src/util/jsonb_document.h | 105 +++++++++++++++++--
be/test/core/jsonb/jsonb_document_test.cpp | 113 +++++++++++++++++++++
.../sql_functions/json_functions/json_search.out | 3 +
.../json_functions/json_search.groovy | 10 ++
4 files changed, 223 insertions(+), 8 deletions(-)
diff --git a/be/src/util/jsonb_document.h b/be/src/util/jsonb_document.h
index 0300c5935f6..40c1e9792a4 100644
--- a/be/src/util/jsonb_document.h
+++ b/be/src/util/jsonb_document.h
@@ -319,10 +319,65 @@ public:
unsigned int get_leg_len() const { return leg_len; }
void remove_escapes() {
- int new_len = 0;
- for (int i = 0; i < leg_len; i++) {
- if (leg_ptr[i] != '\\') {
+ unsigned int new_len = 0;
+ for (unsigned int i = 0; i < leg_len; ++i) {
+ if (leg_ptr[i] != ESCAPE) {
leg_ptr[new_len++] = leg_ptr[i];
+ continue;
+ }
+
+ ++i;
+ if (i >= leg_len) {
+ break;
+ }
+
+ switch (leg_ptr[i]) {
+ case 'b':
+ leg_ptr[new_len++] = '\b';
+ break;
+ case 'f':
+ leg_ptr[new_len++] = '\f';
+ break;
+ case 'n':
+ leg_ptr[new_len++] = '\n';
+ break;
+ case 'r':
+ leg_ptr[new_len++] = '\r';
+ break;
+ case 't':
+ leg_ptr[new_len++] = '\t';
+ break;
+ case 'u': {
+ if (i + 4 >= leg_len || leg_ptr[i + 1] != '0' || leg_ptr[i +
2] != '0') {
+ leg_ptr[new_len++] = leg_ptr[i];
+ break;
+ }
+
+ auto hex_to_int = [](char c) -> int {
+ if (c >= '0' && c <= '9') {
+ return c - '0';
+ }
+ if (c >= 'a' && c <= 'f') {
+ return c - 'a' + 10;
+ }
+ if (c >= 'A' && c <= 'F') {
+ return c - 'A' + 10;
+ }
+ return -1;
+ };
+ int high = hex_to_int(leg_ptr[i + 3]);
+ int low = hex_to_int(leg_ptr[i + 4]);
+ if (high < 0 || low < 0) {
+ leg_ptr[new_len++] = leg_ptr[i];
+ break;
+ }
+ leg_ptr[new_len++] = static_cast<char>((high << 4) | low);
+ i += 4;
+ break;
+ }
+ default:
+ leg_ptr[new_len++] = leg_ptr[i];
+ break;
}
}
leg_ptr[new_len] = '\0';
@@ -363,19 +418,51 @@ struct leg_info {
///type: 0 is member 1 is array
unsigned int type;
+ // NOLINTNEXTLINE(readability-non-const-parameter): str is an output
parameter.
bool to_string(std::string* str) const {
if (type == MEMBER_CODE) {
str->push_back(BEGIN_MEMBER);
bool contains_space = false;
std::string tmp;
for (auto* it = leg_ptr; it != (leg_ptr + leg_len); ++it) {
- if (std::isspace(*it)) {
+ auto c = static_cast<unsigned char>(*it);
+ if (std::isspace(c)) {
contains_space = true;
- } else if (*it == '"' || *it == ESCAPE || *it == '\r' || *it
== '\n' ||
- *it == '\b' || *it == '\t') {
- tmp.push_back(ESCAPE);
}
- tmp.push_back(*it);
+
+ switch (*it) {
+ case '"':
+ tmp.append("\\\"");
+ break;
+ case ESCAPE:
+ tmp.append("\\\\");
+ break;
+ case '\b':
+ tmp.append("\\b");
+ break;
+ case '\f':
+ tmp.append("\\f");
+ break;
+ case '\n':
+ tmp.append("\\n");
+ break;
+ case '\r':
+ tmp.append("\\r");
+ break;
+ case '\t':
+ tmp.append("\\t");
+ break;
+ default:
+ if (c < 0x20) {
+ constexpr char hex[] = "0123456789abcdef";
+ tmp.append("\\u00");
+ tmp.push_back(hex[c >> 4]);
+ tmp.push_back(hex[c & 0x0F]);
+ } else {
+ tmp.push_back(*it);
+ }
+ break;
+ }
}
if (contains_space) {
str->push_back(DOUBLE_QUOTE);
@@ -414,6 +501,7 @@ public:
void pop_leg_from_leg_vector() { leg_vector.pop_back(); }
+ // NOLINTNEXTLINE(readability-non-const-parameter): res is an output
parameter.
bool to_string(std::string* res) const {
res->push_back(SCOPE);
for (const auto& leg : leg_vector) {
@@ -1426,6 +1514,7 @@ inline bool JsonbPath::seek(const char* key_path, size_t
kp_len) {
stream.skip_whitespace();
stream.clear_leg_ptr();
stream.clear_leg_len();
+ stream.set_has_escapes(false);
if (!JsonbPath::parsePath(&stream, this)) {
//path invalid
diff --git a/be/test/core/jsonb/jsonb_document_test.cpp
b/be/test/core/jsonb/jsonb_document_test.cpp
index a26f1f87e72..4801e1cbfc8 100644
--- a/be/test/core/jsonb/jsonb_document_test.cpp
+++ b/be/test/core/jsonb/jsonb_document_test.cpp
@@ -42,6 +42,17 @@ static const JsonbValue* jsonb_value(JsonbWriter& writer) {
return writer.getDocument()->getValue();
}
+static std::string remove_path_member_escapes(std::string escaped) {
+ Stream stream(escaped.data(), escaped.size());
+ stream.set_leg_ptr(escaped.data());
+ stream.clear_leg_len();
+ for (size_t i = 0; i < escaped.size(); ++i) {
+ stream.add_leg_len();
+ }
+ stream.remove_escapes();
+ return {escaped.data(), stream.get_leg_len()};
+}
+
TEST_F(JsonbDocumentTest, writer) {
JsonbWriter writer;
writer.writeStartObject();
@@ -285,6 +296,108 @@ TEST_F(JsonbDocumentTest, forobject) {
}
}
+TEST_F(JsonbDocumentTest,
jsonb_path_member_to_string_escapes_control_characters) {
+ std::string key = "a\nb\tc\rd\"e\\f";
+ key.push_back('\x01');
+ key.append("g");
+
+ leg_info leg(key.data(), static_cast<unsigned int>(key.size()), 0,
MEMBER_CODE);
+
+ std::string out;
+ ASSERT_TRUE(leg.to_string(&out));
+ EXPECT_EQ(out, ".\"a\\nb\\tc\\rd\\\"e\\\\f\\u0001g\"");
+ EXPECT_EQ(out.find('\n'), std::string::npos);
+ EXPECT_EQ(out.find('\t'), std::string::npos);
+ EXPECT_EQ(out.find('\r'), std::string::npos);
+
+ const std::string expected_path = "$" + out;
+ std::string parsed_path = expected_path;
+ JsonbPath path;
+ ASSERT_TRUE(path.seek(parsed_path.data(), parsed_path.size()));
+
+ std::string round_trip;
+ ASSERT_TRUE(path.to_string(&round_trip));
+ EXPECT_EQ(round_trip, expected_path);
+}
+
+TEST_F(JsonbDocumentTest,
jsonb_path_member_to_string_escapes_all_control_forms) {
+ std::string key = "a";
+ key.push_back('\b');
+ key.append("b");
+ key.push_back('\f');
+ key.append("c");
+ key.push_back('\0');
+ key.push_back('\x1F');
+ key.append("d");
+
+ leg_info leg(key.data(), static_cast<unsigned int>(key.size()), 0,
MEMBER_CODE);
+
+ std::string out;
+ ASSERT_TRUE(leg.to_string(&out));
+ EXPECT_EQ(out, ".\"a\\bb\\fc\\u0000\\u001fd\"");
+ EXPECT_EQ(out.find('\b'), std::string::npos);
+ EXPECT_EQ(out.find('\f'), std::string::npos);
+ EXPECT_EQ(out.find('\0'), std::string::npos);
+ EXPECT_EQ(out.find('\x1F'), std::string::npos);
+
+ const std::string expected_path = "$" + out;
+ std::string parsed_path = expected_path;
+ JsonbPath path;
+ ASSERT_TRUE(path.seek(parsed_path.data(), parsed_path.size()));
+
+ std::string round_trip;
+ ASSERT_TRUE(path.to_string(&round_trip));
+ EXPECT_EQ(round_trip, expected_path);
+}
+
+TEST_F(JsonbDocumentTest,
jsonb_path_escaped_member_round_trips_with_following_legs) {
+ const std::string expected_path = R"($."a\nb".c.d)";
+ std::string parsed_path = expected_path;
+
+ JsonbPath path;
+ ASSERT_TRUE(path.seek(parsed_path.data(), parsed_path.size()));
+ EXPECT_EQ(path.get_leg_vector_size(), 3);
+
+ std::string round_trip;
+ ASSERT_TRUE(path.to_string(&round_trip));
+ EXPECT_EQ(round_trip, expected_path);
+}
+
+TEST_F(JsonbDocumentTest, jsonb_path_remove_escapes_decodes_json_sequences) {
+ std::string expected = "a";
+ expected.push_back('\b');
+ expected.append("b");
+ expected.push_back('\f');
+ expected.append("c");
+ expected.push_back('\n');
+ expected.append("d");
+ expected.push_back('\r');
+ expected.append("e");
+ expected.push_back('\t');
+ expected.append("f\"g\\h/i");
+ expected.push_back('\0');
+ expected.append("j");
+ expected.push_back('\x1F');
+ expected.append("k");
+ expected.push_back('\x1F');
+ expected.append("l");
+
+ const auto decoded =
+
remove_path_member_escapes(R"(a\bb\fc\nd\re\tf\"g\\h\/i\u0000j\u001fk\u001Fl)");
+ EXPECT_EQ(decoded.size(), expected.size());
+ EXPECT_EQ(decoded, expected);
+}
+
+TEST_F(JsonbDocumentTest,
jsonb_path_remove_escapes_keeps_invalid_unicode_escape_body) {
+ EXPECT_EQ(remove_path_member_escapes(R"(\u0101)"), "u0101");
+ EXPECT_EQ(remove_path_member_escapes(R"(\u00g1)"), "u00g1");
+ EXPECT_EQ(remove_path_member_escapes(R"(\u00G1)"), "u00G1");
+ EXPECT_EQ(remove_path_member_escapes(R"(\u001g)"), "u001g");
+ EXPECT_EQ(remove_path_member_escapes(R"(\u00)"), "u00");
+ EXPECT_EQ(remove_path_member_escapes(R"(\q)"), "q");
+ EXPECT_EQ(remove_path_member_escapes("\\"), "");
+}
+
TEST_F(JsonbDocumentTest, invaild_jsonb_document) {
const JsonbDocument* doc = nullptr;
auto st = JsonbDocument::checkAndCreateDocument(nullptr, 0, &doc);
diff --git
a/regression-test/data/query_p0/sql_functions/json_functions/json_search.out
b/regression-test/data/query_p0/sql_functions/json_functions/json_search.out
index 99976574ff2..c5eda451e47 100644
--- a/regression-test/data/query_p0/sql_functions/json_functions/json_search.out
+++ b/regression-test/data/query_p0/sql_functions/json_functions/json_search.out
@@ -167,3 +167,6 @@
-- !search3 --
"fool"
+-- !search_control_escape --
+"$.\\"a\\\\nb\\"" "$.\\"a\\\\tb\\"" "$.\\"a\\\\rb\\""
"$.a\\\\u0001b" "needle"
+
diff --git
a/regression-test/suites/query_p0/sql_functions/json_functions/json_search.groovy
b/regression-test/suites/query_p0/sql_functions/json_functions/json_search.groovy
index 6c0ee70b856..1a4c515a340 100644
---
a/regression-test/suites/query_p0/sql_functions/json_functions/json_search.groovy
+++
b/regression-test/suites/query_p0/sql_functions/json_functions/json_search.groovy
@@ -143,4 +143,14 @@ suite("test_json_search") {
JSON_UNQUOTE(JSON_SEARCH('{ "onepotato": "foot", "one potato":
"food" , "one \\\\"potato": "fool" }','all', 'fool'))
);
"""
+
+ qt_search_control_escape """
+ select JSON_SEARCH('{ "a\\\\nb": "needle" }', 'one', 'needle'),
+ JSON_SEARCH('{ "a\\\\tb": "tab" }', 'one', 'tab'),
+ JSON_SEARCH('{ "a\\\\rb": "cr" }', 'one', 'cr'),
+ JSON_SEARCH('{ "a\\\\u0001b": "ctrl" }', 'one', 'ctrl'),
+ JSON_EXTRACT('{ "a\\\\nb": "needle" }',
+ JSON_UNQUOTE(JSON_SEARCH('{ "a\\\\nb": "needle" }', 'one',
'needle'))
+ );
+ """
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]