This is an automated email from the ASF dual-hosted git repository.

mrhhsg pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new e8c06f265a2 [fix](be) Escape JSONB path member control characters (#63517)
e8c06f265a2 is described below

commit e8c06f265a24ed405bddf3646854491f221ce71d
Author: Jerry Hu <[email protected]>
AuthorDate: Wed Jun 10 15:32:33 2026 +0800

    [fix](be) Escape JSONB path member control characters (#63517)
    
    ### What problem does this PR solve?
    
    Issue Number: None
    
    Problem Summary: JSONB path serialization emitted raw control characters
    in member names. This could make paths returned by JSON functions or
    shown in diagnostics contain raw newlines/tabs and fail path round-trip
    parsing. Escape JSONB path member names with JSON-compatible sequences,
    decode those sequences while parsing path members, and reset per-leg
    escape parser state so escaped nested members followed by additional
    legs round-trip correctly.
    
    ### Release note
    
    None
    
    ### Check List (For Author)
    
    - Test:
    - Unit Test: `./run-be-ut.sh --run --filter=JsonbDocumentTest.jsonb_path_member_to_string_escapes_control_characters -j 16`
    - Unit Test: standalone JsonbPath nested escaped-member repro compiled
    and ran successfully
    - Build check: `DORIS_HOME=$PWD ninja -C be/ut_build_ASAN
    test/CMakeFiles/doris_be_test.dir/core/jsonb/jsonb_document_test.cpp.o`
    - Format check: `build-support/clang-format.sh`;
    `build-support/check-format.sh`; `git diff --check`
    - Static Analysis: attempted `build-support/run-clang-tidy.sh --base
    d7d516ff7b60e2ff6971e7d18cd805478595b5c2 --build-dir be/ut_build_ASAN`,
    blocked by existing clang-tidy/toolchain diagnostics outside the change
    - Full `doris_be_test` link: attempted locally, but stopped because this
    worktree triggered a large 1955-target rebuild
    - Behavior changed: Yes. JSONB path member serialization now escapes
    control characters instead of emitting raw control bytes, and escaped
    nested path members followed by additional legs now parse and round-trip
    correctly.
    - Does this need documentation: No
---
 be/src/util/jsonb_document.h                       | 105 +++++++++++++++++--
 be/test/core/jsonb/jsonb_document_test.cpp         | 113 +++++++++++++++++++++
 .../sql_functions/json_functions/json_search.out   |   3 +
 .../json_functions/json_search.groovy              |  10 ++
 4 files changed, 223 insertions(+), 8 deletions(-)

diff --git a/be/src/util/jsonb_document.h b/be/src/util/jsonb_document.h
index 0300c5935f6..40c1e9792a4 100644
--- a/be/src/util/jsonb_document.h
+++ b/be/src/util/jsonb_document.h
@@ -319,10 +319,65 @@ public:
     unsigned int get_leg_len() const { return leg_len; }
 
     void remove_escapes() {
-        int new_len = 0;
-        for (int i = 0; i < leg_len; i++) {
-            if (leg_ptr[i] != '\\') {
+        unsigned int new_len = 0;
+        for (unsigned int i = 0; i < leg_len; ++i) {
+            if (leg_ptr[i] != ESCAPE) {
                 leg_ptr[new_len++] = leg_ptr[i];
+                continue;
+            }
+
+            ++i;
+            if (i >= leg_len) {
+                break;
+            }
+
+            switch (leg_ptr[i]) {
+            case 'b':
+                leg_ptr[new_len++] = '\b';
+                break;
+            case 'f':
+                leg_ptr[new_len++] = '\f';
+                break;
+            case 'n':
+                leg_ptr[new_len++] = '\n';
+                break;
+            case 'r':
+                leg_ptr[new_len++] = '\r';
+                break;
+            case 't':
+                leg_ptr[new_len++] = '\t';
+                break;
+            case 'u': {
+                if (i + 4 >= leg_len || leg_ptr[i + 1] != '0' || leg_ptr[i + 
2] != '0') {
+                    leg_ptr[new_len++] = leg_ptr[i];
+                    break;
+                }
+
+                auto hex_to_int = [](char c) -> int {
+                    if (c >= '0' && c <= '9') {
+                        return c - '0';
+                    }
+                    if (c >= 'a' && c <= 'f') {
+                        return c - 'a' + 10;
+                    }
+                    if (c >= 'A' && c <= 'F') {
+                        return c - 'A' + 10;
+                    }
+                    return -1;
+                };
+                int high = hex_to_int(leg_ptr[i + 3]);
+                int low = hex_to_int(leg_ptr[i + 4]);
+                if (high < 0 || low < 0) {
+                    leg_ptr[new_len++] = leg_ptr[i];
+                    break;
+                }
+                leg_ptr[new_len++] = static_cast<char>((high << 4) | low);
+                i += 4;
+                break;
+            }
+            default:
+                leg_ptr[new_len++] = leg_ptr[i];
+                break;
             }
         }
         leg_ptr[new_len] = '\0';
@@ -363,19 +418,51 @@ struct leg_info {
     ///type: 0 is member 1 is array
     unsigned int type;
 
+    // NOLINTNEXTLINE(readability-non-const-parameter): str is an output 
parameter.
     bool to_string(std::string* str) const {
         if (type == MEMBER_CODE) {
             str->push_back(BEGIN_MEMBER);
             bool contains_space = false;
             std::string tmp;
             for (auto* it = leg_ptr; it != (leg_ptr + leg_len); ++it) {
-                if (std::isspace(*it)) {
+                auto c = static_cast<unsigned char>(*it);
+                if (std::isspace(c)) {
                     contains_space = true;
-                } else if (*it == '"' || *it == ESCAPE || *it == '\r' || *it 
== '\n' ||
-                           *it == '\b' || *it == '\t') {
-                    tmp.push_back(ESCAPE);
                 }
-                tmp.push_back(*it);
+
+                switch (*it) {
+                case '"':
+                    tmp.append("\\\"");
+                    break;
+                case ESCAPE:
+                    tmp.append("\\\\");
+                    break;
+                case '\b':
+                    tmp.append("\\b");
+                    break;
+                case '\f':
+                    tmp.append("\\f");
+                    break;
+                case '\n':
+                    tmp.append("\\n");
+                    break;
+                case '\r':
+                    tmp.append("\\r");
+                    break;
+                case '\t':
+                    tmp.append("\\t");
+                    break;
+                default:
+                    if (c < 0x20) {
+                        constexpr char hex[] = "0123456789abcdef";
+                        tmp.append("\\u00");
+                        tmp.push_back(hex[c >> 4]);
+                        tmp.push_back(hex[c & 0x0F]);
+                    } else {
+                        tmp.push_back(*it);
+                    }
+                    break;
+                }
             }
             if (contains_space) {
                 str->push_back(DOUBLE_QUOTE);
@@ -414,6 +501,7 @@ public:
 
     void pop_leg_from_leg_vector() { leg_vector.pop_back(); }
 
+    // NOLINTNEXTLINE(readability-non-const-parameter): res is an output 
parameter.
     bool to_string(std::string* res) const {
         res->push_back(SCOPE);
         for (const auto& leg : leg_vector) {
@@ -1426,6 +1514,7 @@ inline bool JsonbPath::seek(const char* key_path, size_t 
kp_len) {
         stream.skip_whitespace();
         stream.clear_leg_ptr();
         stream.clear_leg_len();
+        stream.set_has_escapes(false);
 
         if (!JsonbPath::parsePath(&stream, this)) {
             //path invalid
diff --git a/be/test/core/jsonb/jsonb_document_test.cpp 
b/be/test/core/jsonb/jsonb_document_test.cpp
index a26f1f87e72..4801e1cbfc8 100644
--- a/be/test/core/jsonb/jsonb_document_test.cpp
+++ b/be/test/core/jsonb/jsonb_document_test.cpp
@@ -42,6 +42,17 @@ static const JsonbValue* jsonb_value(JsonbWriter& writer) {
     return writer.getDocument()->getValue();
 }
 
+static std::string remove_path_member_escapes(std::string escaped) {
+    Stream stream(escaped.data(), escaped.size());
+    stream.set_leg_ptr(escaped.data());
+    stream.clear_leg_len();
+    for (size_t i = 0; i < escaped.size(); ++i) {
+        stream.add_leg_len();
+    }
+    stream.remove_escapes();
+    return {escaped.data(), stream.get_leg_len()};
+}
+
 TEST_F(JsonbDocumentTest, writer) {
     JsonbWriter writer;
     writer.writeStartObject();
@@ -285,6 +296,108 @@ TEST_F(JsonbDocumentTest, forobject) {
     }
 }
 
+TEST_F(JsonbDocumentTest, 
jsonb_path_member_to_string_escapes_control_characters) {
+    std::string key = "a\nb\tc\rd\"e\\f";
+    key.push_back('\x01');
+    key.append("g");
+
+    leg_info leg(key.data(), static_cast<unsigned int>(key.size()), 0, 
MEMBER_CODE);
+
+    std::string out;
+    ASSERT_TRUE(leg.to_string(&out));
+    EXPECT_EQ(out, ".\"a\\nb\\tc\\rd\\\"e\\\\f\\u0001g\"");
+    EXPECT_EQ(out.find('\n'), std::string::npos);
+    EXPECT_EQ(out.find('\t'), std::string::npos);
+    EXPECT_EQ(out.find('\r'), std::string::npos);
+
+    const std::string expected_path = "$" + out;
+    std::string parsed_path = expected_path;
+    JsonbPath path;
+    ASSERT_TRUE(path.seek(parsed_path.data(), parsed_path.size()));
+
+    std::string round_trip;
+    ASSERT_TRUE(path.to_string(&round_trip));
+    EXPECT_EQ(round_trip, expected_path);
+}
+
+TEST_F(JsonbDocumentTest, 
jsonb_path_member_to_string_escapes_all_control_forms) {
+    std::string key = "a";
+    key.push_back('\b');
+    key.append("b");
+    key.push_back('\f');
+    key.append("c");
+    key.push_back('\0');
+    key.push_back('\x1F');
+    key.append("d");
+
+    leg_info leg(key.data(), static_cast<unsigned int>(key.size()), 0, 
MEMBER_CODE);
+
+    std::string out;
+    ASSERT_TRUE(leg.to_string(&out));
+    EXPECT_EQ(out, ".\"a\\bb\\fc\\u0000\\u001fd\"");
+    EXPECT_EQ(out.find('\b'), std::string::npos);
+    EXPECT_EQ(out.find('\f'), std::string::npos);
+    EXPECT_EQ(out.find('\0'), std::string::npos);
+    EXPECT_EQ(out.find('\x1F'), std::string::npos);
+
+    const std::string expected_path = "$" + out;
+    std::string parsed_path = expected_path;
+    JsonbPath path;
+    ASSERT_TRUE(path.seek(parsed_path.data(), parsed_path.size()));
+
+    std::string round_trip;
+    ASSERT_TRUE(path.to_string(&round_trip));
+    EXPECT_EQ(round_trip, expected_path);
+}
+
+TEST_F(JsonbDocumentTest, 
jsonb_path_escaped_member_round_trips_with_following_legs) {
+    const std::string expected_path = R"($."a\nb".c.d)";
+    std::string parsed_path = expected_path;
+
+    JsonbPath path;
+    ASSERT_TRUE(path.seek(parsed_path.data(), parsed_path.size()));
+    EXPECT_EQ(path.get_leg_vector_size(), 3);
+
+    std::string round_trip;
+    ASSERT_TRUE(path.to_string(&round_trip));
+    EXPECT_EQ(round_trip, expected_path);
+}
+
+TEST_F(JsonbDocumentTest, jsonb_path_remove_escapes_decodes_json_sequences) {
+    std::string expected = "a";
+    expected.push_back('\b');
+    expected.append("b");
+    expected.push_back('\f');
+    expected.append("c");
+    expected.push_back('\n');
+    expected.append("d");
+    expected.push_back('\r');
+    expected.append("e");
+    expected.push_back('\t');
+    expected.append("f\"g\\h/i");
+    expected.push_back('\0');
+    expected.append("j");
+    expected.push_back('\x1F');
+    expected.append("k");
+    expected.push_back('\x1F');
+    expected.append("l");
+
+    const auto decoded =
+            
remove_path_member_escapes(R"(a\bb\fc\nd\re\tf\"g\\h\/i\u0000j\u001fk\u001Fl)");
+    EXPECT_EQ(decoded.size(), expected.size());
+    EXPECT_EQ(decoded, expected);
+}
+
+TEST_F(JsonbDocumentTest, 
jsonb_path_remove_escapes_keeps_invalid_unicode_escape_body) {
+    EXPECT_EQ(remove_path_member_escapes(R"(\u0101)"), "u0101");
+    EXPECT_EQ(remove_path_member_escapes(R"(\u00g1)"), "u00g1");
+    EXPECT_EQ(remove_path_member_escapes(R"(\u00G1)"), "u00G1");
+    EXPECT_EQ(remove_path_member_escapes(R"(\u001g)"), "u001g");
+    EXPECT_EQ(remove_path_member_escapes(R"(\u00)"), "u00");
+    EXPECT_EQ(remove_path_member_escapes(R"(\q)"), "q");
+    EXPECT_EQ(remove_path_member_escapes("\\"), "");
+}
+
 TEST_F(JsonbDocumentTest, invaild_jsonb_document) {
     const JsonbDocument* doc = nullptr;
     auto st = JsonbDocument::checkAndCreateDocument(nullptr, 0, &doc);
diff --git 
a/regression-test/data/query_p0/sql_functions/json_functions/json_search.out 
b/regression-test/data/query_p0/sql_functions/json_functions/json_search.out
index 99976574ff2..c5eda451e47 100644
--- a/regression-test/data/query_p0/sql_functions/json_functions/json_search.out
+++ b/regression-test/data/query_p0/sql_functions/json_functions/json_search.out
@@ -167,3 +167,6 @@
 -- !search3 --
 "fool"
 
+-- !search_control_escape --
+"$.\\"a\\\\nb\\""      "$.\\"a\\\\tb\\""       "$.\\"a\\\\rb\\""       
"$.a\\\\u0001b" "needle"
+
diff --git 
a/regression-test/suites/query_p0/sql_functions/json_functions/json_search.groovy
 
b/regression-test/suites/query_p0/sql_functions/json_functions/json_search.groovy
index 6c0ee70b856..1a4c515a340 100644
--- 
a/regression-test/suites/query_p0/sql_functions/json_functions/json_search.groovy
+++ 
b/regression-test/suites/query_p0/sql_functions/json_functions/json_search.groovy
@@ -143,4 +143,14 @@ suite("test_json_search") {
             JSON_UNQUOTE(JSON_SEARCH('{ "onepotato": "foot", "one potato": 
"food" , "one \\\\"potato": "fool" }','all', 'fool'))
         );
     """
+
+    qt_search_control_escape """
+        select JSON_SEARCH('{ "a\\\\nb": "needle" }', 'one', 'needle'),
+               JSON_SEARCH('{ "a\\\\tb": "tab" }', 'one', 'tab'),
+               JSON_SEARCH('{ "a\\\\rb": "cr" }', 'one', 'cr'),
+               JSON_SEARCH('{ "a\\\\u0001b": "ctrl" }', 'one', 'ctrl'),
+               JSON_EXTRACT('{ "a\\\\nb": "needle" }',
+                   JSON_UNQUOTE(JSON_SEARCH('{ "a\\\\nb": "needle" }', 'one', 
'needle'))
+               );
+    """
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to