This is an automated email from the ASF dual-hosted git repository.

morrySnow pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 22c9ed691d5 [fix](fe) Reject lone UTF-16 surrogates in JSONB literals 
(RFC 8259 §8.2) (#63255)
22c9ed691d5 is described below

commit 22c9ed691d5c0708d20d6daf357bbd1c1ea97168
Author: morrySnow <[email protected]>
AuthorDate: Mon May 18 15:00:26 2026 +0800

    [fix](fe) Reject lone UTF-16 surrogates in JSONB literals (RFC 8259 §8.2) 
(#63255)
    
    ## Summary
    
    **Problem fixed:** `JsonLiteral` (Nereids/Jackson path) and
    `analysis.JsonLiteral` (legacy/Gson path) silently accepted lone UTF-16
    surrogates (e.g. `'"\uD800"'::JSONB`) as valid JSONB literals. RFC 8259
    §8.2 warns that unpaired surrogates in JSON strings do not encode Unicode
    characters and make the behavior of receiving software unpredictable;
    they also cannot be represented as valid UTF-8.
    
    **How it was fixed:** Added a recursive `validateNoLoneSurrogate`
    post-parse check in both `JsonLiteral` constructors. After Jackson/Gson
    parses the JSON tree, the method walks all string values and object
    field names and immediately throws `AnalysisException` on any lone high
    or low surrogate.
    
    ## What problem does this PR solve?
    
    **Before this fix:** Passing a lone surrogate like `'"\uD800"'::JSONB`
    was silently accepted at the FE layer. The invalid value would be stored
    in the BE JSONB column. The error would only surface later — during
    `EXPORT`, `SELECT INTO OUTFILE`, or cross-system transfer — making it
    hard to diagnose. This is a data-correctness (SEV-2) issue.
    
    **After this fix:** Constructing a `JsonLiteral` with a lone surrogate
    immediately throws `AnalysisException: Invalid jsonb literal: JSON
    string contains lone high surrogate` (or `lone low surrogate`), giving
    the user a clear error at write time.
    
    ## Behavior change
    
    | Scenario | Before | After |
    |---|---|---|
    | `'"\uD800"'::JSONB` | Accepted silently | AnalysisException at parse time |
    | `INSERT INTO t VALUES (1, '"\uD800"')` | Stored in BE, may fail on export | AnalysisException at FE |
    | `'"\uD83D\uDE00"'::JSONB` (valid pair 😀) | Accepted | Still accepted (no change) |
    | `'"hello"'::JSONB` (plain ASCII) | Accepted | Still accepted (no change) |
    
    ## Why both paths?
    
    Doris has two `JsonLiteral` implementations:
    - **Nereids** (`fe-core`): uses Jackson `ObjectMapper.readTree` —
    Jackson accepts lone surrogates by default
    - **Legacy** (`fe-catalog`, `analysis`): uses Gson `JsonParser.parse` —
    Gson also accepts lone surrogates by default
    
    Both needed the same fix to ensure consistent behavior regardless of
    which query path is used.
    
    ## Release note
    
    JSONB literal expressions now reject strings containing lone UTF-16
    surrogates (e.g. `'"\uD800"'::JSONB`) with an `AnalysisException` at
    parse time, conforming to RFC 8259 §8.2. Previously such literals were
    silently accepted, which could cause errors during export or
    cross-system data transfer.
    
    ---------
    
    Co-authored-by: Copilot <[email protected]>
---
 .../org/apache/doris/analysis/JsonLiteral.java     |  39 ++++++-
 .../trees/expressions/literal/JsonLiteral.java     |  39 ++++++-
 .../trees/expressions/literal/JsonLiteralTest.java | 124 +++++++++++++++++++++
 3 files changed, 200 insertions(+), 2 deletions(-)

diff --git 
a/fe/fe-catalog/src/main/java/org/apache/doris/analysis/JsonLiteral.java 
b/fe/fe-catalog/src/main/java/org/apache/doris/analysis/JsonLiteral.java
index 26af45d4bcf..f4ad4ab419b 100644
--- a/fe/fe-catalog/src/main/java/org/apache/doris/analysis/JsonLiteral.java
+++ b/fe/fe-catalog/src/main/java/org/apache/doris/analysis/JsonLiteral.java
@@ -20,10 +20,12 @@ package org.apache.doris.analysis;
 import org.apache.doris.catalog.Type;
 import org.apache.doris.common.AnalysisException;
 
+import com.google.gson.JsonElement;
 import com.google.gson.JsonParser;
 import com.google.gson.JsonSyntaxException;
 import com.google.gson.annotations.SerializedName;
 
+import java.util.Map;
 import java.util.Objects;
 
 public class JsonLiteral extends LiteralExpr {
@@ -41,7 +43,8 @@ public class JsonLiteral extends LiteralExpr {
 
     public JsonLiteral(String value) throws AnalysisException {
         try {
-            parser.parse(value);
+            JsonElement element = parser.parse(value);
+            validateNoLoneSurrogate(element);
         } catch (JsonSyntaxException e) {
             throw new AnalysisException("Invalid jsonb literal: " + 
e.getMessage());
         }
@@ -50,6 +53,40 @@ public class JsonLiteral extends LiteralExpr {
         this.nullable = false;
     }
 
+    // Recursively rejects lone UTF-16 surrogates in a parsed JSON tree (see RFC 8259 §8.2).
+    // Gson accepts them by default, so we validate after parsing.
+    // String values and object field names are checked; objects and arrays are walked recursively.
+    private static void validateNoLoneSurrogate(JsonElement element) throws 
AnalysisException {
+        if (element.isJsonPrimitive() && 
element.getAsJsonPrimitive().isString()) { // string leaf: check its text
+            validateNoLoneSurrogateInString(element.getAsString());
+        } else if (element.isJsonObject()) { // object: check each field name, then recurse into its value
+            for (Map.Entry<String, JsonElement> entry : 
element.getAsJsonObject().entrySet()) {
+                validateNoLoneSurrogateInString(entry.getKey());
+                validateNoLoneSurrogate(entry.getValue());
+            }
+        } else if (element.isJsonArray()) { // array: recurse into each item
+            for (JsonElement child : element.getAsJsonArray()) {
+                validateNoLoneSurrogate(child);
+            }
+        } // any other element (e.g. a non-string primitive) is left unchecked
+    }
+
+    private static void validateNoLoneSurrogateInString(String s) throws 
AnalysisException { // throws if s contains an unpaired UTF-16 surrogate
+        for (int i = 0; i < s.length(); i++) { // scan one UTF-16 code unit at a time
+            char c = s.charAt(i);
+            if (Character.isHighSurrogate(c)) { // a high surrogate needs a low surrogate right after it
+                if (i + 1 >= s.length() || 
!Character.isLowSurrogate(s.charAt(i + 1))) { // end of string, or next unit is not a low surrogate
+                    throw new AnalysisException(
+                            "Invalid jsonb literal: JSON string contains lone 
high surrogate");
+                }
+                i++; // skip the paired low surrogate
+            } else if (Character.isLowSurrogate(c)) { // a low surrogate reached here had no preceding high surrogate
+                throw new AnalysisException(
+                        "Invalid jsonb literal: JSON string contains lone low 
surrogate");
+            }
+        }
+    }
+
     protected JsonLiteral(JsonLiteral other) {
         super(other);
         value = other.value;
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/JsonLiteral.java
 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/JsonLiteral.java
index 4c4c7dced4c..b563b430893 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/JsonLiteral.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/JsonLiteral.java
@@ -27,6 +27,9 @@ import com.fasterxml.jackson.core.JsonProcessingException;
 import com.fasterxml.jackson.databind.JsonNode;
 import com.fasterxml.jackson.databind.ObjectMapper;
 
+import java.util.Iterator;
+import java.util.Map;
+
 /**
  * literal for json type.
  */
@@ -49,8 +52,42 @@ public class JsonLiteral extends Literal {
         }
         if (jsonNode == null || jsonNode.isMissingNode()) {
             throw new AnalysisException("Invalid jsonb literal: ''");
+        }
+        validateNoLoneSurrogate(jsonNode);
+        this.value = jsonNode.toString();
+    }
+
+    // Recursively rejects lone UTF-16 surrogates in a parsed JSON tree (see RFC 8259 §8.2).
+    // Jackson accepts them by default, so we validate after parsing.
+    // String values and object field names are checked; child nodes are walked recursively.
+    private static void validateNoLoneSurrogate(JsonNode node) {
+        if (node.isTextual()) { // string leaf: check its text
+            validateNoLoneSurrogateInString(node.textValue());
+        } else if (node.isObject()) { // object: check each field name, then recurse into its value
+            Iterator<Map.Entry<String, JsonNode>> fields = node.fields();
+            while (fields.hasNext()) {
+                Map.Entry<String, JsonNode> entry = fields.next();
+                validateNoLoneSurrogateInString(entry.getKey());
+                validateNoLoneSurrogate(entry.getValue());
+            }
         } else {
-            this.value = jsonNode.toString();
+            node.forEach(JsonLiteral::validateNoLoneSurrogate); // recurse into child nodes (array items); presumably a no-op for scalar nodes
+        }
+    }
+
+    private static void validateNoLoneSurrogateInString(String s) { // throws AnalysisException if s contains an unpaired UTF-16 surrogate
+        for (int i = 0; i < s.length(); i++) { // scan one UTF-16 code unit at a time
+            char c = s.charAt(i);
+            if (Character.isHighSurrogate(c)) { // a high surrogate needs a low surrogate right after it
+                if (i + 1 >= s.length() || 
!Character.isLowSurrogate(s.charAt(i + 1))) { // end of string, or next unit is not a low surrogate
+                    throw new AnalysisException(
+                            "Invalid jsonb literal: JSON string contains lone 
high surrogate");
+                }
+                i++; // skip the paired low surrogate
+            } else if (Character.isLowSurrogate(c)) { // a low surrogate reached here had no preceding high surrogate
+                throw new AnalysisException(
+                        "Invalid jsonb literal: JSON string contains lone low 
surrogate");
+            }
         }
     }
 
diff --git 
a/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/expressions/literal/JsonLiteralTest.java
 
b/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/expressions/literal/JsonLiteralTest.java
new file mode 100644
index 00000000000..6e16ea9805e
--- /dev/null
+++ 
b/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/expressions/literal/JsonLiteralTest.java
@@ -0,0 +1,124 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.nereids.trees.expressions.literal;
+
+import org.apache.doris.nereids.exceptions.AnalysisException;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Tests for lone UTF-16 surrogate validation in the Nereids JsonLiteral constructor (RFC 8259 §8.2).
+ */
+public class JsonLiteralTest {
+
+    // --- valid inputs ---
+
+    @Test
+    public void testValidAsciiString() {
+        // plain ASCII string in JSON is always valid
+        Assertions.assertDoesNotThrow(() -> new JsonLiteral("\"hello\""));
+    }
+
+    @Test
+    public void testValidObject() {
+        Assertions.assertDoesNotThrow(() -> new 
JsonLiteral("{\"key\":\"value\"}"));
+    }
+
+    @Test
+    public void testValidArray() {
+        Assertions.assertDoesNotThrow(() -> new JsonLiteral("[1, \"abc\", 
true]"));
+    }
+
+    @Test
+    public void testValidSurrogatePair() {
+        // \uD83D\uDE00 is a valid surrogate pair (U+1F600, 😀)
+        // JSON escape: "\uD83D\uDE00"
+        Assertions.assertDoesNotThrow(() -> new 
JsonLiteral("\"\\uD83D\\uDE00\""));
+    }
+
+    @Test
+    public void testValidSurrogatePairInObject() { // same valid pair, nested as an object value
+        Assertions.assertDoesNotThrow(() -> new 
JsonLiteral("{\"emoji\":\"\\uD83D\\uDE00\"}"));
+    }
+
+    // --- lone high surrogate ---
+
+    @Test
+    public void testLoneHighSurrogateTopLevel() {
+        // "\uD800" — lone high surrogate, no paired low surrogate
+        AnalysisException ex = Assertions.assertThrows(AnalysisException.class,
+                () -> new JsonLiteral("\"\\uD800\""));
+        Assertions.assertTrue(ex.getMessage().contains("lone high surrogate"),
+                "Expected 'lone high surrogate' in: " + ex.getMessage());
+    }
+
+    @Test
+    public void testLoneHighSurrogateInObject() { // lone high surrogate as an object value
+        AnalysisException ex = Assertions.assertThrows(AnalysisException.class,
+                () -> new JsonLiteral("{\"k\":\"\\uD800\"}"));
+        Assertions.assertTrue(ex.getMessage().contains("lone high surrogate"));
+    }
+
+    @Test
+    public void testLoneHighSurrogateInArray() { // lone high surrogate as an array element
+        AnalysisException ex = Assertions.assertThrows(AnalysisException.class,
+                () -> new JsonLiteral("[\"\\uD800\"]"));
+        Assertions.assertTrue(ex.getMessage().contains("lone high surrogate"));
+    }
+
+    @Test
+    public void testHighSurrogateFollowedByNonLow() {
+        // \uD800\u0041 — high surrogate followed by 'A', not a low surrogate
+        AnalysisException ex = Assertions.assertThrows(AnalysisException.class,
+                () -> new JsonLiteral("\"\\uD800A\""));
+        Assertions.assertTrue(ex.getMessage().contains("lone high surrogate"));
+    }
+
+    // --- lone low surrogate, followed by object-key cases ---
+
+    @Test
+    public void testLoneLowSurrogateTopLevel() {
+        // "\uDC00" — lone low surrogate
+        AnalysisException ex = Assertions.assertThrows(AnalysisException.class,
+                () -> new JsonLiteral("\"\\uDC00\""));
+        Assertions.assertTrue(ex.getMessage().contains("lone low surrogate"),
+                "Expected 'lone low surrogate' in: " + ex.getMessage());
+    }
+
+    @Test
+    public void testLoneHighSurrogateInObjectKey() {
+        // lone surrogate in object field name must also be rejected
+        AnalysisException ex = Assertions.assertThrows(AnalysisException.class,
+                () -> new JsonLiteral("{\"\\uD800\":\"value\"}"));
+        Assertions.assertTrue(ex.getMessage().contains("lone high surrogate"));
+    }
+
+    @Test
+    public void testLoneLowSurrogateInObjectKey() { // lone low surrogate as an object field name
+        AnalysisException ex = Assertions.assertThrows(AnalysisException.class,
+                () -> new JsonLiteral("{\"\\uDC00\":\"value\"}"));
+        Assertions.assertTrue(ex.getMessage().contains("lone low surrogate"));
+    }
+
+    @Test
+    public void testValidSurrogatePairInObjectKey() {
+        // valid surrogate pair in key must be accepted
+        Assertions.assertDoesNotThrow(() -> new 
JsonLiteral("{\"\\uD83D\\uDE00\":\"ok\"}"));
+    }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to