В Пн, 13/03/2023 в 13:58 -0400, Tom Lane пишет:
> Nikolay Shaplov <dh...@nataraj.su> writes:
> > I found a bug in jsonb_in function (it converts json from string 
> > representation
> >  into jsonb internal representation).
> 
> Yeah.  Looks like json_lex_string is failing to honor the invariant
> that it needs to set token_terminator ... although the documentation
> of the function certainly isn't helping.  I think we need the attached.
> 
> A nice side benefit is that the error context reports get a lot more
> useful --- somebody should have inquired before as to why they were
> so bogus.
> 
>                         regards, tom lane
> 

Good day, Tom and all.

The merged patch looks like the start of a refactoring.

My colleague (Nikita Glukhov) proposes further refactoring of jsonapi.c:
- use inline functions instead of macros,
- make their usage in token success and error reporting more uniform,
- simplify json_lex_number and its usage a bit.
He also added tests for the fixed bug.


-----

Regards,
Yura Sokolov.
From 757a314d5fa9c6ba8334762b4a080763f02244c5 Mon Sep 17 00:00:00 2001
From: Nikita Glukhov <n.glu...@postgrespro.ru>
Date: Tue, 14 Mar 2023 12:15:48 +0300
Subject: [PATCH] Refactor JSON lexer error reporting

Also add special tests for the already-fixed bug and for multibyte characters after surrogates.
---
 src/common/jsonapi.c                          | 246 +++++++-----------
 src/test/regress/expected/json_encoding.out   |  25 ++
 src/test/regress/expected/json_encoding_1.out |  23 ++
 src/test/regress/sql/json_encoding.sql        |   7 +
 4 files changed, 149 insertions(+), 152 deletions(-)

diff --git a/src/common/jsonapi.c b/src/common/jsonapi.c
index 2e86589cfd8..d4a9d8f0378 100644
--- a/src/common/jsonapi.c
+++ b/src/common/jsonapi.c
@@ -44,8 +44,7 @@ typedef enum					/* contexts of JSON parser */
 } JsonParseContext;
 
 static inline JsonParseErrorType json_lex_string(JsonLexContext *lex);
-static inline JsonParseErrorType json_lex_number(JsonLexContext *lex, char *s,
-												 bool *num_err, int *total_len);
+static inline JsonParseErrorType json_lex_number(JsonLexContext *lex, char *s);
 static inline JsonParseErrorType parse_scalar(JsonLexContext *lex, JsonSemAction *sem);
 static JsonParseErrorType parse_object_field(JsonLexContext *lex, JsonSemAction *sem);
 static JsonParseErrorType parse_object(JsonLexContext *lex, JsonSemAction *sem);
@@ -104,8 +103,6 @@ lex_expect(JsonParseContext ctx, JsonLexContext *lex, JsonTokenType token)
 bool
 IsValidJsonNumber(const char *str, int len)
 {
-	bool		numeric_error;
-	int			total_len;
 	JsonLexContext dummy_lex;
 
 	if (len <= 0)
@@ -128,9 +125,8 @@ IsValidJsonNumber(const char *str, int len)
 		dummy_lex.input_length = len;
 	}
 
-	json_lex_number(&dummy_lex, dummy_lex.input, &numeric_error, &total_len);
-
-	return (!numeric_error) && (total_len == dummy_lex.input_length);
+	return json_lex_number(&dummy_lex, dummy_lex.input) == JSON_SUCCESS &&
+		dummy_lex.token_terminator == dummy_lex.input + dummy_lex.input_length;
 }
 
 /*
@@ -545,6 +541,37 @@ parse_array(JsonLexContext *lex, JsonSemAction *sem)
 	return JSON_SUCCESS;
 }
 
+/* Convenience inline functions for success/error exits from lexer */
+static inline JsonParseErrorType
+json_lex_success(JsonLexContext *lex, char *token_terminator,
+				 JsonTokenType token_type)
+{
+	lex->prev_token_terminator = lex->token_terminator;
+	lex->token_terminator = token_terminator;
+	lex->token_type = token_type;
+
+	return JSON_SUCCESS;
+}
+
+static inline JsonParseErrorType
+json_lex_fail_at_char_start(JsonLexContext *lex, char *s,
+							JsonParseErrorType code)
+{
+	lex->token_terminator = s;
+
+	return code;
+}
+
+static inline JsonParseErrorType
+json_lex_fail_at_char_end(JsonLexContext *lex, char *s,
+						  JsonParseErrorType code)
+{
+	lex->token_terminator =
+		s + pg_encoding_mblen_bounded(lex->input_encoding, s);
+
+	return code;
+}
+
 /*
  * Lex one token from the input stream.
  */
@@ -553,7 +580,6 @@ json_lex(JsonLexContext *lex)
 {
 	char	   *s;
 	char	   *const end = lex->input + lex->input_length;
-	JsonParseErrorType result;
 
 	/* Skip leading whitespace. */
 	s = lex->token_terminator;
@@ -571,9 +597,7 @@ json_lex(JsonLexContext *lex)
 	if (s >= end)
 	{
 		lex->token_start = NULL;
-		lex->prev_token_terminator = lex->token_terminator;
-		lex->token_terminator = s;
-		lex->token_type = JSON_TOKEN_END;
+		return json_lex_success(lex, s, JSON_TOKEN_END);
 	}
 	else
 	{
@@ -581,49 +605,31 @@ json_lex(JsonLexContext *lex)
 		{
 				/* Single-character token, some kind of punctuation mark. */
 			case '{':
-				lex->prev_token_terminator = lex->token_terminator;
-				lex->token_terminator = s + 1;
-				lex->token_type = JSON_TOKEN_OBJECT_START;
-				break;
+				return json_lex_success(lex, s + 1, JSON_TOKEN_OBJECT_START);
+
 			case '}':
-				lex->prev_token_terminator = lex->token_terminator;
-				lex->token_terminator = s + 1;
-				lex->token_type = JSON_TOKEN_OBJECT_END;
-				break;
+				return json_lex_success(lex, s + 1, JSON_TOKEN_OBJECT_END);
+
 			case '[':
-				lex->prev_token_terminator = lex->token_terminator;
-				lex->token_terminator = s + 1;
-				lex->token_type = JSON_TOKEN_ARRAY_START;
-				break;
+				return json_lex_success(lex, s + 1, JSON_TOKEN_ARRAY_START);
+
 			case ']':
-				lex->prev_token_terminator = lex->token_terminator;
-				lex->token_terminator = s + 1;
-				lex->token_type = JSON_TOKEN_ARRAY_END;
-				break;
+				return json_lex_success(lex, s + 1, JSON_TOKEN_ARRAY_END);
+
 			case ',':
-				lex->prev_token_terminator = lex->token_terminator;
-				lex->token_terminator = s + 1;
-				lex->token_type = JSON_TOKEN_COMMA;
-				break;
+				return json_lex_success(lex, s + 1, JSON_TOKEN_COMMA);
+
 			case ':':
-				lex->prev_token_terminator = lex->token_terminator;
-				lex->token_terminator = s + 1;
-				lex->token_type = JSON_TOKEN_COLON;
-				break;
+				return json_lex_success(lex, s + 1, JSON_TOKEN_COLON);
+
 			case '"':
 				/* string */
-				result = json_lex_string(lex);
-				if (result != JSON_SUCCESS)
-					return result;
-				lex->token_type = JSON_TOKEN_STRING;
-				break;
+				return json_lex_string(lex);
+
 			case '-':
 				/* Negative number. */
-				result = json_lex_number(lex, s + 1, NULL, NULL);
-				if (result != JSON_SUCCESS)
-					return result;
-				lex->token_type = JSON_TOKEN_NUMBER;
-				break;
+				return json_lex_number(lex, s + 1);
+
 			case '0':
 			case '1':
 			case '2':
@@ -635,11 +641,8 @@ json_lex(JsonLexContext *lex)
 			case '8':
 			case '9':
 				/* Positive number. */
-				result = json_lex_number(lex, s, NULL, NULL);
-				if (result != JSON_SUCCESS)
-					return result;
-				lex->token_type = JSON_TOKEN_NUMBER;
-				break;
+				return json_lex_number(lex, s);
+
 			default:
 				{
 					char	   *p;
@@ -662,37 +665,27 @@ json_lex(JsonLexContext *lex)
 					 * that one character.
 					 */
 					if (p == s)
-					{
-						lex->prev_token_terminator = lex->token_terminator;
-						lex->token_terminator = s + 1;
-						return JSON_INVALID_TOKEN;
-					}
+						return json_lex_fail_at_char_start(lex, s + 1, JSON_INVALID_TOKEN);
 
 					/*
 					 * We've got a real alphanumeric token here.  If it
 					 * happens to be true, false, or null, all is well.  If
 					 * not, error out.
 					 */
-					lex->prev_token_terminator = lex->token_terminator;
-					lex->token_terminator = p;
 					if (p - s == 4)
 					{
 						if (memcmp(s, "true", 4) == 0)
-							lex->token_type = JSON_TOKEN_TRUE;
+							return json_lex_success(lex, p, JSON_TOKEN_TRUE);
 						else if (memcmp(s, "null", 4) == 0)
-							lex->token_type = JSON_TOKEN_NULL;
-						else
-							return JSON_INVALID_TOKEN;
+							return json_lex_success(lex, p, JSON_TOKEN_NULL);
 					}
 					else if (p - s == 5 && memcmp(s, "false", 5) == 0)
-						lex->token_type = JSON_TOKEN_FALSE;
-					else
-						return JSON_INVALID_TOKEN;
+						return json_lex_success(lex, p, JSON_TOKEN_FALSE);
+
+					return json_lex_fail_at_char_start(lex, p, JSON_INVALID_TOKEN);
 				}
 		}						/* end of switch */
 	}
-
-	return JSON_SUCCESS;
 }
 
 /*
@@ -713,19 +706,6 @@ json_lex_string(JsonLexContext *lex)
 	char	   *const end = lex->input + lex->input_length;
 	int			hi_surrogate = -1;
 
-	/* Convenience macros for error exits */
-#define FAIL_AT_CHAR_START(code) \
-	do { \
-		lex->token_terminator = s; \
-		return code; \
-	} while (0)
-#define FAIL_AT_CHAR_END(code) \
-	do { \
-		lex->token_terminator = \
-			s + pg_encoding_mblen_bounded(lex->input_encoding, s); \
-		return code; \
-	} while (0)
-
 	if (lex->strval != NULL)
 		resetStringInfo(lex->strval);
 
@@ -736,7 +716,7 @@ json_lex_string(JsonLexContext *lex)
 		s++;
 		/* Premature end of the string. */
 		if (s >= end)
-			FAIL_AT_CHAR_START(JSON_INVALID_TOKEN);
+			return json_lex_fail_at_char_start(lex, s, JSON_INVALID_TOKEN);
 		else if (*s == '"')
 			break;
 		else if (*s == '\\')
@@ -744,7 +724,7 @@ json_lex_string(JsonLexContext *lex)
 			/* OK, we have an escape character. */
 			s++;
 			if (s >= end)
-				FAIL_AT_CHAR_START(JSON_INVALID_TOKEN);
+				return json_lex_fail_at_char_start(lex, s, JSON_INVALID_TOKEN);
 			else if (*s == 'u')
 			{
 				int			i;
@@ -754,7 +734,7 @@ json_lex_string(JsonLexContext *lex)
 				{
 					s++;
 					if (s >= end)
-						FAIL_AT_CHAR_START(JSON_INVALID_TOKEN);
+						return json_lex_fail_at_char_start(lex, s, JSON_INVALID_TOKEN);
 					else if (*s >= '0' && *s <= '9')
 						ch = (ch * 16) + (*s - '0');
 					else if (*s >= 'a' && *s <= 'f')
@@ -762,7 +742,7 @@ json_lex_string(JsonLexContext *lex)
 					else if (*s >= 'A' && *s <= 'F')
 						ch = (ch * 16) + (*s - 'A') + 10;
 					else
-						FAIL_AT_CHAR_END(JSON_UNICODE_ESCAPE_FORMAT);
+						return json_lex_fail_at_char_end(lex, s, JSON_UNICODE_ESCAPE_FORMAT);
 				}
 				if (lex->strval != NULL)
 				{
@@ -772,20 +752,20 @@ json_lex_string(JsonLexContext *lex)
 					if (is_utf16_surrogate_first(ch))
 					{
 						if (hi_surrogate != -1)
-							FAIL_AT_CHAR_END(JSON_UNICODE_HIGH_SURROGATE);
+							return json_lex_fail_at_char_end(lex, s, JSON_UNICODE_HIGH_SURROGATE);
 						hi_surrogate = ch;
 						continue;
 					}
 					else if (is_utf16_surrogate_second(ch))
 					{
 						if (hi_surrogate == -1)
-							FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE);
+							return json_lex_fail_at_char_end(lex, s, JSON_UNICODE_LOW_SURROGATE);
 						ch = surrogate_pair_to_codepoint(hi_surrogate, ch);
 						hi_surrogate = -1;
 					}
 
 					if (hi_surrogate != -1)
-						FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE);
+						return json_lex_fail_at_char_end(lex, s, JSON_UNICODE_LOW_SURROGATE);
 
 					/*
 					 * Reject invalid cases.  We can't have a value above
@@ -795,7 +775,7 @@ json_lex_string(JsonLexContext *lex)
 					if (ch == 0)
 					{
 						/* We can't allow this, since our TEXT type doesn't */
-						FAIL_AT_CHAR_END(JSON_UNICODE_CODE_POINT_ZERO);
+						return json_lex_fail_at_char_end(lex, s, JSON_UNICODE_CODE_POINT_ZERO);
 					}
 
 					/*
@@ -809,7 +789,7 @@ json_lex_string(JsonLexContext *lex)
 						char		cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
 
 						if (!pg_unicode_to_server_noerror(ch, (unsigned char *) cbuf))
-							FAIL_AT_CHAR_END(JSON_UNICODE_UNTRANSLATABLE);
+							return json_lex_fail_at_char_end(lex, s, JSON_UNICODE_UNTRANSLATABLE);
 						appendStringInfoString(lex->strval, cbuf);
 					}
 #else
@@ -829,14 +809,14 @@ json_lex_string(JsonLexContext *lex)
 						appendStringInfoChar(lex->strval, (char) ch);
 					}
 					else
-						FAIL_AT_CHAR_END(JSON_UNICODE_HIGH_ESCAPE);
+						return json_lex_fail_at_char_end(lex, s, JSON_UNICODE_HIGH_ESCAPE);
 #endif							/* FRONTEND */
 				}
 			}
 			else if (lex->strval != NULL)
 			{
 				if (hi_surrogate != -1)
-					FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE);
+					return json_lex_fail_at_char_end(lex, s, JSON_UNICODE_LOW_SURROGATE);
 
 				switch (*s)
 				{
@@ -868,7 +848,7 @@ json_lex_string(JsonLexContext *lex)
 						 * is reported, not the whole string.
 						 */
 						lex->token_start = s;
-						FAIL_AT_CHAR_END(JSON_ESCAPING_INVALID);
+						return json_lex_fail_at_char_end(lex, s, JSON_ESCAPING_INVALID);
 				}
 			}
 			else if (strchr("\"\\/bfnrt", *s) == NULL)
@@ -881,7 +861,7 @@ json_lex_string(JsonLexContext *lex)
 				 * shown it's not a performance win.
 				 */
 				lex->token_start = s;
-				FAIL_AT_CHAR_END(JSON_ESCAPING_INVALID);
+				return json_lex_fail_at_char_end(lex, s, JSON_ESCAPING_INVALID);
 			}
 		}
 		else
@@ -889,7 +869,7 @@ json_lex_string(JsonLexContext *lex)
 			char	   *p = s;
 
 			if (hi_surrogate != -1)
-				FAIL_AT_CHAR_END(JSON_UNICODE_LOW_SURROGATE);
+				return json_lex_fail_at_char_end(lex, s, JSON_UNICODE_LOW_SURROGATE);
 
 			/*
 			 * Skip to the first byte that requires special handling, so we
@@ -912,8 +892,7 @@ json_lex_string(JsonLexContext *lex)
 					 * Since *p isn't printable, exclude it from the context
 					 * string
 					 */
-					lex->token_terminator = p;
-					return JSON_ESCAPING_REQUIRED;
+					return json_lex_fail_at_char_start(lex, p, JSON_ESCAPING_REQUIRED);
 				}
 			}
 
@@ -929,18 +908,10 @@ json_lex_string(JsonLexContext *lex)
 	}
 
 	if (hi_surrogate != -1)
-	{
-		lex->token_terminator = s + 1;
-		return JSON_UNICODE_LOW_SURROGATE;
-	}
+		return json_lex_fail_at_char_start(lex, s + 1, JSON_UNICODE_LOW_SURROGATE);
 
 	/* Hooray, we found the end of the string! */
-	lex->prev_token_terminator = lex->token_terminator;
-	lex->token_terminator = s + 1;
-	return JSON_SUCCESS;
-
-#undef FAIL_AT_CHAR_START
-#undef FAIL_AT_CHAR_END
+	return json_lex_success(lex, s + 1, JSON_TOKEN_STRING);
 }
 
 /*
@@ -966,74 +937,58 @@ json_lex_string(JsonLexContext *lex)
  * The 's' argument to this function points to the ostensible beginning
  * of part 2 - i.e. the character after any optional minus sign, or the
  * first character of the string if there is none.
- *
- * If num_err is not NULL, we return an error flag to *num_err rather than
- * raising an error for a badly-formed number.  Also, if total_len is not NULL
- * the distance from lex->input to the token end+1 is returned to *total_len.
  */
 static inline JsonParseErrorType
-json_lex_number(JsonLexContext *lex, char *s,
-				bool *num_err, int *total_len)
+json_lex_number(JsonLexContext *lex, char *s)
 {
 	bool		error = false;
-	int			len = s - lex->input;
+	char	   *input_end = lex->input + lex->input_length;
 
 	/* Part (1): leading sign indicator. */
 	/* Caller already did this for us; so do nothing. */
 
 	/* Part (2): parse main digit string. */
-	if (len < lex->input_length && *s == '0')
-	{
+	if (s < input_end && *s == '0')
 		s++;
-		len++;
-	}
-	else if (len < lex->input_length && *s >= '1' && *s <= '9')
+	else if (s < input_end && *s >= '1' && *s <= '9')
 	{
 		do
 		{
 			s++;
-			len++;
-		} while (len < lex->input_length && *s >= '0' && *s <= '9');
+		} while (s < input_end && *s >= '0' && *s <= '9');
 	}
 	else
 		error = true;
 
 	/* Part (3): parse optional decimal portion. */
-	if (len < lex->input_length && *s == '.')
+	if (s < input_end && *s == '.')
 	{
 		s++;
-		len++;
-		if (len == lex->input_length || *s < '0' || *s > '9')
+		if (s == input_end || *s < '0' || *s > '9')
 			error = true;
 		else
 		{
 			do
 			{
 				s++;
-				len++;
-			} while (len < lex->input_length && *s >= '0' && *s <= '9');
+			} while (s < input_end && *s >= '0' && *s <= '9');
 		}
 	}
 
 	/* Part (4): parse optional exponent. */
-	if (len < lex->input_length && (*s == 'e' || *s == 'E'))
+	if (s < input_end && (*s == 'e' || *s == 'E'))
 	{
 		s++;
-		len++;
-		if (len < lex->input_length && (*s == '+' || *s == '-'))
-		{
+		if (s < input_end && (*s == '+' || *s == '-'))
 			s++;
-			len++;
-		}
-		if (len == lex->input_length || *s < '0' || *s > '9')
+		if (s == input_end || *s < '0' || *s > '9')
 			error = true;
 		else
 		{
 			do
 			{
 				s++;
-				len++;
-			} while (len < lex->input_length && *s >= '0' && *s <= '9');
+			} while (s < input_end && *s >= '0' && *s <= '9');
 		}
 	}
 
@@ -1042,28 +997,15 @@ json_lex_number(JsonLexContext *lex, char *s,
 	 * here should be considered part of the token for error-reporting
 	 * purposes.
 	 */
-	for (; len < lex->input_length && JSON_ALPHANUMERIC_CHAR(*s); s++, len++)
+	for (; s < input_end && JSON_ALPHANUMERIC_CHAR(*s); s++)
 		error = true;
 
-	if (total_len != NULL)
-		*total_len = len;
-
-	if (num_err != NULL)
-	{
-		/* let the caller handle any error */
-		*num_err = error;
-	}
-	else
-	{
-		/* return token endpoint */
-		lex->prev_token_terminator = lex->token_terminator;
-		lex->token_terminator = s;
-		/* handle error if any */
-		if (error)
-			return JSON_INVALID_TOKEN;
-	}
+	/* handle error if any */
+	if (error)
+		return json_lex_fail_at_char_start(lex, s, JSON_INVALID_TOKEN);
 
-	return JSON_SUCCESS;
+	/* return token endpoint 's' */
+	return json_lex_success(lex, s, JSON_TOKEN_NUMBER);
 }
 
 /*
diff --git a/src/test/regress/expected/json_encoding.out b/src/test/regress/expected/json_encoding.out
index fe729db8c9d..455c6441f46 100644
--- a/src/test/regress/expected/json_encoding.out
+++ b/src/test/regress/expected/json_encoding.out
@@ -267,3 +267,28 @@ select * from pg_input_error_info('{ "a":  "\ud83d\ude04\ud83d\udc36" }', 'jsonb
          |        |      | 
 (1 row)
 
+-- additional tests for bug in null/surrogates error reporting
+SELECT jsonb E'\n"\\u0000"' as null_is_not_supported;
+ERROR:  unsupported Unicode escape sequence
+LINE 1: SELECT jsonb E'\n"\\u0000"' as null_is_not_supported;
+                     ^
+DETAIL:  \u0000 cannot be converted to text.
+CONTEXT:  JSON data, line 2: "\u0000...
+SELECT jsonb E'\n"\\ud83dX"' as orphan_high_surrogate;
+ERROR:  invalid input syntax for type json
+LINE 1: SELECT jsonb E'\n"\\ud83dX"' as orphan_high_surrogate;
+                     ^
+DETAIL:  Unicode low surrogate must follow a high surrogate.
+CONTEXT:  JSON data, line 2: "\ud83dX...
+SELECT jsonb E'\n"\\ud83d\u00a9"' as orphan_high_surrogate;
+ERROR:  invalid input syntax for type json
+LINE 1: SELECT jsonb E'\n"\\ud83d\u00a9"' as orphan_high_surrogate;
+                     ^
+DETAIL:  Unicode low surrogate must follow a high surrogate.
+CONTEXT:  JSON data, line 2: "\ud83d©...
+SELECT jsonb E'\n"\\ud83d\\ud83d"' as two_high_surrogates_in_a_row;
+ERROR:  invalid input syntax for type json
+LINE 1: SELECT jsonb E'\n"\\ud83d\\ud83d"' as two_high_surrogates_in...
+                     ^
+DETAIL:  Unicode high surrogate must not follow a high surrogate.
+CONTEXT:  JSON data, line 2: "\ud83d\ud83d...
diff --git a/src/test/regress/expected/json_encoding_1.out b/src/test/regress/expected/json_encoding_1.out
index 5c8d91ad0b6..5400c20b625 100644
--- a/src/test/regress/expected/json_encoding_1.out
+++ b/src/test/regress/expected/json_encoding_1.out
@@ -263,3 +263,26 @@ select * from pg_input_error_info('{ "a":  "\ud83d\ude04\ud83d\udc36" }', 'jsonb
  unsupported Unicode escape sequence | Unicode escape value could not be translated to the server's encoding SQL_ASCII. |      | 22P05
 (1 row)
 
+-- additional tests for bug in null/surrogates error reporting
+SELECT jsonb E'\n"\\u0000"' as null_is_not_supported;
+ERROR:  unsupported Unicode escape sequence
+LINE 1: SELECT jsonb E'\n"\\u0000"' as null_is_not_supported;
+                     ^
+DETAIL:  \u0000 cannot be converted to text.
+CONTEXT:  JSON data, line 2: "\u0000...
+SELECT jsonb E'\n"\\ud83dX"' as orphan_high_surrogate;
+ERROR:  invalid input syntax for type json
+LINE 1: SELECT jsonb E'\n"\\ud83dX"' as orphan_high_surrogate;
+                     ^
+DETAIL:  Unicode low surrogate must follow a high surrogate.
+CONTEXT:  JSON data, line 2: "\ud83dX...
+SELECT jsonb E'\n"\\ud83d\u00a9"' as orphan_high_surrogate;
+ERROR:  conversion between UTF8 and SQL_ASCII is not supported
+LINE 1: SELECT jsonb E'\n"\\ud83d\u00a9"' as orphan_high_surrogate;
+                                 ^
+SELECT jsonb E'\n"\\ud83d\\ud83d"' as two_high_surrogates_in_a_row;
+ERROR:  invalid input syntax for type json
+LINE 1: SELECT jsonb E'\n"\\ud83d\\ud83d"' as two_high_surrogates_in...
+                     ^
+DETAIL:  Unicode high surrogate must not follow a high surrogate.
+CONTEXT:  JSON data, line 2: "\ud83d\ud83d...
diff --git a/src/test/regress/sql/json_encoding.sql b/src/test/regress/sql/json_encoding.sql
index aceb8dbc3c6..f0bb6d66a1d 100644
--- a/src/test/regress/sql/json_encoding.sql
+++ b/src/test/regress/sql/json_encoding.sql
@@ -80,3 +80,10 @@ SELECT jsonb '{ "a":  "null \\u0000 escape" }' ->> 'a' as not_an_escape;
 -- soft error for input-time failure
 
 select * from pg_input_error_info('{ "a":  "\ud83d\ude04\ud83d\udc36" }', 'jsonb');
+
+
+-- additional tests for bug in null/surrogates error reporting
+SELECT jsonb E'\n"\\u0000"' as null_is_not_supported;
+SELECT jsonb E'\n"\\ud83dX"' as orphan_high_surrogate;
+SELECT jsonb E'\n"\\ud83d\u00a9"' as orphan_high_surrogate;
+SELECT jsonb E'\n"\\ud83d\\ud83d"' as two_high_surrogates_in_a_row;
-- 
2.25.1

Reply via email to