From 7573859b6f66b4ed370725f33077361c1cb81cb7 Mon Sep 17 00:00:00 2001
From: Jacob Champion <jacob.champion@enterprisedb.com>
Date: Mon, 8 Apr 2024 15:31:17 -0700
Subject: [PATCH v2] json_lex_string: don't overread on bad UTF8

Inputs to pg_parse_json[_incremental] are not guaranteed to be
NUL-terminated, so pg_encoding_mblen_bounded() (which uses strnlen())
can walk off the end of the buffer while looking for a terminator.
Check against the end pointer instead.

pg_encoding_mblen_bounded() no longer has any callers and has been
removed.

TODO:
- Do we really want to print incomplete UTF-8 sequences as-is once we
  know they're bad?
---
 src/common/jsonapi.c                              |  4 ++--
 src/common/wchar.c                                | 13 +------------
 src/include/mb/pg_wchar.h                         |  1 -
 src/test/modules/test_json_parser/t/002_inline.pl |  8 ++++++++
 4 files changed, 11 insertions(+), 15 deletions(-)

diff --git a/src/common/jsonapi.c b/src/common/jsonapi.c
index fc0cb36974..26e1f43ed3 100644
--- a/src/common/jsonapi.c
+++ b/src/common/jsonapi.c
@@ -1689,8 +1689,8 @@ json_lex_string(JsonLexContext *lex)
 	} while (0)
 #define FAIL_AT_CHAR_END(code) \
 	do { \
-		lex->token_terminator = \
-			s + pg_encoding_mblen_bounded(lex->input_encoding, s); \
+		char	   *term = s + pg_encoding_mblen(lex->input_encoding, s); \
+		lex->token_terminator = (term <= end) ? term : end; \
 		return code; \
 	} while (0)
 
diff --git a/src/common/wchar.c b/src/common/wchar.c
index 76b7dfdfcb..97e9b61dba 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -2062,8 +2062,7 @@ const pg_wchar_tbl pg_wchar_table[] = {
  *
  * Caution: when dealing with text that is not certainly valid in the
  * specified encoding, the result may exceed the actual remaining
- * string length.  Callers that are not prepared to deal with that
- * should use pg_encoding_mblen_bounded() instead.
+ * string length.
  */
 int
 pg_encoding_mblen(int encoding, const char *mbstr)
@@ -2073,16 +2072,6 @@ pg_encoding_mblen(int encoding, const char *mbstr)
 			pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr));
 }
 
-/*
- * Returns the byte length of a multibyte character; but not more than
- * the distance to end of string.
- */
-int
-pg_encoding_mblen_bounded(int encoding, const char *mbstr)
-{
-	return strnlen(mbstr, pg_encoding_mblen(encoding, mbstr));
-}
-
 /*
  * Returns the display length of a multibyte character.
  */
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index 249cd18a35..ac65bfcbef 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -663,7 +663,6 @@ extern int	pg_valid_server_encoding_id(int encoding);
  * earlier in this file are also available from libpgcommon.
  */
 extern int	pg_encoding_mblen(int encoding, const char *mbstr);
-extern int	pg_encoding_mblen_bounded(int encoding, const char *mbstr);
 extern int	pg_encoding_dsplen(int encoding, const char *mbstr);
 extern int	pg_encoding_verifymbchar(int encoding, const char *mbstr, int len);
 extern int	pg_encoding_verifymbstr(int encoding, const char *mbstr, int len);
diff --git a/src/test/modules/test_json_parser/t/002_inline.pl b/src/test/modules/test_json_parser/t/002_inline.pl
index f83cec03f8..60bb930e92 100644
--- a/src/test/modules/test_json_parser/t/002_inline.pl
+++ b/src/test/modules/test_json_parser/t/002_inline.pl
@@ -128,5 +128,13 @@ test(
 	"incorrect escape count",
 	'"\\\\\\\\\\\\\\"',
 	error => qr/Token ""\\\\\\\\\\\\\\"" is invalid/);
+test(
+	"incomplete UTF-8 sequence",
+	# Three bytes: double-quote, backslash, <f5>
+	"\"\\\x{F5}",
+	# Both invalid-token and invalid-escape are possible, because for smaller
+	# chunk sizes the incremental parser will skip the string parsing when it
+	# can't find an ending quote.
+	error => qr/(Token|Escape sequence) ""?\\\x{F5}" is invalid/);
 
 done_testing();
-- 
2.34.1

