Re: Small patch to improve safety of utf8_to_unicode().

Jeff Davis Mon, 15 Dec 2025 12:24:01 -0800

On Sun, 2025-12-14 at 07:22 +0800, Chao Li wrote:
> This patch adds a length check to utf8_to_unicode(), I think which is
> where “safety” comes from. Can you please add an a little bit more to
> the commit message instead of only saying “improve safety”.


Right: it does not read past pg_mblen(), nor past the supplied length.

We could go further and check for valid continuation bytes, but the
other routines don't do that.

> It also deleted two redundant function declarations from pg_wchar.h,
> which may also worth a quick note in the commit message.

I committed that as an independent change and removed it from this
patch.

> +     /* Assume byte sequence has not been broken. */
> +     c = utf8_to_unicode(str, MAX_MULTIBYTE_CHAR_LEN);
> ```
> 
> Here we need an empty line above the new comment.

Done, and I expanded the comment to explain why it's safe.

>  pg_utf_dsplen(const unsigned char *s)
>  {
> -     return ucs_wcwidth(utf8_to_unicode(s));
> +     /* trust that input is not a truncated byte sequence */
> +     return ucs_wcwidth(utf8_to_unicode(s,
> MAX_MULTIBYTE_CHAR_LEN));
>  }
> ```
> 
> For the new comment, as a code reader, I wonder why we “trust” that?

We could use strlen(), but I was concerned that it might be used for
string fragments that aren't NUL-terminated, because it's intended for
a single character. A caller might reasonably assume that it wouldn't
read past pg_mblen().

So I changed the comment slightly to just say that it requires the
input is a valid UTF-8 sequence. Let me know if you have another
suggestion.

Regards,
        Jeff Davis

From 94a80385eec23dbb85cc601063cac04e8d845a52 Mon Sep 17 00:00:00 2001
From: Jeff Davis <[email protected]>
Date: Thu, 4 Dec 2025 19:37:32 -0800
Subject: [PATCH v2] Make utf8_to_unicode() safer.

Require the input string length so that it doesn't read past the end
of a string if given an invalid (or truncated) byte sequence.

Discussion: https://postgr.es/m/[email protected]
---
 contrib/fuzzystrmatch/daitch_mokotoff.c   |  8 ++++++-
 src/backend/utils/adt/pg_locale_builtin.c |  5 +++--
 src/backend/utils/adt/varlena.c           | 26 +++++++++++------------
 src/common/saslprep.c                     |  4 +++-
 src/common/unicode/case_test.c            |  5 +++--
 src/common/unicode_case.c                 | 12 +++++++----
 src/common/wchar.c                        |  3 ++-
 src/include/mb/pg_wchar.h                 | 12 +++++------
 8 files changed, 43 insertions(+), 32 deletions(-)

diff --git a/contrib/fuzzystrmatch/daitch_mokotoff.c b/contrib/fuzzystrmatch/daitch_mokotoff.c
index 07f895ae2bf..103a29d89d4 100644
--- a/contrib/fuzzystrmatch/daitch_mokotoff.c
+++ b/contrib/fuzzystrmatch/daitch_mokotoff.c
@@ -401,7 +401,13 @@ read_char(const unsigned char *str, int *ix)
 
 	/* Decode UTF-8 character to ISO 10646 code point. */
 	str += *ix;
-	c = utf8_to_unicode(str);
+
+	/*
+	 * We assume the string does not contain truncated byte sequences because
+	 * it came from pg_server_to_any(..., PG_UTF8), so we can safely use
+	 * MAX_MULTIBYTE_CHAR_LEN.
+	 */
+	c = utf8_to_unicode(str, MAX_MULTIBYTE_CHAR_LEN);
 
 	/* Advance *ix, but (for safety) not if we've reached end of string. */
 	if (c)
diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c
index 0c2920112bb..28fc55278b0 100644
--- a/src/backend/utils/adt/pg_locale_builtin.c
+++ b/src/backend/utils/adt/pg_locale_builtin.c
@@ -59,12 +59,13 @@ static size_t
 initcap_wbnext(void *state)
 {
 	struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state;
+	unsigned char *str = (unsigned char *) wbstate->str;
 
 	while (wbstate->offset < wbstate->len &&
 		   wbstate->str[wbstate->offset] != '\0')
 	{
-		char32_t	u = utf8_to_unicode((unsigned char *) wbstate->str +
-										wbstate->offset);
+		char32_t	u = utf8_to_unicode(str + wbstate->offset,
+										wbstate->len - wbstate->offset);
 		bool		curr_alnum = pg_u_isalnum(u, wbstate->posix);
 
 		if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index baa5b44ea8d..d684370900d 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -5423,19 +5423,17 @@ Datum
 unicode_assigned(PG_FUNCTION_ARGS)
 {
 	text	   *input = PG_GETARG_TEXT_PP(0);
-	unsigned char *p;
-	int			size;
+	unsigned char *p = (unsigned char *) VARDATA_ANY(input);
+	unsigned char *p_end = p + VARSIZE_ANY_EXHDR(input);
 
 	if (GetDatabaseEncoding() != PG_UTF8)
 		ereport(ERROR,
 				(errmsg("Unicode categorization can only be performed if server encoding is UTF8")));
 
 	/* convert to char32_t */
-	size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
-	p = (unsigned char *) VARDATA_ANY(input);
-	for (int i = 0; i < size; i++)
+	while (p < p_end)
 	{
-		char32_t	uchar = utf8_to_unicode(p);
+		char32_t	uchar = utf8_to_unicode(p, p_end - p);
 		int			category = unicode_category(uchar);
 
 		if (category == PG_U_UNASSIGNED)
@@ -5456,7 +5454,8 @@ unicode_normalize_func(PG_FUNCTION_ARGS)
 	int			size;
 	char32_t   *input_chars;
 	char32_t   *output_chars;
-	unsigned char *p;
+	unsigned char *p = (unsigned char *) VARDATA_ANY(input);
+	unsigned char *p_end = p + VARSIZE_ANY_EXHDR(input);
 	text	   *result;
 	int			i;
 
@@ -5465,14 +5464,13 @@ unicode_normalize_func(PG_FUNCTION_ARGS)
 	/* convert to char32_t */
 	size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
 	input_chars = palloc((size + 1) * sizeof(char32_t));
-	p = (unsigned char *) VARDATA_ANY(input);
 	for (i = 0; i < size; i++)
 	{
-		input_chars[i] = utf8_to_unicode(p);
+		input_chars[i] = utf8_to_unicode(p, p_end - p);
 		p += pg_utf_mblen(p);
 	}
 	input_chars[i] = (char32_t) '\0';
-	Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
+	Assert(p == p_end);
 
 	/* action */
 	output_chars = unicode_normalize(form, input_chars);
@@ -5522,7 +5520,8 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
 	int			size;
 	char32_t   *input_chars;
 	char32_t   *output_chars;
-	unsigned char *p;
+	unsigned char *p = (unsigned char *) VARDATA_ANY(input);
+	unsigned char *p_end = p + VARSIZE_ANY_EXHDR(input);
 	int			i;
 	UnicodeNormalizationQC quickcheck;
 	int			output_size;
@@ -5533,14 +5532,13 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
 	/* convert to char32_t */
 	size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
 	input_chars = palloc((size + 1) * sizeof(char32_t));
-	p = (unsigned char *) VARDATA_ANY(input);
 	for (i = 0; i < size; i++)
 	{
-		input_chars[i] = utf8_to_unicode(p);
+		input_chars[i] = utf8_to_unicode(p, p_end - p);
 		p += pg_utf_mblen(p);
 	}
 	input_chars[i] = (char32_t) '\0';
-	Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
+	Assert(p == p_end);
 
 	/* quick check (see UAX #15) */
 	quickcheck = unicode_is_normalized_quickcheck(form, input_chars);
diff --git a/src/common/saslprep.c b/src/common/saslprep.c
index 101e8d65a4d..083702654b0 100644
--- a/src/common/saslprep.c
+++ b/src/common/saslprep.c
@@ -1055,6 +1055,7 @@ pg_saslprep(const char *input, char **output)
 	int			i;
 	bool		contains_RandALCat;
 	unsigned char *p;
+	unsigned char *p_end;
 	char32_t   *wp;
 
 	/* Ensure we return *output as NULL on failure */
@@ -1088,9 +1089,10 @@ pg_saslprep(const char *input, char **output)
 		goto oom;
 
 	p = (unsigned char *) input;
+	p_end = p + strlen(input);
 	for (i = 0; i < input_size; i++)
 	{
-		input_chars[i] = utf8_to_unicode(p);
+		input_chars[i] = utf8_to_unicode(p, p_end - p);
 		p += pg_utf_mblen(p);
 	}
 	input_chars[i] = (char32_t) '\0';
diff --git a/src/common/unicode/case_test.c b/src/common/unicode/case_test.c
index 00d4f85e5a5..e63ce2fbeb9 100644
--- a/src/common/unicode/case_test.c
+++ b/src/common/unicode/case_test.c
@@ -51,12 +51,13 @@ static size_t
 initcap_wbnext(void *state)
 {
 	struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state;
+	unsigned char *str = (unsigned char *) wbstate->str;
 
 	while (wbstate->offset < wbstate->len &&
 		   wbstate->str[wbstate->offset] != '\0')
 	{
-		char32_t	u = utf8_to_unicode((unsigned char *) wbstate->str +
-										wbstate->offset);
+		char32_t	u = utf8_to_unicode(str + wbstate->offset,
+										wbstate->len - wbstate->offset);
 		bool		curr_alnum = pg_u_isalnum(u, wbstate->posix);
 
 		if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
diff --git a/src/common/unicode_case.c b/src/common/unicode_case.c
index e5e494db43c..a760094104c 100644
--- a/src/common/unicode_case.c
+++ b/src/common/unicode_case.c
@@ -223,15 +223,19 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 	Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
 		   (str_casekind != CaseTitle && !wbnext && !wbstate));
 
+	if (srclen < 0)
+		srclen = strlen(src);
+
 	if (str_casekind == CaseTitle)
 	{
 		boundary = wbnext(wbstate);
 		Assert(boundary == 0);	/* start of text is always a boundary */
 	}
 
-	while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
+	while (srcoff < srclen)
 	{
-		char32_t	u1 = utf8_to_unicode((unsigned char *) src + srcoff);
+		char32_t	u1 = utf8_to_unicode((unsigned char *) src + srcoff,
+										 srclen - srcoff);
 		int			u1len = unicode_utf8len(u1);
 		char32_t	simple = 0;
 		const char32_t *special = NULL;
@@ -320,7 +324,7 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
 	{
 		if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
 		{
-			char32_t	curr = utf8_to_unicode(str + i);
+			char32_t	curr = utf8_to_unicode(str + i, len - i);
 
 			if (pg_u_prop_case_ignorable(curr))
 				continue;
@@ -344,7 +348,7 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
 	{
 		if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
 		{
-			char32_t	curr = utf8_to_unicode(str + i);
+			char32_t	curr = utf8_to_unicode(str + i, len - i);
 
 			if (pg_u_prop_case_ignorable(curr))
 				continue;
diff --git a/src/common/wchar.c b/src/common/wchar.c
index a4bc29921de..a104234ae92 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -661,7 +661,8 @@ ucs_wcwidth(pg_wchar ucs)
 static int
 pg_utf_dsplen(const unsigned char *s)
 {
-	return ucs_wcwidth(utf8_to_unicode(s));
+	/* input must be a valid UTF-8 sequence */
+	return ucs_wcwidth(utf8_to_unicode(s, MAX_MULTIBYTE_CHAR_LEN));
 }
 
 /*
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index 5df67ceea87..6dc0fff332f 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -558,22 +558,20 @@ surrogate_pair_to_codepoint(char16_t first, char16_t second)
 /*
  * Convert a UTF-8 character to a Unicode code point.
  * This is a one-character version of pg_utf2wchar_with_len.
- *
- * No error checks here, c must point to a long-enough string.
  */
 static inline char32_t
-utf8_to_unicode(const unsigned char *c)
+utf8_to_unicode(const unsigned char *c, size_t len)
 {
-	if ((*c & 0x80) == 0)
+	if ((*c & 0x80) == 0 && len >= 1)
 		return (char32_t) c[0];
-	else if ((*c & 0xe0) == 0xc0)
+	else if ((*c & 0xe0) == 0xc0 && len >= 2)
 		return (char32_t) (((c[0] & 0x1f) << 6) |
 						   (c[1] & 0x3f));
-	else if ((*c & 0xf0) == 0xe0)
+	else if ((*c & 0xf0) == 0xe0 && len >= 3)
 		return (char32_t) (((c[0] & 0x0f) << 12) |
 						   ((c[1] & 0x3f) << 6) |
 						   (c[2] & 0x3f));
-	else if ((*c & 0xf8) == 0xf0)
+	else if ((*c & 0xf8) == 0xf0 && len >= 4)
 		return (char32_t) (((c[0] & 0x07) << 18) |
 						   ((c[1] & 0x3f) << 12) |
 						   ((c[2] & 0x3f) << 6) |
-- 
2.43.0

Re: Small patch to improve safety of utf8_to_unicode().

Reply via email to