[HACKERS] JSON and unicode surrogate pairs

Andrew Dunstan Wed, 05 Jun 2013 07:53:44 -0700

In 9.2, the JSON parser didn't check the validity of the use of Unicode escapes other than that it required 4 hex digits to follow '\u'. In 9.3, that is still the case. However, the JSON accessor functions and operators also try to turn JSON strings into text in the server encoding, and this includes de-escaping \u sequences. This works fine except when there is a pair of sequences representing a UTF-16 type surrogate pair, something that is explicitly permitted in the JSON spec.

The attached patch is an attempt to remedy that, and a surrogate pair is turned into the correct code point before converting it to whatever the server encoding is.

Note that this would mean we can still put JSON with incorrect use of surrogates into the database, as now (9.2 and later), and they will cause almost all the accessor functions to raise an error, as now (9.3). All this does is allow JSON that uses surrogates correctly not to fail when applying the accessor functions and operators. That's a possible violation of POLA, and at least worthy of a note in the docs, but I'm not sure what else we can do now - adding this check to the input lexer would possibly cause restores to fail, which users might not thank us for.


cheers

andrew

diff --git a/src/backend/utils/adt/json.c b/src/backend/utils/adt/json.c
index aaf99bd..28868fb 100644
--- a/src/backend/utils/adt/json.c
+++ b/src/backend/utils/adt/json.c
@@ -646,6 +646,7 @@ json_lex_string(JsonLexContext *lex)
 {
 	char	   *s;
 	int			len;
+	int         hi_surrogate = -1;
 
 	if (lex->strval != NULL)
 		resetStringInfo(lex->strval);
@@ -718,6 +719,36 @@ json_lex_string(JsonLexContext *lex)
 					int			utf8len;
 					char	   *converted;
 
+					if (ch >= 0xd800 && ch <= 0xdbff)
+					{
+						if (hi_surrogate != -1)
+							ereport(ERROR,
+									(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+									 errmsg("invalid input syntax for type json"),
+									 errdetail("high order surrogate must not follow a high order surrogate."),
+									 report_json_context(lex)));
+						hi_surrogate = (ch & 0x3ff) << 10;
+						continue;
+					}
+					else if (ch >= 0xdc00 && ch <= 0xdfff)
+					{
+						if (hi_surrogate == -1)
+							ereport(ERROR,
+									(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+									 errmsg("invalid input syntax for type json"),
+									 errdetail("low order surrogate must follow a high order surrogate."),
+									 report_json_context(lex)));
+						ch = 0x10000 + hi_surrogate + (ch & 0x3ff);
+						hi_surrogate = -1;
+					}
+
+					if (hi_surrogate != -1)
+						ereport(ERROR,
+								(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+								 errmsg("invalid input syntax for type json"),
+								 errdetail("low order surrogate must follow a high order surrogate."),
+								 report_json_context(lex)));
+
 					unicode_to_utf8(ch, (unsigned char *) utf8str);
 					utf8len = pg_utf_mblen((unsigned char *) utf8str);
 					utf8str[utf8len] = '\0';
@@ -730,6 +761,13 @@ json_lex_string(JsonLexContext *lex)
 			}
 			else if (lex->strval != NULL)
 			{
+				if (hi_surrogate != -1)
+					ereport(ERROR,
+							(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+							 errmsg("invalid input syntax for type json"),
+							 errdetail("low order surrogate must follow a high order surrogate."),
+							 report_json_context(lex)));
+				
 				switch (*s)
 				{
 					case '"':
@@ -784,11 +822,25 @@ json_lex_string(JsonLexContext *lex)
 		}
 		else if (lex->strval != NULL)
 		{
+			if (hi_surrogate != -1)
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+						 errmsg("invalid input syntax for type json"),
+						 errdetail("low order surrogate must follow a high order surrogate."),
+						 report_json_context(lex)));
+			
 			appendStringInfoChar(lex->strval, *s);
 		}
 
 	}
 
+	if (hi_surrogate != -1)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+				 errmsg("invalid input syntax for type json"),
+				 errdetail("low order surrogate must follow a high order surrogate."),
+				 report_json_context(lex)));
+
 	/* Hooray, we found the end of the string! */
 	lex->prev_token_terminator = lex->token_terminator;
 	lex->token_terminator = s + 1;

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

[HACKERS] JSON and unicode surrogate pairs

Reply via email to