From 8f8aa638ee2b6dfa85fa8bc0ec5788f44768e92f Mon Sep 17 00:00:00 2001
From: Jelte Fennema <github-tech@jeltef.nl>
Date: Fri, 24 Jun 2022 09:19:13 +0200
Subject: [PATCH] Optimize json_lex_string by batching character copies

When parsing JSON, strings need to be converted from the JSON string
format to a C-style string. A simple copy of the buffer does not suffice
because of the various escape sequences that JSON supports. Because of
this, our JSON parser wrote characters into the C-style string buffer
one at a time.

However, this is only necessary for those escape sequences that map to
another character. This patch changes the behaviour for non-escaped
characters. These are now copied in batches instead of one character at
a time.

To test the performance of this change I used COPY BINARY from one JSONB
table into another, containing fairly large JSONB values of ~15kB. The
JSONB values are JSON objects with a single level. They contain a few
small keys and values, but one very big value that's a stringified JSON
blob. So this JSON blob contains a relatively high number of escape
characters, to escape all the " characters. This change improves
performance for this workload on my machine by ~18% (going from 1m24s to
1m09s).
---
 src/common/jsonapi.c | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/src/common/jsonapi.c b/src/common/jsonapi.c
index 98e4ef0942..219ecb9df9 100644
--- a/src/common/jsonapi.c
+++ b/src/common/jsonapi.c
@@ -674,6 +674,7 @@ json_lex_string(JsonLexContext *lex)
 	char	   *s;
 	int			len;
 	int			hi_surrogate = -1;
+	int			copyable_characters_length = 0;
 
 	if (lex->strval != NULL)
 		resetStringInfo(lex->strval);
@@ -692,7 +693,18 @@ json_lex_string(JsonLexContext *lex)
 			return JSON_INVALID_TOKEN;
 		}
 		else if (*s == '"')
+		{
+			if (copyable_characters_length)
+			{
+				/* flush copyable characters */
+				appendBinaryStringInfo(
+									   lex->strval,
+									   s - copyable_characters_length,
+									   copyable_characters_length);
+
+			}
 			break;
+		}
 		else if ((unsigned char) *s < 32)
 		{
 			/* Per RFC4627, these characters MUST be escaped. */
@@ -702,6 +714,16 @@ json_lex_string(JsonLexContext *lex)
 		}
 		else if (*s == '\\')
 		{
+			if (copyable_characters_length)
+			{
+				/* flush copyable characters */
+				appendBinaryStringInfo(
+									   lex->strval,
+									   s - copyable_characters_length,
+									   copyable_characters_length);
+				copyable_characters_length = 0;
+
+			}
 			/* OK, we have an escape character. */
 			s++;
 			len++;
@@ -818,7 +840,7 @@ json_lex_string(JsonLexContext *lex)
 					case '"':
 					case '\\':
 					case '/':
-						appendStringInfoChar(lex->strval, *s);
+						copyable_characters_length++;
 						break;
 					case 'b':
 						appendStringInfoChar(lex->strval, '\b');
@@ -861,7 +883,7 @@ json_lex_string(JsonLexContext *lex)
 			if (hi_surrogate != -1)
 				return JSON_UNICODE_LOW_SURROGATE;
 
-			appendStringInfoChar(lex->strval, *s);
+			copyable_characters_length++;
 		}
 	}
 
-- 
2.34.1

