From 58a913589b0b89a8c5ece50b5f8de6c9321a8366 Mon Sep 17 00:00:00 2001
From: David Rowley <dgrowley@gmail.com>
Date: Thu, 23 May 2024 10:53:23 +1200
Subject: [PATCH v5] Optimize escaping of JSON strings using SIMD

Here we adjust escape_json_with_len() to make use of SIMD to allow
processing of up to 16 bytes at a time rather than processing a single
byte at a time.  This has been shown to speed up escaping of JSON
strings significantly, especially when no escaping is required.

Reviewed-by: Melih Mutlu
Discussion: https://postgr.es/m/CAApHDvpLXwMZvbCKcdGfU9XQjGCDm7tFpRdTXuB9PVgpNUYfEQ@mail.gmail.com
---
 src/backend/utils/adt/json.c       | 82 +++++++++++++++++++++++++++++-
 src/test/regress/expected/json.out | 48 +++++++++++++++++
 src/test/regress/sql/json.sql      |  7 +++
 3 files changed, 135 insertions(+), 2 deletions(-)

diff --git a/src/backend/utils/adt/json.c b/src/backend/utils/adt/json.c
index be7bc46038..4e86d734e4 100644
--- a/src/backend/utils/adt/json.c
+++ b/src/backend/utils/adt/json.c
@@ -19,6 +19,7 @@
 #include "funcapi.h"
 #include "libpq/pqformat.h"
 #include "miscadmin.h"
+#include "port/simd.h"
 #include "utils/array.h"
 #include "utils/builtins.h"
 #include "utils/date.h"
@@ -1603,11 +1604,88 @@ escape_json(StringInfo buf, const char *str)
 void
 escape_json_with_len(StringInfo buf, const char *str, int len)
 {
+	int			i = 0;		/* next byte of str to examine */
+	int			copypos = 0;	/* start of searched bytes not yet copied */
+	int			vlen;		/* prefix length handled by the Vector8 loop */
+
+	Assert(len >= 0);
+
+	/*
+	 * Number of bytes to process using SIMD: 'len' rounded down to a multiple
+	 * of sizeof(Vector8).  The mask relies on that size being a power-of-2.
+	 */
+	vlen = len & (int) (~(sizeof(Vector8) - 1));
+
 	appendStringInfoCharMacro(buf, '"');
 
-	for (int i = 0; i < len; i++)
-		escape_json_char(buf, str[i]);
+	for (;;)
+	{
+		/*
+		 * To speed this up, search sizeof(Vector8) bytes at a time for
+		 * special characters that need escaping.  When a chunk contains one,
+		 * we fall out of the Vector8 loop, copy the portion that was vector
+		 * searched and found clean, and then process the next
+		 * sizeof(Vector8) bytes one byte at a time.  Once done, we come back
+		 * and try vector searching again.  Any bytes remaining at the tail
+		 * end of the string are also processed byte-by-byte.  This
+		 * optimization assumes special characters are not that common.
+		 */
+		for (; i < vlen; i += sizeof(Vector8))
+		{
+			Vector8		chunk;
+
+			vector8_load(&chunk, (const uint8 *) &str[i]);
+
+			/*
+			 * Stop vector searching on any byte less than ' ', a '"' or a
+			 * '\\'.  Those need escaping, which is done in the per-byte loop.
+			 */
+			if (vector8_has_le(chunk, (unsigned char) 0x1F) ||
+				vector8_has(chunk, (unsigned char) '"') ||
+				vector8_has(chunk, (unsigned char) '\\'))
+				break;
+
+/* #define ESCAPE_JSON_MAX_LOOKHEAD 512 */
+#ifdef ESCAPE_JSON_MAX_LOOKHEAD
+			if (i - copypos >= ESCAPE_JSON_MAX_LOOKHEAD)
+			{
+				appendBinaryStringInfo(buf, &str[copypos], i - copypos);
+				copypos = i;
+			}
+#endif
+		}
+
+		/*
+		 * Write to the destination everything that we've vector searched so
+		 * far and not yet copied.  Do this only when switching into per-byte
+		 * mode rather than once every sizeof(Vector8) bytes.
+		 */
+		if (copypos < i)
+		{
+			appendBinaryStringInfo(buf, &str[copypos], i - copypos);
+			copypos = i;
+		}
+
+		/*
+		 * Per-byte loop: handles at most sizeof(Vector8) bytes, covering a
+		 * chunk containing special chars or the tail of the string.
+		 */
+		for (int b = 0; b < sizeof(Vector8); b++)
+		{
+			/* stop once the whole string has been consumed */
+			if (i == len)
+				goto done;
+
+			Assert(i < len);
+
+			escape_json_char(buf, str[i++]);
+		}
+
+		copypos = i;		/* bytes before i went through escape_json_char */
+		/* We're not done yet.  Try the vector search again */
+	}
 
+done:
 	appendStringInfoCharMacro(buf, '"');
 }
 
diff --git a/src/test/regress/expected/json.out b/src/test/regress/expected/json.out
index aa29bc597b..c8e9b97f0a 100644
--- a/src/test/regress/expected/json.out
+++ b/src/test/regress/expected/json.out
@@ -55,6 +55,54 @@ SELECT ('"'||repeat('.', 12)||'abc\n"')::json; -- OK, legal escapes
  "............abc\n"
 (1 row)
 
+-- Test various lengths of strings to validate SIMD processing to escape
+-- special chars in the JSON.
+SELECT row_to_json(j)::jsonb FROM (
+  SELECT left(E'abcdefghijklmnopqrstuvwxyz0123456"\t78', a) AS very_long_column_name_to_test_json_escape
+  FROM generate_series(0,37) a
+) j;
+                                       row_to_json                                        
+------------------------------------------------------------------------------------------
+ {"very_long_column_name_to_test_json_escape": ""}
+ {"very_long_column_name_to_test_json_escape": "a"}
+ {"very_long_column_name_to_test_json_escape": "ab"}
+ {"very_long_column_name_to_test_json_escape": "abc"}
+ {"very_long_column_name_to_test_json_escape": "abcd"}
+ {"very_long_column_name_to_test_json_escape": "abcde"}
+ {"very_long_column_name_to_test_json_escape": "abcdef"}
+ {"very_long_column_name_to_test_json_escape": "abcdefg"}
+ {"very_long_column_name_to_test_json_escape": "abcdefgh"}
+ {"very_long_column_name_to_test_json_escape": "abcdefghi"}
+ {"very_long_column_name_to_test_json_escape": "abcdefghij"}
+ {"very_long_column_name_to_test_json_escape": "abcdefghijk"}
+ {"very_long_column_name_to_test_json_escape": "abcdefghijkl"}
+ {"very_long_column_name_to_test_json_escape": "abcdefghijklm"}
+ {"very_long_column_name_to_test_json_escape": "abcdefghijklmn"}
+ {"very_long_column_name_to_test_json_escape": "abcdefghijklmno"}
+ {"very_long_column_name_to_test_json_escape": "abcdefghijklmnop"}
+ {"very_long_column_name_to_test_json_escape": "abcdefghijklmnopq"}
+ {"very_long_column_name_to_test_json_escape": "abcdefghijklmnopqr"}
+ {"very_long_column_name_to_test_json_escape": "abcdefghijklmnopqrs"}
+ {"very_long_column_name_to_test_json_escape": "abcdefghijklmnopqrst"}
+ {"very_long_column_name_to_test_json_escape": "abcdefghijklmnopqrstu"}
+ {"very_long_column_name_to_test_json_escape": "abcdefghijklmnopqrstuv"}
+ {"very_long_column_name_to_test_json_escape": "abcdefghijklmnopqrstuvw"}
+ {"very_long_column_name_to_test_json_escape": "abcdefghijklmnopqrstuvwx"}
+ {"very_long_column_name_to_test_json_escape": "abcdefghijklmnopqrstuvwxy"}
+ {"very_long_column_name_to_test_json_escape": "abcdefghijklmnopqrstuvwxyz"}
+ {"very_long_column_name_to_test_json_escape": "abcdefghijklmnopqrstuvwxyz0"}
+ {"very_long_column_name_to_test_json_escape": "abcdefghijklmnopqrstuvwxyz01"}
+ {"very_long_column_name_to_test_json_escape": "abcdefghijklmnopqrstuvwxyz012"}
+ {"very_long_column_name_to_test_json_escape": "abcdefghijklmnopqrstuvwxyz0123"}
+ {"very_long_column_name_to_test_json_escape": "abcdefghijklmnopqrstuvwxyz01234"}
+ {"very_long_column_name_to_test_json_escape": "abcdefghijklmnopqrstuvwxyz012345"}
+ {"very_long_column_name_to_test_json_escape": "abcdefghijklmnopqrstuvwxyz0123456"}
+ {"very_long_column_name_to_test_json_escape": "abcdefghijklmnopqrstuvwxyz0123456\""}
+ {"very_long_column_name_to_test_json_escape": "abcdefghijklmnopqrstuvwxyz0123456\"\t"}
+ {"very_long_column_name_to_test_json_escape": "abcdefghijklmnopqrstuvwxyz0123456\"\t7"}
+ {"very_long_column_name_to_test_json_escape": "abcdefghijklmnopqrstuvwxyz0123456\"\t78"}
+(38 rows)
+
 -- see json_encoding test for input with unicode escapes
 -- Numbers.
 SELECT '1'::json;				-- OK
diff --git a/src/test/regress/sql/json.sql b/src/test/regress/sql/json.sql
index ec57dfe707..9bf33115d4 100644
--- a/src/test/regress/sql/json.sql
+++ b/src/test/regress/sql/json.sql
@@ -12,6 +12,13 @@ SELECT '"\v"'::json;			-- ERROR, not a valid JSON escape
 SELECT ('"'||repeat('.', 12)||'abc"')::json; -- OK
 SELECT ('"'||repeat('.', 12)||'abc\n"')::json; -- OK, legal escapes
 
+-- Test various lengths of strings to validate SIMD processing to escape
+-- special chars in the JSON.
+SELECT row_to_json(j)::jsonb FROM (
+  SELECT left(E'abcdefghijklmnopqrstuvwxyz0123456"\t78', a) AS very_long_column_name_to_test_json_escape
+  FROM generate_series(0,37) a
+) j;
+
 -- see json_encoding test for input with unicode escapes
 
 -- Numbers.
-- 
2.34.1

