From 36e226c368d2eb37c41124f52ef819bc626fd5a8 Mon Sep 17 00:00:00 2001
From: David Rowley <dgrowley@gmail.com>
Date: Thu, 23 May 2024 10:53:23 +1200
Subject: [PATCH v2 2/3] Use SIMD processing for escape_json()

---
 src/backend/utils/adt/json.c       | 72 +++++++++++++++++++++++++++++-
 src/test/regress/expected/json.out | 44 ++++++++++++++++++
 src/test/regress/sql/json.sql      |  8 ++++
 3 files changed, 122 insertions(+), 2 deletions(-)

diff --git a/src/backend/utils/adt/json.c b/src/backend/utils/adt/json.c
index 7934cf62fb..a266f60ff3 100644
--- a/src/backend/utils/adt/json.c
+++ b/src/backend/utils/adt/json.c
@@ -19,6 +19,7 @@
 #include "funcapi.h"
 #include "libpq/pqformat.h"
 #include "miscadmin.h"
+#include "port/simd.h"
 #include "utils/array.h"
 #include "utils/builtins.h"
 #include "utils/date.h"
@@ -1597,11 +1598,78 @@ escape_json_cstring(StringInfo buf, const char *str)
 void
 escape_json(StringInfo buf, const char *str, int len)
 {
+	int i = 0;
+	int copypos = 0;
+
+	Assert(len >= 0);
+
 	appendStringInfoCharMacro(buf, '"');
 
-	for (int i = 0; i < len; i++)
-		escape_json_char(buf, str[i]);
+	for (;;)
+	{
+		Vector8 chunk;
+		int		vlen;
+
+		/*
+		 * Figure out how many bytes to process using SIMD.  Round 'len' down
+		 * to the previous multiple of sizeof(Vector8), assuming that's a
+		 * power-of-2.
+		 */
+		vlen = len & (int) (~(sizeof(Vector8) - 1));
+
+		/*
+	 * To speed this up, try searching sizeof(Vector8) bytes at once for
+		 * special characters that we need to escape.  When we find one, we
+		 * fall out of this first loop and copy the parts we've vector
+		 * searched before processing the special-char vector byte-by-byte.
+		 * Once we're done with that, come back and try doing vector searching
+		 * again.  We'll also process the tail end of the string byte-by-byte.
+		 */
+		for (; i < vlen; i += sizeof(Vector8))
+		{
+			vector8_load(&chunk, (const uint8 *) &str[i]);
+
+			/*
+			 * Break on anything less than ' ' or if we find a '"' or '\\'.
+			 * Those need special handling.  That's done in the per-byte loop.
+			 */
+			if (vector8_has_le(chunk, (unsigned char) 0x1F) ||
+				vector8_has(chunk, (unsigned char) '"') ||
+				vector8_has(chunk, (unsigned char) '\\'))
+				break;
+		}
+
+		/*
+	 * Write to the destination up to the point that we've vector
+		 * searched so far.  Do this only when switching into per-byte mode
+		 * rather than once every sizeof(Vector8) bytes.
+		 */
+		if (copypos < i)
+		{
+			appendBinaryStringInfo(buf, &str[copypos], i - copypos);
+			copypos = i;
+		}
+
+		/*
+		 * Per-byte loop for Vector8s containing special chars and for
+		 * processing the tail of the string.
+		 */
+		for (int b = 0; b < sizeof(Vector8); b++)
+		{
+			/* check if we've finished */
+			if (i == len)
+				goto done;
+
+			Assert(i < len);
+
+			escape_json_char(buf, str[i++]);
+		}
+
+		copypos = i;
+		/* We're not done yet.  Try the SIMD search again */
+	}
 
+done:
 	appendStringInfoCharMacro(buf, '"');
 }
 
diff --git a/src/test/regress/expected/json.out b/src/test/regress/expected/json.out
index aa29bc597b..bfcc26c531 100644
--- a/src/test/regress/expected/json.out
+++ b/src/test/regress/expected/json.out
@@ -55,6 +55,50 @@ SELECT ('"'||repeat('.', 12)||'abc\n"')::json; -- OK, legal escapes
  "............abc\n"
 (1 row)
 
+-- Stress testing of JSON escape code
+CREATE TABLE json_escape (very_long_column_name_to_test_json_escape text);
+INSERT INTO json_escape SELECT repeat('a', a) FROM generate_series(0,33) a;
+-- Test various lengths of strings to validate SIMD processing to escape
+-- special chars in the JSON.
+SELECT row_to_json(j)::jsonb FROM json_escape j;
+                                    row_to_json                                     
+------------------------------------------------------------------------------------
+ {"very_long_column_name_to_test_json_escape": ""}
+ {"very_long_column_name_to_test_json_escape": "a"}
+ {"very_long_column_name_to_test_json_escape": "aa"}
+ {"very_long_column_name_to_test_json_escape": "aaa"}
+ {"very_long_column_name_to_test_json_escape": "aaaa"}
+ {"very_long_column_name_to_test_json_escape": "aaaaa"}
+ {"very_long_column_name_to_test_json_escape": "aaaaaa"}
+ {"very_long_column_name_to_test_json_escape": "aaaaaaa"}
+ {"very_long_column_name_to_test_json_escape": "aaaaaaaa"}
+ {"very_long_column_name_to_test_json_escape": "aaaaaaaaa"}
+ {"very_long_column_name_to_test_json_escape": "aaaaaaaaaa"}
+ {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaa"}
+ {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaa"}
+ {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaaa"}
+ {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaaaa"}
+ {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaaaaa"}
+ {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaaaaaa"}
+ {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaaaaaaa"}
+ {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaaaaaaaa"}
+ {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaaaaaaaaa"}
+ {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaaaaaaaaaa"}
+ {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaaaaaaaaaaa"}
+ {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaaaaaaaaaaaa"}
+ {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaaaaaaaaaaaaa"}
+ {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaaaaaaaaaaaaaa"}
+ {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaaaaaaaaaaaaaaa"}
+ {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaaaaaaaaaaaaaaaa"}
+ {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaaaaaaaaaaaaaaaaa"}
+ {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaaaaaaaaaaaaaaaaaa"}
+ {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaa"}
+ {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"}
+ {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"}
+ {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"}
+ {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"}
+(34 rows)
+
 -- see json_encoding test for input with unicode escapes
 -- Numbers.
 SELECT '1'::json;				-- OK
diff --git a/src/test/regress/sql/json.sql b/src/test/regress/sql/json.sql
index ec57dfe707..0e7ca2f5af 100644
--- a/src/test/regress/sql/json.sql
+++ b/src/test/regress/sql/json.sql
@@ -12,6 +12,14 @@ SELECT '"\v"'::json;			-- ERROR, not a valid JSON escape
 SELECT ('"'||repeat('.', 12)||'abc"')::json; -- OK
 SELECT ('"'||repeat('.', 12)||'abc\n"')::json; -- OK, legal escapes
 
+-- Stress testing of JSON escape code
+CREATE TABLE json_escape (very_long_column_name_to_test_json_escape text);
+INSERT INTO json_escape SELECT repeat('a', a) FROM generate_series(0,33) a;
+
+-- Test various lengths of strings to validate SIMD processing to escape
+-- special chars in the JSON.
+SELECT row_to_json(j)::jsonb FROM json_escape j;
+
 -- see json_encoding test for input with unicode escapes
 
 -- Numbers.
-- 
2.34.1

