From a00c3354b126e39ce057d910a3a040f96e1491d3 Mon Sep 17 00:00:00 2001
From: Neil Conway <neil@determined.ai>
Date: Sun, 2 Jun 2024 14:00:58 -0400
Subject: [PATCH v2 3/4] Optimize COPY TO in CSV format using SIMD

CopyAttributeOutCSV() does one or two byte-by-byte loops over the text of each
attribute, depending on whether quotation is required. Implementing these loops
using SIMD yields a significant speedup for long attribute values. For short
attribute values, performance is roughly unchanged.

We don't attempt to apply this optimization when encoding_embeds_ascii is
true, because the required bookkeeping would be complicated.
---
 src/backend/commands/copyto.c | 152 +++++++++++++++++++++++++++++++++-
 1 file changed, 149 insertions(+), 3 deletions(-)

diff --git a/src/backend/commands/copyto.c b/src/backend/commands/copyto.c
index cd2d7bb217..9114bb1c48 100644
--- a/src/backend/commands/copyto.c
+++ b/src/backend/commands/copyto.c
@@ -29,6 +29,7 @@
 #include "mb/pg_wchar.h"
 #include "miscadmin.h"
 #include "pgstat.h"
+#include "port/simd.h"
 #include "storage/fd.h"
 #include "tcop/tcopprot.h"
 #include "utils/lsyscache.h"
@@ -1127,6 +1128,144 @@ CopyAttributeOutText(CopyToState cstate, const char *string)
 	DUMPSOFAR();
 }
 
+/*
+ * Send text representation of one attribute, with conversion and CSV-style
+ * escaping.  This is significantly faster for wide attributes, assuming that
+ * characters requiring escaping are rare.
+ *
+ * This variant assumes that encoding_embeds_ascii is false.  This simplifies
+ * the implementation because we can look at arbitrary-sized chunks of bytes,
+ * without needing to go through the pg_encoding_mblen() machinery to ensure
+ * that multibyte characters don't cross chunk boundaries.  In principle we
+ * could combine vectorization with such encodings, but the bookkeeping
+ * required would be complicated.
+ */
+static void
+CopyAttributeOutCSVVector(CopyToState cstate, const char *ptr,
+						  bool use_quote)
+{
+	int			len = strlen(ptr);
+	int			vlen;
+	char		delimc = cstate->opts.delim[0];
+	char		quotec = cstate->opts.quote[0];
+	char		escapec = cstate->opts.escape[0];
+
+	/* length of the leading part made up of whole Vector8-sized chunks */
+	vlen = len & (int) (~(sizeof(Vector8) - 1));
+
+	/*
+	 * Make a preliminary pass to discover if it needs quoting
+	 */
+	if (!use_quote)
+	{
+		bool	single_attr = (list_length(cstate->attnumlist) == 1);
+
+		/*
+		 * Because '\.' can be a data value, quote it if it appears alone on a
+		 * line so it is not interpreted as the end-of-data marker.
+		 */
+		if (single_attr && strcmp(ptr, "\\.") == 0)
+			use_quote = true;
+		else
+		{
+			int		i;
+			Vector8 chunk;
+
+			for (i = 0; i < vlen; i += sizeof(Vector8))
+			{
+				vector8_load(&chunk, (const uint8 *) &ptr[i]);
+
+				if (vector8_has(chunk, (unsigned char) delimc) ||
+					vector8_has(chunk, (unsigned char) quotec) ||
+					vector8_has(chunk, (unsigned char) '\n') ||
+					vector8_has(chunk, (unsigned char) '\r'))
+				{
+					use_quote = true;
+					break;
+				}
+			}
+
+			/* Check the tail of the string (i == vlen if we get here) */
+			if (!use_quote)
+			{
+				for (; i < len; i++)
+				{
+					char c = ptr[i];
+
+					if (c == delimc || c == quotec || c == '\n' || c == '\r')
+					{
+						use_quote = true;
+						break;
+					}
+				}
+			}
+		}
+	}
+
+	if (use_quote)
+	{
+		int		i;
+		int		start_idx = 0;	/* first byte not yet sent */
+		Vector8 chunk;
+
+		CopySendChar(cstate, quotec);
+
+		for (i = 0; i < vlen; i += sizeof(Vector8))
+		{
+			vector8_load(&chunk, (const uint8 *) &ptr[i]);
+
+			if (vector8_has(chunk, (unsigned char) quotec) ||
+				vector8_has(chunk, (unsigned char) escapec))
+			{
+				/*
+				 * Chunk contains quotec or escapec, the only bytes escaped
+				 * inside quotes (delimc is not), so go byte-at-a-time
+				 */
+				for (int j = i; j < i + (int) sizeof(Vector8); j++)
+				{
+					char c = ptr[j];
+
+					if (c == quotec || c == escapec)
+					{
+						if (j > start_idx)
+							CopySendData(cstate, ptr + start_idx, j - start_idx);
+
+						CopySendChar(cstate, escapec);
+						start_idx = j;	/* we include char in next run */
+					}
+				}
+			}
+		}
+
+		/* Process the tail of the string */
+		for (; i < len; i++)
+		{
+			char c = ptr[i];
+
+			if (c == quotec || c == escapec)
+			{
+				if (i > start_idx)
+					CopySendData(cstate, ptr + start_idx, i - start_idx);
+
+				CopySendChar(cstate, escapec);
+				start_idx = i;	/* we include char in next run */
+			}
+		}
+
+		/* Send any remaining text */
+		if (start_idx < len)
+			CopySendData(cstate, ptr + start_idx, len - start_idx);
+
+		CopySendChar(cstate, quotec);
+	}
+	else
+	{
+		/* If it doesn't need quoting, we can just dump it as-is */
+		CopySendData(cstate, ptr, len);
+	}
+}
+
+
 /*
  * Send text representation of one attribute, with conversion and
  * CSV-style escaping
@@ -1141,7 +1280,6 @@ CopyAttributeOutCSV(CopyToState cstate, const char *string,
 	char		delimc = cstate->opts.delim[0];
 	char		quotec = cstate->opts.quote[0];
 	char		escapec = cstate->opts.escape[0];
-	bool		single_attr = (list_length(cstate->attnumlist) == 1);
 
 	/* force quoting if it matches null_print (before conversion!) */
 	if (!use_quote && strcmp(string, cstate->opts.null_print) == 0)
@@ -1152,11 +1290,19 @@ CopyAttributeOutCSV(CopyToState cstate, const char *string,
 	else
 		ptr = string;
 
+	if (!cstate->encoding_embeds_ascii)
+	{
+		CopyAttributeOutCSVVector(cstate, ptr, use_quote);
+		return;
+	}
+
 	/*
 	 * Make a preliminary pass to discover if it needs quoting
 	 */
 	if (!use_quote)
 	{
+		bool	single_attr = (list_length(cstate->attnumlist) == 1);
+
 		/*
 		 * Because '\.' can be a data value, quote it if it appears alone on a
 		 * line so it is not interpreted as the end-of-data marker.
@@ -1174,7 +1320,7 @@ CopyAttributeOutCSV(CopyToState cstate, const char *string,
 					use_quote = true;
 					break;
 				}
-				if (IS_HIGHBIT_SET(c) && cstate->encoding_embeds_ascii)
+				if (IS_HIGHBIT_SET(c))
 					tptr += pg_encoding_mblen(cstate->file_encoding, tptr);
 				else
 					tptr++;
@@ -1198,7 +1344,7 @@ CopyAttributeOutCSV(CopyToState cstate, const char *string,
 				CopySendChar(cstate, escapec);
 				start = ptr;	/* we include char in next run */
 			}
-			if (IS_HIGHBIT_SET(c) && cstate->encoding_embeds_ascii)
+			if (IS_HIGHBIT_SET(c))
 				ptr += pg_encoding_mblen(cstate->file_encoding, ptr);
 			else
 				ptr++;
-- 
2.39.3 (Apple Git-146)

