Hi,

On Wed, 10 Dec 2025 at 01:13, Manni Wood <[email protected]> wrote:
>
> Bilal Yavuz (Nazir Bilal Yavuz?),

It is Nazir Bilal Yavuz. I changed some settings on my phone, and it
seems that this affected my mail account; hopefully it is fixed now.

> I did not get a chance to do any work on this today, but wanted to thank you 
> for finding my logic errors in counting special chars for CSV, and hacking on 
> my naive solution to make it faster. By attempting Andrew Dunstan's 
> suggestion, I got a better feel for the reality that the "housekeeping" code 
> produces a significant amount of overhead.

You are welcome! v4.1 has some problems with the in_quote case in the
SIMD handling code and with counting the cstate->chars_processed
variable. I fixed them in v4.2.

-- 
Regards,
Nazir Bilal Yavuz
Microsoft
From e4546b0612bd2fde6190a9ade6e60a1f08299184 Mon Sep 17 00:00:00 2001
From: Manni Wood <[email protected]>
Date: Fri, 5 Dec 2025 18:30:00 -0600
Subject: [PATCH v4.2 1/3] Speed up COPY FROM text/CSV parsing using SIMD

Authors: Shinya Kato <[email protected]>,
Nazir Bilal Yavuz <[email protected]>,
Ayoub Kazar <[email protected]>
Reviewers: Andrew Dunstan <[email protected]>
Discussion:
https://www.postgresql.org/message-id/flat/caozeursw8cnr6tpksjrstnpfhf4qyqqb4tnpxgge8n4e_v7...@mail.gmail.com
---
 src/backend/commands/copyfromparse.c | 73 ++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c
index 62afcd8fad1..673d6683a72 100644
--- a/src/backend/commands/copyfromparse.c
+++ b/src/backend/commands/copyfromparse.c
@@ -71,7 +71,9 @@
 #include "mb/pg_wchar.h"
 #include "miscadmin.h"
 #include "pgstat.h"
+#include "port/pg_bitutils.h"
 #include "port/pg_bswap.h"
+#include "port/simd.h"
 #include "utils/builtins.h"
 #include "utils/rel.h"
 
@@ -1255,6 +1257,14 @@ CopyReadLineText(CopyFromState cstate, bool is_csv)
 	char		quotec = '\0';
 	char		escapec = '\0';
 
+#ifndef USE_NO_SIMD
+	Vector8		nl = vector8_broadcast('\n');
+	Vector8		cr = vector8_broadcast('\r');
+	Vector8		bs = vector8_broadcast('\\');
+	Vector8		quote = vector8_broadcast(0);
+	Vector8		escape = vector8_broadcast(0);
+#endif
+
 	if (is_csv)
 	{
 		quotec = cstate->opts.quote[0];
@@ -1262,6 +1272,12 @@ CopyReadLineText(CopyFromState cstate, bool is_csv)
 		/* ignore special escape processing if it's the same as quotec */
 		if (quotec == escapec)
 			escapec = '\0';
+
+#ifndef USE_NO_SIMD
+		quote = vector8_broadcast(quotec);
+		if (quotec != escapec)
+			escape = vector8_broadcast(escapec);
+#endif
 	}
 
 	/*
@@ -1328,6 +1344,63 @@ CopyReadLineText(CopyFromState cstate, bool is_csv)
 			need_data = false;
 		}
 
+#ifndef USE_NO_SIMD
+
+		/*
+		 * Use SIMD instructions to efficiently scan the input buffer for
+		 * special characters (e.g., newline, carriage return, quote, and
+		 * escape). This is faster than byte-by-byte iteration, especially on
+		 * large buffers.
+		 *
+		 * We do not apply the SIMD fast path in either of the following
+		 * cases: - When the previously processed character was an escape
+		 * character (last_was_esc), since the next byte must be examined
+		 * sequentially. - The remaining buffer is smaller than one vector
+		 * width (sizeof(Vector8)); SIMD operates on fixed-size chunks.
+		 */
+		if (!last_was_esc && copy_buf_len - input_buf_ptr >= sizeof(Vector8))
+		{
+			Vector8		chunk;
+			Vector8		match = vector8_broadcast(0);
+			uint32		mask;
+
+			/* Load a chunk of data into a vector register */
+			vector8_load(&chunk, (const uint8 *) &copy_input_buf[input_buf_ptr]);
+
+			/* \n and \r are not special inside quotes */
+			if (!in_quote)
+				match = vector8_or(vector8_eq(chunk, nl), vector8_eq(chunk, cr));
+
+			if (is_csv)
+			{
+				match = vector8_or(match, vector8_eq(chunk, quote));
+				if (escapec != '\0')
+					match = vector8_or(match, vector8_eq(chunk, escape));
+			}
+			else
+				match = vector8_or(match, vector8_eq(chunk, bs));
+
+			/* Check if we found any special characters */
+			mask = vector8_highbit_mask(match);
+			if (mask != 0)
+			{
+				/*
+				 * Found a special character. Advance up to that point and let
+				 * the scalar code handle it.
+				 */
+				int			advance = pg_rightmost_one_pos32(mask);
+
+				input_buf_ptr += advance;
+			}
+			else
+			{
+				/* No special characters found, so skip the entire chunk */
+				input_buf_ptr += sizeof(Vector8);
+				continue;
+			}
+		}
+#endif
+
 		/* OK to fetch a character */
 		prev_raw_ptr = input_buf_ptr;
 		c = copy_input_buf[input_buf_ptr++];
-- 
2.51.0

From 92ac4ada1e4833f81ce30164b48868dc1ade102f Mon Sep 17 00:00:00 2001
From: Manni Wood <[email protected]>
Date: Fri, 5 Dec 2025 18:33:46 -0600
Subject: [PATCH v4.2 2/3] Speed up COPY FROM text/CSV parsing using SIMD

Authors: Shinya Kato <[email protected]>,
Nazir Bilal Yavuz <[email protected]>,
Ayoub Kazar <[email protected]>
Reviewers: Andrew Dunstan <[email protected]>
Discussion:
https://www.postgresql.org/message-id/flat/caozeursw8cnr6tpksjrstnpfhf4qyqqb4tnpxgge8n4e_v7...@mail.gmail.com
---
 src/include/commands/copyfrom_internal.h | 11 +++++++++
 src/backend/commands/copyfrom.c          |  3 +++
 src/backend/commands/copyfromparse.c     | 29 +++++++++++++++++++++++-
 3 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/src/include/commands/copyfrom_internal.h b/src/include/commands/copyfrom_internal.h
index c8b22af22d8..215215f909f 100644
--- a/src/include/commands/copyfrom_internal.h
+++ b/src/include/commands/copyfrom_internal.h
@@ -181,6 +181,17 @@ typedef struct CopyFromStateData
 #define RAW_BUF_BYTES(cstate) ((cstate)->raw_buf_len - (cstate)->raw_buf_index)
 
 	uint64		bytes_processed;	/* number of bytes processed so far */
+
+	/* the amount of bytes to read until checking if we should try simd */
+#define BYTES_PROCESSED_UNTIL_SIMD_CHECK 100000
+	/* the number of special chars read below which we use simd */
+#define SPECIAL_CHAR_SIMD_THRESHOLD 20000
+	uint64		special_chars_encountered;	/* number of special chars
+											 * encountered so far */
+	bool		checked_simd;	/* we read BYTES_PROCESSED_UNTIL_SIMD_CHECK
+								 * and checked if we should use SIMD on the
+								 * rest of the file */
+	bool		use_simd;		/* use simd to speed up copying */
 } CopyFromStateData;
 
 extern void ReceiveCopyBegin(CopyFromState cstate);
diff --git a/src/backend/commands/copyfrom.c b/src/backend/commands/copyfrom.c
index 2ae3d2ba86e..6711c0cfcdd 100644
--- a/src/backend/commands/copyfrom.c
+++ b/src/backend/commands/copyfrom.c
@@ -1720,6 +1720,9 @@ BeginCopyFrom(ParseState *pstate,
 	cstate->cur_attname = NULL;
 	cstate->cur_attval = NULL;
 	cstate->relname_only = false;
+	cstate->special_chars_encountered = 0;
+	cstate->checked_simd = false;
+	cstate->use_simd = false;
 
 	/*
 	 * Allocate buffers for the input pipeline.
diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c
index 673d6683a72..d548674c8ff 100644
--- a/src/backend/commands/copyfromparse.c
+++ b/src/backend/commands/copyfromparse.c
@@ -1346,6 +1346,28 @@ CopyReadLineText(CopyFromState cstate, bool is_csv)
 
 #ifndef USE_NO_SIMD
 
+		/*
+		 * Wait until we have read more than BYTES_PROCESSED_UNTIL_SIMD_CHECK.
+		 * cstate->bytes_processed will grow an unpredictable amount with each
+		 * call to this function, so just wait until we have crossed the
+		 * threshold.
+		 */
+		if (!cstate->checked_simd && cstate->bytes_processed > BYTES_PROCESSED_UNTIL_SIMD_CHECK)
+		{
+			cstate->checked_simd = true;
+
+			/*
+			 * If we have not read too many special characters
+			 * (SPECIAL_CHAR_SIMD_THRESHOLD) then start using SIMD to speed up
+			 * processing. This heuristic assumes that input does not vary too
+			 * much from line to line and that number of special characters
+			 * encountered in the first BYTES_PROCESSED_UNTIL_SIMD_CHECK are
+			 * indicitive of the whole file.
+			 */
+			if (cstate->special_chars_encountered < SPECIAL_CHAR_SIMD_THRESHOLD)
+				cstate->use_simd = true;
+		}
+
 		/*
 		 * Use SIMD instructions to efficiently scan the input buffer for
 		 * special characters (e.g., newline, carriage return, quote, and
@@ -1358,7 +1380,7 @@ CopyReadLineText(CopyFromState cstate, bool is_csv)
 		 * sequentially. - The remaining buffer is smaller than one vector
 		 * width (sizeof(Vector8)); SIMD operates on fixed-size chunks.
 		 */
-		if (!last_was_esc && copy_buf_len - input_buf_ptr >= sizeof(Vector8))
+		if (cstate->use_simd && !last_was_esc && copy_buf_len - input_buf_ptr >= sizeof(Vector8))
 		{
 			Vector8		chunk;
 			Vector8		match = vector8_broadcast(0);
@@ -1415,6 +1437,7 @@ CopyReadLineText(CopyFromState cstate, bool is_csv)
 			 */
 			if (c == '\r')
 			{
+				cstate->special_chars_encountered++;
 				IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
 			}
 
@@ -1446,6 +1469,7 @@ CopyReadLineText(CopyFromState cstate, bool is_csv)
 		/* Process \r */
 		if (c == '\r' && (!is_csv || !in_quote))
 		{
+			cstate->special_chars_encountered++;
 			/* Check for \r\n on first line, _and_ handle \r\n. */
 			if (cstate->eol_type == EOL_UNKNOWN ||
 				cstate->eol_type == EOL_CRNL)
@@ -1502,6 +1526,7 @@ CopyReadLineText(CopyFromState cstate, bool is_csv)
 		/* Process \n */
 		if (c == '\n' && (!is_csv || !in_quote))
 		{
+			cstate->special_chars_encountered++;
 			if (cstate->eol_type == EOL_CR || cstate->eol_type == EOL_CRNL)
 				ereport(ERROR,
 						(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
@@ -1524,6 +1549,8 @@ CopyReadLineText(CopyFromState cstate, bool is_csv)
 		{
 			char		c2;
 
+			cstate->special_chars_encountered++;
+
 			IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
 			IF_NEED_REFILL_AND_EOF_BREAK(0);
 
-- 
2.51.0

From 128574f80963c5b532c8aa7e7fad84a7e6e20874 Mon Sep 17 00:00:00 2001
From: Nazir Bilal Yavuz <[email protected]>
Date: Tue, 9 Dec 2025 15:32:10 +0300
Subject: [PATCH v4.2 3/3] Feedback / Changes

---
 src/include/commands/copyfrom_internal.h |  9 +--
 src/backend/commands/copyfrom.c          |  1 +
 src/backend/commands/copyfromparse.c     | 92 +++++++++++++++---------
 3 files changed, 65 insertions(+), 37 deletions(-)

diff --git a/src/include/commands/copyfrom_internal.h b/src/include/commands/copyfrom_internal.h
index 215215f909f..397720bf875 100644
--- a/src/include/commands/copyfrom_internal.h
+++ b/src/include/commands/copyfrom_internal.h
@@ -183,12 +183,13 @@ typedef struct CopyFromStateData
 	uint64		bytes_processed;	/* number of bytes processed so far */
 
 	/* the amount of bytes to read until checking if we should try simd */
-#define BYTES_PROCESSED_UNTIL_SIMD_CHECK 100000
-	/* the number of special chars read below which we use simd */
-#define SPECIAL_CHAR_SIMD_THRESHOLD 20000
+#define CHARS_PROCESSED_UNTIL_SIMD_CHECK 100000
+	/* the ratio of special chars read below which we use simd */
+#define SPECIAL_CHAR_SIMD_RATIO 4
+	uint64		chars_processed;
 	uint64		special_chars_encountered;	/* number of special chars
 											 * encountered so far */
-	bool		checked_simd;	/* we read BYTES_PROCESSED_UNTIL_SIMD_CHECK
+	bool		checked_simd;	/* we read CHARS_PROCESSED_UNTIL_SIMD_CHECK
 								 * and checked if we should use SIMD on the
 								 * rest of the file */
 	bool		use_simd;		/* use simd to speed up copying */
diff --git a/src/backend/commands/copyfrom.c b/src/backend/commands/copyfrom.c
index 6711c0cfcdd..2b77ba2556c 100644
--- a/src/backend/commands/copyfrom.c
+++ b/src/backend/commands/copyfrom.c
@@ -1720,6 +1720,7 @@ BeginCopyFrom(ParseState *pstate,
 	cstate->cur_attname = NULL;
 	cstate->cur_attval = NULL;
 	cstate->relname_only = false;
+	cstate->chars_processed = 0;
 	cstate->special_chars_encountered = 0;
 	cstate->checked_simd = false;
 	cstate->use_simd = false;
diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c
index d548674c8ff..720222152da 100644
--- a/src/backend/commands/copyfromparse.c
+++ b/src/backend/commands/copyfromparse.c
@@ -143,7 +143,7 @@ static const char BinarySignature[11] = "PGCOPY\n\377\r\n\0";
 
 /* non-export function prototypes */
 static bool CopyReadLine(CopyFromState cstate, bool is_csv);
-static bool CopyReadLineText(CopyFromState cstate, bool is_csv);
+static pg_attribute_always_inline bool CopyReadLineText(CopyFromState cstate, bool is_csv, bool use_simd);
 static int	CopyReadAttributesText(CopyFromState cstate);
 static int	CopyReadAttributesCSV(CopyFromState cstate);
 static Datum CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo,
@@ -1173,8 +1173,40 @@ CopyReadLine(CopyFromState cstate, bool is_csv)
 	resetStringInfo(&cstate->line_buf);
 	cstate->line_buf_valid = false;
 
-	/* Parse data and transfer into line_buf */
-	result = CopyReadLineText(cstate, is_csv);
+#ifndef USE_NO_SIMD
+
+	/*
+	 * Wait until we have read more than CHARS_PROCESSED_UNTIL_SIMD_CHECK.
+	 * cstate->chars_processed will grow an unpredictable amount with each
+	 * call to this function, so just wait until we have crossed the
+	 * threshold.
+	 */
+	if (!cstate->checked_simd && cstate->chars_processed > CHARS_PROCESSED_UNTIL_SIMD_CHECK)
+	{
+		cstate->checked_simd = true;
+
+		/*
+		 * If we have not read too many special characters then start using
+		 * SIMD to speed up processing. This heuristic assumes that input does
+		 * not vary too much from line to line and that number of special
+		 * characters encountered in the first
+		 * CHARS_PROCESSED_UNTIL_SIMD_CHECK are indicative of the whole file.
+		 */
+		if (cstate->chars_processed / SPECIAL_CHAR_SIMD_RATIO >= cstate->special_chars_encountered)
+		{
+			cstate->use_simd = true;
+		}
+	}
+#endif
+
+	/*
+	 * Parse data and transfer into line_buf. To get benefit from inlining,
+	 * call CopyReadLineText() with the constant boolean variables.
+	 */
+	if (cstate->use_simd)
+		result = CopyReadLineText(cstate, is_csv, true);
+	else
+		result = CopyReadLineText(cstate, is_csv, false);
 
 	if (result)
 	{
@@ -1241,11 +1273,12 @@ CopyReadLine(CopyFromState cstate, bool is_csv)
 /*
  * CopyReadLineText - inner loop of CopyReadLine for text mode
  */
-static bool
-CopyReadLineText(CopyFromState cstate, bool is_csv)
+static pg_attribute_always_inline bool
+CopyReadLineText(CopyFromState cstate, bool is_csv, bool use_simd)
 {
 	char	   *copy_input_buf;
 	int			input_buf_ptr;
+	int			start_input_buf_ptr;
 	int			copy_buf_len;
 	bool		need_data = false;
 	bool		hit_eof = false;
@@ -1309,6 +1342,7 @@ CopyReadLineText(CopyFromState cstate, bool is_csv)
 	input_buf_ptr = cstate->input_buf_index;
 	copy_buf_len = cstate->input_buf_len;
 
+	start_input_buf_ptr = input_buf_ptr;
 	for (;;)
 	{
 		int			prev_raw_ptr;
@@ -1327,9 +1361,11 @@ CopyReadLineText(CopyFromState cstate, bool is_csv)
 			REFILL_LINEBUF;
 
 			CopyLoadInputBuf(cstate);
+			cstate->chars_processed += (input_buf_ptr - start_input_buf_ptr);
 			/* update our local variables */
 			hit_eof = cstate->input_reached_eof;
 			input_buf_ptr = cstate->input_buf_index;
+			start_input_buf_ptr = input_buf_ptr;
 			copy_buf_len = cstate->input_buf_len;
 
 			/*
@@ -1346,28 +1382,6 @@ CopyReadLineText(CopyFromState cstate, bool is_csv)
 
 #ifndef USE_NO_SIMD
 
-		/*
-		 * Wait until we have read more than BYTES_PROCESSED_UNTIL_SIMD_CHECK.
-		 * cstate->bytes_processed will grow an unpredictable amount with each
-		 * call to this function, so just wait until we have crossed the
-		 * threshold.
-		 */
-		if (!cstate->checked_simd && cstate->bytes_processed > BYTES_PROCESSED_UNTIL_SIMD_CHECK)
-		{
-			cstate->checked_simd = true;
-
-			/*
-			 * If we have not read too many special characters
-			 * (SPECIAL_CHAR_SIMD_THRESHOLD) then start using SIMD to speed up
-			 * processing. This heuristic assumes that input does not vary too
-			 * much from line to line and that number of special characters
-			 * encountered in the first BYTES_PROCESSED_UNTIL_SIMD_CHECK are
-			 * indicitive of the whole file.
-			 */
-			if (cstate->special_chars_encountered < SPECIAL_CHAR_SIMD_THRESHOLD)
-				cstate->use_simd = true;
-		}
-
 		/*
 		 * Use SIMD instructions to efficiently scan the input buffer for
 		 * special characters (e.g., newline, carriage return, quote, and
@@ -1380,7 +1394,7 @@ CopyReadLineText(CopyFromState cstate, bool is_csv)
 		 * sequentially. - The remaining buffer is smaller than one vector
 		 * width (sizeof(Vector8)); SIMD operates on fixed-size chunks.
 		 */
-		if (cstate->use_simd && !last_was_esc && copy_buf_len - input_buf_ptr >= sizeof(Vector8))
+		if (use_simd && !last_was_esc && copy_buf_len - input_buf_ptr >= sizeof(Vector8))
 		{
 			Vector8		chunk;
 			Vector8		match = vector8_broadcast(0);
@@ -1427,6 +1441,21 @@ CopyReadLineText(CopyFromState cstate, bool is_csv)
 		prev_raw_ptr = input_buf_ptr;
 		c = copy_input_buf[input_buf_ptr++];
 
+		/* Use this calculation to decide whether to use SIMD later */
+		if (!use_simd && unlikely(!cstate->checked_simd))
+		{
+			if (is_csv)
+			{
+				if (c == '\r' || c == '\n' || c == quotec || c == escapec)
+					cstate->special_chars_encountered++;
+			}
+			else
+			{
+				if (c == '\r' || c == '\n' || c == '\\')
+					cstate->special_chars_encountered++;
+			}
+		}
+
 		if (is_csv)
 		{
 			/*
@@ -1437,7 +1466,6 @@ CopyReadLineText(CopyFromState cstate, bool is_csv)
 			 */
 			if (c == '\r')
 			{
-				cstate->special_chars_encountered++;
 				IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
 			}
 
@@ -1469,7 +1497,6 @@ CopyReadLineText(CopyFromState cstate, bool is_csv)
 		/* Process \r */
 		if (c == '\r' && (!is_csv || !in_quote))
 		{
-			cstate->special_chars_encountered++;
 			/* Check for \r\n on first line, _and_ handle \r\n. */
 			if (cstate->eol_type == EOL_UNKNOWN ||
 				cstate->eol_type == EOL_CRNL)
@@ -1526,7 +1553,6 @@ CopyReadLineText(CopyFromState cstate, bool is_csv)
 		/* Process \n */
 		if (c == '\n' && (!is_csv || !in_quote))
 		{
-			cstate->special_chars_encountered++;
 			if (cstate->eol_type == EOL_CR || cstate->eol_type == EOL_CRNL)
 				ereport(ERROR,
 						(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
@@ -1549,8 +1575,6 @@ CopyReadLineText(CopyFromState cstate, bool is_csv)
 		{
 			char		c2;
 
-			cstate->special_chars_encountered++;
-
 			IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
 			IF_NEED_REFILL_AND_EOF_BREAK(0);
 
@@ -1635,6 +1659,8 @@ CopyReadLineText(CopyFromState cstate, bool is_csv)
 	 */
 	REFILL_LINEBUF;
 
+	cstate->chars_processed += (input_buf_ptr - start_input_buf_ptr);
+
 	return result;
 }
 
-- 
2.51.0

Reply via email to