Hi, On Wed, 10 Dec 2025 at 01:13, Manni Wood <[email protected]> wrote: > > Bilal Yavuz (Nazir Bilal Yavuz?),
It is Nazir Bilal Yavuz. I changed some settings on my phone and it seems that this affected my mail account; hopefully it is fixed now. > I did not get a chance to do any work on this today, but wanted to thank you > for finding my logic errors in counting special chars for CSV, and hacking on > my naive solution to make it faster. By attempting Andrew Dunstan's > suggestion, I got a better feel for the reality that the "housekeeping" code > produces a significant amount of overhead. You are welcome! v4.1 had some problems with the in_quote case in the SIMD handling code and with how the cstate->chars_processed variable was counted. I fixed them in v4.2. -- Regards, Nazir Bilal Yavuz Microsoft
From e4546b0612bd2fde6190a9ade6e60a1f08299184 Mon Sep 17 00:00:00 2001 From: Manni Wood <[email protected]> Date: Fri, 5 Dec 2025 18:30:00 -0600 Subject: [PATCH v4.2 1/3] Speed up COPY FROM text/CSV parsing using SIMD Authors: Shinya Kato <[email protected]>, Nazir Bilal Yavuz <[email protected]>, Ayoub Kazar <[email protected]> Reviewers: Andrew Dunstan <[email protected]> Discussion: https://www.postgresql.org/message-id/flat/caozeursw8cnr6tpksjrstnpfhf4qyqqb4tnpxgge8n4e_v7...@mail.gmail.com --- src/backend/commands/copyfromparse.c | 73 ++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c index 62afcd8fad1..673d6683a72 100644 --- a/src/backend/commands/copyfromparse.c +++ b/src/backend/commands/copyfromparse.c @@ -71,7 +71,9 @@ #include "mb/pg_wchar.h" #include "miscadmin.h" #include "pgstat.h" +#include "port/pg_bitutils.h" #include "port/pg_bswap.h" +#include "port/simd.h" #include "utils/builtins.h" #include "utils/rel.h" @@ -1255,6 +1257,14 @@ CopyReadLineText(CopyFromState cstate, bool is_csv) char quotec = '\0'; char escapec = '\0'; +#ifndef USE_NO_SIMD + Vector8 nl = vector8_broadcast('\n'); + Vector8 cr = vector8_broadcast('\r'); + Vector8 bs = vector8_broadcast('\\'); + Vector8 quote = vector8_broadcast(0); + Vector8 escape = vector8_broadcast(0); +#endif + if (is_csv) { quotec = cstate->opts.quote[0]; @@ -1262,6 +1272,12 @@ CopyReadLineText(CopyFromState cstate, bool is_csv) /* ignore special escape processing if it's the same as quotec */ if (quotec == escapec) escapec = '\0'; + +#ifndef USE_NO_SIMD + quote = vector8_broadcast(quotec); + if (quotec != escapec) + escape = vector8_broadcast(escapec); +#endif } /* @@ -1328,6 +1344,63 @@ CopyReadLineText(CopyFromState cstate, bool is_csv) need_data = false; } +#ifndef USE_NO_SIMD + + /* + * Use SIMD instructions to efficiently scan the input buffer for + * special characters (e.g., newline, carriage return, 
quote, and + * escape). This is faster than byte-by-byte iteration, especially on + * large buffers. + * + * We do not apply the SIMD fast path in either of the following + * cases: - When the previously processed character was an escape + * character (last_was_esc), since the next byte must be examined + * sequentially. - The remaining buffer is smaller than one vector + * width (sizeof(Vector8)); SIMD operates on fixed-size chunks. + */ + if (!last_was_esc && copy_buf_len - input_buf_ptr >= sizeof(Vector8)) + { + Vector8 chunk; + Vector8 match = vector8_broadcast(0); + uint32 mask; + + /* Load a chunk of data into a vector register */ + vector8_load(&chunk, (const uint8 *) &copy_input_buf[input_buf_ptr]); + + /* \n and \r are not special inside quotes */ + if (!in_quote) + match = vector8_or(vector8_eq(chunk, nl), vector8_eq(chunk, cr)); + + if (is_csv) + { + match = vector8_or(match, vector8_eq(chunk, quote)); + if (escapec != '\0') + match = vector8_or(match, vector8_eq(chunk, escape)); + } + else + match = vector8_or(match, vector8_eq(chunk, bs)); + + /* Check if we found any special characters */ + mask = vector8_highbit_mask(match); + if (mask != 0) + { + /* + * Found a special character. Advance up to that point and let + * the scalar code handle it. + */ + int advance = pg_rightmost_one_pos32(mask); + + input_buf_ptr += advance; + } + else + { + /* No special characters found, so skip the entire chunk */ + input_buf_ptr += sizeof(Vector8); + continue; + } + } +#endif + /* OK to fetch a character */ prev_raw_ptr = input_buf_ptr; c = copy_input_buf[input_buf_ptr++]; -- 2.51.0
From 92ac4ada1e4833f81ce30164b48868dc1ade102f Mon Sep 17 00:00:00 2001 From: Manni Wood <[email protected]> Date: Fri, 5 Dec 2025 18:33:46 -0600 Subject: [PATCH v4.2 2/3] Speed up COPY FROM text/CSV parsing using SIMD Authors: Shinya Kato <[email protected]>, Nazir Bilal Yavuz <[email protected]>, Ayoub Kazar <[email protected]> Reviewers: Andrew Dunstan <[email protected]> Discussion: https://www.postgresql.org/message-id/flat/caozeursw8cnr6tpksjrstnpfhf4qyqqb4tnpxgge8n4e_v7...@mail.gmail.com --- src/include/commands/copyfrom_internal.h | 11 +++++++++ src/backend/commands/copyfrom.c | 3 +++ src/backend/commands/copyfromparse.c | 29 +++++++++++++++++++++++- 3 files changed, 42 insertions(+), 1 deletion(-) diff --git a/src/include/commands/copyfrom_internal.h b/src/include/commands/copyfrom_internal.h index c8b22af22d8..215215f909f 100644 --- a/src/include/commands/copyfrom_internal.h +++ b/src/include/commands/copyfrom_internal.h @@ -181,6 +181,17 @@ typedef struct CopyFromStateData #define RAW_BUF_BYTES(cstate) ((cstate)->raw_buf_len - (cstate)->raw_buf_index) uint64 bytes_processed; /* number of bytes processed so far */ + + /* the amount of bytes to read until checking if we should try simd */ +#define BYTES_PROCESSED_UNTIL_SIMD_CHECK 100000 + /* the number of special chars read below which we use simd */ +#define SPECIAL_CHAR_SIMD_THRESHOLD 20000 + uint64 special_chars_encountered; /* number of special chars + * encountered so far */ + bool checked_simd; /* we read BYTES_PROCESSED_UNTIL_SIMD_CHECK + * and checked if we should use SIMD on the + * rest of the file */ + bool use_simd; /* use simd to speed up copying */ } CopyFromStateData; extern void ReceiveCopyBegin(CopyFromState cstate); diff --git a/src/backend/commands/copyfrom.c b/src/backend/commands/copyfrom.c index 2ae3d2ba86e..6711c0cfcdd 100644 --- a/src/backend/commands/copyfrom.c +++ b/src/backend/commands/copyfrom.c @@ -1720,6 +1720,9 @@ BeginCopyFrom(ParseState *pstate, cstate->cur_attname = NULL; 
cstate->cur_attval = NULL; cstate->relname_only = false; + cstate->special_chars_encountered = 0; + cstate->checked_simd = false; + cstate->use_simd = false; /* * Allocate buffers for the input pipeline. diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c index 673d6683a72..d548674c8ff 100644 --- a/src/backend/commands/copyfromparse.c +++ b/src/backend/commands/copyfromparse.c @@ -1346,6 +1346,28 @@ CopyReadLineText(CopyFromState cstate, bool is_csv) #ifndef USE_NO_SIMD + /* + * Wait until we have read more than BYTES_PROCESSED_UNTIL_SIMD_CHECK. + * cstate->bytes_processed will grow an unpredictable amount with each + * call to this function, so just wait until we have crossed the + * threshold. + */ + if (!cstate->checked_simd && cstate->bytes_processed > BYTES_PROCESSED_UNTIL_SIMD_CHECK) + { + cstate->checked_simd = true; + + /* + * If we have not read too many special characters + * (SPECIAL_CHAR_SIMD_THRESHOLD) then start using SIMD to speed up + * processing. This heuristic assumes that input does not vary too + * much from line to line and that number of special characters + * encountered in the first BYTES_PROCESSED_UNTIL_SIMD_CHECK are + * indicitive of the whole file. + */ + if (cstate->special_chars_encountered < SPECIAL_CHAR_SIMD_THRESHOLD) + cstate->use_simd = true; + } + /* * Use SIMD instructions to efficiently scan the input buffer for * special characters (e.g., newline, carriage return, quote, and @@ -1358,7 +1380,7 @@ CopyReadLineText(CopyFromState cstate, bool is_csv) * sequentially. - The remaining buffer is smaller than one vector * width (sizeof(Vector8)); SIMD operates on fixed-size chunks. 
*/ - if (!last_was_esc && copy_buf_len - input_buf_ptr >= sizeof(Vector8)) + if (cstate->use_simd && !last_was_esc && copy_buf_len - input_buf_ptr >= sizeof(Vector8)) { Vector8 chunk; Vector8 match = vector8_broadcast(0); @@ -1415,6 +1437,7 @@ CopyReadLineText(CopyFromState cstate, bool is_csv) */ if (c == '\r') { + cstate->special_chars_encountered++; IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0); } @@ -1446,6 +1469,7 @@ CopyReadLineText(CopyFromState cstate, bool is_csv) /* Process \r */ if (c == '\r' && (!is_csv || !in_quote)) { + cstate->special_chars_encountered++; /* Check for \r\n on first line, _and_ handle \r\n. */ if (cstate->eol_type == EOL_UNKNOWN || cstate->eol_type == EOL_CRNL) @@ -1502,6 +1526,7 @@ CopyReadLineText(CopyFromState cstate, bool is_csv) /* Process \n */ if (c == '\n' && (!is_csv || !in_quote)) { + cstate->special_chars_encountered++; if (cstate->eol_type == EOL_CR || cstate->eol_type == EOL_CRNL) ereport(ERROR, (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), @@ -1524,6 +1549,8 @@ CopyReadLineText(CopyFromState cstate, bool is_csv) { char c2; + cstate->special_chars_encountered++; + IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0); IF_NEED_REFILL_AND_EOF_BREAK(0); -- 2.51.0
From 128574f80963c5b532c8aa7e7fad84a7e6e20874 Mon Sep 17 00:00:00 2001 From: Nazir Bilal Yavuz <[email protected]> Date: Tue, 9 Dec 2025 15:32:10 +0300 Subject: [PATCH v4.2 3/3] Feedback / Changes --- src/include/commands/copyfrom_internal.h | 9 +-- src/backend/commands/copyfrom.c | 1 + src/backend/commands/copyfromparse.c | 92 +++++++++++++++--------- 3 files changed, 65 insertions(+), 37 deletions(-) diff --git a/src/include/commands/copyfrom_internal.h b/src/include/commands/copyfrom_internal.h index 215215f909f..397720bf875 100644 --- a/src/include/commands/copyfrom_internal.h +++ b/src/include/commands/copyfrom_internal.h @@ -183,12 +183,13 @@ typedef struct CopyFromStateData uint64 bytes_processed; /* number of bytes processed so far */ /* the amount of bytes to read until checking if we should try simd */ -#define BYTES_PROCESSED_UNTIL_SIMD_CHECK 100000 - /* the number of special chars read below which we use simd */ -#define SPECIAL_CHAR_SIMD_THRESHOLD 20000 +#define CHARS_PROCESSED_UNTIL_SIMD_CHECK 100000 + /* the ratio of special chars read below which we use simd */ +#define SPECIAL_CHAR_SIMD_RATIO 4 + uint64 chars_processed; uint64 special_chars_encountered; /* number of special chars * encountered so far */ - bool checked_simd; /* we read BYTES_PROCESSED_UNTIL_SIMD_CHECK + bool checked_simd; /* we read CHARS_PROCESSED_UNTIL_SIMD_CHECK * and checked if we should use SIMD on the * rest of the file */ bool use_simd; /* use simd to speed up copying */ diff --git a/src/backend/commands/copyfrom.c b/src/backend/commands/copyfrom.c index 6711c0cfcdd..2b77ba2556c 100644 --- a/src/backend/commands/copyfrom.c +++ b/src/backend/commands/copyfrom.c @@ -1720,6 +1720,7 @@ BeginCopyFrom(ParseState *pstate, cstate->cur_attname = NULL; cstate->cur_attval = NULL; cstate->relname_only = false; + cstate->chars_processed = 0; cstate->special_chars_encountered = 0; cstate->checked_simd = false; cstate->use_simd = false; diff --git a/src/backend/commands/copyfromparse.c 
b/src/backend/commands/copyfromparse.c index d548674c8ff..720222152da 100644 --- a/src/backend/commands/copyfromparse.c +++ b/src/backend/commands/copyfromparse.c @@ -143,7 +143,7 @@ static const char BinarySignature[11] = "PGCOPY\n\377\r\n\0"; /* non-export function prototypes */ static bool CopyReadLine(CopyFromState cstate, bool is_csv); -static bool CopyReadLineText(CopyFromState cstate, bool is_csv); +static pg_attribute_always_inline bool CopyReadLineText(CopyFromState cstate, bool is_csv, bool use_simd); static int CopyReadAttributesText(CopyFromState cstate); static int CopyReadAttributesCSV(CopyFromState cstate); static Datum CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo, @@ -1173,8 +1173,40 @@ CopyReadLine(CopyFromState cstate, bool is_csv) resetStringInfo(&cstate->line_buf); cstate->line_buf_valid = false; - /* Parse data and transfer into line_buf */ - result = CopyReadLineText(cstate, is_csv); +#ifndef USE_NO_SIMD + + /* + * Wait until we have read more than CHARS_PROCESSED_UNTIL_SIMD_CHECK. + * cstate->chars_processed will grow an unpredictable amount with each + * call to this function, so just wait until we have crossed the + * threshold. + */ + if (!cstate->checked_simd && cstate->chars_processed > CHARS_PROCESSED_UNTIL_SIMD_CHECK) + { + cstate->checked_simd = true; + + /* + * If we have not read too many special characters then start using + * SIMD to speed up processing. This heuristic assumes that input does + * not vary too much from line to line and that number of special + * characters encountered in the first + * CHARS_PROCESSED_UNTIL_SIMD_CHECK are indicative of the whole file. + */ + if (cstate->chars_processed / SPECIAL_CHAR_SIMD_RATIO >= cstate->special_chars_encountered) + { + cstate->use_simd = true; + } + } +#endif + + /* + * Parse data and transfer into line_buf. To get benefit from inlining, + * call CopyReadLineText() with the constant boolean variables. 
+ */ + if (cstate->use_simd) + result = CopyReadLineText(cstate, is_csv, true); + else + result = CopyReadLineText(cstate, is_csv, false); if (result) { @@ -1241,11 +1273,12 @@ CopyReadLine(CopyFromState cstate, bool is_csv) /* * CopyReadLineText - inner loop of CopyReadLine for text mode */ -static bool -CopyReadLineText(CopyFromState cstate, bool is_csv) +static pg_attribute_always_inline bool +CopyReadLineText(CopyFromState cstate, bool is_csv, bool use_simd) { char *copy_input_buf; int input_buf_ptr; + int start_input_buf_ptr; int copy_buf_len; bool need_data = false; bool hit_eof = false; @@ -1309,6 +1342,7 @@ CopyReadLineText(CopyFromState cstate, bool is_csv) input_buf_ptr = cstate->input_buf_index; copy_buf_len = cstate->input_buf_len; + start_input_buf_ptr = input_buf_ptr; for (;;) { int prev_raw_ptr; @@ -1327,9 +1361,11 @@ CopyReadLineText(CopyFromState cstate, bool is_csv) REFILL_LINEBUF; CopyLoadInputBuf(cstate); + cstate->chars_processed += (input_buf_ptr - start_input_buf_ptr); /* update our local variables */ hit_eof = cstate->input_reached_eof; input_buf_ptr = cstate->input_buf_index; + start_input_buf_ptr = input_buf_ptr; copy_buf_len = cstate->input_buf_len; /* @@ -1346,28 +1382,6 @@ CopyReadLineText(CopyFromState cstate, bool is_csv) #ifndef USE_NO_SIMD - /* - * Wait until we have read more than BYTES_PROCESSED_UNTIL_SIMD_CHECK. - * cstate->bytes_processed will grow an unpredictable amount with each - * call to this function, so just wait until we have crossed the - * threshold. - */ - if (!cstate->checked_simd && cstate->bytes_processed > BYTES_PROCESSED_UNTIL_SIMD_CHECK) - { - cstate->checked_simd = true; - - /* - * If we have not read too many special characters - * (SPECIAL_CHAR_SIMD_THRESHOLD) then start using SIMD to speed up - * processing. 
This heuristic assumes that input does not vary too - * much from line to line and that number of special characters - * encountered in the first BYTES_PROCESSED_UNTIL_SIMD_CHECK are - * indicitive of the whole file. - */ - if (cstate->special_chars_encountered < SPECIAL_CHAR_SIMD_THRESHOLD) - cstate->use_simd = true; - } - /* * Use SIMD instructions to efficiently scan the input buffer for * special characters (e.g., newline, carriage return, quote, and @@ -1380,7 +1394,7 @@ CopyReadLineText(CopyFromState cstate, bool is_csv) * sequentially. - The remaining buffer is smaller than one vector * width (sizeof(Vector8)); SIMD operates on fixed-size chunks. */ - if (cstate->use_simd && !last_was_esc && copy_buf_len - input_buf_ptr >= sizeof(Vector8)) + if (use_simd && !last_was_esc && copy_buf_len - input_buf_ptr >= sizeof(Vector8)) { Vector8 chunk; Vector8 match = vector8_broadcast(0); @@ -1427,6 +1441,21 @@ CopyReadLineText(CopyFromState cstate, bool is_csv) prev_raw_ptr = input_buf_ptr; c = copy_input_buf[input_buf_ptr++]; + /* Use this calculation to decide whether to use SIMD later */ + if (!use_simd && unlikely(!cstate->checked_simd)) + { + if (is_csv) + { + if (c == '\r' || c == '\n' || c == quotec || c == escapec) + cstate->special_chars_encountered++; + } + else + { + if (c == '\r' || c == '\n' || c == '\\') + cstate->special_chars_encountered++; + } + } + if (is_csv) { /* @@ -1437,7 +1466,6 @@ CopyReadLineText(CopyFromState cstate, bool is_csv) */ if (c == '\r') { - cstate->special_chars_encountered++; IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0); } @@ -1469,7 +1497,6 @@ CopyReadLineText(CopyFromState cstate, bool is_csv) /* Process \r */ if (c == '\r' && (!is_csv || !in_quote)) { - cstate->special_chars_encountered++; /* Check for \r\n on first line, _and_ handle \r\n. 
*/ if (cstate->eol_type == EOL_UNKNOWN || cstate->eol_type == EOL_CRNL) @@ -1526,7 +1553,6 @@ CopyReadLineText(CopyFromState cstate, bool is_csv) /* Process \n */ if (c == '\n' && (!is_csv || !in_quote)) { - cstate->special_chars_encountered++; if (cstate->eol_type == EOL_CR || cstate->eol_type == EOL_CRNL) ereport(ERROR, (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), @@ -1549,8 +1575,6 @@ CopyReadLineText(CopyFromState cstate, bool is_csv) { char c2; - cstate->special_chars_encountered++; - IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0); IF_NEED_REFILL_AND_EOF_BREAK(0); @@ -1635,6 +1659,8 @@ CopyReadLineText(CopyFromState cstate, bool is_csv) */ REFILL_LINEBUF; + cstate->chars_processed += (input_buf_ptr - start_input_buf_ptr); + return result; } -- 2.51.0
