From ea97c1e30605473cd73e5f3ffe2ac966b9f0c180 Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@postgresql.org>
Date: Wed, 12 Feb 2025 15:27:16 +0700
Subject: [PATCH v10 4/5] Improve CRC32C performance on x86_64

The current SSE 4.2 implementation of CRC32C relies on the native
CRC32 instruction, which operates on 8 bytes at a time. We can get a
substantial speedup on longer inputs by using carryless multiplication
on SIMD registers, processing 64 bytes per loop iteration.
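
For reference, the shape of the technique (exponents elided; they depend
on the fold distance): if X is one 128-bit lane of CRC state and K holds
two precomputed constants of the form x^n mod P, then a pair of carryless
multiplications plus an XOR with the next 16 bytes of input folds X
forward:

    X' = clmul(X.lo, K.lo) XOR clmul(X.hi, K.hi) XOR next_16_bytes

Four such lanes run independently to cover 64 bytes per iteration, and
they are folded together before the final reduction.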

The PCLMULQDQ instruction has been widely available since 2011 (it is
almost as old as SSE 4.2), so this commit now requires it, in addition
to SSE 4.2, to build pg_crc32c_sse42.c.

The MIT-licensed implementation was generated with the "generate"
program from

https://github.com/corsix/fast-crc32/

Based on: "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ
Instruction" V. Gopal, E. Ozturk, et al., 2009

Author: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
Author: John Naylor <johncnaylorls@gmail.com>
Discussion: https://postgr.es/m/PH8PR11MB82869FF741DFA4E9A029FF13FBF72@PH8PR11MB8286.namprd11.prod.outlook.com
Signed-off-by: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
---
 src/include/port/pg_cpucap.h          |   2 +
 src/port/pg_cpucap.c                  |   1 +
 src/port/pg_cpucap_arm.c              |   6 ++
 src/port/pg_cpucap_x86.c              |  23 +++++
 src/port/pg_crc32c_sse42.c            | 123 ++++++++++++++++++++++++++
 src/test/regress/expected/strings.out |  24 +++++
 src/test/regress/sql/strings.sql      |   4 +
 7 files changed, 183 insertions(+)

diff --git a/src/include/port/pg_cpucap.h b/src/include/port/pg_cpucap.h
index 5e04213b21..af3fabfcff 100644
--- a/src/include/port/pg_cpucap.h
+++ b/src/include/port/pg_cpucap.h
@@ -18,11 +18,13 @@
 #define PGCPUCAP_POPCNT         (1 << 1)
 #define PGCPUCAP_VPOPCNT        (1 << 2)
 #define PGCPUCAP_CRC32C         (1 << 3)
+#define PGCPUCAP_CLMUL          (1 << 4)
 
 extern PGDLLIMPORT uint32 pg_cpucap;
 extern void pg_cpucap_initialize(void);
 
 /* arch-specific functions private to src/port */
 extern void pg_cpucap_crc32c(void);
+extern void pg_cpucap_clmul(void);
 
 #endif							/* PG_CPUCAP_H */
diff --git a/src/port/pg_cpucap.c b/src/port/pg_cpucap.c
index 88d7582702..301bd9fc2c 100644
--- a/src/port/pg_cpucap.c
+++ b/src/port/pg_cpucap.c
@@ -30,4 +30,5 @@ pg_cpucap_initialize(void)
 	pg_cpucap = PGCPUCAP_INIT;
 
 	pg_cpucap_crc32c();
+	pg_cpucap_clmul();
 }
diff --git a/src/port/pg_cpucap_arm.c b/src/port/pg_cpucap_arm.c
index 19e052fecf..e080a5a931 100644
--- a/src/port/pg_cpucap_arm.c
+++ b/src/port/pg_cpucap_arm.c
@@ -111,3 +111,9 @@ pg_cpucap_crc32c(void)
 	if (pg_crc32c_armv8_available())
 		pg_cpucap |= PGCPUCAP_CRC32C;
 }
+
+void
+pg_cpucap_clmul(void)
+{
+	/* WIP: does this even make sense? */
+}
diff --git a/src/port/pg_cpucap_x86.c b/src/port/pg_cpucap_x86.c
index 07462bd1d2..3a62a3a582 100644
--- a/src/port/pg_cpucap_x86.c
+++ b/src/port/pg_cpucap_x86.c
@@ -41,6 +41,22 @@ pg_sse42_available(void)
 	return (exx[2] & (1 << 20)) != 0;	/* SSE 4.2 */
 }
 
+static bool
+pg_pclmul_available(void)
+{
+	unsigned int exx[4] = {0, 0, 0, 0};
+
+#if defined(HAVE__GET_CPUID)
+	__get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUID)
+	__cpuid(exx, 1);
+#else
+#error cpuid instruction not available
+#endif
+
+	return (exx[2] & (1 << 1)) != 0;	/* PCLMUL */
+}
+
 /*
  * Check if hardware instructions for CRC computation are available.
  */
@@ -50,3 +66,10 @@ pg_cpucap_crc32c(void)
 	if (pg_sse42_available())
 		pg_cpucap |= PGCPUCAP_CRC32C;
 }
+
+void
+pg_cpucap_clmul(void)
+{
+	if (pg_pclmul_available())
+		pg_cpucap |= PGCPUCAP_CLMUL;
+}
diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c
index 22c2137df3..fc3cf0d088 100644
--- a/src/port/pg_crc32c_sse42.c
+++ b/src/port/pg_crc32c_sse42.c
@@ -15,9 +15,118 @@
 #include "c.h"
 
 #include <nmmintrin.h>
+#include <wmmintrin.h>
 
 #include "port/pg_crc32c.h"
 
+/* WIP: configure checks */
+#ifdef __x86_64__
+#define HAVE_PCLMUL_RUNTIME
+#endif
+
+/*
+ * WIP: Testing has shown that on Kaby Lake (2016) this algorithm needs two
+ * iterations of the main loop to be faster than using regular CRC
+ * instructions, but Tiger Lake (2020) is fine with a single iteration. Could
+ * use more testing between those years (on AMD as well).
+ */
+#define PCLMUL_THRESHOLD 128
+
+#ifdef HAVE_PCLMUL_RUNTIME
+
+/* Generated by https://github.com/corsix/fast-crc32/ using: */
+/* ./generate -i sse -p crc32c -a v4e */
+/* MIT licensed */
+
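+/*
+ * The imm8 argument of _mm_clmulepi64_si128 selects which 64-bit halves
+ * to multiply: 0x00 takes the low halves of both operands, and 17 (0x11)
+ * takes the high halves.
+ */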
+#define clmul_lo(a, b) (_mm_clmulepi64_si128((a), (b), 0))
+#define clmul_hi(a, b) (_mm_clmulepi64_si128((a), (b), 17))
+
+pg_attribute_target("sse4.2,pclmul")
+static pg_crc32c
+pg_comp_crc32c_pclmul(pg_crc32c crc, const void *data, size_t length)
+{
+	/* adjust names to match generated code */
+	pg_crc32c	crc0 = crc;
+	size_t		len = length;
+	const char *buf = data;
+
+	/*
+	 * This prolog is trying to avoid loads straddling cache lines, but it
+	 * doesn't seem worth it if we're trying to be fast on small inputs as
+	 * well.
+	 */
+#if 0
+	for (; len && ((uintptr_t) buf & 7); --len)
+	{
+		crc0 = _mm_crc32_u8(crc0, *buf++);
+	}
+	if (((uintptr_t) buf & 8) && len >= 8)
+	{
+		crc0 = _mm_crc32_u64(crc0, *(const uint64_t *) buf);
+		buf += 8;
+		len -= 8;
+	}
+#endif
+	if (len >= 64)
+	{
+		const char *end = buf + len;
+		const char *limit = buf + len - 64;
+
+		/* First vector chunk. */
+		__m128i		x0 = _mm_loadu_si128((const __m128i *) buf),
+					y0;
+		__m128i		x1 = _mm_loadu_si128((const __m128i *) (buf + 16)),
+					y1;
+		__m128i		x2 = _mm_loadu_si128((const __m128i *) (buf + 32)),
+					y2;
+		__m128i		x3 = _mm_loadu_si128((const __m128i *) (buf + 48)),
+					y3;
+		__m128i		k;
+
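+		/*
+		 * Fold constants: each main loop iteration folds the four 128-bit
+		 * lanes forward by 512 bits (one 64-byte stride). The constants
+		 * are precomputed powers of x modulo the CRC32C polynomial; see
+		 * the Gopal et al. paper for their derivation.
+		 */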
+		k = _mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0);
+		x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0), x0);
+		buf += 64;
+		/* Main loop. */
+		while (buf <= limit)
+		{
+			y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+			y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k);
+			y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
+			y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k);
+			y0 = _mm_xor_si128(y0, _mm_loadu_si128((const __m128i *) buf)), x0 = _mm_xor_si128(x0, y0);
+			y1 = _mm_xor_si128(y1, _mm_loadu_si128((const __m128i *) (buf + 16))), x1 = _mm_xor_si128(x1, y1);
+			y2 = _mm_xor_si128(y2, _mm_loadu_si128((const __m128i *) (buf + 32))), x2 = _mm_xor_si128(x2, y2);
+			y3 = _mm_xor_si128(y3, _mm_loadu_si128((const __m128i *) (buf + 48))), x3 = _mm_xor_si128(x3, y3);
+			buf += 64;
+		}
+
+		/* Reduce x0 ... x3 to just x0. */
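+		/*
+		 * Fold x0 onto x1 and x2 onto x3 (128-bit distance), then the
+		 * combined x0 onto x2 (256-bit distance).
+		 */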
+		k = _mm_setr_epi32(0xf20c0dfe, 0, 0x493c7d27, 0);
+		y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+		y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
+		y0 = _mm_xor_si128(y0, x1), x0 = _mm_xor_si128(x0, y0);
+		y2 = _mm_xor_si128(y2, x3), x2 = _mm_xor_si128(x2, y2);
+		k = _mm_setr_epi32(0x3da6d0cb, 0, 0xba4fc28e, 0);
+		y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+		y0 = _mm_xor_si128(y0, x2), x0 = _mm_xor_si128(x0, y0);
+
+		/* Reduce 128 bits to 32 bits, and multiply by x^32. */
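+		/*
+		 * The CRC32 instruction reduces modulo the polynomial as a side
+		 * effect, so no explicit Barrett reduction step is needed.
+		 */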
+		crc0 = _mm_crc32_u64(0, _mm_extract_epi64(x0, 0));
+		crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(x0, 1));
+		len = end - buf;
+	}
+	for (; len >= 8; buf += 8, len -= 8)
+	{
+		crc0 = _mm_crc32_u64(crc0, *(const uint64_t *) buf);
+	}
+	for (; len; --len)
+	{
+		crc0 = _mm_crc32_u8(crc0, *buf++);
+	}
+
+	return crc0;
+}
+
+#endif
+
 pg_attribute_no_sanitize_alignment()
 pg_attribute_target("sse4.2")
 pg_crc32c
@@ -26,6 +135,17 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
 	const unsigned char *p = data;
 	const unsigned char *pend = p + len;
 
+	/* XXX not for commit */
+	const pg_crc32c orig_crc PG_USED_FOR_ASSERTS_ONLY = crc;
+	const size_t orig_len PG_USED_FOR_ASSERTS_ONLY = len;
+
+#ifdef HAVE_PCLMUL_RUNTIME
+	if (len >= PCLMUL_THRESHOLD && (pg_cpucap & PGCPUCAP_CLMUL))
+	{
+		return pg_comp_crc32c_pclmul(crc, data, len);
+	}
+#endif
+
 	/*
 	 * Process eight bytes of data at a time.
 	 *
@@ -66,5 +186,8 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
 		p++;
 	}
 
+	/* XXX not for commit */
+	Assert(crc == pg_comp_crc32c_sb8(orig_crc, data, orig_len));
+
 	return crc;
 }
diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out
index b65bb2d536..662bd37ace 100644
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@@ -2282,6 +2282,30 @@ SELECT crc32c('The quick brown fox jumps over the lazy dog.');
  419469235
 (1 row)
 
+SELECT crc32c(repeat('A', 80)::bytea);
+   crc32c   
+------------
+ 3799127650
+(1 row)
+
+SELECT crc32c(repeat('A', 127)::bytea);
+  crc32c   
+-----------
+ 291820082
+(1 row)
+
+SELECT crc32c(repeat('A', 128)::bytea);
+  crc32c   
+-----------
+ 816091258
+(1 row)
+
+SELECT crc32c(repeat('A', 129)::bytea);
+   crc32c   
+------------
+ 4213642571
+(1 row)
+
 --
 -- encode/decode
 --
diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql
index 8e0f3a0e75..26f86dc92e 100644
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@@ -727,6 +727,10 @@ SELECT crc32('The quick brown fox jumps over the lazy dog.');
 
 SELECT crc32c('');
 SELECT crc32c('The quick brown fox jumps over the lazy dog.');
+SELECT crc32c(repeat('A', 80)::bytea);
+SELECT crc32c(repeat('A', 127)::bytea);
+SELECT crc32c(repeat('A', 128)::bytea);
+SELECT crc32c(repeat('A', 129)::bytea);
 
 --
 -- encode/decode
-- 
2.43.0

