From f8d60abb5851ea713561da1ccababc8f94206ee3 Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@postgresql.org>
Date: Sun, 9 Feb 2025 12:25:56 +0700
Subject: [PATCH v3 4/4] Use shorter PCLMUL CRC32C implementation from corsix

---
 src/port/pg_crc32c_sse42.c | 165 ++++++++++++++-----------------------
 1 file changed, 62 insertions(+), 103 deletions(-)
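
Notes: the replacement loop uses corsix's folding scheme: each iteration
carry-less-multiplies the low and high 64-bit halves of every accumulator
by a constant pair and XORs in the next 16 bytes of input per accumulator.
A minimal standalone sketch of that single fold step is below; fold128()
is a hypothetical helper name used for illustration only, and the snippet
assumes GCC or Clang on x86-64 (the target attribute supplies SSE4.2 and
PCLMUL):

    #include <immintrin.h>

    /*
     * One 128-bit fold step, as performed per accumulator in the main loop:
     * multiply the low and high 64-bit halves of the accumulator by the
     * matching halves of the constant k, then XOR both products with the
     * next 16 bytes of input.
     */
    __attribute__((target("sse4.2,pclmul")))
    __m128i
    fold128(__m128i acc, __m128i k, const unsigned char *p)
    {
        __m128i lo = _mm_clmulepi64_si128(acc, k, 0x00);    /* acc.lo * k.lo */
        __m128i hi = _mm_clmulepi64_si128(acc, k, 0x11);    /* acc.hi * k.hi */

        return _mm_xor_si128(_mm_xor_si128(lo, hi),
                             _mm_loadu_si128((const __m128i *) p));
    }

This is equivalent to the per-accumulator sequence in the loop body
(y = clmul_lo(x, k); x = clmul_hi(x, k); y ^= load(buf); x ^= y).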

diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c
index 69f8917c7d..dec685d139 100644
--- a/src/port/pg_crc32c_sse42.c
+++ b/src/port/pg_crc32c_sse42.c
@@ -78,114 +78,73 @@ pg_comp_crc32c_sse42_tail(pg_crc32c crc, const void *data, size_t len)
  * https://chromium.googlesource.com/chromium/src/+/refs/heads/main/third_party/zlib/crc32_simd.c
  */
 
+#define clmul_lo(a, b) (_mm_clmulepi64_si128((a), (b), 0x00))
+#define clmul_hi(a, b) (_mm_clmulepi64_si128((a), (b), 0x11))
+
 pg_attribute_no_sanitize_alignment()
 pg_attribute_target("sse4.2,pclmul")
 pg_crc32c
-pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t length)
+pg_comp_crc32c_sse42(pg_crc32c crc0, const void *data, size_t length)
 {
-    ssize_t len = (ssize_t) length;
+    size_t len = length;
     const unsigned char *buf = data;
-    /*
-     * Definitions of the bit-reflected domain constants k1,k2,k3, etc and
-     * the CRC32+Barrett polynomials given at the end of the paper.
-     */
-    static const uint64_t pg_attribute_aligned(16) k1k2[] = { 0x740eef02, 0x9e4addf8 };
-    static const uint64_t pg_attribute_aligned(16) k3k4[] = { 0xf20c0dfe, 0x14cd00bd6 };
-    static const uint64_t pg_attribute_aligned(16) k5k0[] = { 0xdd45aab8, 0x000000000 };
-    static const uint64_t pg_attribute_aligned(16) poly[] = { 0x105ec76f1, 0xdea713f1 };
-    if (len >= 64) {
-        __m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
-        /*
-         * There's at least one block of 64.
-         */
-        x1 = _mm_loadu_si128((__m128i *)(buf + 0x00));
-        x2 = _mm_loadu_si128((__m128i *)(buf + 0x10));
-        x3 = _mm_loadu_si128((__m128i *)(buf + 0x20));
-        x4 = _mm_loadu_si128((__m128i *)(buf + 0x30));
-        x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc));
-        x0 = _mm_load_si128((__m128i *)k1k2);
-        buf += 64;
-        len -= 64;
-        /*
-         * Parallel fold blocks of 64, if any.
-         */
-        while (len >= 64)
-        {
-            x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
-            x6 = _mm_clmulepi64_si128(x2, x0, 0x00);
-            x7 = _mm_clmulepi64_si128(x3, x0, 0x00);
-            x8 = _mm_clmulepi64_si128(x4, x0, 0x00);
-            x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
-            x2 = _mm_clmulepi64_si128(x2, x0, 0x11);
-            x3 = _mm_clmulepi64_si128(x3, x0, 0x11);
-            x4 = _mm_clmulepi64_si128(x4, x0, 0x11);
-            y5 = _mm_loadu_si128((__m128i *)(buf + 0x00));
-            y6 = _mm_loadu_si128((__m128i *)(buf + 0x10));
-            y7 = _mm_loadu_si128((__m128i *)(buf + 0x20));
-            y8 = _mm_loadu_si128((__m128i *)(buf + 0x30));
-            x1 = _mm_xor_si128(x1, x5);
-            x2 = _mm_xor_si128(x2, x6);
-            x3 = _mm_xor_si128(x3, x7);
-            x4 = _mm_xor_si128(x4, x8);
-            x1 = _mm_xor_si128(x1, y5);
-            x2 = _mm_xor_si128(x2, y6);
-            x3 = _mm_xor_si128(x3, y7);
-            x4 = _mm_xor_si128(x4, y8);
-            buf += 64;
-            len -= 64;
-        }
-        /*
-         * Fold into 128-bits.
-         */
-        x0 = _mm_load_si128((__m128i *)k3k4);
-        x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
-        x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
-        x1 = _mm_xor_si128(x1, x2);
-        x1 = _mm_xor_si128(x1, x5);
-        x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
-        x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
-        x1 = _mm_xor_si128(x1, x3);
-        x1 = _mm_xor_si128(x1, x5);
-        x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
-        x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
-        x1 = _mm_xor_si128(x1, x4);
-        x1 = _mm_xor_si128(x1, x5);
-        /*
-         * Single fold blocks of 16, if any.
-         */
-        while (len >= 16)
-        {
-            x2 = _mm_loadu_si128((__m128i *)buf);
-            x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
-            x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
-            x1 = _mm_xor_si128(x1, x2);
-            x1 = _mm_xor_si128(x1, x5);
-            buf += 16;
-            len -= 16;
-        }
-        /*
-         * Fold 128-bits to 64-bits.
-         */
-        x2 = _mm_clmulepi64_si128(x1, x0, 0x10);
-        x3 = _mm_setr_epi32(~0, 0, ~0, 0);
-        x1 = _mm_srli_si128(x1, 8);
-        x1 = _mm_xor_si128(x1, x2);
-        x0 = _mm_loadl_epi64((__m128i*)k5k0);
-        x2 = _mm_srli_si128(x1, 4);
-        x1 = _mm_and_si128(x1, x3);
-        x1 = _mm_clmulepi64_si128(x1, x0, 0x00);
-        x1 = _mm_xor_si128(x1, x2);
-        /*
-         * Barret reduce to 32-bits.
-         */
-        x0 = _mm_load_si128((__m128i*)poly);
-        x2 = _mm_and_si128(x1, x3);
-        x2 = _mm_clmulepi64_si128(x2, x0, 0x10);
-        x2 = _mm_and_si128(x2, x3);
-        x2 = _mm_clmulepi64_si128(x2, x0, 0x00);
-        x1 = _mm_xor_si128(x1, x2);
-        crc = _mm_extract_epi32(x1, 1);
+
+    if (len >= 64) {
+        /* First vector chunk. */
+        __m128i x0 = _mm_loadu_si128((const __m128i*)buf), y0;
+        __m128i x1 = _mm_loadu_si128((const __m128i*)(buf + 16)), y1;
+        __m128i x2 = _mm_loadu_si128((const __m128i*)(buf + 32)), y2;
+        __m128i x3 = _mm_loadu_si128((const __m128i*)(buf + 48)), y3;
+        __m128i k;
+        k = _mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0);
+        x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0), x0);
+        buf += 64;
+        len -= 64;
+        /* Main loop. */
+        while (len >= 64) {
+            y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+            y1 = clmul_lo(x1, k), x1 = clmul_hi(x1, k);
+            y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
+            y3 = clmul_lo(x3, k), x3 = clmul_hi(x3, k);
+            y0 = _mm_xor_si128(y0, _mm_loadu_si128((const __m128i*)buf)), x0 = _mm_xor_si128(x0, y0);
+            y1 = _mm_xor_si128(y1, _mm_loadu_si128((const __m128i*)(buf + 16))), x1 = _mm_xor_si128(x1, y1);
+            y2 = _mm_xor_si128(y2, _mm_loadu_si128((const __m128i*)(buf + 32))), x2 = _mm_xor_si128(x2, y2);
+            y3 = _mm_xor_si128(y3, _mm_loadu_si128((const __m128i*)(buf + 48))), x3 = _mm_xor_si128(x3, y3);
+            buf += 64;
+            len -= 64;
+        }
+        /* Reduce x0 ... x3 to just x0. */
+        k = _mm_setr_epi32(0xf20c0dfe, 0, 0x493c7d27, 0);
+        y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+        y2 = clmul_lo(x2, k), x2 = clmul_hi(x2, k);
+        y0 = _mm_xor_si128(y0, x1), x0 = _mm_xor_si128(x0, y0);
+        y2 = _mm_xor_si128(y2, x3), x2 = _mm_xor_si128(x2, y2);
+        k = _mm_setr_epi32(0x3da6d0cb, 0, 0xba4fc28e, 0);
+        y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+        y0 = _mm_xor_si128(y0, x2), x0 = _mm_xor_si128(x0, y0);
+        /* Reduce 128 bits to 32 bits, and multiply by x^32. */
+        crc0 = _mm_crc32_u64(0, _mm_extract_epi64(x0, 0));
+        crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(x0, 1));
+    }
+    if (len >= 16) {
+        /* First vector chunk. */
+        __m128i x0 = _mm_loadu_si128((const __m128i*)buf), y0;
+        __m128i k;
+        k = _mm_setr_epi32(0xf20c0dfe, 0, 0x493c7d27, 0);
+        x0 = _mm_xor_si128(_mm_cvtsi32_si128(crc0), x0);
+        buf += 16;
+        len -= 16;
+        /* Main loop. */
+        while (len >= 16) {
+            y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+            y0 = _mm_xor_si128(y0, _mm_loadu_si128((const __m128i*)buf)), x0 = _mm_xor_si128(x0, y0);
+            buf += 16;
+            len -= 16;
+        }
+        /* Reduce 128 bits to 32 bits, and multiply by x^32. */
+        crc0 = _mm_crc32_u64(0, _mm_extract_epi64(x0, 0));
+        crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(x0, 1));
     }
 
-    return pg_comp_crc32c_sse42_tail(crc, buf, len);
+    return pg_comp_crc32c_sse42_tail(crc0, buf, len);
 }
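
Note on the final reduction: the two _mm_crc32_u64 calls collapse the
remaining 128-bit accumulator by running the hardware CRC32C instruction
over its two 64-bit halves, which gives the same result as feeding those
16 bytes through the instruction one byte at a time; the resulting 32-bit
value is then passed to pg_comp_crc32c_sse42_tail() for the last 0-15
bytes.  A small standalone check of that equivalence (illustrative only;
assumes GCC or Clang on little-endian x86-64, built with
"cc -O2 -msse4.2 demo.c"):

    #include <immintrin.h>
    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int
    main(void)
    {
        unsigned char buf[16];
        uint64_t    lo;
        uint64_t    hi;
        uint32_t    crc_wide;
        uint32_t    crc_bytes = 0;

        for (int i = 0; i < 16; i++)
            buf[i] = (unsigned char) (i * 37 + 1);

        /* two 64-bit steps, as in the reduction above */
        memcpy(&lo, buf, 8);
        memcpy(&hi, buf + 8, 8);
        crc_wide = (uint32_t) _mm_crc32_u64(_mm_crc32_u64(0, lo), hi);

        /* same bytes, one at a time */
        for (int i = 0; i < 16; i++)
            crc_bytes = _mm_crc32_u8(crc_bytes, buf[i]);

        assert(crc_wide == crc_bytes);
        printf("accumulated crc32c = 0x%08x\n", crc_wide);
        return 0;
    }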
-- 
2.43.0

