From c28dcbcfc3dd105f622dc5962a278e3533d2bd7a Mon Sep 17 00:00:00 2001
From: Sam Russell <sam.h.russell@gmail.com>
Date: Tue, 26 Nov 2024 15:58:09 +0100
Subject: [PATCH] crc: Add PCLMUL implementation

* lib/crc-x86_64-pclmul.c: Implement CRC32 with PCLMUL intrinsics.
* lib/crc.c: Use the PCLMUL implementation if available.
* lib/crc.h: Add new PCLMUL function.
* m4/crc.m4: Detect PCLMUL and build accordingly.
* modules/crc: Include crc-x86_64-pclmul if we are building it.
* modules/crc-x86_64-pclmul: New module for crc-x86_64-pclmul.
---
 ChangeLog                 |  11 +++
 lib/crc-x86_64-pclmul.c   | 175 ++++++++++++++++++++++++++++++++++++++
 lib/crc.c                 |  35 +++++++-
 lib/crc.h                 |   5 ++
 m4/crc.m4                 |  34 ++++++++
 modules/crc               |   2 +
 modules/crc-x86_64-pclmul |  26 ++++++
 7 files changed, 287 insertions(+), 1 deletion(-)
 create mode 100644 lib/crc-x86_64-pclmul.c
 create mode 100644 modules/crc-x86_64-pclmul

diff --git a/ChangeLog b/ChangeLog
index 9d412f9cbc..f74ec652c0 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,14 @@
+2024-11-26  Sam Russell  <sam.h.russell@gmail.com>
+
+    crc: Add PCLMUL implementation
+
+    * lib/crc-x86_64-pclmul.c: Implement CRC32 with PCLMUL intrinsics.
+    * lib/crc.c: Use the PCLMUL implementation if available.
+    * lib/crc.h: Add new PCLMUL function.
+    * m4/crc.m4: Detect PCLMUL and build accordingly.
+    * modules/crc: Include crc-x86_64-pclmul if we are building it.
+    * modules/crc-x86_64-pclmul: New module for crc-x86_64-pclmul.
+
 2024-11-20  Paul Eggert  <eggert@cs.ucla.edu>
 
 	openat: don’t close (-1)
diff --git a/lib/crc-x86_64-pclmul.c b/lib/crc-x86_64-pclmul.c
new file mode 100644
index 0000000000..c6e7a6f12b
--- /dev/null
+++ b/lib/crc-x86_64-pclmul.c
@@ -0,0 +1,175 @@
+#include <config.h>
+
+#include "x86intrin.h"
+#include "crc.h"
+
+#include <string.h>
+
+#pragma GCC target("pclmul,avx")
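+/* Calculate the CRC32 of the LEN bytes at BUF, without the final XOR,
+   starting from CRC.  This requires a CPU with the pclmul and avx
+   extensions and LEN >= 128; crc.c only dispatches here when both hold.  */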
+uint32_t
+crc32_update_no_xor_pclmul (uint32_t crc, const void *buf, size_t len)
+  {
+    const __m128i* data = buf;
+    __m128i* datarw;
+    size_t bytes_remaining = len;
+    __m128i in256[4] = {0};
+    __m128i xor_crc;
+    __m128i in1 = {0};
+    __m128i in2 = {0};
+    __m128i in3 = {0};
+    __m128i in4 = {0};
+    __m128i in5 = {0};
+    __m128i in6 = {0};
+    __m128i in7 = {0};
+    __m128i in8 = {0};
+    __m128i final_buf[12] = {0};
+    __m128i fold_high;
+    __m128i fold_low;
+
+    /* These constants are calculated as x^(T-1) mod P(x),
+       e.g. 0x8F352D95 = x^(544-1) mod P(x).
+       This method is described in "Fast CRC Computation for Generic
+       Polynomials Using PCLMULQDQ Instruction" (Gopal et al.).  */
+    __m128i shift544_shift480 = _mm_set_epi64x(0x1D9513D7, 0x8F352D95);
+    __m128i shift160_shift96 = _mm_set_epi64x(0xCCAA009E, 0xAE689191);
+    __m128i shift96_shift64 = _mm_set_epi64x(0xB8BC6765, 0xCCAA009E);
+    /* The low QWORD is mu, defined as floor(x^64 / P(x)), and the high
+       QWORD is P(x), the (bit-reflected) generator polynomial of this
+       CRC32; both are needed for the final Barrett reduction.  */
+    __m128i mu_poly = _mm_set_epi64x(0x1DB710641, 0x1F7011641);
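+    /* Folding works because XORing D * (x^T mod P(x)) onto the data T bits
+       further along does not change the value modulo P(x), so a long
+       buffer can be collapsed with a few carry-less multiplies per block;
+       the bit-reflected convention is why the exponents above are T-1.  */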
+
+    if (len == 0)
+      return crc;
+
+    if (bytes_remaining >= 128)
+      {
+        /* Here we fold 4x 128-bit words forward by 512 bits */
+        /* First load up our initial state */
+        in1 = _mm_loadu_si128(data);
+        in2 = _mm_loadu_si128(data + 1);
+        in3 = _mm_loadu_si128(data + 2);
+        in4 = _mm_loadu_si128(data + 3);
+
+        /* Initialise with incoming CRC */
+        xor_crc = _mm_set_epi32 (0, 0, 0, crc);
+        in1 = _mm_xor_si128 (in1, xor_crc);
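+        /* XORing the running CRC onto the first four bytes is valid
+           because this "no final xor" CRC is linear: the register
+           contents simply become part of the message being folded.  */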
+
+        while (bytes_remaining >= 128)
+          {
+            /* Load up the second set of 512 bits */
+            in5 = _mm_loadu_si128(data + 4);
+            in6 = _mm_loadu_si128(data + 5);
+            in7 = _mm_loadu_si128(data + 6);
+            in8 = _mm_loadu_si128(data + 7);
+
+            /* We shift the high QWORD forward 544 bits and the
+               low QWORD 480 bits */
+            fold_high = _mm_clmulepi64_si128(in1, shift544_shift480, 0x11);
+            fold_low = _mm_clmulepi64_si128(in1, shift544_shift480, 0x00);
+            in1 = _mm_xor_si128(in5, fold_high);
+            in1 = _mm_xor_si128(in1, fold_low);
+            fold_high = _mm_clmulepi64_si128(in2, shift544_shift480, 0x11);
+            fold_low = _mm_clmulepi64_si128(in2, shift544_shift480, 0x00);
+            in2 = _mm_xor_si128(in6, fold_high);
+            in2 = _mm_xor_si128(in2, fold_low);
+            fold_high = _mm_clmulepi64_si128(in3, shift544_shift480, 0x11);
+            fold_low = _mm_clmulepi64_si128(in3, shift544_shift480, 0x00);
+            in3 = _mm_xor_si128(in7, fold_high);
+            in3 = _mm_xor_si128(in3, fold_low);
+            fold_high = _mm_clmulepi64_si128(in4, shift544_shift480, 0x11);
+            fold_low = _mm_clmulepi64_si128(in4, shift544_shift480, 0x00);
+            in4 = _mm_xor_si128(in8, fold_high);
+            in4 = _mm_xor_si128(in4, fold_low);
+
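+            /* Each iteration consumes 64 bytes of new input but keeps 64
+               bytes of folded state in in1..in4, so only 64 of the 128
+               bytes touched are retired here.  */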
+            bytes_remaining -= 64;
+            data += 4;
+          }
+
+        _mm_storeu_si128(final_buf, in1);
+        _mm_storeu_si128(final_buf + 1, in2);
+        _mm_storeu_si128(final_buf + 2, in3);
+        _mm_storeu_si128(final_buf + 3, in4);
+      }
+
+    /* From here on we work out of final_buf, which is writable: its first
+       64 bytes already hold the folded state in1..in4, so append the
+       unprocessed tail of (bytes_remaining - 64) bytes directly after.  */
+    memcpy(final_buf + 4, data + 4, bytes_remaining - 64);
+    datarw = final_buf;
+
+    while (bytes_remaining >= 32)
+      {
+        /* Do 128-bit folds as above */
+        in1 = _mm_loadu_si128(datarw);
+        in2 = _mm_loadu_si128(datarw + 1);
+
+        /* We shift the high QWORD forward 160 bits and the
+           low QWORD 96 bits */
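+        /* Each pass folds datarw[0] onto datarw[1] and stores the result
+           back, so 16 bytes are retired per iteration even though 32
+           bytes are loaded.  */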
+        fold_high = _mm_clmulepi64_si128(in1, shift160_shift96, 0x11);
+        fold_low = _mm_clmulepi64_si128(in1, shift160_shift96, 0x00);
+        in2 = _mm_xor_si128(in2, fold_high);
+        in2 = _mm_xor_si128(in2, fold_low);
+
+        _mm_storeu_si128(datarw + 1, in2);
+        bytes_remaining -= 16;
+        datarw += 1;
+      }
+
+    /* We have 16-31 bytes here.
+       If we have 17-31 bytes we do one more special-case 128-bit fold.
+       The padding trick works because we are effectively padding zeros on
+       the front, which with little-endian ordering just shifts the value
+       higher.  The same branch would also cover a tail of fewer than 16
+       bytes.  */
+
+    if (bytes_remaining != 16)
+      {
+        /* Pad the remainder and fold 128 bits.
+           We read in up to 32 bytes (256 bits) here.  This is inefficient,
+           so we only want to hit it once, at the very end of the data; if
+           the tail works out to exactly 16 bytes it is skipped entirely.  */
+
+        /* Read in at an offset so we get the shift for free */
+        memcpy(((char*) in256) + (32 - bytes_remaining), datarw, bytes_remaining);
+        in1 = _mm_loadu_si128(in256);
+        in2 = _mm_loadu_si128(in256 + 1);
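+        /* in1 now holds the (zero-padded) first 16 bytes of the tail and
+           in2 the last 16 bytes, so one more fold of in1 onto in2 leaves
+           a single 128-bit value.  */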
+
+        /* Now we fold in1 onto in2 */
+        shift160_shift96 = _mm_set_epi64x(0x0ccaa009e, 0x1751997d0);
+        fold_high = _mm_clmulepi64_si128(in1, shift160_shift96, 0x11);
+        fold_low = _mm_clmulepi64_si128(in1, shift160_shift96, 0x00);
+        in2 = _mm_xor_si128(in2, fold_high);
+        in1 = _mm_xor_si128(in2, fold_low);
+      }
+    else
+      in1 = _mm_loadu_si128(datarw);
+
+    /* We now have 16 bytes and fold as normal */
+
+    in2 = _mm_and_si128(_mm_srli_si128(in1, 8), _mm_set_epi64x(0, 0xffffffff));
+    in3 = _mm_and_si128(_mm_srli_si128(in1, 12), _mm_set_epi64x(0, 0xffffffff));
+    in1 = _mm_and_si128(in1, _mm_set_epi64x(0, 0xffffffffffffffff));
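+    /* in1 now holds bytes 0-7, in2 bytes 8-11 and in3 bytes 12-15 of the
+       remaining state, each zero-extended in its own register.  */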
+    /* Multiply first 64 bits against shift96 */
+    in1 = _mm_clmulepi64_si128(shift96_shift64, in1, 0x00);
+    /* First 32 bits go on in2 */
+    in2 = _mm_xor_si128(in2, _mm_and_si128(in1, _mm_set_epi64x(0, 0xffffffff)));
+    /* Next 64 bits go on in3 */
+    in3 = _mm_xor_si128(in3, _mm_srli_si128(in1, 4));
+    /* Then shift 64 bits from here */
+    in1 = _mm_clmulepi64_si128(shift96_shift64, in2, 0x01);
+    in1 = _mm_xor_si128(in1, in3);
+
+    /* This is the Barrett reduction: estimate the quotient by multiplying
+       the low 32 bits by mu = floor(x^64 / P(x)), multiply that estimate
+       by P(x), and XOR it back so that only the 32-bit remainder survives.  */
+    /* Take the bottom 32 bits */
+    in2 = _mm_and_si128(in1, _mm_set_epi64x(0, 0xffffffff));
+    /* Multiply by mu */
+    in2 = _mm_clmulepi64_si128(mu_poly, in2, 0x00);
+    /* Take the bottom 32 bits of the result */
+    in2 = _mm_and_si128(in2, _mm_set_epi64x(0, 0xffffffff));
+    /* Multiply by P(x) */
+    in2 = _mm_clmulepi64_si128(mu_poly, in2, 0x01);
+    /* XOR against input */
+    in1 = _mm_xor_si128(in1, in2);
+    /* Take bits 32-63 as the CRC */
+    in1 = _mm_srli_si128(in1, 4);
+    crc = _mm_cvtsi128_si32(in1);
+
+    return crc;
+  }
diff --git a/lib/crc.c b/lib/crc.c
index 53f26486a2..dde3169c59 100644
--- a/lib/crc.c
+++ b/lib/crc.c
@@ -23,7 +23,12 @@
 
 #include <string.h>
 
-#ifdef GL_CRC_SLICE_BY_8
+#ifdef GL_CRC_PCLMUL
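+/* Result of the runtime CPU check, computed once on first use.  */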
+static bool pclmul_enabled = false;
+static bool pclmul_checked = false;
+#endif
+
+#if GL_CRC_SLICE_BY_8
 #include "crc-sliceby8.h"
 
 /*
@@ -110,6 +115,20 @@ crc32_update_no_xor (uint32_t crc, const char *buf, size_t len)
 {
   size_t n, slice_alignment;
 
+#ifdef GL_CRC_PCLMUL
+  if (!pclmul_checked)
+    {
+      pclmul_enabled = (0 < __builtin_cpu_supports ("pclmul")
+                        && 0 < __builtin_cpu_supports ("avx"));
+      pclmul_checked = true;
+    }
+
+  /* The PCLMUL code folds 64 bytes per iteration and needs at least
+     128 bytes of input; shorter buffers use the generic code below.  */
+  if (pclmul_enabled && 128 <= len)
+    return crc32_update_no_xor_pclmul (crc, buf, len);
+#endif
+
   slice_alignment = (len & (-8));
 
   for (n = 0; n < slice_alignment; n += 8)
@@ -180,6 +199,20 @@ crc32_update_no_xor (uint32_t crc, const char *buf, size_t len)
 {
   size_t n;
 
+#ifdef GL_CRC_PCLMUL
+  if (!pclmul_checked)
+    {
+      pclmul_enabled = (0 < __builtin_cpu_supports ("pclmul")
+                        && 0 < __builtin_cpu_supports ("avx"));
+      pclmul_checked = true;
+    }
+
+  /* The PCLMUL code folds 64 bytes per iteration and needs at least
+     128 bytes of input; shorter buffers use the generic code below.  */
+  if (pclmul_enabled && 128 <= len)
+    return crc32_update_no_xor_pclmul (crc, buf, len);
+#endif
+
   for (n = 0; n < len; n++)
     crc = crc32_table[(crc ^ buf[n]) & 0xff] ^ (crc >> 8);
 
diff --git a/lib/crc.h b/lib/crc.h
index 2754126dbe..e9200f7f87 100644
--- a/lib/crc.h
+++ b/lib/crc.h
@@ -52,6 +52,11 @@ extern uint32_t
 crc32_update_no_xor (uint32_t crc, const char *buf, size_t len)
   _GL_ATTRIBUTE_PURE;
 
+#ifdef GL_CRC_PCLMUL
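+/* Like crc32_update_no_xor, but implemented with PCLMUL/AVX intrinsics.
+   crc.c only calls this when the CPU supports the pclmul and avx
+   extensions and the buffer is at least 128 bytes long.  */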
+extern uint32_t
+crc32_update_no_xor_pclmul (uint32_t crc, const void *buf, size_t len)
+  _GL_ATTRIBUTE_PURE;
+#endif
 
 #ifdef __cplusplus
 }
diff --git a/m4/crc.m4 b/m4/crc.m4
index f62987e78e..958981901a 100644
--- a/m4/crc.m4
+++ b/m4/crc.m4
@@ -23,3 +23,37 @@ AC_DEFUN([gl_CRC_SLICE_BY_8],
       AC_MSG_RESULT([no])
   fi
 ])
+
+AC_DEFUN([gl_CRC_PCLMUL],
+[
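+  dnl Probe only whether the compiler and linker support the PCLMUL/AVX
+  dnl intrinsics; whether they are actually used is decided at run time
+  dnl in lib/crc.c via __builtin_cpu_supports.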
+  ac_save_CFLAGS=$CFLAGS
+  CFLAGS="-mavx -mpclmul $CFLAGS"
+  AC_MSG_CHECKING([if pclmul intrinsic exists])
+  AC_CACHE_VAL([gl_cv_crc_pclmul],[
+  AC_LINK_IFELSE(
+    [AC_LANG_SOURCE([[
+      #include <x86intrin.h>
+
+      int
+      main (void)
+      {
+        __m128i a, b;
+        a = _mm_clmulepi64_si128 (a, b, 0x00);
+        a = _mm_shuffle_epi8 (a, b);
+        return __builtin_cpu_supports ("pclmul");
+      }
+    ]])
+    ],[
+      gl_cv_crc_pclmul=yes
+    ],[
+      gl_cv_crc_pclmul=no
+    ])])
+  AC_MSG_RESULT([$gl_cv_crc_pclmul])
+  if test $gl_cv_crc_pclmul = yes; then
+    AC_DEFINE([GL_CRC_PCLMUL], [1],
+              [CRC32 calculation by pclmul hardware instruction enabled])
+  fi
+  AM_CONDITIONAL([GL_CRC_PCLMUL],
+                [test $gl_cv_crc_pclmul = yes])
+  CFLAGS=$ac_save_CFLAGS
+])
diff --git a/modules/crc b/modules/crc
index c8dc462a27..66420e9791 100644
--- a/modules/crc
+++ b/modules/crc
@@ -11,9 +11,11 @@ m4/build-cc.m4
 Depends-on:
 stdint
 endian
+crc-x86_64-pclmul [test $gl_cv_crc_pclmul = yes]
 
 configure.ac:
 AC_REQUIRE([gl_CRC_SLICE_BY_8])
+AC_REQUIRE([gl_CRC_PCLMUL])
 gl_BUILD_CC
 AC_PROG_MKDIR_P
 
diff --git a/modules/crc-x86_64-pclmul b/modules/crc-x86_64-pclmul
new file mode 100644
index 0000000000..d8999b7879
--- /dev/null
+++ b/modules/crc-x86_64-pclmul
@@ -0,0 +1,26 @@
+Description:
+PCLMUL-based implementation of CRC32
+
+Files:
+lib/crc.h
+lib/crc-x86_64-pclmul.c
+m4/crc.m4
+
+Depends-on:
+stdint
+endian
+
+configure.ac:
+AC_REQUIRE([gl_CRC_PCLMUL])
+
+Makefile.am:
+lib_SOURCES += crc-x86_64-pclmul.c
+
+Include:
+"crc.h"
+
+License:
+LGPL
+
+Maintainer:
+Simon Josefsson
-- 
2.43.0

