From 5097d32a3c2f3af3b2fa34f370941b6b325fdd59 Mon Sep 17 00:00:00 2001
From: Sam Russell <sam.h.russell@gmail.com>
Date: Tue, 24 Dec 2024 16:55:27 +0100
Subject: [PATCH] cksum: Implement Chorba algorithm in PCLMUL
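
The Chorba method (https://arxiv.org/abs/2412.16398) carries eight extra
vectors per iteration forward by XORing them into later blocks of the
input, so those bytes are folded without carry-less multiplications and
fewer PCLMULQDQ operations are issued per byte.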

* src/cksum_pclmul.c: Incorporate Chorba algorithm.
* src/cksum_avx2.c: Incorporate Chorba algorithm.
* src/cksum_avx512.c: Incorporate Chorba algorithm.
---
 src/cksum_avx2.c   | 462 ++++++++++++++++++++++++++++++++++++++++++++-
 src/cksum_avx512.c | 452 +++++++++++++++++++++++++++++++++++++++++++-
 src/cksum_pclmul.c | 395 +++++++++++++++++++++++++++++++++++++-
 3 files changed, 1295 insertions(+), 14 deletions(-)
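
Reviewer note (not part of the patch): the fold constants quoted in the
new comments are x^N mod P (written 2^N in the comments) for the
non-reflected CRC-32 polynomial 0x104C11DB7 that cksum uses.  A minimal
standalone sketch, assuming the usual bit-at-a-time polynomial
reduction, to re-derive and cross-check them:

/* Print x^N mod P over GF(2) for P = 0x104C11DB7, for the exponents
   used by cksum_pclmul.c (width 128), cksum_avx2.c (width 256) and
   cksum_avx512.c (width 512) with fold distances 1, 4 and 12.  */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t
xpow_mod_p (unsigned int n)
{
  uint32_t r = 1;  /* x^0 */
  for (unsigned int i = 0; i < n; i++)
    /* Multiply by x; when an x^32 term appears, reduce it by P.  */
    r = (r & 0x80000000u) ? (r << 1) ^ 0x04C11DB7u : r << 1;
  return r;
}

int
main (void)
{
  static unsigned int const width[] = { 128, 256, 512 };
  static unsigned int const fold[] = { 1, 4, 12 };
  for (int w = 0; w < 3; w++)
    for (int f = 0; f < 3; f++)
      {
        unsigned int n = width[w] * fold[f];
        printf ("2^(%u*%u) mod P = 0x%08" PRIX32 "\n",
                width[w], fold[f], xpow_mod_p (n));
        printf ("2^(%u*%u+64) mod P = 0x%08" PRIX32 "\n",
                width[w], fold[f], xpow_mod_p (n + 64));
      }
  return 0;
}

Comparing its output against the comment blocks in the three files is a
quick way to check the single, four and twelve block fold constants.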

diff --git a/src/cksum_avx2.c b/src/cksum_avx2.c
index 252e01d6c..233a7ad1a 100644
--- a/src/cksum_avx2.c
+++ b/src/cksum_avx2.c
@@ -28,7 +28,7 @@
 extern uint_fast32_t const crctab[8][256];
 
 extern bool
-cksum_avx2 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out);
+cksum_avx2 (FILE * fp, uint_fast32_t * crc_out, uintmax_t * length_out);
 
 bool
 cksum_avx2 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out)
@@ -39,6 +39,7 @@ cksum_avx2 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out)
   size_t bytes_read;
   __m256i single_mult_constant;
   __m256i four_mult_constant;
+  __m256i twelve_mult_constant;
   __m256i shuffle_constant;
 
   if (!fp || !crc_out || !length_out)
@@ -46,11 +47,19 @@ cksum_avx2 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out)
 
   /* These constants and general algorithms are taken from the Intel whitepaper
      "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
-  */
+     2^(256) mod P = 0x75BE46B7
+     2^(256+64) mod P = 0x569700E5
+     2^(256*4) mod P = 0x567FDDEB
+     2^(256*4+64) mod P = 0x10BD4D7C
+     2^(256*12) mod P = 0x3CD4B4ED
+     2^(256*12+64) mod P = 0x1D97B060
+   */
   single_mult_constant = _mm256_set_epi64x (0x569700E5, 0x75BE46B7,
                                             0x569700E5, 0x75BE46B7);
   four_mult_constant = _mm256_set_epi64x (0x10BD4D7C, 0x567FDDEB,
-                                            0x10BD4D7C, 0x567FDDEB);
+                                          0x10BD4D7C, 0x567FDDEB);
+  twelve_mult_constant = _mm256_set_epi64x (0x1D97B060, 0x3CD4B4ED,
+                                            0x1D97B060, 0x3CD4B4ED);
 
   /* Constant to byteswap a full AVX2 register */
   shuffle_constant = _mm256_set_epi8 (0, 1, 2, 3, 4, 5, 6, 7, 8,
@@ -69,6 +78,14 @@ cksum_avx2 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out)
       __m256i data8;
       __m256i fold_data;
       __m256i xor_crc;
+      __m256i chorba1;
+      __m256i chorba2;
+      __m256i chorba3;
+      __m256i chorba4;
+      __m256i chorba5;
+      __m256i chorba6;
+      __m256i chorba7;
+      __m256i chorba8;
 
       __m256i *datap;
 
@@ -79,10 +96,10 @@ cksum_avx2 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out)
         }
       length += bytes_read;
 
-      datap = (__m256i *)buf;
+      datap = (__m256i *) buf;
 
       /* Fold in parallel 16x 16-byte blocks into 8x 16-byte blocks */
-      if (bytes_read >= 16 * 8 * 2)
+      if (bytes_read >= 32 * 2)
         {
           data = _mm256_loadu_si256 (datap);
           data = _mm256_shuffle_epi8 (data, shuffle_constant);
@@ -98,7 +115,436 @@ cksum_avx2 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out)
           data7 = _mm256_loadu_si256 (datap + 3);
           data7 = _mm256_shuffle_epi8 (data7, shuffle_constant);
 
-          while (bytes_read >= 16 * 8 * 2)
+          /* Chorba method: carry 8 vectors forward without pclmul.  */
+          while (bytes_read >= 1024 + 128 + 32 * 8)
+            {
+              datap += 4;
+              chorba1 = _mm256_loadu_si256 (datap);
+              chorba1 = _mm256_shuffle_epi8 (chorba1, shuffle_constant);
+              chorba2 = _mm256_loadu_si256 (datap + 1);
+              chorba2 = _mm256_shuffle_epi8 (chorba2, shuffle_constant);
+              chorba3 = _mm256_loadu_si256 (datap + 2);
+              chorba3 = _mm256_shuffle_epi8 (chorba3, shuffle_constant);
+              chorba4 = _mm256_loadu_si256 (datap + 3);
+              chorba4 = _mm256_shuffle_epi8 (chorba4, shuffle_constant);
+              chorba5 = _mm256_loadu_si256 (datap + 4);
+              chorba5 = _mm256_shuffle_epi8 (chorba5, shuffle_constant);
+              chorba6 = _mm256_loadu_si256 (datap + 5);
+              chorba6 = _mm256_shuffle_epi8 (chorba6, shuffle_constant);
+              chorba7 = _mm256_loadu_si256 (datap + 6);
+              chorba7 =
+                _mm256_shuffle_epi8 (chorba7, shuffle_constant) ^ chorba1;
+              chorba8 = _mm256_loadu_si256 (datap + 7);
+              chorba8 =
+                _mm256_shuffle_epi8 (chorba8, shuffle_constant) ^ chorba2;
+              bytes_read -= (32 * 8);
+              datap += 8;
+
+              data2 =
+                _mm256_clmulepi64_epi128 (data, twelve_mult_constant, 0x00);
+              data =
+                _mm256_clmulepi64_epi128 (data, twelve_mult_constant, 0x11);
+              data4 =
+                _mm256_clmulepi64_epi128 (data3, twelve_mult_constant, 0x00);
+              data3 =
+                _mm256_clmulepi64_epi128 (data3, twelve_mult_constant, 0x11);
+              data6 =
+                _mm256_clmulepi64_epi128 (data5, twelve_mult_constant, 0x00);
+              data5 =
+                _mm256_clmulepi64_epi128 (data5, twelve_mult_constant, 0x11);
+              data8 =
+                _mm256_clmulepi64_epi128 (data7, twelve_mult_constant, 0x00);
+              data7 =
+                _mm256_clmulepi64_epi128 (data7, twelve_mult_constant, 0x11);
+
+              data = _mm256_xor_si256 (data, data2);
+              data2 = _mm256_loadu_si256 (datap);
+              data2 = _mm256_shuffle_epi8 (data2, shuffle_constant) ^ chorba3;
+              data = _mm256_xor_si256 (data, data2);
+
+              data3 = _mm256_xor_si256 (data3, data4);
+              data4 = _mm256_loadu_si256 (datap + 1);
+              data4 =
+                _mm256_shuffle_epi8 (data4,
+                                     shuffle_constant) ^ chorba4 ^ chorba1;
+              data3 = _mm256_xor_si256 (data3, data4);
+
+              data5 = _mm256_xor_si256 (data5, data6);
+              data6 = _mm256_loadu_si256 (datap + 2);
+              data6 =
+                _mm256_shuffle_epi8 (data6,
+                                     shuffle_constant) ^ chorba5 ^ chorba2 ^
+                chorba1;
+              data5 = _mm256_xor_si256 (data5, data6);
+
+              data7 = _mm256_xor_si256 (data7, data8);
+              data8 = _mm256_loadu_si256 (datap + 3);
+              data8 =
+                _mm256_shuffle_epi8 (data8,
+                                     shuffle_constant) ^ chorba6 ^ chorba3 ^
+                chorba2;
+              data7 = _mm256_xor_si256 (data7, data8);
+
+              bytes_read -= (32 * 4);
+              datap += 4;
+
+              data2 =
+                _mm256_clmulepi64_epi128 (data, four_mult_constant, 0x00);
+              data =
+                _mm256_clmulepi64_epi128 (data, four_mult_constant, 0x11);
+              data4 =
+                _mm256_clmulepi64_epi128 (data3, four_mult_constant, 0x00);
+              data3 =
+                _mm256_clmulepi64_epi128 (data3, four_mult_constant, 0x11);
+              data6 =
+                _mm256_clmulepi64_epi128 (data5, four_mult_constant, 0x00);
+              data5 =
+                _mm256_clmulepi64_epi128 (data5, four_mult_constant, 0x11);
+              data8 =
+                _mm256_clmulepi64_epi128 (data7, four_mult_constant, 0x00);
+              data7 =
+                _mm256_clmulepi64_epi128 (data7, four_mult_constant, 0x11);
+
+              data = _mm256_xor_si256 (data, data2);
+              data2 = _mm256_loadu_si256 (datap);
+              data2 =
+                _mm256_shuffle_epi8 (data2,
+                                     shuffle_constant) ^ chorba7 ^ chorba4 ^
+                chorba3;
+              data = _mm256_xor_si256 (data, data2);
+
+              data3 = _mm256_xor_si256 (data3, data4);
+              data4 = _mm256_loadu_si256 (datap + 1);
+              data4 =
+                _mm256_shuffle_epi8 (data4,
+                                     shuffle_constant) ^ chorba8 ^ chorba5 ^
+                chorba4;
+              data3 = _mm256_xor_si256 (data3, data4);
+
+              data5 = _mm256_xor_si256 (data5, data6);
+              data6 = _mm256_loadu_si256 (datap + 2);
+              data6 =
+                _mm256_shuffle_epi8 (data6,
+                                     shuffle_constant) ^ chorba6 ^ chorba5;
+              data5 = _mm256_xor_si256 (data5, data6);
+
+              data7 = _mm256_xor_si256 (data7, data8);
+              data8 = _mm256_loadu_si256 (datap + 3);
+              data8 =
+                _mm256_shuffle_epi8 (data8,
+                                     shuffle_constant) ^ chorba7 ^ chorba6;
+              data7 = _mm256_xor_si256 (data7, data8);
+
+              bytes_read -= (32 * 4);
+
+              datap += 4;
+
+              data2 =
+                _mm256_clmulepi64_epi128 (data, four_mult_constant, 0x00);
+              data =
+                _mm256_clmulepi64_epi128 (data, four_mult_constant, 0x11);
+              data4 =
+                _mm256_clmulepi64_epi128 (data3, four_mult_constant, 0x00);
+              data3 =
+                _mm256_clmulepi64_epi128 (data3, four_mult_constant, 0x11);
+              data6 =
+                _mm256_clmulepi64_epi128 (data5, four_mult_constant, 0x00);
+              data5 =
+                _mm256_clmulepi64_epi128 (data5, four_mult_constant, 0x11);
+              data8 =
+                _mm256_clmulepi64_epi128 (data7, four_mult_constant, 0x00);
+              data7 =
+                _mm256_clmulepi64_epi128 (data7, four_mult_constant, 0x11);
+
+              data = _mm256_xor_si256 (data, data2);
+              data2 = _mm256_loadu_si256 (datap);
+              data2 =
+                _mm256_shuffle_epi8 (data2,
+                                     shuffle_constant) ^ chorba8 ^ chorba7 ^
+                chorba1;
+              data = _mm256_xor_si256 (data, data2);
+
+              data3 = _mm256_xor_si256 (data3, data4);
+              data4 = _mm256_loadu_si256 (datap + 1);
+              data4 =
+                _mm256_shuffle_epi8 (data4,
+                                     shuffle_constant) ^ chorba8 ^ chorba2;
+              data3 = _mm256_xor_si256 (data3, data4);
+
+              data5 = _mm256_xor_si256 (data5, data6);
+              data6 = _mm256_loadu_si256 (datap + 2);
+              data6 = _mm256_shuffle_epi8 (data6, shuffle_constant) ^ chorba3;
+              data5 = _mm256_xor_si256 (data5, data6);
+
+              data7 = _mm256_xor_si256 (data7, data8);
+              data8 = _mm256_loadu_si256 (datap + 3);
+              data8 = _mm256_shuffle_epi8 (data8, shuffle_constant) ^ chorba4;
+              data7 = _mm256_xor_si256 (data7, data8);
+
+              bytes_read -= (32 * 4);
+
+              datap += 4;
+
+              data2 =
+                _mm256_clmulepi64_epi128 (data, four_mult_constant, 0x00);
+              data =
+                _mm256_clmulepi64_epi128 (data, four_mult_constant, 0x11);
+              data4 =
+                _mm256_clmulepi64_epi128 (data3, four_mult_constant, 0x00);
+              data3 =
+                _mm256_clmulepi64_epi128 (data3, four_mult_constant, 0x11);
+              data6 =
+                _mm256_clmulepi64_epi128 (data5, four_mult_constant, 0x00);
+              data5 =
+                _mm256_clmulepi64_epi128 (data5, four_mult_constant, 0x11);
+              data8 =
+                _mm256_clmulepi64_epi128 (data7, four_mult_constant, 0x00);
+              data7 =
+                _mm256_clmulepi64_epi128 (data7, four_mult_constant, 0x11);
+
+              data = _mm256_xor_si256 (data, data2);
+              data2 = _mm256_loadu_si256 (datap);
+              data2 =
+                _mm256_shuffle_epi8 (data2,
+                                     shuffle_constant) ^ chorba5 ^ chorba1;
+              data = _mm256_xor_si256 (data, data2);
+
+              data3 = _mm256_xor_si256 (data3, data4);
+              data4 = _mm256_loadu_si256 (datap + 1);
+              data4 =
+                _mm256_shuffle_epi8 (data4,
+                                     shuffle_constant) ^ chorba6 ^ chorba2 ^
+                chorba1;
+              data3 = _mm256_xor_si256 (data3, data4);
+
+              data5 = _mm256_xor_si256 (data5, data6);
+              data6 = _mm256_loadu_si256 (datap + 2);
+              data6 =
+                _mm256_shuffle_epi8 (data6,
+                                     shuffle_constant) ^ chorba7 ^ chorba3 ^
+                chorba2 ^ chorba1;
+              data5 = _mm256_xor_si256 (data5, data6);
+
+              data7 = _mm256_xor_si256 (data7, data8);
+              data8 = _mm256_loadu_si256 (datap + 3);
+              data8 =
+                _mm256_shuffle_epi8 (data8,
+                                     shuffle_constant) ^ chorba8 ^ chorba4 ^
+                chorba3 ^ chorba2;
+              data7 = _mm256_xor_si256 (data7, data8);
+
+              bytes_read -= (32 * 4);
+
+              datap += 4;
+
+              data2 =
+                _mm256_clmulepi64_epi128 (data, four_mult_constant, 0x00);
+              data =
+                _mm256_clmulepi64_epi128 (data, four_mult_constant, 0x11);
+              data4 =
+                _mm256_clmulepi64_epi128 (data3, four_mult_constant, 0x00);
+              data3 =
+                _mm256_clmulepi64_epi128 (data3, four_mult_constant, 0x11);
+              data6 =
+                _mm256_clmulepi64_epi128 (data5, four_mult_constant, 0x00);
+              data5 =
+                _mm256_clmulepi64_epi128 (data5, four_mult_constant, 0x11);
+              data8 =
+                _mm256_clmulepi64_epi128 (data7, four_mult_constant, 0x00);
+              data7 =
+                _mm256_clmulepi64_epi128 (data7, four_mult_constant, 0x11);
+
+              data = _mm256_xor_si256 (data, data2);
+              data2 = _mm256_loadu_si256 (datap);
+              data2 =
+                _mm256_shuffle_epi8 (data2,
+                                     shuffle_constant) ^ chorba5 ^ chorba4 ^
+                chorba3 ^ chorba1;
+              data = _mm256_xor_si256 (data, data2);
+
+              data3 = _mm256_xor_si256 (data3, data4);
+              data4 = _mm256_loadu_si256 (datap + 1);
+              data4 =
+                _mm256_shuffle_epi8 (data4,
+                                     shuffle_constant) ^ chorba6 ^ chorba5 ^
+                chorba4 ^ chorba2 ^ chorba1;
+              data3 = _mm256_xor_si256 (data3, data4);
+
+              data5 = _mm256_xor_si256 (data5, data6);
+              data6 = _mm256_loadu_si256 (datap + 2);
+              data6 =
+                _mm256_shuffle_epi8 (data6,
+                                     shuffle_constant) ^ chorba7 ^ chorba6 ^
+                chorba5 ^ chorba3 ^ chorba2;
+              data5 = _mm256_xor_si256 (data5, data6);
+
+              data7 = _mm256_xor_si256 (data7, data8);
+              data8 = _mm256_loadu_si256 (datap + 3);
+              data8 =
+                _mm256_shuffle_epi8 (data8,
+                                     shuffle_constant) ^ chorba8 ^ chorba7 ^
+                chorba6 ^ chorba4 ^ chorba3 ^ chorba1;
+              data7 = _mm256_xor_si256 (data7, data8);
+
+              bytes_read -= (32 * 4);
+
+              datap += 4;
+
+              data2 =
+                _mm256_clmulepi64_epi128 (data, four_mult_constant, 0x00);
+              data =
+                _mm256_clmulepi64_epi128 (data, four_mult_constant, 0x11);
+              data4 =
+                _mm256_clmulepi64_epi128 (data3, four_mult_constant, 0x00);
+              data3 =
+                _mm256_clmulepi64_epi128 (data3, four_mult_constant, 0x11);
+              data6 =
+                _mm256_clmulepi64_epi128 (data5, four_mult_constant, 0x00);
+              data5 =
+                _mm256_clmulepi64_epi128 (data5, four_mult_constant, 0x11);
+              data8 =
+                _mm256_clmulepi64_epi128 (data7, four_mult_constant, 0x00);
+              data7 =
+                _mm256_clmulepi64_epi128 (data7, four_mult_constant, 0x11);
+
+              data = _mm256_xor_si256 (data, data2);
+              data2 = _mm256_loadu_si256 (datap);
+              data2 =
+                _mm256_shuffle_epi8 (data2,
+                                     shuffle_constant) ^ chorba8 ^ chorba7 ^
+                chorba5 ^ chorba4 ^ chorba2 ^ chorba1;
+              data = _mm256_xor_si256 (data, data2);
+
+              data3 = _mm256_xor_si256 (data3, data4);
+              data4 = _mm256_loadu_si256 (datap + 1);
+              data4 =
+                _mm256_shuffle_epi8 (data4,
+                                     shuffle_constant) ^ chorba8 ^ chorba6 ^
+                chorba5 ^ chorba3 ^ chorba2;
+              data3 = _mm256_xor_si256 (data3, data4);
+
+              data5 = _mm256_xor_si256 (data5, data6);
+              data6 = _mm256_loadu_si256 (datap + 2);
+              data6 =
+                _mm256_shuffle_epi8 (data6,
+                                     shuffle_constant) ^ chorba7 ^ chorba6 ^
+                chorba4 ^ chorba3 ^ chorba1;
+              data5 = _mm256_xor_si256 (data5, data6);
+
+              data7 = _mm256_xor_si256 (data7, data8);
+              data8 = _mm256_loadu_si256 (datap + 3);
+              data8 =
+                _mm256_shuffle_epi8 (data8,
+                                     shuffle_constant) ^ chorba8 ^ chorba7 ^
+                chorba5 ^ chorba4 ^ chorba2 ^ chorba1;
+              data7 = _mm256_xor_si256 (data7, data8);
+
+              bytes_read -= (32 * 4);
+
+              datap += 4;
+
+              data2 =
+                _mm256_clmulepi64_epi128 (data, four_mult_constant, 0x00);
+              data =
+                _mm256_clmulepi64_epi128 (data, four_mult_constant, 0x11);
+              data4 =
+                _mm256_clmulepi64_epi128 (data3, four_mult_constant, 0x00);
+              data3 =
+                _mm256_clmulepi64_epi128 (data3, four_mult_constant, 0x11);
+              data6 =
+                _mm256_clmulepi64_epi128 (data5, four_mult_constant, 0x00);
+              data5 =
+                _mm256_clmulepi64_epi128 (data5, four_mult_constant, 0x11);
+              data8 =
+                _mm256_clmulepi64_epi128 (data7, four_mult_constant, 0x00);
+              data7 =
+                _mm256_clmulepi64_epi128 (data7, four_mult_constant, 0x11);
+
+              data = _mm256_xor_si256 (data, data2);
+              data2 = _mm256_loadu_si256 (datap);
+              data2 =
+                _mm256_shuffle_epi8 (data2,
+                                     shuffle_constant) ^ chorba8 ^ chorba6 ^
+                chorba5 ^ chorba3 ^ chorba2 ^ chorba1;
+              data = _mm256_xor_si256 (data, data2);
+
+              data3 = _mm256_xor_si256 (data3, data4);
+              data4 = _mm256_loadu_si256 (datap + 1);
+              data4 =
+                _mm256_shuffle_epi8 (data4,
+                                     shuffle_constant) ^ chorba7 ^ chorba6 ^
+                chorba4 ^ chorba3 ^ chorba2;
+              data3 = _mm256_xor_si256 (data3, data4);
+
+              data5 = _mm256_xor_si256 (data5, data6);
+              data6 = _mm256_loadu_si256 (datap + 2);
+              data6 =
+                _mm256_shuffle_epi8 (data6,
+                                     shuffle_constant) ^ chorba8 ^ chorba7 ^
+                chorba5 ^ chorba4 ^ chorba3;
+              data5 = _mm256_xor_si256 (data5, data6);
+
+              data7 = _mm256_xor_si256 (data7, data8);
+              data8 = _mm256_loadu_si256 (datap + 3);
+              data8 =
+                _mm256_shuffle_epi8 (data8,
+                                     shuffle_constant) ^ chorba8 ^ chorba6 ^
+                chorba5 ^ chorba4;
+              data7 = _mm256_xor_si256 (data7, data8);
+
+              bytes_read -= (32 * 4);
+
+              datap += 4;
+
+              data2 =
+                _mm256_clmulepi64_epi128 (data, four_mult_constant, 0x00);
+              data =
+                _mm256_clmulepi64_epi128 (data, four_mult_constant, 0x11);
+              data4 =
+                _mm256_clmulepi64_epi128 (data3, four_mult_constant, 0x00);
+              data3 =
+                _mm256_clmulepi64_epi128 (data3, four_mult_constant, 0x11);
+              data6 =
+                _mm256_clmulepi64_epi128 (data5, four_mult_constant, 0x00);
+              data5 =
+                _mm256_clmulepi64_epi128 (data5, four_mult_constant, 0x11);
+              data8 =
+                _mm256_clmulepi64_epi128 (data7, four_mult_constant, 0x00);
+              data7 =
+                _mm256_clmulepi64_epi128 (data7, four_mult_constant, 0x11);
+
+              data = _mm256_xor_si256 (data, data2);
+              data2 = _mm256_loadu_si256 (datap);
+              data2 =
+                _mm256_shuffle_epi8 (data2,
+                                     shuffle_constant) ^ chorba7 ^ chorba6 ^
+                chorba5;
+              data = _mm256_xor_si256 (data, data2);
+
+              data3 = _mm256_xor_si256 (data3, data4);
+              data4 = _mm256_loadu_si256 (datap + 1);
+              data4 =
+                _mm256_shuffle_epi8 (data4,
+                                     shuffle_constant) ^ chorba8 ^ chorba7 ^
+                chorba6;
+              data3 = _mm256_xor_si256 (data3, data4);
+
+              data5 = _mm256_xor_si256 (data5, data6);
+              data6 = _mm256_loadu_si256 (datap + 2);
+              data6 =
+                _mm256_shuffle_epi8 (data6,
+                                     shuffle_constant) ^ chorba8 ^ chorba7;
+              data5 = _mm256_xor_si256 (data5, data6);
+
+              data7 = _mm256_xor_si256 (data7, data8);
+              data8 = _mm256_loadu_si256 (datap + 3);
+              data8 = _mm256_shuffle_epi8 (data8, shuffle_constant) ^ chorba8;
+              data7 = _mm256_xor_si256 (data7, data8);
+
+              bytes_read -= (32 * 4);
+            }
+
+          while (bytes_read >= 32 * 8)
             {
               datap += 4;
 
@@ -145,7 +591,7 @@ cksum_avx2 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out)
               data8 = _mm256_shuffle_epi8 (data8, shuffle_constant);
               data7 = _mm256_xor_si256 (data7, data8);
 
-              bytes_read -= (16 * 4 * 2);
+              bytes_read -= (32 * 4);
             }
           /* At end of loop we write out results from variables back into
              the buffer, for use in single fold loop */
@@ -186,7 +632,7 @@ cksum_avx2 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out)
         }
 
       /* And finish up last 0-63 bytes in a byte by byte fashion */
-      unsigned char *cp = (unsigned char *)datap;
+      unsigned char *cp = (unsigned char *) datap;
       while (bytes_read--)
         crc = (crc << 8) ^ crctab[0][((crc >> 24) ^ *cp++) & 0xFF];
       if (feof (fp))
diff --git a/src/cksum_avx512.c b/src/cksum_avx512.c
index 5f8ff2375..f3bd05bb1 100644
--- a/src/cksum_avx512.c
+++ b/src/cksum_avx512.c
@@ -39,6 +39,7 @@ cksum_avx512 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out)
   size_t bytes_read;
   __m512i single_mult_constant;
   __m512i four_mult_constant;
+  __m512i twelve_mult_constant;
   __m512i shuffle_constant;
 
   if (!fp || !crc_out || !length_out)
@@ -46,6 +47,12 @@ cksum_avx512 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out)
 
   /* These constants and general algorithms are taken from the Intel whitepaper
      "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+     2^(512) mod P = 0xE6228B11
+     2^(512+64) mod P = 0x8833794C
+     2^(512*4) mod P = 0x88FE2237
+     2^(512*4+64) mod P = 0xCBCF3BCB
+     2^(512*12) mod P = 0x413686A0
+     2^(512*12+64) mod P = 0x9DEF026A
   */
   single_mult_constant = _mm512_set_epi64 (0x8833794C, 0xE6228B11,
                                            0x8833794C, 0xE6228B11,
@@ -55,6 +62,10 @@ cksum_avx512 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out)
                                          0xCBCF3BCB, 0x88FE2237,
                                          0xCBCF3BCB, 0x88FE2237,
                                          0xCBCF3BCB, 0x88FE2237);
+  twelve_mult_constant = _mm512_set_epi64 (0x9DEF026A, 0x413686A0,
+                                           0x9DEF026A, 0x413686A0,
+                                           0x9DEF026A, 0x413686A0,
+                                           0x9DEF026A, 0x413686A0);
 
   /* Constant to byteswap a full AVX512 register */
   shuffle_constant = _mm512_set_epi8 (0, 1, 2, 3, 4, 5, 6, 7, 8,
@@ -77,6 +88,14 @@ cksum_avx512 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out)
       __m512i data8;
       __m512i fold_data;
       __m512i xor_crc;
+      __m512i chorba1;
+      __m512i chorba2;
+      __m512i chorba3;
+      __m512i chorba4;
+      __m512i chorba5;
+      __m512i chorba6;
+      __m512i chorba7;
+      __m512i chorba8;
 
       __m512i *datap;
 
@@ -107,7 +126,436 @@ cksum_avx512 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out)
           data7 = _mm512_loadu_si512 (datap + 3);
           data7 = _mm512_shuffle_epi8 (data7, shuffle_constant);
 
-          while (bytes_read >= 16 * 8 * 4)
+          /* Chorba method: carry 8 vectors forward without pclmul.  */
+          while (bytes_read >= 2048 + 256 + 64 * 8)
+            {
+              datap += 4;
+              chorba1 = _mm512_loadu_si512 (datap);
+              chorba1 = _mm512_shuffle_epi8 (chorba1, shuffle_constant);
+              chorba2 = _mm512_loadu_si512 (datap + 1);
+              chorba2 = _mm512_shuffle_epi8 (chorba2, shuffle_constant);
+              chorba3 = _mm512_loadu_si512 (datap + 2);
+              chorba3 = _mm512_shuffle_epi8 (chorba3, shuffle_constant);
+              chorba4 = _mm512_loadu_si512 (datap + 3);
+              chorba4 = _mm512_shuffle_epi8 (chorba4, shuffle_constant);
+              chorba5 = _mm512_loadu_si512 (datap + 4);
+              chorba5 = _mm512_shuffle_epi8 (chorba5, shuffle_constant);
+              chorba6 = _mm512_loadu_si512 (datap + 5);
+              chorba6 = _mm512_shuffle_epi8 (chorba6, shuffle_constant);
+              chorba7 = _mm512_loadu_si512 (datap + 6);
+              chorba7 =
+                _mm512_shuffle_epi8 (chorba7, shuffle_constant) ^ chorba1;
+              chorba8 = _mm512_loadu_si512 (datap + 7);
+              chorba8 =
+                _mm512_shuffle_epi8 (chorba8, shuffle_constant) ^ chorba2;
+              bytes_read -= (64 * 8);
+              datap += 8;
+
+              data2 =
+                _mm512_clmulepi64_epi128 (data, twelve_mult_constant, 0x00);
+              data =
+                _mm512_clmulepi64_epi128 (data, twelve_mult_constant, 0x11);
+              data4 =
+                _mm512_clmulepi64_epi128 (data3, twelve_mult_constant, 0x00);
+              data3 =
+                _mm512_clmulepi64_epi128 (data3, twelve_mult_constant, 0x11);
+              data6 =
+                _mm512_clmulepi64_epi128 (data5, twelve_mult_constant, 0x00);
+              data5 =
+                _mm512_clmulepi64_epi128 (data5, twelve_mult_constant, 0x11);
+              data8 =
+                _mm512_clmulepi64_epi128 (data7, twelve_mult_constant, 0x00);
+              data7 =
+                _mm512_clmulepi64_epi128 (data7, twelve_mult_constant, 0x11);
+
+              data = _mm512_xor_si512 (data, data2);
+              data2 = _mm512_loadu_si512 (datap);
+              data2 = _mm512_shuffle_epi8 (data2, shuffle_constant) ^ chorba3;
+              data = _mm512_xor_si512 (data, data2);
+
+              data3 = _mm512_xor_si512 (data3, data4);
+              data4 = _mm512_loadu_si512 (datap + 1);
+              data4 =
+                _mm512_shuffle_epi8 (data4,
+                                     shuffle_constant) ^ chorba4 ^ chorba1;
+              data3 = _mm512_xor_si512 (data3, data4);
+
+              data5 = _mm512_xor_si512 (data5, data6);
+              data6 = _mm512_loadu_si512 (datap + 2);
+              data6 =
+                _mm512_shuffle_epi8 (data6,
+                                     shuffle_constant) ^ chorba5 ^ chorba2 ^
+                chorba1;
+              data5 = _mm512_xor_si512 (data5, data6);
+
+              data7 = _mm512_xor_si512 (data7, data8);
+              data8 = _mm512_loadu_si512 (datap + 3);
+              data8 =
+                _mm512_shuffle_epi8 (data8,
+                                     shuffle_constant) ^ chorba6 ^ chorba3 ^
+                chorba2;
+              data7 = _mm512_xor_si512 (data7, data8);
+
+              bytes_read -= (64 * 4);
+              datap += 4;
+
+              data2 =
+                _mm512_clmulepi64_epi128 (data, four_mult_constant, 0x00);
+              data =
+                _mm512_clmulepi64_epi128 (data, four_mult_constant, 0x11);
+              data4 =
+                _mm512_clmulepi64_epi128 (data3, four_mult_constant, 0x00);
+              data3 =
+                _mm512_clmulepi64_epi128 (data3, four_mult_constant, 0x11);
+              data6 =
+                _mm512_clmulepi64_epi128 (data5, four_mult_constant, 0x00);
+              data5 =
+                _mm512_clmulepi64_epi128 (data5, four_mult_constant, 0x11);
+              data8 =
+                _mm512_clmulepi64_epi128 (data7, four_mult_constant, 0x00);
+              data7 =
+                _mm512_clmulepi64_epi128 (data7, four_mult_constant, 0x11);
+
+              data = _mm512_xor_si512 (data, data2);
+              data2 = _mm512_loadu_si512 (datap);
+              data2 =
+                _mm512_shuffle_epi8 (data2,
+                                     shuffle_constant) ^ chorba7 ^ chorba4 ^
+                chorba3;
+              data = _mm512_xor_si512 (data, data2);
+
+              data3 = _mm512_xor_si512 (data3, data4);
+              data4 = _mm512_loadu_si512 (datap + 1);
+              data4 =
+                _mm512_shuffle_epi8 (data4,
+                                     shuffle_constant) ^ chorba8 ^ chorba5 ^
+                chorba4;
+              data3 = _mm512_xor_si512 (data3, data4);
+
+              data5 = _mm512_xor_si512 (data5, data6);
+              data6 = _mm512_loadu_si512 (datap + 2);
+              data6 =
+                _mm512_shuffle_epi8 (data6,
+                                     shuffle_constant) ^ chorba6 ^ chorba5;
+              data5 = _mm512_xor_si512 (data5, data6);
+
+              data7 = _mm512_xor_si512 (data7, data8);
+              data8 = _mm512_loadu_si512 (datap + 3);
+              data8 =
+                _mm512_shuffle_epi8 (data8,
+                                     shuffle_constant) ^ chorba7 ^ chorba6;
+              data7 = _mm512_xor_si512 (data7, data8);
+
+              bytes_read -= (64 * 4);
+
+              datap += 4;
+
+              data2 =
+                _mm512_clmulepi64_epi128 (data, four_mult_constant, 0x00);
+              data =
+                _mm512_clmulepi64_epi128 (data, four_mult_constant, 0x11);
+              data4 =
+                _mm512_clmulepi64_epi128 (data3, four_mult_constant, 0x00);
+              data3 =
+                _mm512_clmulepi64_epi128 (data3, four_mult_constant, 0x11);
+              data6 =
+                _mm512_clmulepi64_epi128 (data5, four_mult_constant, 0x00);
+              data5 =
+                _mm512_clmulepi64_epi128 (data5, four_mult_constant, 0x11);
+              data8 =
+                _mm512_clmulepi64_epi128 (data7, four_mult_constant, 0x00);
+              data7 =
+                _mm512_clmulepi64_epi128 (data7, four_mult_constant, 0x11);
+
+              data = _mm512_xor_si512 (data, data2);
+              data2 = _mm512_loadu_si512 (datap);
+              data2 =
+                _mm512_shuffle_epi8 (data2,
+                                     shuffle_constant) ^ chorba8 ^ chorba7 ^
+                chorba1;
+              data = _mm512_xor_si512 (data, data2);
+
+              data3 = _mm512_xor_si512 (data3, data4);
+              data4 = _mm512_loadu_si512 (datap + 1);
+              data4 =
+                _mm512_shuffle_epi8 (data4,
+                                     shuffle_constant) ^ chorba8 ^ chorba2;
+              data3 = _mm512_xor_si512 (data3, data4);
+
+              data5 = _mm512_xor_si512 (data5, data6);
+              data6 = _mm512_loadu_si512 (datap + 2);
+              data6 = _mm512_shuffle_epi8 (data6, shuffle_constant) ^ chorba3;
+              data5 = _mm512_xor_si512 (data5, data6);
+
+              data7 = _mm512_xor_si512 (data7, data8);
+              data8 = _mm512_loadu_si512 (datap + 3);
+              data8 = _mm512_shuffle_epi8 (data8, shuffle_constant) ^ chorba4;
+              data7 = _mm512_xor_si512 (data7, data8);
+
+              bytes_read -= (64 * 4);
+
+              datap += 4;
+
+              data2 =
+                _mm512_clmulepi64_epi128 (data, four_mult_constant, 0x00);
+              data =
+                _mm512_clmulepi64_epi128 (data, four_mult_constant, 0x11);
+              data4 =
+                _mm512_clmulepi64_epi128 (data3, four_mult_constant, 0x00);
+              data3 =
+                _mm512_clmulepi64_epi128 (data3, four_mult_constant, 0x11);
+              data6 =
+                _mm512_clmulepi64_epi128 (data5, four_mult_constant, 0x00);
+              data5 =
+                _mm512_clmulepi64_epi128 (data5, four_mult_constant, 0x11);
+              data8 =
+                _mm512_clmulepi64_epi128 (data7, four_mult_constant, 0x00);
+              data7 =
+                _mm512_clmulepi64_epi128 (data7, four_mult_constant, 0x11);
+
+              data = _mm512_xor_si512 (data, data2);
+              data2 = _mm512_loadu_si512 (datap);
+              data2 =
+                _mm512_shuffle_epi8 (data2,
+                                     shuffle_constant) ^ chorba5 ^ chorba1;
+              data = _mm512_xor_si512 (data, data2);
+
+              data3 = _mm512_xor_si512 (data3, data4);
+              data4 = _mm512_loadu_si512 (datap + 1);
+              data4 =
+                _mm512_shuffle_epi8 (data4,
+                                     shuffle_constant) ^ chorba6 ^ chorba2 ^
+                chorba1;
+              data3 = _mm512_xor_si512 (data3, data4);
+
+              data5 = _mm512_xor_si512 (data5, data6);
+              data6 = _mm512_loadu_si512 (datap + 2);
+              data6 =
+                _mm512_shuffle_epi8 (data6,
+                                     shuffle_constant) ^ chorba7 ^ chorba3 ^
+                chorba2 ^ chorba1;
+              data5 = _mm512_xor_si512 (data5, data6);
+
+              data7 = _mm512_xor_si512 (data7, data8);
+              data8 = _mm512_loadu_si512 (datap + 3);
+              data8 =
+                _mm512_shuffle_epi8 (data8,
+                                     shuffle_constant) ^ chorba8 ^ chorba4 ^
+                chorba3 ^ chorba2;
+              data7 = _mm512_xor_si512 (data7, data8);
+
+              bytes_read -= (64 * 4);
+
+              datap += 4;
+
+              data2 =
+                _mm512_clmulepi64_epi128 (data, four_mult_constant, 0x00);
+              data =
+                _mm512_clmulepi64_epi128 (data, four_mult_constant, 0x11);
+              data4 =
+                _mm512_clmulepi64_epi128 (data3, four_mult_constant, 0x00);
+              data3 =
+                _mm512_clmulepi64_epi128 (data3, four_mult_constant, 0x11);
+              data6 =
+                _mm512_clmulepi64_epi128 (data5, four_mult_constant, 0x00);
+              data5 =
+                _mm512_clmulepi64_epi128 (data5, four_mult_constant, 0x11);
+              data8 =
+                _mm512_clmulepi64_epi128 (data7, four_mult_constant, 0x00);
+              data7 =
+                _mm512_clmulepi64_epi128 (data7, four_mult_constant, 0x11);
+
+              data = _mm512_xor_si512 (data, data2);
+              data2 = _mm512_loadu_si512 (datap);
+              data2 =
+                _mm512_shuffle_epi8 (data2,
+                                     shuffle_constant) ^ chorba5 ^ chorba4 ^
+                chorba3 ^ chorba1;
+              data = _mm512_xor_si512 (data, data2);
+
+              data3 = _mm512_xor_si512 (data3, data4);
+              data4 = _mm512_loadu_si512 (datap + 1);
+              data4 =
+                _mm512_shuffle_epi8 (data4,
+                                     shuffle_constant) ^ chorba6 ^ chorba5 ^
+                chorba4 ^ chorba2 ^ chorba1;
+              data3 = _mm512_xor_si512 (data3, data4);
+
+              data5 = _mm512_xor_si512 (data5, data6);
+              data6 = _mm512_loadu_si512 (datap + 2);
+              data6 =
+                _mm512_shuffle_epi8 (data6,
+                                     shuffle_constant) ^ chorba7 ^ chorba6 ^
+                chorba5 ^ chorba3 ^ chorba2;
+              data5 = _mm512_xor_si512 (data5, data6);
+
+              data7 = _mm512_xor_si512 (data7, data8);
+              data8 = _mm512_loadu_si512 (datap + 3);
+              data8 =
+                _mm512_shuffle_epi8 (data8,
+                                     shuffle_constant) ^ chorba8 ^ chorba7 ^
+                chorba6 ^ chorba4 ^ chorba3 ^ chorba1;
+              data7 = _mm512_xor_si512 (data7, data8);
+
+              bytes_read -= (64 * 4);
+
+              datap += 4;
+
+              data2 =
+                _mm512_clmulepi64_epi128 (data, four_mult_constant, 0x00);
+              data =
+                _mm512_clmulepi64_epi128 (data, four_mult_constant, 0x11);
+              data4 =
+                _mm512_clmulepi64_epi128 (data3, four_mult_constant, 0x00);
+              data3 =
+                _mm512_clmulepi64_epi128 (data3, four_mult_constant, 0x11);
+              data6 =
+                _mm512_clmulepi64_epi128 (data5, four_mult_constant, 0x00);
+              data5 =
+                _mm512_clmulepi64_epi128 (data5, four_mult_constant, 0x11);
+              data8 =
+                _mm512_clmulepi64_epi128 (data7, four_mult_constant, 0x00);
+              data7 =
+                _mm512_clmulepi64_epi128 (data7, four_mult_constant, 0x11);
+
+              data = _mm512_xor_si512 (data, data2);
+              data2 = _mm512_loadu_si512 (datap);
+              data2 =
+                _mm512_shuffle_epi8 (data2,
+                                     shuffle_constant) ^ chorba8 ^ chorba7 ^
+                chorba5 ^ chorba4 ^ chorba2 ^ chorba1;
+              data = _mm512_xor_si512 (data, data2);
+
+              data3 = _mm512_xor_si512 (data3, data4);
+              data4 = _mm512_loadu_si512 (datap + 1);
+              data4 =
+                _mm512_shuffle_epi8 (data4,
+                                     shuffle_constant) ^ chorba8 ^ chorba6 ^
+                chorba5 ^ chorba3 ^ chorba2;
+              data3 = _mm512_xor_si512 (data3, data4);
+
+              data5 = _mm512_xor_si512 (data5, data6);
+              data6 = _mm512_loadu_si512 (datap + 2);
+              data6 =
+                _mm512_shuffle_epi8 (data6,
+                                     shuffle_constant) ^ chorba7 ^ chorba6 ^
+                chorba4 ^ chorba3 ^ chorba1;
+              data5 = _mm512_xor_si512 (data5, data6);
+
+              data7 = _mm512_xor_si512 (data7, data8);
+              data8 = _mm512_loadu_si512 (datap + 3);
+              data8 =
+                _mm512_shuffle_epi8 (data8,
+                                     shuffle_constant) ^ chorba8 ^ chorba7 ^
+                chorba5 ^ chorba4 ^ chorba2 ^ chorba1;
+              data7 = _mm512_xor_si512 (data7, data8);
+
+              bytes_read -= (64 * 4);
+
+              datap += 4;
+
+              data2 =
+                _mm512_clmulepi64_epi128 (data, four_mult_constant, 0x00);
+              data =
+                _mm512_clmulepi64_epi128 (data, four_mult_constant, 0x11);
+              data4 =
+                _mm512_clmulepi64_epi128 (data3, four_mult_constant, 0x00);
+              data3 =
+                _mm512_clmulepi64_epi128 (data3, four_mult_constant, 0x11);
+              data6 =
+                _mm512_clmulepi64_epi128 (data5, four_mult_constant, 0x00);
+              data5 =
+                _mm512_clmulepi64_epi128 (data5, four_mult_constant, 0x11);
+              data8 =
+                _mm512_clmulepi64_epi128 (data7, four_mult_constant, 0x00);
+              data7 =
+                _mm512_clmulepi64_epi128 (data7, four_mult_constant, 0x11);
+
+              data = _mm512_xor_si512 (data, data2);
+              data2 = _mm512_loadu_si512 (datap);
+              data2 =
+                _mm512_shuffle_epi8 (data2,
+                                     shuffle_constant) ^ chorba8 ^ chorba6 ^
+                chorba5 ^ chorba3 ^ chorba2 ^ chorba1;
+              data = _mm512_xor_si512 (data, data2);
+
+              data3 = _mm512_xor_si512 (data3, data4);
+              data4 = _mm512_loadu_si512 (datap + 1);
+              data4 =
+                _mm512_shuffle_epi8 (data4,
+                                     shuffle_constant) ^ chorba7 ^ chorba6 ^
+                chorba4 ^ chorba3 ^ chorba2;
+              data3 = _mm512_xor_si512 (data3, data4);
+
+              data5 = _mm512_xor_si512 (data5, data6);
+              data6 = _mm512_loadu_si512 (datap + 2);
+              data6 =
+                _mm512_shuffle_epi8 (data6,
+                                     shuffle_constant) ^ chorba8 ^ chorba7 ^
+                chorba5 ^ chorba4 ^ chorba3;
+              data5 = _mm512_xor_si512 (data5, data6);
+
+              data7 = _mm512_xor_si512 (data7, data8);
+              data8 = _mm512_loadu_si512 (datap + 3);
+              data8 =
+                _mm512_shuffle_epi8 (data8,
+                                     shuffle_constant) ^ chorba8 ^ chorba6 ^
+                chorba5 ^ chorba4;
+              data7 = _mm512_xor_si512 (data7, data8);
+
+              bytes_read -= (64 * 4);
+
+              datap += 4;
+
+              data2 =
+                _mm512_clmulepi64_epi128 (data, four_mult_constant, 0x00);
+              data =
+                _mm512_clmulepi64_epi128 (data, four_mult_constant, 0x11);
+              data4 =
+                _mm512_clmulepi64_epi128 (data3, four_mult_constant, 0x00);
+              data3 =
+                _mm512_clmulepi64_epi128 (data3, four_mult_constant, 0x11);
+              data6 =
+                _mm512_clmulepi64_epi128 (data5, four_mult_constant, 0x00);
+              data5 =
+                _mm512_clmulepi64_epi128 (data5, four_mult_constant, 0x11);
+              data8 =
+                _mm512_clmulepi64_epi128 (data7, four_mult_constant, 0x00);
+              data7 =
+                _mm512_clmulepi64_epi128 (data7, four_mult_constant, 0x11);
+
+              data = _mm512_xor_si512 (data, data2);
+              data2 = _mm512_loadu_si512 (datap);
+              data2 =
+                _mm512_shuffle_epi8 (data2,
+                                     shuffle_constant) ^ chorba7 ^ chorba6 ^
+                chorba5;
+              data = _mm512_xor_si512 (data, data2);
+
+              data3 = _mm512_xor_si512 (data3, data4);
+              data4 = _mm512_loadu_si512 (datap + 1);
+              data4 =
+                _mm512_shuffle_epi8 (data4,
+                                     shuffle_constant) ^ chorba8 ^ chorba7 ^
+                chorba6;
+              data3 = _mm512_xor_si512 (data3, data4);
+
+              data5 = _mm512_xor_si512 (data5, data6);
+              data6 = _mm512_loadu_si512 (datap + 2);
+              data6 =
+                _mm512_shuffle_epi8 (data6,
+                                     shuffle_constant) ^ chorba8 ^ chorba7;
+              data5 = _mm512_xor_si512 (data5, data6);
+
+              data7 = _mm512_xor_si512 (data7, data8);
+              data8 = _mm512_loadu_si512 (datap + 3);
+              data8 = _mm512_shuffle_epi8 (data8, shuffle_constant) ^ chorba8;
+              data7 = _mm512_xor_si512 (data7, data8);
+
+              bytes_read -= (64 * 4);
+            }
+
+          while (bytes_read >= 64 * 8)
             {
               datap += 4;
 
@@ -154,7 +602,7 @@ cksum_avx512 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out)
               data8 = _mm512_shuffle_epi8 (data8, shuffle_constant);
               data7 = _mm512_xor_si512 (data7, data8);
 
-              bytes_read -= (16 * 4 * 4);
+              bytes_read -= (64 * 4);
             }
           /* At end of loop we write out results from variables back into
              the buffer, for use in single fold loop */
diff --git a/src/cksum_pclmul.c b/src/cksum_pclmul.c
index f88f6cc01..51162fecb 100644
--- a/src/cksum_pclmul.c
+++ b/src/cksum_pclmul.c
@@ -28,7 +28,7 @@
 extern uint_fast32_t const crctab[8][256];
 
 extern bool
-cksum_pclmul (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out);
+cksum_pclmul (FILE * fp, uint_fast32_t * crc_out, uintmax_t * length_out);
 
 /* Calculate CRC32 using PCLMULQDQ CPU instruction found in x86/x64 CPUs */
 
@@ -41,6 +41,7 @@ cksum_pclmul (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out)
   size_t bytes_read;
   __m128i single_mult_constant;
   __m128i four_mult_constant;
+  __m128i twelve_mult_constant;
   __m128i shuffle_constant;
 
   if (!fp || !crc_out || !length_out)
@@ -48,9 +49,17 @@ cksum_pclmul (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out)
 
   /* These constants and general algorithms are taken from the Intel whitepaper
      "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
-  */
+     2^(128) mod P = 0xE8A45605
+     2^(128+64) mod P = 0xC5B9CD4C
+     2^(128*4) mod P = 0xE6228B11
+     2^(128*4+64) mod P = 0x8833794C
+     2^(128*12) mod P = 0xD2536D46
+     2^(128*12+64) mod P = 0xDC53DFCC
+   */
   single_mult_constant = _mm_set_epi64x (0xC5B9CD4C, 0xE8A45605);
   four_mult_constant = _mm_set_epi64x (0x8833794C, 0xE6228B11);
+  /* Extra fold for algorithm from https://arxiv.org/abs/2412.16398 */
+  twelve_mult_constant = _mm_set_epi64x (0xDC53DFCC, 0xD2536D46);
 
   /* Constant to byteswap a full SSE register */
   shuffle_constant = _mm_set_epi8 (0, 1, 2, 3, 4, 5, 6, 7, 8,
@@ -69,6 +78,14 @@ cksum_pclmul (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out)
       __m128i data8;
       __m128i fold_data;
       __m128i xor_crc;
+      __m128i chorba1;
+      __m128i chorba2;
+      __m128i chorba3;
+      __m128i chorba4;
+      __m128i chorba5;
+      __m128i chorba6;
+      __m128i chorba7;
+      __m128i chorba8;
 
       if (length + bytes_read < length)
         {
@@ -77,7 +94,7 @@ cksum_pclmul (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out)
         }
       length += bytes_read;
 
-      datap = (__m128i *)buf;
+      datap = (__m128i *) buf;
 
       /* Fold in parallel eight 16-byte blocks into four 16-byte blocks */
       if (bytes_read >= 16 * 8)
@@ -96,6 +113,376 @@ cksum_pclmul (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out)
           data7 = _mm_loadu_si128 (datap + 3);
           data7 = _mm_shuffle_epi8 (data7, shuffle_constant);
 
+          /* Chorba method: carry 8 vectors forward without pclmul.  */
+          while (bytes_read >= 512 + 64 + 16 * 8)
+            {
+              datap += 4;
+              chorba1 = _mm_loadu_si128 (datap);
+              chorba1 = _mm_shuffle_epi8 (chorba1, shuffle_constant);
+              chorba2 = _mm_loadu_si128 (datap + 1);
+              chorba2 = _mm_shuffle_epi8 (chorba2, shuffle_constant);
+              chorba3 = _mm_loadu_si128 (datap + 2);
+              chorba3 = _mm_shuffle_epi8 (chorba3, shuffle_constant);
+              chorba4 = _mm_loadu_si128 (datap + 3);
+              chorba4 = _mm_shuffle_epi8 (chorba4, shuffle_constant);
+              chorba5 = _mm_loadu_si128 (datap + 4);
+              chorba5 = _mm_shuffle_epi8 (chorba5, shuffle_constant);
+              chorba6 = _mm_loadu_si128 (datap + 5);
+              chorba6 = _mm_shuffle_epi8 (chorba6, shuffle_constant);
+              chorba7 = _mm_loadu_si128 (datap + 6);
+              chorba7 =
+                _mm_shuffle_epi8 (chorba7, shuffle_constant) ^ chorba1;
+              chorba8 = _mm_loadu_si128 (datap + 7);
+              chorba8 =
+                _mm_shuffle_epi8 (chorba8, shuffle_constant) ^ chorba2;
+              bytes_read -= (16 * 8);
+              datap += 8;
+
+              data2 = _mm_clmulepi64_si128 (data, twelve_mult_constant, 0x00);
+              data = _mm_clmulepi64_si128 (data, twelve_mult_constant, 0x11);
+              data4 =
+                _mm_clmulepi64_si128 (data3, twelve_mult_constant, 0x00);
+              data3 =
+                _mm_clmulepi64_si128 (data3, twelve_mult_constant, 0x11);
+              data6 =
+                _mm_clmulepi64_si128 (data5, twelve_mult_constant, 0x00);
+              data5 =
+                _mm_clmulepi64_si128 (data5, twelve_mult_constant, 0x11);
+              data8 =
+                _mm_clmulepi64_si128 (data7, twelve_mult_constant, 0x00);
+              data7 =
+                _mm_clmulepi64_si128 (data7, twelve_mult_constant, 0x11);
+
+              data = _mm_xor_si128 (data, data2);
+              data2 = _mm_loadu_si128 (datap);
+              data2 = _mm_shuffle_epi8 (data2, shuffle_constant) ^ chorba3;
+              data = _mm_xor_si128 (data, data2);
+
+              data3 = _mm_xor_si128 (data3, data4);
+              data4 = _mm_loadu_si128 (datap + 1);
+              data4 =
+                _mm_shuffle_epi8 (data4,
+                                  shuffle_constant) ^ chorba4 ^ chorba1;
+              data3 = _mm_xor_si128 (data3, data4);
+
+              data5 = _mm_xor_si128 (data5, data6);
+              data6 = _mm_loadu_si128 (datap + 2);
+              data6 =
+                _mm_shuffle_epi8 (data6,
+                                  shuffle_constant) ^ chorba5 ^ chorba2 ^
+                chorba1;
+              data5 = _mm_xor_si128 (data5, data6);
+
+              data7 = _mm_xor_si128 (data7, data8);
+              data8 = _mm_loadu_si128 (datap + 3);
+              data8 =
+                _mm_shuffle_epi8 (data8,
+                                  shuffle_constant) ^ chorba6 ^ chorba3 ^
+                chorba2;
+              data7 = _mm_xor_si128 (data7, data8);
+
+              bytes_read -= (16 * 4);
+              datap += 4;
+
+              data2 = _mm_clmulepi64_si128 (data, four_mult_constant, 0x00);
+              data = _mm_clmulepi64_si128 (data, four_mult_constant, 0x11);
+              data4 = _mm_clmulepi64_si128 (data3, four_mult_constant, 0x00);
+              data3 = _mm_clmulepi64_si128 (data3, four_mult_constant, 0x11);
+              data6 = _mm_clmulepi64_si128 (data5, four_mult_constant, 0x00);
+              data5 = _mm_clmulepi64_si128 (data5, four_mult_constant, 0x11);
+              data8 = _mm_clmulepi64_si128 (data7, four_mult_constant, 0x00);
+              data7 = _mm_clmulepi64_si128 (data7, four_mult_constant, 0x11);
+
+              data = _mm_xor_si128 (data, data2);
+              data2 = _mm_loadu_si128 (datap);
+              data2 =
+                _mm_shuffle_epi8 (data2,
+                                  shuffle_constant) ^ chorba7 ^ chorba4 ^
+                chorba3;
+              data = _mm_xor_si128 (data, data2);
+
+              data3 = _mm_xor_si128 (data3, data4);
+              data4 = _mm_loadu_si128 (datap + 1);
+              data4 =
+                _mm_shuffle_epi8 (data4,
+                                  shuffle_constant) ^ chorba8 ^ chorba5 ^
+                chorba4;
+              data3 = _mm_xor_si128 (data3, data4);
+
+              data5 = _mm_xor_si128 (data5, data6);
+              data6 = _mm_loadu_si128 (datap + 2);
+              data6 =
+                _mm_shuffle_epi8 (data6,
+                                  shuffle_constant) ^ chorba6 ^ chorba5;
+              data5 = _mm_xor_si128 (data5, data6);
+
+              data7 = _mm_xor_si128 (data7, data8);
+              data8 = _mm_loadu_si128 (datap + 3);
+              data8 =
+                _mm_shuffle_epi8 (data8,
+                                  shuffle_constant) ^ chorba7 ^ chorba6;
+              data7 = _mm_xor_si128 (data7, data8);
+
+              bytes_read -= (16 * 4);
+
+              datap += 4;
+
+              data2 = _mm_clmulepi64_si128 (data, four_mult_constant, 0x00);
+              data = _mm_clmulepi64_si128 (data, four_mult_constant, 0x11);
+              data4 = _mm_clmulepi64_si128 (data3, four_mult_constant, 0x00);
+              data3 = _mm_clmulepi64_si128 (data3, four_mult_constant, 0x11);
+              data6 = _mm_clmulepi64_si128 (data5, four_mult_constant, 0x00);
+              data5 = _mm_clmulepi64_si128 (data5, four_mult_constant, 0x11);
+              data8 = _mm_clmulepi64_si128 (data7, four_mult_constant, 0x00);
+              data7 = _mm_clmulepi64_si128 (data7, four_mult_constant, 0x11);
+
+              data = _mm_xor_si128 (data, data2);
+              data2 = _mm_loadu_si128 (datap);
+              data2 =
+                _mm_shuffle_epi8 (data2,
+                                  shuffle_constant) ^ chorba8 ^ chorba7 ^
+                chorba1;
+              data = _mm_xor_si128 (data, data2);
+
+              data3 = _mm_xor_si128 (data3, data4);
+              data4 = _mm_loadu_si128 (datap + 1);
+              data4 =
+                _mm_shuffle_epi8 (data4,
+                                  shuffle_constant) ^ chorba8 ^ chorba2;
+              data3 = _mm_xor_si128 (data3, data4);
+
+              data5 = _mm_xor_si128 (data5, data6);
+              data6 = _mm_loadu_si128 (datap + 2);
+              data6 = _mm_shuffle_epi8 (data6, shuffle_constant) ^ chorba3;
+              data5 = _mm_xor_si128 (data5, data6);
+
+              data7 = _mm_xor_si128 (data7, data8);
+              data8 = _mm_loadu_si128 (datap + 3);
+              data8 = _mm_shuffle_epi8 (data8, shuffle_constant) ^ chorba4;
+              data7 = _mm_xor_si128 (data7, data8);
+
+              bytes_read -= (16 * 4);
+
+              datap += 4;
+
+              data2 = _mm_clmulepi64_si128 (data, four_mult_constant, 0x00);
+              data = _mm_clmulepi64_si128 (data, four_mult_constant, 0x11);
+              data4 = _mm_clmulepi64_si128 (data3, four_mult_constant, 0x00);
+              data3 = _mm_clmulepi64_si128 (data3, four_mult_constant, 0x11);
+              data6 = _mm_clmulepi64_si128 (data5, four_mult_constant, 0x00);
+              data5 = _mm_clmulepi64_si128 (data5, four_mult_constant, 0x11);
+              data8 = _mm_clmulepi64_si128 (data7, four_mult_constant, 0x00);
+              data7 = _mm_clmulepi64_si128 (data7, four_mult_constant, 0x11);
+
+              data = _mm_xor_si128 (data, data2);
+              data2 = _mm_loadu_si128 (datap);
+              data2 =
+                _mm_shuffle_epi8 (data2,
+                                  shuffle_constant) ^ chorba5 ^ chorba1;
+              data = _mm_xor_si128 (data, data2);
+
+              data3 = _mm_xor_si128 (data3, data4);
+              data4 = _mm_loadu_si128 (datap + 1);
+              data4 =
+                _mm_shuffle_epi8 (data4,
+                                  shuffle_constant) ^ chorba6 ^ chorba2 ^
+                chorba1;
+              data3 = _mm_xor_si128 (data3, data4);
+
+              data5 = _mm_xor_si128 (data5, data6);
+              data6 = _mm_loadu_si128 (datap + 2);
+              data6 =
+                _mm_shuffle_epi8 (data6,
+                                  shuffle_constant) ^ chorba7 ^ chorba3 ^
+                chorba2 ^ chorba1;
+              data5 = _mm_xor_si128 (data5, data6);
+
+              data7 = _mm_xor_si128 (data7, data8);
+              data8 = _mm_loadu_si128 (datap + 3);
+              data8 =
+                _mm_shuffle_epi8 (data8,
+                                  shuffle_constant) ^ chorba8 ^ chorba4 ^
+                chorba3 ^ chorba2;
+              data7 = _mm_xor_si128 (data7, data8);
+
+              bytes_read -= (16 * 4);
+
+              datap += 4;
+
+              data2 = _mm_clmulepi64_si128 (data, four_mult_constant, 0x00);
+              data = _mm_clmulepi64_si128 (data, four_mult_constant, 0x11);
+              data4 = _mm_clmulepi64_si128 (data3, four_mult_constant, 0x00);
+              data3 = _mm_clmulepi64_si128 (data3, four_mult_constant, 0x11);
+              data6 = _mm_clmulepi64_si128 (data5, four_mult_constant, 0x00);
+              data5 = _mm_clmulepi64_si128 (data5, four_mult_constant, 0x11);
+              data8 = _mm_clmulepi64_si128 (data7, four_mult_constant, 0x00);
+              data7 = _mm_clmulepi64_si128 (data7, four_mult_constant, 0x11);
+
+              data = _mm_xor_si128 (data, data2);
+              data2 = _mm_loadu_si128 (datap);
+              data2 = _mm_shuffle_epi8 (data2, shuffle_constant)
+                ^ chorba5 ^ chorba4 ^ chorba3 ^ chorba1;
+              data = _mm_xor_si128 (data, data2);
+
+              data3 = _mm_xor_si128 (data3, data4);
+              data4 = _mm_loadu_si128 (datap + 1);
+              data4 = _mm_shuffle_epi8 (data4, shuffle_constant)
+                ^ chorba6 ^ chorba5 ^ chorba4 ^ chorba2 ^ chorba1;
+              data3 = _mm_xor_si128 (data3, data4);
+
+              data5 = _mm_xor_si128 (data5, data6);
+              data6 = _mm_loadu_si128 (datap + 2);
+              data6 = _mm_shuffle_epi8 (data6, shuffle_constant)
+                ^ chorba7 ^ chorba6 ^ chorba5 ^ chorba3 ^ chorba2;
+              data5 = _mm_xor_si128 (data5, data6);
+
+              data7 = _mm_xor_si128 (data7, data8);
+              data8 = _mm_loadu_si128 (datap + 3);
+              data8 = _mm_shuffle_epi8 (data8, shuffle_constant)
+                ^ chorba8 ^ chorba7 ^ chorba6 ^ chorba4 ^ chorba3 ^ chorba1;
+              data7 = _mm_xor_si128 (data7, data8);
+
+              bytes_read -= (16 * 4);
+
+              datap += 4;
+
+              data2 = _mm_clmulepi64_si128 (data, four_mult_constant, 0x00);
+              data = _mm_clmulepi64_si128 (data, four_mult_constant, 0x11);
+              data4 = _mm_clmulepi64_si128 (data3, four_mult_constant, 0x00);
+              data3 = _mm_clmulepi64_si128 (data3, four_mult_constant, 0x11);
+              data6 = _mm_clmulepi64_si128 (data5, four_mult_constant, 0x00);
+              data5 = _mm_clmulepi64_si128 (data5, four_mult_constant, 0x11);
+              data8 = _mm_clmulepi64_si128 (data7, four_mult_constant, 0x00);
+              data7 = _mm_clmulepi64_si128 (data7, four_mult_constant, 0x11);
+
+              data = _mm_xor_si128 (data, data2);
+              data2 = _mm_loadu_si128 (datap);
+              data2 = _mm_shuffle_epi8 (data2, shuffle_constant)
+                ^ chorba8 ^ chorba7 ^ chorba5 ^ chorba4 ^ chorba2 ^ chorba1;
+              data = _mm_xor_si128 (data, data2);
+
+              data3 = _mm_xor_si128 (data3, data4);
+              data4 = _mm_loadu_si128 (datap + 1);
+              data4 = _mm_shuffle_epi8 (data4, shuffle_constant)
+                ^ chorba8 ^ chorba6 ^ chorba5 ^ chorba3 ^ chorba2;
+              data3 = _mm_xor_si128 (data3, data4);
+
+              data5 = _mm_xor_si128 (data5, data6);
+              data6 = _mm_loadu_si128 (datap + 2);
+              data6 = _mm_shuffle_epi8 (data6, shuffle_constant)
+                ^ chorba7 ^ chorba6 ^ chorba4 ^ chorba3 ^ chorba1;
+              data5 = _mm_xor_si128 (data5, data6);
+
+              data7 = _mm_xor_si128 (data7, data8);
+              data8 = _mm_loadu_si128 (datap + 3);
+              data8 = _mm_shuffle_epi8 (data8, shuffle_constant)
+                ^ chorba8 ^ chorba7 ^ chorba5 ^ chorba4 ^ chorba2 ^ chorba1;
+              data7 = _mm_xor_si128 (data7, data8);
+
+              bytes_read -= (16 * 4);
+
+              datap += 4;
+
+              data2 = _mm_clmulepi64_si128 (data, four_mult_constant, 0x00);
+              data = _mm_clmulepi64_si128 (data, four_mult_constant, 0x11);
+              data4 = _mm_clmulepi64_si128 (data3, four_mult_constant, 0x00);
+              data3 = _mm_clmulepi64_si128 (data3, four_mult_constant, 0x11);
+              data6 = _mm_clmulepi64_si128 (data5, four_mult_constant, 0x00);
+              data5 = _mm_clmulepi64_si128 (data5, four_mult_constant, 0x11);
+              data8 = _mm_clmulepi64_si128 (data7, four_mult_constant, 0x00);
+              data7 = _mm_clmulepi64_si128 (data7, four_mult_constant, 0x11);
+
+              data = _mm_xor_si128 (data, data2);
+              data2 = _mm_loadu_si128 (datap);
+              data2 = _mm_shuffle_epi8 (data2, shuffle_constant)
+                ^ chorba8 ^ chorba6 ^ chorba5 ^ chorba3 ^ chorba2 ^ chorba1;
+              data = _mm_xor_si128 (data, data2);
+
+              data3 = _mm_xor_si128 (data3, data4);
+              data4 = _mm_loadu_si128 (datap + 1);
+              data4 = _mm_shuffle_epi8 (data4, shuffle_constant)
+                ^ chorba7 ^ chorba6 ^ chorba4 ^ chorba3 ^ chorba2;
+              data3 = _mm_xor_si128 (data3, data4);
+
+              data5 = _mm_xor_si128 (data5, data6);
+              data6 = _mm_loadu_si128 (datap + 2);
+              data6 = _mm_shuffle_epi8 (data6, shuffle_constant)
+                ^ chorba8 ^ chorba7 ^ chorba5 ^ chorba4 ^ chorba3;
+              data5 = _mm_xor_si128 (data5, data6);
+
+              data7 = _mm_xor_si128 (data7, data8);
+              data8 = _mm_loadu_si128 (datap + 3);
+              data8 = _mm_shuffle_epi8 (data8, shuffle_constant)
+                ^ chorba8 ^ chorba6 ^ chorba5 ^ chorba4;
+              data7 = _mm_xor_si128 (data7, data8);
+
+              bytes_read -= (16 * 4);
+
+              datap += 4;
+
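+              /* Last round of this pass: only the tail of the Chorba
+                 schedule (chorba5 through chorba8) remains to be xored
+                 in.  */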
+              data2 = _mm_clmulepi64_si128 (data, four_mult_constant, 0x00);
+              data = _mm_clmulepi64_si128 (data, four_mult_constant, 0x11);
+              data4 = _mm_clmulepi64_si128 (data3, four_mult_constant, 0x00);
+              data3 = _mm_clmulepi64_si128 (data3, four_mult_constant, 0x11);
+              data6 = _mm_clmulepi64_si128 (data5, four_mult_constant, 0x00);
+              data5 = _mm_clmulepi64_si128 (data5, four_mult_constant, 0x11);
+              data8 = _mm_clmulepi64_si128 (data7, four_mult_constant, 0x00);
+              data7 = _mm_clmulepi64_si128 (data7, four_mult_constant, 0x11);
+
+              data = _mm_xor_si128 (data, data2);
+              data2 = _mm_loadu_si128 (datap);
+              data2 = _mm_shuffle_epi8 (data2, shuffle_constant)
+                ^ chorba7 ^ chorba6 ^ chorba5;
+              data = _mm_xor_si128 (data, data2);
+
+              data3 = _mm_xor_si128 (data3, data4);
+              data4 = _mm_loadu_si128 (datap + 1);
+              data4 = _mm_shuffle_epi8 (data4, shuffle_constant)
+                ^ chorba8 ^ chorba7 ^ chorba6;
+              data3 = _mm_xor_si128 (data3, data4);
+
+              data5 = _mm_xor_si128 (data5, data6);
+              data6 = _mm_loadu_si128 (datap + 2);
+              data6 = _mm_shuffle_epi8 (data6, shuffle_constant)
+                ^ chorba8 ^ chorba7;
+              data5 = _mm_xor_si128 (data5, data6);
+
+              data7 = _mm_xor_si128 (data7, data8);
+              data8 = _mm_loadu_si128 (datap + 3);
+              data8 = _mm_shuffle_epi8 (data8, shuffle_constant) ^ chorba8;
+              data7 = _mm_xor_si128 (data7, data8);
+
+              bytes_read -= (16 * 4);
+            }
 
           while (bytes_read >= 16 * 8)
             {
@@ -175,7 +562,7 @@ cksum_pclmul (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out)
         }
 
       /* And finish up last 0-31 bytes in a byte by byte fashion */
-      unsigned char *cp = (unsigned char *)datap;
+      unsigned char *cp = (unsigned char *) datap;
       while (bytes_read--)
         crc = (crc << 8) ^ crctab[0][((crc >> 24) ^ *cp++) & 0xFF];
       if (feof (fp))
-- 
2.43.0

