From e57c79a54f8c9767c2c13c243ba8623bd2116e50 Mon Sep 17 00:00:00 2001
From: Sam Russell <sam.h.russell@gmail.com>
Date: Thu, 28 Nov 2024 20:28:21 +0100
Subject: [PATCH] cksum: Use ARMv8 SIMD extensions

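Like the existing x86 pclmul implementation, this folds the CRC32
state across the input with carry-less multiplication, here using the
ARMv8 VMULL instruction.  Support is detected at run time via
getauxval (AT_HWCAP) & HWCAP_PMULL, falling back to the generic
slice-by-8 implementation when unavailable.
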
* configure.ac: Add check for ARMv8 VMULL support.
* src/cksum.c: Add ARMv8 VMULL detection function.
* src/cksum.h: Add ARMv8 VMULL implementation declaration.
* src/cksum_vmull.c: ARMv8 VMULL implementation (new file).
* src/local.mk: Add build flags for ARMv8 VMULL.
* NEWS: Mention the ARMv8 SIMD improvement.
---
 NEWS              |   3 +
 configure.ac      |  34 +++++++++
 src/cksum.c       |  26 +++++++
 src/cksum.h       |   3 +
 src/cksum_vmull.c | 229 ++++++++++++++++++++++++++++++++++++++++++++++
 src/local.mk      |   7 ++
 6 files changed, 302 insertions(+)
 create mode 100644 src/cksum_vmull.c

diff --git a/NEWS b/NEWS
index 5bd9199e4..409f8ca3e 100644
--- a/NEWS
+++ b/NEWS
@@ -66,6 +66,9 @@ GNU coreutils NEWS                                    -*- outline -*-
 
 ** Improvements
 
+  cksum -a crc, makes use of ARMv8 SIMD extensions for a time reduction of
+  80%.
+
   cksum -a crc, makes use of AVX2 and AVX512 extensions for time reductions
   of 40% and 60% respectively.
 
diff --git a/configure.ac b/configure.ac
index 17fa23b45..f167f226f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -618,6 +618,40 @@ if test $utils_cv_brain_16_bit_supported = yes; then
   AC_DEFINE([BF16_SUPPORTED], [1], [Brain 16 bit float supported])
 fi
 
+ac_save_CFLAGS=$CFLAGS
+CFLAGS="-march=armv8-a+crypto $CFLAGS"
+AC_MSG_CHECKING([if vmull intrinsic exists])
+AC_CACHE_VAL([utils_cv_vmull_intrinsic_exists],[
+AC_LINK_IFELSE(
+  [AC_LANG_SOURCE([[
+    #include <stdio.h>
+    #include <sys/auxv.h>
+    #include <asm/hwcap.h>
+    #include <arm_neon.h>
+
+    int
+    main (void)
+    {
+      uint64x2_t a = vdupq_n_u64 (0);
+      poly64_t shift64 = vget_lane_p64 (vcreate_p64 (0xB8BC6765), 0);
+      a = vreinterpretq_u64_p128 (vmull_p64 (shift64, vgetq_lane_p64 (vreinterpretq_p64_u64 (a), 0)));
+      return (getauxval (AT_HWCAP) & HWCAP_PMULL) > 0;
+    }
+  ]])
+  ],[
+    utils_cv_vmull_intrinsic_exists=yes
+  ],[
+    utils_cv_vmull_intrinsic_exists=no
+  ])])
+AC_MSG_RESULT([$utils_cv_vmull_intrinsic_exists])
+if test $utils_cv_vmull_intrinsic_exists = yes; then
+  AC_DEFINE([USE_VMULL_CRC32], [1],
+            [CRC32 calculation by vmull hardware instruction enabled])
+fi
+AM_CONDITIONAL([USE_VMULL_CRC32],
+               [test $utils_cv_vmull_intrinsic_exists = yes])
+CFLAGS=$ac_save_CFLAGS
+
 ac_save_CFLAGS=$CFLAGS
 CFLAGS="-mavx -mpclmul $CFLAGS"
 AC_MSG_CHECKING([if pclmul intrinsic exists])
diff --git a/src/cksum.c b/src/cksum.c
index 5900d141e..6e8f869ed 100644
--- a/src/cksum.c
+++ b/src/cksum.c
@@ -40,6 +40,11 @@
 #include <endian.h>
 #include "system.h"
 
+#ifdef USE_VMULL_CRC32
+# include <sys/auxv.h>
+# include <asm/hwcap.h>
+#endif
+
 #ifdef CRCTAB
 
 # define BIT(x)	((uint_fast32_t) 1 << (x))
@@ -201,6 +206,25 @@ avx512_supported (void)
   return avx512_enabled;
 }
 
+static bool
+vmull_supported (void)
+{
+  /* Check for the VMULL polynomial multiply instruction.  */
+  bool vmull_enabled = false;
+# if USE_VMULL_CRC32
+
+  vmull_enabled = (getauxval (AT_HWCAP) & HWCAP_PMULL) > 0;
+
+  if (cksum_debug)
+    error (0, 0, "%s",
+           (vmull_enabled
+            ? _("using vmull hardware support")
+            : _("vmull support not detected")));
+# endif
+
+  return vmull_enabled;
+}
+
 static bool
 cksum_slice8 (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out)
 {
@@ -273,6 +297,8 @@ crc_sum_stream (FILE *stream, void *resstream, uintmax_t *length)
         cksum_fp = cksum_avx2;
       else if (pclmul_supported ())
         cksum_fp = cksum_pclmul;
+      else if (vmull_supported ())
+        cksum_fp = cksum_vmull;
       else
         cksum_fp = cksum_slice8;
     }
diff --git a/src/cksum.h b/src/cksum.h
index 6e8a5d008..c42491a95 100644
--- a/src/cksum.h
+++ b/src/cksum.h
@@ -14,6 +14,9 @@ output_crc (char const *file, int binary_file, void const *digest, bool raw,
             bool tagged, unsigned char delim, bool args, uintmax_t length)
   _GL_ATTRIBUTE_NONNULL ((3));
 
+extern bool
+cksum_vmull (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out);
+
 extern bool
 cksum_pclmul (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out);
 
diff --git a/src/cksum_vmull.c b/src/cksum_vmull.c
new file mode 100644
index 000000000..5aae8e962
--- /dev/null
+++ b/src/cksum_vmull.c
@@ -0,0 +1,229 @@
+/* cksum -- calculate and print POSIX checksums and sizes of files
+   Copyright (C) 1992-2024 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+#include <stdio.h>
+#include <sys/types.h>
+#include <stdint.h>
+#include <arm_neon.h>
+#include "system.h"
+
+/* Number of bytes to read at once.  */
+#define BUFLEN (1 << 16)
+
+extern uint_fast32_t const crctab[8][256];
+
+extern bool
+cksum_vmull (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out);
+
+/* Swap the byte order of a 128-bit vector.  cksum's CRC is not
+   bit-reflected, so the little-endian loads must be byte-reversed
+   before folding.  */
+static uint64x2_t
+bswap_neon (uint64x2_t in)
+{
+  uint64x2_t a = vreinterpretq_u64_u8 (vrev64q_u8 (vreinterpretq_u8_u64 (in)));
+  a = vcombine_u64 (vget_high_u64 (a), vget_low_u64 (a));
+  return a;
+}
+
+/* Calculate CRC32 using the VMULL instruction found in ARMv8 CPUs.  */
+
+bool
+cksum_vmull (FILE *fp, uint_fast32_t *crc_out, uintmax_t *length_out)
+{
+  uint64x2_t buf[BUFLEN / sizeof (uint64x2_t)];
+  uint_fast32_t crc = 0;
+  uintmax_t length = 0;
+  size_t bytes_read;
+  poly64x2_t single_mult_constant;
+  poly64x2_t four_mult_constant;
+
+  if (!fp || !crc_out || !length_out)
+    return false;
+
+  /* These constants and the general algorithm are taken from the Intel
+     whitepaper "Fast CRC Computation for Generic Polynomials Using
+     PCLMULQDQ Instruction".  */
+  single_mult_constant = vcombine_p64 (vcreate_p64 (0xE8A45605),
+                                       vcreate_p64 (0xC5B9CD4C));
+  four_mult_constant = vcombine_p64 (vcreate_p64 (0xE6228B11),
+                                     vcreate_p64 (0x8833794C));
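+  /* Folding works because, with the message viewed as a polynomial
+     over GF(2), multiplying a chunk by a precomputed constant
+     x^k mod P(x) advances that chunk k bits through the message
+     without changing the final remainder, so distant chunks can be
+     combined with XOR.  */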
+
+  while ((bytes_read = fread (buf, 1, BUFLEN, fp)) > 0)
+    {
+      uint64x2_t *datap;
+      uint64x2_t data;
+      uint64x2_t data2;
+      uint64x2_t data3;
+      uint64x2_t data4;
+      uint64x2_t data5;
+      uint64x2_t data6;
+      uint64x2_t data7;
+      uint64x2_t data8;
+      uint64x2_t fold_data;
+      uint64x2_t xor_crc;
+
+      if (length + bytes_read < length)
+        {
+          errno = EOVERFLOW;
+          return false;
+        }
+      length += bytes_read;
+
+      datap = (uint64x2_t *)buf;
+
+      /* Fold in parallel eight 16-byte blocks into four 16-byte blocks.  */
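+      /* Four independent fold streams are kept in flight to hide the
+         latency of the VMULL instruction; they are recombined by the
+         single-fold loop below.  */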
+      if (bytes_read >= 16 * 8)
+        {
+          data = vld1q_u64 ((uint64_t*) (datap));
+          data = bswap_neon (data);
+          /* XOR in the initial CRC value (zero on the first iteration,
+             or the CRC carried over from the previous BUFLEN buffer).  */
+          xor_crc = vcombine_u64 (vcreate_u64 (0),
+                                  vcreate_u64 ((uint64_t) crc << 32));
+          crc = 0;
+          data = veorq_u64 (data, xor_crc);
+          data3 = vld1q_u64 ((uint64_t*) (datap + 1));
+          data3 = bswap_neon (data3);
+          data5 = vld1q_u64 ((uint64_t*) (datap + 2));
+          data5 = bswap_neon (data5);
+          data7 = vld1q_u64 ((uint64_t*) (datap + 3));
+          data7 = bswap_neon (data7);
+
+          while (bytes_read >= 16 * 8)
+            {
+              datap += 4;
+
+              /* Do the multiplication for four consecutive 16-byte blocks.  */
+              data2 = vreinterpretq_u64_p128
+                (vmull_p64 (vgetq_lane_p64 (vreinterpretq_p64_u64 (data), 0),
+                            vgetq_lane_p64 (four_mult_constant, 0)));
+              data = vreinterpretq_u64_p128
+                (vmull_high_p64 (vreinterpretq_p64_u64 (data),
+                                 four_mult_constant));
+              data4 = vreinterpretq_u64_p128
+                (vmull_p64 (vgetq_lane_p64 (vreinterpretq_p64_u64 (data3), 0),
+                            vgetq_lane_p64 (four_mult_constant, 0)));
+              data3 = vreinterpretq_u64_p128
+                (vmull_high_p64 (vreinterpretq_p64_u64 (data3),
+                                 four_mult_constant));
+              data6 = vreinterpretq_u64_p128
+                (vmull_p64 (vgetq_lane_p64 (vreinterpretq_p64_u64 (data5), 0),
+                            vgetq_lane_p64 (four_mult_constant, 0)));
+              data5 = vreinterpretq_u64_p128
+                (vmull_high_p64 (vreinterpretq_p64_u64 (data5),
+                                 four_mult_constant));
+              data8 = vreinterpretq_u64_p128
+                (vmull_p64 (vgetq_lane_p64 (vreinterpretq_p64_u64 (data7), 0),
+                            vgetq_lane_p64 (four_mult_constant, 0)));
+              data7 = vreinterpretq_u64_p128
+                (vmull_high_p64 (vreinterpretq_p64_u64 (data7),
+                                 four_mult_constant));
+
+              /* Now the multiplication results for the four blocks are
+                 XORed with the next four 16-byte blocks from the buffer.
+                 This effectively "consumes" the first four blocks from
+                 the buffer.  Keep the XOR results in variables for the
+                 multiplication in the next round of the loop.  */
+              data = veorq_u64 (data, data2);
+              data2 = vld1q_u64 ((uint64_t*) (datap));
+              data2 = bswap_neon (data2);
+              data = veorq_u64 (data, data2);
+
+              data3 = veorq_u64 (data3, data4);
+              data4 = vld1q_u64 ((uint64_t*) (datap + 1));
+              data4 = bswap_neon (data4);
+              data3 = veorq_u64 (data3, data4);
+
+              data5 = veorq_u64 (data5, data6);
+              data6 = vld1q_u64 ((uint64_t*) (datap + 2));
+              data6 = bswap_neon (data6);
+              data5 = veorq_u64 (data5, data6);
+
+              data7 = veorq_u64 (data7, data8);
+              data8 = vld1q_u64 ((uint64_t*) (datap + 3));
+              data8 = bswap_neon (data8);
+              data7 = veorq_u64 (data7, data8);
+
+              bytes_read -= (16 * 4);
+            }
+          /* At the end of the loop, write the results back into the
+             buffer, for use in the single-fold loop below.  */
+          data = bswap_neon (data);
+          vst1q_u64((uint64_t*) (datap), data);
+          data3 = bswap_neon (data3);
+          vst1q_u64((uint64_t*) (datap + 1), data3);
+          data5 = bswap_neon (data5);
+          vst1q_u64((uint64_t*) (datap + 2), data5);
+          data7 = bswap_neon (data7);
+          vst1q_u64((uint64_t*) (datap + 3), data7);
+        }
+
+      /* Fold two 16-byte blocks into one 16-byte block.  */
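+      /* This pass also folds together the four blocks stored back into
+         the buffer by the parallel loop above.  */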
+      if (bytes_read >= 32)
+        {
+          data = vld1q_u64 ((uint64_t*) (datap));
+          data = bswap_neon (data);
+          xor_crc = vcombine_u64 (vcreate_u64 (0),
+                                  vcreate_u64 ((uint64_t) crc << 32));
+          crc = 0;
+          data = veorq_u64 (data, xor_crc);
+          while (bytes_read >= 32)
+            {
+              datap++;
+
+              data2 = vreinterpretq_u64_p128
+                (vmull_p64 (vgetq_lane_p64 (vreinterpretq_p64_u64 (data), 0),
+                            vgetq_lane_p64 (single_mult_constant, 0)));
+              data = vreinterpretq_u64_p128
+                (vmull_high_p64 (vreinterpretq_p64_u64 (data),
+                                 single_mult_constant));
+              fold_data = vld1q_u64 ((uint64_t*) (datap));
+              fold_data = bswap_neon (fold_data);
+              data = veorq_u64 (data, data2);
+              data = veorq_u64 (data, fold_data);
+              bytes_read -= 16;
+            }
+          data = bswap_neon (data);
+          vst1q_u64((uint64_t*) (datap), data);
+        }
+
+      /* Finish up the last 0-31 bytes byte by byte; this also reduces
+         any folded block stored back into the buffer above.  */
+      unsigned char *cp = (unsigned char *) datap;
+      while (bytes_read--)
+        crc = (crc << 8) ^ crctab[0][((crc >> 24) ^ *cp++) & 0xFF];
+      if (feof (fp))
+        break;
+    }
+
+  *crc_out = crc;
+  *length_out = length;
+
+  return !ferror (fp);
+}
diff --git a/src/local.mk b/src/local.mk
index 90c98cc50..0d505052c 100644
--- a/src/local.mk
+++ b/src/local.mk
@@ -461,6 +461,13 @@ cksum_pclmul_ldadd = src/libcksum_pclmul.a
 src_cksum_LDADD += $(cksum_pclmul_ldadd)
 src_libcksum_pclmul_a_CFLAGS = -mavx -mpclmul $(AM_CFLAGS)
 endif
+if USE_VMULL_CRC32
+noinst_LIBRARIES += src/libcksum_vmull.a
+src_libcksum_vmull_a_SOURCES = src/cksum_vmull.c src/cksum.h
+cksum_vmull_ldadd = src/libcksum_vmull.a
+src_cksum_LDADD += $(cksum_vmull_ldadd)
+src_libcksum_vmull_a_CFLAGS = -march=armv8-a+crypto $(AM_CFLAGS)
+endif
 
 src_base64_SOURCES = src/basenc.c
 src_base64_CPPFLAGS = -DBASE_TYPE=64 $(AM_CPPFLAGS)
-- 
2.43.0

