On Wed, May 14, 2025 I wrote:
>
> We did something similar for x86 for v18, and here is some progress
> towards Arm support.

Coming back to this, since there's been recent interest in Arm support.

v2 is a rebase, with a few changes.

- I simplified it by leaving out the inlining for "assume CRC" builds,
since I wanted to avoid alignment considerations if I can. I think
always indirecting through a pointer will have less risk of
regressions in a realistic setting than for x86 since Arm chips
typically have low latency for carryless multiplication instructions.
With just a bit of code we can still use the direct call for small
constant inputs, so I did that to avoid regressions under WAL insert
lock.

- One coding idiom for a vector literal in the generated code was
giving pgindent indigestion, so I rewrote it using Neon intrinsics and
verified it in Godbolt.

> 0002: Like 3c6e8c12389 and in fact uses the same program to generate
> the code, by specifying Neon instructions with the Arm "crypto"
> extension instead. There are some interesting differences from x86
> here as well:
> - The upstream implementation chose to use inline assembly instead of
> intrinsics for some reason. I initially thought that was a way to get
> broader compiler support, but it turns out you still need to pass the
> relevant flags to get the assembly to link.

To follow up, for curiosity's sake: [1] says that Apple chips can issue
PMULL + EOR as a single uop if they are next to each other in the
instruction stream.

> - I only have Meson support for now, since I used MacOS on CI to test.
> That OS and compiler combination apparently targets the CRC extension,
> but the PMULL instruction runtime check uses Linux-only headers, I
> believe, so previously I hacked the choose function to return true for
> testing. The choose function in 0002 is untested in this form.

This is still true, but now the CI hack lives in a separate
not-for-commit patch for clarity.

autoconf support is a WIP, and I will share that after I do some
testing on an Arm Linux instance.

[1] https://dougallj.github.io/applecpu/firestorm.html

--
John Naylor
Amazon Web Services
From 5057eb8ac0b0156de8b9371415a7dfc852cfd0c7 Mon Sep 17 00:00:00 2001
From: John Naylor <[email protected]>
Date: Fri, 9 May 2025 19:48:26 +0700
Subject: [PATCH v2 1/2] Compute CRC32C on ARM using the Crypto Extension where
 available

---
 meson.build                       |  33 ++++++++
 src/include/port/pg_crc32c.h      |  22 ++++--
 src/port/meson.build              |   1 +
 src/port/pg_crc32c_armv8.c        | 124 ++++++++++++++++++++++++++++++
 src/port/pg_crc32c_armv8_choose.c |  34 ++++++++
 5 files changed, 209 insertions(+), 5 deletions(-)

diff --git a/meson.build b/meson.build
index 2064d1b0a8d..6401895a5da 100644
--- a/meson.build
+++ b/meson.build
@@ -2548,6 +2548,39 @@ int main(void)
     have_optimized_crc = true
   endif
 
+    # Check if the compiler supports ARMv8 CRYPTO carryless multiplication
+    # and exclusive-or inline assembly instructions used for computing CRC.
+    # Check __crc32cd here as well, since the full implementation relies on
+    # 8-byte CRC instructions.
+    prog = '''
+#include <arm_acle.h>
+#include <arm_neon.h>
+uint64x2_t	a;
+uint64x2_t	b;
+uint64x2_t	c;
+
+int main(void)
+{
+    uint64x2_t	r;
+    uint64x2_t	r2;
+
+__asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r), "+w"(c):"w"(a), "w"(b));
+__asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r2), "+w"(c):"w"(a), "w"(b));
+
+    /* return computed value, to prevent the above being optimized away */
+    r = veorq_u64(r, r2);
+    return __crc32cd(0, vgetq_lane_u64(r, 0));
+}
+'''
+
+  if cc.links(prog,
+      name: 'PMULL CRC32C',
+      args: test_c_args + ['-march=armv8-a+crc+simd+crypto'])
+    # Use ARM CRYPTO Extension, with runtime check
+    cflags_crc += '-march=armv8-a+crc+simd+crypto'
+    cdata.set('USE_PMULL_CRC32C_WITH_RUNTIME_CHECK', 1)
+  endif
+
 elif host_cpu == 'loongarch64'
 
   prog = '''
diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h
index 9ac619aec3e..bbfc68f6dd5 100644
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -111,13 +111,22 @@ extern pg_crc32c pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t l
 #endif
 
 #elif defined(USE_ARMV8_CRC32C)
-/* Use ARMv8 CRC Extension instructions. */
-
+/*
+ * Use either ARMv8 CRC Extension or CRYPTO Extension (PMULL) instructions.
+ * We don't need a runtime check for CRC, so for small constant inputs
+ * (under 32 bytes) we call it directly, avoiding an indirect function call.
+ */
 #define COMP_CRC32C(crc, data, len)							\
-	((crc) = pg_comp_crc32c_armv8((crc), (data), (len)))
+	((crc) = __builtin_constant_p(len) && (len) < 32 ?		\
+		pg_comp_crc32c_armv8((crc), (data), (len)) : 		\
+		pg_comp_crc32c((crc), (data), (len)))
 #define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
 
+extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
 extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
+#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+extern pg_crc32c pg_comp_crc32c_pmull(pg_crc32c crc, const void *data, size_t len);
+#endif
 
 #elif defined(USE_LOONGARCH_CRC32C)
 /* Use LoongArch CRCC instructions. */
@@ -131,8 +140,8 @@ extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_
 #elif defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
 
 /*
- * Use ARMv8 instructions, but perform a runtime check first
- * to check that they are available.
+ * Use either ARMv8 CRC Extension or CRYPTO Extension (PMULL) instructions,
+ * but perform a runtime check first to check that they are available.
  */
 #define COMP_CRC32C(crc, data, len) \
 	((crc) = pg_comp_crc32c((crc), (data), (len)))
@@ -141,6 +150,9 @@ extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_
 extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len);
 extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
 extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
+#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+extern pg_crc32c pg_comp_crc32c_pmull(pg_crc32c crc, const void *data, size_t len);
+#endif
 
 #else
 /*
diff --git a/src/port/meson.build b/src/port/meson.build
index 28655142ebe..9faafbbe8bf 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -93,6 +93,7 @@ replace_funcs_pos = [
   # arm / aarch64
   ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C'],
   ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 'crc'],
+  ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C'],
   ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
   ['pg_crc32c_sb8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
 
diff --git a/src/port/pg_crc32c_armv8.c b/src/port/pg_crc32c_armv8.c
index 039986c7b33..7d70ad055cd 100644
--- a/src/port/pg_crc32c_armv8.c
+++ b/src/port/pg_crc32c_armv8.c
@@ -20,6 +20,10 @@
 #include <arm_acle.h>
 #endif
 
+#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+#include <arm_neon.h>
+#endif
+
 #include "port/pg_crc32c.h"
 
 pg_crc32c
@@ -77,3 +81,123 @@ pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len)
 
 	return crc;
 }
+
+#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+
+/*
+ * Note: There is no copyright notice in the following generated code.
+ *
+ * We have modified the output to
+ *   - match our function declaration
+ *   - match whitespace to our project style
+ *   - be more friendly for pgindent
+ */
+
+/* Generated by https://github.com/corsix/fast-crc32/ using: */
+/* ./generate -i neon -p crc32c -a v4e */
+/* MIT licensed */
+
+static inline
+uint64x2_t
+clmul_lo_e(uint64x2_t a, uint64x2_t b, uint64x2_t c)
+{
+	uint64x2_t	r;
+
+__asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r), "+w"(c):"w"(a), "w"(b));
+	return r;
+}
+
+static inline
+uint64x2_t
+clmul_hi_e(uint64x2_t a, uint64x2_t b, uint64x2_t c)
+{
+	uint64x2_t	r;
+
+__asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r), "+w"(c):"w"(a), "w"(b));
+	return r;
+}
+
+pg_crc32c
+pg_comp_crc32c_pmull(pg_crc32c crc, const void *data, size_t len)
+{
+	/* adjust names to match generated code */
+	pg_crc32c	crc0 = crc;
+	const char *buf = data;
+
+	/* align to 16 bytes */
+	for (; len && ((uintptr_t) buf & 7); --len)
+	{
+		crc0 = __crc32cb(crc0, *buf++);
+	}
+	if (((uintptr_t) buf & 8) && len >= 8)
+	{
+		crc0 = __crc32cd(crc0, *(const uint64_t *) buf);
+		buf += 8;
+		len -= 8;
+	}
+
+	if (len >= 64)
+	{
+		const char *end = buf + len;
+		const char *limit = buf + len - 64;
+
+		/* First vector chunk. */
+		uint64x2_t	x0 = vld1q_u64((const uint64_t *) buf),
+					y0;
+		uint64x2_t	x1 = vld1q_u64((const uint64_t *) (buf + 16)),
+					y1;
+		uint64x2_t	x2 = vld1q_u64((const uint64_t *) (buf + 32)),
+					y2;
+		uint64x2_t	x3 = vld1q_u64((const uint64_t *) (buf + 48)),
+					y3;
+		uint64x2_t	k;
+
+		{
+			static const uint64_t pg_attribute_aligned(16) k_[] = {0x740eef02, 0x9e4addf8};
+
+			k = vld1q_u64(k_);
+		}
+
+		/*
+		 * pgindent complained of unmatched parens upstream:
+		 *
+		 * x0 = veorq_u64((uint64x2_t) {crc0, 0}, x0);
+		 */
+		x0 = veorq_u64((uint64x2_t) vsetq_lane_u64(crc0, vdupq_n_u64(0), 0), x0);
+		buf += 64;
+
+		/* Main loop. */
+		while (buf <= limit)
+		{
+			y0 = clmul_lo_e(x0, k, vld1q_u64((const uint64_t *) buf)), x0 = clmul_hi_e(x0, k, y0);
+			y1 = clmul_lo_e(x1, k, vld1q_u64((const uint64_t *) (buf + 16))), x1 = clmul_hi_e(x1, k, y1);
+			y2 = clmul_lo_e(x2, k, vld1q_u64((const uint64_t *) (buf + 32))), x2 = clmul_hi_e(x2, k, y2);
+			y3 = clmul_lo_e(x3, k, vld1q_u64((const uint64_t *) (buf + 48))), x3 = clmul_hi_e(x3, k, y3);
+			buf += 64;
+		}
+
+		/* Reduce x0 ... x3 to just x0. */
+		{
+			static const uint64_t pg_attribute_aligned(16) k_[] = {0xf20c0dfe, 0x493c7d27};
+
+			k = vld1q_u64(k_);
+		}
+		y0 = clmul_lo_e(x0, k, x1), x0 = clmul_hi_e(x0, k, y0);
+		y2 = clmul_lo_e(x2, k, x3), x2 = clmul_hi_e(x2, k, y2);
+		{
+			static const uint64_t pg_attribute_aligned(16) k_[] = {0x3da6d0cb, 0xba4fc28e};
+
+			k = vld1q_u64(k_);
+		}
+		y0 = clmul_lo_e(x0, k, x2), x0 = clmul_hi_e(x0, k, y0);
+
+		/* Reduce 128 bits to 32 bits, and multiply by x^32. */
+		crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0));
+		crc0 = __crc32cd(crc0, vgetq_lane_u64(x0, 1));
+		len = end - buf;
+	}
+
+	return pg_comp_crc32c_armv8(crc0, buf, len);
+}
+
+#endif
diff --git a/src/port/pg_crc32c_armv8_choose.c b/src/port/pg_crc32c_armv8_choose.c
index a1f0e540c6b..ac6e2862e8e 100644
--- a/src/port/pg_crc32c_armv8_choose.c
+++ b/src/port/pg_crc32c_armv8_choose.c
@@ -108,6 +108,27 @@ pg_crc32c_armv8_available(void)
 #endif
 }
 
+static inline bool
+pg_pmull_available(void)
+{
+#ifdef __aarch64__
+
+#ifdef HAVE_ELF_AUX_INFO
+	unsigned long value;
+
+	return elf_aux_info(AT_HWCAP, &value, sizeof(value)) == 0 &&
+		(value & HWCAP_PMULL) != 0;
+#elif defined(HAVE_GETAUXVAL)
+	return (getauxval(AT_HWCAP) & HWCAP_PMULL) != 0;
+#else
+	return false;
+#endif
+
+#else
+	return false;
+#endif
+}
+
 /*
  * This gets called on the first call. It replaces the function pointer
  * so that subsequent calls are routed directly to the chosen implementation.
@@ -115,10 +136,23 @@ pg_crc32c_armv8_available(void)
 static pg_crc32c
 pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
 {
+#if defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
 	if (pg_crc32c_armv8_available())
 		pg_comp_crc32c = pg_comp_crc32c_armv8;
 	else
 		pg_comp_crc32c = pg_comp_crc32c_sb8;
+#elif defined(USE_ARMV8_CRC32C)
+	/*
+	 * We still set the function pointer as a fallback for the PMULL
+	 * implementation.
+	 */
+	pg_comp_crc32c = pg_comp_crc32c_armv8;
+#endif
+
+#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+	if (pg_pmull_available())
+		pg_comp_crc32c = pg_comp_crc32c_pmull;
+#endif
 
 	return pg_comp_crc32c(crc, data, len);
 }
-- 
2.52.0

From c871c612297d1c0526520b5bf4b5fef0713072c9 Mon Sep 17 00:00:00 2001
From: John Naylor <[email protected]>
Date: Mon, 12 Jan 2026 15:35:07 +0700
Subject: [PATCH v2 2/2] Force testing on MacOS CI XXX not for commit

---
 src/port/pg_crc32c_armv8_choose.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/port/pg_crc32c_armv8_choose.c b/src/port/pg_crc32c_armv8_choose.c
index ac6e2862e8e..1ed66b2fb76 100644
--- a/src/port/pg_crc32c_armv8_choose.c
+++ b/src/port/pg_crc32c_armv8_choose.c
@@ -150,7 +150,7 @@ pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
 #endif
 
 #ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
-	if (pg_pmull_available())
+	if (true || pg_pmull_available())
 		pg_comp_crc32c = pg_comp_crc32c_pmull;
 #endif
 
-- 
2.52.0

Reply via email to