On Wed, May 14, 2025 I wrote: > > We did something similar for x86 for v18, and here is some progress > towards Arm support.
Coming back to this, since there's been recent interest in Arm support. v2 is a rebase, with a few changes. - I simplified it by leaving out the inlining for "assume CRC" builds, since I wanted to avoid alignment considerations if I could. I think always indirecting through a pointer will have less risk of regressions in a realistic setting than for x86, since Arm chips typically have low latency for carryless multiplication instructions. With just a bit of code we can still use the direct call for small constant inputs, so I did that to avoid regressions under WAL insert lock. - One coding idiom for a vector literal in the generated code was giving pgindent indigestion, so I rewrote it using Neon intrinsics and verified it in Godbolt. > 0002: Like 3c6e8c12389 and in fact uses the same program to generate > the code, by specifying Neon instructions with the Arm "crypto" > extension instead. There are some interesting differences from x86 > here as well: > - The upstream implementation chose to use inline assembly instead of > intrinsics for some reason. I initially thought that was a way to get > broader compiler support, but it turns out you still need to pass the > relevant flags to get the assembly to link. To follow up for curiosity's sake, [1] says that Apple chips can issue PMULL + EOR as a single uop if they are next to each other in the instruction stream. > - I only have Meson support for now, since I used MacOS on CI to test. > That OS and compiler combination apparently targets the CRC extension, > but the PMULL instruction runtime check uses Linux-only headers, I > believe, so previously I hacked the choose function to return true for > testing. The choose function in 0002 is untested in this form. This is still true, but now the CI hack lives in a separate not-for-commit patch for clarity. Autoconf support is a WIP, and I will share that after I do some testing on an Arm Linux instance. 
[1] https://dougallj.github.io/applecpu/firestorm.html -- John Naylor Amazon Web Services
From 5057eb8ac0b0156de8b9371415a7dfc852cfd0c7 Mon Sep 17 00:00:00 2001 From: John Naylor <[email protected]> Date: Fri, 9 May 2025 19:48:26 +0700 Subject: [PATCH v2 1/2] Compute CRC32C on ARM using the Crypto Extension where available --- meson.build | 33 ++++++++ src/include/port/pg_crc32c.h | 22 ++++-- src/port/meson.build | 1 + src/port/pg_crc32c_armv8.c | 124 ++++++++++++++++++++++++++++++ src/port/pg_crc32c_armv8_choose.c | 34 ++++++++ 5 files changed, 209 insertions(+), 5 deletions(-) diff --git a/meson.build b/meson.build index 2064d1b0a8d..6401895a5da 100644 --- a/meson.build +++ b/meson.build @@ -2548,6 +2548,39 @@ int main(void) have_optimized_crc = true endif + # Check if the compiler supports ARMv8 CRYPTO carryless multiplication + # and exclusive-or inline assembly instructions used for computing CRC. + # Check __crc32cd here as well, since the full implementation relies on + # 8-byte CRC instructions. + prog = ''' +#include <arm_acle.h> +#include <arm_neon.h> +uint64x2_t a; +uint64x2_t b; +uint64x2_t c; + +int main(void) +{ + uint64x2_t r; + uint64x2_t r2; + +__asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r), "+w"(c):"w"(a), "w"(b)); +__asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r2), "+w"(c):"w"(a), "w"(b)); + + /* return computed value, to prevent the above being optimized away */ + r = veorq_u64(r, r2); + return __crc32cd(0, vgetq_lane_u64(r, 0)); +} +''' + + if cc.links(prog, + name: 'PMULL CRC32C', + args: test_c_args + ['-march=armv8-a+crc+simd+crypto']) + # Use ARM CRYPTO Extension, with runtime check + cflags_crc += '-march=armv8-a+crc+simd+crypto' + cdata.set('USE_PMULL_CRC32C_WITH_RUNTIME_CHECK', 1) + endif + elif host_cpu == 'loongarch64' prog = ''' diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h index 9ac619aec3e..bbfc68f6dd5 100644 --- a/src/include/port/pg_crc32c.h +++ b/src/include/port/pg_crc32c.h @@ -111,13 +111,22 @@ extern pg_crc32c 
pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t l #endif #elif defined(USE_ARMV8_CRC32C) -/* Use ARMv8 CRC Extension instructions. */ - +/* + * Use either ARMv8 CRC Extension or CRYPTO Extension (PMULL) instructions. + * We don't need a runtime check for CRC, so for small constant inputs, + * we can avoid an indirect function call. + */ #define COMP_CRC32C(crc, data, len) \ - ((crc) = pg_comp_crc32c_armv8((crc), (data), (len))) + ((crc) = __builtin_constant_p(len) && len < 32 ? \ + pg_comp_crc32c_armv8((crc), (data), (len)) : \ + pg_comp_crc32c((crc), (data), (len))) #define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF) +extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len); extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len); +#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK +extern pg_crc32c pg_comp_crc32c_pmull(pg_crc32c crc, const void *data, size_t len); +#endif #elif defined(USE_LOONGARCH_CRC32C) /* Use LoongArch CRCC instructions. */ @@ -131,8 +140,8 @@ extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_ #elif defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK) /* - * Use ARMv8 instructions, but perform a runtime check first - * to check that they are available. + * Use either ARMv8 CRC Extension or CRYPTO Extension (PMULL) instructions, + * but perform a runtime check first to check that they are available. 
*/ #define COMP_CRC32C(crc, data, len) \ ((crc) = pg_comp_crc32c((crc), (data), (len))) @@ -141,6 +150,9 @@ extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_ extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len); extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len); extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len); +#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK +extern pg_crc32c pg_comp_crc32c_pmull(pg_crc32c crc, const void *data, size_t len); +#endif #else /* diff --git a/src/port/meson.build b/src/port/meson.build index 28655142ebe..9faafbbe8bf 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -93,6 +93,7 @@ replace_funcs_pos = [ # arm / aarch64 ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C'], ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 'crc'], + ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C'], ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'], ['pg_crc32c_sb8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'], diff --git a/src/port/pg_crc32c_armv8.c b/src/port/pg_crc32c_armv8.c index 039986c7b33..7d70ad055cd 100644 --- a/src/port/pg_crc32c_armv8.c +++ b/src/port/pg_crc32c_armv8.c @@ -20,6 +20,10 @@ #include <arm_acle.h> #endif +#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK +#include <arm_neon.h> +#endif + #include "port/pg_crc32c.h" pg_crc32c @@ -77,3 +81,123 @@ pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len) return crc; } + +#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK + +/* + * Note: There is no copyright notice in the following generated code. 
+ * + * We have modified the output to + * - match our function declaration + * - match whitespace to our project style + * - be more friendly for pgindent + */ + +/* Generated by https://github.com/corsix/fast-crc32/ using: */ +/* ./generate -i neon -p crc32c -a v4e */ +/* MIT licensed */ + +static inline +uint64x2_t +clmul_lo_e(uint64x2_t a, uint64x2_t b, uint64x2_t c) +{ + uint64x2_t r; + +__asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r), "+w"(c):"w"(a), "w"(b)); + return r; +} + +static inline +uint64x2_t +clmul_hi_e(uint64x2_t a, uint64x2_t b, uint64x2_t c) +{ + uint64x2_t r; + +__asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r), "+w"(c):"w"(a), "w"(b)); + return r; +} + +pg_crc32c +pg_comp_crc32c_pmull(pg_crc32c crc, const void *data, size_t len) +{ + /* adjust names to match generated code */ + pg_crc32c crc0 = crc; + const char *buf = data; + + /* align to 16 bytes */ + for (; len && ((uintptr_t) buf & 7); --len) + { + crc0 = __crc32cb(crc0, *buf++); + } + if (((uintptr_t) buf & 8) && len >= 8) + { + crc0 = __crc32cd(crc0, *(const uint64_t *) buf); + buf += 8; + len -= 8; + } + + if (len >= 64) + { + const char *end = buf + len; + const char *limit = buf + len - 64; + + /* First vector chunk. */ + uint64x2_t x0 = vld1q_u64((const uint64_t *) buf), + y0; + uint64x2_t x1 = vld1q_u64((const uint64_t *) (buf + 16)), + y1; + uint64x2_t x2 = vld1q_u64((const uint64_t *) (buf + 32)), + y2; + uint64x2_t x3 = vld1q_u64((const uint64_t *) (buf + 48)), + y3; + uint64x2_t k; + + { + static const uint64_t pg_attribute_aligned(16) k_[] = {0x740eef02, 0x9e4addf8}; + + k = vld1q_u64(k_); + } + + /* + * pgindent complained of unmatched parens upstream: + * + * x0 = veorq_u64((uint64x2_t) {crc0, 0}, x0); + */ + x0 = veorq_u64((uint64x2_t) vsetq_lane_u64(crc0, vdupq_n_u64(0), 0), x0); + buf += 64; + + /* Main loop. 
*/ + while (buf <= limit) + { + y0 = clmul_lo_e(x0, k, vld1q_u64((const uint64_t *) buf)), x0 = clmul_hi_e(x0, k, y0); + y1 = clmul_lo_e(x1, k, vld1q_u64((const uint64_t *) (buf + 16))), x1 = clmul_hi_e(x1, k, y1); + y2 = clmul_lo_e(x2, k, vld1q_u64((const uint64_t *) (buf + 32))), x2 = clmul_hi_e(x2, k, y2); + y3 = clmul_lo_e(x3, k, vld1q_u64((const uint64_t *) (buf + 48))), x3 = clmul_hi_e(x3, k, y3); + buf += 64; + } + + /* Reduce x0 ... x3 to just x0. */ + { + static const uint64_t pg_attribute_aligned(16) k_[] = {0xf20c0dfe, 0x493c7d27}; + + k = vld1q_u64(k_); + } + y0 = clmul_lo_e(x0, k, x1), x0 = clmul_hi_e(x0, k, y0); + y2 = clmul_lo_e(x2, k, x3), x2 = clmul_hi_e(x2, k, y2); + { + static const uint64_t pg_attribute_aligned(16) k_[] = {0x3da6d0cb, 0xba4fc28e}; + + k = vld1q_u64(k_); + } + y0 = clmul_lo_e(x0, k, x2), x0 = clmul_hi_e(x0, k, y0); + + /* Reduce 128 bits to 32 bits, and multiply by x^32. */ + crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); + crc0 = __crc32cd(crc0, vgetq_lane_u64(x0, 1)); + len = end - buf; + } + + return pg_comp_crc32c_armv8(crc0, buf, len); +} + +#endif diff --git a/src/port/pg_crc32c_armv8_choose.c b/src/port/pg_crc32c_armv8_choose.c index a1f0e540c6b..ac6e2862e8e 100644 --- a/src/port/pg_crc32c_armv8_choose.c +++ b/src/port/pg_crc32c_armv8_choose.c @@ -108,6 +108,27 @@ pg_crc32c_armv8_available(void) #endif } +static inline bool +pg_pmull_available(void) +{ +#ifdef __aarch64__ + +#ifdef HAVE_ELF_AUX_INFO + unsigned long value; + + return elf_aux_info(AT_HWCAP, &value, sizeof(value)) == 0 && + (value & HWCAP_PMULL) != 0; +#elif defined(HAVE_GETAUXVAL) + return (getauxval(AT_HWCAP) & HWCAP_PMULL) != 0; +#else + return false; +#endif + +#else + return false; +#endif +} + /* * This gets called on the first call. It replaces the function pointer * so that subsequent calls are routed directly to the chosen implementation. 
@@ -115,10 +136,23 @@ pg_crc32c_armv8_available(void) static pg_crc32c pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len) { +#if defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK) if (pg_crc32c_armv8_available()) pg_comp_crc32c = pg_comp_crc32c_armv8; else pg_comp_crc32c = pg_comp_crc32c_sb8; +#elif defined(USE_ARMV8_CRC32C) + /* + * We still set the function pointer as a fallback for the PMULL + * implementation. + */ + pg_comp_crc32c = pg_comp_crc32c_armv8; +#endif + +#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK + if (pg_pmull_available()) + pg_comp_crc32c = pg_comp_crc32c_pmull; +#endif return pg_comp_crc32c(crc, data, len); } -- 2.52.0
From c871c612297d1c0526520b5bf4b5fef0713072c9 Mon Sep 17 00:00:00 2001 From: John Naylor <[email protected]> Date: Mon, 12 Jan 2026 15:35:07 +0700 Subject: [PATCH v2 2/2] Force testing on MacOS CI XXX not for commit --- src/port/pg_crc32c_armv8_choose.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/port/pg_crc32c_armv8_choose.c b/src/port/pg_crc32c_armv8_choose.c index ac6e2862e8e..1ed66b2fb76 100644 --- a/src/port/pg_crc32c_armv8_choose.c +++ b/src/port/pg_crc32c_armv8_choose.c @@ -150,7 +150,7 @@ pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len) #endif #ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK - if (pg_pmull_available()) + if (true || pg_pmull_available()) pg_comp_crc32c = pg_comp_crc32c_pmull; #endif -- 2.52.0
