I wrote: > autoconf support is a WIP, and I will share that after I do some > testing on an Arm Linux instance.
I've only checked paths with objdump and debugging printouts (no perf testing), but this seems to work in v3. My main concern now is whether it's a maintenance hazard to overwrite CFLAGS_CRC in a separate check. In master, we can have one of: CFLAGS_CRC="" CFLAGS_CRC="-march=armv8-a+crc+simd" CFLAGS_CRC="-march=armv8-a+crc" ...and then based on that we set either USE_ARMV8_CRC32C or USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK, and set PG_CRC32C_OBJS. But below that, v3 runs a new test for pmull instructions with the flag "-march=armv8-a+crc+simd+crypto" and if it links, it will reset CFLAGS_CRC to that set of flags. That doesn't seem like the right thing to do, but I don't see a good alternative. I suppose I could sidestep that with function attributes, but that's not as well supported. Another idea would be to turn the relevant line here if test x"$Ac_cachevar" = x"yes"; then CFLAGS_CRC="$1" pgac_arm_pmull_intrinsics=yes fi ...into CFLAGS_CRC="$CFLAGS_CRC$1", where in this case $1 is just "+crypto". That seems even more fragile, though. -- John Naylor Amazon Web Services
From f4227534f6090c4c1ddef6f44975fe506c2eb0b3 Mon Sep 17 00:00:00 2001 From: John Naylor <[email protected]> Date: Tue, 31 Mar 2026 17:40:38 +0700 Subject: [PATCH v3] Compute CRC32C on ARM using the Crypto Extension where available --- config/c-compiler.m4 | 38 ++++++++- configure | 57 +++++++++++++- configure.ac | 11 ++- meson.build | 33 ++++++++ src/include/pg_config.h.in | 3 + src/include/port/pg_crc32c.h | 22 ++++-- src/port/meson.build | 1 + src/port/pg_crc32c_armv8.c | 124 ++++++++++++++++++++++++++++++ src/port/pg_crc32c_armv8_choose.c | 36 ++++++++- 9 files changed, 315 insertions(+), 10 deletions(-) diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 629572ee350..0027ef3710c 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -759,6 +759,41 @@ fi undefine([Ac_cachevar])dnl ])# PGAC_ARMV8_CRC32C_INTRINSICS +# PGAC_ARM_PLMULL_INTRINSICS +# --------------------------- +# Check if the compiler supports Arm CRYPTO carryless multiplication +# instructions used for vectorized CRC. +# +# If the intrinsics are supported, sets pgac_arm_pmull_intrinsics. +############ WIP: is it really safe to overwrite CFLAGS_CRC? 
+AC_DEFUN([PGAC_ARM_PLMULL_INTRINSICS], +[define([Ac_cachevar], [AS_TR_SH([pgac_cv_arm_pmull_intrinsics_$1])])dnl +AC_CACHE_CHECK([for pmull and pmull2 with CFLAGS=$1], [Ac_cachevar], +[pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS $1" +AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <arm_acle.h> +#include <arm_neon.h> +uint64x2_t a; +uint64x2_t b; +uint64x2_t c; +uint64x2_t r; +uint64x2_t r2;], + + [__asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r), "+w"(c):"w"(a), "w"(b)); + __asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r2), "+w"(c):"w"(a), "w"(b)); + + r = veorq_u64(r, r2); + /* return computed value, to prevent the above being optimized away */ + return __crc32cd(0, vgetq_lane_u64(r, 0));])], + [Ac_cachevar=yes], + [Ac_cachevar=no])]) +if test x"$Ac_cachevar" = x"yes"; then + CFLAGS_CRC="$1" + pgac_arm_pmull_intrinsics=yes +fi +undefine([Ac_cachevar])dnl +])# PGAC_ARM_PLMULL_INTRINSICS + # PGAC_LOONGARCH_CRC32C_INTRINSICS # --------------------------- # Check if the compiler supports the LoongArch CRCC instructions, using @@ -784,7 +819,8 @@ AC_CACHE_CHECK( /* return computed value, to prevent the above being optimized away */ return crc == 0;])], [Ac_cachevar=yes], - [Ac_cachevar=no])]) + [Ac_cachevar=no]) +CFLAGS="$pgac_save_CFLAGS"]) if test x"$Ac_cachevar" = x"yes"; then pgac_loongarch_crc32c_intrinsics=yes fi diff --git a/configure b/configure index 0d123d7dc8a..fdecd5b524a 100755 --- a/configure +++ b/configure @@ -18395,6 +18395,53 @@ if test x"$pgac_cv_avx512_pclmul_intrinsics" = x"yes"; then pgac_avx512_pclmul_intrinsics=yes fi +else + if test x"$host_cpu" = x"aarch64"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for pmull and pmull2 with CFLAGS=-march=armv8-a+crc+simd+crypto" >&5 +$as_echo_n "checking for pmull and pmull2 with CFLAGS=-march=armv8-a+crc+simd+crypto... 
" >&6; } +if ${pgac_cv_arm_pmull_intrinsics__march_armv8_apcrcpsimdpcrypto+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS -march=armv8-a+crc+simd+crypto" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include <arm_acle.h> +#include <arm_neon.h> +uint64x2_t a; +uint64x2_t b; +uint64x2_t c; +uint64x2_t r; +uint64x2_t r2; +int +main () +{ +__asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r), "+w"(c):"w"(a), "w"(b)); + __asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r2), "+w"(c):"w"(a), "w"(b)); + + r = veorq_u64(r, r2); + /* return computed value, to prevent the above being optimized away */ + return __crc32cd(0, vgetq_lane_u64(r, 0)); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_arm_pmull_intrinsics__march_armv8_apcrcpsimdpcrypto=yes +else + pgac_cv_arm_pmull_intrinsics__march_armv8_apcrcpsimdpcrypto=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_arm_pmull_intrinsics__march_armv8_apcrcpsimdpcrypto" >&5 +$as_echo "$pgac_cv_arm_pmull_intrinsics__march_armv8_apcrcpsimdpcrypto" >&6; } +if test x"$pgac_cv_arm_pmull_intrinsics__march_armv8_apcrcpsimdpcrypto" = x"yes"; then + CFLAGS_CRC="-march=armv8-a+crc+simd+crypto" + pgac_arm_pmull_intrinsics=yes +fi + + fi fi { $as_echo "$as_me:${as_lineno-$LINENO}: checking for vectorized CRC-32C" >&5 @@ -18406,8 +18453,16 @@ $as_echo "#define USE_AVX512_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h { $as_echo "$as_me:${as_lineno-$LINENO}: result: AVX-512 with runtime check" >&5 $as_echo "AVX-512 with runtime check" >&6; } else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: none" >&5 + if test x"$pgac_arm_pmull_intrinsics" = x"yes"; then + +$as_echo "#define USE_PMULL_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h + + { $as_echo "$as_me:${as_lineno-$LINENO}: result: PMULL CRC with 
runtime check" >&5 +$as_echo "PMULL CRC with runtime check" >&6; } + else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: none" >&5 $as_echo "none" >&6; } + fi fi # Select semaphore implementation type. diff --git a/configure.ac b/configure.ac index 2342780359a..0ea56dc53dd 100644 --- a/configure.ac +++ b/configure.ac @@ -2291,6 +2291,10 @@ AC_SUBST(PG_CRC32C_OBJS) # if test x"$host_cpu" = x"x86_64"; then PGAC_AVX512_PCLMUL_INTRINSICS() +else + if test x"$host_cpu" = x"aarch64"; then + PGAC_ARM_PLMULL_INTRINSICS([-march=armv8-a+crc+simd+crypto]) + fi fi AC_MSG_CHECKING([for vectorized CRC-32C]) @@ -2298,7 +2302,12 @@ if test x"$pgac_avx512_pclmul_intrinsics" = x"yes"; then AC_DEFINE(USE_AVX512_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use AVX-512 CRC algorithms with a runtime check.]) AC_MSG_RESULT(AVX-512 with runtime check) else - AC_MSG_RESULT(none) + if test x"$pgac_arm_pmull_intrinsics" = x"yes"; then + AC_DEFINE(USE_PMULL_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Arm PMULL CRC algorithms with a runtime check.]) + AC_MSG_RESULT(PMULL CRC with runtime check) + else + AC_MSG_RESULT(none) + fi fi # Select semaphore implementation type. diff --git a/meson.build b/meson.build index 0ee772cd475..a0fac181595 100644 --- a/meson.build +++ b/meson.build @@ -2681,6 +2681,39 @@ int main(void) have_optimized_crc = true endif + # Check if the compiler supports ARMv8 CRYPTO carryless multiplication + # and exclusive-or inline assembly instructions used for computing CRC. + # Check __crc32cd here as well, since the full implementation relies on + # 8-byte CRC instructions. 
+ prog = ''' +#include <arm_acle.h> +#include <arm_neon.h> +uint64x2_t a; +uint64x2_t b; +uint64x2_t c; + +int main(void) +{ + uint64x2_t r; + uint64x2_t r2; + +__asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r), "+w"(c):"w"(a), "w"(b)); +__asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r2), "+w"(c):"w"(a), "w"(b)); + + r = veorq_u64(r, r2); + /* return computed value, to prevent the above being optimized away */ + return __crc32cd(0, vgetq_lane_u64(r, 0)); +} +''' + + if cc.links(prog, + name: 'PMULL CRC32C', + args: test_c_args + ['-march=armv8-a+crc+simd+crypto']) + # Use ARM CRYPTO Extension, with runtime check + cflags_crc += '-march=armv8-a+crc+simd+crypto' + cdata.set('USE_PMULL_CRC32C_WITH_RUNTIME_CHECK', 1) + endif + elif host_cpu == 'loongarch64' prog = ''' diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index d8d61918aff..dbc97c565a3 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -729,6 +729,9 @@ /* Define to 1 to build with PAM support. (--with-pam) */ #undef USE_PAM +/* Define to 1 to use Arm PMULL CRC algorithms with a runtime check. */ +#undef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK + /* Define to 1 to use software CRC-32C implementation (slicing-by-8). */ #undef USE_SLICING_BY_8_CRC32C diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h index 1f8e837d119..1230709197a 100644 --- a/src/include/port/pg_crc32c.h +++ b/src/include/port/pg_crc32c.h @@ -111,13 +111,22 @@ extern pg_crc32c pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t l #endif #elif defined(USE_ARMV8_CRC32C) -/* Use ARMv8 CRC Extension instructions. */ - +/* + * Use either ARMv8 CRC Extension or CRYPTO Extension (PMULL) instructions. + * We don't need a runtime check for CRC, so for small constant inputs, + * we can avoid an indirect function call. 
+ */ #define COMP_CRC32C(crc, data, len) \ - ((crc) = pg_comp_crc32c_armv8((crc), (data), (len))) + ((crc) = __builtin_constant_p(len) && len < 32 ? \ + pg_comp_crc32c_armv8((crc), (data), (len)) : \ + pg_comp_crc32c((crc), (data), (len))) #define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF) +extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len); extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len); +#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK +extern pg_crc32c pg_comp_crc32c_pmull(pg_crc32c crc, const void *data, size_t len); +#endif #elif defined(USE_LOONGARCH_CRC32C) /* Use LoongArch CRCC instructions. */ @@ -131,8 +140,8 @@ extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_ #elif defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK) /* - * Use ARMv8 instructions, but perform a runtime check first - * to check that they are available. + * Use either ARMv8 CRC Extension or CRYPTO Extension (PMULL) instructions, + * but perform a runtime check first to check that they are available. 
*/ #define COMP_CRC32C(crc, data, len) \ ((crc) = pg_comp_crc32c((crc), (data), (len))) @@ -141,6 +150,9 @@ extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_ extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len); extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len); extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len); +#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK +extern pg_crc32c pg_comp_crc32c_pmull(pg_crc32c crc, const void *data, size_t len); +#endif #else /* diff --git a/src/port/meson.build b/src/port/meson.build index d55cb0424f3..922b3f64676 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -93,6 +93,7 @@ replace_funcs_pos = [ # arm / aarch64 ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C'], ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 'crc'], + ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C'], ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'], ['pg_crc32c_sb8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'], diff --git a/src/port/pg_crc32c_armv8.c b/src/port/pg_crc32c_armv8.c index 9ca0f728d39..64b82f6de58 100644 --- a/src/port/pg_crc32c_armv8.c +++ b/src/port/pg_crc32c_armv8.c @@ -20,6 +20,10 @@ #include <arm_acle.h> #endif +#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK +#include <arm_neon.h> +#endif + #include "port/pg_crc32c.h" pg_crc32c @@ -77,3 +81,123 @@ pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len) return crc; } + +#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK + +/* + * Note: There is no copyright notice in the following generated code. 
+ * + * We have modified the output to + * - match our function declaration + * - match whitespace to our project style + * - be more friendly for pgindent + */ + +/* Generated by https://github.com/corsix/fast-crc32/ using: */ +/* ./generate -i neon -p crc32c -a v4e */ +/* MIT licensed */ + +static inline +uint64x2_t +clmul_lo_e(uint64x2_t a, uint64x2_t b, uint64x2_t c) +{ + uint64x2_t r; + +__asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r), "+w"(c):"w"(a), "w"(b)); + return r; +} + +static inline +uint64x2_t +clmul_hi_e(uint64x2_t a, uint64x2_t b, uint64x2_t c) +{ + uint64x2_t r; + +__asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r), "+w"(c):"w"(a), "w"(b)); + return r; +} + +pg_crc32c +pg_comp_crc32c_pmull(pg_crc32c crc, const void *data, size_t len) +{ + /* adjust names to match generated code */ + pg_crc32c crc0 = crc; + const char *buf = data; + + /* align to 16 bytes */ + for (; len && ((uintptr_t) buf & 7); --len) + { + crc0 = __crc32cb(crc0, *buf++); + } + if (((uintptr_t) buf & 8) && len >= 8) + { + crc0 = __crc32cd(crc0, *(const uint64_t *) buf); + buf += 8; + len -= 8; + } + + if (len >= 64) + { + const char *end = buf + len; + const char *limit = buf + len - 64; + + /* First vector chunk. */ + uint64x2_t x0 = vld1q_u64((const uint64_t *) buf), + y0; + uint64x2_t x1 = vld1q_u64((const uint64_t *) (buf + 16)), + y1; + uint64x2_t x2 = vld1q_u64((const uint64_t *) (buf + 32)), + y2; + uint64x2_t x3 = vld1q_u64((const uint64_t *) (buf + 48)), + y3; + uint64x2_t k; + + { + static const uint64_t pg_attribute_aligned(16) k_[] = {0x740eef02, 0x9e4addf8}; + + k = vld1q_u64(k_); + } + + /* + * pgindent complained of unmatched parens upstream: + * + * x0 = veorq_u64((uint64x2_t) {crc0, 0}, x0); + */ + x0 = veorq_u64((uint64x2_t) vsetq_lane_u64(crc0, vdupq_n_u64(0), 0), x0); + buf += 64; + + /* Main loop. 
*/ + while (buf <= limit) + { + y0 = clmul_lo_e(x0, k, vld1q_u64((const uint64_t *) buf)), x0 = clmul_hi_e(x0, k, y0); + y1 = clmul_lo_e(x1, k, vld1q_u64((const uint64_t *) (buf + 16))), x1 = clmul_hi_e(x1, k, y1); + y2 = clmul_lo_e(x2, k, vld1q_u64((const uint64_t *) (buf + 32))), x2 = clmul_hi_e(x2, k, y2); + y3 = clmul_lo_e(x3, k, vld1q_u64((const uint64_t *) (buf + 48))), x3 = clmul_hi_e(x3, k, y3); + buf += 64; + } + + /* Reduce x0 ... x3 to just x0. */ + { + static const uint64_t pg_attribute_aligned(16) k_[] = {0xf20c0dfe, 0x493c7d27}; + + k = vld1q_u64(k_); + } + y0 = clmul_lo_e(x0, k, x1), x0 = clmul_hi_e(x0, k, y0); + y2 = clmul_lo_e(x2, k, x3), x2 = clmul_hi_e(x2, k, y2); + { + static const uint64_t pg_attribute_aligned(16) k_[] = {0x3da6d0cb, 0xba4fc28e}; + + k = vld1q_u64(k_); + } + y0 = clmul_lo_e(x0, k, x2), x0 = clmul_hi_e(x0, k, y0); + + /* Reduce 128 bits to 32 bits, and multiply by x^32. */ + crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0)); + crc0 = __crc32cd(crc0, vgetq_lane_u64(x0, 1)); + len = end - buf; + } + + return pg_comp_crc32c_armv8(crc0, buf, len); +} + +#endif diff --git a/src/port/pg_crc32c_armv8_choose.c b/src/port/pg_crc32c_armv8_choose.c index a1f0e540c6b..164af65454b 100644 --- a/src/port/pg_crc32c_armv8_choose.c +++ b/src/port/pg_crc32c_armv8_choose.c @@ -108,6 +108,27 @@ pg_crc32c_armv8_available(void) #endif } +static inline bool +pg_pmull_available(void) +{ +#if defined(__aarch64__) && defined(HWCAP_PMULL) + +#ifdef HAVE_ELF_AUX_INFO + unsigned long value; + + return elf_aux_info(AT_HWCAP, &value, sizeof(value)) == 0 && + (value & HWCAP_PMULL) != 0; +#elif defined(HAVE_GETAUXVAL) + return (getauxval(AT_HWCAP) & HWCAP_PMULL) != 0; +#else + return false; +#endif + +#else + return false; +#endif +} + /* * This gets called on the first call. It replaces the function pointer * so that subsequent calls are routed directly to the chosen implementation. 
@@ -115,10 +136,21 @@ pg_crc32c_armv8_available(void) static pg_crc32c pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len) { + /* + * Set fallback. We must guard since slicing-by-8 is not visible + * everywhere. + */ +#ifdef USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK + pg_comp_crc32c = pg_comp_crc32c_sb8; +#endif + if (pg_crc32c_armv8_available()) pg_comp_crc32c = pg_comp_crc32c_armv8; - else - pg_comp_crc32c = pg_comp_crc32c_sb8; + +#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK + if (pg_pmull_available()) + pg_comp_crc32c = pg_comp_crc32c_pmull; +#endif return pg_comp_crc32c(crc, data, len); } -- 2.53.0
