We have accrued duplicate bits of hardware detection logic in different places, and the AVX2 page checksum patch is about to add more. It seems like a good time to try again to centralize things, before that happens. The attached patch set only touches x86, but that's enough to demonstrate the idea, and there's no point in trying to do everything at once. Arm should get the same treatment at some point.
0001 starts by renaming pg_crc32c_sse42_choose.c to something more general and does just enough to fix the build. Without a separate rename step, there's too much change for git to call it a rename. Humans can still see some carryover, so it seems right to keep git history continuous. 0002 adds an array of bool indexed by an enum of feature names, and adjusts the CRC and popcount code to use it. 0003 refactors detection of support for ZMM registers in preparation for doing the same for YMM (needed by AVX2). 0004 rebases the latest page checksum patch on top of the above for demonstration (review of that is happening in its own thread [1]). Not counting autoconf/meson and the pointer juggling, the additional feature detection is now only 2 lines of code, which is nice. For PG20, we can build on this to simplify the rat's nest of #ifdefs that a couple of src/include/port headers have. We should also be able to arrange things so that packagers who pass relevant flags to common compilers will automatically get some branches/indirection eliminated via the compiler's standard dead code elimination, in a simple way, rather than our having to kluge it together in multiple places. That will make irrelevant the question that occasionally comes up about moving hardware requirements. Anyway, 0001-0003 are doable for PG19. [1] https://postgr.es/m/CA%2BvA85_5GTu%2BHHniSbvvP%2B8k3%3DxZO%3DWE84NPwiKyxztqvpfZ3Q%40mail.gmail.com -- John Naylor Amazon Web Services
From 145bc4f00c8270b046c651e42686a2a18ad16ddb Mon Sep 17 00:00:00 2001 From: John Naylor <[email protected]> Date: Wed, 11 Feb 2026 14:34:18 +0700 Subject: [PATCH v1 1/4] Rename CRC "choose" files for future general purpose WIP: x86 only --- configure | 4 ++-- configure.ac | 4 ++-- src/port/meson.build | 4 ++-- src/port/{pg_crc32c_sse42_choose.c => pg_x86_feature.c} | 0 4 files changed, 6 insertions(+), 6 deletions(-) rename src/port/{pg_crc32c_sse42_choose.c => pg_x86_feature.c} (100%) diff --git a/configure b/configure index a10a2c85c6a..373194daa05 100755 --- a/configure +++ b/configure @@ -18196,7 +18196,7 @@ if test x"$USE_SSE42_CRC32C" = x"1"; then $as_echo "#define USE_SSE42_CRC32C 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sse42_choose.o" + PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_x86_feature.o" { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2" >&5 $as_echo "SSE 4.2" >&6; } else @@ -18204,7 +18204,7 @@ else $as_echo "#define USE_SSE42_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o pg_crc32c_sse42_choose.o" + PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o pg_x86_feature.o" { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2 with runtime check" >&5 $as_echo "SSE 4.2 with runtime check" >&6; } else diff --git a/configure.ac b/configure.ac index 814e64a967e..62e47394544 100644 --- a/configure.ac +++ b/configure.ac @@ -2245,12 +2245,12 @@ fi AC_MSG_CHECKING([which CRC-32C implementation to use]) if test x"$USE_SSE42_CRC32C" = x"1"; then AC_DEFINE(USE_SSE42_CRC32C, 1, [Define to 1 use Intel SSE 4.2 CRC instructions.]) - PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sse42_choose.o" + PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_x86_feature.o" AC_MSG_RESULT(SSE 4.2) else if test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then AC_DEFINE(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Intel SSE 4.2 CRC instructions with a runtime check.]) - PG_CRC32C_OBJS="pg_crc32c_sse42.o 
pg_crc32c_sb8.o pg_crc32c_sse42_choose.o" + PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o pg_x86_feature.o" AC_MSG_RESULT(SSE 4.2 with runtime check) else if test x"$USE_ARMV8_CRC32C" = x"1"; then diff --git a/src/port/meson.build b/src/port/meson.build index d7d4e705b89..d96b4eed4c6 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -86,8 +86,8 @@ replace_funcs_pos = [ # x86/x64 ['pg_crc32c_sse42', 'USE_SSE42_CRC32C'], ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], - ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C'], - ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], + ['pg_x86_feature', 'USE_SSE42_CRC32C'], + ['pg_x86_feature', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], ['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], # arm / aarch64 diff --git a/src/port/pg_crc32c_sse42_choose.c b/src/port/pg_x86_feature.c similarity index 100% rename from src/port/pg_crc32c_sse42_choose.c rename to src/port/pg_x86_feature.c -- 2.53.0
From e7f01e25e04bc4b6bbd2eb8ce79760647b298cc2 Mon Sep 17 00:00:00 2001 From: John Naylor <[email protected]> Date: Fri, 13 Feb 2026 18:11:39 +0700 Subject: [PATCH v1 3/4] Refactor the detection of ZMM registers - Call _xgetbv within x86_set_runtime_features rather than in a separate function - Use symbols for XCR mask bits rather than a magic constant A future commit will build on this to detect YMM registers without code duplication. --- src/port/pg_x86_feature.c | 41 ++++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/src/port/pg_x86_feature.c b/src/port/pg_x86_feature.c index c92cfbe6d5d..afe552a3bcd 100644 --- a/src/port/pg_x86_feature.c +++ b/src/port/pg_x86_feature.c @@ -31,31 +31,29 @@ #include "port/pg_x86_feature.h" +/* XSAVE state component bits that we need */ +#define XMM (1<<1) +#define YMM (1<<2) +#define OPMASK (1<<5) +#define ZMM0_15 (1<<6) +#define ZMM16_31 (1<<7) + + bool X86Feature[X86FeatureSize] = {0}; -/* - * Does XGETBV say the ZMM registers are enabled? - * - * NB: Caller is responsible for verifying that osxsave is available - * before calling this. - */ -#ifdef HAVE_XSAVE_INTRINSICS -pg_attribute_target("xsave") -#endif static bool -zmm_regs_available(void) +mask_available(uint32 value, uint32 mask) { -#ifdef HAVE_XSAVE_INTRINSICS - return (_xgetbv(0) & 0xe6) == 0xe6; -#else - return false; -#endif + return (value & mask) == mask; } /* * Parse the CPU ID info for runtime checks. 
*/ +#ifdef HAVE_XSAVE_INTRINSICS +pg_attribute_target("xsave") +#endif void x86_set_runtime_features(void) { @@ -75,17 +73,24 @@ x86_set_runtime_features(void) /* All these features depend on OSXSAVE */ if (exx[2] & (1 << 27)) { - /* second cpuid call on leaf 7 to check extended AVX-512 support */ + uint32 xcr0_val = 0; + /* second cpuid call on leaf 7 to check extended AVX-512 support */ memset(exx, 0, 4 * sizeof(exx[0])); - #if defined(HAVE__GET_CPUID_COUNT) __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); #elif defined(HAVE__CPUIDEX) __cpuidex(exx, 7, 0); #endif - if (zmm_regs_available()) +#ifdef HAVE_XSAVE_INTRINSICS + /* get value of Extended Control Register */ + xcr0_val = _xgetbv(0); +#endif + + /* Are ZMM registeres enabled? */ + if (mask_available(xcr0_val, XMM | YMM | + OPMASK | ZMM0_15 | ZMM16_31)) { X86Feature[PG_AVX512_BW] = exx[1] >> 30 & 1; X86Feature[PG_AVX512_VL] = exx[1] >> 31 & 1; -- 2.53.0
From c05a5b30df212248009a29df739b9d1d57ea9261 Mon Sep 17 00:00:00 2001 From: John Naylor <[email protected]> Date: Thu, 12 Feb 2026 12:45:23 +0700 Subject: [PATCH v1 2/4] Centralize detection of CPU features WIP: x86 only --- configure | 4 +- configure.ac | 4 +- src/include/port/pg_x86_feature.h | 44 +++++++++++++++ src/port/Makefile | 1 + src/port/meson.build | 3 +- src/port/pg_crc32c_sse42.c | 29 ++++++++++ src/port/pg_popcount_x86.c | 91 ++----------------------------- src/port/pg_x86_feature.c | 75 ++++++++++++------------- 8 files changed, 119 insertions(+), 132 deletions(-) create mode 100644 src/include/port/pg_x86_feature.h diff --git a/configure b/configure index 373194daa05..185703289b4 100755 --- a/configure +++ b/configure @@ -18196,7 +18196,7 @@ if test x"$USE_SSE42_CRC32C" = x"1"; then $as_echo "#define USE_SSE42_CRC32C 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_x86_feature.o" + PG_CRC32C_OBJS="pg_crc32c_sse42.o" { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2" >&5 $as_echo "SSE 4.2" >&6; } else @@ -18204,7 +18204,7 @@ else $as_echo "#define USE_SSE42_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o pg_x86_feature.o" + PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o" { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2 with runtime check" >&5 $as_echo "SSE 4.2 with runtime check" >&6; } else diff --git a/configure.ac b/configure.ac index 62e47394544..0955b7e4371 100644 --- a/configure.ac +++ b/configure.ac @@ -2245,12 +2245,12 @@ fi AC_MSG_CHECKING([which CRC-32C implementation to use]) if test x"$USE_SSE42_CRC32C" = x"1"; then AC_DEFINE(USE_SSE42_CRC32C, 1, [Define to 1 use Intel SSE 4.2 CRC instructions.]) - PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_x86_feature.o" + PG_CRC32C_OBJS="pg_crc32c_sse42.o" AC_MSG_RESULT(SSE 4.2) else if test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then AC_DEFINE(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Intel SSE 4.2 CRC 
instructions with a runtime check.]) - PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o pg_x86_feature.o" + PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o" AC_MSG_RESULT(SSE 4.2 with runtime check) else if test x"$USE_ARMV8_CRC32C" = x"1"; then diff --git a/src/include/port/pg_x86_feature.h b/src/include/port/pg_x86_feature.h new file mode 100644 index 00000000000..de56882c9e1 --- /dev/null +++ b/src/include/port/pg_x86_feature.h @@ -0,0 +1,44 @@ +/*------------------------------------------------------------------------- + * + * pg_x86_feature.h + * Runtime CPU feature detection + * + * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/port/pg_x86_feature.h + * + *------------------------------------------------------------------------- + */ +#ifndef PG_X86_FEATURE_H +#define PG_X86_FEATURE_H + + +typedef enum X86FeatureId +{ + init, + + PG_SSE4_2, + PG_POPCNT, + + PG_AVX512_BW, + PG_AVX512_VL, + PG_VPCLMULQDQ, + PG_AVX512_VPOPCNTDQ, +} X86FeatureId; +#define X86FeatureSize (PG_AVX512_VPOPCNTDQ + 1) + +extern PGDLLEXPORT bool X86Feature[]; + +extern void x86_set_runtime_features(void); + +static inline bool +x86_feature_available(X86FeatureId feature) +{ + if (X86Feature[init] == false) + x86_set_runtime_features(); + + return X86Feature[feature]; +} + +#endif /* PG_X86_FEATURE_H */ diff --git a/src/port/Makefile b/src/port/Makefile index 6e3b7d154ed..7b5bc58a898 100644 --- a/src/port/Makefile +++ b/src/port/Makefile @@ -49,6 +49,7 @@ OBJS = \ pg_popcount_aarch64.o \ pg_popcount_x86.o \ pg_strong_random.o \ + pg_x86_feature.o \ pgcheckdir.o \ pgmkdirp.o \ pgsleep.o \ diff --git a/src/port/meson.build b/src/port/meson.build index d96b4eed4c6..9a25a634a5b 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -12,6 +12,7 @@ pgport_sources = [ 'pg_popcount_aarch64.c', 'pg_popcount_x86.c', 'pg_strong_random.c', + 'pg_x86_feature.c', 
'pgcheckdir.c', 'pgmkdirp.c', 'pgsleep.c', @@ -86,8 +87,6 @@ replace_funcs_pos = [ # x86/x64 ['pg_crc32c_sse42', 'USE_SSE42_CRC32C'], ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], - ['pg_x86_feature', 'USE_SSE42_CRC32C'], - ['pg_x86_feature', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], ['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], # arm / aarch64 diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c index c1279d31fbd..f64ddde28cd 100644 --- a/src/port/pg_crc32c_sse42.c +++ b/src/port/pg_crc32c_sse42.c @@ -20,6 +20,10 @@ #endif #include "port/pg_crc32c.h" +#include "port/pg_x86_feature.h" + +static pg_crc32c pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len); + pg_attribute_no_sanitize_alignment() pg_attribute_target("sse4.2") @@ -159,3 +163,28 @@ pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t len) } #endif + +static pg_crc32c +pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len) +{ + /* + * Set fallback. We must guard since slicing-by-8 is not visible + * everywhere. 
+ */ +#ifdef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK + pg_comp_crc32c = pg_comp_crc32c_sb8; +#endif + + if (x86_feature_available(PG_SSE4_2)) + pg_comp_crc32c = pg_comp_crc32c_sse42; + +#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK + if (x86_feature_available(PG_AVX512_VL) && + x86_feature_available(PG_VPCLMULQDQ)) + pg_comp_crc32c = pg_comp_crc32c_avx512; +#endif + + return pg_comp_crc32c(crc, data, len); +}; + +pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_crc32c_choose; diff --git a/src/port/pg_popcount_x86.c b/src/port/pg_popcount_x86.c index 6bce089432f..45e8930adc7 100644 --- a/src/port/pg_popcount_x86.c +++ b/src/port/pg_popcount_x86.c @@ -14,19 +14,12 @@ #ifdef HAVE_X86_64_POPCNTQ -#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT) -#include <cpuid.h> -#endif - #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK #include <immintrin.h> #endif -#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX) -#include <intrin.h> -#endif - #include "port/pg_bitutils.h" +#include "port/pg_x86_feature.h" /* * The SSE4.2 versions are built regardless of whether we are building the @@ -58,84 +51,9 @@ static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask); uint64 (*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose; uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose; -/* - * Return true if CPUID indicates that the POPCNT instruction is available. - */ -static bool -pg_popcount_sse42_available(void) -{ - unsigned int exx[4] = {0, 0, 0, 0}; - -#if defined(HAVE__GET_CPUID) - __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]); -#elif defined(HAVE__CPUID) - __cpuid(exx, 1); -#else -#error cpuid instruction not available -#endif - - return (exx[2] & (1 << 23)) != 0; /* POPCNT */ -} #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK -/* - * Does CPUID say there's support for XSAVE instructions? 
- */ -static inline bool -xsave_available(void) -{ - unsigned int exx[4] = {0, 0, 0, 0}; - -#if defined(HAVE__GET_CPUID) - __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]); -#elif defined(HAVE__CPUID) - __cpuid(exx, 1); -#else -#error cpuid instruction not available -#endif - return (exx[2] & (1 << 27)) != 0; /* osxsave */ -} - -/* - * Does XGETBV say the ZMM registers are enabled? - * - * NB: Caller is responsible for verifying that xsave_available() returns true - * before calling this. - */ -#ifdef HAVE_XSAVE_INTRINSICS -pg_attribute_target("xsave") -#endif -static inline bool -zmm_regs_available(void) -{ -#ifdef HAVE_XSAVE_INTRINSICS - return (_xgetbv(0) & 0xe6) == 0xe6; -#else - return false; -#endif -} - -/* - * Does CPUID say there's support for AVX-512 popcount and byte-and-word - * instructions? - */ -static inline bool -avx512_popcnt_available(void) -{ - unsigned int exx[4] = {0, 0, 0, 0}; - -#if defined(HAVE__GET_CPUID_COUNT) - __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); -#elif defined(HAVE__CPUIDEX) - __cpuidex(exx, 7, 0); -#else -#error cpuid instruction not available -#endif - return (exx[2] & (1 << 14)) != 0 && /* avx512-vpopcntdq */ - (exx[1] & (1 << 30)) != 0; /* avx512-bw */ -} - /* * Returns true if the CPU supports the instructions required for the AVX-512 * pg_popcount() implementation. 
@@ -143,9 +61,8 @@ avx512_popcnt_available(void) static bool pg_popcount_avx512_available(void) { - return xsave_available() && - zmm_regs_available() && - avx512_popcnt_available(); + return x86_feature_available(PG_AVX512_BW) && + x86_feature_available(PG_AVX512_VPOPCNTDQ); } #endif /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */ @@ -159,7 +76,7 @@ pg_popcount_avx512_available(void) static inline void choose_popcount_functions(void) { - if (pg_popcount_sse42_available()) + if (x86_feature_available(PG_POPCNT)) { pg_popcount_optimized = pg_popcount_sse42; pg_popcount_masked_optimized = pg_popcount_masked_sse42; diff --git a/src/port/pg_x86_feature.c b/src/port/pg_x86_feature.c index f586476964f..c92cfbe6d5d 100644 --- a/src/port/pg_x86_feature.c +++ b/src/port/pg_x86_feature.c @@ -1,25 +1,21 @@ /*------------------------------------------------------------------------- * - * pg_crc32c_sse42_choose.c - * Choose between Intel SSE 4.2 and software CRC-32C implementation. - * - * On first call, checks if the CPU we're running on supports Intel SSE - * 4.2. If it does, use the special SSE instructions for CRC-32C - * computation. Otherwise, fall back to the pure software implementation - * (slicing-by-8). + * pg_x86_feature.c + * Runtime CPU feature detection * * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * * * IDENTIFICATION - * src/port/pg_crc32c_sse42_choose.c + * src/port/pg_x86_feature.c * *------------------------------------------------------------------------- */ - #include "c.h" +#if defined(USE_SSE2) || defined(__i386__) + #if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT) #include <cpuid.h> #endif @@ -32,7 +28,11 @@ #include <immintrin.h> #endif -#include "port/pg_crc32c.h" +#include "port/pg_x86_feature.h" + + +bool X86Feature[X86FeatureSize] = {0}; + /* * Does XGETBV say the ZMM registers are enabled? 
@@ -54,22 +54,13 @@ zmm_regs_available(void) } /* - * This gets called on the first call. It replaces the function pointer - * so that subsequent calls are routed directly to the chosen implementation. + * Parse the CPU ID info for runtime checks. */ -static pg_crc32c -pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len) +void +x86_set_runtime_features(void) { unsigned int exx[4] = {0, 0, 0, 0}; - /* - * Set fallback. We must guard since slicing-by-8 is not visible - * everywhere. - */ -#ifdef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK - pg_comp_crc32c = pg_comp_crc32c_sb8; -#endif - #if defined(HAVE__GET_CPUID) __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]); #elif defined(HAVE__CPUID) @@ -78,32 +69,38 @@ pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len) #error cpuid instruction not available #endif - if ((exx[2] & (1 << 20)) != 0) /* SSE 4.2 */ - { - pg_comp_crc32c = pg_comp_crc32c_sse42; + X86Feature[PG_SSE4_2] = exx[2] >> 20 & 1; + X86Feature[PG_POPCNT] = exx[2] >> 23 & 1; - if (exx[2] & (1 << 27) && /* OSXSAVE */ - zmm_regs_available()) - { - /* second cpuid call on leaf 7 to check extended AVX-512 support */ + /* All these features depend on OSXSAVE */ + if (exx[2] & (1 << 27)) + { + /* second cpuid call on leaf 7 to check extended AVX-512 support */ - memset(exx, 0, 4 * sizeof(exx[0])); + memset(exx, 0, 4 * sizeof(exx[0])); #if defined(HAVE__GET_CPUID_COUNT) - __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); + __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); #elif defined(HAVE__CPUIDEX) - __cpuidex(exx, 7, 0); + __cpuidex(exx, 7, 0); #endif -#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK - if (exx[2] & (1 << 10) && /* VPCLMULQDQ */ - exx[1] & (1 << 31)) /* AVX512-VL */ - pg_comp_crc32c = pg_comp_crc32c_avx512; -#endif + if (zmm_regs_available()) + { + X86Feature[PG_AVX512_BW] = exx[1] >> 30 & 1; + X86Feature[PG_AVX512_VL] = exx[1] >> 31 & 1; + + X86Feature[PG_VPCLMULQDQ] = exx[2] >> 10 & 1; + 
X86Feature[PG_AVX512_VPOPCNTDQ] = exx[2] >> 14 & 1; } } - return pg_comp_crc32c(crc, data, len); + X86Feature[init] = true; + +#if 1 + /* TODO: DEBUG log all set booleans with enum string */ + fprintf(stderr, "SSE4.2: %s\n", X86Feature[PG_SSE4_2] ? "yes" : "no"); +#endif } -pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_crc32c_choose; +#endif /* defined(USE_SSE2) || defined(__i386__) */ -- 2.53.0
From 493393b91e5f853fa36f4624e444632e316922d1 Mon Sep 17 00:00:00 2001 From: John Naylor <[email protected]> Date: Sat, 14 Feb 2026 19:01:34 +0700 Subject: [PATCH v1 4/4] Enable autovectorizing page checksums with AVX2 where available We already rely on autovectorization for computing page checksums, but on x86 we can get about twice the performance by annotating pg_checksum_block() with function target attributes for AVX2, which uses 256-bit registers. Co-authored-by: Matthew Sterrett <[email protected]> Co-authored-by: Andrew Kim <[email protected]> Reviewed-by: Oleg Tselebrovskiy <[email protected]> Discussion: https://postgr.es/m/CA%2BvA85_5GTu%2BHHniSbvvP%2B8k3%3DxZO%3DWE84NPwiKyxztqvpfZ3Q%40mail.gmail.com Discussion: https://postgr.es/m/20250911054220.3784-1-root%40ip-172-31-36-228.ec2.internal --- config/c-compiler.m4 | 26 ++++++++++++++++ configure | 46 +++++++++++++++++++++++++++ configure.ac | 9 ++++++ meson.build | 30 ++++++++++++++++++ src/backend/storage/page/checksum.c | 44 +++++++++++++++++++++++++- src/include/pg_config.h.in | 3 ++ src/include/port/pg_x86_feature.h | 2 ++ src/include/storage/checksum_impl.h | 48 +++++++++-------------------- src/port/pg_x86_feature.c | 6 +++- src/tools/pginclude/headerscheck | 2 ++ 10 files changed, 181 insertions(+), 35 deletions(-) diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 1509dbfa2ab..1f3e31fc2d3 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -613,6 +613,32 @@ fi undefine([Ac_cachevar])dnl ])# PGAC_SSE42_CRC32_INTRINSICS +# PGAC_AVX2_SUPPORT +# --------------------------- +# Check if the compiler supports AVX2 target attribute. +# This is used for optimized checksum calculations with runtime detection. +# +# If AVX2 target attribute is supported, sets pgac_avx2_support. 
+AC_DEFUN([PGAC_AVX2_SUPPORT], +[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx2_support])])dnl +AC_CACHE_CHECK([for AVX2 target attribute support], [Ac_cachevar], +[AC_COMPILE_IFELSE([AC_LANG_PROGRAM([#include <stdint.h> + #if defined(__has_attribute) && __has_attribute (target) + __attribute__((target("avx2"))) + static int avx2_test(void) + { + return 0; + } + #endif], + [return avx2_test();])], + [Ac_cachevar=yes], + [Ac_cachevar=no])]) +if test x"$Ac_cachevar" = x"yes"; then + pgac_avx2_support=yes +fi +undefine([Ac_cachevar])dnl +])# PGAC_AVX2_SUPPORT + # PGAC_AVX512_PCLMUL_INTRINSICS # --------------------------- # Check if the compiler supports AVX-512 carryless multiplication diff --git a/configure b/configure index 185703289b4..2d2c6308005 100755 --- a/configure +++ b/configure @@ -17718,6 +17718,52 @@ $as_echo "#define HAVE__CPUIDEX 1" >>confdefs.h fi fi +# Check for AVX2 target and intrinsic support +# +if test x"$host_cpu" = x"x86_64"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for AVX2 target attribute support" >&5 +$as_echo_n "checking for AVX2 target attribute support... " >&6; } +if ${pgac_cv_avx2_support+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +#include <stdint.h> + #if defined(__has_attribute) && __has_attribute (target) + __attribute__((target("avx2"))) + static int avx2_test(void) + { + return 0; + } + #endif +int +main () +{ +return avx2_test(); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + pgac_cv_avx2_support=yes +else + pgac_cv_avx2_support=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx2_support" >&5 +$as_echo "$pgac_cv_avx2_support" >&6; } +if test x"$pgac_cv_avx2_support" = x"yes"; then + pgac_avx2_support=yes +fi + + if test x"$pgac_avx2_support" = x"yes"; then + +$as_echo "#define USE_AVX2_WITH_RUNTIME_CHECK 1" >>confdefs.h + + fi +fi + # Check for XSAVE intrinsics # { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _xgetbv" >&5 diff --git a/configure.ac b/configure.ac index 0955b7e4371..0b4c3970b68 100644 --- a/configure.ac +++ b/configure.ac @@ -2122,6 +2122,15 @@ else fi fi +# Check for AVX2 target and intrinsic support +# +if test x"$host_cpu" = x"x86_64"; then + PGAC_AVX2_SUPPORT() + if test x"$pgac_avx2_support" = x"yes"; then + AC_DEFINE(USE_AVX2_WITH_RUNTIME_CHECK, 1, [Define to 1 to use AVX2 instructions with a runtime check.]) + fi +fi + # Check for XSAVE intrinsics # PGAC_XSAVE_INTRINSICS() diff --git a/meson.build b/meson.build index f6d5842d852..feea3658ff3 100644 --- a/meson.build +++ b/meson.build @@ -2377,6 +2377,36 @@ int main(void) endif +############################################################### +# Check for the availability of AVX2 support +############################################################### + +if host_cpu == 'x86_64' + + prog = ''' +#include <immintrin.h> +#include <stdint.h> +#if defined(__has_attribute) && __has_attribute (target) +__attribute__((target("avx2"))) +#endif +static int avx2_test(void) +{ + return 0; +} + +int main(void) +{ + return avx2_test(); +} +''' + + if cc.links(prog, name: 'AVX2 support', args: test_c_args) + 
cdata.set('USE_AVX2_WITH_RUNTIME_CHECK', 1) + endif + +endif + + ############################################################### # Check for the availability of AVX-512 popcount intrinsics. ############################################################### diff --git a/src/backend/storage/page/checksum.c b/src/backend/storage/page/checksum.c index 8716651c8b5..9bf62ab2579 100644 --- a/src/backend/storage/page/checksum.c +++ b/src/backend/storage/page/checksum.c @@ -13,10 +13,52 @@ */ #include "postgres.h" +#include "port/pg_x86_feature.h" #include "storage/checksum.h" /* * The actual code is in storage/checksum_impl.h. This is done so that * external programs can incorporate the checksum code by #include'ing - * that file from the exported Postgres headers. (Compare our CRC code.) + * that file from the exported Postgres headers. (Compare our legacy + * CRC code in pg_crc.h.) + * The PG_CHECKSUM_INTERNAL symbol allows core to use hardware-specific + * coding without affecting external programs. */ +#define PG_CHECKSUM_INTERNAL #include "storage/checksum_impl.h" /* IWYU pragma: keep */ + + +static uint32 +pg_checksum_block_fallback(const PGChecksummablePage *page) +{ +#include "storage/checksum_block_internal.h" +} + +/* + * AVX2-optimized block checksum algorithm. + */ +#ifdef USE_AVX2_WITH_RUNTIME_CHECK +pg_attribute_target("avx2") +static uint32 +pg_checksum_block_avx2(const PGChecksummablePage *page) +{ +#include "storage/checksum_block_internal.h" +} +#endif /* USE_AVX2_WITH_RUNTIME_CHECK */ + +/* + * Choose the best available checksum implementation. 
+ */ +static uint32 +pg_checksum_choose(const PGChecksummablePage *page) +{ + pg_checksum_block = pg_checksum_block_fallback; + +#ifdef USE_AVX2_WITH_RUNTIME_CHECK + if (x86_feature_available(PG_AVX2)) + pg_checksum_block = pg_checksum_block_avx2; +#endif + + return pg_checksum_block(page); +} + +static uint32 (*pg_checksum_block) (const PGChecksummablePage *page) = pg_checksum_choose; diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 339268dc8ef..1e43e9b2bc4 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -665,6 +665,9 @@ /* Define to 1 to use AVX-512 CRC algorithms with a runtime check. */ #undef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK +/* Define to 1 to use AVX2 instructions with a runtime check. */ +#undef USE_AVX2_WITH_RUNTIME_CHECK + /* Define to 1 to use AVX-512 popcount instructions with a runtime check. */ #undef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK diff --git a/src/include/port/pg_x86_feature.h b/src/include/port/pg_x86_feature.h index de56882c9e1..543ecceb047 100644 --- a/src/include/port/pg_x86_feature.h +++ b/src/include/port/pg_x86_feature.h @@ -21,6 +21,8 @@ typedef enum X86FeatureId PG_SSE4_2, PG_POPCNT, + PG_AVX2, + PG_AVX512_BW, PG_AVX512_VL, PG_VPCLMULQDQ, diff --git a/src/include/storage/checksum_impl.h b/src/include/storage/checksum_impl.h index 5c2dcbc63e7..8a308e423c3 100644 --- a/src/include/storage/checksum_impl.h +++ b/src/include/storage/checksum_impl.h @@ -73,11 +73,10 @@ * 2e-16 false positive rate within margin of error. * * Vectorization of the algorithm requires 32bit x 32bit -> 32bit integer - * multiplication instruction. As of 2013 the corresponding instruction is - * available on x86 SSE4.1 extensions (pmulld) and ARM NEON (vmul.i32). - * Vectorization requires a compiler to do the vectorization for us. For recent - * GCC versions the flags -msse4.1 -funroll-loops -ftree-vectorize are enough - * to achieve vectorization. + * multiplication instruction. 
Examples include x86 AVX2 extensions (vpmulld) + * and ARM NEON (vmul.i32). For simplicity we rely on the compiler to do the + * vectorization for us. For GCC and clang the flags -funroll-loops + * -ftree-vectorize are enough to achieve vectorization. * * The optimal amount of parallelism to use depends on CPU specific instruction * latency, SIMD instruction width, throughput and the amount of registers @@ -89,8 +88,9 @@ * * The parallelism number 32 was chosen based on the fact that it is the * largest state that fits into architecturally visible x86 SSE registers while - * leaving some free registers for intermediate values. For future processors - * with 256bit vector registers this will leave some performance on the table. + * leaving some free registers for intermediate values. For processors + * with 256bit vector registers this leaves some performance on the table. + * * When vectorization is not available it might be beneficial to restructure * the computation to calculate a subset of the columns at a time and perform * multiple passes to avoid register spilling. This optimization opportunity @@ -138,6 +138,9 @@ do { \ (checksum) = __tmp * FNV_PRIME ^ (__tmp >> 17); \ } while (0) +/* Provide a static definition for external programs */ +#ifndef PG_CHECKSUM_INTERNAL + /* * Block checksum algorithm. The page must be adequately aligned * (at least on 4-byte boundary). 
@@ -145,34 +148,13 @@ do { \ static uint32 pg_checksum_block(const PGChecksummablePage *page) { - uint32 sums[N_SUMS]; - uint32 result = 0; - uint32 i, - j; - - /* ensure that the size is compatible with the algorithm */ - Assert(sizeof(PGChecksummablePage) == BLCKSZ); - - /* initialize partial checksums to their corresponding offsets */ - memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets)); - - /* main checksum calculation */ - for (i = 0; i < (uint32) (BLCKSZ / (sizeof(uint32) * N_SUMS)); i++) - for (j = 0; j < N_SUMS; j++) - CHECKSUM_COMP(sums[j], page->data[i][j]); - - /* finally add in two rounds of zeroes for additional mixing */ - for (i = 0; i < 2; i++) - for (j = 0; j < N_SUMS; j++) - CHECKSUM_COMP(sums[j], 0); - - /* xor fold partial checksums together */ - for (i = 0; i < N_SUMS; i++) - result ^= sums[i]; - - return result; +#include "storage/checksum_block_internal.h" } +#else +static uint32 (*pg_checksum_block) (const PGChecksummablePage *page); +#endif + /* * Compute the checksum for a Postgres page. * diff --git a/src/port/pg_x86_feature.c b/src/port/pg_x86_feature.c index afe552a3bcd..a9a6a901373 100644 --- a/src/port/pg_x86_feature.c +++ b/src/port/pg_x86_feature.c @@ -75,7 +75,7 @@ x86_set_runtime_features(void) { uint32 xcr0_val = 0; - /* second cpuid call on leaf 7 to check extended AVX-512 support */ + /* second cpuid call on leaf 7 to check extended support */ memset(exx, 0, 4 * sizeof(exx[0])); #if defined(HAVE__GET_CPUID_COUNT) __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); @@ -88,6 +88,10 @@ x86_set_runtime_features(void) xcr0_val = _xgetbv(0); #endif + /* Are YMM registers enabled? */ + if (mask_available(xcr0_val, XMM | YMM)) + X86Feature[PG_AVX2] = exx[1] >> 5 & 1; + /* Are ZMM registeres enabled? 
*/ if (mask_available(xcr0_val, XMM | YMM | OPMASK | ZMM0_15 | ZMM16_31)) diff --git a/src/tools/pginclude/headerscheck b/src/tools/pginclude/headerscheck index 7a6755991bb..569e749b25a 100755 --- a/src/tools/pginclude/headerscheck +++ b/src/tools/pginclude/headerscheck @@ -154,6 +154,8 @@ do test "$f" = src/include/catalog/syscache_ids.h && continue test "$f" = src/include/catalog/syscache_info.h && continue + test "$f" = src/include/storage/checksum_block_internal.h && continue + # We can't make these Bison output files compilable standalone # without using "%code require", which old Bison versions lack. # parser/gram.h will be included by parser/gramparse.h anyway. -- 2.53.0
