v3 removes some debug code that was causing CI to fail.
--
John Naylor
Amazon Web Services
From 57045e62fab60627c653fa6f0d8501cca9d37719 Mon Sep 17 00:00:00 2001
From: John Naylor <[email protected]>
Date: Thu, 12 Feb 2026 12:45:23 +0700
Subject: [PATCH v3 2/4] Centralize detection of CPU features
WIP: x86 only
---
src/include/port/pg_cpu.h | 50 +++++++++++++++++++++
src/port/pg_cpu_x86.c | 62 +++++++++++---------------
src/port/pg_crc32c_sse42.c | 28 ++++++++++++
src/port/pg_popcount_x86.c | 91 ++------------------------------------
4 files changed, 107 insertions(+), 124 deletions(-)
create mode 100644 src/include/port/pg_cpu.h
diff --git a/src/include/port/pg_cpu.h b/src/include/port/pg_cpu.h
new file mode 100644
index 00000000000..ffa9bfb3fd4
--- /dev/null
+++ b/src/include/port/pg_cpu.h
@@ -0,0 +1,50 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_cpu.h
+ * Runtime CPU feature detection
+ *
+ * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/port/pg_cpu.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PG_X86_FEATURE_H
+#define PG_X86_FEATURE_H
+
+#if defined(USE_SSE2) || defined(__i386__)
+
+typedef enum X86FeatureId
+{
+ /* Have we run feature detection? */
+ init,
+
+ /* scalar and 128-bit registers */
+ PG_SSE4_2,
+ PG_POPCNT,
+
+ /* 512-bit registers */
+ PG_AVX512_BW,
+ PG_AVX512_VL,
+ PG_AVX512_VPCLMULQDQ,
+ PG_AVX512_VPOPCNTDQ,
+} X86FeatureId;
+#define X86FeaturesSize (PG_AVX512_VPOPCNTDQ + 1)
+
+extern PGDLLIMPORT bool X86Features[];
+
+extern void set_x86_features(void);
+
+static inline bool
+x86_feature_available(X86FeatureId feature)
+{
+ if (X86Features[init] == false)
+ set_x86_features();
+
+ return X86Features[feature];
+}
+
+#endif /* defined(USE_SSE2) || defined(__i386__) */
+
+#endif /* PG_X86_FEATURE_H */
diff --git a/src/port/pg_cpu_x86.c b/src/port/pg_cpu_x86.c
index 998a70ffa41..8951e560ffe 100644
--- a/src/port/pg_cpu_x86.c
+++ b/src/port/pg_cpu_x86.c
@@ -1,12 +1,7 @@
/*-------------------------------------------------------------------------
*
* pg_cpu_x86.c
- * Choose between Intel SSE 4.2 and software CRC-32C implementation.
- *
- * On first call, checks if the CPU we're running on supports Intel SSE
- * 4.2. If it does, use the special SSE instructions for CRC-32C
- * computation. Otherwise, fall back to the pure software implementation
- * (slicing-by-8).
+ * Runtime CPU feature detection for x86
*
* Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
@@ -34,7 +29,10 @@
#include <immintrin.h>
#endif
-#include "port/pg_crc32c.h"
+#include "port/pg_cpu.h"
+
+
+bool X86Features[X86FeaturesSize] = {0};
/*
* Does XGETBV say the ZMM registers are enabled?
@@ -56,22 +54,13 @@ zmm_regs_available(void)
}
/*
- * This gets called on the first call. It replaces the function pointer
- * so that subsequent calls are routed directly to the chosen implementation.
+ * Parse the CPU ID info for runtime checks.
*/
-static pg_crc32c
-pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
+void
+set_x86_features(void)
{
unsigned int exx[4] = {0, 0, 0, 0};
- /*
- * Set fallback. We must guard since slicing-by-8 is not visible
- * everywhere.
- */
-#ifdef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK
- pg_comp_crc32c = pg_comp_crc32c_sb8;
-#endif
-
#if defined(HAVE__GET_CPUID)
__get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
#elif defined(HAVE__CPUID)
@@ -80,34 +69,33 @@ pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
#error cpuid instruction not available
#endif
- if ((exx[2] & (1 << 20)) != 0) /* SSE 4.2 */
- {
- pg_comp_crc32c = pg_comp_crc32c_sse42;
+ X86Features[PG_SSE4_2] = exx[2] >> 20 & 1;
+ X86Features[PG_POPCNT] = exx[2] >> 23 & 1;
- if (exx[2] & (1 << 27) && /* OSXSAVE */
- zmm_regs_available())
- {
- /* second cpuid call on leaf 7 to check extended AVX-512 support */
+ /* All these features depend on OSXSAVE */
+ if (exx[2] & (1 << 27))
+ {
+ /* second cpuid call on leaf 7 to check extended AVX-512 support */
- memset(exx, 0, 4 * sizeof(exx[0]));
+ memset(exx, 0, 4 * sizeof(exx[0]));
#if defined(HAVE__GET_CPUID_COUNT)
- __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
+ __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
#elif defined(HAVE__CPUIDEX)
- __cpuidex(exx, 7, 0);
+ __cpuidex(exx, 7, 0);
#endif
-#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
- if (exx[2] & (1 << 10) && /* VPCLMULQDQ */
- exx[1] & (1 << 31)) /* AVX512-VL */
- pg_comp_crc32c = pg_comp_crc32c_avx512;
-#endif
+ if (zmm_regs_available())
+ {
+ X86Features[PG_AVX512_BW] = exx[1] >> 30 & 1;
+ X86Features[PG_AVX512_VL] = exx[1] >> 31 & 1;
+
+ X86Features[PG_VPCLMULQDQ] = exx[2] >> 10 & 1;
+ X86Features[PG_AVX512_VPOPCNTDQ] = exx[2] >> 14 & 1;
}
}
- return pg_comp_crc32c(crc, data, len);
+ X86Features[init] = true;
}
-pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_crc32c_choose;
-
#endif /* defined(USE_SSE2) || defined(__i386__) */
diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c
index c1279d31fbd..2e740e12a7a 100644
--- a/src/port/pg_crc32c_sse42.c
+++ b/src/port/pg_crc32c_sse42.c
@@ -20,6 +20,9 @@
#endif
#include "port/pg_crc32c.h"
+#include "port/pg_cpu.h"
+
+static pg_crc32c pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len);
pg_attribute_no_sanitize_alignment()
pg_attribute_target("sse4.2")
@@ -159,3 +162,28 @@ pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t len)
}
#endif
+
+static pg_crc32c
+pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
+{
+ /*
+ * Set fallback. We must guard since slicing-by-8 is not visible
+ * everywhere.
+ */
+#ifdef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK
+ pg_comp_crc32c = pg_comp_crc32c_sb8;
+#endif
+
+ if (x86_feature_available(PG_SSE4_2))
+ pg_comp_crc32c = pg_comp_crc32c_sse42;
+
+#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
+ if (x86_feature_available(PG_AVX512_VL) &&
+ x86_feature_available(PG_VPCLMULQDQ))
+ pg_comp_crc32c = pg_comp_crc32c_avx512;
+#endif
+
+ return pg_comp_crc32c(crc, data, len);
+};
+
+pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_crc32c_choose;
diff --git a/src/port/pg_popcount_x86.c b/src/port/pg_popcount_x86.c
index 6bce089432f..a99613f1818 100644
--- a/src/port/pg_popcount_x86.c
+++ b/src/port/pg_popcount_x86.c
@@ -14,19 +14,12 @@
#ifdef HAVE_X86_64_POPCNTQ
-#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
-#include <cpuid.h>
-#endif
-
#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
#include <immintrin.h>
#endif
-#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX)
-#include <intrin.h>
-#endif
-
#include "port/pg_bitutils.h"
+#include "port/pg_cpu.h"
/*
* The SSE4.2 versions are built regardless of whether we are building the
@@ -58,84 +51,9 @@ static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask);
uint64 (*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose;
uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose;
-/*
- * Return true if CPUID indicates that the POPCNT instruction is available.
- */
-static bool
-pg_popcount_sse42_available(void)
-{
- unsigned int exx[4] = {0, 0, 0, 0};
-
-#if defined(HAVE__GET_CPUID)
- __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
-#elif defined(HAVE__CPUID)
- __cpuid(exx, 1);
-#else
-#error cpuid instruction not available
-#endif
-
- return (exx[2] & (1 << 23)) != 0; /* POPCNT */
-}
#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
-/*
- * Does CPUID say there's support for XSAVE instructions?
- */
-static inline bool
-xsave_available(void)
-{
- unsigned int exx[4] = {0, 0, 0, 0};
-
-#if defined(HAVE__GET_CPUID)
- __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
-#elif defined(HAVE__CPUID)
- __cpuid(exx, 1);
-#else
-#error cpuid instruction not available
-#endif
- return (exx[2] & (1 << 27)) != 0; /* osxsave */
-}
-
-/*
- * Does XGETBV say the ZMM registers are enabled?
- *
- * NB: Caller is responsible for verifying that xsave_available() returns true
- * before calling this.
- */
-#ifdef HAVE_XSAVE_INTRINSICS
-pg_attribute_target("xsave")
-#endif
-static inline bool
-zmm_regs_available(void)
-{
-#ifdef HAVE_XSAVE_INTRINSICS
- return (_xgetbv(0) & 0xe6) == 0xe6;
-#else
- return false;
-#endif
-}
-
-/*
- * Does CPUID say there's support for AVX-512 popcount and byte-and-word
- * instructions?
- */
-static inline bool
-avx512_popcnt_available(void)
-{
- unsigned int exx[4] = {0, 0, 0, 0};
-
-#if defined(HAVE__GET_CPUID_COUNT)
- __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
-#elif defined(HAVE__CPUIDEX)
- __cpuidex(exx, 7, 0);
-#else
-#error cpuid instruction not available
-#endif
- return (exx[2] & (1 << 14)) != 0 && /* avx512-vpopcntdq */
- (exx[1] & (1 << 30)) != 0; /* avx512-bw */
-}
-
/*
* Returns true if the CPU supports the instructions required for the AVX-512
* pg_popcount() implementation.
@@ -143,9 +61,8 @@ avx512_popcnt_available(void)
static bool
pg_popcount_avx512_available(void)
{
- return xsave_available() &&
- zmm_regs_available() &&
- avx512_popcnt_available();
+ return x86_feature_available(PG_AVX512_BW) &&
+ x86_feature_available(PG_AVX512_VPOPCNTDQ);
}
#endif /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
@@ -159,7 +76,7 @@ pg_popcount_avx512_available(void)
static inline void
choose_popcount_functions(void)
{
- if (pg_popcount_sse42_available())
+ if (x86_feature_available(PG_POPCNT))
{
pg_popcount_optimized = pg_popcount_sse42;
pg_popcount_masked_optimized = pg_popcount_masked_sse42;
--
2.53.0
From fd69a8a901222145dcebd721e88a57ccf0b96f80 Mon Sep 17 00:00:00 2001
From: John Naylor <[email protected]>
Date: Sat, 14 Feb 2026 19:01:34 +0700
Subject: [PATCH v3 4/4] Enable autovectorizing page checksums with AVX2 where
available
We already rely on autovectorization for computing page checksums,
but on x86 we can get about twice the performance by annotating
pg_checksum_block() with function target attributes for AVX2,
which uses 256-bit registers.
Co-authored-by: Matthew Sterrett <[email protected]>
Co-authored-by: Andrew Kim <[email protected]>
Reviewed-by: Oleg Tselebrovskiy <[email protected]>
Discussion: https://postgr.es/m/CA%2BvA85_5GTu%2BHHniSbvvP%2B8k3%3DxZO%3DWE84NPwiKyxztqvpfZ3Q%40mail.gmail.com
Discussion: https://postgr.es/m/20250911054220.3784-1-root%40ip-172-31-36-228.ec2.internal
---
config/c-compiler.m4 | 26 ++++++++++
configure | 46 ++++++++++++++++++
configure.ac | 9 ++++
meson.build | 30 ++++++++++++
src/backend/storage/page/checksum.c | 44 ++++++++++++++++-
src/include/pg_config.h.in | 3 ++
src/include/port/pg_cpu.h | 3 ++
src/include/storage/checksum_block_internal.h | 42 ++++++++++++++++
src/include/storage/checksum_impl.h | 48 ++++++-------------
src/port/pg_cpu_x86.c | 6 ++-
src/tools/pginclude/headerscheck | 2 +
11 files changed, 224 insertions(+), 35 deletions(-)
create mode 100644 src/include/storage/checksum_block_internal.h
diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 1509dbfa2ab..1f3e31fc2d3 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -613,6 +613,32 @@ fi
undefine([Ac_cachevar])dnl
])# PGAC_SSE42_CRC32_INTRINSICS
+# PGAC_AVX2_SUPPORT
+# ---------------------------
+# Check if the compiler supports AVX2 target attribute.
+# This is used for optimized checksum calculations with runtime detection.
+#
+# If AVX2 target attribute is supported, sets pgac_avx2_support.
+AC_DEFUN([PGAC_AVX2_SUPPORT],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx2_support])])dnl
+AC_CACHE_CHECK([for AVX2 target attribute support], [Ac_cachevar],
+[AC_COMPILE_IFELSE([AC_LANG_PROGRAM([#include <stdint.h>
+ #if defined(__has_attribute) && __has_attribute (target)
+ __attribute__((target("avx2")))
+ static int avx2_test(void)
+ {
+ return 0;
+ }
+ #endif],
+ [return avx2_test();])],
+ [Ac_cachevar=yes],
+ [Ac_cachevar=no])])
+if test x"$Ac_cachevar" = x"yes"; then
+ pgac_avx2_support=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_AVX2_SUPPORT
+
# PGAC_AVX512_PCLMUL_INTRINSICS
# ---------------------------
# Check if the compiler supports AVX-512 carryless multiplication
diff --git a/configure b/configure
index 185703289b4..2d2c6308005 100755
--- a/configure
+++ b/configure
@@ -17718,6 +17718,52 @@ $as_echo "#define HAVE__CPUIDEX 1" >>confdefs.h
fi
fi
+# Check for AVX2 target and intrinsic support
+#
+if test x"$host_cpu" = x"x86_64"; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: checking for AVX2 target attribute support" >&5
+$as_echo_n "checking for AVX2 target attribute support... " >&6; }
+if ${pgac_cv_avx2_support+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+#include <stdint.h>
+ #if defined(__has_attribute) && __has_attribute (target)
+ __attribute__((target("avx2")))
+ static int avx2_test(void)
+ {
+ return 0;
+ }
+ #endif
+int
+main ()
+{
+return avx2_test();
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"; then :
+ pgac_cv_avx2_support=yes
+else
+ pgac_cv_avx2_support=no
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx2_support" >&5
+$as_echo "$pgac_cv_avx2_support" >&6; }
+if test x"$pgac_cv_avx2_support" = x"yes"; then
+ pgac_avx2_support=yes
+fi
+
+ if test x"$pgac_avx2_support" = x"yes"; then
+
+$as_echo "#define USE_AVX2_WITH_RUNTIME_CHECK 1" >>confdefs.h
+
+ fi
+fi
+
# Check for XSAVE intrinsics
#
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _xgetbv" >&5
diff --git a/configure.ac b/configure.ac
index 0955b7e4371..0b4c3970b68 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2122,6 +2122,15 @@ else
fi
fi
+# Check for AVX2 target and intrinsic support
+#
+if test x"$host_cpu" = x"x86_64"; then
+ PGAC_AVX2_SUPPORT()
+ if test x"$pgac_avx2_support" = x"yes"; then
+ AC_DEFINE(USE_AVX2_WITH_RUNTIME_CHECK, 1, [Define to 1 to use AVX2 instructions with a runtime check.])
+ fi
+fi
+
# Check for XSAVE intrinsics
#
PGAC_XSAVE_INTRINSICS()
diff --git a/meson.build b/meson.build
index f6d5842d852..feea3658ff3 100644
--- a/meson.build
+++ b/meson.build
@@ -2377,6 +2377,36 @@ int main(void)
endif
+###############################################################
+# Check for the availability of AVX2 support
+###############################################################
+
+if host_cpu == 'x86_64'
+
+ prog = '''
+#include <immintrin.h>
+#include <stdint.h>
+#if defined(__has_attribute) && __has_attribute (target)
+__attribute__((target("avx2")))
+#endif
+static int avx2_test(void)
+{
+ return 0;
+}
+
+int main(void)
+{
+ return avx2_test();
+}
+'''
+
+ if cc.links(prog, name: 'AVX2 support', args: test_c_args)
+ cdata.set('USE_AVX2_WITH_RUNTIME_CHECK', 1)
+ endif
+
+endif
+
+
###############################################################
# Check for the availability of AVX-512 popcount intrinsics.
###############################################################
diff --git a/src/backend/storage/page/checksum.c b/src/backend/storage/page/checksum.c
index 8716651c8b5..030c44f7308 100644
--- a/src/backend/storage/page/checksum.c
+++ b/src/backend/storage/page/checksum.c
@@ -13,10 +13,52 @@
*/
#include "postgres.h"
+#include "port/pg_cpu.h"
#include "storage/checksum.h"
/*
* The actual code is in storage/checksum_impl.h. This is done so that
* external programs can incorporate the checksum code by #include'ing
- * that file from the exported Postgres headers. (Compare our CRC code.)
+ * that file from the exported Postgres headers. (Compare our legacy
+ * CRC code in pg_crc.h.)
+ * The PG_CHECKSUM_INTERNAL symbol allows core to use hardware-specific
+ * coding without affecting external programs.
*/
+#define PG_CHECKSUM_INTERNAL
#include "storage/checksum_impl.h" /* IWYU pragma: keep */
+
+
+static uint32
+pg_checksum_block_fallback(const PGChecksummablePage *page)
+{
+#include "storage/checksum_block_internal.h"
+}
+
+/*
+ * AVX2-optimized block checksum algorithm.
+ */
+#ifdef USE_AVX2_WITH_RUNTIME_CHECK
+pg_attribute_target("avx2")
+static uint32
+pg_checksum_block_avx2(const PGChecksummablePage *page)
+{
+#include "storage/checksum_block_internal.h"
+}
+#endif /* USE_AVX2_WITH_RUNTIME_CHECK */
+
+/*
+ * Choose the best available checksum implementation.
+ */
+static uint32
+pg_checksum_choose(const PGChecksummablePage *page)
+{
+ pg_checksum_block = pg_checksum_block_fallback;
+
+#ifdef USE_AVX2_WITH_RUNTIME_CHECK
+ if (x86_feature_available(PG_AVX2))
+ pg_checksum_block = pg_checksum_block_avx2;
+#endif
+
+ return pg_checksum_block(page);
+}
+
+static uint32 (*pg_checksum_block) (const PGChecksummablePage *page) = pg_checksum_choose;
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 339268dc8ef..1e43e9b2bc4 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -665,6 +665,9 @@
/* Define to 1 to use AVX-512 CRC algorithms with a runtime check. */
#undef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
+/* Define to 1 to use AVX2 instructions with a runtime check. */
+#undef USE_AVX2_WITH_RUNTIME_CHECK
+
/* Define to 1 to use AVX-512 popcount instructions with a runtime check. */
#undef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
diff --git a/src/include/port/pg_cpu.h b/src/include/port/pg_cpu.h
index ffa9bfb3fd4..00efa9bb959 100644
--- a/src/include/port/pg_cpu.h
+++ b/src/include/port/pg_cpu.h
@@ -24,6 +24,9 @@ typedef enum X86FeatureId
PG_SSE4_2,
PG_POPCNT,
+ /* 256-bit registers */
+ PG_AVX2,
+
/* 512-bit registers */
PG_AVX512_BW,
PG_AVX512_VL,
diff --git a/src/include/storage/checksum_block_internal.h b/src/include/storage/checksum_block_internal.h
new file mode 100644
index 00000000000..b4e6987d6b5
--- /dev/null
+++ b/src/include/storage/checksum_block_internal.h
@@ -0,0 +1,42 @@
+/*-------------------------------------------------------------------------
+ *
+ * checksum_block_internal.h
+ * Core algorithm for page checksums, semi-private to checksum_impl.h
+ * and checksum.c.
+ *
+ * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/checksum_block_internal.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+/* there is deliberately not an #ifndef CHECKSUM_BLOCK_INTERNAL_H here */
+
+uint32 sums[N_SUMS];
+uint32 result = 0;
+uint32 i,
+ j;
+
+/* ensure that the size is compatible with the algorithm */
+Assert(sizeof(PGChecksummablePage) == BLCKSZ);
+
+/* initialize partial checksums to their corresponding offsets */
+memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets));
+
+/* main checksum calculation */
+for (i = 0; i < (uint32) (BLCKSZ / (sizeof(uint32) * N_SUMS)); i++)
+ for (j = 0; j < N_SUMS; j++)
+ CHECKSUM_COMP(sums[j], page->data[i][j]);
+
+/* finally add in two rounds of zeroes for additional mixing */
+for (i = 0; i < 2; i++)
+ for (j = 0; j < N_SUMS; j++)
+ CHECKSUM_COMP(sums[j], 0);
+
+/* xor fold partial checksums together */
+for (i = 0; i < N_SUMS; i++)
+ result ^= sums[i];
+
+return result;
diff --git a/src/include/storage/checksum_impl.h b/src/include/storage/checksum_impl.h
index 5c2dcbc63e7..8a308e423c3 100644
--- a/src/include/storage/checksum_impl.h
+++ b/src/include/storage/checksum_impl.h
@@ -73,11 +73,10 @@
* 2e-16 false positive rate within margin of error.
*
* Vectorization of the algorithm requires 32bit x 32bit -> 32bit integer
- * multiplication instruction. As of 2013 the corresponding instruction is
- * available on x86 SSE4.1 extensions (pmulld) and ARM NEON (vmul.i32).
- * Vectorization requires a compiler to do the vectorization for us. For recent
- * GCC versions the flags -msse4.1 -funroll-loops -ftree-vectorize are enough
- * to achieve vectorization.
+ * multiplication instruction. Examples include x86 AVX2 extensions (vpmulld)
+ * and ARM NEON (vmul.i32). For simplicity we rely on the compiler to do the
+ * vectorization for us. For GCC and clang the flags -funroll-loops
+ * -ftree-vectorize are enough to achieve vectorization.
*
* The optimal amount of parallelism to use depends on CPU specific instruction
* latency, SIMD instruction width, throughput and the amount of registers
@@ -89,8 +88,9 @@
*
* The parallelism number 32 was chosen based on the fact that it is the
* largest state that fits into architecturally visible x86 SSE registers while
- * leaving some free registers for intermediate values. For future processors
- * with 256bit vector registers this will leave some performance on the table.
+ * leaving some free registers for intermediate values. For processors
+ * with 256bit vector registers this leaves some performance on the table.
+ *
* When vectorization is not available it might be beneficial to restructure
* the computation to calculate a subset of the columns at a time and perform
* multiple passes to avoid register spilling. This optimization opportunity
@@ -138,6 +138,9 @@ do { \
(checksum) = __tmp * FNV_PRIME ^ (__tmp >> 17); \
} while (0)
+/* Provide a static definition for external programs */
+#ifndef PG_CHECKSUM_INTERNAL
+
/*
* Block checksum algorithm. The page must be adequately aligned
* (at least on 4-byte boundary).
@@ -145,34 +148,13 @@ do { \
static uint32
pg_checksum_block(const PGChecksummablePage *page)
{
- uint32 sums[N_SUMS];
- uint32 result = 0;
- uint32 i,
- j;
-
- /* ensure that the size is compatible with the algorithm */
- Assert(sizeof(PGChecksummablePage) == BLCKSZ);
-
- /* initialize partial checksums to their corresponding offsets */
- memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets));
-
- /* main checksum calculation */
- for (i = 0; i < (uint32) (BLCKSZ / (sizeof(uint32) * N_SUMS)); i++)
- for (j = 0; j < N_SUMS; j++)
- CHECKSUM_COMP(sums[j], page->data[i][j]);
-
- /* finally add in two rounds of zeroes for additional mixing */
- for (i = 0; i < 2; i++)
- for (j = 0; j < N_SUMS; j++)
- CHECKSUM_COMP(sums[j], 0);
-
- /* xor fold partial checksums together */
- for (i = 0; i < N_SUMS; i++)
- result ^= sums[i];
-
- return result;
+#include "storage/checksum_block_internal.h"
}
+#else
+static uint32 (*pg_checksum_block) (const PGChecksummablePage *page);
+#endif
+
/*
* Compute the checksum for a Postgres page.
*
diff --git a/src/port/pg_cpu_x86.c b/src/port/pg_cpu_x86.c
index b016a42bd9a..a812f378c96 100644
--- a/src/port/pg_cpu_x86.c
+++ b/src/port/pg_cpu_x86.c
@@ -74,7 +74,7 @@ set_x86_features(void)
{
uint32 xcr0_val = 0;
- /* second cpuid call on leaf 7 to check extended AVX-512 support */
+ /* second cpuid call on leaf 7 to check extended support */
memset(exx, 0, 4 * sizeof(exx[0]));
#if defined(HAVE__GET_CPUID_COUNT)
__get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
@@ -87,6 +87,10 @@ set_x86_features(void)
xcr0_val = _xgetbv(0);
#endif
+ /* Are YMM registers enabled? */
+ if (mask_available(xcr0_val, XMM | YMM))
+ X86Features[PG_AVX2] = exx[1] >> 5 & 1;
+
/* Are ZMM registeres enabled? */
if (mask_available(xcr0_val, XMM | YMM |
OPMASK | ZMM0_15 | ZMM16_31))
diff --git a/src/tools/pginclude/headerscheck b/src/tools/pginclude/headerscheck
index 7a6755991bb..569e749b25a 100755
--- a/src/tools/pginclude/headerscheck
+++ b/src/tools/pginclude/headerscheck
@@ -154,6 +154,8 @@ do
test "$f" = src/include/catalog/syscache_ids.h && continue
test "$f" = src/include/catalog/syscache_info.h && continue
+ test "$f" = src/include/storage/checksum_block_internal.h && continue
+
# We can't make these Bison output files compilable standalone
# without using "%code require", which old Bison versions lack.
# parser/gram.h will be included by parser/gramparse.h anyway.
--
2.53.0
From 4903f5b9c27d6d84b0e33f7506d200753958b133 Mon Sep 17 00:00:00 2001
From: John Naylor <[email protected]>
Date: Fri, 13 Feb 2026 18:11:39 +0700
Subject: [PATCH v3 3/4] Refactor the detection of ZMM registers
- Call _xgetbv within set_x86_features rather than in a
separate function
- Use symbols for XCR mask bits rather than a magic constant
A future commit will build on this to detect YMM registers without
code duplication.
---
src/port/pg_cpu_x86.c | 42 +++++++++++++++++++++-----------------
src/port/pg_crc32c_sse42.c | 2 +-
2 files changed, 24 insertions(+), 20 deletions(-)
diff --git a/src/port/pg_cpu_x86.c b/src/port/pg_cpu_x86.c
index 8951e560ffe..b016a42bd9a 100644
--- a/src/port/pg_cpu_x86.c
+++ b/src/port/pg_cpu_x86.c
@@ -31,31 +31,28 @@
#include "port/pg_cpu.h"
+/* XSAVE state component bits that we need */
+#define XMM (1<<1)
+#define YMM (1<<2)
+#define OPMASK (1<<5)
+#define ZMM0_15 (1<<6)
+#define ZMM16_31 (1<<7)
+
bool X86Features[X86FeaturesSize] = {0};
-/*
- * Does XGETBV say the ZMM registers are enabled?
- *
- * NB: Caller is responsible for verifying that osxsave is available
- * before calling this.
- */
-#ifdef HAVE_XSAVE_INTRINSICS
-pg_attribute_target("xsave")
-#endif
static bool
-zmm_regs_available(void)
+mask_available(uint32 value, uint32 mask)
{
-#ifdef HAVE_XSAVE_INTRINSICS
- return (_xgetbv(0) & 0xe6) == 0xe6;
-#else
- return false;
-#endif
+ return (value & mask) == mask;
}
/*
* Parse the CPU ID info for runtime checks.
*/
+#ifdef HAVE_XSAVE_INTRINSICS
+pg_attribute_target("xsave")
+#endif
void
set_x86_features(void)
{
@@ -75,22 +72,29 @@ set_x86_features(void)
/* All these features depend on OSXSAVE */
if (exx[2] & (1 << 27))
{
- /* second cpuid call on leaf 7 to check extended AVX-512 support */
+ uint32 xcr0_val = 0;
+ /* second cpuid call on leaf 7 to check extended AVX-512 support */
memset(exx, 0, 4 * sizeof(exx[0]));
-
#if defined(HAVE__GET_CPUID_COUNT)
__get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
#elif defined(HAVE__CPUIDEX)
__cpuidex(exx, 7, 0);
#endif
- if (zmm_regs_available())
+#ifdef HAVE_XSAVE_INTRINSICS
+ /* get value of Extended Control Register */
+ xcr0_val = _xgetbv(0);
+#endif
+
+ /* Are ZMM registeres enabled? */
+ if (mask_available(xcr0_val, XMM | YMM |
+ OPMASK | ZMM0_15 | ZMM16_31))
{
X86Features[PG_AVX512_BW] = exx[1] >> 30 & 1;
X86Features[PG_AVX512_VL] = exx[1] >> 31 & 1;
- X86Features[PG_VPCLMULQDQ] = exx[2] >> 10 & 1;
+ X86Features[PG_AVX512_VPCLMULQDQ] = exx[2] >> 10 & 1;
X86Features[PG_AVX512_VPOPCNTDQ] = exx[2] >> 14 & 1;
}
}
diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c
index 2e740e12a7a..d1d9d74e5ab 100644
--- a/src/port/pg_crc32c_sse42.c
+++ b/src/port/pg_crc32c_sse42.c
@@ -179,7 +179,7 @@ pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
if (x86_feature_available(PG_AVX512_VL) &&
- x86_feature_available(PG_VPCLMULQDQ))
+ x86_feature_available(PG_AVX512_VPCLMULQDQ))
pg_comp_crc32c = pg_comp_crc32c_avx512;
#endif
--
2.53.0
From 29a80a9ab3aeb23106afdc63281bb51b23a0b7a8 Mon Sep 17 00:00:00 2001
From: John Naylor <[email protected]>
Date: Wed, 11 Feb 2026 14:34:18 +0700
Subject: [PATCH v3 1/4] Rename CRC "choose" files for future general-purpose use
WIP: x86 only
---
configure | 4 ++--
configure.ac | 4 ++--
src/port/Makefile | 1 +
src/port/meson.build | 3 +--
src/port/{pg_crc32c_sse42_choose.c => pg_cpu_x86.c} | 8 ++++++--
5 files changed, 12 insertions(+), 8 deletions(-)
rename src/port/{pg_crc32c_sse42_choose.c => pg_cpu_x86.c} (94%)
diff --git a/configure b/configure
index a10a2c85c6a..185703289b4 100755
--- a/configure
+++ b/configure
@@ -18196,7 +18196,7 @@ if test x"$USE_SSE42_CRC32C" = x"1"; then
$as_echo "#define USE_SSE42_CRC32C 1" >>confdefs.h
- PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sse42_choose.o"
+ PG_CRC32C_OBJS="pg_crc32c_sse42.o"
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2" >&5
$as_echo "SSE 4.2" >&6; }
else
@@ -18204,7 +18204,7 @@ else
$as_echo "#define USE_SSE42_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
- PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o pg_crc32c_sse42_choose.o"
+ PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o"
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2 with runtime check" >&5
$as_echo "SSE 4.2 with runtime check" >&6; }
else
diff --git a/configure.ac b/configure.ac
index 814e64a967e..0955b7e4371 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2245,12 +2245,12 @@ fi
AC_MSG_CHECKING([which CRC-32C implementation to use])
if test x"$USE_SSE42_CRC32C" = x"1"; then
AC_DEFINE(USE_SSE42_CRC32C, 1, [Define to 1 use Intel SSE 4.2 CRC instructions.])
- PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sse42_choose.o"
+ PG_CRC32C_OBJS="pg_crc32c_sse42.o"
AC_MSG_RESULT(SSE 4.2)
else
if test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
AC_DEFINE(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Intel SSE 4.2 CRC instructions with a runtime check.])
- PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o pg_crc32c_sse42_choose.o"
+ PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o"
AC_MSG_RESULT(SSE 4.2 with runtime check)
else
if test x"$USE_ARMV8_CRC32C" = x"1"; then
diff --git a/src/port/Makefile b/src/port/Makefile
index 6e3b7d154ed..47cfea1507d 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -44,6 +44,7 @@ OBJS = \
noblock.o \
path.o \
pg_bitutils.o \
+ pg_cpu_x86.o \
pg_localeconv_r.o \
pg_numa.o \
pg_popcount_aarch64.o \
diff --git a/src/port/meson.build b/src/port/meson.build
index d7d4e705b89..edb2e5632bd 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -7,6 +7,7 @@ pgport_sources = [
'noblock.c',
'path.c',
'pg_bitutils.c',
+ 'pg_cpu_x86.c',
'pg_localeconv_r.c',
'pg_numa.c',
'pg_popcount_aarch64.c',
@@ -86,8 +87,6 @@ replace_funcs_pos = [
# x86/x64
['pg_crc32c_sse42', 'USE_SSE42_CRC32C'],
['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
- ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C'],
- ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
# arm / aarch64
diff --git a/src/port/pg_crc32c_sse42_choose.c b/src/port/pg_cpu_x86.c
similarity index 94%
rename from src/port/pg_crc32c_sse42_choose.c
rename to src/port/pg_cpu_x86.c
index f586476964f..998a70ffa41 100644
--- a/src/port/pg_crc32c_sse42_choose.c
+++ b/src/port/pg_cpu_x86.c
@@ -1,6 +1,6 @@
/*-------------------------------------------------------------------------
*
- * pg_crc32c_sse42_choose.c
+ * pg_cpu_x86.c
* Choose between Intel SSE 4.2 and software CRC-32C implementation.
*
* On first call, checks if the CPU we're running on supports Intel SSE
@@ -13,13 +13,15 @@
*
*
* IDENTIFICATION
- * src/port/pg_crc32c_sse42_choose.c
+ * src/port/pg_cpu_x86.c
*
*-------------------------------------------------------------------------
*/
#include "c.h"
+#if defined(USE_SSE2) || defined(__i386__)
+
#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
#include <cpuid.h>
#endif
@@ -107,3 +109,5 @@ pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
}
pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_crc32c_choose;
+
+#endif /* defined(USE_SSE2) || defined(__i386__) */
--
2.53.0