Here is a new patch set. Notably, I've added a 0004 that does the
following:
* Removes TRY_POPCNT_X86_64. We now assume that the required CPUID
intrinsics are available, as we have long done in some of the CRC-32C code.
* Moves the MSVC check for HAVE_X86_64_POPCNTQ to configuration-time. This
way, we set it for all relevant platforms in one place.
* Moves the #defines for USE_SSE2 and USE_NEON to c.h so that they can be
used elsewhere without simd.h. Consequently, we can remove POPCNT_AARCH64.
* Moves the #includes for pg_bitutils.h to below the system headers in
pg_popcount_{aarch64,x86}.c (since we no longer depend on macros defined in
pg_bitutils.h).
--
nathan
>From 2fd556d0e1856d25381666b370fc000ba015523b Mon Sep 17 00:00:00 2001
From: Nathan Bossart <[email protected]>
Date: Wed, 14 Jan 2026 11:37:28 -0600
Subject: [PATCH v2 1/4] Rename pg_popcount_avx512.c to pg_popcount_x86.c.
This is preparatory work for a follow-up commit that will move the
rest of the x86-64-specific popcount code to this file.
---
src/port/Makefile | 2 +-
src/port/meson.build | 2 +-
src/port/{pg_popcount_avx512.c => pg_popcount_x86.c} | 6 +++---
3 files changed, 5 insertions(+), 5 deletions(-)
rename src/port/{pg_popcount_avx512.c => pg_popcount_x86.c} (98%)
diff --git a/src/port/Makefile b/src/port/Makefile
index 4274949dfa4..6e3b7d154ed 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -47,7 +47,7 @@ OBJS = \
pg_localeconv_r.o \
pg_numa.o \
pg_popcount_aarch64.o \
- pg_popcount_avx512.o \
+ pg_popcount_x86.o \
pg_strong_random.o \
pgcheckdir.o \
pgmkdirp.o \
diff --git a/src/port/meson.build b/src/port/meson.build
index 28655142ebe..d7d4e705b89 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -10,7 +10,7 @@ pgport_sources = [
'pg_localeconv_r.c',
'pg_numa.c',
'pg_popcount_aarch64.c',
- 'pg_popcount_avx512.c',
+ 'pg_popcount_x86.c',
'pg_strong_random.c',
'pgcheckdir.c',
'pgmkdirp.c',
diff --git a/src/port/pg_popcount_avx512.c b/src/port/pg_popcount_x86.c
similarity index 98%
rename from src/port/pg_popcount_avx512.c
rename to src/port/pg_popcount_x86.c
index 407b610bacb..453c7a06ce9 100644
--- a/src/port/pg_popcount_avx512.c
+++ b/src/port/pg_popcount_x86.c
@@ -1,12 +1,12 @@
/*-------------------------------------------------------------------------
*
- * pg_popcount_avx512.c
- * Holds the AVX-512 pg_popcount() implementation.
+ * pg_popcount_x86.c
+ * Holds the x86-64 pg_popcount() implementations.
*
* Copyright (c) 2024-2026, PostgreSQL Global Development Group
*
* IDENTIFICATION
- * src/port/pg_popcount_avx512.c
+ * src/port/pg_popcount_x86.c
*
*-------------------------------------------------------------------------
*/
--
2.50.1 (Apple Git-155)
>From 9cf58f65ba5fa838d908ea4ffe528fff42154f9c Mon Sep 17 00:00:00 2001
From: Nathan Bossart <[email protected]>
Date: Wed, 14 Jan 2026 11:54:54 -0600
Subject: [PATCH v2 2/4] Move x86 popcount code to pg_popcount_x86.c.
This moves the SSE4.2 popcount implementations to the recently
renamed file for x86-64-specific popcount code.
---
src/include/port/pg_bitutils.h | 20 ++-
src/port/pg_bitutils.c | 263 +-------------------------------
src/port/pg_popcount_x86.c | 264 +++++++++++++++++++++++++++++++--
3 files changed, 267 insertions(+), 280 deletions(-)
diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index 8ed12f7a021..c9c508d4ba3 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -307,23 +307,21 @@ pg_ceil_log2_64(uint64 num)
#define POPCNT_AARCH64 1
#endif
+extern int pg_popcount32_slow(uint32 word);
+extern int pg_popcount64_slow(uint64 word);
+extern uint64 pg_popcount_slow(const char *buf, int bytes);
+extern uint64 pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask);
+
#ifdef TRY_POPCNT_X86_64
-/* Attempt to use the POPCNT instruction, but perform a runtime check first */
+/*
+ * Attempt to use SSE4.2 or AVX-512 instructions, but perform a runtime check
+ * first.
+ */
extern PGDLLIMPORT int (*pg_popcount32) (uint32 word);
extern PGDLLIMPORT int (*pg_popcount64) (uint64 word);
extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int
bytes);
extern PGDLLIMPORT uint64 (*pg_popcount_masked_optimized) (const char *buf,
int bytes, bits8 mask);
-/*
- * We can also try to use the AVX-512 popcount instruction on some systems.
- * The implementation of that is located in its own file.
- */
-#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
-extern bool pg_popcount_avx512_available(void);
-extern uint64 pg_popcount_avx512(const char *buf, int bytes);
-extern uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8
mask);
-#endif
-
#elif POPCNT_AARCH64
/* Use the Neon version of pg_popcount{32,64} without function pointer. */
extern int pg_popcount32(uint32 word);
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index 7875bbb0f4b..9f9f90ddd4d 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -12,13 +12,6 @@
*/
#include "c.h"
-#ifdef HAVE__GET_CPUID
-#include <cpuid.h>
-#endif
-#ifdef HAVE__CPUID
-#include <intrin.h>
-#endif
-
#include "port/pg_bitutils.h"
@@ -103,257 +96,11 @@ const uint8 pg_number_of_ones[256] = {
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};
-/*
- * If we are building the Neon versions, we don't need the "slow" fallbacks.
- */
-#ifndef POPCNT_AARCH64
-static inline int pg_popcount32_slow(uint32 word);
-static inline int pg_popcount64_slow(uint64 word);
-static uint64 pg_popcount_slow(const char *buf, int bytes);
-static uint64 pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask);
-#endif
-
-#ifdef TRY_POPCNT_X86_64
-static bool pg_popcount_available(void);
-static int pg_popcount32_choose(uint32 word);
-static int pg_popcount64_choose(uint64 word);
-static uint64 pg_popcount_choose(const char *buf, int bytes);
-static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8
mask);
-static inline int pg_popcount32_fast(uint32 word);
-static inline int pg_popcount64_fast(uint64 word);
-static uint64 pg_popcount_fast(const char *buf, int bytes);
-static uint64 pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask);
-
-int (*pg_popcount32) (uint32 word) = pg_popcount32_choose;
-int (*pg_popcount64) (uint64 word) = pg_popcount64_choose;
-uint64 (*pg_popcount_optimized) (const char *buf, int bytes) =
pg_popcount_choose;
-uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes,
bits8 mask) = pg_popcount_masked_choose;
-#endif /* TRY_POPCNT_X86_64 */
-
-#ifdef TRY_POPCNT_X86_64
-
-/*
- * Return true if CPUID indicates that the POPCNT instruction is available.
- */
-static bool
-pg_popcount_available(void)
-{
- unsigned int exx[4] = {0, 0, 0, 0};
-
-#if defined(HAVE__GET_CPUID)
- __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
-#elif defined(HAVE__CPUID)
- __cpuid(exx, 1);
-#else
-#error cpuid instruction not available
-#endif
-
- return (exx[2] & (1 << 23)) != 0; /* POPCNT */
-}
-
-/*
- * These functions get called on the first call to pg_popcount32 etc.
- * They detect whether we can use the asm implementations, and replace
- * the function pointers so that subsequent calls are routed directly to
- * the chosen implementation.
- */
-static inline void
-choose_popcount_functions(void)
-{
- if (pg_popcount_available())
- {
- pg_popcount32 = pg_popcount32_fast;
- pg_popcount64 = pg_popcount64_fast;
- pg_popcount_optimized = pg_popcount_fast;
- pg_popcount_masked_optimized = pg_popcount_masked_fast;
- }
- else
- {
- pg_popcount32 = pg_popcount32_slow;
- pg_popcount64 = pg_popcount64_slow;
- pg_popcount_optimized = pg_popcount_slow;
- pg_popcount_masked_optimized = pg_popcount_masked_slow;
- }
-
-#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
- if (pg_popcount_avx512_available())
- {
- pg_popcount_optimized = pg_popcount_avx512;
- pg_popcount_masked_optimized = pg_popcount_masked_avx512;
- }
-#endif
-}
-
-static int
-pg_popcount32_choose(uint32 word)
-{
- choose_popcount_functions();
- return pg_popcount32(word);
-}
-
-static int
-pg_popcount64_choose(uint64 word)
-{
- choose_popcount_functions();
- return pg_popcount64(word);
-}
-
-static uint64
-pg_popcount_choose(const char *buf, int bytes)
-{
- choose_popcount_functions();
- return pg_popcount_optimized(buf, bytes);
-}
-
-static uint64
-pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask)
-{
- choose_popcount_functions();
- return pg_popcount_masked(buf, bytes, mask);
-}
-
-/*
- * pg_popcount32_fast
- * Return the number of 1 bits set in word
- */
-static inline int
-pg_popcount32_fast(uint32 word)
-{
-#ifdef _MSC_VER
- return __popcnt(word);
-#else
- uint32 res;
-
-__asm__ __volatile__(" popcntl %1,%0\n":"=q"(res):"rm"(word):"cc");
- return (int) res;
-#endif
-}
-
-/*
- * pg_popcount64_fast
- * Return the number of 1 bits set in word
- */
-static inline int
-pg_popcount64_fast(uint64 word)
-{
-#ifdef _MSC_VER
- return __popcnt64(word);
-#else
- uint64 res;
-
-__asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc");
- return (int) res;
-#endif
-}
-
-/*
- * pg_popcount_fast
- * Returns the number of 1-bits in buf
- */
-static uint64
-pg_popcount_fast(const char *buf, int bytes)
-{
- uint64 popcnt = 0;
-
-#if SIZEOF_VOID_P >= 8
- /* Process in 64-bit chunks if the buffer is aligned. */
- if (buf == (const char *) TYPEALIGN(8, buf))
- {
- const uint64 *words = (const uint64 *) buf;
-
- while (bytes >= 8)
- {
- popcnt += pg_popcount64_fast(*words++);
- bytes -= 8;
- }
-
- buf = (const char *) words;
- }
-#else
- /* Process in 32-bit chunks if the buffer is aligned. */
- if (buf == (const char *) TYPEALIGN(4, buf))
- {
- const uint32 *words = (const uint32 *) buf;
-
- while (bytes >= 4)
- {
- popcnt += pg_popcount32_fast(*words++);
- bytes -= 4;
- }
-
- buf = (const char *) words;
- }
-#endif
-
- /* Process any remaining bytes */
- while (bytes--)
- popcnt += pg_number_of_ones[(unsigned char) *buf++];
-
- return popcnt;
-}
-
-/*
- * pg_popcount_masked_fast
- * Returns the number of 1-bits in buf after applying the mask to
each byte
- */
-static uint64
-pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask)
-{
- uint64 popcnt = 0;
-
-#if SIZEOF_VOID_P >= 8
- /* Process in 64-bit chunks if the buffer is aligned */
- uint64 maskv = ~UINT64CONST(0) / 0xFF * mask;
-
- if (buf == (const char *) TYPEALIGN(8, buf))
- {
- const uint64 *words = (const uint64 *) buf;
-
- while (bytes >= 8)
- {
- popcnt += pg_popcount64_fast(*words++ & maskv);
- bytes -= 8;
- }
-
- buf = (const char *) words;
- }
-#else
- /* Process in 32-bit chunks if the buffer is aligned. */
- uint32 maskv = ~((uint32) 0) / 0xFF * mask;
-
- if (buf == (const char *) TYPEALIGN(4, buf))
- {
- const uint32 *words = (const uint32 *) buf;
-
- while (bytes >= 4)
- {
- popcnt += pg_popcount32_fast(*words++ & maskv);
- bytes -= 4;
- }
-
- buf = (const char *) words;
- }
-#endif
-
- /* Process any remaining bytes */
- while (bytes--)
- popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask];
-
- return popcnt;
-}
-
-#endif /* TRY_POPCNT_X86_64 */
-
-/*
- * If we are building the Neon versions, we don't need the "slow" fallbacks.
- */
-#ifndef POPCNT_AARCH64
-
/*
* pg_popcount32_slow
* Return the number of 1 bits set in word
*/
-static inline int
+int
pg_popcount32_slow(uint32 word)
{
#ifdef HAVE__BUILTIN_POPCOUNT
@@ -375,7 +122,7 @@ pg_popcount32_slow(uint32 word)
* pg_popcount64_slow
* Return the number of 1 bits set in word
*/
-static inline int
+int
pg_popcount64_slow(uint64 word)
{
#ifdef HAVE__BUILTIN_POPCOUNT
@@ -403,7 +150,7 @@ pg_popcount64_slow(uint64 word)
* pg_popcount_slow
* Returns the number of 1-bits in buf
*/
-static uint64
+uint64
pg_popcount_slow(const char *buf, int bytes)
{
uint64 popcnt = 0;
@@ -449,7 +196,7 @@ pg_popcount_slow(const char *buf, int bytes)
* pg_popcount_masked_slow
* Returns the number of 1-bits in buf after applying the mask to
each byte
*/
-static uint64
+uint64
pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask)
{
uint64 popcnt = 0;
@@ -495,8 +242,6 @@ pg_popcount_masked_slow(const char *buf, int bytes, bits8
mask)
return popcnt;
}
-#endif /* ! POPCNT_AARCH64 */
-
#if !defined(TRY_POPCNT_X86_64) && !defined(POPCNT_AARCH64)
/*
diff --git a/src/port/pg_popcount_x86.c b/src/port/pg_popcount_x86.c
index 453c7a06ce9..f8643642613 100644
--- a/src/port/pg_popcount_x86.c
+++ b/src/port/pg_popcount_x86.c
@@ -12,26 +12,74 @@
*/
#include "c.h"
-#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
+#include "port/pg_bitutils.h"
+
+#ifdef TRY_POPCNT_X86_64
#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
#include <cpuid.h>
#endif
+#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
#include <immintrin.h>
+#endif
#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX)
#include <intrin.h>
#endif
-#include "port/pg_bitutils.h"
+/*
+ * The SSE4.2 versions are built regardless of whether we are building the
+ * AVX-512 versions.
+ */
+static inline int pg_popcount32_fast(uint32 word);
+static inline int pg_popcount64_fast(uint64 word);
+static uint64 pg_popcount_fast(const char *buf, int bytes);
+static uint64 pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask);
/*
- * It's probably unlikely that TRY_POPCNT_X86_64 won't be set if we are able to
- * use AVX-512 intrinsics, but we check it anyway to be sure. We piggy-back on
- * the function pointers that are only used when TRY_POPCNT_X86_64 is set.
+ * These are the AVX-512 implementations of the popcount functions.
*/
-#ifdef TRY_POPCNT_X86_64
+#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
+static uint64 pg_popcount_avx512(const char *buf, int bytes);
+static uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8
mask);
+#endif /*
USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
+
+/*
+ * The function pointers are initially set to "choose" functions. These
+ * functions will first set the pointers to the right implementations (based on
+ * what the current CPU supports) and then will call the pointer to fulfill the
+ * caller's request.
+ */
+static int pg_popcount32_choose(uint32 word);
+static int pg_popcount64_choose(uint64 word);
+static uint64 pg_popcount_choose(const char *buf, int bytes);
+static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8
mask);
+int (*pg_popcount32) (uint32 word) = pg_popcount32_choose;
+int (*pg_popcount64) (uint64 word) = pg_popcount64_choose;
+uint64 (*pg_popcount_optimized) (const char *buf, int bytes) =
pg_popcount_choose;
+uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes,
bits8 mask) = pg_popcount_masked_choose;
+
+/*
+ * Return true if CPUID indicates that the POPCNT instruction is available.
+ */
+static bool
+pg_popcount_available(void)
+{
+ unsigned int exx[4] = {0, 0, 0, 0};
+
+#if defined(HAVE__GET_CPUID)
+ __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUID)
+ __cpuid(exx, 1);
+#else
+#error cpuid instruction not available
+#endif
+
+ return (exx[2] & (1 << 23)) != 0; /* POPCNT */
+}
+
+#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
/*
* Does CPUID say there's support for XSAVE instructions?
@@ -94,7 +142,7 @@ avx512_popcnt_available(void)
* Returns true if the CPU supports the instructions required for the AVX-512
* pg_popcount() implementation.
*/
-bool
+static bool
pg_popcount_avx512_available(void)
{
return xsave_available() &&
@@ -102,12 +150,77 @@ pg_popcount_avx512_available(void)
avx512_popcnt_available();
}
+#endif /*
USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
+
+/*
+ * These functions get called on the first call to pg_popcount32 etc.
+ * They detect whether we can use the asm implementations, and replace
+ * the function pointers so that subsequent calls are routed directly to
+ * the chosen implementation.
+ */
+static inline void
+choose_popcount_functions(void)
+{
+ if (pg_popcount_available())
+ {
+ pg_popcount32 = pg_popcount32_fast;
+ pg_popcount64 = pg_popcount64_fast;
+ pg_popcount_optimized = pg_popcount_fast;
+ pg_popcount_masked_optimized = pg_popcount_masked_fast;
+ }
+ else
+ {
+ pg_popcount32 = pg_popcount32_slow;
+ pg_popcount64 = pg_popcount64_slow;
+ pg_popcount_optimized = pg_popcount_slow;
+ pg_popcount_masked_optimized = pg_popcount_masked_slow;
+ }
+
+#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
+ if (pg_popcount_avx512_available())
+ {
+ pg_popcount_optimized = pg_popcount_avx512;
+ pg_popcount_masked_optimized = pg_popcount_masked_avx512;
+ }
+#endif
+}
+
+static int
+pg_popcount32_choose(uint32 word)
+{
+ choose_popcount_functions();
+ return pg_popcount32(word);
+}
+
+static int
+pg_popcount64_choose(uint64 word)
+{
+ choose_popcount_functions();
+ return pg_popcount64(word);
+}
+
+static uint64
+pg_popcount_choose(const char *buf, int bytes)
+{
+ choose_popcount_functions();
+ return pg_popcount_optimized(buf, bytes);
+}
+
+static uint64
+pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask)
+{
+ choose_popcount_functions();
+ return pg_popcount_masked(buf, bytes, mask);
+}
+
+#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
+
/*
* pg_popcount_avx512
* Returns the number of 1-bits in buf
*/
pg_attribute_target("avx512vpopcntdq,avx512bw")
-uint64
+static uint64
pg_popcount_avx512(const char *buf, int bytes)
{
__m512i val,
@@ -163,7 +276,7 @@ pg_popcount_avx512(const char *buf, int bytes)
* Returns the number of 1-bits in buf after applying the mask to
each byte
*/
pg_attribute_target("avx512vpopcntdq,avx512bw")
-uint64
+static uint64
pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask)
{
__m512i val,
@@ -219,5 +332,136 @@ pg_popcount_masked_avx512(const char *buf, int bytes,
bits8 mask)
return _mm512_reduce_add_epi64(accum);
}
-#endif /* TRY_POPCNT_X86_64 */
#endif /*
USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
+
+/*
+ * pg_popcount32_fast
+ * Return the number of 1 bits set in word
+ */
+static inline int
+pg_popcount32_fast(uint32 word)
+{
+#ifdef _MSC_VER
+ return __popcnt(word);
+#else
+ uint32 res;
+
+__asm__ __volatile__(" popcntl %1,%0\n":"=q"(res):"rm"(word):"cc");
+ return (int) res;
+#endif
+}
+
+/*
+ * pg_popcount64_fast
+ * Return the number of 1 bits set in word
+ */
+static inline int
+pg_popcount64_fast(uint64 word)
+{
+#ifdef _MSC_VER
+ return __popcnt64(word);
+#else
+ uint64 res;
+
+__asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc");
+ return (int) res;
+#endif
+}
+
+/*
+ * pg_popcount_fast
+ * Returns the number of 1-bits in buf
+ */
+static uint64
+pg_popcount_fast(const char *buf, int bytes)
+{
+ uint64 popcnt = 0;
+
+#if SIZEOF_VOID_P >= 8
+ /* Process in 64-bit chunks if the buffer is aligned. */
+ if (buf == (const char *) TYPEALIGN(8, buf))
+ {
+ const uint64 *words = (const uint64 *) buf;
+
+ while (bytes >= 8)
+ {
+ popcnt += pg_popcount64_fast(*words++);
+ bytes -= 8;
+ }
+
+ buf = (const char *) words;
+ }
+#else
+ /* Process in 32-bit chunks if the buffer is aligned. */
+ if (buf == (const char *) TYPEALIGN(4, buf))
+ {
+ const uint32 *words = (const uint32 *) buf;
+
+ while (bytes >= 4)
+ {
+ popcnt += pg_popcount32_fast(*words++);
+ bytes -= 4;
+ }
+
+ buf = (const char *) words;
+ }
+#endif
+
+ /* Process any remaining bytes */
+ while (bytes--)
+ popcnt += pg_number_of_ones[(unsigned char) *buf++];
+
+ return popcnt;
+}
+
+/*
+ * pg_popcount_masked_fast
+ * Returns the number of 1-bits in buf after applying the mask to
each byte
+ */
+static uint64
+pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask)
+{
+ uint64 popcnt = 0;
+
+#if SIZEOF_VOID_P >= 8
+ /* Process in 64-bit chunks if the buffer is aligned */
+ uint64 maskv = ~UINT64CONST(0) / 0xFF * mask;
+
+ if (buf == (const char *) TYPEALIGN(8, buf))
+ {
+ const uint64 *words = (const uint64 *) buf;
+
+ while (bytes >= 8)
+ {
+ popcnt += pg_popcount64_fast(*words++ & maskv);
+ bytes -= 8;
+ }
+
+ buf = (const char *) words;
+ }
+#else
+ /* Process in 32-bit chunks if the buffer is aligned. */
+ uint32 maskv = ~((uint32) 0) / 0xFF * mask;
+
+ if (buf == (const char *) TYPEALIGN(4, buf))
+ {
+ const uint32 *words = (const uint32 *) buf;
+
+ while (bytes >= 4)
+ {
+ popcnt += pg_popcount32_fast(*words++ & maskv);
+ bytes -= 4;
+ }
+
+ buf = (const char *) words;
+ }
+#endif
+
+ /* Process any remaining bytes */
+ while (bytes--)
+ popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask];
+
+ return popcnt;
+}
+
+#endif /* TRY_POPCNT_X86_64 */
--
2.50.1 (Apple Git-155)
>From 831fa399e5907e084512b7761bf9b624bbee5508 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <[email protected]>
Date: Thu, 15 Jan 2026 11:00:36 -0600
Subject: [PATCH v2 3/4] Rename "fast" and "slow" popcount functions.
Since we now have several implementations of the popcount functions,
let's give them more descriptive names. This commit replaces
"slow" with "portable" and "fast" with "sse42". While the POPCNT
instruction is technically not part of SSE4.2, this naming scheme
is close enough in practice and is arguably easier to understand
than using "popcnt" instead.
---
src/include/port/pg_bitutils.h | 8 ++---
src/port/pg_bitutils.c | 38 +++++++++++------------
src/port/pg_popcount_x86.c | 56 ++++++++++++++++++----------------
3 files changed, 53 insertions(+), 49 deletions(-)
diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index c9c508d4ba3..89b117d9817 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -307,10 +307,10 @@ pg_ceil_log2_64(uint64 num)
#define POPCNT_AARCH64 1
#endif
-extern int pg_popcount32_slow(uint32 word);
-extern int pg_popcount64_slow(uint64 word);
-extern uint64 pg_popcount_slow(const char *buf, int bytes);
-extern uint64 pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask);
+extern int pg_popcount32_portable(uint32 word);
+extern int pg_popcount64_portable(uint64 word);
+extern uint64 pg_popcount_portable(const char *buf, int bytes);
+extern uint64 pg_popcount_masked_portable(const char *buf, int bytes, bits8
mask);
#ifdef TRY_POPCNT_X86_64
/*
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index 9f9f90ddd4d..170aeef7548 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -97,11 +97,11 @@ const uint8 pg_number_of_ones[256] = {
};
/*
- * pg_popcount32_slow
+ * pg_popcount32_portable
* Return the number of 1 bits set in word
*/
int
-pg_popcount32_slow(uint32 word)
+pg_popcount32_portable(uint32 word)
{
#ifdef HAVE__BUILTIN_POPCOUNT
return __builtin_popcount(word);
@@ -119,11 +119,11 @@ pg_popcount32_slow(uint32 word)
}
/*
- * pg_popcount64_slow
+ * pg_popcount64_portable
* Return the number of 1 bits set in word
*/
int
-pg_popcount64_slow(uint64 word)
+pg_popcount64_portable(uint64 word)
{
#ifdef HAVE__BUILTIN_POPCOUNT
#if SIZEOF_LONG == 8
@@ -147,11 +147,11 @@ pg_popcount64_slow(uint64 word)
}
/*
- * pg_popcount_slow
+ * pg_popcount_portable
* Returns the number of 1-bits in buf
*/
uint64
-pg_popcount_slow(const char *buf, int bytes)
+pg_popcount_portable(const char *buf, int bytes)
{
uint64 popcnt = 0;
@@ -163,7 +163,7 @@ pg_popcount_slow(const char *buf, int bytes)
while (bytes >= 8)
{
- popcnt += pg_popcount64_slow(*words++);
+ popcnt += pg_popcount64_portable(*words++);
bytes -= 8;
}
@@ -177,7 +177,7 @@ pg_popcount_slow(const char *buf, int bytes)
while (bytes >= 4)
{
- popcnt += pg_popcount32_slow(*words++);
+ popcnt += pg_popcount32_portable(*words++);
bytes -= 4;
}
@@ -193,11 +193,11 @@ pg_popcount_slow(const char *buf, int bytes)
}
/*
- * pg_popcount_masked_slow
+ * pg_popcount_masked_portable
* Returns the number of 1-bits in buf after applying the mask to
each byte
*/
uint64
-pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask)
+pg_popcount_masked_portable(const char *buf, int bytes, bits8 mask)
{
uint64 popcnt = 0;
@@ -211,7 +211,7 @@ pg_popcount_masked_slow(const char *buf, int bytes, bits8
mask)
while (bytes >= 8)
{
- popcnt += pg_popcount64_slow(*words++ & maskv);
+ popcnt += pg_popcount64_portable(*words++ & maskv);
bytes -= 8;
}
@@ -227,7 +227,7 @@ pg_popcount_masked_slow(const char *buf, int bytes, bits8
mask)
while (bytes >= 4)
{
- popcnt += pg_popcount32_slow(*words++ & maskv);
+ popcnt += pg_popcount32_portable(*words++ & maskv);
bytes -= 4;
}
@@ -246,20 +246,20 @@ pg_popcount_masked_slow(const char *buf, int bytes, bits8
mask)
/*
* When special CPU instructions are not available, there's no point in using
- * function pointers to vary the implementation between the fast and slow
- * method. We instead just make these actual external functions. The compiler
- * should be able to inline the slow versions here.
+ * function pointers to vary the implementation. We instead just make these
+ * actual external functions. The compiler should be able to inline the
+ * portable versions here.
*/
int
pg_popcount32(uint32 word)
{
- return pg_popcount32_slow(word);
+ return pg_popcount32_portable(word);
}
int
pg_popcount64(uint64 word)
{
- return pg_popcount64_slow(word);
+ return pg_popcount64_portable(word);
}
/*
@@ -269,7 +269,7 @@ pg_popcount64(uint64 word)
uint64
pg_popcount_optimized(const char *buf, int bytes)
{
- return pg_popcount_slow(buf, bytes);
+ return pg_popcount_portable(buf, bytes);
}
/*
@@ -279,7 +279,7 @@ pg_popcount_optimized(const char *buf, int bytes)
uint64
pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask)
{
- return pg_popcount_masked_slow(buf, bytes, mask);
+ return pg_popcount_masked_portable(buf, bytes, mask);
}
#endif /* ! TRY_POPCNT_X86_64
&& ! POPCNT_AARCH64 */
diff --git a/src/port/pg_popcount_x86.c b/src/port/pg_popcount_x86.c
index f8643642613..f88511dcc7c 100644
--- a/src/port/pg_popcount_x86.c
+++ b/src/port/pg_popcount_x86.c
@@ -31,11 +31,15 @@
/*
* The SSE4.2 versions are built regardless of whether we are building the
* AVX-512 versions.
+ *
+ * XXX Technically, POPCNT is not part of SSE4.2, and isn't even a vector
+ * operation, but in practice this is close enough, and "sse42" seems easier to
+ * follow than "popcnt" for these names.
*/
-static inline int pg_popcount32_fast(uint32 word);
-static inline int pg_popcount64_fast(uint64 word);
-static uint64 pg_popcount_fast(const char *buf, int bytes);
-static uint64 pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask);
+static inline int pg_popcount32_sse42(uint32 word);
+static inline int pg_popcount64_sse42(uint64 word);
+static uint64 pg_popcount_sse42(const char *buf, int bytes);
+static uint64 pg_popcount_masked_sse42(const char *buf, int bytes, bits8 mask);
/*
* These are the AVX-512 implementations of the popcount functions.
@@ -64,7 +68,7 @@ uint64 (*pg_popcount_masked_optimized) (const
char *buf, int bytes, bits8 mask)
* Return true if CPUID indicates that the POPCNT instruction is available.
*/
static bool
-pg_popcount_available(void)
+pg_popcount_sse42_available(void)
{
unsigned int exx[4] = {0, 0, 0, 0};
@@ -161,19 +165,19 @@ pg_popcount_avx512_available(void)
static inline void
choose_popcount_functions(void)
{
- if (pg_popcount_available())
+ if (pg_popcount_sse42_available())
{
- pg_popcount32 = pg_popcount32_fast;
- pg_popcount64 = pg_popcount64_fast;
- pg_popcount_optimized = pg_popcount_fast;
- pg_popcount_masked_optimized = pg_popcount_masked_fast;
+ pg_popcount32 = pg_popcount32_sse42;
+ pg_popcount64 = pg_popcount64_sse42;
+ pg_popcount_optimized = pg_popcount_sse42;
+ pg_popcount_masked_optimized = pg_popcount_masked_sse42;
}
else
{
- pg_popcount32 = pg_popcount32_slow;
- pg_popcount64 = pg_popcount64_slow;
- pg_popcount_optimized = pg_popcount_slow;
- pg_popcount_masked_optimized = pg_popcount_masked_slow;
+ pg_popcount32 = pg_popcount32_portable;
+ pg_popcount64 = pg_popcount64_portable;
+ pg_popcount_optimized = pg_popcount_portable;
+ pg_popcount_masked_optimized = pg_popcount_masked_portable;
}
#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
@@ -335,11 +339,11 @@ pg_popcount_masked_avx512(const char *buf, int bytes,
bits8 mask)
#endif /*
USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
/*
- * pg_popcount32_fast
+ * pg_popcount32_sse42
* Return the number of 1 bits set in word
*/
static inline int
-pg_popcount32_fast(uint32 word)
+pg_popcount32_sse42(uint32 word)
{
#ifdef _MSC_VER
return __popcnt(word);
@@ -352,11 +356,11 @@ __asm__ __volatile__(" popcntl
%1,%0\n":"=q"(res):"rm"(word):"cc");
}
/*
- * pg_popcount64_fast
+ * pg_popcount64_sse42
* Return the number of 1 bits set in word
*/
static inline int
-pg_popcount64_fast(uint64 word)
+pg_popcount64_sse42(uint64 word)
{
#ifdef _MSC_VER
return __popcnt64(word);
@@ -369,11 +373,11 @@ __asm__ __volatile__(" popcntq
%1,%0\n":"=q"(res):"rm"(word):"cc");
}
/*
- * pg_popcount_fast
+ * pg_popcount_sse42
* Returns the number of 1-bits in buf
*/
static uint64
-pg_popcount_fast(const char *buf, int bytes)
+pg_popcount_sse42(const char *buf, int bytes)
{
uint64 popcnt = 0;
@@ -385,7 +389,7 @@ pg_popcount_fast(const char *buf, int bytes)
while (bytes >= 8)
{
- popcnt += pg_popcount64_fast(*words++);
+ popcnt += pg_popcount64_sse42(*words++);
bytes -= 8;
}
@@ -399,7 +403,7 @@ pg_popcount_fast(const char *buf, int bytes)
while (bytes >= 4)
{
- popcnt += pg_popcount32_fast(*words++);
+ popcnt += pg_popcount32_sse42(*words++);
bytes -= 4;
}
@@ -415,11 +419,11 @@ pg_popcount_fast(const char *buf, int bytes)
}
/*
- * pg_popcount_masked_fast
+ * pg_popcount_masked_sse42
* Returns the number of 1-bits in buf after applying the mask to
each byte
*/
static uint64
-pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask)
+pg_popcount_masked_sse42(const char *buf, int bytes, bits8 mask)
{
uint64 popcnt = 0;
@@ -433,7 +437,7 @@ pg_popcount_masked_fast(const char *buf, int bytes, bits8
mask)
while (bytes >= 8)
{
- popcnt += pg_popcount64_fast(*words++ & maskv);
+ popcnt += pg_popcount64_sse42(*words++ & maskv);
bytes -= 8;
}
@@ -449,7 +453,7 @@ pg_popcount_masked_fast(const char *buf, int bytes, bits8
mask)
while (bytes >= 4)
{
- popcnt += pg_popcount32_fast(*words++ & maskv);
+ popcnt += pg_popcount32_sse42(*words++ & maskv);
bytes -= 4;
}
--
2.50.1 (Apple Git-155)
>From 71cc03643abbc568ce48e8adc3405f0bc994a9dd Mon Sep 17 00:00:00 2001
From: Nathan Bossart <[email protected]>
Date: Thu, 15 Jan 2026 12:26:31 -0600
Subject: [PATCH v2 4/4] Refactor some SIMD and popcount macros.
---
meson.build | 4 +++-
src/include/c.h | 19 +++++++++++++++++
src/include/port/pg_bitutils.h | 37 +++-------------------------------
src/include/port/simd.h | 17 ++--------------
src/port/pg_bitutils.c | 4 ++--
src/port/pg_popcount_aarch64.c | 8 ++++----
src/port/pg_popcount_x86.c | 8 ++++----
7 files changed, 37 insertions(+), 60 deletions(-)
diff --git a/meson.build b/meson.build
index eedd40b8137..299817cb83e 100644
--- a/meson.build
+++ b/meson.build
@@ -2609,7 +2609,9 @@ endif
if host_cpu == 'x86_64'
- if cc.compiles('''
+ if cc.get_id() == 'msvc'
+ cdata.set('HAVE_X86_64_POPCNTQ', 1)
+ elif cc.compiles('''
void main(void)
{
long long x = 1; long long r;
diff --git a/src/include/c.h b/src/include/c.h
index 7136102e5ff..b6d53073fad 100644
--- a/src/include/c.h
+++ b/src/include/c.h
@@ -1230,6 +1230,25 @@ typedef struct PGAlignedXLogBlock
((underlying_type) (expr))
#endif
+/*
+ * SSE2 instructions are part of the spec for the 64-bit x86 ISA. We assume
+ * that compilers targeting this architecture understand SSE2 intrinsics.
+ */
+#if (defined(__x86_64__) || defined(_M_AMD64))
+#define USE_SSE2
+
+/*
+ * We use the Neon instructions if the compiler provides access to them (as
+ * indicated by __ARM_NEON) and we are on aarch64. While Neon support is
+ * technically optional for aarch64, it appears that all available 64-bit
+ * hardware does have it. Neon exists in some 32-bit hardware too, but we
+ * could not realistically use it there without a run-time check, which seems
+ * not worth the trouble for now.
+ */
+#elif defined(__aarch64__) && defined(__ARM_NEON)
+#define USE_NEON
+#endif
+
/* ----------------------------------------------------------------
* Section 9: system-specific hacks
*
diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index 89b117d9817..35761f509ec 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -276,43 +276,12 @@ pg_ceil_log2_64(uint64 num)
return pg_leftmost_one_pos64(num - 1) + 1;
}
-/*
- * With MSVC on x86_64 builds, try using native popcnt instructions via the
- * __popcnt and __popcnt64 intrinsics. These don't work the same as GCC's
- * __builtin_popcount* intrinsic functions as they always emit popcnt
- * instructions.
- */
-#if defined(_MSC_VER) && defined(_M_AMD64)
-#define HAVE_X86_64_POPCNTQ
-#endif
-
-/*
- * On x86_64, we can use the hardware popcount instruction, but only if
- * we can verify that the CPU supports it via the cpuid instruction.
- *
- * Otherwise, we fall back to a hand-rolled implementation.
- */
-#ifdef HAVE_X86_64_POPCNTQ
-#if defined(HAVE__GET_CPUID) || defined(HAVE__CPUID)
-#define TRY_POPCNT_X86_64 1
-#endif
-#endif
-
-/*
- * On AArch64, we can use Neon instructions if the compiler provides access to
- * them (as indicated by __ARM_NEON). As in simd.h, we assume that all
- * available 64-bit hardware has Neon support.
- */
-#if defined(__aarch64__) && defined(__ARM_NEON)
-#define POPCNT_AARCH64 1
-#endif
-
extern int pg_popcount32_portable(uint32 word);
extern int pg_popcount64_portable(uint64 word);
extern uint64 pg_popcount_portable(const char *buf, int bytes);
extern uint64 pg_popcount_masked_portable(const char *buf, int bytes, bits8 mask);
-#ifdef TRY_POPCNT_X86_64
+#ifdef HAVE_X86_64_POPCNTQ
/*
* Attempt to use SSE4.2 or AVX-512 instructions, but perform a runtime check
* first.
@@ -322,7 +291,7 @@ extern PGDLLIMPORT int (*pg_popcount64) (uint64 word);
extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int bytes);
extern PGDLLIMPORT uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask);
-#elif POPCNT_AARCH64
+#elif defined(USE_NEON)
/* Use the Neon version of pg_popcount{32,64} without function pointer. */
extern int pg_popcount32(uint32 word);
extern int pg_popcount64(uint64 word);
@@ -346,7 +315,7 @@ extern int pg_popcount64(uint64 word);
extern uint64 pg_popcount_optimized(const char *buf, int bytes);
extern uint64 pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask);
-#endif /* TRY_POPCNT_X86_64 */
+#endif
/*
* Returns the number of 1-bits in buf.
diff --git a/src/include/port/simd.h b/src/include/port/simd.h
index 33202a4b0e2..50615aec7f4 100644
--- a/src/include/port/simd.h
+++ b/src/include/port/simd.h
@@ -18,32 +18,19 @@
#ifndef SIMD_H
#define SIMD_H
-#if (defined(__x86_64__) || defined(_M_AMD64))
+#if defined(USE_SSE2)
/*
- * SSE2 instructions are part of the spec for the 64-bit x86 ISA. We assume
- * that compilers targeting this architecture understand SSE2 intrinsics.
- *
* We use emmintrin.h rather than the comprehensive header immintrin.h in
* order to exclude extensions beyond SSE2. This is because MSVC, at least,
* will allow the use of intrinsics that haven't been enabled at compile
* time.
*/
#include <emmintrin.h>
-#define USE_SSE2
typedef __m128i Vector8;
typedef __m128i Vector32;
-#elif defined(__aarch64__) && defined(__ARM_NEON)
-/*
- * We use the Neon instructions if the compiler provides access to them (as
- * indicated by __ARM_NEON) and we are on aarch64. While Neon support is
- * technically optional for aarch64, it appears that all available 64-bit
- * hardware does have it. Neon exists in some 32-bit hardware too, but we
- * could not realistically use it there without a run-time check, which seems
- * not worth the trouble for now.
- */
+#elif defined(USE_NEON)
#include <arm_neon.h>
-#define USE_NEON
typedef uint8x16_t Vector8;
typedef uint32x4_t Vector32;
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index 170aeef7548..ffda75825e5 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -242,7 +242,7 @@ pg_popcount_masked_portable(const char *buf, int bytes, bits8 mask)
return popcnt;
}
-#if !defined(TRY_POPCNT_X86_64) && !defined(POPCNT_AARCH64)
+#if !defined(HAVE_X86_64_POPCNTQ) && !defined(USE_NEON)
/*
* When special CPU instructions are not available, there's no point in using
@@ -282,4 +282,4 @@ pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask)
return pg_popcount_masked_portable(buf, bytes, mask);
}
-#endif /* ! TRY_POPCNT_X86_64 && ! POPCNT_AARCH64 */
+#endif /* ! HAVE_X86_64_POPCNTQ && ! USE_NEON */
diff --git a/src/port/pg_popcount_aarch64.c b/src/port/pg_popcount_aarch64.c
index cda73cf6088..2184854dbf7 100644
--- a/src/port/pg_popcount_aarch64.c
+++ b/src/port/pg_popcount_aarch64.c
@@ -12,9 +12,7 @@
*/
#include "c.h"
-#include "port/pg_bitutils.h"
-
-#ifdef POPCNT_AARCH64
+#ifdef USE_NEON
#include <arm_neon.h>
@@ -30,6 +28,8 @@
#endif
#endif
+#include "port/pg_bitutils.h"
+
/*
* The Neon versions are built regardless of whether we are building the SVE
* versions.
@@ -478,4 +478,4 @@ pg_popcount_masked_neon(const char *buf, int bytes, bits8 mask)
return popcnt;
}
-#endif /* POPCNT_AARCH64 */
+#endif /* USE_NEON */
diff --git a/src/port/pg_popcount_x86.c b/src/port/pg_popcount_x86.c
index f88511dcc7c..b17d90da340 100644
--- a/src/port/pg_popcount_x86.c
+++ b/src/port/pg_popcount_x86.c
@@ -12,9 +12,7 @@
*/
#include "c.h"
-#include "port/pg_bitutils.h"
-
-#ifdef TRY_POPCNT_X86_64
+#ifdef HAVE_X86_64_POPCNTQ
#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
#include <cpuid.h>
@@ -28,6 +26,8 @@
#include <intrin.h>
#endif
+#include "port/pg_bitutils.h"
+
/*
* The SSE4.2 versions are built regardless of whether we are building the
* AVX-512 versions.
@@ -468,4 +468,4 @@ pg_popcount_masked_sse42(const char *buf, int bytes, bits8 mask)
return popcnt;
}
-#endif /* TRY_POPCNT_X86_64 */
+#endif /* HAVE_X86_64_POPCNTQ */
--
2.50.1 (Apple Git-155)