On Thu, Jan 22, 2026 at 11:50:38AM -0600, Nathan Bossart wrote: > On Thu, Jan 22, 2026 at 04:50:26PM +0700, John Naylor wrote: >> 1) Nowadays, the only global call sites of the word-sized functions >> are select_best_grantor() and in bitmapsets. The latter calls the >> word-sized functions in a loop (could be just one word). It may be >> more efficient to calculate the size in bytes and call pg_popcount(). > > Yeah, these seem like obvious places to use pg_popcount(). Note that > bms_member_index() does a final popcount on a masked version of the last > word. We could swap that with pg_popcount(), too, but it might be slower > than just calling the word-sized function. However, it could be hard to > tell the difference, as we'd be trading a function or function pointer call > with an inlined loop over pg_number_of_ones. And even if it is slower, I'm > not sure it matters all that much in the grand scheme of things.
I added a 0003 that swaps that final popcount with pg_popcount(). >> Then we could get rid of all the pointer indirection for the >> word-sized functions. > > Do you mean that we'd just keep the portable ones around? I see some code > in pgvector that might be negatively impacted by that, but if I understand > correctly it would require an unusual setup. I added a 0004 that removes the architecture-specific word-sized functions. -- nathan
>From 4fa180bf4f9e7dfddbe8ca6f2abdf14b8cb36978 Mon Sep 17 00:00:00 2001 From: Nathan Bossart <[email protected]> Date: Thu, 22 Jan 2026 11:16:09 -0600 Subject: [PATCH v4 1/4] Make use of pg_popcount() in more places. --- src/backend/nodes/bitmapset.c | 27 ++++----------------------- src/include/lib/radixtree.h | 4 ++-- 2 files changed, 6 insertions(+), 25 deletions(-) diff --git a/src/backend/nodes/bitmapset.c b/src/backend/nodes/bitmapset.c index a4765876c31..23c91fdb6c9 100644 --- a/src/backend/nodes/bitmapset.c +++ b/src/backend/nodes/bitmapset.c @@ -553,14 +553,8 @@ bms_member_index(Bitmapset *a, int x) bitnum = BITNUM(x); /* count bits in preceding words */ - for (int i = 0; i < wordnum; i++) - { - bitmapword w = a->words[i]; - - /* No need to count the bits in a zero word */ - if (w != 0) - result += bmw_popcount(w); - } + result += pg_popcount((const char *) a->words, + wordnum * sizeof(bitmapword)); /* * Now add bits of the last word, but only those before the item. We can @@ -749,26 +743,13 @@ bms_get_singleton_member(const Bitmapset *a, int *member) int bms_num_members(const Bitmapset *a) { - int result = 0; - int nwords; - int wordnum; - Assert(bms_is_valid_set(a)); if (a == NULL) return 0; - nwords = a->nwords; - wordnum = 0; - do - { - bitmapword w = a->words[wordnum]; - - /* No need to count the bits in a zero word */ - if (w != 0) - result += bmw_popcount(w); - } while (++wordnum < nwords); - return result; + return pg_popcount((const char *) a->words, + a->nwords * sizeof(bitmapword)); } /* diff --git a/src/include/lib/radixtree.h b/src/include/lib/radixtree.h index b223ce10a2d..1425654a67c 100644 --- a/src/include/lib/radixtree.h +++ b/src/include/lib/radixtree.h @@ -2725,8 +2725,8 @@ RT_VERIFY_NODE(RT_NODE * node) /* RT_DUMP_NODE(node); */ - for (int i = 0; i < RT_BM_IDX(RT_NODE_MAX_SLOTS); i++) - cnt += bmw_popcount(n256->isset[i]); + cnt += pg_popcount((const char *) n256->isset, + RT_NODE_MAX_SLOTS / BITS_PER_BYTE); /* * Check if the number of used chunk matches, accounting for -- 2.50.1 (Apple Git-155)
>From 7340452ec091af478749925e070ccc1b5f420eec Mon Sep 17 00:00:00 2001 From: Nathan Bossart <[email protected]> Date: Thu, 22 Jan 2026 11:33:56 -0600 Subject: [PATCH v4 2/4] Remove unnecessary 32-bit optimizations and alignment checks. --- src/port/pg_popcount_x86.c | 63 ++++++++------------------------------ 1 file changed, 13 insertions(+), 50 deletions(-) diff --git a/src/port/pg_popcount_x86.c b/src/port/pg_popcount_x86.c index 245f0167d00..0e98f532552 100644 --- a/src/port/pg_popcount_x86.c +++ b/src/port/pg_popcount_x86.c @@ -382,33 +382,16 @@ pg_popcount_sse42(const char *buf, int bytes) uint64 popcnt = 0; #if SIZEOF_VOID_P >= 8 - /* Process in 64-bit chunks if the buffer is aligned. */ - if (buf == (const char *) TYPEALIGN(8, buf)) - { - const uint64 *words = (const uint64 *) buf; - - while (bytes >= 8) - { - popcnt += pg_popcount64_sse42(*words++); - bytes -= 8; - } + /* Process in 64-bit chunks. */ + const uint64 *words = (const uint64 *) buf; - buf = (const char *) words; - } -#else - /* Process in 32-bit chunks if the buffer is aligned. */ - if (buf == (const char *) TYPEALIGN(4, buf)) + while (bytes >= 8) { - const uint32 *words = (const uint32 *) buf; - - while (bytes >= 4) - { - popcnt += pg_popcount32_sse42(*words++); - bytes -= 4; - } - - buf = (const char *) words; + popcnt += pg_popcount64_sse42(*words++); + bytes -= 8; } + + buf = (const char *) words; #endif /* Process any remaining bytes */ @@ -428,37 +411,17 @@ pg_popcount_masked_sse42(const char *buf, int bytes, bits8 mask) uint64 popcnt = 0; #if SIZEOF_VOID_P >= 8 - /* Process in 64-bit chunks if the buffer is aligned */ + /* Process in 64-bit chunks. */ uint64 maskv = ~UINT64CONST(0) / 0xFF * mask; + const uint64 *words = (const uint64 *) buf; - if (buf == (const char *) TYPEALIGN(8, buf)) + while (bytes >= 8) { - const uint64 *words = (const uint64 *) buf; - - while (bytes >= 8) - { - popcnt += pg_popcount64_sse42(*words++ & maskv); - bytes -= 8; - } - - buf = (const char *) words; + popcnt += pg_popcount64_sse42(*words++ & maskv); + bytes -= 8; } -#else - /* Process in 32-bit chunks if the buffer is aligned. */ - uint32 maskv = ~((uint32) 0) / 0xFF * mask; - if (buf == (const char *) TYPEALIGN(4, buf)) - { - const uint32 *words = (const uint32 *) buf; - - while (bytes >= 4) - { - popcnt += pg_popcount32_sse42(*words++ & maskv); - bytes -= 4; - } - - buf = (const char *) words; - } + buf = (const char *) words; #endif /* Process any remaining bytes */ -- 2.50.1 (Apple Git-155)
>From 60ed9ddad06775079a981360d9f06ff555ee073c Mon Sep 17 00:00:00 2001 From: Nathan Bossart <[email protected]> Date: Fri, 23 Jan 2026 17:07:32 -0600 Subject: [PATCH v4 3/4] Remove bmw_popcount(). --- src/backend/nodes/bitmapset.c | 4 +++- src/include/nodes/bitmapset.h | 2 -- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/backend/nodes/bitmapset.c b/src/backend/nodes/bitmapset.c index 23c91fdb6c9..c057058a974 100644 --- a/src/backend/nodes/bitmapset.c +++ b/src/backend/nodes/bitmapset.c @@ -542,6 +542,7 @@ bms_member_index(Bitmapset *a, int x) int wordnum; int result = 0; bitmapword mask; + bitmapword last; Assert(bms_is_valid_set(a)); @@ -563,7 +564,8 @@ bms_member_index(Bitmapset *a, int x) * itself, so we subtract 1. */ mask = ((bitmapword) 1 << bitnum) - 1; - result += bmw_popcount(a->words[wordnum] & mask); + last = a->words[wordnum] & mask; + result += pg_popcount((const char *) &last, sizeof(bitmapword)); return result; } diff --git a/src/include/nodes/bitmapset.h b/src/include/nodes/bitmapset.h index 067ec72e99b..20938cfd2a7 100644 --- a/src/include/nodes/bitmapset.h +++ b/src/include/nodes/bitmapset.h @@ -77,11 +77,9 @@ typedef enum #if BITS_PER_BITMAPWORD == 32 #define bmw_leftmost_one_pos(w) pg_leftmost_one_pos32(w) #define bmw_rightmost_one_pos(w) pg_rightmost_one_pos32(w) -#define bmw_popcount(w) pg_popcount32(w) #elif BITS_PER_BITMAPWORD == 64 #define bmw_leftmost_one_pos(w) pg_leftmost_one_pos64(w) #define bmw_rightmost_one_pos(w) pg_rightmost_one_pos64(w) -#define bmw_popcount(w) pg_popcount64(w) #else #error "invalid BITS_PER_BITMAPWORD" #endif -- 2.50.1 (Apple Git-155)
>From dd4ce5c39e7b1e9087c8990067bc31a1dd30fbe6 Mon Sep 17 00:00:00 2001 From: Nathan Bossart <[email protected]> Date: Fri, 23 Jan 2026 17:31:20 -0600 Subject: [PATCH v4 4/4] Remove specialized word-length popcount implementations. --- src/include/port/pg_bitutils.h | 29 ++++------------------- src/port/pg_bitutils.c | 27 +++++++-------------- src/port/pg_popcount_aarch64.c | 22 +++++------------ src/port/pg_popcount_x86.c | 43 +--------------------------------- 4 files changed, 19 insertions(+), 102 deletions(-) diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h index 35761f509ec..101815daa98 100644 --- a/src/include/port/pg_bitutils.h +++ b/src/include/port/pg_bitutils.h @@ -276,42 +276,21 @@ pg_ceil_log2_64(uint64 num) return pg_leftmost_one_pos64(num - 1) + 1; } -extern int pg_popcount32_portable(uint32 word); -extern int pg_popcount64_portable(uint64 word); +extern int pg_popcount32(uint32 word); +extern int pg_popcount64(uint64 word); extern uint64 pg_popcount_portable(const char *buf, int bytes); extern uint64 pg_popcount_masked_portable(const char *buf, int bytes, bits8 mask); -#ifdef HAVE_X86_64_POPCNTQ +#if defined(HAVE_X86_64_POPCNTQ) || defined(USE_SVE_POPCNT_WITH_RUNTIME_CHECK) /* - * Attempt to use SSE4.2 or AVX-512 instructions, but perform a runtime check + * Attempt to use specialized CPU instructions, but perform a runtime check * first. */ -extern PGDLLIMPORT int (*pg_popcount32) (uint32 word); -extern PGDLLIMPORT int (*pg_popcount64) (uint64 word); -extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int bytes); -extern PGDLLIMPORT uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask); - -#elif defined(USE_NEON) -/* Use the Neon version of pg_popcount{32,64} without function pointer. */ -extern int pg_popcount32(uint32 word); -extern int pg_popcount64(uint64 word); - -/* - * We can try to use an SVE-optimized pg_popcount() on some systems For that, - * we do use a function pointer. - */ -#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int bytes); extern PGDLLIMPORT uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask); -#else -extern uint64 pg_popcount_optimized(const char *buf, int bytes); -extern uint64 pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask); -#endif #else /* Use a portable implementation -- no need for a function pointer. */ -extern int pg_popcount32(uint32 word); -extern int pg_popcount64(uint64 word); extern uint64 pg_popcount_optimized(const char *buf, int bytes); extern uint64 pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask); diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c index ffda75825e5..037b313a46b 100644 --- a/src/port/pg_bitutils.c +++ b/src/port/pg_bitutils.c @@ -97,11 +97,11 @@ const uint8 pg_number_of_ones[256] = { }; /* - * pg_popcount32_portable + * pg_popcount32 * Return the number of 1 bits set in word */ int -pg_popcount32_portable(uint32 word) +pg_popcount32(uint32 word) { #ifdef HAVE__BUILTIN_POPCOUNT return __builtin_popcount(word); @@ -119,11 +119,11 @@ pg_popcount32_portable(uint32 word) } /* - * pg_popcount64_portable + * pg_popcount64 * Return the number of 1 bits set in word */ int -pg_popcount64_portable(uint64 word) +pg_popcount64(uint64 word) { #ifdef HAVE__BUILTIN_POPCOUNT #if SIZEOF_LONG == 8 @@ -163,7 +163,7 @@ pg_popcount_portable(const char *buf, int bytes) while (bytes >= 8) { - popcnt += pg_popcount64_portable(*words++); + popcnt += pg_popcount64(*words++); bytes -= 8; } @@ -177,7 +177,7 @@ pg_popcount_portable(const char *buf, int bytes) while (bytes >= 4) { - popcnt += pg_popcount32_portable(*words++); + popcnt += pg_popcount32(*words++); bytes -= 4; } @@ -211,7 +211,7 @@ pg_popcount_masked_portable(const char *buf, int bytes, bits8 mask) while (bytes >= 8) { - popcnt += pg_popcount64_portable(*words++ & maskv); + popcnt += pg_popcount64(*words++ & maskv); bytes -= 8; } @@ -227,7 +227,7 @@ pg_popcount_masked_portable(const char *buf, int bytes, bits8 mask) while (bytes >= 4) { - popcnt += pg_popcount32_portable(*words++ & maskv); + popcnt += pg_popcount32(*words++ & maskv); bytes -= 4; } @@ -250,17 +250,6 @@ pg_popcount_masked_portable(const char *buf, int bytes, bits8 mask) * actual external functions. The compiler should be able to inline the * portable versions here. */ -int -pg_popcount32(uint32 word) -{ - return pg_popcount32_portable(word); -} - -int -pg_popcount64(uint64 word) -{ - return pg_popcount64_portable(word); -} /* * pg_popcount_optimized diff --git a/src/port/pg_popcount_aarch64.c b/src/port/pg_popcount_aarch64.c index ba57f2cd4bd..99fd24eb980 100644 --- a/src/port/pg_popcount_aarch64.c +++ b/src/port/pg_popcount_aarch64.c @@ -292,21 +292,11 @@ pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask) #endif /* ! USE_SVE_POPCNT_WITH_RUNTIME_CHECK */ /* - * pg_popcount32 - * Return number of 1 bits in word + * pg_popcount64_neon + * Return number of 1 bits in word */ -int -pg_popcount32(uint32 word) -{ - return pg_popcount64((uint64) word); -} - -/* - * pg_popcount64 - * Return number of 1 bits in word - */ -int -pg_popcount64(uint64 word) +static inline int +pg_popcount64_neon(uint64 word) { /* * For some compilers, __builtin_popcountl() already emits Neon @@ -383,7 +373,7 @@ pg_popcount_neon(const char *buf, int bytes) */ for (; bytes >= sizeof(uint64); bytes -= sizeof(uint64)) { - popcnt += pg_popcount64(*((const uint64 *) buf)); + popcnt += pg_popcount64_neon(*((const uint64 *) buf)); buf += sizeof(uint64); } @@ -465,7 +455,7 @@ pg_popcount_masked_neon(const char *buf, int bytes, bits8 mask) */ for (; bytes >= sizeof(uint64); bytes -= sizeof(uint64)) { - popcnt += pg_popcount64(*((const uint64 *) buf) & mask64); + popcnt += pg_popcount64_neon(*((const uint64 *) buf) & mask64); buf += sizeof(uint64); } diff --git a/src/port/pg_popcount_x86.c b/src/port/pg_popcount_x86.c index 0e98f532552..9fd8e18ed16 100644 --- a/src/port/pg_popcount_x86.c +++ b/src/port/pg_popcount_x86.c @@ -36,8 +36,6 @@ * operation, but in practice this is close enough, and "sse42" seems easier to * follow than "popcnt" for these names. */ -static inline int pg_popcount32_sse42(uint32 word); -static inline int pg_popcount64_sse42(uint64 word); static uint64 pg_popcount_sse42(const char *buf, int bytes); static uint64 pg_popcount_masked_sse42(const char *buf, int bytes, bits8 mask); @@ -55,12 +53,8 @@ static uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask); * what the current CPU supports) and then will call the pointer to fulfill the * caller's request. */ -static int pg_popcount32_choose(uint32 word); -static int pg_popcount64_choose(uint64 word); static uint64 pg_popcount_choose(const char *buf, int bytes); static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask); -int (*pg_popcount32) (uint32 word) = pg_popcount32_choose; -int (*pg_popcount64) (uint64 word) = pg_popcount64_choose; uint64 (*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose; uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose; @@ -157,7 +151,7 @@ pg_popcount_avx512_available(void) #endif /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */ /* - * These functions get called on the first call to pg_popcount32 etc. + * These functions get called on the first call to pg_popcount(), etc. * They detect whether we can use the asm implementations, and replace * the function pointers so that subsequent calls are routed directly to * the chosen implementation. @@ -167,15 +161,11 @@ choose_popcount_functions(void) { if (pg_popcount_sse42_available()) { - pg_popcount32 = pg_popcount32_sse42; - pg_popcount64 = pg_popcount64_sse42; pg_popcount_optimized = pg_popcount_sse42; pg_popcount_masked_optimized = pg_popcount_masked_sse42; } else { - pg_popcount32 = pg_popcount32_portable; - pg_popcount64 = pg_popcount64_portable; pg_popcount_optimized = pg_popcount_portable; pg_popcount_masked_optimized = pg_popcount_masked_portable; } @@ -189,20 +179,6 @@ choose_popcount_functions(void) #endif } -static int -pg_popcount32_choose(uint32 word) -{ - choose_popcount_functions(); - return pg_popcount32(word); -} - -static int -pg_popcount64_choose(uint64 word) -{ - choose_popcount_functions(); - return pg_popcount64(word); -} - static uint64 pg_popcount_choose(const char *buf, int bytes) { @@ -338,23 +314,6 @@ pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask) #endif /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */ -/* - * pg_popcount32_sse42 - * Return the number of 1 bits set in word - */ -static inline int -pg_popcount32_sse42(uint32 word) -{ -#ifdef _MSC_VER - return __popcnt(word); -#else - uint32 res; - -__asm__ __volatile__(" popcntl %1,%0\n":"=q"(res):"rm"(word):"cc"); - return (int) res; -#endif -} - /* * pg_popcount64_sse42 * Return the number of 1 bits set in word -- 2.50.1 (Apple Git-155)
