On Thu, Feb 12, 2026 at 11:58:37AM +0700, John Naylor wrote: > I have a better idea, but it depends on re-thinking feature detection > and compile-time vs run-time checks, and that's not quite ready to > share, so let's shelve this for now.
Sounds good. > Anyway, I think 0001 and 0002 are ready for commit. Committed. I've attached rebased versions of the remaining patches under consideration. -- nathan
>From e98478abc31b6da8662c4715a075166d60911f92 Mon Sep 17 00:00:00 2001 From: Nathan Bossart <[email protected]> Date: Fri, 6 Feb 2026 10:00:21 -0600 Subject: [PATCH v14 1/2] Remove uses of popcount builtins. This commit replaces the implementations of pg_popcount{32,64} with branchless ones in plain C. While these new implementations do not make use of more sophisticated population count instructions available on some CPUs, testing indicates they perform well, especially now that they are inlined. A follow-up commit will replace various loops over these functions with calls to pg_popcount(), leaving us little reason to worry about micro-optimizing them further. Since this commit removes the only uses of the popcount builtins, we can also remove the corresponding configuration checks. Suggested-by: John Naylor <[email protected]> Reviewed-by: John Naylor <[email protected]> Discussion: https://postgr.es/m/CANWCAZY7R%2Biy%2Br9YM_sySNydHzNqUirx1xk0tB3ej5HO62GdgQ%40mail.gmail.com --- configure | 38 ---------------------------- configure.ac | 1 - meson.build | 1 - src/include/pg_config.h.in | 3 --- src/include/port/pg_bitutils.h | 46 +++++++++++----------------------- src/port/pg_popcount_aarch64.c | 5 ---- 6 files changed, 14 insertions(+), 80 deletions(-) diff --git a/configure b/configure index a10a2c85c6a..8fe368b7201 100755 --- a/configure +++ b/configure @@ -15878,44 +15878,6 @@ cat >>confdefs.h <<_ACEOF #define HAVE__BUILTIN_CTZ 1 _ACEOF -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_popcount" >&5 -$as_echo_n "checking for __builtin_popcount... " >&6; } -if ${pgac_cv__builtin_popcount+:} false; then : - $as_echo_n "(cached) " >&6 -else - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ - -int -call__builtin_popcount(unsigned int x) -{ - return __builtin_popcount(x); -} -int -main () -{ - - ; - return 0; -} -_ACEOF -if ac_fn_c_try_link "$LINENO"; then : - pgac_cv__builtin_popcount=yes -else - pgac_cv__builtin_popcount=no -fi -rm -f core conftest.err conftest.$ac_objext \ - conftest$ac_exeext conftest.$ac_ext -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__builtin_popcount" >&5 -$as_echo "$pgac_cv__builtin_popcount" >&6; } -if test x"${pgac_cv__builtin_popcount}" = xyes ; then - -cat >>confdefs.h <<_ACEOF -#define HAVE__BUILTIN_POPCOUNT 1 -_ACEOF - fi # __builtin_frame_address may draw a diagnostic for non-constant argument, # so it needs a different test function. diff --git a/configure.ac b/configure.ac index 814e64a967e..f569b1c3f35 100644 --- a/configure.ac +++ b/configure.ac @@ -1852,7 +1852,6 @@ PGAC_CHECK_BUILTIN_FUNC([__builtin_bswap64], [long int x]) # We assume that we needn't test all widths of these explicitly: PGAC_CHECK_BUILTIN_FUNC([__builtin_clz], [unsigned int x]) PGAC_CHECK_BUILTIN_FUNC([__builtin_ctz], [unsigned int x]) -PGAC_CHECK_BUILTIN_FUNC([__builtin_popcount], [unsigned int x]) # __builtin_frame_address may draw a diagnostic for non-constant argument, # so it needs a different test function. PGAC_CHECK_BUILTIN_FUNC_PTR([__builtin_frame_address], [0]) diff --git a/meson.build b/meson.build index 96b3869df86..c89293cd80f 100644 --- a/meson.build +++ b/meson.build @@ -2004,7 +2004,6 @@ builtins = [ 'ctz', 'constant_p', 'frame_address', - 'popcount', 'unreachable', ] diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 339268dc8ef..2651b56ae4d 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -526,9 +526,6 @@ /* Define to 1 if your compiler understands __builtin_$op_overflow. */ #undef HAVE__BUILTIN_OP_OVERFLOW -/* Define to 1 if your compiler understands __builtin_popcount. */ -#undef HAVE__BUILTIN_POPCOUNT - /* Define to 1 if your compiler understands __builtin_types_compatible_p. */ #undef HAVE__BUILTIN_TYPES_COMPATIBLE_P diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h index 789663edd93..c9b1f5f17dc 100644 --- a/src/include/port/pg_bitutils.h +++ b/src/include/port/pg_bitutils.h @@ -297,51 +297,33 @@ extern uint64 pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mas /* * pg_popcount32 * Return the number of 1 bits set in word + * + * Adapted from + * https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel. */ static inline int pg_popcount32(uint32 word) { -#ifdef HAVE__BUILTIN_POPCOUNT - return __builtin_popcount(word); -#else /* !HAVE__BUILTIN_POPCOUNT */ - int result = 0; - - while (word != 0) - { - result += pg_number_of_ones[word & 255]; - word >>= 8; - } - - return result; -#endif /* HAVE__BUILTIN_POPCOUNT */ + word -= (word >> 1) & 0x55555555; + word = (word & 0x33333333) + ((word >> 2) & 0x33333333); + return (((word + (word >> 4)) & 0xf0f0f0f) * 0x1010101) >> 24; } /* * pg_popcount64 * Return the number of 1 bits set in word + * + * Adapted from + * https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel. */ static inline int pg_popcount64(uint64 word) { -#ifdef HAVE__BUILTIN_POPCOUNT -#if SIZEOF_LONG == 8 - return __builtin_popcountl(word); -#elif SIZEOF_LONG_LONG == 8 - return __builtin_popcountll(word); -#else -#error "cannot find integer of the same size as uint64_t" -#endif -#else /* !HAVE__BUILTIN_POPCOUNT */ - int result = 0; - - while (word != 0) - { - result += pg_number_of_ones[word & 255]; - word >>= 8; - } - - return result; -#endif /* HAVE__BUILTIN_POPCOUNT */ + word -= (word >> 1) & UINT64CONST(0x5555555555555555); + word = (word & UINT64CONST(0x3333333333333333)) + + ((word >> 2) & UINT64CONST(0x3333333333333333)); + word = (word + (word >> 4)) & UINT64CONST(0xf0f0f0f0f0f0f0f); + return (word * UINT64CONST(0x101010101010101)) >> 56; } /* diff --git a/src/port/pg_popcount_aarch64.c b/src/port/pg_popcount_aarch64.c index f474ef45510..b0f10ae07a4 100644 --- a/src/port/pg_popcount_aarch64.c +++ b/src/port/pg_popcount_aarch64.c @@ -298,11 +298,6 @@ pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask) static inline int pg_popcount64_neon(uint64 word) { - /* - * For some compilers, __builtin_popcountl() already emits Neon - * instructions. The line below should compile to the same code on those - * systems. - */ return vaddv_u8(vcnt_u8(vld1_u8((const uint8 *) &word))); } -- 2.50.1 (Apple Git-155)
>From a59ac729fb804e2a654352b5db10d5021ff9d2a3 Mon Sep 17 00:00:00 2001 From: Nathan Bossart <[email protected]> Date: Tue, 10 Feb 2026 12:40:08 -0600 Subject: [PATCH v14 2/2] Make use of pg_popcount() in more places. This replaces some loops over word-length popcount functions with calls to pg_popcount(). Since pg_popcount() uses a function pointer for inputs with sizes >= a Bitmapset word, this produces a small regression for the common one-word case in bms_num_members(). To deal with that, this commit adds an inlined fast-path for that case. This fast-path could arguably go in pg_popcount() itself (with an appropriate alignment check), but that is left for future study. Suggested-by: John Naylor <[email protected]> Reviewed-by: John Naylor <[email protected]> Discussion: https://postgr.es/m/CANWCAZY7R%2Biy%2Br9YM_sySNydHzNqUirx1xk0tB3ej5HO62GdgQ%40mail.gmail.com --- src/backend/nodes/bitmapset.c | 29 +++++++---------------------- src/include/lib/radixtree.h | 6 +++--- 2 files changed, 10 insertions(+), 25 deletions(-) diff --git a/src/backend/nodes/bitmapset.c b/src/backend/nodes/bitmapset.c index a4765876c31..786f343b3c9 100644 --- a/src/backend/nodes/bitmapset.c +++ b/src/backend/nodes/bitmapset.c @@ -553,14 +553,8 @@ bms_member_index(Bitmapset *a, int x) bitnum = BITNUM(x); /* count bits in preceding words */ - for (int i = 0; i < wordnum; i++) - { - bitmapword w = a->words[i]; - - /* No need to count the bits in a zero word */ - if (w != 0) - result += bmw_popcount(w); - } + result += pg_popcount((const char *) a->words, + wordnum * sizeof(bitmapword)); /* * Now add bits of the last word, but only those before the item. We can @@ -749,26 +743,17 @@ bms_get_singleton_member(const Bitmapset *a, int *member) int bms_num_members(const Bitmapset *a) { - int result = 0; - int nwords; - int wordnum; - Assert(bms_is_valid_set(a)); if (a == NULL) return 0; - nwords = a->nwords; - wordnum = 0; - do - { - bitmapword w = a->words[wordnum]; + /* fast-path for common case */ + if (a->nwords == 1) + return bmw_popcount(a->words[0]); - /* No need to count the bits in a zero word */ - if (w != 0) - result += bmw_popcount(w); - } while (++wordnum < nwords); - return result; + return pg_popcount((const char *) a->words, + a->nwords * sizeof(bitmapword)); } /* diff --git a/src/include/lib/radixtree.h b/src/include/lib/radixtree.h index b223ce10a2d..e6c9a591c17 100644 --- a/src/include/lib/radixtree.h +++ b/src/include/lib/radixtree.h @@ -2721,12 +2721,12 @@ RT_VERIFY_NODE(RT_NODE * node) case RT_NODE_KIND_256: { RT_NODE_256 *n256 = (RT_NODE_256 *) node; - int cnt = 0; + int cnt; /* RT_DUMP_NODE(node); */ - for (int i = 0; i < RT_BM_IDX(RT_NODE_MAX_SLOTS); i++) - cnt += bmw_popcount(n256->isset[i]); + cnt = pg_popcount((const char *) n256->isset, + RT_NODE_MAX_SLOTS / BITS_PER_BYTE); /* * Check if the number of used chunk matches, accounting for -- 2.50.1 (Apple Git-155)
