On Sat, Feb 21, 2026 at 08:49:32AM +0700, John Naylor wrote:
> In short, I'm in favor of v14, along with the comment about newer
> compilers from v15.

WFM.  Here's what I have staged for commit.

-- 
nathan
>From a733e2e9155c5ebcca6cb6cc1f9d19adbe3c1942 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <[email protected]>
Date: Sat, 21 Feb 2026 15:12:02 -0600
Subject: [PATCH v16 1/2] Remove uses of popcount builtins.

This commit replaces the implementations of pg_popcount{32,64} with
branchless ones in plain C.  While these new implementations do not
make use of more sophisticated population count instructions
available on some CPUs, testing indicates they perform well,
especially now that they are inlined.  Newer versions of popular
compilers will automatically replace these with special
instructions if possible, anyway.  A follow-up commit will replace
various loops over these functions with calls to pg_popcount(),
leaving us little reason to worry about micro-optimizing them
further.

Since this commit removes the only uses of the popcount builtins,
we can also remove the corresponding configuration checks.

Suggested-by: John Naylor <[email protected]>
Reviewed-by: John Naylor <[email protected]>
Discussion: 
https://postgr.es/m/CANWCAZY7R%2Biy%2Br9YM_sySNydHzNqUirx1xk0tB3ej5HO62GdgQ%40mail.gmail.com
---
 configure                      | 38 ------------------------
 configure.ac                   |  1 -
 meson.build                    |  1 -
 src/include/pg_config.h.in     |  3 --
 src/include/port/pg_bitutils.h | 54 ++++++++++++++--------------------
 src/port/pg_popcount_aarch64.c |  5 ----
 6 files changed, 22 insertions(+), 80 deletions(-)

diff --git a/configure b/configure
index e1a08129974..cb143a48141 100755
--- a/configure
+++ b/configure
@@ -15836,44 +15836,6 @@ cat >>confdefs.h <<_ACEOF
 #define HAVE__BUILTIN_CTZ 1
 _ACEOF
 
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_popcount" >&5
-$as_echo_n "checking for __builtin_popcount... " >&6; }
-if ${pgac_cv__builtin_popcount+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-
-int
-call__builtin_popcount(unsigned int x)
-{
-    return __builtin_popcount(x);
-}
-int
-main ()
-{
-
-  ;
-  return 0;
-}
-_ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
-  pgac_cv__builtin_popcount=yes
-else
-  pgac_cv__builtin_popcount=no
-fi
-rm -f core conftest.err conftest.$ac_objext \
-    conftest$ac_exeext conftest.$ac_ext
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__builtin_popcount" 
>&5
-$as_echo "$pgac_cv__builtin_popcount" >&6; }
-if test x"${pgac_cv__builtin_popcount}" = xyes ; then
-
-cat >>confdefs.h <<_ACEOF
-#define HAVE__BUILTIN_POPCOUNT 1
-_ACEOF
-
 fi
 # __builtin_frame_address may draw a diagnostic for non-constant argument,
 # so it needs a different test function.
diff --git a/configure.ac b/configure.ac
index cc85c233c03..3951787313a 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1851,7 +1851,6 @@ PGAC_CHECK_BUILTIN_FUNC([__builtin_bswap64], [long int x])
 # We assume that we needn't test all widths of these explicitly:
 PGAC_CHECK_BUILTIN_FUNC([__builtin_clz], [unsigned int x])
 PGAC_CHECK_BUILTIN_FUNC([__builtin_ctz], [unsigned int x])
-PGAC_CHECK_BUILTIN_FUNC([__builtin_popcount], [unsigned int x])
 # __builtin_frame_address may draw a diagnostic for non-constant argument,
 # so it needs a different test function.
 PGAC_CHECK_BUILTIN_FUNC_PTR([__builtin_frame_address], [0])
diff --git a/meson.build b/meson.build
index 055e96315d0..e0972f3a3d9 100644
--- a/meson.build
+++ b/meson.build
@@ -2006,7 +2006,6 @@ builtins = [
   'ctz',
   'constant_p',
   'frame_address',
-  'popcount',
   'unreachable',
 ]
 
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 3824a5571bb..af08c5a7eb8 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -526,9 +526,6 @@
 /* Define to 1 if your compiler understands __builtin_$op_overflow. */
 #undef HAVE__BUILTIN_OP_OVERFLOW
 
-/* Define to 1 if your compiler understands __builtin_popcount. */
-#undef HAVE__BUILTIN_POPCOUNT
-
 /* Define to 1 if your compiler understands __builtin_types_compatible_p. */
 #undef HAVE__BUILTIN_TYPES_COMPATIBLE_P
 
diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index 789663edd93..0bca559caaa 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -297,51 +297,41 @@ extern uint64 pg_popcount_masked_optimized(const char 
*buf, int bytes, bits8 mas
 /*
  * pg_popcount32
  *             Return the number of 1 bits set in word
+ *
+ * Adapted from
+ * https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel.
+ *
+ * Note that newer versions of popular compilers will automatically replace
+ * this with a special popcount instruction if possible, so we don't bother
+ * using builtin functions or intrinsics.
  */
 static inline int
 pg_popcount32(uint32 word)
 {
-#ifdef HAVE__BUILTIN_POPCOUNT
-       return __builtin_popcount(word);
-#else                                                  /* 
!HAVE__BUILTIN_POPCOUNT */
-       int                     result = 0;
-
-       while (word != 0)
-       {
-               result += pg_number_of_ones[word & 255];
-               word >>= 8;
-       }
-
-       return result;
-#endif                                                 /* 
HAVE__BUILTIN_POPCOUNT */
+       word -= (word >> 1) & 0x55555555;
+       word = (word & 0x33333333) + ((word >> 2) & 0x33333333);
+       return (((word + (word >> 4)) & 0xf0f0f0f) * 0x1010101) >> 24;
 }
 
 /*
  * pg_popcount64
  *             Return the number of 1 bits set in word
+ *
+ * Adapted from
+ * https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel.
+ *
+ * Note that newer versions of popular compilers will automatically replace
+ * this with a special popcount instruction if possible, so we don't bother
+ * using builtin functions or intrinsics.
  */
 static inline int
 pg_popcount64(uint64 word)
 {
-#ifdef HAVE__BUILTIN_POPCOUNT
-#if SIZEOF_LONG == 8
-       return __builtin_popcountl(word);
-#elif SIZEOF_LONG_LONG == 8
-       return __builtin_popcountll(word);
-#else
-#error "cannot find integer of the same size as uint64_t"
-#endif
-#else                                                  /* 
!HAVE__BUILTIN_POPCOUNT */
-       int                     result = 0;
-
-       while (word != 0)
-       {
-               result += pg_number_of_ones[word & 255];
-               word >>= 8;
-       }
-
-       return result;
-#endif                                                 /* 
HAVE__BUILTIN_POPCOUNT */
+       word -= (word >> 1) & UINT64CONST(0x5555555555555555);
+       word = (word & UINT64CONST(0x3333333333333333)) +
+               ((word >> 2) & UINT64CONST(0x3333333333333333));
+       word = (word + (word >> 4)) & UINT64CONST(0xf0f0f0f0f0f0f0f);
+       return (word * UINT64CONST(0x101010101010101)) >> 56;
 }
 
 /*
diff --git a/src/port/pg_popcount_aarch64.c b/src/port/pg_popcount_aarch64.c
index f474ef45510..b0f10ae07a4 100644
--- a/src/port/pg_popcount_aarch64.c
+++ b/src/port/pg_popcount_aarch64.c
@@ -298,11 +298,6 @@ pg_popcount_masked_optimized(const char *buf, int bytes, 
bits8 mask)
 static inline int
 pg_popcount64_neon(uint64 word)
 {
-       /*
-        * For some compilers, __builtin_popcountl() already emits Neon
-        * instructions.  The line below should compile to the same code on 
those
-        * systems.
-        */
        return vaddv_u8(vcnt_u8(vld1_u8((const uint8 *) &word)));
 }
 
-- 
2.50.1 (Apple Git-155)

>From 579db648d8fb10eb96e89390988e1e7293731f34 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <[email protected]>
Date: Tue, 10 Feb 2026 12:40:08 -0600
Subject: [PATCH v16 2/2] Make use of pg_popcount() in more places.

This replaces some loops over word-length popcount functions with
calls to pg_popcount().  Since pg_popcount() uses a function
pointer for inputs with sizes >= a Bitmapset word, this produces a
small regression for the common one-word case in bms_num_members().
To deal with that, this commit adds an inlined fast-path for that
case.  This fast-path could arguably go in pg_popcount() itself
(with an appropriate alignment check), but that is left for future
study.

Suggested-by: John Naylor <[email protected]>
Reviewed-by: John Naylor <[email protected]>
Discussion: 
https://postgr.es/m/CANWCAZY7R%2Biy%2Br9YM_sySNydHzNqUirx1xk0tB3ej5HO62GdgQ%40mail.gmail.com
---
 src/backend/nodes/bitmapset.c | 29 +++++++----------------------
 src/include/lib/radixtree.h   |  6 +++---
 2 files changed, 10 insertions(+), 25 deletions(-)

diff --git a/src/backend/nodes/bitmapset.c b/src/backend/nodes/bitmapset.c
index a4765876c31..786f343b3c9 100644
--- a/src/backend/nodes/bitmapset.c
+++ b/src/backend/nodes/bitmapset.c
@@ -553,14 +553,8 @@ bms_member_index(Bitmapset *a, int x)
        bitnum = BITNUM(x);
 
        /* count bits in preceding words */
-       for (int i = 0; i < wordnum; i++)
-       {
-               bitmapword      w = a->words[i];
-
-               /* No need to count the bits in a zero word */
-               if (w != 0)
-                       result += bmw_popcount(w);
-       }
+       result += pg_popcount((const char *) a->words,
+                                                 wordnum * sizeof(bitmapword));
 
        /*
         * Now add bits of the last word, but only those before the item. We can
@@ -749,26 +743,17 @@ bms_get_singleton_member(const Bitmapset *a, int *member)
 int
 bms_num_members(const Bitmapset *a)
 {
-       int                     result = 0;
-       int                     nwords;
-       int                     wordnum;
-
        Assert(bms_is_valid_set(a));
 
        if (a == NULL)
                return 0;
 
-       nwords = a->nwords;
-       wordnum = 0;
-       do
-       {
-               bitmapword      w = a->words[wordnum];
+       /* fast-path for common case */
+       if (a->nwords == 1)
+               return bmw_popcount(a->words[0]);
 
-               /* No need to count the bits in a zero word */
-               if (w != 0)
-                       result += bmw_popcount(w);
-       } while (++wordnum < nwords);
-       return result;
+       return pg_popcount((const char *) a->words,
+                                          a->nwords * sizeof(bitmapword));
 }
 
 /*
diff --git a/src/include/lib/radixtree.h b/src/include/lib/radixtree.h
index b223ce10a2d..e6c9a591c17 100644
--- a/src/include/lib/radixtree.h
+++ b/src/include/lib/radixtree.h
@@ -2721,12 +2721,12 @@ RT_VERIFY_NODE(RT_NODE * node)
                case RT_NODE_KIND_256:
                        {
                                RT_NODE_256 *n256 = (RT_NODE_256 *) node;
-                               int                     cnt = 0;
+                               int                     cnt;
 
                                /* RT_DUMP_NODE(node); */
 
-                               for (int i = 0; i < 
RT_BM_IDX(RT_NODE_MAX_SLOTS); i++)
-                                       cnt += bmw_popcount(n256->isset[i]);
+                               cnt = pg_popcount((const char *) n256->isset,
+                                                                 
RT_NODE_MAX_SLOTS / BITS_PER_BYTE);
 
                                /*
                                 * Check if the number of used chunk matches, 
accounting for
-- 
2.50.1 (Apple Git-155)

Reply via email to