On Thu, Jan 22, 2026 at 11:50:38AM -0600, Nathan Bossart wrote:
> On Thu, Jan 22, 2026 at 04:50:26PM +0700, John Naylor wrote:
>> 1) Nowadays, the only global call sites of the word-sized functions
>> are select_best_grantor() and in bitmapsets. The latter calls the
>> word-sized functions in a loop (could be just one word). It may be
>> more efficient to calculate the size in bytes and call pg_popcount().
> 
> Yeah, these seem like obvious places to use pg_popcount().  Note that
> bms_member_index() does a final popcount on a masked version of the last
> word.  We could swap that with pg_popcount(), too, but it might be slower
> than just calling the word-sized function.  However, it could be hard to
> tell the difference, as we'd be trading a function or function pointer call
> with an inlined loop over pg_number_of_ones.  And even if it is slower, I'm
> not sure it matters all that much in the grand scheme of things.

I added a 0003 that swaps that final popcount with pg_popcount().

>> Then we could get rid of all the pointer indirection for the
>> word-sized functions.
> 
> Do you mean that we'd just keep the portable ones around?  I see some code
> in pgvector that might be negatively impacted by that, but if I understand
> correctly it would require an unusual setup.

I added a 0004 that removes the architecture-specific word-sized functions.

-- 
nathan
>From 4fa180bf4f9e7dfddbe8ca6f2abdf14b8cb36978 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <[email protected]>
Date: Thu, 22 Jan 2026 11:16:09 -0600
Subject: [PATCH v4 1/4] Make use of pg_popcount() in more places.

---
 src/backend/nodes/bitmapset.c | 27 ++++-----------------------
 src/include/lib/radixtree.h   |  4 ++--
 2 files changed, 6 insertions(+), 25 deletions(-)

diff --git a/src/backend/nodes/bitmapset.c b/src/backend/nodes/bitmapset.c
index a4765876c31..23c91fdb6c9 100644
--- a/src/backend/nodes/bitmapset.c
+++ b/src/backend/nodes/bitmapset.c
@@ -553,14 +553,8 @@ bms_member_index(Bitmapset *a, int x)
        bitnum = BITNUM(x);
 
        /* count bits in preceding words */
-       for (int i = 0; i < wordnum; i++)
-       {
-               bitmapword      w = a->words[i];
-
-               /* No need to count the bits in a zero word */
-               if (w != 0)
-                       result += bmw_popcount(w);
-       }
+       result += pg_popcount((const char *) a->words,
+                                                 wordnum * sizeof(bitmapword));
 
        /*
         * Now add bits of the last word, but only those before the item. We can
@@ -749,26 +743,13 @@ bms_get_singleton_member(const Bitmapset *a, int *member)
 int
 bms_num_members(const Bitmapset *a)
 {
-       int                     result = 0;
-       int                     nwords;
-       int                     wordnum;
-
        Assert(bms_is_valid_set(a));
 
        if (a == NULL)
                return 0;
 
-       nwords = a->nwords;
-       wordnum = 0;
-       do
-       {
-               bitmapword      w = a->words[wordnum];
-
-               /* No need to count the bits in a zero word */
-               if (w != 0)
-                       result += bmw_popcount(w);
-       } while (++wordnum < nwords);
-       return result;
+       return pg_popcount((const char *) a->words,
+                                          a->nwords * sizeof(bitmapword));
 }
 
 /*
diff --git a/src/include/lib/radixtree.h b/src/include/lib/radixtree.h
index b223ce10a2d..1425654a67c 100644
--- a/src/include/lib/radixtree.h
+++ b/src/include/lib/radixtree.h
@@ -2725,8 +2725,8 @@ RT_VERIFY_NODE(RT_NODE * node)
 
                                /* RT_DUMP_NODE(node); */
 
-                               for (int i = 0; i < 
RT_BM_IDX(RT_NODE_MAX_SLOTS); i++)
-                                       cnt += bmw_popcount(n256->isset[i]);
+                               cnt += pg_popcount((const char *) n256->isset,
+                                                                  
RT_NODE_MAX_SLOTS / BITS_PER_BYTE);
 
                                /*
                                 * Check if the number of used chunk matches, 
accounting for
-- 
2.50.1 (Apple Git-155)

>From 7340452ec091af478749925e070ccc1b5f420eec Mon Sep 17 00:00:00 2001
From: Nathan Bossart <[email protected]>
Date: Thu, 22 Jan 2026 11:33:56 -0600
Subject: [PATCH v4 2/4] Remove unnecessary 32-bit optimizations and alignment
 checks.

---
 src/port/pg_popcount_x86.c | 63 ++++++++------------------------------
 1 file changed, 13 insertions(+), 50 deletions(-)

diff --git a/src/port/pg_popcount_x86.c b/src/port/pg_popcount_x86.c
index 245f0167d00..0e98f532552 100644
--- a/src/port/pg_popcount_x86.c
+++ b/src/port/pg_popcount_x86.c
@@ -382,33 +382,16 @@ pg_popcount_sse42(const char *buf, int bytes)
        uint64          popcnt = 0;
 
 #if SIZEOF_VOID_P >= 8
-       /* Process in 64-bit chunks if the buffer is aligned. */
-       if (buf == (const char *) TYPEALIGN(8, buf))
-       {
-               const uint64 *words = (const uint64 *) buf;
-
-               while (bytes >= 8)
-               {
-                       popcnt += pg_popcount64_sse42(*words++);
-                       bytes -= 8;
-               }
+       /* Process in 64-bit chunks. */
+       const uint64 *words = (const uint64 *) buf;
 
-               buf = (const char *) words;
-       }
-#else
-       /* Process in 32-bit chunks if the buffer is aligned. */
-       if (buf == (const char *) TYPEALIGN(4, buf))
+       while (bytes >= 8)
        {
-               const uint32 *words = (const uint32 *) buf;
-
-               while (bytes >= 4)
-               {
-                       popcnt += pg_popcount32_sse42(*words++);
-                       bytes -= 4;
-               }
-
-               buf = (const char *) words;
+               popcnt += pg_popcount64_sse42(*words++);
+               bytes -= 8;
        }
+
+       buf = (const char *) words;
 #endif
 
        /* Process any remaining bytes */
@@ -428,37 +411,17 @@ pg_popcount_masked_sse42(const char *buf, int bytes, 
bits8 mask)
        uint64          popcnt = 0;
 
 #if SIZEOF_VOID_P >= 8
-       /* Process in 64-bit chunks if the buffer is aligned */
+       /* Process in 64-bit chunks. */
        uint64          maskv = ~UINT64CONST(0) / 0xFF * mask;
+       const uint64 *words = (const uint64 *) buf;
 
-       if (buf == (const char *) TYPEALIGN(8, buf))
+       while (bytes >= 8)
        {
-               const uint64 *words = (const uint64 *) buf;
-
-               while (bytes >= 8)
-               {
-                       popcnt += pg_popcount64_sse42(*words++ & maskv);
-                       bytes -= 8;
-               }
-
-               buf = (const char *) words;
+               popcnt += pg_popcount64_sse42(*words++ & maskv);
+               bytes -= 8;
        }
-#else
-       /* Process in 32-bit chunks if the buffer is aligned. */
-       uint32          maskv = ~((uint32) 0) / 0xFF * mask;
 
-       if (buf == (const char *) TYPEALIGN(4, buf))
-       {
-               const uint32 *words = (const uint32 *) buf;
-
-               while (bytes >= 4)
-               {
-                       popcnt += pg_popcount32_sse42(*words++ & maskv);
-                       bytes -= 4;
-               }
-
-               buf = (const char *) words;
-       }
+       buf = (const char *) words;
 #endif
 
        /* Process any remaining bytes */
-- 
2.50.1 (Apple Git-155)

>From 60ed9ddad06775079a981360d9f06ff555ee073c Mon Sep 17 00:00:00 2001
From: Nathan Bossart <[email protected]>
Date: Fri, 23 Jan 2026 17:07:32 -0600
Subject: [PATCH v4 3/4] Remove bmw_popcount().

---
 src/backend/nodes/bitmapset.c | 4 +++-
 src/include/nodes/bitmapset.h | 2 --
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/backend/nodes/bitmapset.c b/src/backend/nodes/bitmapset.c
index 23c91fdb6c9..c057058a974 100644
--- a/src/backend/nodes/bitmapset.c
+++ b/src/backend/nodes/bitmapset.c
@@ -542,6 +542,7 @@ bms_member_index(Bitmapset *a, int x)
        int                     wordnum;
        int                     result = 0;
        bitmapword      mask;
+       bitmapword      last;
 
        Assert(bms_is_valid_set(a));
 
@@ -563,7 +564,8 @@ bms_member_index(Bitmapset *a, int x)
         * itself, so we subtract 1.
         */
        mask = ((bitmapword) 1 << bitnum) - 1;
-       result += bmw_popcount(a->words[wordnum] & mask);
+       last = a->words[wordnum] & mask;
+       result += pg_popcount((const char *) &last, sizeof(bitmapword));
 
        return result;
 }
diff --git a/src/include/nodes/bitmapset.h b/src/include/nodes/bitmapset.h
index 067ec72e99b..20938cfd2a7 100644
--- a/src/include/nodes/bitmapset.h
+++ b/src/include/nodes/bitmapset.h
@@ -77,11 +77,9 @@ typedef enum
 #if BITS_PER_BITMAPWORD == 32
 #define bmw_leftmost_one_pos(w)                pg_leftmost_one_pos32(w)
 #define bmw_rightmost_one_pos(w)       pg_rightmost_one_pos32(w)
-#define bmw_popcount(w)                                pg_popcount32(w)
 #elif BITS_PER_BITMAPWORD == 64
 #define bmw_leftmost_one_pos(w)                pg_leftmost_one_pos64(w)
 #define bmw_rightmost_one_pos(w)       pg_rightmost_one_pos64(w)
-#define bmw_popcount(w)                                pg_popcount64(w)
 #else
 #error "invalid BITS_PER_BITMAPWORD"
 #endif
-- 
2.50.1 (Apple Git-155)

>From dd4ce5c39e7b1e9087c8990067bc31a1dd30fbe6 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <[email protected]>
Date: Fri, 23 Jan 2026 17:31:20 -0600
Subject: [PATCH v4 4/4] Remove specialized word-length popcount
 implementations.

---
 src/include/port/pg_bitutils.h | 29 ++++-------------------
 src/port/pg_bitutils.c         | 27 +++++++--------------
 src/port/pg_popcount_aarch64.c | 22 +++++------------
 src/port/pg_popcount_x86.c     | 43 +---------------------------------
 4 files changed, 19 insertions(+), 102 deletions(-)

diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index 35761f509ec..101815daa98 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -276,42 +276,21 @@ pg_ceil_log2_64(uint64 num)
                return pg_leftmost_one_pos64(num - 1) + 1;
 }
 
-extern int     pg_popcount32_portable(uint32 word);
-extern int     pg_popcount64_portable(uint64 word);
+extern int     pg_popcount32(uint32 word);
+extern int     pg_popcount64(uint64 word);
 extern uint64 pg_popcount_portable(const char *buf, int bytes);
 extern uint64 pg_popcount_masked_portable(const char *buf, int bytes, bits8 
mask);
 
-#ifdef HAVE_X86_64_POPCNTQ
+#if defined(HAVE_X86_64_POPCNTQ) || defined(USE_SVE_POPCNT_WITH_RUNTIME_CHECK)
 /*
- * Attempt to use SSE4.2 or AVX-512 instructions, but perform a runtime check
+ * Attempt to use specialized CPU instructions, but perform a runtime check
  * first.
  */
-extern PGDLLIMPORT int (*pg_popcount32) (uint32 word);
-extern PGDLLIMPORT int (*pg_popcount64) (uint64 word);
-extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int 
bytes);
-extern PGDLLIMPORT uint64 (*pg_popcount_masked_optimized) (const char *buf, 
int bytes, bits8 mask);
-
-#elif defined(USE_NEON)
-/* Use the Neon version of pg_popcount{32,64} without function pointer. */
-extern int     pg_popcount32(uint32 word);
-extern int     pg_popcount64(uint64 word);
-
-/*
- * We can try to use an SVE-optimized pg_popcount() on some systems  For that,
- * we do use a function pointer.
- */
-#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK
 extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int 
bytes);
 extern PGDLLIMPORT uint64 (*pg_popcount_masked_optimized) (const char *buf, 
int bytes, bits8 mask);
-#else
-extern uint64 pg_popcount_optimized(const char *buf, int bytes);
-extern uint64 pg_popcount_masked_optimized(const char *buf, int bytes, bits8 
mask);
-#endif
 
 #else
 /* Use a portable implementation -- no need for a function pointer. */
-extern int     pg_popcount32(uint32 word);
-extern int     pg_popcount64(uint64 word);
 extern uint64 pg_popcount_optimized(const char *buf, int bytes);
 extern uint64 pg_popcount_masked_optimized(const char *buf, int bytes, bits8 
mask);
 
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index ffda75825e5..037b313a46b 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -97,11 +97,11 @@ const uint8 pg_number_of_ones[256] = {
 };
 
 /*
- * pg_popcount32_portable
+ * pg_popcount32
  *             Return the number of 1 bits set in word
  */
 int
-pg_popcount32_portable(uint32 word)
+pg_popcount32(uint32 word)
 {
 #ifdef HAVE__BUILTIN_POPCOUNT
        return __builtin_popcount(word);
@@ -119,11 +119,11 @@ pg_popcount32_portable(uint32 word)
 }
 
 /*
- * pg_popcount64_portable
+ * pg_popcount64
  *             Return the number of 1 bits set in word
  */
 int
-pg_popcount64_portable(uint64 word)
+pg_popcount64(uint64 word)
 {
 #ifdef HAVE__BUILTIN_POPCOUNT
 #if SIZEOF_LONG == 8
@@ -163,7 +163,7 @@ pg_popcount_portable(const char *buf, int bytes)
 
                while (bytes >= 8)
                {
-                       popcnt += pg_popcount64_portable(*words++);
+                       popcnt += pg_popcount64(*words++);
                        bytes -= 8;
                }
 
@@ -177,7 +177,7 @@ pg_popcount_portable(const char *buf, int bytes)
 
                while (bytes >= 4)
                {
-                       popcnt += pg_popcount32_portable(*words++);
+                       popcnt += pg_popcount32(*words++);
                        bytes -= 4;
                }
 
@@ -211,7 +211,7 @@ pg_popcount_masked_portable(const char *buf, int bytes, 
bits8 mask)
 
                while (bytes >= 8)
                {
-                       popcnt += pg_popcount64_portable(*words++ & maskv);
+                       popcnt += pg_popcount64(*words++ & maskv);
                        bytes -= 8;
                }
 
@@ -227,7 +227,7 @@ pg_popcount_masked_portable(const char *buf, int bytes, 
bits8 mask)
 
                while (bytes >= 4)
                {
-                       popcnt += pg_popcount32_portable(*words++ & maskv);
+                       popcnt += pg_popcount32(*words++ & maskv);
                        bytes -= 4;
                }
 
@@ -250,17 +250,6 @@ pg_popcount_masked_portable(const char *buf, int bytes, 
bits8 mask)
  * actual external functions.  The compiler should be able to inline the
  * portable versions here.
  */
-int
-pg_popcount32(uint32 word)
-{
-       return pg_popcount32_portable(word);
-}
-
-int
-pg_popcount64(uint64 word)
-{
-       return pg_popcount64_portable(word);
-}
 
 /*
  * pg_popcount_optimized
diff --git a/src/port/pg_popcount_aarch64.c b/src/port/pg_popcount_aarch64.c
index ba57f2cd4bd..99fd24eb980 100644
--- a/src/port/pg_popcount_aarch64.c
+++ b/src/port/pg_popcount_aarch64.c
@@ -292,21 +292,11 @@ pg_popcount_masked_optimized(const char *buf, int bytes, 
bits8 mask)
 #endif                                                 /* ! 
USE_SVE_POPCNT_WITH_RUNTIME_CHECK */
 
 /*
- * pg_popcount32
- *             Return number of 1 bits in word
+ * pg_popcount64_neon
+ *      Return number of 1 bits in word
  */
-int
-pg_popcount32(uint32 word)
-{
-       return pg_popcount64((uint64) word);
-}
-
-/*
- * pg_popcount64
- *             Return number of 1 bits in word
- */
-int
-pg_popcount64(uint64 word)
+static inline int
+pg_popcount64_neon(uint64 word)
 {
        /*
         * For some compilers, __builtin_popcountl() already emits Neon
@@ -383,7 +373,7 @@ pg_popcount_neon(const char *buf, int bytes)
         */
        for (; bytes >= sizeof(uint64); bytes -= sizeof(uint64))
        {
-               popcnt += pg_popcount64(*((const uint64 *) buf));
+               popcnt += pg_popcount64_neon(*((const uint64 *) buf));
                buf += sizeof(uint64);
        }
 
@@ -465,7 +455,7 @@ pg_popcount_masked_neon(const char *buf, int bytes, bits8 
mask)
         */
        for (; bytes >= sizeof(uint64); bytes -= sizeof(uint64))
        {
-               popcnt += pg_popcount64(*((const uint64 *) buf) & mask64);
+               popcnt += pg_popcount64_neon(*((const uint64 *) buf) & mask64);
                buf += sizeof(uint64);
        }
 
diff --git a/src/port/pg_popcount_x86.c b/src/port/pg_popcount_x86.c
index 0e98f532552..9fd8e18ed16 100644
--- a/src/port/pg_popcount_x86.c
+++ b/src/port/pg_popcount_x86.c
@@ -36,8 +36,6 @@
  * operation, but in practice this is close enough, and "sse42" seems easier to
  * follow than "popcnt" for these names.
  */
-static inline int pg_popcount32_sse42(uint32 word);
-static inline int pg_popcount64_sse42(uint64 word);
 static uint64 pg_popcount_sse42(const char *buf, int bytes);
 static uint64 pg_popcount_masked_sse42(const char *buf, int bytes, bits8 mask);
 
@@ -55,12 +53,8 @@ static uint64 pg_popcount_masked_avx512(const char *buf, int 
bytes, bits8 mask);
  * what the current CPU supports) and then will call the pointer to fulfill the
  * caller's request.
  */
-static int     pg_popcount32_choose(uint32 word);
-static int     pg_popcount64_choose(uint64 word);
 static uint64 pg_popcount_choose(const char *buf, int bytes);
 static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 
mask);
-int                    (*pg_popcount32) (uint32 word) = pg_popcount32_choose;
-int                    (*pg_popcount64) (uint64 word) = pg_popcount64_choose;
 uint64         (*pg_popcount_optimized) (const char *buf, int bytes) = 
pg_popcount_choose;
 uint64         (*pg_popcount_masked_optimized) (const char *buf, int bytes, 
bits8 mask) = pg_popcount_masked_choose;
 
@@ -157,7 +151,7 @@ pg_popcount_avx512_available(void)
 #endif                                                 /* 
USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
 
 /*
- * These functions get called on the first call to pg_popcount32 etc.
+ * These functions get called on the first call to pg_popcount(), etc.
  * They detect whether we can use the asm implementations, and replace
  * the function pointers so that subsequent calls are routed directly to
  * the chosen implementation.
@@ -167,15 +161,11 @@ choose_popcount_functions(void)
 {
        if (pg_popcount_sse42_available())
        {
-               pg_popcount32 = pg_popcount32_sse42;
-               pg_popcount64 = pg_popcount64_sse42;
                pg_popcount_optimized = pg_popcount_sse42;
                pg_popcount_masked_optimized = pg_popcount_masked_sse42;
        }
        else
        {
-               pg_popcount32 = pg_popcount32_portable;
-               pg_popcount64 = pg_popcount64_portable;
                pg_popcount_optimized = pg_popcount_portable;
                pg_popcount_masked_optimized = pg_popcount_masked_portable;
        }
@@ -189,20 +179,6 @@ choose_popcount_functions(void)
 #endif
 }
 
-static int
-pg_popcount32_choose(uint32 word)
-{
-       choose_popcount_functions();
-       return pg_popcount32(word);
-}
-
-static int
-pg_popcount64_choose(uint64 word)
-{
-       choose_popcount_functions();
-       return pg_popcount64(word);
-}
-
 static uint64
 pg_popcount_choose(const char *buf, int bytes)
 {
@@ -338,23 +314,6 @@ pg_popcount_masked_avx512(const char *buf, int bytes, 
bits8 mask)
 
 #endif                                                 /* 
USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
 
-/*
- * pg_popcount32_sse42
- *             Return the number of 1 bits set in word
- */
-static inline int
-pg_popcount32_sse42(uint32 word)
-{
-#ifdef _MSC_VER
-       return __popcnt(word);
-#else
-       uint32          res;
-
-__asm__ __volatile__(" popcntl %1,%0\n":"=q"(res):"rm"(word):"cc");
-       return (int) res;
-#endif
-}
-
 /*
  * pg_popcount64_sse42
  *             Return the number of 1 bits set in word
-- 
2.50.1 (Apple Git-155)

Reply via email to