On Fri, 3 Apr 2026 at 16:24, Chao Li <[email protected]> wrote:
> I also did a load test with a standalone c program with 4 versions:
I think you'd be better off picking a more realistic Bitmapset. The
code is doing:
int words_to_alloc = 20000; // Large set to bypass CPU cache slightly
Bitmapset *bms = malloc(sizeof(Bitmapset) + words_to_alloc *
sizeof(bitmapword));
bms->nwords = words_to_alloc;
memset(bms->words, 0, words_to_alloc * sizeof(bitmapword));
/* Set a bit far into the set to force a long scan */
int target_bit = (words_to_alloc - 1) * 64 + 10;
bms->words[words_to_alloc - 1] |= (1ULL << 10);
A set with 20k words! Then you're doing:
for (int i = 0; i < iterations; i++) sink = bms_next_member(bms, 0);
So you're not looping correctly over the set, as looping over a set
with 1 member requires 2 calls to the function.
I'd say this unrealistically favours the unrolled loop version, as
most of the effort is in the loop looping over empty words and you can
do that without the extra bit-wise AND that the other versions have.
That version also seems to apply both the 64-bit fix and the INT_MAX
fix. Why both? Because you're only calling the function once per
iteration, it also means your || INT_MAX is only executed once, rather
than n_members + 1 as it would normally be executed. That'll make your
version seem better than it is.
I fixed all that and changed the Bitmapset to something more realistic
with how it's more likely to be seen in PostgreSQL and ran the
benchmark with a proper bms_next_member loop. File attached and
results below:
Zen2 Linux
$ gcc test_bms_next.c -O2 -o test_bms_next && ./test_bms_next
Benchmarking 100000000 iterations...
Original: 0.62113 seconds
David's: 0.70257 seconds
Chao's version: 1.70691 seconds
Unrolled loop: 1.56239 seconds
2800000000
$ clang test_bms_next.c -O2 -o test_bms_next && ./test_bms_next
Benchmarking 100000000 iterations...
Original: 1.11263 seconds
David's: 0.87399 seconds
Chao's version: 0.52962 seconds
Unrolled loop: 1.07799 seconds
2800000000
Apple M2:
Benchmarking 100000000 iterations...
Original: 0.64023 seconds
David's: 0.41087 seconds
Chao's version: 1.21113 seconds
Unrolled loop: 0.55924 seconds
2800000000
> I’m curious that, when something performs differently across platforms, which
> platform should take priority?
I don't think it matters too much for this case. If we were actually
working on a function that was a bottleneck, then it would be worth
looking deeper and seeing if the compilers are producing suboptimal
code and then try to coax them into making better code. As a last
resort, have two versions and some preprocessor to decide which one.
However, this just isn't performance-critical enough to warrant such
an effort. I think we're already going far too far with this. I just
want to fix the bug. If performance increases in some way as a result
of that, then good.
If I sum up the times from each version of the above results, I get:
Original: 2.37399
David's: 1.98743
Chao's: 3.44766
Unrolled: 3.19962
I'm happy to go with the fastest one. I expect the one with the fewest
instructions is likely more important when running it in the planner,
as that's less to decode. I didn't look, but I expect your loop
unrolled version and your || INT_MAX version to have more
instructions.
David
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <time.h>
#include <limits.h>
//#define NULL ((void *) 0)
typedef uint64_t uint64;
typedef int64_t int64;
#define BITS_PER_BITMAPWORD 64
typedef uint64 bitmapword; /* must be an unsigned type */
typedef int64 signedbitmapword; /* must be the matching signed type */
#define WORDNUM(x) ((x) / BITS_PER_BITMAPWORD)
#define BITNUM(x) ((x) % BITS_PER_BITMAPWORD)
typedef struct Bitmapset
{
int nwords; /* number of words in array */
bitmapword words[]; /* really [nwords] */
} Bitmapset;
static inline int
bmw_rightmost_one_pos(uint64 word)
{
return __builtin_ctzll(word);
}
// 1. Original version
int
bms_next_member(const Bitmapset *a, int prevbit)
{
int nwords;
bitmapword mask;
if (a == NULL)
return -2;
nwords = a->nwords;
prevbit++;
mask = (~(bitmapword) 0) << BITNUM(prevbit);
for (int wordnum = WORDNUM(prevbit); wordnum < nwords; wordnum++)
{
bitmapword w = a->words[wordnum];
/* ignore bits before prevbit */
w &= mask;
if (w != 0)
{
int result;
result = wordnum * BITS_PER_BITMAPWORD;
result += bmw_rightmost_one_pos(w);
return result;
}
/* in subsequent words, consider all bits */
mask = (~(bitmapword) 0);
}
return -2;
}
// 2. David's version
int
bms_next_member_david(const Bitmapset *a, int prevbit)
{
int64 currbit;
size_t nwords;
bitmapword mask;
if (a == NULL)
return-2;
nwords = a->nwords;
currbit = (int64) prevbit + 1;
mask = (~(bitmapword) 0) << BITNUM(currbit);
for (size_t wordnum = WORDNUM(currbit); wordnum < nwords; wordnum++)
{
bitmapword w = a->words[wordnum];
/* ignore bits before currbit */
w &= mask;
if (w != 0)
{
int result;
result = (int) wordnum * BITS_PER_BITMAPWORD;
result += bmw_rightmost_one_pos(w);
return result;
}
/* in subsequent words, consider all bits */
mask = (~(bitmapword) 0);
}
return -2;
}
// 3. Original version + INT32_MAX check + 64bit
int
bms_next_member_chao(const Bitmapset *a, int prevbit)
{
size_t nwords;
bitmapword mask;
if (a == NULL || prevbit == INT32_MAX)
return -2;
nwords = (size_t) a->nwords;
prevbit++;
mask = (~(bitmapword) 0) << BITNUM(prevbit);
for (size_t wordnum = WORDNUM(prevbit); wordnum < nwords; wordnum++)
{
bitmapword w = a->words[wordnum];
/* ignore bits before prevbit */
w &= mask;
if (w != 0)
{
int result;
result = (int)wordnum * BITS_PER_BITMAPWORD;
result += bmw_rightmost_one_pos(w);
return result;
}
/* in subsequent words, consider all bits */
mask = (~(bitmapword) 0);
}
return -2;
}
// 4. Pull up first iteration
int bms_next_member_unroll_first(const Bitmapset *a, int prevbit)
{
uint64 currbit;
int wordnum;
int nwords;
if (a == NULL)
return -2;
currbit = (int64) prevbit + 1;
wordnum = WORDNUM(currbit);
nwords = a->nwords;
if (wordnum >= nwords)
return -2;
/* Handle first word with mask */
const bitmapword *p = &a->words[wordnum];
bitmapword w = (*p) & ((~(bitmapword) 0) << BITNUM(currbit));
if (w != 0)
return (wordnum * BITS_PER_BITMAPWORD) +
bmw_rightmost_one_pos(w);
/* The "Tight" Pointer Scan */
const bitmapword *end = &a->words[nwords];
for (p++; p < end; p++)
{
if (*p != 0)
{
wordnum = p - a->words; // Pointer arithmetic to get
index
return (wordnum * BITS_PER_BITMAPWORD) +
bmw_rightmost_one_pos(*p);
}
}
return -2;
}
double get_time() {
struct timespec ts;
clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
return ts.tv_sec + ts.tv_nsec * 1e-9;
}
int main() {
int words_to_alloc = 1; // Large set to bypass CPU cache slightly
Bitmapset *bms = malloc(sizeof(Bitmapset) + words_to_alloc *
sizeof(bitmapword));
bms->nwords = words_to_alloc;
memset(bms->words, 0, words_to_alloc * sizeof(bitmapword));
double end;
int64 count = 0;
/* Set a bit far into the set to force a long scan */
bms->words[words_to_alloc - 1] |= 0xaf4;
int iterations = 100000000;
printf("Benchmarking %d iterations...\n\n", iterations);
// Test Original
double start = get_time();
for (int i = 0; i < iterations; i++)
{
int j = -1;
while ((j = bms_next_member(bms, j)) >= 0)
count++;
}
end = get_time();
printf("Original: %.5f seconds\n", end - start);
// Test Fast
start = get_time();
for (int i = 0; i < iterations; i++)
{
int j = -1;
while ((j = bms_next_member_david(bms, j)) >= 0)
count++;
}
end = get_time();
printf("David's: %.5f seconds\n", end - start);
// Test Chao's version
start = get_time();
for (int i = 0; i < iterations; i++)
{
int j = -1;
while ((j = bms_next_member_chao(bms, j)) >= 0)
count++;
}
end = get_time();
printf("Chao's version: %.5f seconds\n", end - start);
// Pull up first iteration
start = get_time();
for (int i = 0; i < iterations; i++)
{
int j = -1;
while ((j = bms_next_member_unroll_first(bms, j)) >= 0)
count++;
}
end = get_time();
printf("Unrolled loop: %.5f seconds\n", end - start);
printf("%ld\n", count);
free(bms);
return 0;
}