New benchmarks with my own formula. I added my own find_first_bit algorithm, that still doesn't quite beat Ingo's sched_find_first_bit, but it does a number on the generic find_first_bit. I guess the compiler has finally beat what we do in asm. Here's my generic algorithm.
static inline int my_find_first_bit(const unsigned long *b, unsigned size) { int x = 0; size += 31; do { if (unlikely(*b)) return __ffs(*b) + x; b++; x += 32; } while (x < size); return x-32; } And here's the new results of my benchmark: Comments added. /* * I put in what is returned to see what really is returned is * correct. Funny that sched_find_first_bit is wrong if there * isn't a bit set. But this shouldn't be called if there isn't * one set. */ sched=128 ffb=160 my=160 clock speed = 00000000:7f2f2cbb 2133798075 ticks per second /* Here I set the last bit of 5 32bit words. */ sched=159 ffb=159 my=159 last bit set generic ffb: 00000000:026a1cea time: 0.018984294us sched ffb: 00000000:001ee719 time: 0.000949125us /* * My time is 8ns, 8 times longer than Ingo's if the last * bit is set (most likely case), but still more than twice * as fast as ffb. */ my ffb: 00000000:0106a3dd time: 0.008066546us sched=0 ffb=0 my=0 first bit set generic ffb: 00000000:01ee8e2b time: 0.015189432us sched ffb: 00000000:001eea92 time: 0.000949542us /* * With the first bit set, I'm still slower than Ingo's * but only by 2, and I still beat the pants off of ffb. */ my ffb: 00000000:004d5de6 time: 0.002376190us Conclusion: it looks like it might be time to change the i386 generic ffb to something else. Oh, and if you are wondering... $ gcc -v Using built-in specs. Target: i486-linux-gnu Configured with: ../src/configure -v --enable-languages=c,c ++,java,f95,objc,ada,treelang --prefix=/usr --enable-shared --with-system-zlib --libexecdir=/usr/lib --enable-nls --without-included-gettext --enable-threads=posix --program-suffix=-4.0 --enable-__cxa_atexit --enable-libstdcxx-allocator=mt --enable-clocale=gnu --enable-libstdcxx-debug --enable-java-gc=boehm --enable-java-awt=gtk --with-java-home=/usr/lib/jvm/java-1.4.2-gcj-4.0-1.4.2.0/jre --enable-mpfr --disable-werror --enable-checking=release i486-linux-gnu Thread model: posix gcc version 4.0.1 (Debian 4.0.1-2) -- Steve Attached is new version of ffb.c
#include <stdio.h> #include <stdlib.h> #include <string.h> #include <unistd.h> #define unlikely(x) __builtin_expect(!!(x), 0) static inline int find_first_bit(const unsigned long *addr, unsigned size) { int d0, d1; int res; /* This looks at memory. Mark it volatile to tell gcc not to move it around */ __asm__ __volatile__( "xorl %%eax,%%eax\n\t" "repe; scasl\n\t" "jz 1f\n\t" "leal -4(%%edi),%%edi\n\t" "bsfl (%%edi),%%eax\n" "1:\tsubl %%ebx,%%edi\n\t" "shll $3,%%edi\n\t" "addl %%edi,%%eax" :"=a" (res), "=&c" (d0), "=&D" (d1) :"1" ((size + 31) >> 5), "2" (addr), "b" (addr) : "memory"); return res; } static inline unsigned long __ffs(unsigned long word) { __asm__("bsfl %1,%0" :"=r" (word) :"rm" (word)); return word; } static inline int sched_find_first_bit(const unsigned long *b) { if (unlikely(b[0])) return __ffs(b[0]); if (unlikely(b[1])) return __ffs(b[1]) + 32; if (unlikely(b[2])) return __ffs(b[2]) + 64; if (b[3]) return __ffs(b[3]) + 96; return __ffs(b[4]) + 128; } static inline int my_find_first_bit(const unsigned long *b, unsigned size) { int x = 0; size += 31; do { if (unlikely(*b)) return __ffs(*b) + x; b++; x += 32; } while (x < size); return x-32; } #define rdtscll(val) \ __asm__ __volatile__("rdtsc" : "=A" (val)) #define rdtsc(low,high) \ __asm__ __volatile__("rdtsc" : "=a" (low), "=d" (high)) static unsigned long array[5]; #define ITER 1000000 /* 1,000,000 times */ void testit(unsigned long *array, unsigned long long clock) { unsigned long long s; unsigned long long e; unsigned long long t; double f; int i; int x; /* * Since ITER is 1,000,000 the times will be in us. */ rdtscll(s); for (i=0; i < ITER; i++) x = find_first_bit(array,140); rdtscll(e); t = e - s; f = (float)t / (float)clock; printf("generic ffb: %08lx:%08lx\n", (unsigned long)(t>>32),(unsigned long)t); printf("time: %.09fus\n",f); rdtscll(s); for (i=0; i < ITER; i++) x = sched_find_first_bit(array); rdtscll(e); t = e - s; f = (float)t / (float)clock; printf("sched ffb: %08lx:%08lx\n", (unsigned long)(t>>32),(unsigned long)t); printf("time: %.09fus\n",f); rdtscll(s); for (i=0; i < ITER; i++) x = my_find_first_bit(array,140); rdtscll(e); t = e - s; f = (float)t / (float)clock; printf("my ffb: %08lx:%08lx\n", (unsigned long)(t>>32),(unsigned long)t); printf("time: %.09fus\n",f); } int main(int argc, char **argv) { unsigned long long s; unsigned long long e; unsigned long long t; unsigned long long clock; printf("sched=%d ffb=%d my=%d\n", sched_find_first_bit(array), find_first_bit(array,140), my_find_first_bit(array,140)); /* * Calculate BS time, just to get an * idea of the tsc speed. */ rdtscll(s); sleep(8); rdtscll(e); t = e - s; t >>= 3; printf("clock speed = %08lx:%08lx %llu ticks per second\n", (unsigned long)(t>>32),(unsigned long)t, t); clock = t; array[4] = 0x80000000; printf("sched=%d ffb=%d my=%d\n", sched_find_first_bit(array), find_first_bit(array,140), my_find_first_bit(array,140)); printf("last bit set\n"); testit(array,clock); array[0] = 0x00000001; printf("sched=%d ffb=%d my=%d\n", sched_find_first_bit(array), find_first_bit(array,140), my_find_first_bit(array,140)); printf("\nfirst bit set\n"); testit(array,clock); exit(0); }