I was perusing the commit log for mesa and stumbled upon the recently
added util_bitcount. It uses a rather naïve algorithm and I thought I'd
look into it as someone mentioned this problem to me before.
This is what I found, should anyone be interested:
http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel

In any case, I wrote a little profiler that I have attached (bc.c). The
interesting bit is what result it throws out.
[zha...@ztoshiba ~]$ gcc bc.c -O3 -o bc && ./bc
1 billion of __builtin_popcount(), fast_bitcount(), and naive() (in that
order)
__builtin_popcount(): 12.541 seconds
fast_bitcount(): 7.312 seconds
naive(): 58.240 seconds

And this is the real reason I'm putting something this
unimportant/trivial on the ML. Should the builtin be removed in favor of
this wonder-algorithm? and can I even justify pushing this patch
considering the requirement of knowing CHAR_BIT (from limits.h, amount
of bits in a char)?
>From f0df6fb47f9854b31d5d2cb02fc5c9371e67f912 Mon Sep 17 00:00:00 2001
From: Joakim Sindholt <opensou...@zhasha.com>
Date: Sat, 28 Nov 2009 19:06:22 +0100
Subject: [PATCH] u_math: simple util_bitcount speedup

Thanks to the person behind:
http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
---
 src/gallium/auxiliary/util/u_math.h |   16 +++++-----------
 1 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h
index 7e75702..83a3150 100644
--- a/src/gallium/auxiliary/util/u_math.h
+++ b/src/gallium/auxiliary/util/u_math.h
@@ -490,23 +490,17 @@ util_next_power_of_two(unsigned x)
 }
 
 
+#include <limits.h>
 /**
  * Return number of bits set in n.
  */
 static INLINE unsigned
 util_bitcount(unsigned n)
 {
-#if defined(PIPE_CC_GCC)
-   return __builtin_popcount(n);
-#else
-   /* XXX there are more clever ways of doing this */
-   unsigned bits = 0;
-   while (n) {
-      bits += (n & 1);
-      n = n >> 1;
-   }
-   return bits;
-#endif
+    n = n - ((n >> 1) & ~0U/3);
+    n = (n & ~0U/15*3) + ((n >> 2) & ~0U/15*3);
+    n = (n + (n >> 4)) & ~0U/255*15;
+    return (unsigned)(n * (~0U/255)) >> (sizeof(n) - 1) * CHAR_BIT;
 }
 
 
-- 
1.6.5.2

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <limits.h>

/* Templated C++ code (works for ints of up to 128bit)
template<typename T>
int fast_bitcount(T v)
{
    v = v - ((v >> 1) & (T)~(T)0/3);
    v = (v & (T)~(T)0/15*3) + ((v >> 2) & (T)~(T)0/15*3);
    v = (v + (v >> 4)) & (T)~(T)0/255*15;
    return (T)(v * ((T)~(T)0/255)) >> (sizeof(v) - 1) * CHAR_BIT;    
}
*/

inline int fast_bitcount(unsigned v)
{
    v = v - ((v >> 1) & ~0U/3);
    v = (v & ~0U/15*3) + ((v >> 2) & ~0U/15*3);
    v = (v + (v >> 4)) & ~0U/255*15;
    return (v * (~0U/255)) >> (sizeof(v) - 1) * CHAR_BIT;
}

inline int naive(unsigned v)
{
    unsigned c = 0;
    while (v) {
        c += v & 1;
        v >>= 1;
    }
    return c;
}

#define TEST(func) \
    gettimeofday(&ts, NULL); \
    for (i = 0; i < 1000000000; i++) { \
        c = func(i); \
    } \
    srand(c); \
    gettimeofday(&te, NULL); \
    t = (te.tv_sec - ts.tv_sec) * 1000 + (te.tv_usec - ts.tv_usec) / 1000; \
    printf(#func "(): %u.%u seconds\n", t / 1000, t % 1000)

int main()
{
    unsigned c, i, t;
    struct timeval ts, te;
    
    printf("1 billion of __builtin_popcount(), fast_bitcount(), and naive() (in that order)\n");
    TEST(__builtin_popcount);
    TEST(fast_bitcount);
    TEST(naive);
    return 0;
}
------------------------------------------------------------------------------
Let Crystal Reports handle the reporting - Free Crystal Reports 2008 30-Day 
trial. Simplify your report design, integration and deployment - and focus on 
what you do best, core application coding. Discover what's new with
Crystal Reports now.  http://p.sf.net/sfu/bobj-july
_______________________________________________
Mesa3d-dev mailing list
Mesa3d-dev@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/mesa3d-dev

Reply via email to