Re: [PATCH] perf bench: Add benchmark of find_next_bit

2020-07-29 Thread Ian Rogers
On Wed, Jul 29, 2020 at 1:44 PM Arnaldo Carvalho de Melo
 wrote:
>
> Em Wed, Jul 29, 2020 at 04:59:18PM -0300, Arnaldo Carvalho de Melo escreveu:
> > Em Tue, Jul 28, 2020 at 08:51:52AM -0300, Arnaldo Carvalho de Melo escreveu:
> > > Em Fri, Jul 24, 2020 at 12:19:59AM -0700, Ian Rogers escreveu:
> > > > for_each_set_bit, or similar functions like for_each_cpu, may be hot
> > > > within the kernel. If many bits were set then one could imagine on
> > > > Intel a "bt" instruction with every bit may be faster than the function
> > > > call and word length find_next_bit logic. Add a benchmark to measure
> > > > this.
> > >
> > > Thanks, applied.
>
> > > > This benchmark on AMD rome and Intel skylakex shows "bt" is not a good
> > > > option except for very small bitmaps.
> >
> > > > +++ b/tools/perf/bench/find-bit-bench.c
> >
> > > > +#if defined(__i386__) || defined(__x86_64__)
> > > > +static bool asm_test_bit(long nr, const unsigned long *addr)
> > > > +{
> > > > + bool oldbit;
> > > > +
> > > > + asm volatile("bt %2,%1"
> > > > +  : "=@ccc" (oldbit)
> > > > +  : "m" (*(unsigned long *)addr), "Ir" (nr) : "memory");
> > > > +
> > > > + return oldbit;
> >
> > Some old clang versions are not liking this:
>
> Failed with:
>
>   clang version 3.8.0 (tags/RELEASE_380/final)
>   clang version 3.8.1 (tags/RELEASE_381/final)
>   clang version 4.0.0 (tags/RELEASE_400/final)
>   Alpine clang version 8.0.0 (tags/RELEASE_800/final) (based on LLVM 8.0.0)
>   Alpine clang version 5.0.0 (tags/RELEASE_500/final) (based on LLVM 5.0.0)
>   Alpine clang version 5.0.1 (tags/RELEASE_501/final) (based on LLVM 5.0.1)
>   Alpine clang version 5.0.1 (tags/RELEASE_502/final) (based on LLVM 5.0.1)
>
> Worked with:
>
>   Alpine clang version 9.0.0 (https://git.alpinelinux.org/aports 
> f7f0d2c2b8bcd6a5843401a9a702029556492689) (based on LLVM 9.0.0)
>   Alpine clang version 10.0.0 
> (https://gitlab.alpinelinux.org/alpine/aports.git 
> 7445adce501f8473efdb93b17b5eaf2f1445ed4c)
>   Alpine clang version 10.0.0 (git://git.alpinelinux.org/aports 
> 7445adce501f8473efdb93b17b5eaf2f1445ed4c)
>
>
> Also failed for;
>
> # grep FAIL dm.log/summary  | grep -v alpine
>  alt:p8: FAIL
>clang version 3.8.0 (tags/RELEASE_380/final)
>  alt:p9: FAIL
>clang version 7.0.1
>  amazonlinux:1: FAIL
>clang version 3.6.2 (tags/RELEASE_362/final)
>  amazonlinux:2: FAIL
>clang version 7.0.1 (Amazon Linux 2 7.0.1-1.amzn2.0.2)
> #

Thanks, I added a __GCC_ASM_FLAG_OUTPUTS__ guard:
https://lore.kernel.org/lkml/20200729220034.1337168-1-irog...@google.com/T/#u

Ian


Re: [PATCH] perf bench: Add benchmark of find_next_bit

2020-07-29 Thread Arnaldo Carvalho de Melo
Em Wed, Jul 29, 2020 at 04:59:18PM -0300, Arnaldo Carvalho de Melo escreveu:
> Em Tue, Jul 28, 2020 at 08:51:52AM -0300, Arnaldo Carvalho de Melo escreveu:
> > Em Fri, Jul 24, 2020 at 12:19:59AM -0700, Ian Rogers escreveu:
> > > for_each_set_bit, or similar functions like for_each_cpu, may be hot
> > > within the kernel. If many bits were set then one could imagine on
> > > Intel a "bt" instruction with every bit may be faster than the function
> > > call and word length find_next_bit logic. Add a benchmark to measure
> > > this.
> > 
> > Thanks, applied.

> > > This benchmark on AMD rome and Intel skylakex shows "bt" is not a good
> > > option except for very small bitmaps.
> 
> > > +++ b/tools/perf/bench/find-bit-bench.c
> 
> > > +#if defined(__i386__) || defined(__x86_64__)
> > > +static bool asm_test_bit(long nr, const unsigned long *addr)
> > > +{
> > > + bool oldbit;
> > > +
> > > + asm volatile("bt %2,%1"
> > > +  : "=@ccc" (oldbit)
> > > +  : "m" (*(unsigned long *)addr), "Ir" (nr) : "memory");
> > > +
> > > + return oldbit;
> 
> Some old clang versions are not liking this:

Failed with:

  clang version 3.8.0 (tags/RELEASE_380/final)
  clang version 3.8.1 (tags/RELEASE_381/final)
  clang version 4.0.0 (tags/RELEASE_400/final)
  Alpine clang version 8.0.0 (tags/RELEASE_800/final) (based on LLVM 8.0.0)
  Alpine clang version 5.0.0 (tags/RELEASE_500/final) (based on LLVM 5.0.0)
  Alpine clang version 5.0.1 (tags/RELEASE_501/final) (based on LLVM 5.0.1)
  Alpine clang version 5.0.1 (tags/RELEASE_502/final) (based on LLVM 5.0.1)

Worked with:

  Alpine clang version 9.0.0 (https://git.alpinelinux.org/aports 
f7f0d2c2b8bcd6a5843401a9a702029556492689) (based on LLVM 9.0.0)
  Alpine clang version 10.0.0 (https://gitlab.alpinelinux.org/alpine/aports.git 
7445adce501f8473efdb93b17b5eaf2f1445ed4c)
  Alpine clang version 10.0.0 (git://git.alpinelinux.org/aports 
7445adce501f8473efdb93b17b5eaf2f1445ed4c)


Also failed for;

# grep FAIL dm.log/summary  | grep -v alpine
 alt:p8: FAIL
   clang version 3.8.0 (tags/RELEASE_380/final)
 alt:p9: FAIL
   clang version 7.0.1
 amazonlinux:1: FAIL
   clang version 3.6.2 (tags/RELEASE_362/final)
 amazonlinux:2: FAIL
   clang version 7.0.1 (Amazon Linux 2 7.0.1-1.amzn2.0.2)
#


Re: [PATCH] perf bench: Add benchmark of find_next_bit

2020-07-29 Thread Arnaldo Carvalho de Melo
Em Tue, Jul 28, 2020 at 08:51:52AM -0300, Arnaldo Carvalho de Melo escreveu:
> Em Fri, Jul 24, 2020 at 12:19:59AM -0700, Ian Rogers escreveu:
> > for_each_set_bit, or similar functions like for_each_cpu, may be hot
> > within the kernel. If many bits were set then one could imagine on
> > Intel a "bt" instruction with every bit may be faster than the function
> > call and word length find_next_bit logic. Add a benchmark to measure
> > this.
> 
> Thanks, applied.
> 
> - Arnaldo
>  
> > This benchmark on AMD rome and Intel skylakex shows "bt" is not a good
> > option except for very small bitmaps.

> > +++ b/tools/perf/bench/find-bit-bench.c

> > +#if defined(__i386__) || defined(__x86_64__)
> > +static bool asm_test_bit(long nr, const unsigned long *addr)
> > +{
> > +   bool oldbit;
> > +
> > +   asm volatile("bt %2,%1"
> > +: "=@ccc" (oldbit)
> > +: "m" (*(unsigned long *)addr), "Ir" (nr) : "memory");
> > +
> > +   return oldbit;

Some old clang versions are not liking this:

clang version 3.8.0 (tags/RELEASE_380/final)
Target: x86_64-alpine-linux-musl
Thread model: posix
InstalledDir: /usr/bin
Found candidate GCC installation: 
/usr/bin/../lib/gcc/x86_64-alpine-linux-musl/5.3.0
Found candidate GCC installation: /usr/lib/gcc/x86_64-alpine-linux-musl/5.3.0
Selected GCC installation: /usr/bin/../lib/gcc/x86_64-alpine-linux-musl/5.3.0
Candidate multilib: .;@m64
Selected multilib: .;@m64
+ make ARCH= CROSS_COMPILE= EXTRA_CFLAGS= -C /git/linux/tools/perf 
O=/tmp/build/perf CC=clang


  CC   /tmp/build/perf/trace/beauty/pkey_alloc.o
  CC   /tmp/build/perf/tests/openat-syscall-tp-fields.o
bench/find-bit-bench.c:46:10: error: invalid output constraint '=@ccc' in asm
 : "=@ccc" (oldbit)
   ^
1 error generated.
mv: can't rename '/tmp/build/perf/bench/.find-bit-bench.o.tmp': No such file or 
directory
/git/linux/tools/build/Makefile.build:96: recipe for target 
'/tmp/build/perf/bench/find-bit-bench.o' failed
make[4]: *** [/tmp/build/perf/bench/find-bit-bench.o] Error 1
/git/linux/tools/build/Makefile.build:139: recipe for target 'bench' failed
make[3]: *** [bench] Error 2
make[3]: *** Waiting for unfinished jobs
  MKDIR/tmp/build/perf/arch/x86/tests/


Also:

clang version 3.8.1 (tags/RELEASE_381/final)
clang version 4.0.0 (tags/RELEASE_400/final)


- Arnaldo


Re: [PATCH] perf bench: Add benchmark of find_next_bit

2020-07-28 Thread Arnaldo Carvalho de Melo
Em Fri, Jul 24, 2020 at 12:19:59AM -0700, Ian Rogers escreveu:
> for_each_set_bit, or similar functions like for_each_cpu, may be hot
> within the kernel. If many bits were set then one could imagine on
> Intel a "bt" instruction with every bit may be faster than the function
> call and word length find_next_bit logic. Add a benchmark to measure
> this.

Thanks, applied.

- Arnaldo
 
> This benchmark on AMD rome and Intel skylakex shows "bt" is not a good
> option except for very small bitmaps.
> 
> Signed-off-by: Ian Rogers 
> ---
>  tools/perf/bench/Build|   1 +
>  tools/perf/bench/bench.h  |   1 +
>  tools/perf/bench/find-bit-bench.c | 135 ++
>  tools/perf/builtin-bench.c|   1 +
>  4 files changed, 138 insertions(+)
>  create mode 100644 tools/perf/bench/find-bit-bench.c
> 
> diff --git a/tools/perf/bench/Build b/tools/perf/bench/Build
> index 768e408757a0..fb114bca3a8d 100644
> --- a/tools/perf/bench/Build
> +++ b/tools/perf/bench/Build
> @@ -10,6 +10,7 @@ perf-y += epoll-wait.o
>  perf-y += epoll-ctl.o
>  perf-y += synthesize.o
>  perf-y += kallsyms-parse.o
> +perf-y += find-bit-bench.o
>  
>  perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-lib.o
>  perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-asm.o
> diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h
> index 61cae4966cae..3291b0fe 100644
> --- a/tools/perf/bench/bench.h
> +++ b/tools/perf/bench/bench.h
> @@ -35,6 +35,7 @@ int bench_sched_messaging(int argc, const char **argv);
>  int bench_sched_pipe(int argc, const char **argv);
>  int bench_mem_memcpy(int argc, const char **argv);
>  int bench_mem_memset(int argc, const char **argv);
> +int bench_mem_find_bit(int argc, const char **argv);
>  int bench_futex_hash(int argc, const char **argv);
>  int bench_futex_wake(int argc, const char **argv);
>  int bench_futex_wake_parallel(int argc, const char **argv);
> diff --git a/tools/perf/bench/find-bit-bench.c 
> b/tools/perf/bench/find-bit-bench.c
> new file mode 100644
> index ..1aadbd9d7e86
> --- /dev/null
> +++ b/tools/perf/bench/find-bit-bench.c
> @@ -0,0 +1,135 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Benchmark find_next_bit and related bit operations.
> + *
> + * Copyright 2020 Google LLC.
> + */
> +#include 
> +#include "bench.h"
> +#include "../util/stat.h"
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +static unsigned int outer_iterations = 5;
> +static unsigned int inner_iterations = 10;
> +
> +static const struct option options[] = {
> + OPT_UINTEGER('i', "outer-iterations", &outer_iterations,
> + "Number of outerer iterations used"),
> + OPT_UINTEGER('j', "inner-iterations", &inner_iterations,
> + "Number of outerer iterations used"),
> + OPT_END()
> +};
> +
> +static const char *const bench_usage[] = {
> + "perf bench mem find_bit ",
> + NULL
> +};
> +
> +static unsigned int accumulator;
> +static unsigned int use_of_val;
> +
> +static noinline void workload(int val)
> +{
> + use_of_val += val;
> + accumulator++;
> +}
> +
> +#if defined(__i386__) || defined(__x86_64__)
> +static bool asm_test_bit(long nr, const unsigned long *addr)
> +{
> + bool oldbit;
> +
> + asm volatile("bt %2,%1"
> +  : "=@ccc" (oldbit)
> +  : "m" (*(unsigned long *)addr), "Ir" (nr) : "memory");
> +
> + return oldbit;
> +}
> +#else
> +#define asm_test_bit test_bit
> +#endif
> +
> +static int do_for_each_set_bit(unsigned int num_bits)
> +{
> + unsigned long *to_test = bitmap_alloc(num_bits);
> + struct timeval start, end, diff;
> + u64 runtime_us;
> + struct stats fb_time_stats, tb_time_stats;
> + double time_average, time_stddev;
> + unsigned int bit, i, j;
> + unsigned int set_bits, skip;
> + unsigned int old;
> +
> + init_stats(&fb_time_stats);
> + init_stats(&tb_time_stats);
> +
> + for (set_bits = 1; set_bits <= num_bits; set_bits <<= 1) {
> + bitmap_zero(to_test, num_bits);
> + skip = num_bits / set_bits;
> + for (i = 0; i < num_bits; i += skip)
> + set_bit(i, to_test);
> +
> + for (i = 0; i < outer_iterations; i++) {
> + old = accumulator;
> + gettimeofday(&start, NULL);
> + for (j = 0; j < inner_iterations; j++) {
> + for_each_set_bit(bit, to_test, num_bits)
> + workload(bit);
> + }
> + gettimeofday(&end, NULL);
> + assert(old + (inner_iterations * set_bits) == 
> accumulator);
> + timersub(&end, &start, &diff);
> + runtime_us = diff.tv_sec * USEC_PER_SEC + diff.tv_usec;
> + update_stats(&fb_time_stats, runtime_us);
> +
> + old = accumulator;
> + gettimeofday(&start, NULL);

Re: [PATCH] perf bench: Add benchmark of find_next_bit

2020-07-24 Thread Ian Rogers
On Fri, Jul 24, 2020 at 7:45 AM Andi Kleen  wrote:
>
> On Fri, Jul 24, 2020 at 12:19:59AM -0700, Ian Rogers wrote:
> > for_each_set_bit, or similar functions like for_each_cpu, may be hot
> > within the kernel. If many bits were set then one could imagine on
> > Intel a "bt" instruction with every bit may be faster than the function
> > call and word length find_next_bit logic. Add a benchmark to measure
> > this.
>
> > This benchmark on AMD rome and Intel skylakex shows "bt" is not a good
> > option except for very small bitmaps.
>
> Small bitmaps is a common case in the kernel (e.g. cpu bitmaps)
>
> But the current code isn't that great for small bitmaps. It always looks 
> horrific
> when I look at PT traces or brstackinsn, especially since it was optimized
> purely for code size at some point.
>
> Probably would be better to have different implementations for
> different sizes.

Thanks Andi!

what I was kind of expecting was "bt [mem],reg; jae" to have a fixed
cost and the find_next_bit to be a big win when the majority of bits
in a bitmask were 0 but to lose if the majority of bits in the bitmask
were 1. So the ratio of 1s and 0s was going to matter, not so much the
bitmap size. For bitmaps smaller than 32 this was pretty much the
case:

100 operations 1 bits set of 1 bits
  Average for_each_set_bit took: 8368.640 usec (+- 103.009 usec)
  Average test_bit loop took:4052.360 usec (+- 15.381 usec)
100 operations 1 bits set of 2 bits
  Average for_each_set_bit took: 8503.770 usec (+- 4.385 usec)
  Average test_bit loop took:6103.570 usec (+- 0.514 usec)
100 operations 2 bits set of 2 bits
  Average for_each_set_bit took: 12754.485 usec (+- 301.342 usec)
  Average test_bit loop took:6882.950 usec (+- 55.254 usec)
100 operations 1 bits set of 4 bits
  Average for_each_set_bit took: 8517.670 usec (+- 5.712 usec)
  Average test_bit loop took:10485.580 usec (+- 1.386 usec)
100 operations 2 bits set of 4 bits
  Average for_each_set_bit took: 12931.060 usec (+- 312.882 usec)
  Average test_bit loop took:11093.980 usec (+- 43.149 usec)
100 operations 4 bits set of 4 bits
  Average for_each_set_bit took: 19664.760 usec (+- 588.841 usec)
  Average test_bit loop took:12217.583 usec (+- 96.287 usec)
100 operations 1 bits set of 8 bits
  Average for_each_set_bit took: 8501.810 usec (+- 5.250 usec)
  Average test_bit loop took:18924.550 usec (+- 2.999 usec)
100 operations 2 bits set of 8 bits
  Average for_each_set_bit took: 12760.025 usec (+- 301.881 usec)
  Average test_bit loop took:19726.540 usec (+- 56.878 usec)
100 operations 4 bits set of 8 bits
  Average for_each_set_bit took: 19151.163 usec (+- 560.053 usec)
  Average test_bit loop took:20817.317 usec (+- 96.923 usec)
100 operations 8 bits set of 8 bits
  Average for_each_set_bit took: 29770.865 usec (+- 1012.050 usec)
  Average test_bit loop took:22677.150 usec (+- 176.883 usec)
100 operations 1 bits set of 16 bits
  Average for_each_set_bit took: 8534.640 usec (+- 4.920 usec)
  Average test_bit loop took:36451.930 usec (+- 9.763 usec)
100 operations 2 bits set of 16 bits
  Average for_each_set_bit took: 12797.065 usec (+- 302.176 usec)
  Average test_bit loop took:37178.395 usec (+- 51.756 usec)
100 operations 4 bits set of 16 bits
  Average for_each_set_bit took: 18736.880 usec (+- 525.846 usec)
  Average test_bit loop took:38306.897 usec (+- 98.527 usec)
100 operations 8 bits set of 16 bits
  Average for_each_set_bit took: 29257.765 usec (+- 993.813 usec)
  Average test_bit loop took:40038.798 usec (+- 167.358 usec)
100 operations 16 bits set of 16 bits
  Average for_each_set_bit took: 46784.984 usec (+- 1759.075 usec)
  Average test_bit loop took:43014.266 usec (+- 298.138 usec)

 but for larger bitmaps, for example, 2048 bits where all are set,
even though the bt was a single instruction and find_next_bit was
going to have to do a lot of work to just return true (function call,
bunch of shifts) I got:

100 operations 2048 bits set of 2048 bits
  Average for_each_set_bit took: 2218881.255 usec (+- 106485.020 usec)
  Average test_bit loop took:4934851.515 usec (+- 18463.511 usec)

And so other than say for 4-bit and smaller bitmaps the current
find_next_bit approach works best. I don't see an obvious bug in the
benchmark so I am surprised at how bad bt performs.

Thanks,
Ian

> -Andi


Re: [PATCH] perf bench: Add benchmark of find_next_bit

2020-07-24 Thread Andi Kleen
On Fri, Jul 24, 2020 at 12:19:59AM -0700, Ian Rogers wrote:
> for_each_set_bit, or similar functions like for_each_cpu, may be hot
> within the kernel. If many bits were set then one could imagine on
> Intel a "bt" instruction with every bit may be faster than the function
> call and word length find_next_bit logic. Add a benchmark to measure
> this.
 
> This benchmark on AMD rome and Intel skylakex shows "bt" is not a good
> option except for very small bitmaps.

Small bitmaps is a common case in the kernel (e.g. cpu bitmaps) 

But the current code isn't that great for small bitmaps. It always looks 
horrific
when I look at PT traces or brstackinsn, especially since it was optimized
purely for code size at some point.

Probably would be better to have different implementations for
different sizes.

-Andi


[PATCH] perf bench: Add benchmark of find_next_bit

2020-07-24 Thread Ian Rogers
for_each_set_bit, or similar functions like for_each_cpu, may be hot
within the kernel. If many bits were set then one could imagine on
Intel a "bt" instruction with every bit may be faster than the function
call and word length find_next_bit logic. Add a benchmark to measure
this.

This benchmark on AMD rome and Intel skylakex shows "bt" is not a good
option except for very small bitmaps.

Signed-off-by: Ian Rogers 
---
 tools/perf/bench/Build|   1 +
 tools/perf/bench/bench.h  |   1 +
 tools/perf/bench/find-bit-bench.c | 135 ++
 tools/perf/builtin-bench.c|   1 +
 4 files changed, 138 insertions(+)
 create mode 100644 tools/perf/bench/find-bit-bench.c

diff --git a/tools/perf/bench/Build b/tools/perf/bench/Build
index 768e408757a0..fb114bca3a8d 100644
--- a/tools/perf/bench/Build
+++ b/tools/perf/bench/Build
@@ -10,6 +10,7 @@ perf-y += epoll-wait.o
 perf-y += epoll-ctl.o
 perf-y += synthesize.o
 perf-y += kallsyms-parse.o
+perf-y += find-bit-bench.o
 
 perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-lib.o
 perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-asm.o
diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h
index 61cae4966cae..3291b0fe 100644
--- a/tools/perf/bench/bench.h
+++ b/tools/perf/bench/bench.h
@@ -35,6 +35,7 @@ int bench_sched_messaging(int argc, const char **argv);
 int bench_sched_pipe(int argc, const char **argv);
 int bench_mem_memcpy(int argc, const char **argv);
 int bench_mem_memset(int argc, const char **argv);
+int bench_mem_find_bit(int argc, const char **argv);
 int bench_futex_hash(int argc, const char **argv);
 int bench_futex_wake(int argc, const char **argv);
 int bench_futex_wake_parallel(int argc, const char **argv);
diff --git a/tools/perf/bench/find-bit-bench.c 
b/tools/perf/bench/find-bit-bench.c
new file mode 100644
index ..1aadbd9d7e86
--- /dev/null
+++ b/tools/perf/bench/find-bit-bench.c
@@ -0,0 +1,135 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Benchmark find_next_bit and related bit operations.
+ *
+ * Copyright 2020 Google LLC.
+ */
+#include 
+#include "bench.h"
+#include "../util/stat.h"
+#include 
+#include 
+#include 
+#include 
+
+static unsigned int outer_iterations = 5;
+static unsigned int inner_iterations = 10;
+
+static const struct option options[] = {
+   OPT_UINTEGER('i', "outer-iterations", &outer_iterations,
+   "Number of outerer iterations used"),
+   OPT_UINTEGER('j', "inner-iterations", &inner_iterations,
+   "Number of outerer iterations used"),
+   OPT_END()
+};
+
+static const char *const bench_usage[] = {
+   "perf bench mem find_bit ",
+   NULL
+};
+
+static unsigned int accumulator;
+static unsigned int use_of_val;
+
+static noinline void workload(int val)
+{
+   use_of_val += val;
+   accumulator++;
+}
+
+#if defined(__i386__) || defined(__x86_64__)
+static bool asm_test_bit(long nr, const unsigned long *addr)
+{
+   bool oldbit;
+
+   asm volatile("bt %2,%1"
+: "=@ccc" (oldbit)
+: "m" (*(unsigned long *)addr), "Ir" (nr) : "memory");
+
+   return oldbit;
+}
+#else
+#define asm_test_bit test_bit
+#endif
+
+static int do_for_each_set_bit(unsigned int num_bits)
+{
+   unsigned long *to_test = bitmap_alloc(num_bits);
+   struct timeval start, end, diff;
+   u64 runtime_us;
+   struct stats fb_time_stats, tb_time_stats;
+   double time_average, time_stddev;
+   unsigned int bit, i, j;
+   unsigned int set_bits, skip;
+   unsigned int old;
+
+   init_stats(&fb_time_stats);
+   init_stats(&tb_time_stats);
+
+   for (set_bits = 1; set_bits <= num_bits; set_bits <<= 1) {
+   bitmap_zero(to_test, num_bits);
+   skip = num_bits / set_bits;
+   for (i = 0; i < num_bits; i += skip)
+   set_bit(i, to_test);
+
+   for (i = 0; i < outer_iterations; i++) {
+   old = accumulator;
+   gettimeofday(&start, NULL);
+   for (j = 0; j < inner_iterations; j++) {
+   for_each_set_bit(bit, to_test, num_bits)
+   workload(bit);
+   }
+   gettimeofday(&end, NULL);
+   assert(old + (inner_iterations * set_bits) == 
accumulator);
+   timersub(&end, &start, &diff);
+   runtime_us = diff.tv_sec * USEC_PER_SEC + diff.tv_usec;
+   update_stats(&fb_time_stats, runtime_us);
+
+   old = accumulator;
+   gettimeofday(&start, NULL);
+   for (j = 0; j < inner_iterations; j++) {
+   for (bit = 0; bit < num_bits; bit++) {
+   if (asm_test_bit(bit, to_test))
+   workload(bit);
+