for_each_set_bit, or similar functions like for_each_cpu, may be hot
within the kernel. If many bits were set then one could imagine on
Intel a "bt" instruction with every bit may be faster than the function
call and word length find_next_bit logic. Add a benchmark to measure
this.

This benchmark on AMD rome and Intel skylakex shows "bt" is not a good
option except for very small bitmaps.

Signed-off-by: Ian Rogers <irog...@google.com>
---
 tools/perf/bench/Build            |   1 +
 tools/perf/bench/bench.h          |   1 +
 tools/perf/bench/find-bit-bench.c | 135 ++++++++++++++++++++++++++++++
 tools/perf/builtin-bench.c        |   1 +
 4 files changed, 138 insertions(+)
 create mode 100644 tools/perf/bench/find-bit-bench.c

diff --git a/tools/perf/bench/Build b/tools/perf/bench/Build
index 768e408757a0..fb114bca3a8d 100644
--- a/tools/perf/bench/Build
+++ b/tools/perf/bench/Build
@@ -10,6 +10,7 @@ perf-y += epoll-wait.o
 perf-y += epoll-ctl.o
 perf-y += synthesize.o
 perf-y += kallsyms-parse.o
+perf-y += find-bit-bench.o
 
 perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-lib.o
 perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-asm.o
diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h
index 61cae4966cae..3291b0ddddfe 100644
--- a/tools/perf/bench/bench.h
+++ b/tools/perf/bench/bench.h
@@ -35,6 +35,7 @@ int bench_sched_messaging(int argc, const char **argv);
 int bench_sched_pipe(int argc, const char **argv);
 int bench_mem_memcpy(int argc, const char **argv);
 int bench_mem_memset(int argc, const char **argv);
+int bench_mem_find_bit(int argc, const char **argv);
 int bench_futex_hash(int argc, const char **argv);
 int bench_futex_wake(int argc, const char **argv);
 int bench_futex_wake_parallel(int argc, const char **argv);
diff --git a/tools/perf/bench/find-bit-bench.c 
b/tools/perf/bench/find-bit-bench.c
new file mode 100644
index 000000000000..1aadbd9d7e86
--- /dev/null
+++ b/tools/perf/bench/find-bit-bench.c
@@ -0,0 +1,135 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Benchmark find_next_bit and related bit operations.
+ *
+ * Copyright 2020 Google LLC.
+ */
+#include <stdlib.h>
+#include "bench.h"
+#include "../util/stat.h"
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+#include <linux/time64.h>
+#include <subcmd/parse-options.h>
+
+static unsigned int outer_iterations = 5;
+static unsigned int inner_iterations = 100000;
+
+static const struct option options[] = {
+       OPT_UINTEGER('i', "outer-iterations", &outer_iterations,
+               "Number of outerer iterations used"),
+       OPT_UINTEGER('j', "inner-iterations", &inner_iterations,
+               "Number of outerer iterations used"),
+       OPT_END()
+};
+
+static const char *const bench_usage[] = {
+       "perf bench mem find_bit <options>",
+       NULL
+};
+
+static unsigned int accumulator;
+static unsigned int use_of_val;
+
+static noinline void workload(int val)
+{
+       use_of_val += val;
+       accumulator++;
+}
+
+#if defined(__i386__) || defined(__x86_64__)
+static bool asm_test_bit(long nr, const unsigned long *addr)
+{
+       bool oldbit;
+
+       asm volatile("bt %2,%1"
+                    : "=@ccc" (oldbit)
+                    : "m" (*(unsigned long *)addr), "Ir" (nr) : "memory");
+
+       return oldbit;
+}
+#else
+#define asm_test_bit test_bit
+#endif
+
+static int do_for_each_set_bit(unsigned int num_bits)
+{
+       unsigned long *to_test = bitmap_alloc(num_bits);
+       struct timeval start, end, diff;
+       u64 runtime_us;
+       struct stats fb_time_stats, tb_time_stats;
+       double time_average, time_stddev;
+       unsigned int bit, i, j;
+       unsigned int set_bits, skip;
+       unsigned int old;
+
+       init_stats(&fb_time_stats);
+       init_stats(&tb_time_stats);
+
+       for (set_bits = 1; set_bits <= num_bits; set_bits <<= 1) {
+               bitmap_zero(to_test, num_bits);
+               skip = num_bits / set_bits;
+               for (i = 0; i < num_bits; i += skip)
+                       set_bit(i, to_test);
+
+               for (i = 0; i < outer_iterations; i++) {
+                       old = accumulator;
+                       gettimeofday(&start, NULL);
+                       for (j = 0; j < inner_iterations; j++) {
+                               for_each_set_bit(bit, to_test, num_bits)
+                                       workload(bit);
+                       }
+                       gettimeofday(&end, NULL);
+                       assert(old + (inner_iterations * set_bits) == 
accumulator);
+                       timersub(&end, &start, &diff);
+                       runtime_us = diff.tv_sec * USEC_PER_SEC + diff.tv_usec;
+                       update_stats(&fb_time_stats, runtime_us);
+
+                       old = accumulator;
+                       gettimeofday(&start, NULL);
+                       for (j = 0; j < inner_iterations; j++) {
+                               for (bit = 0; bit < num_bits; bit++) {
+                                       if (asm_test_bit(bit, to_test))
+                                               workload(bit);
+                               }
+                       }
+                       gettimeofday(&end, NULL);
+                       assert(old + (inner_iterations * set_bits) == 
accumulator);
+                       timersub(&end, &start, &diff);
+                       runtime_us = diff.tv_sec * USEC_PER_SEC + diff.tv_usec;
+                       update_stats(&tb_time_stats, runtime_us);
+               }
+
+               printf("%d operations %d bits set of %d bits\n",
+                       inner_iterations, set_bits, num_bits);
+               time_average = avg_stats(&fb_time_stats);
+               time_stddev = stddev_stats(&fb_time_stats);
+               printf("  Average for_each_set_bit took: %.3f usec (+- %.3f 
usec)\n",
+                       time_average, time_stddev);
+               time_average = avg_stats(&tb_time_stats);
+               time_stddev = stddev_stats(&tb_time_stats);
+               printf("  Average test_bit loop took:    %.3f usec (+- %.3f 
usec)\n",
+                       time_average, time_stddev);
+
+               if (use_of_val == accumulator)  /* Try to avoid compiler 
tricks. */
+                       printf("\n");
+       }
+       bitmap_free(to_test);
+       return 0;
+}
+
+int bench_mem_find_bit(int argc, const char **argv)
+{
+       int err = 0, i;
+
+       argc = parse_options(argc, argv, options, bench_usage, 0);
+       if (argc) {
+               usage_with_options(bench_usage, options);
+               exit(EXIT_FAILURE);
+       }
+
+       for (i = 1; i <= 2048; i <<= 1)
+               do_for_each_set_bit(i);
+
+       return err;
+}
diff --git a/tools/perf/builtin-bench.c b/tools/perf/builtin-bench.c
index cad31b1d3438..690eee1120a7 100644
--- a/tools/perf/builtin-bench.c
+++ b/tools/perf/builtin-bench.c
@@ -52,6 +52,7 @@ static struct bench sched_benchmarks[] = {
 static struct bench mem_benchmarks[] = {
        { "memcpy",     "Benchmark for memcpy() functions",             
bench_mem_memcpy        },
        { "memset",     "Benchmark for memset() functions",             
bench_mem_memset        },
+       { "find_bit",   "Benchmark for find_bit() functions",           
bench_mem_find_bit      },
        { "all",        "Run all memory access benchmarks",             NULL    
                },
        { NULL,         NULL,                                           NULL    
                }
 };
-- 
2.28.0.rc0.142.g3c755180ce-goog

Reply via email to