Introduce a benchmark to compare the architecture-optimized strlen()
implementation against the generic C version (__generic_strlen).

The benchmark uses a table-driven approach to evaluate performance
across different string lengths (short, medium, and long). It times
each run with ktime_get() and fills the test buffer via
get_random_bytes(), replacing any null bytes so that the string is
not terminated before its intended length.

This helps quantify the performance gains of architecture-specific
optimizations on various platforms.

Suggested-by: Andy Shevchenko <[email protected]>
Signed-off-by: Feng Jiang <[email protected]>
---
 lib/tests/string_kunit.c | 117 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 117 insertions(+)

diff --git a/lib/tests/string_kunit.c b/lib/tests/string_kunit.c
index 8eb095404b95..2266954ae5e0 100644
--- a/lib/tests/string_kunit.c
+++ b/lib/tests/string_kunit.c
@@ -20,6 +20,77 @@
 #define STRING_TEST_MAX_LEN    128
 #define STRING_TEST_MAX_OFFSET 16
 
+#if defined(__HAVE_ARCH_STRLEN)        /* the arch supplies its own strlen() */
+#define STRING_BENCH_ENABLED           /* build the benchmark helpers below */
+#endif
+
+#ifdef STRING_BENCH_ENABLED
+/* One benchmark scenario: a labelled string length and its iteration count */
+struct string_bench_case {
+       const char *name;               /* label printed in the report */
+       size_t len;                     /* string length, terminator excluded */
+       unsigned int iterations;        /* timed calls per implementation */
+};
+
+static const struct string_bench_case bench_cases[] = {
+       { .name = "short",  .len = 8,    .iterations = 100000 },
+       { .name = "medium", .len = 64,   .iterations = 100000 },
+       { .name = "long",   .len = 2048, .iterations = 10000 },
+};
+
+/**
+ * get_max_bench_len() - Find the longest string length in a benchmark table
+ * @cases: benchmark cases to scan
+ * @count: number of entries in @cases
+ *
+ * Return: the largest @len found in @cases, or 0 when @count is 0.
+ */
+static size_t get_max_bench_len(const struct string_bench_case *cases, size_t count)
+{
+       size_t i, longest = 0;
+
+       for (i = 0; i < count; i++)
+               longest = cases[i].len > longest ? cases[i].len : longest;
+
+       return longest;
+}
+
+/**
+ * get_random_nonzero_bytes() - Fill a buffer with random bytes, none of them 0
+ * @buf: destination buffer
+ * @len: number of bytes to write into @buf
+ */
+static void get_random_nonzero_bytes(void *buf, size_t len)
+{
+       u8 *p = buf, *end = p + len;
+
+       get_random_bytes(buf, len);
+
+       /* A zero byte would end the C string early, so map each one to 0x01. */
+       for (; p < end; p++) {
+               if (!*p)
+                       *p = 0x01;
+       }
+}
+
+/*
+ * Log the elapsed time of both implementations for one benchmark case,
+ * followed by the generic/arch speedup ratio printed with two decimals.
+ */
+static void string_bench_report(struct kunit *test, const char *func,
+               const struct string_bench_case *bc,
+               u64 time_arch, u64 time_generic)
+{
+       /* Count a 0 ns reading as 1 ns so that no division is by zero. */
+       u64 base = time_arch ? time_arch : 1;
+       u64 rem, ratio_int, ratio_frac;
+
+       /*
+        * Use div64_u64_rem() for the remainder: a plain u64 '%' needs
+        * libgcc's __umoddi3, which 32-bit kernel builds generally lack.
+        */
+       ratio_int = div64_u64_rem(time_generic, base, &rem);
+       ratio_frac = div64_u64(rem * 100, base);
+
+       kunit_info(test, "%s performance (%s, len: %zu, iters: %u):\n",
+               func, bc->name, bc->len, bc->iterations);
+       kunit_info(test, "  arch-optimized: %llu ns\n", time_arch);
+       kunit_info(test, "  generic C:      %llu ns\n", time_generic);
+       kunit_info(test, "  speedup:        %llu.%02llux\n", ratio_int, ratio_frac);
+}
+#endif /* STRING_BENCH_ENABLED */
+
 static void string_test_memset16(struct kunit *test)
 {
        unsigned i, j, k;
@@ -129,6 +200,49 @@ static void string_test_strlen(struct kunit *test)
        }
 }
 
+#ifdef __HAVE_ARCH_STRLEN
+/*
+ * Time strlen() against __generic_strlen() for every entry in bench_cases
+ * and log each result through string_bench_report().
+ *
+ * One buffer sized for the longest case is reused: before each case it is
+ * refilled with random non-zero bytes and terminated at that case's length.
+ */
+static void string_test_strlen_bench(struct kunit *test)
+{
+       char *buf;
+       size_t buf_len, iters;
+       size_t sink;
+       ktime_t start, end;
+       u64 time_arch, time_generic;
+
+       /* +1 leaves room for the terminator of the longest case. */
+       buf_len = get_max_bench_len(bench_cases, ARRAY_SIZE(bench_cases)) + 1;
+
+       buf = kunit_kzalloc(test, buf_len, GFP_KERNEL);
+       KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buf);
+
+       for (size_t i = 0; i < ARRAY_SIZE(bench_cases); i++) {
+               get_random_nonzero_bytes(buf, bench_cases[i].len);
+               buf[bench_cases[i].len] = '\0';
+
+               iters = bench_cases[i].iterations;
+
+               /*
+                * In both loops the result is stored with WRITE_ONCE() so
+                * that the call has an observable effect and the compiler
+                * cannot discard it as dead code; OPTIMIZER_HIDE_VAR() keeps
+                * the call from being hoisted out of the loop.
+                */
+
+               /* 1. Benchmark the architecture-optimized version */
+               start = ktime_get();
+               for (unsigned int j = 0; j < iters; j++) {
+                       OPTIMIZER_HIDE_VAR(buf);
+                       WRITE_ONCE(sink, strlen(buf));
+               }
+               end = ktime_get();
+               time_arch = ktime_to_ns(ktime_sub(end, start));
+
+               /* 2. Benchmark the generic C version */
+               start = ktime_get();
+               for (unsigned int j = 0; j < iters; j++) {
+                       OPTIMIZER_HIDE_VAR(buf);
+                       WRITE_ONCE(sink, __generic_strlen(buf));
+               }
+               end = ktime_get();
+               time_generic = ktime_to_ns(ktime_sub(end, start));
+
+               string_bench_report(test, "strlen", &bench_cases[i],
+                               time_arch, time_generic);
+       }
+}
+#endif
+
 static void string_test_strnlen(struct kunit *test)
 {
        char *s;
@@ -702,6 +816,9 @@ static struct kunit_case string_test_cases[] = {
        KUNIT_CASE(string_test_memset32),
        KUNIT_CASE(string_test_memset64),
        KUNIT_CASE(string_test_strlen),
+#ifdef __HAVE_ARCH_STRLEN
+       KUNIT_CASE(string_test_strlen_bench),
+#endif
        KUNIT_CASE(string_test_strnlen),
        KUNIT_CASE(string_test_strchr),
        KUNIT_CASE(string_test_strnchr),
-- 
2.25.1


Reply via email to