On 5/13/24 09:17, Bruno Haible wrote:
The reason is that such a 256-bytes table will tend to occupy 256 bytes in the
CPU's L1 cache, and thus reduce the ability of other code to use the L1 cache.

Yes, it partly depends on whether the function is called a lot (so the 256-byte table is in the cache) or rarely (so it's not). I had optimized for the former.

The code can be changed to not use an extern data table. I installed the attached. This probably a win (over de Bruijn too), at least for some apps and platforms, though I haven't benchmarked.
From 1b7e82be9dd80fbb67a5567cdf1d60d36dd40db2 Mon Sep 17 00:00:00 2001
From: Paul Eggert <egg...@cs.ucla.edu>
Date: Mon, 13 May 2024 15:21:55 -0700
Subject: [PATCH] stdbit: redo clzll without lookcup table

* lib/stdbit.c (__gl_stdbit_clztab):
* lib/stdbit.in.h (__gl_stdbit_clzll):
[!_GL_STDBIT_HAS_BUILTIN_CLZ && !_MSC_VER]: Rewrite to avoid the
need for a lookup table in memory, and remove the lookup table.
Do this by shrinking the table to 64 bits and puttiung in a 64-bit
constant.  Although this needs another round of shifts, it avoids
the need for a multiplication and memory access a la de Bruijn,
and is probably a win.
---
 ChangeLog       | 12 ++++++++++++
 lib/stdbit.c    | 24 ------------------------
 lib/stdbit.in.h |  5 ++---
 3 files changed, 14 insertions(+), 27 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 18aab772fc..3e4eba3796 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,15 @@
+2024-05-13  Paul Eggert  <egg...@cs.ucla.edu>
+
+	stdbit: redo clzll without lookcup table
+	* lib/stdbit.c (__gl_stdbit_clztab):
+	* lib/stdbit.in.h (__gl_stdbit_clzll):
+	[!_GL_STDBIT_HAS_BUILTIN_CLZ && !_MSC_VER]: Rewrite to avoid the
+	need for a lookup table in memory, and remove the lookup table.
+	Do this by shrinking the table to 64 bits and puttiung in a 64-bit
+	constant.  Although this needs another round of shifts, it avoids
+	the need for a multiplication and memory access a la de Bruijn,
+	and is probably a win.
+
 2024-05-13  Bruno Haible  <br...@clisp.org>
 
 	stdbit tests: Adhere better to Gnulib naming conventions.
diff --git a/lib/stdbit.c b/lib/stdbit.c
index f2a51b10f7..a9f0ef5074 100644
--- a/lib/stdbit.c
+++ b/lib/stdbit.c
@@ -22,30 +22,6 @@
 #define _GL_STDBIT_INLINE _GL_EXTERN_INLINE
 #include <stdbit.h>
 
-#if !defined _GL_STDBIT_HAS_BUILTIN_CLZ && !_MSC_VER
-/* __gl_stdbit_clztab[B] is the number of leading zeros in
-   the 8-bit byte with value B.  */
-char const __gl_stdbit_clztab[256] =
-  {
-    8,
-    7,
-    6, 6,
-    5, 5, 5, 5,
-    4, 4, 4, 4, 4, 4, 4, 4,
-    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
-
-    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-
-    /* The rest is zero.  */
-  };
-#endif
-
 #if 1500 <= _MSC_VER && (defined _M_IX86 || defined _M_X64)
 signed char __gl_stdbit_popcount_support;
 #endif
diff --git a/lib/stdbit.in.h b/lib/stdbit.in.h
index 15533dbbda..18efaf2d7c 100644
--- a/lib/stdbit.in.h
+++ b/lib/stdbit.in.h
@@ -122,8 +122,6 @@ __gl_stdbit_clzll (unsigned long long int n)
 
 #else /* !_MSC_VER */
 
-extern char const __gl_stdbit_clztab[256];
-
 _GL_STDBIT_INLINE int
 __gl_stdbit_clzll (unsigned long long int n)
 {
@@ -135,7 +133,8 @@ __gl_stdbit_clzll (unsigned long long int n)
   int a5 = (0x00000000ffffffff < n) << 5; n >>= a5; r += a5;
   int a4 = (0x000000000000ffff < n) << 4; n >>= a4; r += a4;
   int a3 = (0x00000000000000ff < n) << 3; n >>= a3; r += a3;
-  return (8 * (sizeof n - 1) - r) + __gl_stdbit_clztab[n];
+  int a2 = (0x000000000000000f < n) << 2; n >>= a2; r += a2;
+  return (8 * sizeof n - (1 << 2) - r) + ((0x11112234ull >> (n << 2)) & 0xf);
 }
 _GL_STDBIT_INLINE int
 __gl_stdbit_clz (unsigned int n)
-- 
2.45.0

Reply via email to