From e3a1be4a356434b10007196e85136ed4f9501304 Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@2ndquadrant.com>
Date: Thu, 26 Dec 2019 18:28:50 -0500
Subject: [PATCH v2] Use the CLZ instruction in AllocSetFreeIndex()

In commit ab5b4e2f9ed, we optimized AllocSetFreeIndex() using a lookup
table. At the time, using CLZ was rejected because compiler/platform
support was not widespread enough to justify it. Since 02a6a54ecd6,
we test for availability of __builtin_clz(), so use that instead. This
is about 50% faster on Intel platforms, but perhaps more importantly
eliminates cache pollution caused by the lookup table approach.

In addition, for the open-coded case, use the general-purpose lookup
table added by 02a6a54ecd6, rather than a single-purpose one.
---
 src/backend/utils/mmgr/aset.c | 50 ++++++++++++++++++-----------------
 1 file changed, 26 insertions(+), 24 deletions(-)
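
For reviewers, a minimal standalone sketch (not part of the patch;
ALLOC_MINBITS is hardcoded to 3 here, matching aset.c, and
leftmost_one_pos32() is a local stand-in for pg_leftmost_one_pos32())
checking the identity the CLZ path relies on, namely that folding
ALLOC_MINBITS into the constants yields the same index as shifting first:

    #include <assert.h>
    #include <stdint.h>

    #define ALLOC_MINBITS 3

    /* position of the leftmost one bit, ie floor(log2(x)), for x > 0 */
    static inline int
    leftmost_one_pos32(uint32_t x)
    {
        return 31 - __builtin_clz(x);
    }

    int
    main(void)
    {
        for (uint32_t size = (1 << ALLOC_MINBITS) + 1; size < (1u << 16); size++)
        {
            /* the patch's computation: shift folded into the constants */
            int idx_clz = 31 - __builtin_clz(size - 1) - ALLOC_MINBITS + 1;

            /* reference computation: shift first, then find the bit position */
            int idx_ref = leftmost_one_pos32((size - 1) >> ALLOC_MINBITS) + 1;

            assert(idx_clz == idx_ref);
        }
        return 0;
    }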

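Similarly, a sketch of the two-byte lookup in the open-coded fallback. The
table is rebuilt at runtime here for self-containment, and is assumed to
match the precomputed pg_leftmost_one_pos[] in src/port/pg_bitutils.c,
whose entry n holds the position of the leftmost one bit in n (entry 0 is
unused). The result is cross-checked against __builtin_clz for convenience,
even though the fallback exists for compilers that lack that builtin:

    #include <assert.h>

    static unsigned char leftmost_one_pos[256];

    int
    main(void)
    {
        /* fill the table: position of the highest set bit in each byte value */
        for (int n = 1; n < 256; n++)
            for (int b = 7; b >= 0; b--)
                if (n & (1 << b))
                {
                    leftmost_one_pos[n] = b;
                    break;
                }

        /* a 16-bit tsize is handled by consulting the high byte first */
        for (unsigned tsize = 1; tsize < (1u << 16); tsize++)
        {
            unsigned t = tsize >> 8;
            int pos = t ? leftmost_one_pos[t] + 8 : leftmost_one_pos[tsize];

            assert(pos == 31 - __builtin_clz(tsize));
        }
        return 0;
    }
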
diff --git a/src/backend/utils/mmgr/aset.c b/src/backend/utils/mmgr/aset.c
index f729d9b6de..bc05ea906a 100644
--- a/src/backend/utils/mmgr/aset.c
+++ b/src/backend/utils/mmgr/aset.c
@@ -46,6 +46,7 @@
 
 #include "postgres.h"
 
+#include "port/pg_bitutils.h"
 #include "utils/memdebug.h"
 #include "utils/memutils.h"
 
@@ -297,18 +298,6 @@ static const MemoryContextMethods AllocSetMethods = {
 #endif
 };
 
-/*
- * Table for AllocSetFreeIndex
- */
-#define LT16(n) n, n, n, n, n, n, n, n, n, n, n, n, n, n, n, n
-
-static const unsigned char LogTable256[256] =
-{
-	0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
-	LT16(5), LT16(6), LT16(6), LT16(7), LT16(7), LT16(7), LT16(7),
-	LT16(8), LT16(8), LT16(8), LT16(8), LT16(8), LT16(8), LT16(8), LT16(8)
-};
-
 /* ----------
  * Debug macros
  * ----------
@@ -337,24 +326,37 @@ static inline int
 AllocSetFreeIndex(Size size)
 {
 	int			idx;
-	unsigned int t,
-				tsize;
 
 	if (size > (1 << ALLOC_MINBITS))
 	{
-		tsize = (size - 1) >> ALLOC_MINBITS;
-
 		/*
-		 * At this point we need to obtain log2(tsize)+1, ie, the number of
-		 * not-all-zero bits at the right.  We used to do this with a
-		 * shift-and-count loop, but this function is enough of a hotspot to
-		 * justify micro-optimization effort.  The best approach seems to be
-		 * to use a lookup table.  Note that this code assumes that
-		 * ALLOCSET_NUM_FREELISTS <= 17, since we only cope with two bytes of
-		 * the tsize value.
+		 * At this point we need ceil(log2(size / (1 << ALLOC_MINBITS))),
+		 * ie, one more than the position of the leftmost one bit in
+		 * (size - 1) >> ALLOC_MINBITS.  We could simply use
+		 *
+		 * pg_leftmost_one_pos32((size - 1) >> ALLOC_MINBITS) + 1
+		 *
+		 * for this, but duplicating the logic here affords us additional
+		 * optimizations:
+		 *
+		 * 1. The compiler can fold ALLOC_MINBITS into other constants,
+		 *    rather than right-shifting as a separate step.
+		 * 2. In the open-coded case, we only need to cope with two
+		 *    bytes of the size value.
 		 */
+#ifdef HAVE__BUILTIN_CLZ
+		idx = 31 - __builtin_clz((uint32) size - 1) - ALLOC_MINBITS + 1;
+#else
+		unsigned int t,
+					tsize;
+
+		StaticAssertStmt(ALLOCSET_NUM_FREELISTS + ALLOC_MINBITS <= 17, "tsize must fit in two bytes");
+
+		tsize = (size - 1) >> ALLOC_MINBITS;
 		t = tsize >> 8;
-		idx = t ? LogTable256[t] + 8 : LogTable256[tsize];
+		idx = t ? pg_leftmost_one_pos[t] + 8 : pg_leftmost_one_pos[tsize];
+		idx += 1;
+#endif
 
 		Assert(idx < ALLOCSET_NUM_FREELISTS);
 	}
-- 
2.22.0

