From ba867ed01a1a25b2603eeda13a44e94a0a61648e Mon Sep 17 00:00:00 2001
From: Paul Amonson <paul.d.amonson@intel.com>
Date: Thu, 21 Mar 2024 11:19:23 -0700
Subject: [PATCH 1/2] [Refactor] Split pg_popcount functions into multiple
 files.

Signed-off-by: Paul Amonson <paul.d.amonson@intel.com>
---
 src/include/port/pg_bitutils.h       |   6 +-
 src/port/Makefile                    |   2 +
 src/port/meson.build                 |   2 +
 src/port/pg_bitutils.c               | 232 +++------------------------
 src/port/pg_popcount_x86_64_accel.c  | 134 ++++++++++++++++
 src/port/pg_popcount_x86_64_choose.c | 158 ++++++++++++++++++
 6 files changed, 324 insertions(+), 210 deletions(-)
 create mode 100644 src/port/pg_popcount_x86_64_accel.c
 create mode 100644 src/port/pg_popcount_x86_64_choose.c

diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index 53e5239717..26f6a48377 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -305,11 +305,13 @@ extern PGDLLIMPORT int (*pg_popcount64) (uint64 word);
 extern PGDLLIMPORT uint64 (*pg_popcount) (const char *buf, int bytes);
 
 #else
-/* Use a portable implementation -- no need for a function pointer. */
+/*
+ *  Use a portable implementation -- no need for a function pointer. Use
+  * inlining for small speed increase.
+ */
 extern int	pg_popcount32(uint32 word);
 extern int	pg_popcount64(uint64 word);
 extern uint64 pg_popcount(const char *buf, int bytes);
-
 #endif							/* TRY_POPCNT_FAST */
 
 /*
diff --git a/src/port/Makefile b/src/port/Makefile
index dcc8737e68..1499985dfc 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -44,6 +44,8 @@ OBJS = \
 	noblock.o \
 	path.o \
 	pg_bitutils.o \
+	pg_popcount_x86_64_choose.o \
+	pg_popcount_x86_64_accel.o \
 	pg_strong_random.o \
 	pgcheckdir.o \
 	pgmkdirp.o \
diff --git a/src/port/meson.build b/src/port/meson.build
index 92b593e6ef..cf6e9fa06c 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -7,6 +7,8 @@ pgport_sources = [
   'noblock.c',
   'path.c',
   'pg_bitutils.c',
+  'pg_popcount_x86_64_choose.c',
+  'pg_popcount_x86_64_accel.c',
   'pg_strong_random.c',
   'pgcheckdir.c',
   'pgmkdirp.c',
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index 1197696e97..21a4d0ca97 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -21,7 +21,6 @@
 
 #include "port/pg_bitutils.h"
 
-
 /*
  * Array giving the position of the left-most set bit for each possible
  * byte value.  We count the right-most position as the 0th bit, and the
@@ -103,196 +102,46 @@ const uint8 pg_number_of_ones[256] = {
 	4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
 };
 
-static inline int pg_popcount32_slow(uint32 word);
-static inline int pg_popcount64_slow(uint64 word);
-static uint64 pg_popcount_slow(const char *buf, int bytes);
-
-#ifdef TRY_POPCNT_FAST
-static bool pg_popcount_available(void);
-static int	pg_popcount32_choose(uint32 word);
-static int	pg_popcount64_choose(uint64 word);
-static uint64 pg_popcount_choose(const char *buf, int bytes);
-static inline int pg_popcount32_fast(uint32 word);
-static inline int pg_popcount64_fast(uint64 word);
-static uint64 pg_popcount_fast(const char *buf, int bytes);
-
-int			(*pg_popcount32) (uint32 word) = pg_popcount32_choose;
-int			(*pg_popcount64) (uint64 word) = pg_popcount64_choose;
-uint64		(*pg_popcount) (const char *buf, int bytes) = pg_popcount_choose;
-#endif							/* TRY_POPCNT_FAST */
-
-#ifdef TRY_POPCNT_FAST
-
+#ifndef TRY_POPCNT_FAST
 /*
- * Return true if CPUID indicates that the POPCNT instruction is available.
+ * Optimize function signature if using the slow functions.
  */
-static bool
-pg_popcount_available(void)
-{
-	unsigned int exx[4] = {0, 0, 0, 0};
+#define INLINE static inline
 
-#if defined(HAVE__GET_CPUID)
-	__get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
-#elif defined(HAVE__CPUID)
-	__cpuid(exx, 1);
 #else
-#error cpuid instruction not available
-#endif
-
-	return (exx[2] & (1 << 23)) != 0;	/* POPCNT */
-}
-
-/*
- * These functions get called on the first call to pg_popcount32 etc.
- * They detect whether we can use the asm implementations, and replace
- * the function pointers so that subsequent calls are routed directly to
- * the chosen implementation.
- */
-static int
-pg_popcount32_choose(uint32 word)
-{
-	if (pg_popcount_available())
-	{
-		pg_popcount32 = pg_popcount32_fast;
-		pg_popcount64 = pg_popcount64_fast;
-		pg_popcount = pg_popcount_fast;
-	}
-	else
-	{
-		pg_popcount32 = pg_popcount32_slow;
-		pg_popcount64 = pg_popcount64_slow;
-		pg_popcount = pg_popcount_slow;
-	}
-
-	return pg_popcount32(word);
-}
-
-static int
-pg_popcount64_choose(uint64 word)
-{
-	if (pg_popcount_available())
-	{
-		pg_popcount32 = pg_popcount32_fast;
-		pg_popcount64 = pg_popcount64_fast;
-		pg_popcount = pg_popcount_fast;
-	}
-	else
-	{
-		pg_popcount32 = pg_popcount32_slow;
-		pg_popcount64 = pg_popcount64_slow;
-		pg_popcount = pg_popcount_slow;
-	}
-
-	return pg_popcount64(word);
-}
-
-static uint64
-pg_popcount_choose(const char *buf, int bytes)
-{
-	if (pg_popcount_available())
-	{
-		pg_popcount32 = pg_popcount32_fast;
-		pg_popcount64 = pg_popcount64_fast;
-		pg_popcount = pg_popcount_fast;
-	}
-	else
-	{
-		pg_popcount32 = pg_popcount32_slow;
-		pg_popcount64 = pg_popcount64_slow;
-		pg_popcount = pg_popcount_slow;
-	}
+#define INLINE
+#endif							/* !TRY_POPCNT_FAST */
 
-	return pg_popcount(buf, bytes);
-}
+/* Forward References */
+INLINE int pg_popcount32_slow(uint32 word);
+INLINE int pg_popcount64_slow(uint64 word);
+INLINE uint64 pg_popcount_slow(const char *buf, int bytes);
 
-/*
- * pg_popcount32_fast
- *		Return the number of 1 bits set in word
- */
-static inline int
-pg_popcount32_fast(uint32 word)
+#ifndef TRY_POPCNT_FAST
+/* Slow function defintions for exported functions. */
+int
+pg_popcount32(uint32 word)
 {
-#ifdef _MSC_VER
-	return __popcnt(word);
-#else
-	uint32		res;
-
-__asm__ __volatile__(" popcntl %1,%0\n":"=q"(res):"rm"(word):"cc");
-	return (int) res;
-#endif
+	return pg_popcount32_slow(word);
 }
 
-/*
- * pg_popcount64_fast
- *		Return the number of 1 bits set in word
- */
-static inline int
-pg_popcount64_fast(uint64 word)
+int
+pg_popcount64(uint64 word)
 {
-#ifdef _MSC_VER
-	return __popcnt64(word);
-#else
-	uint64		res;
-
-__asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc");
-	return (int) res;
-#endif
+	return pg_popcount64_slow(word);
 }
-
-/*
- * pg_popcount_fast
- *		Returns the number of 1-bits in buf
- */
-static uint64
-pg_popcount_fast(const char *buf, int bytes)
+uint64
+pg_popcount(const char *buf, int bytes)
 {
-	uint64		popcnt = 0;
-
-#if SIZEOF_VOID_P >= 8
-	/* Process in 64-bit chunks if the buffer is aligned. */
-	if (buf == (const char *) TYPEALIGN(8, buf))
-	{
-		const uint64 *words = (const uint64 *) buf;
-
-		while (bytes >= 8)
-		{
-			popcnt += pg_popcount64_fast(*words++);
-			bytes -= 8;
-		}
-
-		buf = (const char *) words;
-	}
-#else
-	/* Process in 32-bit chunks if the buffer is aligned. */
-	if (buf == (const char *) TYPEALIGN(4, buf))
-	{
-		const uint32 *words = (const uint32 *) buf;
-
-		while (bytes >= 4)
-		{
-			popcnt += pg_popcount32_fast(*words++);
-			bytes -= 4;
-		}
-
-		buf = (const char *) words;
-	}
-#endif
-
-	/* Process any remaining bytes */
-	while (bytes--)
-		popcnt += pg_number_of_ones[(unsigned char) *buf++];
-
-	return popcnt;
+	return pg_popcount_slow(buf, bytes);
 }
-
-#endif							/* TRY_POPCNT_FAST */
-
+#endif							/* !TRY_POPCNT_FAST */
 
 /*
  * pg_popcount32_slow
  *		Return the number of 1 bits set in word
  */
-static inline int
+INLINE int
 pg_popcount32_slow(uint32 word)
 {
 #ifdef HAVE__BUILTIN_POPCOUNT
@@ -314,7 +163,7 @@ pg_popcount32_slow(uint32 word)
  * pg_popcount64_slow
  *		Return the number of 1 bits set in word
  */
-static inline int
+INLINE int
 pg_popcount64_slow(uint64 word)
 {
 #ifdef HAVE__BUILTIN_POPCOUNT
@@ -342,7 +191,7 @@ pg_popcount64_slow(uint64 word)
  * pg_popcount_slow
  *		Returns the number of 1-bits in buf
  */
-static uint64
+INLINE uint64
 pg_popcount_slow(const char *buf, int bytes)
 {
 	uint64		popcnt = 0;
@@ -383,36 +232,3 @@ pg_popcount_slow(const char *buf, int bytes)
 
 	return popcnt;
 }
-
-#ifndef TRY_POPCNT_FAST
-
-/*
- * When the POPCNT instruction is not available, there's no point in using
- * function pointers to vary the implementation between the fast and slow
- * method.  We instead just make these actual external functions when
- * TRY_POPCNT_FAST is not defined.  The compiler should be able to inline
- * the slow versions here.
- */
-int
-pg_popcount32(uint32 word)
-{
-	return pg_popcount32_slow(word);
-}
-
-int
-pg_popcount64(uint64 word)
-{
-	return pg_popcount64_slow(word);
-}
-
-/*
- * pg_popcount
- *		Returns the number of 1-bits in buf
- */
-uint64
-pg_popcount(const char *buf, int bytes)
-{
-	return pg_popcount_slow(buf, bytes);
-}
-
-#endif							/* !TRY_POPCNT_FAST */
diff --git a/src/port/pg_popcount_x86_64_accel.c b/src/port/pg_popcount_x86_64_accel.c
new file mode 100644
index 0000000000..d5500d56e7
--- /dev/null
+++ b/src/port/pg_popcount_x86_64_accel.c
@@ -0,0 +1,134 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_popcount_x86_64_accel.c
+ *	  Miscellaneous functions for bit-wise operations.
+ *
+ * Copyright (c) 2019-2024, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/port/pg_popcount_x86_64_accel.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+#include "port/pg_bitutils.h"
+
+#if defined(HAVE__IMMINTRIN)
+#include <immintrin.h>
+#endif
+
+#ifdef TRY_POPCNT_FAST
+int pg_popcount32_fast(uint32 word);
+int pg_popcount64_fast(uint64 word);
+uint64 pg_popcount_fast(const char *buf, int bytes);
+uint64 pg_popcount512_fast(const char *buf, int bytes);
+
+/*
+ * pg_popcount32_fast
+ *		Return the number of 1 bits set in word
+ */
+int
+pg_popcount32_fast(uint32 word)
+{
+#ifdef _MSC_VER
+	return __popcnt(word);
+#else
+	uint32		res;
+
+__asm__ __volatile__(" popcntl %1,%0\n":"=q"(res):"rm"(word):"cc");
+	return (int) res;
+#endif
+}
+
+/*
+ * pg_popcount64_fast
+ *		Return the number of 1 bits set in word
+ */
+int
+pg_popcount64_fast(uint64 word)
+{
+#ifdef _MSC_VER
+	return __popcnt64(word);
+#else
+	uint64		res;
+
+__asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc");
+	return (int) res;
+#endif
+}
+
+/*
+ * pg_popcount_fast
+ *		Returns the number of 1-bits in buf
+ */
+uint64
+pg_popcount_fast(const char *buf, int bytes)
+{
+	uint64		popcnt = 0;
+
+#if SIZEOF_VOID_P >= 8
+	/* Process in 64-bit chunks if the buffer is aligned. */
+	if (buf == (const char *) TYPEALIGN(8, buf))
+	{
+		const uint64 *words = (const uint64 *) buf;
+
+		while (bytes >= 8)
+		{
+			popcnt += pg_popcount64_fast(*words++);
+			bytes -= 8;
+		}
+
+		buf = (const char *) words;
+	}
+#else
+	/* Process in 32-bit chunks if the buffer is aligned. */
+	if (buf == (const char *) TYPEALIGN(4, buf))
+	{
+		const uint32 *words = (const uint32 *) buf;
+
+		while (bytes >= 4)
+		{
+			popcnt += pg_popcount32_fast(*words++);
+			bytes -= 4;
+		}
+
+		buf = (const char *) words;
+	}
+#endif
+
+	/* Process any remaining bytes */
+	while (bytes--)
+		popcnt += pg_number_of_ones[(unsigned char) *buf++];
+
+	return popcnt;
+}
+
+/*
+ * Use AVX-512 Intrinsics for supported CPUs or fall back the non-152 fast
+ * implem entation and use the best 64 bit fast methods. If no fast
+ * methods are used this will fall back to __builtin_* or pure software.
+ */
+uint64
+pg_popcount512_fast(const char *buf, int bytes)
+{
+	uint64 popcnt = 0;
+ #if defined(HAVE__IMMINTRIN) && HAVE__AVX512_POPCNT == 1
+	__m512i accumulator = _mm512_setzero_si512();
+
+	while (bytes >= 64)
+	{
+		const __m512i v = _mm512_loadu_si512((const __m512i *)buf);
+		const __m512i p = _mm512_popcnt_epi64(v);
+
+		accumulator = _mm512_add_epi64(accumulator, p);
+		bytes -= 64;
+		buf += 64;
+	}
+
+	popcnt = _mm512_reduce_add_epi64(accumulator);
+#endif 				/* defined(HAVE__IMMINTRIN) && HAVE__AVX512_POPCNT == 1 */
+
+	/* Process any remaining bytes */
+	return popcnt + pg_popcount_fast(buf, bytes);
+}
+#endif							/* TRY_POPCNT_FAST */
diff --git a/src/port/pg_popcount_x86_64_choose.c b/src/port/pg_popcount_x86_64_choose.c
new file mode 100644
index 0000000000..e73d1999ad
--- /dev/null
+++ b/src/port/pg_popcount_x86_64_choose.c
@@ -0,0 +1,158 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_popcount_x86_64_choose.c
+ *	  Miscellaneous functions for bit-wise operations.
+ *
+ * Copyright (c) 2019-2024, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/port/pg_popcount_x86_64_choose.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+#include "port/pg_bitutils.h"
+
+#ifdef TRY_POPCNT_FAST
+
+#ifdef HAVE__GET_CPUID
+#include <cpuid.h>
+#endif
+#ifdef HAVE__CPUID
+#include <intrin.h>
+#endif
+
+static bool pg_popcount_available(void);
+static int	pg_popcount32_choose(uint32 word);
+static int	pg_popcount64_choose(uint64 word);
+static uint64 pg_popcount_choose(const char *buf, int bytes);
+extern int pg_popcount32_fast(uint32 word);
+extern int pg_popcount64_fast(uint64 word);
+extern uint64 pg_popcount_fast(const char *buf, int bytes);
+extern uint64 pg_popcount512_fast(const char *buf, int bytes);
+extern int pg_popcount32_slow(uint32 word);
+extern int pg_popcount64_slow(uint64 word);
+extern uint64 pg_popcount_slow(const char *buf, int bytes);
+
+int			(*pg_popcount32) (uint32 word) = pg_popcount32_choose;
+int			(*pg_popcount64) (uint64 word) = pg_popcount64_choose;
+uint64		(*pg_popcount) (const char *buf, int bytes) = pg_popcount_choose;
+
+/*
+ * Return true if CPUID indicates that the POPCNT instruction is available.
+ */
+static bool
+pg_popcount_available(void)
+{
+	unsigned int exx[4] = {0, 0, 0, 0};
+
+#if defined(HAVE__GET_CPUID)
+	__get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUID)
+	__cpuid(exx, 1);
+#else
+#error cpuid instruction not available
+#endif
+
+	return (exx[2] & (1 << 23)) != 0;	/* POPCNT */
+}
+
+/*
+ * Return true if CPUID indicates that the AVX512_POPCNT instruction is
+ * available. This is similar to the method above; see
+ * https://en.wikipedia.org/wiki/CPUID#EAX=7,_ECX=0:_Extended_Features
+ *
+ * Finally, we make sure the xgetbv result is consistent with the CPUID
+ * results.
+ */
+static bool
+pg_popcount512_available(void)
+{
+	unsigned int exx[4] = {0, 0, 0, 0};
+
+	/* Check for AVX512VPOPCNTDQ and AVX512F */
+#if defined(HAVE__GET_CPUID_COUNT)
+	__get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUIDEX)
+	__cpuidex(exx, 7, 0);
+#endif
+
+	if ((exx[2] & (0x00004000)) != 0 && (exx[1] & (0x00010000)) != 0)
+	{
+		/*
+		 * CPUID succeeded, does the current running OS support the
+		 * ZMM registers which are required for AVX512? This check is
+		 * required to make sure an old OS on a new CPU is correctly
+		 * checked or a VM hypervisor is not excluding AVX512 ZMM
+		 * support in the VM; see "5.1.9 Detection of AVX Instructions"
+		 * https://www.intel.com/content/www/us/en/content-details/671488/intel-64-and-ia-32-architectures-optimization-reference-manual-volume-1.html
+		 */
+		uint64 xcr = 0;
+#ifdef _MSC_VER
+		uint64 highlow = _xgetbv(xcr);
+
+		return (highlow & 0xE0) != 0;
+#else
+		uint32 high;
+		uint32 low;
+
+		__asm__ __volatile__("xgetbv\t\n" : "=a"(low), "=d"(high) : "c"(xcr));
+		return (low & 0xE0) != 0;
+#endif
+	} /* POPCNT 512 */
+	return false;
+}
+
+/*
+ * These functions get called on the first call to pg_popcount32 etc.
+ * They detect whether we can use the asm implementations, and replace
+ * the function pointers so that subsequent calls are routed directly to
+ * the chosen implementation.
+ */
+static inline void set_function_pointers()
+{
+if (pg_popcount512_available())
+	{
+		pg_popcount32 = pg_popcount32_fast;
+		pg_popcount64 = pg_popcount64_fast;
+		pg_popcount = pg_popcount_fast;
+	}
+	else
+	{
+		if (pg_popcount_available())
+		{
+			pg_popcount32 = pg_popcount32_fast;
+			pg_popcount64 = pg_popcount64_fast;
+			pg_popcount = pg_popcount_fast;
+		}
+		else
+		{
+			pg_popcount32 = pg_popcount32_slow;
+			pg_popcount64 = pg_popcount64_slow;
+			pg_popcount = pg_popcount_slow;
+		}
+	}
+}
+
+static inline int
+pg_popcount32_choose(uint32 word)
+{
+	set_function_pointers();
+	return pg_popcount32(word);
+}
+
+static inline int
+pg_popcount64_choose(uint64 word)
+{
+	set_function_pointers();
+	return pg_popcount64(word);
+}
+
+static inline uint64
+pg_popcount_choose(const char *buf, int bytes)
+{
+	set_function_pointers();
+	return pg_popcount(buf, bytes);
+}
+
+#endif							/* TRY_POPCNT_FAST */
-- 
2.34.1

