On Mon, Mar 25, 2024 at 03:05:51PM -0500, Nathan Bossart wrote:
> On Mon, Mar 25, 2024 at 06:42:36PM +0000, Amonson, Paul D wrote:
>> Ok, CI turned green after my re-post of the patches.  Can this please get
>> merged?
> 
> Thanks for the new patches.  I intend to take another look soon.

Thanks for your patience.  I spent most of my afternoon looking into the
latest patch set, but I needed to do a CHECKPOINT and take a break.  I am
in the middle of doing some rather heavy editorialization, but the core of
your changes will remain the same (and so I still intend to give you
authorship credit).  I've attached what I have so far, which is still
missing the configuration checks and the changes to make sure the extra
compiler flags make it to the right places.

Unless something pops up while I work on the remainder of this patch, I
think we'll end up going with a simpler approach.  I originally set out to
make this look like the CRC32C stuff (e.g., a file per implementation), but
that seemed primarily useful if we can choose which files need to be
compiled at configure-time.  However, the TRY_POPCNT_FAST macro is defined
at compile-time (AFAICT for good reason [0]), so we end up having to
compile all the files in many cases anyway, and we continue to need to
surround lots of code with "#ifdef TRY_POPCNT_FAST" or similar.  So, my
current thinking is that we should only move the AVX512 stuff to its own
file for the purposes of compiling it with special flags when possible.  (I
realize that I'm essentially recanting much of my previous feedback, which
I apologize for.)

[0] 
https://postgr.es/m/CAApHDvrONNcYxGV6C0O3ZmaL0BvXBWY%2BrBOCBuYcQVUOURwhkA%40mail.gmail.com

-- 
Nathan Bossart
Amazon Web Services: https://aws.amazon.com
>From 031eb4a365665edd304f0281ad7e412341504749 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nat...@postgresql.org>
Date: Wed, 27 Mar 2024 16:39:24 -0500
Subject: [PATCH v13 1/1] AVX512 popcount support

---
 src/include/port/pg_bitutils.h | 16 +++++++
 src/port/Makefile              |  1 +
 src/port/meson.build           |  1 +
 src/port/pg_bitutils.c         | 53 ++++++++------------
 src/port/pg_popcount_avx512.c  | 88 ++++++++++++++++++++++++++++++++++
 5 files changed, 125 insertions(+), 34 deletions(-)
 create mode 100644 src/port/pg_popcount_avx512.c

diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index 53e5239717..4b1e4d92b4 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -298,6 +298,22 @@ pg_ceil_log2_64(uint64 num)
 #endif
 #endif
 
+/*
+ * We can also try to use the AVX512 popcount instruction on some systems.
+ * The implementation of that is located in its own file because it may
+ * require special compiler flags that we don't want to apply to any other
+ * files.
+ */
+#if defined(TRY_POPCNT_FAST) && \
+	defined(HAVE__IMMINTRIN) && \
+	defined(HAVE__AVX512_POPCNT)
+#if defined(HAVE__GET_CPUID_COUNT) || defined(HAVE__CPUIDEX)
+#define TRY_POPCNT_AVX512 1
+extern bool pg_popcount_avx512_available(void);
+extern uint64 pg_popcount_avx512(const char *buf, int bytes);
+#endif
+#endif
+
 #ifdef TRY_POPCNT_FAST
 /* Attempt to use the POPCNT instruction, but perform a runtime check first */
 extern PGDLLIMPORT int (*pg_popcount32) (uint32 word);
diff --git a/src/port/Makefile b/src/port/Makefile
index dcc8737e68..eb1e56fe41 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -44,6 +44,7 @@ OBJS = \
 	noblock.o \
 	path.o \
 	pg_bitutils.o \
+	pg_popcount_avx512.o \
 	pg_strong_random.o \
 	pgcheckdir.o \
 	pgmkdirp.o \
diff --git a/src/port/meson.build b/src/port/meson.build
index 92b593e6ef..c77bbd3168 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -7,6 +7,7 @@ pgport_sources = [
   'noblock.c',
   'path.c',
   'pg_bitutils.c',
+  'pg_popcount_avx512.c',
   'pg_strong_random.c',
   'pgcheckdir.c',
   'pgmkdirp.c',
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index 1197696e97..2f9a6690e0 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -142,20 +142,18 @@ pg_popcount_available(void)
 	return (exx[2] & (1 << 23)) != 0;	/* POPCNT */
 }
 
-/*
- * These functions get called on the first call to pg_popcount32 etc.
- * They detect whether we can use the asm implementations, and replace
- * the function pointers so that subsequent calls are routed directly to
- * the chosen implementation.
- */
-static int
-pg_popcount32_choose(uint32 word)
+static inline void
+choose_popcount_functions(void)
 {
 	if (pg_popcount_available())
 	{
 		pg_popcount32 = pg_popcount32_fast;
 		pg_popcount64 = pg_popcount64_fast;
 		pg_popcount = pg_popcount_fast;
+#ifdef TRY_POPCNT_AVX512
+		if (pg_popcount_avx512_available())
+			pg_popcount = pg_popcount_avx512;
+#endif
 	}
 	else
 	{
@@ -163,45 +161,32 @@ pg_popcount32_choose(uint32 word)
 		pg_popcount64 = pg_popcount64_slow;
 		pg_popcount = pg_popcount_slow;
 	}
+}
 
+/*
+ * These functions get called on the first call to pg_popcount32 etc.
+ * They detect whether we can use the asm implementations, and replace
+ * the function pointers so that subsequent calls are routed directly to
+ * the chosen implementation.
+ */
+static int
+pg_popcount32_choose(uint32 word)
+{
+	choose_popcount_functions();
 	return pg_popcount32(word);
 }
 
 static int
 pg_popcount64_choose(uint64 word)
 {
-	if (pg_popcount_available())
-	{
-		pg_popcount32 = pg_popcount32_fast;
-		pg_popcount64 = pg_popcount64_fast;
-		pg_popcount = pg_popcount_fast;
-	}
-	else
-	{
-		pg_popcount32 = pg_popcount32_slow;
-		pg_popcount64 = pg_popcount64_slow;
-		pg_popcount = pg_popcount_slow;
-	}
-
+	choose_popcount_functions();
 	return pg_popcount64(word);
 }
 
 static uint64
 pg_popcount_choose(const char *buf, int bytes)
 {
-	if (pg_popcount_available())
-	{
-		pg_popcount32 = pg_popcount32_fast;
-		pg_popcount64 = pg_popcount64_fast;
-		pg_popcount = pg_popcount_fast;
-	}
-	else
-	{
-		pg_popcount32 = pg_popcount32_slow;
-		pg_popcount64 = pg_popcount64_slow;
-		pg_popcount = pg_popcount_slow;
-	}
-
+	choose_popcount_functions();
 	return pg_popcount(buf, bytes);
 }
 
diff --git a/src/port/pg_popcount_avx512.c b/src/port/pg_popcount_avx512.c
new file mode 100644
index 0000000000..7c595a4b33
--- /dev/null
+++ b/src/port/pg_popcount_avx512.c
@@ -0,0 +1,88 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_popcount_avx512.c
+ *	  Holds the pg_popcount() implementation that uses AVX512 instructions.
+ *
+ * Copyright (c) 2019-2024, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/port/pg_popcount_avx512.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+
+#include "port/pg_bitutils.h"
+
+/*
+ * XXX: Someday we should figure out how to determine whether this file needs
+ * to comiled at configure-time instead of relying on macros that are
+ * determined at compile-time.
+ */
+#ifdef TRY_POPCOUNT_AVX512
+
+/*
+ * Return true if CPUID indicates that the AVX512 POPCNT instruction is
+ * available.
+ */
+bool
+pg_popcount_avx512_available(void)
+{
+	unsigned int exx[4] = {0, 0, 0, 0};
+
+#if defined(HAVE__GET_CPUID_COUNT)
+	__get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUIDEX)
+	__cpuidex(exx, 7, 0);
+#else
+#error cpuid instruction not available
+#endif
+
+	if ((exx[1] & (1 << 16)) != 0 &&
+		(exx[2] & (1 << 14)) != 0)
+	{
+		/*
+		 * We also need to check that the OS has enabled support for the ZMM
+		 * registers.
+		 */
+#ifdef _MSC_VER
+		return (_xgetbv(0) & 0xe0) != 0;
+#else
+		uint64		xcr = 0;
+		uint32		high;
+		uint32		low;
+
+__asm__ __volatile__(" xgetbv\n":"=a"(low), "=d"(high):"c"(xcr));
+		return (low & 0xe0) != 0;
+#endif
+	}
+
+	return false;
+}
+
+/*
+ * pg_popcount_avx512
+ *		Returns the number of 1-bits in buf
+ */
+uint64
+pg_popcount_avx512(const char *buf, int bytes)
+{
+	uint64		popcnt;
+	__m512i		accum = _mm512_setzero_si512();
+
+	for (; bytes >= sizeof(__m512i); bytes -= sizeof(__m512i))
+	{
+		const		__m512i val = _mm512_loadu_si512((const __m512i *) buf);
+		const		__m512i cnt = _mm512_popcnt_epi64(val);
+
+		accum = _mm512_add_epi64(accum, cnt);
+
+		buf += sizeof(__m512i);
+	}
+
+	popcnt = _mm512_reduce_add_epi64(accum);
+
+	return popcnt + pg_popcount_fast(buf, bytes);
+}
+
+#endif							/* TRY_POPCOUNT_AVX512 */
-- 
2.25.1

Reply via email to