Here is a v19 of the patch set.  I moved out the refactoring of the
function pointer selection code to 0001.  I think this is a good change
independent of $SUBJECT, and I plan to commit this soon.  In 0002, I
changed the syslogger.c usage of pg_popcount() to use pg_number_of_ones
instead.  This is standard practice elsewhere where the popcount functions
are unlikely to win.  I'll probably commit this one soon, too, as it's even
more trivial than 0001.

0003 is the AVX512 POPCNT patch.  Besides refactoring out 0001, there are
no changes from v18.  0004 is an early proof-of-concept for using AVX512
for the visibility map code.  The code is missing comments, and I haven't
performed any benchmarking yet, but I figured I'd post it because it
demonstrates how it's possible to build upon 0003 in other areas.

AFAICT the main open question is the function call overhead in 0003 that
Alvaro brought up earlier.  After 0002 is committed, I believe the only
in-tree caller of pg_popcount() with very few bytes is bit_count(), and I'm
not sure it's worth expending too much energy to make sure there are
absolutely no regressions there.  However, I'm happy to do so if folks feel
that it is necessary, and I'd be grateful for thoughts on how to proceed on
this one.

-- 
Nathan Bossart
Amazon Web Services: https://aws.amazon.com
>From cedad23b7b35e77fde164b1d577c37fb07a578c6 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nat...@postgresql.org>
Date: Mon, 1 Apr 2024 16:37:53 -0500
Subject: [PATCH v19 1/4] refactor popcount function choosing

---
 src/port/pg_bitutils.c | 37 +++++++++----------------------------
 1 file changed, 9 insertions(+), 28 deletions(-)

diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index 1197696e97..28312f3dd9 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -148,8 +148,8 @@ pg_popcount_available(void)
  * the function pointers so that subsequent calls are routed directly to
  * the chosen implementation.
  */
-static int
-pg_popcount32_choose(uint32 word)
+static inline void
+choose_popcount_functions(void)
 {
 	if (pg_popcount_available())
 	{
@@ -163,45 +163,26 @@ pg_popcount32_choose(uint32 word)
 		pg_popcount64 = pg_popcount64_slow;
 		pg_popcount = pg_popcount_slow;
 	}
+}
 
+static int
+pg_popcount32_choose(uint32 word)
+{
+	choose_popcount_functions();
 	return pg_popcount32(word);
 }
 
 static int
 pg_popcount64_choose(uint64 word)
 {
-	if (pg_popcount_available())
-	{
-		pg_popcount32 = pg_popcount32_fast;
-		pg_popcount64 = pg_popcount64_fast;
-		pg_popcount = pg_popcount_fast;
-	}
-	else
-	{
-		pg_popcount32 = pg_popcount32_slow;
-		pg_popcount64 = pg_popcount64_slow;
-		pg_popcount = pg_popcount_slow;
-	}
-
+	choose_popcount_functions();
 	return pg_popcount64(word);
 }
 
 static uint64
 pg_popcount_choose(const char *buf, int bytes)
 {
-	if (pg_popcount_available())
-	{
-		pg_popcount32 = pg_popcount32_fast;
-		pg_popcount64 = pg_popcount64_fast;
-		pg_popcount = pg_popcount_fast;
-	}
-	else
-	{
-		pg_popcount32 = pg_popcount32_slow;
-		pg_popcount64 = pg_popcount64_slow;
-		pg_popcount = pg_popcount_slow;
-	}
-
+	choose_popcount_functions();
 	return pg_popcount(buf, bytes);
 }
 
-- 
2.25.1

>From 038b74045b006c5d8a5470364f2041370ec0b083 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nat...@postgresql.org>
Date: Mon, 1 Apr 2024 16:47:22 -0500
Subject: [PATCH v19 2/4] use pg_number_of_ones instead of pg_popcount for
 single byte

---
 src/backend/postmaster/syslogger.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/backend/postmaster/syslogger.c b/src/backend/postmaster/syslogger.c
index 08efe74cc9..437947dbb9 100644
--- a/src/backend/postmaster/syslogger.c
+++ b/src/backend/postmaster/syslogger.c
@@ -898,7 +898,7 @@ process_pipe_input(char *logbuffer, int *bytes_in_logbuffer)
 		if (p.nuls[0] == '\0' && p.nuls[1] == '\0' &&
 			p.len > 0 && p.len <= PIPE_MAX_PAYLOAD &&
 			p.pid != 0 &&
-			pg_popcount((char *) &dest_flags, 1) == 1)
+			pg_number_of_ones[dest_flags] == 1)
 		{
 			List	   *buffer_list;
 			ListCell   *cell;
-- 
2.25.1

>From 73ee8d6018b047856e63ad075641a0dcfe889417 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nat...@postgresql.org>
Date: Wed, 27 Mar 2024 16:39:24 -0500
Subject: [PATCH v19 3/4] AVX512 popcount support

---
 config/c-compiler.m4                 |  58 ++++++
 configure                            | 252 +++++++++++++++++++++++++++
 configure.ac                         |  51 ++++++
 meson.build                          |  87 +++++++++
 src/Makefile.global.in               |   5 +
 src/include/pg_config.h.in           |  12 ++
 src/include/port/pg_bitutils.h       |  15 ++
 src/makefiles/meson.build            |   4 +-
 src/port/Makefile                    |  11 ++
 src/port/meson.build                 |   6 +-
 src/port/pg_bitutils.c               |   7 +-
 src/port/pg_popcount_avx512.c        |  49 ++++++
 src/port/pg_popcount_avx512_choose.c |  71 ++++++++
 src/test/regress/expected/bit.out    |  24 +++
 src/test/regress/sql/bit.sql         |   4 +
 15 files changed, 651 insertions(+), 5 deletions(-)
 create mode 100644 src/port/pg_popcount_avx512.c
 create mode 100644 src/port/pg_popcount_avx512_choose.c

diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 3268a780bb..5fb60775ca 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -694,3 +694,61 @@ if test x"$Ac_cachevar" = x"yes"; then
 fi
 undefine([Ac_cachevar])dnl
 ])# PGAC_LOONGARCH_CRC32C_INTRINSICS
+
+# PGAC_XSAVE_INTRINSICS
+# ---------------------
+# Check if the compiler supports the XSAVE instructions using the _xgetbv
+# intrinsic function.
+#
+# An optional compiler flag can be passed as argument (e.g., -mxsave).  If the
+# intrinsic is supported, sets pgac_xsave_intrinsics and CFLAGS_XSAVE.
+AC_DEFUN([PGAC_XSAVE_INTRINSICS],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_xsave_intrinsics_$1])])dnl
+AC_CACHE_CHECK([for _xgetbv with CFLAGS=$1], [Ac_cachevar],
+[pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS $1"
+AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>],
+  [return _xgetbv(0) & 0xe0;])],
+  [Ac_cachevar=yes],
+  [Ac_cachevar=no])
+CFLAGS="$pgac_save_CFLAGS"])
+if test x"$Ac_cachevar" = x"yes"; then
+  CFLAGS_XSAVE="$1"
+  pgac_xsave_intrinsics=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_XSAVE_INTRINSICS
+
+# PGAC_AVX512_POPCNT_INTRINSICS
+# -----------------------------
+# Check if the compiler supports the AVX512 POPCNT instructions using the
+# _mm512_setzero_si512, _mm512_loadu_si512, _mm512_popcnt_epi64,
+# _mm512_add_epi64, and _mm512_reduce_add_epi64 intrinsic functions.
+#
+# An optional compiler flag can be passed as argument
+# (e.g., -mavx512vpopcntdq).  If the intrinsics are supported, sets
+# pgac_avx512_popcnt_intrinsics and CFLAGS_POPCNT.
+AC_DEFUN([PGAC_AVX512_POPCNT_INTRINSICS],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx512_popcnt_intrinsics_$1])])dnl
+AC_CACHE_CHECK([for _mm512_popcnt_epi64 with CFLAGS=$1], [Ac_cachevar],
+[pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS $1"
+AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>],
+  [const char buf@<:@sizeof(__m512i)@:>@;
+   PG_INT64_TYPE popcnt = 0;
+   __m512i accum = _mm512_setzero_si512();
+   const __m512i val = _mm512_loadu_si512((const __m512i *) buf);
+   const __m512i cnt = _mm512_popcnt_epi64(val);
+   accum = _mm512_add_epi64(accum, cnt);
+   popcnt = _mm512_reduce_add_epi64(accum);
+   /* return computed value, to prevent the above being optimized away */
+   return popcnt == 0;])],
+  [Ac_cachevar=yes],
+  [Ac_cachevar=no])
+CFLAGS="$pgac_save_CFLAGS"])
+if test x"$Ac_cachevar" = x"yes"; then
+  CFLAGS_POPCNT="$1"
+  pgac_avx512_popcnt_intrinsics=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_AVX512_POPCNT_INTRINSICS
diff --git a/configure b/configure
index 36feeafbb2..b48ed7f271 100755
--- a/configure
+++ b/configure
@@ -647,6 +647,9 @@ MSGFMT_FLAGS
 MSGFMT
 PG_CRC32C_OBJS
 CFLAGS_CRC
+PG_POPCNT_OBJS
+CFLAGS_POPCNT
+CFLAGS_XSAVE
 LIBOBJS
 OPENSSL
 ZSTD
@@ -17404,6 +17407,40 @@ $as_echo "#define HAVE__GET_CPUID 1" >>confdefs.h
 
 fi
 
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __get_cpuid_count" >&5
+$as_echo_n "checking for __get_cpuid_count... " >&6; }
+if ${pgac_cv__get_cpuid_count+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <cpuid.h>
+int
+main ()
+{
+unsigned int exx[4] = {0, 0, 0, 0};
+  __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv__get_cpuid_count="yes"
+else
+  pgac_cv__get_cpuid_count="no"
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__get_cpuid_count" >&5
+$as_echo "$pgac_cv__get_cpuid_count" >&6; }
+if test x"$pgac_cv__get_cpuid_count" = x"yes"; then
+
+$as_echo "#define HAVE__GET_CPUID_COUNT 1" >>confdefs.h
+
+fi
+
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __cpuid" >&5
 $as_echo_n "checking for __cpuid... " >&6; }
 if ${pgac_cv__cpuid+:} false; then :
@@ -17438,6 +17475,221 @@ $as_echo "#define HAVE__CPUID 1" >>confdefs.h
 
 fi
 
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __cpuidex" >&5
+$as_echo_n "checking for __cpuidex... " >&6; }
+if ${pgac_cv__cpuidex+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <intrin.h>
+int
+main ()
+{
+unsigned int exx[4] = {0, 0, 0, 0};
+  __get_cpuidex(exx[0], 7, 0);
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv__cpuidex="yes"
+else
+  pgac_cv__cpuidex="no"
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__cpuidex" >&5
+$as_echo "$pgac_cv__cpuidex" >&6; }
+if test x"$pgac_cv__cpuidex" = x"yes"; then
+
+$as_echo "#define HAVE__CPUIDEX 1" >>confdefs.h
+
+fi
+
+# Check for XSAVE intrinsics
+#
+CFLAGS_XSAVE=""
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _xgetbv with CFLAGS=" >&5
+$as_echo_n "checking for _xgetbv with CFLAGS=... " >&6; }
+if ${pgac_cv_xsave_intrinsics_+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS "
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <immintrin.h>
+int
+main ()
+{
+return _xgetbv(0) & 0xe0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_xsave_intrinsics_=yes
+else
+  pgac_cv_xsave_intrinsics_=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+CFLAGS="$pgac_save_CFLAGS"
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_xsave_intrinsics_" >&5
+$as_echo "$pgac_cv_xsave_intrinsics_" >&6; }
+if test x"$pgac_cv_xsave_intrinsics_" = x"yes"; then
+  CFLAGS_XSAVE=""
+  pgac_xsave_intrinsics=yes
+fi
+
+if test x"$pgac_xsave_intrinsics" != x"yes"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _xgetbv with CFLAGS=-mxsave" >&5
+$as_echo_n "checking for _xgetbv with CFLAGS=-mxsave... " >&6; }
+if ${pgac_cv_xsave_intrinsics__mxsave+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS -mxsave"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <immintrin.h>
+int
+main ()
+{
+return _xgetbv(0) & 0xe0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_xsave_intrinsics__mxsave=yes
+else
+  pgac_cv_xsave_intrinsics__mxsave=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+CFLAGS="$pgac_save_CFLAGS"
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_xsave_intrinsics__mxsave" >&5
+$as_echo "$pgac_cv_xsave_intrinsics__mxsave" >&6; }
+if test x"$pgac_cv_xsave_intrinsics__mxsave" = x"yes"; then
+  CFLAGS_XSAVE="-mxsave"
+  pgac_xsave_intrinsics=yes
+fi
+
+fi
+if test x"$pgac_xsave_intrinsics" = x"yes"; then
+
+$as_echo "#define HAVE_XSAVE_INTRINSICS 1" >>confdefs.h
+
+fi
+
+
+# Check for AVX512 popcount intrinsics
+#
+CFLAGS_POPCNT=""
+PG_POPCNT_OBJS=""
+if test x"$host_cpu" = x"x86_64"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_popcnt_epi64 with CFLAGS=" >&5
+$as_echo_n "checking for _mm512_popcnt_epi64 with CFLAGS=... " >&6; }
+if ${pgac_cv_avx512_popcnt_intrinsics_+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS "
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <immintrin.h>
+int
+main ()
+{
+const char buf[sizeof(__m512i)];
+   PG_INT64_TYPE popcnt = 0;
+   __m512i accum = _mm512_setzero_si512();
+   const __m512i val = _mm512_loadu_si512((const __m512i *) buf);
+   const __m512i cnt = _mm512_popcnt_epi64(val);
+   accum = _mm512_add_epi64(accum, cnt);
+   popcnt = _mm512_reduce_add_epi64(accum);
+   /* return computed value, to prevent the above being optimized away */
+   return popcnt == 0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_avx512_popcnt_intrinsics_=yes
+else
+  pgac_cv_avx512_popcnt_intrinsics_=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+CFLAGS="$pgac_save_CFLAGS"
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_popcnt_intrinsics_" >&5
+$as_echo "$pgac_cv_avx512_popcnt_intrinsics_" >&6; }
+if test x"$pgac_cv_avx512_popcnt_intrinsics_" = x"yes"; then
+  CFLAGS_POPCNT=""
+  pgac_avx512_popcnt_intrinsics=yes
+fi
+
+  if test x"$pgac_avx512_popcnt_intrinsics" != x"yes"; then
+    { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_popcnt_epi64 with CFLAGS=-mavx512vpopcntdq" >&5
+$as_echo_n "checking for _mm512_popcnt_epi64 with CFLAGS=-mavx512vpopcntdq... " >&6; }
+if ${pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS -mavx512vpopcntdq"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <immintrin.h>
+int
+main ()
+{
+const char buf[sizeof(__m512i)];
+   PG_INT64_TYPE popcnt = 0;
+   __m512i accum = _mm512_setzero_si512();
+   const __m512i val = _mm512_loadu_si512((const __m512i *) buf);
+   const __m512i cnt = _mm512_popcnt_epi64(val);
+   accum = _mm512_add_epi64(accum, cnt);
+   popcnt = _mm512_reduce_add_epi64(accum);
+   /* return computed value, to prevent the above being optimized away */
+   return popcnt == 0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq=yes
+else
+  pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+CFLAGS="$pgac_save_CFLAGS"
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq" >&5
+$as_echo "$pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq" >&6; }
+if test x"$pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq" = x"yes"; then
+  CFLAGS_POPCNT="-mavx512vpopcntdq"
+  pgac_avx512_popcnt_intrinsics=yes
+fi
+
+  fi
+  if test x"$pgac_avx512_popcnt_intrinsics" = x"yes"; then
+    PG_POPCNT_OBJS="pg_popcount_avx512.o pg_popcount_avx512_choose.o"
+
+$as_echo "#define USE_AVX512_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h
+
+  fi
+fi
+
+
+
 # Check for Intel SSE 4.2 intrinsics to do CRC calculations.
 #
 # First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
diff --git a/configure.ac b/configure.ac
index 57f734879e..2bbd81dfb8 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2052,6 +2052,17 @@ if test x"$pgac_cv__get_cpuid" = x"yes"; then
   AC_DEFINE(HAVE__GET_CPUID, 1, [Define to 1 if you have __get_cpuid.])
 fi
 
+AC_CACHE_CHECK([for __get_cpuid_count], [pgac_cv__get_cpuid_count],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <cpuid.h>],
+  [[unsigned int exx[4] = {0, 0, 0, 0};
+  __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
+  ]])],
+  [pgac_cv__get_cpuid_count="yes"],
+  [pgac_cv__get_cpuid_count="no"])])
+if test x"$pgac_cv__get_cpuid_count" = x"yes"; then
+  AC_DEFINE(HAVE__GET_CPUID_COUNT, 1, [Define to 1 if you have __get_cpuid_count.])
+fi
+
 AC_CACHE_CHECK([for __cpuid], [pgac_cv__cpuid],
 [AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <intrin.h>],
   [[unsigned int exx[4] = {0, 0, 0, 0};
@@ -2063,6 +2074,46 @@ if test x"$pgac_cv__cpuid" = x"yes"; then
   AC_DEFINE(HAVE__CPUID, 1, [Define to 1 if you have __cpuid.])
 fi
 
+AC_CACHE_CHECK([for __cpuidex], [pgac_cv__cpuidex],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <intrin.h>],
+  [[unsigned int exx[4] = {0, 0, 0, 0};
+  __get_cpuidex(exx[0], 7, 0);
+  ]])],
+  [pgac_cv__cpuidex="yes"],
+  [pgac_cv__cpuidex="no"])])
+if test x"$pgac_cv__cpuidex" = x"yes"; then
+  AC_DEFINE(HAVE__CPUIDEX, 1, [Define to 1 if you have __cpuidex.])
+fi
+
+# Check for XSAVE intrinsics
+#
+CFLAGS_XSAVE=""
+PGAC_XSAVE_INTRINSICS([])
+if test x"$pgac_xsave_intrinsics" != x"yes"; then
+  PGAC_XSAVE_INTRINSICS([-mxsave])
+fi
+if test x"$pgac_xsave_intrinsics" = x"yes"; then
+  AC_DEFINE(HAVE_XSAVE_INTRINSICS, 1, [Define to 1 if you have XSAVE intrinsics.])
+fi
+AC_SUBST(CFLAGS_XSAVE)
+
+# Check for AVX512 popcount intrinsics
+#
+CFLAGS_POPCNT=""
+PG_POPCNT_OBJS=""
+if test x"$host_cpu" = x"x86_64"; then
+  PGAC_AVX512_POPCNT_INTRINSICS([])
+  if test x"$pgac_avx512_popcnt_intrinsics" != x"yes"; then
+    PGAC_AVX512_POPCNT_INTRINSICS([-mavx512vpopcntdq])
+  fi
+  if test x"$pgac_avx512_popcnt_intrinsics" = x"yes"; then
+    PG_POPCNT_OBJS="pg_popcount_avx512.o pg_popcount_avx512_choose.o"
+    AC_DEFINE(USE_AVX512_POPCNT_WITH_RUNTIME_CHECK, 1, [Define to 1 to use AVX512 popcount instructions with a runtime check.])
+  fi
+fi
+AC_SUBST(CFLAGS_POPCNT)
+AC_SUBST(PG_POPCNT_OBJS)
+
 # Check for Intel SSE 4.2 intrinsics to do CRC calculations.
 #
 # First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
diff --git a/meson.build b/meson.build
index 18b5be842e..96be29c22b 100644
--- a/meson.build
+++ b/meson.build
@@ -1783,6 +1783,30 @@ elif cc.links('''
 endif
 
 
+# Check for __get_cpuid_count() and __cpuidex() in a similar fashion.
+if cc.links('''
+    #include <cpuid.h>
+    int main(int arg, char **argv)
+    {
+        unsigned int exx[4] = {0, 0, 0, 0};
+        __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
+    }
+    ''', name: '__get_cpuid_count',
+    args: test_c_args)
+  cdata.set('HAVE__GET_CPUID_COUNT', 1)
+elif cc.links('''
+    #include <intrin.h>
+    int main(int arg, char **argv)
+    {
+        unsigned int exx[4] = {0, 0, 0, 0};
+        __cpuidex(exx, 7, 0);
+    }
+    ''', name: '__cpuidex',
+    args: test_c_args)
+  cdata.set('HAVE__CPUIDEX', 1)
+endif
+
+
 # Defend against clang being used on x86-32 without SSE2 enabled.  As current
 # versions of clang do not understand -fexcess-precision=standard, the use of
 # x87 floating point operations leads to problems like isinf possibly returning
@@ -1996,6 +2020,69 @@ int main(void)
 endif
 
 
+###############################################################
+# Check for the availability of XSAVE intrinsics.
+###############################################################
+
+cflags_xsave = []
+if host_cpu == 'x86' or host_cpu == 'x86_64'
+
+  prog = '''
+#include <immintrin.h>
+
+int main(void)
+{
+    return _xgetbv(0) & 0xe0;
+}
+'''
+
+  if cc.links(prog, name: 'XSAVE intrinsics without -mxsave',
+        args: test_c_args)
+    cdata.set('HAVE_XSAVE_INTRINSICS', 1)
+  elif cc.links(prog, name: 'XSAVE intrinsics with -mxsave',
+        args: test_c_args + ['-mxsave'])
+    cdata.set('HAVE_XSAVE_INTRINSICS', 1)
+    cflags_xsave += '-mxsave'
+  endif
+
+endif
+
+
+###############################################################
+# Check for the availability of AVX512 popcount intrinsics.
+###############################################################
+
+cflags_popcnt = []
+if host_cpu == 'x86_64'
+
+  prog = '''
+#include <immintrin.h>
+
+int main(void)
+{
+    const char buf[sizeof(__m512i)];
+    INT64 popcnt = 0;
+    __m512i accum = _mm512_setzero_si512();
+    const __m512i val = _mm512_loadu_si512((const __m512i *) buf);
+    const __m512i cnt = _mm512_popcnt_epi64(val);
+    accum = _mm512_add_epi64(accum, cnt);
+    popcnt = _mm512_reduce_add_epi64(accum);
+    /* return computed value, to prevent the above being optimized away */
+    return popcnt == 0;
+}
+'''
+
+  if cc.links(prog, name: 'AVX512 popcount without -mavx512vpopcntdq',
+        args: test_c_args + ['-DINT64=@0@'.format(cdata.get('PG_INT64_TYPE'))])
+    cdata.set('USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 1)
+  elif cc.links(prog, name: 'AVX512 popcount with -mavx512vpopcntdq',
+        args: test_c_args + ['-DINT64=@0@'.format(cdata.get('PG_INT64_TYPE'))] + ['-mavx512vpopcntdq'])
+    cdata.set('USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 1)
+    cflags_popcnt += '-mavx512vpopcntdq'
+  endif
+
+endif
+
 
 ###############################################################
 # Select CRC-32C implementation.
diff --git a/src/Makefile.global.in b/src/Makefile.global.in
index 8b3f8c24e0..36d880d225 100644
--- a/src/Makefile.global.in
+++ b/src/Makefile.global.in
@@ -262,7 +262,9 @@ CFLAGS_SL_MODULE = @CFLAGS_SL_MODULE@
 CXXFLAGS_SL_MODULE = @CXXFLAGS_SL_MODULE@
 CFLAGS_UNROLL_LOOPS = @CFLAGS_UNROLL_LOOPS@
 CFLAGS_VECTORIZE = @CFLAGS_VECTORIZE@
+CFLAGS_POPCNT = @CFLAGS_POPCNT@
 CFLAGS_CRC = @CFLAGS_CRC@
+CFLAGS_XSAVE = @CFLAGS_XSAVE@
 PERMIT_DECLARATION_AFTER_STATEMENT = @PERMIT_DECLARATION_AFTER_STATEMENT@
 CXXFLAGS = @CXXFLAGS@
 
@@ -758,6 +760,9 @@ LIBOBJS = @LIBOBJS@
 # files needed for the chosen CRC-32C implementation
 PG_CRC32C_OBJS = @PG_CRC32C_OBJS@
 
+# files needed for the chosen popcount implementation
+PG_POPCNT_OBJS = @PG_POPCNT_OBJS@
+
 LIBS := -lpgcommon -lpgport $(LIBS)
 
 # to make ws2_32.lib the last library
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 591e1ca3df..de067e6182 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -513,6 +513,9 @@
 /* Define to 1 if the assembler supports X86_64's POPCNTQ instruction. */
 #undef HAVE_X86_64_POPCNTQ
 
+/* Define to 1 if you have XSAVE intrinsics. */
+#undef HAVE_XSAVE_INTRINSICS
+
 /* Define to 1 if the system has the type `_Bool'. */
 #undef HAVE__BOOL
 
@@ -555,9 +558,15 @@
 /* Define to 1 if you have __cpuid. */
 #undef HAVE__CPUID
 
+/* Define to 1 if you have __cpuidex. */
+#undef HAVE__CPUIDEX
+
 /* Define to 1 if you have __get_cpuid. */
 #undef HAVE__GET_CPUID
 
+/* Define to 1 if you have __get_cpuid_count. */
+#undef HAVE__GET_CPUID_COUNT
+
 /* Define to 1 if your compiler understands _Static_assert. */
 #undef HAVE__STATIC_ASSERT
 
@@ -680,6 +689,9 @@
 /* Define to 1 to build with assertion checks. (--enable-cassert) */
 #undef USE_ASSERT_CHECKING
 
+/* Define to 1 to use AVX512 popcount instructions with a runtime check. */
+#undef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
+
 /* Define to 1 to build with Bonjour support. (--with-bonjour) */
 #undef USE_BONJOUR
 
diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index 53e5239717..1a92c56bcd 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -304,6 +304,21 @@ extern PGDLLIMPORT int (*pg_popcount32) (uint32 word);
 extern PGDLLIMPORT int (*pg_popcount64) (uint64 word);
 extern PGDLLIMPORT uint64 (*pg_popcount) (const char *buf, int bytes);
 
+/* Export pg_popcount_fast() for use in the AVX512 implementation. */
+extern uint64 pg_popcount_fast(const char *buf, int bytes);
+
+/*
+ * We can also try to use the AVX512 popcount instruction on some systems.
+ * The implementation of that is located in its own file because it may
+ * require special compiler flags that we don't want to apply to any other
+ * files.  Note that we only build this when TRY_POPCNT_FAST is set so that we
+ * can fall back to pg_popcount_fast() as needed.
+ */
+#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
+extern bool pg_popcount_avx512_available(void);
+extern uint64 pg_popcount_avx512(const char *buf, int bytes);
+#endif
+
 #else
 /* Use a portable implementation -- no need for a function pointer. */
 extern int	pg_popcount32(uint32 word);
diff --git a/src/makefiles/meson.build b/src/makefiles/meson.build
index b0f4178b3d..5618050b30 100644
--- a/src/makefiles/meson.build
+++ b/src/makefiles/meson.build
@@ -100,8 +100,10 @@ pgxs_kv = {
     ' '.join(cflags_no_decl_after_statement),
 
   'CFLAGS_CRC': ' '.join(cflags_crc),
+  'CFLAGS_POPCNT': ' '.join(cflags_popcnt),
   'CFLAGS_UNROLL_LOOPS': ' '.join(unroll_loops_cflags),
   'CFLAGS_VECTORIZE': ' '.join(vectorize_cflags),
+  'CFLAGS_XSAVE': ' '.join(cflags_xsave),
 
   'LDFLAGS': var_ldflags,
   'LDFLAGS_EX': var_ldflags_ex,
@@ -177,7 +179,7 @@ pgxs_empty = [
   'WANTED_LANGUAGES',
 
   # Not needed because we don't build the server / PLs with the generated makefile
-  'LIBOBJS', 'PG_CRC32C_OBJS', 'TAS',
+  'LIBOBJS', 'PG_CRC32C_OBJS', 'PG_POPCNT_OBJS', 'TAS',
   'DTRACEFLAGS', # only server has dtrace probes
 
   'perl_archlibexp', 'perl_embed_ccflags', 'perl_embed_ldflags', 'perl_includespec', 'perl_privlibexp',
diff --git a/src/port/Makefile b/src/port/Makefile
index dcc8737e68..db7c02117b 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -38,6 +38,7 @@ LIBS += $(PTHREAD_LIBS)
 OBJS = \
 	$(LIBOBJS) \
 	$(PG_CRC32C_OBJS) \
+	$(PG_POPCNT_OBJS) \
 	bsearch_arg.o \
 	chklocale.o \
 	inet_net_ntop.o \
@@ -92,6 +93,16 @@ pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC)
 pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC)
 pg_crc32c_armv8_srv.o: CFLAGS+=$(CFLAGS_CRC)
 
+# all versions of pg_popcount_avx512_choose.o need CFLAGS_XSAVE
+pg_popcount_avx512_choose.o: CFLAGS+=$(CFLAGS_XSAVE)
+pg_popcount_avx512_choose_shlib.o: CFLAGS+=$(CFLAGS_XSAVE)
+pg_popcount_avx512_choose_srv.o: CFLAGS+=$(CFLAGS_XSAVE)
+
+# all versions of pg_popcount_avx512.o need CFLAGS_POPCNT
+pg_popcount_avx512.o: CFLAGS+=$(CFLAGS_POPCNT)
+pg_popcount_avx512_shlib.o: CFLAGS+=$(CFLAGS_POPCNT)
+pg_popcount_avx512_srv.o: CFLAGS+=$(CFLAGS_POPCNT)
+
 #
 # Shared library versions of object files
 #
diff --git a/src/port/meson.build b/src/port/meson.build
index 92b593e6ef..fd9ee199d1 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -84,6 +84,8 @@ replace_funcs_pos = [
   ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 'crc'],
   ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
   ['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
+  ['pg_popcount_avx512', 'USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 'popcnt'],
+  ['pg_popcount_avx512_choose', 'USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 'xsave'],
 
   # arm / aarch64
   ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C'],
@@ -98,8 +100,8 @@ replace_funcs_pos = [
   ['pg_crc32c_sb8', 'USE_SLICING_BY_8_CRC32C'],
 ]
 
-pgport_cflags = {'crc': cflags_crc}
-pgport_sources_cflags = {'crc': []}
+pgport_cflags = {'crc': cflags_crc, 'popcnt': cflags_popcnt, 'xsave': cflags_xsave}
+pgport_sources_cflags = {'crc': [], 'popcnt': [], 'xsave': []}
 
 foreach f : replace_funcs_neg
   func = f.get(0)
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index 28312f3dd9..177509518f 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -114,7 +114,6 @@ static int	pg_popcount64_choose(uint64 word);
 static uint64 pg_popcount_choose(const char *buf, int bytes);
 static inline int pg_popcount32_fast(uint32 word);
 static inline int pg_popcount64_fast(uint64 word);
-static uint64 pg_popcount_fast(const char *buf, int bytes);
 
 int			(*pg_popcount32) (uint32 word) = pg_popcount32_choose;
 int			(*pg_popcount64) (uint64 word) = pg_popcount64_choose;
@@ -156,6 +155,10 @@ choose_popcount_functions(void)
 		pg_popcount32 = pg_popcount32_fast;
 		pg_popcount64 = pg_popcount64_fast;
 		pg_popcount = pg_popcount_fast;
+#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
+		if (pg_popcount_avx512_available())
+			pg_popcount = pg_popcount_avx512;
+#endif
 	}
 	else
 	{
@@ -224,7 +227,7 @@ __asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc");
  * pg_popcount_fast
  *		Returns the number of 1-bits in buf
  */
-static uint64
+uint64
 pg_popcount_fast(const char *buf, int bytes)
 {
 	uint64		popcnt = 0;
diff --git a/src/port/pg_popcount_avx512.c b/src/port/pg_popcount_avx512.c
new file mode 100644
index 0000000000..f86558d1ee
--- /dev/null
+++ b/src/port/pg_popcount_avx512.c
@@ -0,0 +1,49 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_popcount_avx512.c
+ *	  Holds the pg_popcount() implementation that uses AVX512 instructions.
+ *
+ * Copyright (c) 2024, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/port/pg_popcount_avx512.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+
+#include <immintrin.h>
+
+#include "port/pg_bitutils.h"
+
+/*
+ * It's probably unlikely that TRY_POPCNT_FAST won't be set if we are able to
+ * use AVX512 intrinsics, but we check it anyway to be sure.  We rely on
+ * pg_popcount_fast() as our fallback implementation in pg_popcount_avx512().
+ */
+#ifdef TRY_POPCNT_FAST
+
+/*
+ * pg_popcount_avx512
+ *		Returns the number of 1-bits in buf
+ */
+uint64
+pg_popcount_avx512(const char *buf, int bytes)
+{
+	uint64		popcnt;
+	__m512i		accum = _mm512_setzero_si512();
+
+	for (; bytes >= sizeof(__m512i); bytes -= sizeof(__m512i))
+	{
+		const		__m512i val = _mm512_loadu_si512((const __m512i *) buf);
+		const		__m512i cnt = _mm512_popcnt_epi64(val);
+
+		accum = _mm512_add_epi64(accum, cnt);
+		buf += sizeof(__m512i);
+	}
+
+	popcnt = _mm512_reduce_add_epi64(accum);
+	return popcnt + pg_popcount_fast(buf, bytes);
+}
+
+#endif							/* TRY_POPCNT_FAST */
diff --git a/src/port/pg_popcount_avx512_choose.c b/src/port/pg_popcount_avx512_choose.c
new file mode 100644
index 0000000000..9e81cd33ad
--- /dev/null
+++ b/src/port/pg_popcount_avx512_choose.c
@@ -0,0 +1,71 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_popcount_avx512_choose.c
+ *    Test whether we can use AVX512 POPCNT instructions.
+ *
+ * Copyright (c) 2024, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *    src/port/pg_popcount_avx512_choose.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+
+#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
+#include <cpuid.h>
+#endif
+
+#ifdef HAVE_XSAVE_INTRINSICS
+#include <immintrin.h>
+#endif
+
+#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX)
+#include <intrin.h>
+#endif
+
+#include "port/pg_bitutils.h"
+
+/*
+ * It's probably unlikely that TRY_POPCNT_FAST won't be set if we are able to
+ * use AVX512 intrinsics, but we check it anyway to be sure.  We rely on
+ * pg_popcount_fast() as our fallback implementation in pg_popcount_avx512().
+ */
+#ifdef TRY_POPCNT_FAST
+
+/*
+ * Returns true if the CPU supports AVX512 POPCNT.
+ */
+bool
+pg_popcount_avx512_available(void)
+{
+	unsigned int exx[4] = {0, 0, 0, 0};
+
+	/* does CPUID say there's support for AVX512 POPCNT? */
+#if defined(HAVE__GET_CPUID_COUNT)
+	__get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUIDEX)
+	__cpuidex(exx, 7, 0);
+#endif
+	if ((exx[2] & (1 << 14)) == 0)	/* avx512vpopcntdq */
+		return false;
+
+	/* does CPUID say there's support for XGETBV? */
+	memset(exx, 0, sizeof(exx));
+#if defined(HAVE__GET_CPUID)
+	__get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUID)
+	__cpuid(exx, 1);
+#endif
+	if ((exx[2] & (1 << 26)) == 0)	/* xsave */
+		return false;
+
+	/* does XGETBV say the ZMM registers are enabled? */
+#ifdef HAVE_XSAVE_INTRINSICS
+	return (_xgetbv(0) & 0xe0) != 0;
+#else
+	return false;
+#endif
+}
+
+#endif							/* TRY_POPCNT_FAST */
diff --git a/src/test/regress/expected/bit.out b/src/test/regress/expected/bit.out
index e17cbf42ca..6a436288bb 100644
--- a/src/test/regress/expected/bit.out
+++ b/src/test/regress/expected/bit.out
@@ -740,6 +740,30 @@ SELECT bit_count(B'1111111111'::bit(10));
         10
 (1 row)
 
+SELECT bit_count(repeat('0', 100)::bit(100));
+ bit_count 
+-----------
+         0
+(1 row)
+
+SELECT bit_count(repeat('1', 100)::bit(100));
+ bit_count 
+-----------
+       100
+(1 row)
+
+SELECT bit_count(repeat('01', 500)::bit(1000));
+ bit_count 
+-----------
+       500
+(1 row)
+
+SELECT bit_count(repeat('10101', 200)::bit(1000));
+ bit_count 
+-----------
+       600
+(1 row)
+
 -- This table is intentionally left around to exercise pg_dump/pg_upgrade
 CREATE TABLE bit_defaults(
   b1 bit(4) DEFAULT '1001',
diff --git a/src/test/regress/sql/bit.sql b/src/test/regress/sql/bit.sql
index 34230b99fb..8ba6facd03 100644
--- a/src/test/regress/sql/bit.sql
+++ b/src/test/regress/sql/bit.sql
@@ -223,6 +223,10 @@ SELECT overlay(B'0101011100' placing '001' from 20);
 -- bit_count
 SELECT bit_count(B'0101011100'::bit(10));
 SELECT bit_count(B'1111111111'::bit(10));
+SELECT bit_count(repeat('0', 100)::bit(100));
+SELECT bit_count(repeat('1', 100)::bit(100));
+SELECT bit_count(repeat('01', 500)::bit(1000));
+SELECT bit_count(repeat('10101', 200)::bit(1000));
 
 -- This table is intentionally left around to exercise pg_dump/pg_upgrade
 CREATE TABLE bit_defaults(
-- 
2.25.1

>From 8ea529dd315723ca3e8ad4243853148da23f1202 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nat...@postgresql.org>
Date: Sun, 31 Mar 2024 22:22:15 -0500
Subject: [PATCH v19 4/4] optimize visibilitymap_count() with AVX512

---
 src/backend/access/heap/visibilitymap.c |  25 ++----
 src/include/port/pg_bitutils.h          |   6 +-
 src/port/pg_bitutils.c                  | 113 ++++++++++++++++++++++++
 src/port/pg_popcount_avx512.c           |  21 +++++
 4 files changed, 144 insertions(+), 21 deletions(-)

diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index 1ab6c865e3..8b24e7bc33 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -119,10 +119,8 @@
 #define HEAPBLK_TO_OFFSET(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)
 
 /* Masks for counting subsets of bits in the visibility map. */
-#define VISIBLE_MASK64	UINT64CONST(0x5555555555555555) /* The lower bit of each
-														 * bit pair */
-#define FROZEN_MASK64	UINT64CONST(0xaaaaaaaaaaaaaaaa) /* The upper bit of each
-														 * bit pair */
+#define VISIBLE_MASK8	(0x55)	/* The lower bit of each bit pair */
+#define FROZEN_MASK8	(0xaa)	/* The upper bit of each bit pair */
 
 /* prototypes for internal routines */
 static Buffer vm_readbuf(Relation rel, BlockNumber blkno, bool extend);
@@ -396,7 +394,6 @@ visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_fro
 	{
 		Buffer		mapBuffer;
 		uint64	   *map;
-		int			i;
 
 		/*
 		 * Read till we fall off the end of the map.  We assume that any extra
@@ -414,21 +411,9 @@ visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_fro
 		 */
 		map = (uint64 *) PageGetContents(BufferGetPage(mapBuffer));
 
-		StaticAssertStmt(MAPSIZE % sizeof(uint64) == 0,
-						 "unsupported MAPSIZE");
-		if (all_frozen == NULL)
-		{
-			for (i = 0; i < MAPSIZE / sizeof(uint64); i++)
-				nvisible += pg_popcount64(map[i] & VISIBLE_MASK64);
-		}
-		else
-		{
-			for (i = 0; i < MAPSIZE / sizeof(uint64); i++)
-			{
-				nvisible += pg_popcount64(map[i] & VISIBLE_MASK64);
-				nfrozen += pg_popcount64(map[i] & FROZEN_MASK64);
-			}
-		}
+		nvisible += pg_popcount_masked((const char *) map, MAPSIZE, VISIBLE_MASK8);
+		if (all_frozen)
+			nfrozen += pg_popcount_masked((const char *) map, MAPSIZE, FROZEN_MASK8);
 
 		ReleaseBuffer(mapBuffer);
 	}
diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index 1a92c56bcd..16145c746e 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -303,9 +303,11 @@ pg_ceil_log2_64(uint64 num)
 extern PGDLLIMPORT int (*pg_popcount32) (uint32 word);
 extern PGDLLIMPORT int (*pg_popcount64) (uint64 word);
 extern PGDLLIMPORT uint64 (*pg_popcount) (const char *buf, int bytes);
+extern PGDLLIMPORT uint64 (*pg_popcount_masked) (const char *buf, int bytes, bits8 mask);
 
-/* Export pg_popcount_fast() for use in the AVX512 implementation. */
+/* Exported for use in the AVX512 implementation. */
 extern uint64 pg_popcount_fast(const char *buf, int bytes);
+extern uint64 pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask);
 
 /*
  * We can also try to use the AVX512 popcount instruction on some systems.
@@ -317,6 +319,7 @@ extern uint64 pg_popcount_fast(const char *buf, int bytes);
 #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
 extern bool pg_popcount_avx512_available(void);
 extern uint64 pg_popcount_avx512(const char *buf, int bytes);
+extern uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask);
 #endif
 
 #else
@@ -324,6 +327,7 @@ extern uint64 pg_popcount_avx512(const char *buf, int bytes);
 extern int	pg_popcount32(uint32 word);
 extern int	pg_popcount64(uint64 word);
 extern uint64 pg_popcount(const char *buf, int bytes);
+extern uint64 pg_popcount_masked(const char *buf, int bytes);
 
 #endif							/* TRY_POPCNT_FAST */
 
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index 177509518f..902ecdebbf 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -106,18 +106,21 @@ const uint8 pg_number_of_ones[256] = {
 static inline int pg_popcount32_slow(uint32 word);
 static inline int pg_popcount64_slow(uint64 word);
 static uint64 pg_popcount_slow(const char *buf, int bytes);
+static uint64 pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask);
 
 #ifdef TRY_POPCNT_FAST
 static bool pg_popcount_available(void);
 static int	pg_popcount32_choose(uint32 word);
 static int	pg_popcount64_choose(uint64 word);
 static uint64 pg_popcount_choose(const char *buf, int bytes);
+static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask);
 static inline int pg_popcount32_fast(uint32 word);
 static inline int pg_popcount64_fast(uint64 word);
 
 int			(*pg_popcount32) (uint32 word) = pg_popcount32_choose;
 int			(*pg_popcount64) (uint64 word) = pg_popcount64_choose;
 uint64		(*pg_popcount) (const char *buf, int bytes) = pg_popcount_choose;
+uint64		(*pg_popcount_masked) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose;
 #endif							/* TRY_POPCNT_FAST */
 
 #ifdef TRY_POPCNT_FAST
@@ -155,9 +158,13 @@ choose_popcount_functions(void)
 		pg_popcount32 = pg_popcount32_fast;
 		pg_popcount64 = pg_popcount64_fast;
 		pg_popcount = pg_popcount_fast;
+		pg_popcount_masked = pg_popcount_masked_fast;
 #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
 		if (pg_popcount_avx512_available())
+		{
 			pg_popcount = pg_popcount_avx512;
+			pg_popcount_masked = pg_popcount_masked_avx512;
+		}
 #endif
 	}
 	else
@@ -165,6 +172,7 @@ choose_popcount_functions(void)
 		pg_popcount32 = pg_popcount32_slow;
 		pg_popcount64 = pg_popcount64_slow;
 		pg_popcount = pg_popcount_slow;
+		pg_popcount_masked = pg_popcount_masked_slow;
 	}
 }
 
@@ -189,6 +197,13 @@ pg_popcount_choose(const char *buf, int bytes)
 	return pg_popcount(buf, bytes);
 }
 
+static uint64
+pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask)
+{
+	choose_popcount_functions();
+	return pg_popcount_masked(buf, bytes, mask);
+}
+
 /*
  * pg_popcount32_fast
  *		Return the number of 1 bits set in word
@@ -269,6 +284,52 @@ pg_popcount_fast(const char *buf, int bytes)
 	return popcnt;
 }
 
+uint64
+pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask)
+{
+	uint64		popcnt = 0;
+
+#if SIZEOF_VOID_P >= 8
+	/* Process in 64-bit chunks if the buffer is aligned */
+	uint64		maskv = ~UINT64CONST(0) / 0xFF * mask;
+
+	if (buf == (const char *) TYPEALIGN(8, buf))
+	{
+		const uint64 *words = (const uint64 *) buf;
+
+		while (bytes >= 8)
+		{
+			popcnt += pg_popcount64_fast(*words++ & maskv);
+			bytes -= 8;
+		}
+
+		buf = (const char *) words;
+	}
+#else
+	/* Process in 32-bit chunks if the buffer is aligned. */
+	uint32		maskv = ~0 / 0xFF * mask;
+
+	if (buf == (const char *) TYPEALIGN(4, buf))
+	{
+		const uint32 *words = (const uint32 *) buf;
+
+		while (bytes >= 4)
+		{
+			popcnt += pg_popcount32_fast(*words++ & maskv);
+			bytes -= 4;
+		}
+
+		buf = (const char *) words;
+	}
+#endif
+
+	/* Process any remaining bytes */
+	while (bytes--)
+		popcnt += pg_number_of_ones[(unsigned char) *buf++ & maskv];
+
+	return popcnt;
+}
+
 #endif							/* TRY_POPCNT_FAST */
 
 
@@ -368,6 +429,52 @@ pg_popcount_slow(const char *buf, int bytes)
 	return popcnt;
 }
 
+static uint64
+pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask)
+{
+	uint64		popcnt = 0;
+
+#if SIZEOF_VOID_P >= 8
+	/* Process in 64-bit chunks if the buffer is aligned */
+	uint64		maskv = ~UINT64CONST(0) / 0xFF * mask;
+
+	if (buf == (const char *) TYPEALIGN(8, buf))
+	{
+		const uint64 *words = (const uint64 *) buf;
+
+		while (bytes >= 8)
+		{
+			popcnt += pg_popcount64_slow(*words++ & maskv);
+			bytes -= 8;
+		}
+
+		buf = (const char *) words;
+	}
+#else
+	/* Process in 32-bit chunks if the buffer is aligned. */
+	uint32		maskv = ~0 / 0xFF * mask;
+
+	if (buf == (const char *) TYPEALIGN(4, buf))
+	{
+		const uint32 *words = (const uint32 *) buf;
+
+		while (bytes >= 4)
+		{
+			popcnt += pg_popcount32_slow(*words++ & maskv);
+			bytes -= 4;
+		}
+
+		buf = (const char *) words;
+	}
+#endif
+
+	/* Process any remaining bytes */
+	while (bytes--)
+		popcnt += pg_number_of_ones[(unsigned char) *buf++ & maskv];
+
+	return popcnt;
+}
+
 #ifndef TRY_POPCNT_FAST
 
 /*
@@ -399,4 +506,10 @@ pg_popcount(const char *buf, int bytes)
 	return pg_popcount_slow(buf, bytes);
 }
 
+uint64
+pg_popcount_masked(const char *buf, int bytes, bits8 mask)
+{
+	return pg_popcount_masked_slow(buf, bytes, mask);
+}
+
 #endif							/* !TRY_POPCNT_FAST */
diff --git a/src/port/pg_popcount_avx512.c b/src/port/pg_popcount_avx512.c
index f86558d1ee..8965a8d530 100644
--- a/src/port/pg_popcount_avx512.c
+++ b/src/port/pg_popcount_avx512.c
@@ -46,4 +46,25 @@ pg_popcount_avx512(const char *buf, int bytes)
 	return popcnt + pg_popcount_fast(buf, bytes);
 }
 
+uint64
+pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask)
+{
+	uint64		popcnt;
+	__m512i		accum = _mm512_setzero_si512();
+	const		__m512i maskv = _mm512_set1_epi8(mask);
+
+	for (; bytes >= sizeof(__m512i); bytes -= sizeof(__m512i))
+	{
+		const		__m512i val = _mm512_loadu_si512((const __m512i *) buf);
+		const		__m512i vmasked = _mm512_and_si512(val, maskv);
+		const		__m512i cnt = _mm512_popcnt_epi64(vmasked);
+
+		accum = _mm512_add_epi64(accum, cnt);
+		buf += sizeof(__m512i);
+	}
+
+	popcnt = _mm512_reduce_add_epi64(accum);
+	return popcnt + pg_popcount_masked_fast(buf, bytes, mask);
+}
+
 #endif							/* TRY_POPCNT_FAST */
-- 
2.25.1

Reply via email to