I committed v23-0001. Here is a rebased version of the remaining patches. I intend to test the masking idea from Ants next.
-- Nathan Bossart Amazon Web Services: https://aws.amazon.com
>From 295b03530de5f42fe876b4489191da2f8dc83194 Mon Sep 17 00:00:00 2001 From: Nathan Bossart <nat...@postgresql.org> Date: Wed, 27 Mar 2024 16:39:24 -0500 Subject: [PATCH v24 1/2] AVX512 popcount support --- config/c-compiler.m4 | 58 ++++++ configure | 252 +++++++++++++++++++++++++++ configure.ac | 51 ++++++ meson.build | 87 +++++++++ src/Makefile.global.in | 5 + src/include/pg_config.h.in | 12 ++ src/include/port/pg_bitutils.h | 15 ++ src/makefiles/meson.build | 4 +- src/port/Makefile | 11 ++ src/port/meson.build | 6 +- src/port/pg_bitutils.c | 29 ++- src/port/pg_popcount_avx512.c | 49 ++++++ src/port/pg_popcount_avx512_choose.c | 71 ++++++++ src/test/regress/expected/bit.out | 24 +++ src/test/regress/sql/bit.sql | 4 + 15 files changed, 673 insertions(+), 5 deletions(-) create mode 100644 src/port/pg_popcount_avx512.c create mode 100644 src/port/pg_popcount_avx512_choose.c diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 3268a780bb..5fb60775ca 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -694,3 +694,61 @@ if test x"$Ac_cachevar" = x"yes"; then fi undefine([Ac_cachevar])dnl ])# PGAC_LOONGARCH_CRC32C_INTRINSICS + +# PGAC_XSAVE_INTRINSICS +# --------------------- +# Check if the compiler supports the XSAVE instructions using the _xgetbv +# intrinsic function. +# +# An optional compiler flag can be passed as argument (e.g., -mxsave). If the +# intrinsic is supported, sets pgac_xsave_intrinsics and CFLAGS_XSAVE. +AC_DEFUN([PGAC_XSAVE_INTRINSICS], +[define([Ac_cachevar], [AS_TR_SH([pgac_cv_xsave_intrinsics_$1])])dnl +AC_CACHE_CHECK([for _xgetbv with CFLAGS=$1], [Ac_cachevar], +[pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS $1" +AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>], + [return _xgetbv(0) & 0xe0;])], + [Ac_cachevar=yes], + [Ac_cachevar=no]) +CFLAGS="$pgac_save_CFLAGS"]) +if test x"$Ac_cachevar" = x"yes"; then + CFLAGS_XSAVE="$1" + pgac_xsave_intrinsics=yes +fi +undefine([Ac_cachevar])dnl +])# PGAC_XSAVE_INTRINSICS + +# PGAC_AVX512_POPCNT_INTRINSICS +# ----------------------------- +# Check if the compiler supports the AVX512 POPCNT instructions using the +# _mm512_setzero_si512, _mm512_loadu_si512, _mm512_popcnt_epi64, +# _mm512_add_epi64, and _mm512_reduce_add_epi64 intrinsic functions. +# +# An optional compiler flag can be passed as argument +# (e.g., -mavx512vpopcntdq). If the intrinsics are supported, sets +# pgac_avx512_popcnt_intrinsics and CFLAGS_POPCNT. +AC_DEFUN([PGAC_AVX512_POPCNT_INTRINSICS], +[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx512_popcnt_intrinsics_$1])])dnl +AC_CACHE_CHECK([for _mm512_popcnt_epi64 with CFLAGS=$1], [Ac_cachevar], +[pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS $1" +AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>], + [const char buf@<:@sizeof(__m512i)@:>@; + PG_INT64_TYPE popcnt = 0; + __m512i accum = _mm512_setzero_si512(); + const __m512i val = _mm512_loadu_si512((const __m512i *) buf); + const __m512i cnt = _mm512_popcnt_epi64(val); + accum = _mm512_add_epi64(accum, cnt); + popcnt = _mm512_reduce_add_epi64(accum); + /* return computed value, to prevent the above being optimized away */ + return popcnt == 0;])], + [Ac_cachevar=yes], + [Ac_cachevar=no]) +CFLAGS="$pgac_save_CFLAGS"]) +if test x"$Ac_cachevar" = x"yes"; then + CFLAGS_POPCNT="$1" + pgac_avx512_popcnt_intrinsics=yes +fi +undefine([Ac_cachevar])dnl +])# PGAC_AVX512_POPCNT_INTRINSICS diff --git a/configure b/configure index 36feeafbb2..b48ed7f271 100755 --- a/configure +++ b/configure @@ -647,6 +647,9 @@ MSGFMT_FLAGS MSGFMT PG_CRC32C_OBJS CFLAGS_CRC +PG_POPCNT_OBJS +CFLAGS_POPCNT +CFLAGS_XSAVE LIBOBJS OPENSSL ZSTD @@ -17404,6 +17407,40 @@ $as_echo "#define HAVE__GET_CPUID 1" >>confdefs.h fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __get_cpuid_count" >&5 +$as_echo_n "checking for __get_cpuid_count... " >&6; } +if ${pgac_cv__get_cpuid_count+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include <cpuid.h> +int +main () +{ +unsigned int exx[4] = {0, 0, 0, 0}; + __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv__get_cpuid_count="yes" +else + pgac_cv__get_cpuid_count="no" +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__get_cpuid_count" >&5 +$as_echo "$pgac_cv__get_cpuid_count" >&6; } +if test x"$pgac_cv__get_cpuid_count" = x"yes"; then + +$as_echo "#define HAVE__GET_CPUID_COUNT 1" >>confdefs.h + +fi + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __cpuid" >&5 $as_echo_n "checking for __cpuid... " >&6; } if ${pgac_cv__cpuid+:} false; then : @@ -17438,6 +17475,221 @@ $as_echo "#define HAVE__CPUID 1" >>confdefs.h fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __cpuidex" >&5 +$as_echo_n "checking for __cpuidex... " >&6; } +if ${pgac_cv__cpuidex+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include <intrin.h> +int +main () +{ +unsigned int exx[4] = {0, 0, 0, 0}; + __get_cpuidex(exx[0], 7, 0); + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv__cpuidex="yes" +else + pgac_cv__cpuidex="no" +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__cpuidex" >&5 +$as_echo "$pgac_cv__cpuidex" >&6; } +if test x"$pgac_cv__cpuidex" = x"yes"; then + +$as_echo "#define HAVE__CPUIDEX 1" >>confdefs.h + +fi + +# Check for XSAVE intrinsics +# +CFLAGS_XSAVE="" +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _xgetbv with CFLAGS=" >&5 +$as_echo_n "checking for _xgetbv with CFLAGS=... " >&6; } +if ${pgac_cv_xsave_intrinsics_+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS " +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include <immintrin.h> +int +main () +{ +return _xgetbv(0) & 0xe0; + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_xsave_intrinsics_=yes +else + pgac_cv_xsave_intrinsics_=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +CFLAGS="$pgac_save_CFLAGS" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_xsave_intrinsics_" >&5 +$as_echo "$pgac_cv_xsave_intrinsics_" >&6; } +if test x"$pgac_cv_xsave_intrinsics_" = x"yes"; then + CFLAGS_XSAVE="" + pgac_xsave_intrinsics=yes +fi + +if test x"$pgac_xsave_intrinsics" != x"yes"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _xgetbv with CFLAGS=-mxsave" >&5 +$as_echo_n "checking for _xgetbv with CFLAGS=-mxsave... " >&6; } +if ${pgac_cv_xsave_intrinsics__mxsave+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS -mxsave" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include <immintrin.h> +int +main () +{ +return _xgetbv(0) & 0xe0; + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_xsave_intrinsics__mxsave=yes +else + pgac_cv_xsave_intrinsics__mxsave=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +CFLAGS="$pgac_save_CFLAGS" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_xsave_intrinsics__mxsave" >&5 +$as_echo "$pgac_cv_xsave_intrinsics__mxsave" >&6; } +if test x"$pgac_cv_xsave_intrinsics__mxsave" = x"yes"; then + CFLAGS_XSAVE="-mxsave" + pgac_xsave_intrinsics=yes +fi + +fi +if test x"$pgac_xsave_intrinsics" = x"yes"; then + +$as_echo "#define HAVE_XSAVE_INTRINSICS 1" >>confdefs.h + +fi + + +# Check for AVX512 popcount intrinsics +# +CFLAGS_POPCNT="" +PG_POPCNT_OBJS="" +if test x"$host_cpu" = x"x86_64"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_popcnt_epi64 with CFLAGS=" >&5 +$as_echo_n "checking for _mm512_popcnt_epi64 with CFLAGS=... " >&6; } +if ${pgac_cv_avx512_popcnt_intrinsics_+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS " +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include <immintrin.h> +int +main () +{ +const char buf[sizeof(__m512i)]; + PG_INT64_TYPE popcnt = 0; + __m512i accum = _mm512_setzero_si512(); + const __m512i val = _mm512_loadu_si512((const __m512i *) buf); + const __m512i cnt = _mm512_popcnt_epi64(val); + accum = _mm512_add_epi64(accum, cnt); + popcnt = _mm512_reduce_add_epi64(accum); + /* return computed value, to prevent the above being optimized away */ + return popcnt == 0; + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_avx512_popcnt_intrinsics_=yes +else + pgac_cv_avx512_popcnt_intrinsics_=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +CFLAGS="$pgac_save_CFLAGS" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_popcnt_intrinsics_" >&5 +$as_echo "$pgac_cv_avx512_popcnt_intrinsics_" >&6; } +if test x"$pgac_cv_avx512_popcnt_intrinsics_" = x"yes"; then + CFLAGS_POPCNT="" + pgac_avx512_popcnt_intrinsics=yes +fi + + if test x"$pgac_avx512_popcnt_intrinsics" != x"yes"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_popcnt_epi64 with CFLAGS=-mavx512vpopcntdq" >&5 +$as_echo_n "checking for _mm512_popcnt_epi64 with CFLAGS=-mavx512vpopcntdq... " >&6; } +if ${pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS -mavx512vpopcntdq" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include <immintrin.h> +int +main () +{ +const char buf[sizeof(__m512i)]; + PG_INT64_TYPE popcnt = 0; + __m512i accum = _mm512_setzero_si512(); + const __m512i val = _mm512_loadu_si512((const __m512i *) buf); + const __m512i cnt = _mm512_popcnt_epi64(val); + accum = _mm512_add_epi64(accum, cnt); + popcnt = _mm512_reduce_add_epi64(accum); + /* return computed value, to prevent the above being optimized away */ + return popcnt == 0; + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq=yes +else + pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +CFLAGS="$pgac_save_CFLAGS" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq" >&5 +$as_echo "$pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq" >&6; } +if test x"$pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq" = x"yes"; then + CFLAGS_POPCNT="-mavx512vpopcntdq" + pgac_avx512_popcnt_intrinsics=yes +fi + + fi + if test x"$pgac_avx512_popcnt_intrinsics" = x"yes"; then + PG_POPCNT_OBJS="pg_popcount_avx512.o pg_popcount_avx512_choose.o" + +$as_echo "#define USE_AVX512_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h + + fi +fi + + + # Check for Intel SSE 4.2 intrinsics to do CRC calculations. # # First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used diff --git a/configure.ac b/configure.ac index 57f734879e..2bbd81dfb8 100644 --- a/configure.ac +++ b/configure.ac @@ -2052,6 +2052,17 @@ if test x"$pgac_cv__get_cpuid" = x"yes"; then AC_DEFINE(HAVE__GET_CPUID, 1, [Define to 1 if you have __get_cpuid.]) fi +AC_CACHE_CHECK([for __get_cpuid_count], [pgac_cv__get_cpuid_count], +[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <cpuid.h>], + [[unsigned int exx[4] = {0, 0, 0, 0}; + __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); + ]])], + [pgac_cv__get_cpuid_count="yes"], + [pgac_cv__get_cpuid_count="no"])]) +if test x"$pgac_cv__get_cpuid_count" = x"yes"; then + AC_DEFINE(HAVE__GET_CPUID_COUNT, 1, [Define to 1 if you have __get_cpuid_count.]) +fi + AC_CACHE_CHECK([for __cpuid], [pgac_cv__cpuid], [AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <intrin.h>], [[unsigned int exx[4] = {0, 0, 0, 0}; @@ -2063,6 +2074,46 @@ if test x"$pgac_cv__cpuid" = x"yes"; then AC_DEFINE(HAVE__CPUID, 1, [Define to 1 if you have __cpuid.]) fi +AC_CACHE_CHECK([for __cpuidex], [pgac_cv__cpuidex], +[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <intrin.h>], + [[unsigned int exx[4] = {0, 0, 0, 0}; + __get_cpuidex(exx[0], 7, 0); + ]])], + [pgac_cv__cpuidex="yes"], + [pgac_cv__cpuidex="no"])]) +if test x"$pgac_cv__cpuidex" = x"yes"; then + AC_DEFINE(HAVE__CPUIDEX, 1, [Define to 1 if you have __cpuidex.]) +fi + +# Check for XSAVE intrinsics +# +CFLAGS_XSAVE="" +PGAC_XSAVE_INTRINSICS([]) +if test x"$pgac_xsave_intrinsics" != x"yes"; then + PGAC_XSAVE_INTRINSICS([-mxsave]) +fi +if test x"$pgac_xsave_intrinsics" = x"yes"; then + AC_DEFINE(HAVE_XSAVE_INTRINSICS, 1, [Define to 1 if you have XSAVE intrinsics.]) +fi +AC_SUBST(CFLAGS_XSAVE) + +# Check for AVX512 popcount intrinsics +# +CFLAGS_POPCNT="" +PG_POPCNT_OBJS="" +if test x"$host_cpu" = x"x86_64"; then + PGAC_AVX512_POPCNT_INTRINSICS([]) + if test x"$pgac_avx512_popcnt_intrinsics" != x"yes"; then + PGAC_AVX512_POPCNT_INTRINSICS([-mavx512vpopcntdq]) + fi + if test x"$pgac_avx512_popcnt_intrinsics" = x"yes"; then + PG_POPCNT_OBJS="pg_popcount_avx512.o pg_popcount_avx512_choose.o" + AC_DEFINE(USE_AVX512_POPCNT_WITH_RUNTIME_CHECK, 1, [Define to 1 to use AVX512 popcount instructions with a runtime check.]) + fi +fi +AC_SUBST(CFLAGS_POPCNT) +AC_SUBST(PG_POPCNT_OBJS) + # Check for Intel SSE 4.2 intrinsics to do CRC calculations. # # First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used diff --git a/meson.build b/meson.build index 18b5be842e..96be29c22b 100644 --- a/meson.build +++ b/meson.build @@ -1783,6 +1783,30 @@ elif cc.links(''' endif +# Check for __get_cpuid_count() and __cpuidex() in a similar fashion. +if cc.links(''' + #include <cpuid.h> + int main(int arg, char **argv) + { + unsigned int exx[4] = {0, 0, 0, 0}; + __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); + } + ''', name: '__get_cpuid_count', + args: test_c_args) + cdata.set('HAVE__GET_CPUID_COUNT', 1) +elif cc.links(''' + #include <intrin.h> + int main(int arg, char **argv) + { + unsigned int exx[4] = {0, 0, 0, 0}; + __cpuidex(exx, 7, 0); + } + ''', name: '__cpuidex', + args: test_c_args) + cdata.set('HAVE__CPUIDEX', 1) +endif + + # Defend against clang being used on x86-32 without SSE2 enabled. As current # versions of clang do not understand -fexcess-precision=standard, the use of # x87 floating point operations leads to problems like isinf possibly returning @@ -1996,6 +2020,69 @@ int main(void) endif +############################################################### +# Check for the availability of XSAVE intrinsics. +############################################################### + +cflags_xsave = [] +if host_cpu == 'x86' or host_cpu == 'x86_64' + + prog = ''' +#include <immintrin.h> + +int main(void) +{ + return _xgetbv(0) & 0xe0; +} +''' + + if cc.links(prog, name: 'XSAVE intrinsics without -mxsave', + args: test_c_args) + cdata.set('HAVE_XSAVE_INTRINSICS', 1) + elif cc.links(prog, name: 'XSAVE intrinsics with -mxsave', + args: test_c_args + ['-mxsave']) + cdata.set('HAVE_XSAVE_INTRINSICS', 1) + cflags_xsave += '-mxsave' + endif + +endif + + +############################################################### +# Check for the availability of AVX512 popcount intrinsics. +############################################################### + +cflags_popcnt = [] +if host_cpu == 'x86_64' + + prog = ''' +#include <immintrin.h> + +int main(void) +{ + const char buf[sizeof(__m512i)]; + INT64 popcnt = 0; + __m512i accum = _mm512_setzero_si512(); + const __m512i val = _mm512_loadu_si512((const __m512i *) buf); + const __m512i cnt = _mm512_popcnt_epi64(val); + accum = _mm512_add_epi64(accum, cnt); + popcnt = _mm512_reduce_add_epi64(accum); + /* return computed value, to prevent the above being optimized away */ + return popcnt == 0; +} +''' + + if cc.links(prog, name: 'AVX512 popcount without -mavx512vpopcntdq', + args: test_c_args + ['-DINT64=@0@'.format(cdata.get('PG_INT64_TYPE'))]) + cdata.set('USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 1) + elif cc.links(prog, name: 'AVX512 popcount with -mavx512vpopcntdq', + args: test_c_args + ['-DINT64=@0@'.format(cdata.get('PG_INT64_TYPE'))] + ['-mavx512vpopcntdq']) + cdata.set('USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 1) + cflags_popcnt += '-mavx512vpopcntdq' + endif + +endif + ############################################################### # Select CRC-32C implementation. diff --git a/src/Makefile.global.in b/src/Makefile.global.in index 8b3f8c24e0..36d880d225 100644 --- a/src/Makefile.global.in +++ b/src/Makefile.global.in @@ -262,7 +262,9 @@ CFLAGS_SL_MODULE = @CFLAGS_SL_MODULE@ CXXFLAGS_SL_MODULE = @CXXFLAGS_SL_MODULE@ CFLAGS_UNROLL_LOOPS = @CFLAGS_UNROLL_LOOPS@ CFLAGS_VECTORIZE = @CFLAGS_VECTORIZE@ +CFLAGS_POPCNT = @CFLAGS_POPCNT@ CFLAGS_CRC = @CFLAGS_CRC@ +CFLAGS_XSAVE = @CFLAGS_XSAVE@ PERMIT_DECLARATION_AFTER_STATEMENT = @PERMIT_DECLARATION_AFTER_STATEMENT@ CXXFLAGS = @CXXFLAGS@ @@ -758,6 +760,9 @@ LIBOBJS = @LIBOBJS@ # files needed for the chosen CRC-32C implementation PG_CRC32C_OBJS = @PG_CRC32C_OBJS@ +# files needed for the chosen popcount implementation +PG_POPCNT_OBJS = @PG_POPCNT_OBJS@ + LIBS := -lpgcommon -lpgport $(LIBS) # to make ws2_32.lib the last library diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 591e1ca3df..de067e6182 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -513,6 +513,9 @@ /* Define to 1 if the assembler supports X86_64's POPCNTQ instruction. */ #undef HAVE_X86_64_POPCNTQ +/* Define to 1 if you have XSAVE intrinsics. */ +#undef HAVE_XSAVE_INTRINSICS + /* Define to 1 if the system has the type `_Bool'. */ #undef HAVE__BOOL @@ -555,9 +558,15 @@ /* Define to 1 if you have __cpuid. */ #undef HAVE__CPUID +/* Define to 1 if you have __cpuidex. */ +#undef HAVE__CPUIDEX + /* Define to 1 if you have __get_cpuid. */ #undef HAVE__GET_CPUID +/* Define to 1 if you have __get_cpuid_count. */ +#undef HAVE__GET_CPUID_COUNT + /* Define to 1 if your compiler understands _Static_assert. */ #undef HAVE__STATIC_ASSERT @@ -680,6 +689,9 @@ /* Define to 1 to build with assertion checks. (--enable-cassert) */ #undef USE_ASSERT_CHECKING +/* Define to 1 to use AVX512 popcount instructions with a runtime check. */ +#undef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK + /* Define to 1 to build with Bonjour support. (--with-bonjour) */ #undef USE_BONJOUR diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h index de480da71e..2b35021bc6 100644 --- a/src/include/port/pg_bitutils.h +++ b/src/include/port/pg_bitutils.h @@ -304,6 +304,21 @@ extern PGDLLIMPORT int (*pg_popcount32) (uint32 word); extern PGDLLIMPORT int (*pg_popcount64) (uint64 word); extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int bytes); +/* Export pg_popcount_fast() for use in the AVX512 implementation. */ +extern uint64 pg_popcount_fast(const char *buf, int bytes); + +/* + * We can also try to use the AVX512 popcount instruction on some systems. + * The implementation of that is located in its own file because it may + * require special compiler flags that we don't want to apply to any other + * files. Note that we only build this when TRY_POPCNT_FAST is set so that we + * can fall back to pg_popcount_fast() as needed. + */ +#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK +extern bool pg_popcount_avx512_available(void); +extern uint64 pg_popcount_avx512(const char *buf, int bytes); +#endif + #else /* Use a portable implementation -- no need for a function pointer. */ extern int pg_popcount32(uint32 word); diff --git a/src/makefiles/meson.build b/src/makefiles/meson.build index b0f4178b3d..5618050b30 100644 --- a/src/makefiles/meson.build +++ b/src/makefiles/meson.build @@ -100,8 +100,10 @@ pgxs_kv = { ' '.join(cflags_no_decl_after_statement), 'CFLAGS_CRC': ' '.join(cflags_crc), + 'CFLAGS_POPCNT': ' '.join(cflags_popcnt), 'CFLAGS_UNROLL_LOOPS': ' '.join(unroll_loops_cflags), 'CFLAGS_VECTORIZE': ' '.join(vectorize_cflags), + 'CFLAGS_XSAVE': ' '.join(cflags_xsave), 'LDFLAGS': var_ldflags, 'LDFLAGS_EX': var_ldflags_ex, @@ -177,7 +179,7 @@ pgxs_empty = [ 'WANTED_LANGUAGES', # Not needed because we don't build the server / PLs with the generated makefile - 'LIBOBJS', 'PG_CRC32C_OBJS', 'TAS', + 'LIBOBJS', 'PG_CRC32C_OBJS', 'PG_POPCNT_OBJS', 'TAS', 'DTRACEFLAGS', # only server has dtrace probes 'perl_archlibexp', 'perl_embed_ccflags', 'perl_embed_ldflags', 'perl_includespec', 'perl_privlibexp', diff --git a/src/port/Makefile b/src/port/Makefile index dcc8737e68..db7c02117b 100644 --- a/src/port/Makefile +++ b/src/port/Makefile @@ -38,6 +38,7 @@ LIBS += $(PTHREAD_LIBS) OBJS = \ $(LIBOBJS) \ $(PG_CRC32C_OBJS) \ + $(PG_POPCNT_OBJS) \ bsearch_arg.o \ chklocale.o \ inet_net_ntop.o \ @@ -92,6 +93,16 @@ pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC) pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC) pg_crc32c_armv8_srv.o: CFLAGS+=$(CFLAGS_CRC) +# all versions of pg_popcount_avx512_choose.o need CFLAGS_XSAVE +pg_popcount_avx512_choose.o: CFLAGS+=$(CFLAGS_XSAVE) +pg_popcount_avx512_choose_shlib.o: CFLAGS+=$(CFLAGS_XSAVE) +pg_popcount_avx512_choose_srv.o: CFLAGS+=$(CFLAGS_XSAVE) + +# all versions of pg_popcount_avx512.o need CFLAGS_POPCNT +pg_popcount_avx512.o: CFLAGS+=$(CFLAGS_POPCNT) +pg_popcount_avx512_shlib.o: CFLAGS+=$(CFLAGS_POPCNT) +pg_popcount_avx512_srv.o: CFLAGS+=$(CFLAGS_POPCNT) + # # Shared library versions of object files # diff --git a/src/port/meson.build b/src/port/meson.build index 92b593e6ef..fd9ee199d1 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -84,6 +84,8 @@ replace_funcs_pos = [ ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 'crc'], ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], ['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], + ['pg_popcount_avx512', 'USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 'popcnt'], + ['pg_popcount_avx512_choose', 'USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 'xsave'], # arm / aarch64 ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C'], @@ -98,8 +100,8 @@ replace_funcs_pos = [ ['pg_crc32c_sb8', 'USE_SLICING_BY_8_CRC32C'], ] -pgport_cflags = {'crc': cflags_crc} -pgport_sources_cflags = {'crc': []} +pgport_cflags = {'crc': cflags_crc, 'popcnt': cflags_popcnt, 'xsave': cflags_xsave} +pgport_sources_cflags = {'crc': [], 'popcnt': [], 'xsave': []} foreach f : replace_funcs_neg func = f.get(0) diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c index 6271acea60..2fa16b54b8 100644 --- a/src/port/pg_bitutils.c +++ b/src/port/pg_bitutils.c @@ -114,7 +114,10 @@ static int pg_popcount64_choose(uint64 word); static uint64 pg_popcount_choose(const char *buf, int bytes); static inline int pg_popcount32_fast(uint32 word); static inline int pg_popcount64_fast(uint64 word); -static uint64 pg_popcount_fast(const char *buf, int bytes); + +#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK +static uint64 pg_popcount_fast_or_avx512(const char *buf, int bytes); +#endif int (*pg_popcount32) (uint32 word) = pg_popcount32_choose; int (*pg_popcount64) (uint64 word) = pg_popcount64_choose; @@ -156,6 +159,10 @@ choose_popcount_functions(void) pg_popcount32 = pg_popcount32_fast; pg_popcount64 = pg_popcount64_fast; pg_popcount_optimized = pg_popcount_fast; +#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK + if (pg_popcount_avx512_available()) + pg_popcount_optimized = pg_popcount_fast_or_avx512; +#endif } else { @@ -224,7 +231,7 @@ __asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc"); * pg_popcount_fast * Returns the number of 1-bits in buf */ -static uint64 +uint64 pg_popcount_fast(const char *buf, int bytes) { uint64 popcnt = 0; @@ -266,6 +273,24 @@ pg_popcount_fast(const char *buf, int bytes) return popcnt; } +/* + * This is a wrapper function for pg_popcount_avx512() that uses + * pg_popcount_fast() when there aren't enough bytes to fit in an AVX-512 + * register. The compiler should be able to inline pg_popcount_fast() so that + * we only take on additional function call overhead when it's likely to be a + * better option. + */ +#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK +static uint64 +pg_popcount_fast_or_avx512(const char *buf, int bytes) +{ + if (bytes < 64) + return pg_popcount_fast(buf, bytes); + else + return pg_popcount_avx512(buf, bytes); +} +#endif /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */ + #endif /* TRY_POPCNT_FAST */ diff --git a/src/port/pg_popcount_avx512.c b/src/port/pg_popcount_avx512.c new file mode 100644 index 0000000000..f86558d1ee --- /dev/null +++ b/src/port/pg_popcount_avx512.c @@ -0,0 +1,49 @@ +/*------------------------------------------------------------------------- + * + * pg_popcount_avx512.c + * Holds the pg_popcount() implementation that uses AVX512 instructions. + * + * Copyright (c) 2024, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/port/pg_popcount_avx512.c + * + *------------------------------------------------------------------------- + */ +#include "c.h" + +#include <immintrin.h> + +#include "port/pg_bitutils.h" + +/* + * It's probably unlikely that TRY_POPCNT_FAST won't be set if we are able to + * use AVX512 intrinsics, but we check it anyway to be sure. We rely on + * pg_popcount_fast() as our fallback implementation in pg_popcount_avx512(). + */ +#ifdef TRY_POPCNT_FAST + +/* + * pg_popcount_avx512 + * Returns the number of 1-bits in buf + */ +uint64 +pg_popcount_avx512(const char *buf, int bytes) +{ + uint64 popcnt; + __m512i accum = _mm512_setzero_si512(); + + for (; bytes >= sizeof(__m512i); bytes -= sizeof(__m512i)) + { + const __m512i val = _mm512_loadu_si512((const __m512i *) buf); + const __m512i cnt = _mm512_popcnt_epi64(val); + + accum = _mm512_add_epi64(accum, cnt); + buf += sizeof(__m512i); + } + + popcnt = _mm512_reduce_add_epi64(accum); + return popcnt + pg_popcount_fast(buf, bytes); +} + +#endif /* TRY_POPCNT_FAST */ diff --git a/src/port/pg_popcount_avx512_choose.c b/src/port/pg_popcount_avx512_choose.c new file mode 100644 index 0000000000..9e81cd33ad --- /dev/null +++ b/src/port/pg_popcount_avx512_choose.c @@ -0,0 +1,71 @@ +/*------------------------------------------------------------------------- + * + * pg_popcount_avx512_choose.c + * Test whether we can use AVX512 POPCNT instructions. + * + * Copyright (c) 2024, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/port/pg_popcount_avx512_choose.c + * + *------------------------------------------------------------------------- + */ +#include "c.h" + +#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT) +#include <cpuid.h> +#endif + +#ifdef HAVE_XSAVE_INTRINSICS +#include <immintrin.h> +#endif + +#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX) +#include <intrin.h> +#endif + +#include "port/pg_bitutils.h" + +/* + * It's probably unlikely that TRY_POPCNT_FAST won't be set if we are able to + * use AVX512 intrinsics, but we check it anyway to be sure. We rely on + * pg_popcount_fast() as our fallback implementation in pg_popcount_avx512(). + */ +#ifdef TRY_POPCNT_FAST + +/* + * Returns true if the CPU supports AVX512 POPCNT. + */ +bool +pg_popcount_avx512_available(void) +{ + unsigned int exx[4] = {0, 0, 0, 0}; + + /* does CPUID say there's support for AVX512 POPCNT? */ +#if defined(HAVE__GET_CPUID_COUNT) + __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); +#elif defined(HAVE__CPUIDEX) + __cpuidex(exx, 7, 0); +#endif + if ((exx[2] & (1 << 14)) == 0) /* avx512vpopcntdq */ + return false; + + /* does CPUID say there's support for XGETBV? */ + memset(exx, 0, sizeof(exx)); +#if defined(HAVE__GET_CPUID) + __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]); +#elif defined(HAVE__CPUID) + __cpuid(exx, 1); +#endif + if ((exx[2] & (1 << 26)) == 0) /* xsave */ + return false; + + /* does XGETBV say the ZMM registers are enabled? */ +#ifdef HAVE_XSAVE_INTRINSICS + return (_xgetbv(0) & 0xe0) != 0; +#else + return false; +#endif +} + +#endif /* TRY_POPCNT_FAST */ diff --git a/src/test/regress/expected/bit.out b/src/test/regress/expected/bit.out index e17cbf42ca..6a436288bb 100644 --- a/src/test/regress/expected/bit.out +++ b/src/test/regress/expected/bit.out @@ -740,6 +740,30 @@ SELECT bit_count(B'1111111111'::bit(10)); 10 (1 row) +SELECT bit_count(repeat('0', 100)::bit(100)); + bit_count +----------- + 0 +(1 row) + +SELECT bit_count(repeat('1', 100)::bit(100)); + bit_count +----------- + 100 +(1 row) + +SELECT bit_count(repeat('01', 500)::bit(1000)); + bit_count +----------- + 500 +(1 row) + +SELECT bit_count(repeat('10101', 200)::bit(1000)); + bit_count +----------- + 600 +(1 row) + -- This table is intentionally left around to exercise pg_dump/pg_upgrade CREATE TABLE bit_defaults( b1 bit(4) DEFAULT '1001', diff --git a/src/test/regress/sql/bit.sql b/src/test/regress/sql/bit.sql index 34230b99fb..8ba6facd03 100644 --- a/src/test/regress/sql/bit.sql +++ b/src/test/regress/sql/bit.sql @@ -223,6 +223,10 @@ SELECT overlay(B'0101011100' placing '001' from 20); -- bit_count SELECT bit_count(B'0101011100'::bit(10)); SELECT bit_count(B'1111111111'::bit(10)); +SELECT bit_count(repeat('0', 100)::bit(100)); +SELECT bit_count(repeat('1', 100)::bit(100)); +SELECT bit_count(repeat('01', 500)::bit(1000)); +SELECT bit_count(repeat('10101', 200)::bit(1000)); -- This table is intentionally left around to exercise pg_dump/pg_upgrade CREATE TABLE bit_defaults( -- 2.25.1
>From 872394c3bf67aaa9db0975be4a4fdf3d8863b105 Mon Sep 17 00:00:00 2001 From: Nathan Bossart <nat...@postgresql.org> Date: Sun, 31 Mar 2024 22:22:15 -0500 Subject: [PATCH v24 2/2] optimize visibilitymap_count() with AVX512 --- src/backend/access/heap/visibilitymap.c | 25 +--- src/include/port/pg_bitutils.h | 37 +++++- src/port/pg_bitutils.c | 144 ++++++++++++++++++++++++ src/port/pg_popcount_avx512.c | 25 ++++ 4 files changed, 210 insertions(+), 21 deletions(-) diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c index 1ab6c865e3..8b24e7bc33 100644 --- a/src/backend/access/heap/visibilitymap.c +++ b/src/backend/access/heap/visibilitymap.c @@ -119,10 +119,8 @@ #define HEAPBLK_TO_OFFSET(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK) /* Masks for counting subsets of bits in the visibility map. */ -#define VISIBLE_MASK64 UINT64CONST(0x5555555555555555) /* The lower bit of each - * bit pair */ -#define FROZEN_MASK64 UINT64CONST(0xaaaaaaaaaaaaaaaa) /* The upper bit of each - * bit pair */ +#define VISIBLE_MASK8 (0x55) /* The lower bit of each bit pair */ +#define FROZEN_MASK8 (0xaa) /* The upper bit of each bit pair */ /* prototypes for internal routines */ static Buffer vm_readbuf(Relation rel, BlockNumber blkno, bool extend); @@ -396,7 +394,6 @@ visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_fro { Buffer mapBuffer; uint64 *map; - int i; /* * Read till we fall off the end of the map. We assume that any extra @@ -414,21 +411,9 @@ visibilitymap_count(Relation rel, BlockNumber *all_visible, BlockNumber *all_fro */ map = (uint64 *) PageGetContents(BufferGetPage(mapBuffer)); - StaticAssertStmt(MAPSIZE % sizeof(uint64) == 0, - "unsupported MAPSIZE"); - if (all_frozen == NULL) - { - for (i = 0; i < MAPSIZE / sizeof(uint64); i++) - nvisible += pg_popcount64(map[i] & VISIBLE_MASK64); - } - else - { - for (i = 0; i < MAPSIZE / sizeof(uint64); i++) - { - nvisible += pg_popcount64(map[i] & VISIBLE_MASK64); - nfrozen += pg_popcount64(map[i] & FROZEN_MASK64); - } - } + nvisible += pg_popcount_masked((const char *) map, MAPSIZE, VISIBLE_MASK8); + if (all_frozen) + nfrozen += pg_popcount_masked((const char *) map, MAPSIZE, FROZEN_MASK8); ReleaseBuffer(mapBuffer); } diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h index 2b35021bc6..a0d743d36b 100644 --- a/src/include/port/pg_bitutils.h +++ b/src/include/port/pg_bitutils.h @@ -303,9 +303,11 @@ pg_ceil_log2_64(uint64 num) extern PGDLLIMPORT int (*pg_popcount32) (uint32 word); extern PGDLLIMPORT int (*pg_popcount64) (uint64 word); extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int bytes); +extern PGDLLIMPORT uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask); -/* Export pg_popcount_fast() for use in the AVX512 implementation. */ +/* Exported for use in the AVX512 implementation. */ extern uint64 pg_popcount_fast(const char *buf, int bytes); +extern uint64 pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask); /* * We can also try to use the AVX512 popcount instruction on some systems. @@ -317,6 +319,7 @@ extern uint64 pg_popcount_fast(const char *buf, int bytes); #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK extern bool pg_popcount_avx512_available(void); extern uint64 pg_popcount_avx512(const char *buf, int bytes); +extern uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask); #endif #else @@ -324,6 +327,7 @@ extern uint64 pg_popcount_avx512(const char *buf, int bytes); extern int pg_popcount32(uint32 word); extern int pg_popcount64(uint64 word); extern uint64 pg_popcount_optimized(const char *buf, int bytes); +extern uint64 pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask); #endif /* TRY_POPCNT_FAST */ @@ -361,6 +365,37 @@ pg_popcount(const char *buf, int bytes) return pg_popcount_optimized(buf, bytes); } +/* + * Returns the number of 1-bits in buf after applying the mask to each byte. + * + * Similar to pg_popcount(), we only take on the function pointer overhead when + * it's likely to be faster. + */ +static inline uint64 +pg_popcount_masked(const char *buf, int bytes, bits8 mask) +{ + /* + * We set the threshold to the point at which we'll first use special + * instructions in the optimized version. + */ +#if SIZEOF_VOID_P >= 8 + int threshold = 8; +#else + int threshold = 4; +#endif + + if (bytes < threshold) + { + uint64 popcnt = 0; + + while (bytes--) + popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask]; + return popcnt; + } + + return pg_popcount_masked_optimized(buf, bytes, mask); +} + /* * Rotate the bits of "word" to the right/left by n bits. */ diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c index 2fa16b54b8..11b69608fa 100644 --- a/src/port/pg_bitutils.c +++ b/src/port/pg_bitutils.c @@ -106,22 +106,26 @@ const uint8 pg_number_of_ones[256] = { static inline int pg_popcount32_slow(uint32 word); static inline int pg_popcount64_slow(uint64 word); static uint64 pg_popcount_slow(const char *buf, int bytes); +static uint64 pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask); #ifdef TRY_POPCNT_FAST static bool pg_popcount_available(void); static int pg_popcount32_choose(uint32 word); static int pg_popcount64_choose(uint64 word); static uint64 pg_popcount_choose(const char *buf, int bytes); +static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask); static inline int pg_popcount32_fast(uint32 word); static inline int pg_popcount64_fast(uint64 word); #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK static uint64 pg_popcount_fast_or_avx512(const char *buf, int bytes); +static uint64 pg_popcount_masked_fast_or_avx512(const char *buf, int bytes, bits8 mask); #endif int (*pg_popcount32) (uint32 word) = pg_popcount32_choose; int (*pg_popcount64) (uint64 word) = pg_popcount64_choose; uint64 (*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose; +uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose; #endif /* TRY_POPCNT_FAST */ #ifdef TRY_POPCNT_FAST @@ -159,9 +163,13 @@ choose_popcount_functions(void) pg_popcount32 = pg_popcount32_fast; pg_popcount64 = pg_popcount64_fast; pg_popcount_optimized = pg_popcount_fast; + pg_popcount_masked_optimized = pg_popcount_masked_fast; #ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK if (pg_popcount_avx512_available()) + { pg_popcount_optimized = pg_popcount_fast_or_avx512; + pg_popcount_masked_optimized = pg_popcount_masked_fast_or_avx512; + } #endif } else @@ -169,6 +177,7 @@ choose_popcount_functions(void) pg_popcount32 = pg_popcount32_slow; pg_popcount64 = pg_popcount64_slow; pg_popcount_optimized = pg_popcount_slow; + pg_popcount_masked_optimized = pg_popcount_masked_slow; } } @@ -193,6 +202,13 @@ pg_popcount_choose(const char *buf, int bytes) return pg_popcount_optimized(buf, bytes); } +static uint64 +pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask) +{ + choose_popcount_functions(); + return pg_popcount_masked(buf, bytes, mask); +} + /* * pg_popcount32_fast * Return the number of 1 bits set in word @@ -291,6 +307,74 @@ pg_popcount_fast_or_avx512(const char *buf, int bytes) } #endif /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */ +/* + * pg_popcount_masked_fast + * Returns the number of 1-bits in buf after apply the mask to each byte + */ +uint64 +pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask) +{ + uint64 popcnt = 0; + +#if SIZEOF_VOID_P >= 8 + /* Process in 64-bit chunks if the buffer is aligned */ + uint64 maskv = ~UINT64CONST(0) / 0xFF * mask; + + if (buf == (const char *) TYPEALIGN(8, buf)) + { + const uint64 *words = (const uint64 *) buf; + + while (bytes >= 8) + { + popcnt += pg_popcount64_fast(*words++ & maskv); + bytes -= 8; + } + + buf = (const char *) words; + } +#else + /* Process in 32-bit chunks if the buffer is aligned. */ + uint32 maskv = ~0 / 0xFF * mask; + + if (buf == (const char *) TYPEALIGN(4, buf)) + { + const uint32 *words = (const uint32 *) buf; + + while (bytes >= 4) + { + popcnt += pg_popcount32_fast(*words++ & maskv); + bytes -= 4; + } + + buf = (const char *) words; + } +#endif + + /* Process any remaining bytes */ + while (bytes--) + popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask]; + + return popcnt; +} + +/* + * This is a wrapper function for pg_popcount_masked_avx512() that uses + * pg_popcount_masked_fast() when there aren't enough bytes to fit in an + * AVX-512 register. The compiler should be able to inline + * pg_popcount_masked_fast() so that we only take on additional function call + * overhead when it's likely to be a better option. + */ +#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK +static uint64 +pg_popcount_masked_fast_or_avx512(const char *buf, int bytes, bits8 mask) +{ + if (bytes < 64) + return pg_popcount_masked_fast(buf, bytes, mask); + else + return pg_popcount_masked_avx512(buf, bytes, mask); +} +#endif /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */ + #endif /* TRY_POPCNT_FAST */ @@ -390,6 +474,56 @@ pg_popcount_slow(const char *buf, int bytes) return popcnt; } +/* + * pg_popcount_masked_slow + * Returns the number of 1-bits in buf after apply the mask to each byte + */ +static uint64 +pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask) +{ + uint64 popcnt = 0; + +#if SIZEOF_VOID_P >= 8 + /* Process in 64-bit chunks if the buffer is aligned */ + uint64 maskv = ~UINT64CONST(0) / 0xFF * mask; + + if (buf == (const char *) TYPEALIGN(8, buf)) + { + const uint64 *words = (const uint64 *) buf; + + while (bytes >= 8) + { + popcnt += pg_popcount64_slow(*words++ & maskv); + bytes -= 8; + } + + buf = (const char *) words; + } +#else + /* Process in 32-bit chunks if the buffer is aligned. */ + uint32 maskv = ~0 / 0xFF * mask; + + if (buf == (const char *) TYPEALIGN(4, buf)) + { + const uint32 *words = (const uint32 *) buf; + + while (bytes >= 4) + { + popcnt += pg_popcount32_slow(*words++ & maskv); + bytes -= 4; + } + + buf = (const char *) words; + } +#endif + + /* Process any remaining bytes */ + while (bytes--) + popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask]; + + return popcnt; +} + #ifndef TRY_POPCNT_FAST /* @@ -421,4 +555,14 @@ pg_popcount_optimized(const char *buf, int bytes) return pg_popcount_slow(buf, bytes); } +/* + * pg_popcount_masked_optimized + * Returns the number of 1-bits in buf after apply the mask to each byte + */ +uint64 +pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask) +{ + return pg_popcount_masked_slow(buf, bytes, mask); +} + #endif /* !TRY_POPCNT_FAST */ diff --git a/src/port/pg_popcount_avx512.c b/src/port/pg_popcount_avx512.c index f86558d1ee..fb9ab3313b 100644 --- a/src/port/pg_popcount_avx512.c +++ b/src/port/pg_popcount_avx512.c @@ -46,4 +46,29 @@ pg_popcount_avx512(const char *buf, int bytes) return popcnt + pg_popcount_fast(buf, bytes); } +/* + * pg_popcount_masked_avx512 + * Returns the number of 1-bits in buf after applying the mask to each byte + */ +uint64 +pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask) +{ + uint64 popcnt; + __m512i accum = _mm512_setzero_si512(); + const __m512i maskv = _mm512_set1_epi8(mask); + + for (; bytes >= sizeof(__m512i); bytes -= sizeof(__m512i)) + { + const __m512i val = _mm512_loadu_si512((const __m512i *) buf); + const __m512i vmasked = _mm512_and_si512(val, maskv); + const __m512i cnt = _mm512_popcnt_epi64(vmasked); + + accum = _mm512_add_epi64(accum, cnt); + buf += sizeof(__m512i); + } + + popcnt = _mm512_reduce_add_epi64(accum); + return popcnt + pg_popcount_masked_fast(buf, bytes, mask); +} + #endif /* TRY_POPCNT_FAST */ -- 2.25.1