From 0b8f9851f38444dbe29009120d98cd38e93efe7f Mon Sep 17 00:00:00 2001
From: Paul Amonson <paul.d.amonson@intel.com>
Date: Tue, 21 May 2024 13:23:39 -0700
Subject: [PATCH] [Feat] Add AVX-512 crc32c algorithm to postgres
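
Add an AVX-512 implementation of CRC-32C (using VPCLMULQDQ carry-less
multiplication) and select it at run time when the CPU supports it.
Selection follows the same pattern as the existing SSE 4.2 runtime
check: pg_comp_crc32c initially points at a chooser that probes
CPUID/XGETBV once and then repoints the function pointer, as in the
new pg_crc32c_avx512_choose.c:

    static pg_crc32c
    pg_comp_avx512_choose(pg_crc32c crc, const void *data, size_t len)
    {
        if (pg_crc32c_avx512_available())
            pg_comp_crc32c = pg_comp_crc32c_avx512;
        else
            pg_comp_crc32c = pg_comp_crc32c_sb8;

        return pg_comp_crc32c(crc, data, len);
    }

Within pg_comp_crc32c_avx512(), inputs shorter than 256 bytes and any
trailing bytes are still handled with the SSE 4.2 CRC32 instructions
(crc32c_fallback).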

Signed-off-by: Paul Amonson <paul.d.amonson@intel.com>
---
 config/c-compiler.m4               |  48 +++++++
 configure                          | 223 +++++++++++++++++++++++------
 configure.ac                       | 106 +++++++++-----
 meson.build                        |  41 +++++-
 src/include/pg_config.h.in         |   3 +
 src/include/port/pg_crc32c.h       |  24 +++-
 src/port/Makefile                  |  10 ++
 src/port/meson.build               |   4 +
 src/port/pg_crc32c_avx512.c        | 222 ++++++++++++++++++++++++++++
 src/port/pg_crc32c_avx512_choose.c | 202 ++++++++++++++++++++++++++
 10 files changed, 797 insertions(+), 86 deletions(-)
 create mode 100644 src/port/pg_crc32c_avx512.c
 create mode 100644 src/port/pg_crc32c_avx512_choose.c

diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 10f8c7bd0a..1d33932cb5 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -628,6 +628,54 @@ fi
 undefine([Ac_cachevar])dnl
 ])# PGAC_SSE42_CRC32_INTRINSICS
 
+# PGAC_AVX512_CRC32_INTRINSICS
+# ----------------------------
+# Check if the compiler supports the AVX-512 carry-less multiplication
+# and ternary-logic intrinsics used for CRC-32C computation:
+# _mm512_clmulepi64_epi128, _mm512_ternarylogic_epi64,
+# _mm512_shuffle_i64x2 and friends, as well as the SSE 4.2 CRC32
+# instruction _mm_crc32_u64 used to reduce the final 128 bits.
+#
+# Optional compiler flags can be passed as the argument (e.g. -msse4.2
+# -mavx512vl -mvpclmulqdq). If the intrinsics are supported, sets
+# pgac_avx512_crc32_intrinsics, and CFLAGS_CRC.
+AC_DEFUN([PGAC_AVX512_CRC32_INTRINSICS],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx512_crc32_intrinsics_$1])])dnl
+AC_CACHE_CHECK([for _mm512_clmulepi64_epi128, _mm512_ternarylogic_epi64... with CFLAGS=$1], [Ac_cachevar],
+[pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS $1"
+AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>],
+  [const unsigned long k1k2[[8]] = {
+  0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86,
+  0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86};
+  unsigned char buffer[[512]];
+  unsigned char *aligned = (unsigned char*)(((size_t)buffer + 64L) & 0xffffffffffc0L);
+  unsigned long val;
+  __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
+  __m128i a1, a2;
+  unsigned int crc = 0xffffffff;
+  y8 = _mm512_load_si512((__m512i *)aligned);
+  x0 = _mm512_loadu_si512((__m512i *)k1k2);
+  x1 = _mm512_loadu_si512((__m512i *)(buffer + 0x00));
+  x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc)));
+  x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
+  x1 = _mm512_ternarylogic_epi64(x1, x5, y5, 0x96);
+  a1 = _mm512_extracti32x4_epi32(x1, 3);
+  a1 = _mm_xor_epi64(a1, _mm512_castsi512_si128(x0));
+  x0 = _mm512_shuffle_i64x2(x1, x1, 0x4E);
+  val = _mm_crc32_u64(0, _mm_extract_epi64(a1, 0));
+  crc = (unsigned int)_mm_crc32_u64(val, _mm_extract_epi64(a1, 1));
+  return crc != 0;])],
+  [Ac_cachevar=yes],
+  [Ac_cachevar=no])
+CFLAGS="$pgac_save_CFLAGS"])
+if test x"$Ac_cachevar" = x"yes"; then
+  CFLAGS_CRC="$1"
+  pgac_avx512_crc32_intrinsics=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_AVX512_CRC32_INTRINSICS
+
 
 # PGAC_ARMV8_CRC32C_INTRINSICS
 # ----------------------------
diff --git a/configure b/configure
index 7b03db56a6..45cd755867 100755
--- a/configure
+++ b/configure
@@ -14898,7 +14898,7 @@ else
     We can't simply define LARGE_OFF_T to be 9223372036854775807,
     since some C++ compilers masquerading as C compilers
     incorrectly reject 9223372036854775807.  */
-#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62))
+#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31))
   int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721
 		       && LARGE_OFF_T % 2147483647 == 1)
 		      ? 1 : -1];
@@ -14944,7 +14944,7 @@ else
     We can't simply define LARGE_OFF_T to be 9223372036854775807,
     since some C++ compilers masquerading as C compilers
     incorrectly reject 9223372036854775807.  */
-#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62))
+#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31))
   int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721
 		       && LARGE_OFF_T % 2147483647 == 1)
 		      ? 1 : -1];
@@ -14968,7 +14968,7 @@ rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
     We can't simply define LARGE_OFF_T to be 9223372036854775807,
     since some C++ compilers masquerading as C compilers
     incorrectly reject 9223372036854775807.  */
-#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62))
+#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31))
   int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721
 		       && LARGE_OFF_T % 2147483647 == 1)
 		      ? 1 : -1];
@@ -15013,7 +15013,7 @@ else
     We can't simply define LARGE_OFF_T to be 9223372036854775807,
     since some C++ compilers masquerading as C compilers
     incorrectly reject 9223372036854775807.  */
-#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62))
+#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31))
   int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721
 		       && LARGE_OFF_T % 2147483647 == 1)
 		      ? 1 : -1];
@@ -15037,7 +15037,7 @@ rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
     We can't simply define LARGE_OFF_T to be 9223372036854775807,
     since some C++ compilers masquerading as C compilers
     incorrectly reject 9223372036854775807.  */
-#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62))
+#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31))
   int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721
 		       && LARGE_OFF_T % 2147483647 == 1)
 		      ? 1 : -1];
@@ -17774,6 +17774,123 @@ fi
 
 fi
 
+# Check for Intel AVX-512 intrinsics to do CRC calculations.
+#
+# First check if the _mm512_clmulepi64_epi128 and related intrinsics can
+# be used with the default compiler flags. If not, check if adding the
+# -msse4.2, -mavx512vl and -mvpclmulqdq flags helps. CFLAGS_CRC is set
+# to whatever flags turn out to be required.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_clmulepi64_epi128, _mm512_ternarylogic_epi64... with CFLAGS=" >&5
+$as_echo_n "checking for _mm512_clmulepi64_epi128, _mm512_ternarylogic_epi64... with CFLAGS=... " >&6; }
+if ${pgac_cv_avx512_crc32_intrinsics_+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS "
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <immintrin.h>
+int
+main ()
+{
+const unsigned long k1k2[8] = {
+  0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86,
+  0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86};
+  unsigned char buffer[512];
+  unsigned char *aligned = (unsigned char*)(((size_t)buffer + 64L) & 0xffffffffffc0L);
+  unsigned long val;
+  __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
+  __m128i a1, a2;
+  unsigned int crc = 0xffffffff;
+  y8 = _mm512_load_si512((__m512i *)aligned);
+  x0 = _mm512_loadu_si512((__m512i *)k1k2);
+  x1 = _mm512_loadu_si512((__m512i *)(buffer + 0x00));
+  x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc)));
+  x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
+  x1 = _mm512_ternarylogic_epi64(x1, x5, y5, 0x96);
+  a1 = _mm512_extracti32x4_epi32(x1, 3);
+  a1 = _mm_xor_epi64(a1, _mm512_castsi512_si128(x0));
+  x0 = _mm512_shuffle_i64x2(x1, x1, 0x4E);
+  val = _mm_crc32_u64(0, _mm_extract_epi64(a1, 0));
+  crc = (unsigned int)_mm_crc32_u64(val, _mm_extract_epi64(a1, 1));
+  return crc != 0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_avx512_crc32_intrinsics_=yes
+else
+  pgac_cv_avx512_crc32_intrinsics_=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+CFLAGS="$pgac_save_CFLAGS"
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_crc32_intrinsics_" >&5
+$as_echo "$pgac_cv_avx512_crc32_intrinsics_" >&6; }
+if test x"$pgac_cv_avx512_crc32_intrinsics_" = x"yes"; then
+  CFLAGS_CRC=""
+  pgac_avx512_crc32_intrinsics=yes
+fi
+
+if test x"$pgac_avx512_crc32_intrinsics" != x"yes"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_clmulepi64_epi128, _mm512_ternarylogic_epi64... with CFLAGS=-msse4.2 -mavx512vl -mvpclmulqdq" >&5
+$as_echo_n "checking for _mm512_clmulepi64_epi128, _mm512_ternarylogic_epi64... with CFLAGS=-msse4.2 -mavx512vl -mvpclmulqdq... " >&6; }
+if ${pgac_cv_avx512_crc32_intrinsics__msse4_2__mavx512vl__mvpclmulqdq+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS -msse4.2 -mavx512vl -mvpclmulqdq"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <immintrin.h>
+int
+main ()
+{
+const unsigned long k1k2[8] = {
+  0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86,
+  0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86};
+  unsigned char buffer[512];
+  unsigned char *aligned = (unsigned char*)(((size_t)buffer + 64L) & 0xffffffffffc0L);
+  unsigned long val;
+  __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
+  __m128i a1, a2;
+  unsigned int crc = 0xffffffff;
+  y8 = _mm512_load_si512((__m512i *)aligned);
+  x0 = _mm512_loadu_si512((__m512i *)k1k2);
+  x1 = _mm512_loadu_si512((__m512i *)(buffer + 0x00));
+  x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc)));
+  x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
+  x1 = _mm512_ternarylogic_epi64(x1, x5, y5, 0x96);
+  a1 = _mm512_extracti32x4_epi32(x1, 3);
+  a1 = _mm_xor_epi64(a1, _mm512_castsi512_si128(x0));
+  x0 = _mm512_shuffle_i64x2(x1, x1, 0x4E);
+  val = _mm_crc32_u64(0, _mm_extract_epi64(a1, 0));
+  crc = (unsigned int)_mm_crc32_u64(val, _mm_extract_epi64(a1, 1));
+  return crc != 0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_avx512_crc32_intrinsics__msse4_2__mavx512vl__mvpclmulqdq=yes
+else
+  pgac_cv_avx512_crc32_intrinsics__msse4_2__mavx512vl__mvpclmulqdq=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+CFLAGS="$pgac_save_CFLAGS"
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_crc32_intrinsics__msse4_2__mavx512vl__mvpclmulqdq" >&5
+$as_echo "$pgac_cv_avx512_crc32_intrinsics__msse4_2__mavx512vl__mvpclmulqdq" >&6; }
+if test x"$pgac_cv_avx512_crc32_intrinsics__msse4_2__mavx512vl__mvpclmulqdq" = x"yes"; then
+  CFLAGS_CRC="-msse4.2 -mavx512vl -mvpclmulqdq"
+  pgac_avx512_crc32_intrinsics=yes
+fi
+
+fi
+
 # Are we targeting a processor that supports SSE 4.2? gcc, clang and icc all
 # define __SSE4_2__ in that case.
 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
@@ -17946,31 +18063,42 @@ fi
 #
 # If we are targeting a LoongArch processor, CRC instructions are
 # always available (at least on 64 bit), so no runtime check is needed.
-if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then
-  # Use Intel SSE 4.2 if available.
-  if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then
-    USE_SSE42_CRC32C=1
+if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_AVX512_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then
+  # Use Intel AVX 512 if available.
+  if test x"$pgac_avx512_crc32_intrinsics" = x"yes" && test x"$AVX512_TARGETED" = x"1" ; then
+    USE_AVX512_CRC32C=1
   else
-    # Intel SSE 4.2, with runtime check? The CPUID instruction is needed for
-    # the runtime check.
-    if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then
-      USE_SSE42_CRC32C_WITH_RUNTIME_CHECK=1
+    # Use Intel SSE 4.2 if available.
+    if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then
+      USE_SSE42_CRC32C=1
     else
-      # Use ARM CRC Extension if available.
-      if test x"$pgac_armv8_crc32c_intrinsics" = x"yes" && test x"$CFLAGS_CRC" = x""; then
-        USE_ARMV8_CRC32C=1
+      # Intel AVX 512, with runtime check? The CPUID instruction is needed for
+      # the runtime check.
+      if test x"$pgac_avx512_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then
+        USE_AVX512_CRC32C_WITH_RUNTIME_CHECK=1
       else
-        # ARM CRC Extension, with runtime check?
-        if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then
-          USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1
+        # Intel SSE 4.2, with runtime check? The CPUID instruction is needed for
+        # the runtime check.
+        if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then
+          USE_SSE42_CRC32C_WITH_RUNTIME_CHECK=1
         else
-          # LoongArch CRCC instructions.
-          if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then
-            USE_LOONGARCH_CRC32C=1
+          # Use ARM CRC Extension if available.
+          if test x"$pgac_armv8_crc32c_intrinsics" = x"yes" && test x"$CFLAGS_CRC" = x""; then
+            USE_ARMV8_CRC32C=1
           else
-            # fall back to slicing-by-8 algorithm, which doesn't require any
-            # special CPU support.
-            USE_SLICING_BY_8_CRC32C=1
+            # ARM CRC Extension, with runtime check?
+            if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then
+              USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1
+            else
+              # LoongArch CRCC instructions.
+              if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then
+                USE_LOONGARCH_CRC32C=1
+              else
+                # fall back to slicing-by-8 algorithm, which doesn't require any
+                # special CPU support.
+                USE_SLICING_BY_8_CRC32C=1
+              fi
+            fi
           fi
         fi
       fi
@@ -17989,44 +18117,53 @@ $as_echo "#define USE_SSE42_CRC32C 1" >>confdefs.h
   { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2" >&5
 $as_echo "SSE 4.2" >&6; }
 else
-  if test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
+  if test x"$USE_AVX512_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
+
+$as_echo "#define USE_AVX512_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
+
+    PG_CRC32C_OBJS="pg_crc32c_avx512.o pg_crc32c_sb8.o pg_crc32c_avx512_choose.o"
+    { $as_echo "$as_me:${as_lineno-$LINENO}: result: AVX 512 with runtime check" >&5
+$as_echo "AVX 512 with runtime check" >&6; }
+  else
+    if test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
 
 $as_echo "#define USE_SSE42_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
 
-    PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o pg_crc32c_sse42_choose.o"
-    { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2 with runtime check" >&5
+      PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o pg_crc32c_sse42_choose.o"
+      { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2 with runtime check" >&5
 $as_echo "SSE 4.2 with runtime check" >&6; }
-  else
-    if test x"$USE_ARMV8_CRC32C" = x"1"; then
+    else
+      if test x"$USE_ARMV8_CRC32C" = x"1"; then
 
 $as_echo "#define USE_ARMV8_CRC32C 1" >>confdefs.h
 
-      PG_CRC32C_OBJS="pg_crc32c_armv8.o"
-      { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions" >&5
+        PG_CRC32C_OBJS="pg_crc32c_armv8.o"
+        { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions" >&5
 $as_echo "ARMv8 CRC instructions" >&6; }
-    else
-      if test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
+      else
+        if test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
 
 $as_echo "#define USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
 
-        PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o"
-        { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions with runtime check" >&5
+          PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o"
+          { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions with runtime check" >&5
 $as_echo "ARMv8 CRC instructions with runtime check" >&6; }
-      else
-        if test x"$USE_LOONGARCH_CRC32C" = x"1"; then
+        else
+          if test x"$USE_LOONGARCH_CRC32C" = x"1"; then
 
 $as_echo "#define USE_LOONGARCH_CRC32C 1" >>confdefs.h
 
-          PG_CRC32C_OBJS="pg_crc32c_loongarch.o"
-          { $as_echo "$as_me:${as_lineno-$LINENO}: result: LoongArch CRCC instructions" >&5
+            PG_CRC32C_OBJS="pg_crc32c_loongarch.o"
+            { $as_echo "$as_me:${as_lineno-$LINENO}: result: LoongArch CRCC instructions" >&5
 $as_echo "LoongArch CRCC instructions" >&6; }
-        else
+          else
 
 $as_echo "#define USE_SLICING_BY_8_CRC32C 1" >>confdefs.h
 
-          PG_CRC32C_OBJS="pg_crc32c_sb8.o"
-          { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5
+            PG_CRC32C_OBJS="pg_crc32c_sb8.o"
+            { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5
 $as_echo "slicing-by-8" >&6; }
+          fi
         fi
       fi
     fi
diff --git a/configure.ac b/configure.ac
index 63e7be3847..73ea4d95dd 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2124,6 +2124,17 @@ if test x"$pgac_sse42_crc32_intrinsics" != x"yes"; then
   PGAC_SSE42_CRC32_INTRINSICS([-msse4.2])
 fi
 
+# Check for Intel AVX-512 intrinsics to do CRC calculations.
+#
+# First check if the _mm512_clmulepi64_epi128 and related intrinsics can
+# be used with the default compiler flags. If not, check if adding the
+# -msse4.2, -mavx512vl and -mvpclmulqdq flags helps. CFLAGS_CRC is set
+# to whatever flags turn out to be required.
+PGAC_AVX512_CRC32_INTRINSICS([])
+if test x"$pgac_avx512_crc32_intrinsics" != x"yes"; then
+  PGAC_AVX512_CRC32_INTRINSICS([-msse4.2 -mavx512vl -mvpclmulqdq])
+fi
+
 # Are we targeting a processor that supports SSE 4.2? gcc, clang and icc all
 # define __SSE4_2__ in that case.
 AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [
@@ -2169,31 +2180,42 @@ AC_SUBST(CFLAGS_CRC)
 #
 # If we are targeting a LoongArch processor, CRC instructions are
 # always available (at least on 64 bit), so no runtime check is needed.
-if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then
-  # Use Intel SSE 4.2 if available.
-  if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then
-    USE_SSE42_CRC32C=1
+if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_AVX512_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then
+  # Use Intel AVX 512 if available.
+  if test x"$pgac_avx512_crc32_intrinsics" = x"yes" && test x"$AVX512_TARGETED" = x"1" ; then
+    USE_AVX512_CRC32C=1
   else
-    # Intel SSE 4.2, with runtime check? The CPUID instruction is needed for
-    # the runtime check.
-    if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then
-      USE_SSE42_CRC32C_WITH_RUNTIME_CHECK=1
+    # Use Intel SSE 4.2 if available.
+    if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then
+      USE_SSE42_CRC32C=1
     else
-      # Use ARM CRC Extension if available.
-      if test x"$pgac_armv8_crc32c_intrinsics" = x"yes" && test x"$CFLAGS_CRC" = x""; then
-        USE_ARMV8_CRC32C=1
+      # Intel AVX 512, with runtime check? The CPUID instruction is needed for
+      # the runtime check.
+      if test x"$pgac_avx512_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then
+        USE_AVX512_CRC32C_WITH_RUNTIME_CHECK=1
       else
-        # ARM CRC Extension, with runtime check?
-        if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then
-          USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1
+        # Intel SSE 4.2, with runtime check? The CPUID instruction is needed for
+        # the runtime check.
+        if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then
+          USE_SSE42_CRC32C_WITH_RUNTIME_CHECK=1
         else
-          # LoongArch CRCC instructions.
-          if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then
-            USE_LOONGARCH_CRC32C=1
+          # Use ARM CRC Extension if available.
+          if test x"$pgac_armv8_crc32c_intrinsics" = x"yes" && test x"$CFLAGS_CRC" = x""; then
+            USE_ARMV8_CRC32C=1
           else
-            # fall back to slicing-by-8 algorithm, which doesn't require any
-            # special CPU support.
-            USE_SLICING_BY_8_CRC32C=1
+            # ARM CRC Extension, with runtime check?
+            if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then
+              USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1
+            else
+              # LoongArch CRCC instructions.
+              if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then
+                USE_LOONGARCH_CRC32C=1
+              else
+                # fall back to slicing-by-8 algorithm, which doesn't require any
+                # special CPU support.
+                USE_SLICING_BY_8_CRC32C=1
+              fi
+            fi
           fi
         fi
       fi
@@ -2208,29 +2230,35 @@ if test x"$USE_SSE42_CRC32C" = x"1"; then
   PG_CRC32C_OBJS="pg_crc32c_sse42.o"
   AC_MSG_RESULT(SSE 4.2)
 else
-  if test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
-    AC_DEFINE(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Intel SSE 4.2 CRC instructions with a runtime check.])
-    PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o pg_crc32c_sse42_choose.o"
-    AC_MSG_RESULT(SSE 4.2 with runtime check)
+  if test x"$USE_AVX512_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
+    AC_DEFINE(USE_AVX512_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Intel AVX 512 CRC instructions with a runtime check.])
+    PG_CRC32C_OBJS="pg_crc32c_avx512.o pg_crc32c_sb8.o pg_crc32c_avx512_choose.o"
+    AC_MSG_RESULT(AVX 512 with runtime check)
   else
-    if test x"$USE_ARMV8_CRC32C" = x"1"; then
-      AC_DEFINE(USE_ARMV8_CRC32C, 1, [Define to 1 to use ARMv8 CRC Extension.])
-      PG_CRC32C_OBJS="pg_crc32c_armv8.o"
-      AC_MSG_RESULT(ARMv8 CRC instructions)
+    if test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
+      AC_DEFINE(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Intel SSE 4.2 CRC instructions with a runtime check.])
+      PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sb8.o pg_crc32c_sse42_choose.o"
+      AC_MSG_RESULT(SSE 4.2 with runtime check)
     else
-      if test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
-        AC_DEFINE(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use ARMv8 CRC Extension with a runtime check.])
-        PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o"
-        AC_MSG_RESULT(ARMv8 CRC instructions with runtime check)
+      if test x"$USE_ARMV8_CRC32C" = x"1"; then
+        AC_DEFINE(USE_ARMV8_CRC32C, 1, [Define to 1 to use ARMv8 CRC Extension.])
+        PG_CRC32C_OBJS="pg_crc32c_armv8.o"
+        AC_MSG_RESULT(ARMv8 CRC instructions)
       else
-        if test x"$USE_LOONGARCH_CRC32C" = x"1"; then
-          AC_DEFINE(USE_LOONGARCH_CRC32C, 1, [Define to 1 to use LoongArch CRCC instructions.])
-          PG_CRC32C_OBJS="pg_crc32c_loongarch.o"
-          AC_MSG_RESULT(LoongArch CRCC instructions)
+        if test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
+          AC_DEFINE(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use ARMv8 CRC Extension with a runtime check.])
+          PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o"
+          AC_MSG_RESULT(ARMv8 CRC instructions with runtime check)
         else
-          AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).])
-          PG_CRC32C_OBJS="pg_crc32c_sb8.o"
-          AC_MSG_RESULT(slicing-by-8)
+          if test x"$USE_LOONGARCH_CRC32C" = x"1"; then
+            AC_DEFINE(USE_LOONGARCH_CRC32C, 1, [Define to 1 to use LoongArch CRCC instructions.])
+            PG_CRC32C_OBJS="pg_crc32c_loongarch.o"
+            AC_MSG_RESULT(LoongArch CRCC instructions)
+          else
+            AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).])
+            PG_CRC32C_OBJS="pg_crc32c_sb8.o"
+            AC_MSG_RESULT(slicing-by-8)
+          fi
         fi
       fi
     fi
diff --git a/meson.build b/meson.build
index f9279c837d..a2b087d561 100644
--- a/meson.build
+++ b/meson.build
@@ -2144,6 +2144,34 @@ if host_cpu == 'x86' or host_cpu == 'x86_64'
     cdata.set('USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 1)
     have_optimized_crc = true
   else
+    avx_prog = '''
+#include <immintrin.h>
+
+int main(void)
+{
+  const unsigned long k1k2[8] = {
+  0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86,
+  0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86};
+  unsigned char buffer[512];
+  unsigned char *aligned = (unsigned char*)(((size_t)buffer + 64L) & 0xffffffffffc0L);
+  unsigned long val;
+  __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
+  __m128i a1, a2;
+  unsigned int crc = 0xffffffff;
+  y8 = _mm512_load_si512((__m512i *)aligned);
+  x0 = _mm512_loadu_si512((__m512i *)k1k2);
+  x1 = _mm512_loadu_si512((__m512i *)(buffer + 0x00));
+  x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc)));
+  x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
+  x1 = _mm512_ternarylogic_epi64(x1, x5, y5, 0x96);
+  a1 = _mm512_extracti32x4_epi32(x1, 3);
+  a1 = _mm_xor_epi64(a1, _mm512_castsi512_si128(x0));
+  x0 = _mm512_shuffle_i64x2(x1, x1, 0x4E);
+  val = _mm_crc32_u64(0, _mm_extract_epi64(a1, 0));
+  crc = (unsigned int)_mm_crc32_u64(val, _mm_extract_epi64(a1, 1));
+  return crc != 0;
+}
+'''
 
     prog = '''
 #include <nmmintrin.h>
@@ -2157,13 +2185,20 @@ int main(void)
     return crc == 0;
 }
 '''
-
-    if cc.links(prog, name: '_mm_crc32_u8 and _mm_crc32_u32 without -msse4.2',
+    if cc.links(avx_prog,
+          name: '_mm512_clmulepi64_epi128 ... with -msse4.2 -mavx512vl -mvpclmulqdq',
+          args: test_c_args + ['-msse4.2', '-mavx512vl', '-mvpclmulqdq'])
+      cflags_crc += ['-msse4.2','-mavx512vl','-mvpclmulqdq']
+      cdata.set('USE_AVX512_CRC32C', false)
+      cdata.set('USE_AVX512_CRC32C_WITH_RUNTIME_CHECK', 1)
+      have_optimized_crc = true
+    endif
+    if have_optimized_crc == false and cc.links(prog, name: '_mm_crc32_u8 and _mm_crc32_u32 without -msse4.2',
           args: test_c_args)
       # Use Intel SSE 4.2 unconditionally.
       cdata.set('USE_SSE42_CRC32C', 1)
       have_optimized_crc = true
-    elif cc.links(prog, name: '_mm_crc32_u8 and _mm_crc32_u32 with -msse4.2',
+    elif have_optimized_crc == false and cc.links(prog, name: '_mm_crc32_u8 and _mm_crc32_u32 with -msse4.2',
           args: test_c_args + ['-msse4.2'])
       # Use Intel SSE 4.2, with runtime check. The CPUID instruction is needed for
       # the runtime check.
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index f8d3e3b6b8..6e08f1c6c7 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -738,6 +738,9 @@
 /* Define to 1 use Intel SSE 4.2 CRC instructions. */
 #undef USE_SSE42_CRC32C
 
+/* Define to 1 to use Intel AVX 512 CRC instructions with a runtime check. */
+#undef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
+
 /* Define to 1 to use Intel SSE 4.2 CRC instructions with a runtime check. */
 #undef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK
 
diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h
index 63c8e3a00b..b632ac7d59 100644
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -49,6 +49,14 @@ typedef uint32 pg_crc32c;
 
 extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len);
 
+#elif defined(USE_AVX512_CRC32C)
+/* Use Intel AVX-512 instructions. */
+#define COMP_CRC32C(crc, data, len) \
+	((crc) = pg_comp_crc32c_avx512((crc), (data), (len)))
+#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
+
+extern pg_crc32c pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t len);
+
 #elif defined(USE_ARMV8_CRC32C)
 /* Use ARMv8 CRC Extension instructions. */
 
@@ -67,6 +75,21 @@ extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t le
 
 extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len);
 
+#elif defined(USE_AVX512_CRC32C_WITH_RUNTIME_CHECK)
+
+/*
+ * Use Intel AVX-512 instructions, but perform a runtime check first
+ * to make sure they are available.
+ */
+#define COMP_CRC32C(crc, data, len) \
+	((crc) = pg_comp_crc32c((crc), (data), (len)))
+#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
+
+extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len);
+extern pg_crc32c (*pg_comp_crc32c)(pg_crc32c crc, const void *data, size_t len);
+
+extern pg_crc32c pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t len);
+
 #elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
 
 /*
@@ -86,7 +109,6 @@ extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t le
 #ifdef USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK
 extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
 #endif
-
 #else
 /*
  * Use slicing-by-8 algorithm.
diff --git a/src/port/Makefile b/src/port/Makefile
index db7c02117b..7ae632c6fc 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -88,11 +88,21 @@ pg_crc32c_sse42.o: CFLAGS+=$(CFLAGS_CRC)
 pg_crc32c_sse42_shlib.o: CFLAGS+=$(CFLAGS_CRC)
 pg_crc32c_sse42_srv.o: CFLAGS+=$(CFLAGS_CRC)
 
+# all versions of pg_crc32c_avx512.o need CFLAGS_CRC
+pg_crc32c_avx512.o: CFLAGS+=$(CFLAGS_CRC)
+pg_crc32c_avx512_shlib.o: CFLAGS+=$(CFLAGS_CRC)
+pg_crc32c_avx512_srv.o: CFLAGS+=$(CFLAGS_CRC)
+
 # all versions of pg_crc32c_armv8.o need CFLAGS_CRC
 pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC)
 pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC)
 pg_crc32c_armv8_srv.o: CFLAGS+=$(CFLAGS_CRC)
 
+# all versions of pg_crc32c_avx512_choose.o need CFLAGS_XSAVE
+pg_crc32c_avx512_choose.o: CFLAGS+=$(CFLAGS_XSAVE)
+pg_crc32c_avx512_choose_shlib.o: CFLAGS+=$(CFLAGS_XSAVE)
+pg_crc32c_avx512_choose_srv.o: CFLAGS+=$(CFLAGS_XSAVE)
+
 # all versions of pg_popcount_avx512_choose.o need CFLAGS_XSAVE
 pg_popcount_avx512_choose.o: CFLAGS+=$(CFLAGS_XSAVE)
 pg_popcount_avx512_choose_shlib.o: CFLAGS+=$(CFLAGS_XSAVE)
diff --git a/src/port/meson.build b/src/port/meson.build
index fd9ee199d1..d635913e9b 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -83,6 +83,10 @@ replace_funcs_pos = [
   ['pg_crc32c_sse42', 'USE_SSE42_CRC32C'],
   ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 'crc'],
   ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
+  ['pg_crc32c_avx512', 'USE_AVX512_CRC32C'],
+  ['pg_crc32c_avx512', 'USE_AVX512_CRC32C_WITH_RUNTIME_CHECK', 'crc'],
+  ['pg_crc32c_avx512_choose', 'USE_AVX512_CRC32C_WITH_RUNTIME_CHECK', 'xsave'],
+  ['pg_crc32c_sb8', 'USE_AVX512_CRC32C_WITH_RUNTIME_CHECK'],
   ['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
   ['pg_popcount_avx512', 'USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 'popcnt'],
   ['pg_popcount_avx512_choose', 'USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 'xsave'],
diff --git a/src/port/pg_crc32c_avx512.c b/src/port/pg_crc32c_avx512.c
new file mode 100644
index 0000000000..085c8d99a8
--- /dev/null
+++ b/src/port/pg_crc32c_avx512.c
@@ -0,0 +1,222 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_crc32c_avx512.c
+ *	  Compute CRC-32C checksum using Intel AVX-512 instructions.
+ *
+ * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2024, Intel(r) Corporation
+ *
+ * IDENTIFICATION
+ *	  src/port/pg_crc32c_avx512.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "c.h"
+
+#include <immintrin.h>
+
+#include "port/pg_crc32c.h"
+
+/*
+ * Process the tail of the buffer with SSE 4.2 CRC32 instructions.
+ *
+ * NB: We do unaligned accesses here. The Intel architecture allows that,
+ * and performance testing didn't show any performance gain from aligning
+ * the begin address.
+ */
+pg_attribute_no_sanitize_alignment()
+inline
+static
+pg_crc32c
+crc32c_fallback(pg_crc32c crc, const uint8 *p, size_t length)
+{
+	const unsigned char *pend = p + length;
+
+	/*
+	 * Process eight bytes of data at a time.
+	 *
+	 * NB: We do unaligned accesses here. The Intel architecture allows that,
+	 * and performance testing didn't show any performance gain from aligning
+	 * the begin address.
+	 */
+	while (p + 8 <= pend)
+	{
+		crc = (uint32)_mm_crc32_u64(crc, *((const uint64 *)p));
+		p += 8;
+	}
+
+	/* Process remaining full four bytes if any */
+	if (p + 4 <= pend)
+	{
+		crc = _mm_crc32_u32(crc, *((const unsigned int *)p));
+		p += 4;
+	}
+
+	/* Process any remaining bytes one at a time. */
+	while (p < pend)
+	{
+		crc = _mm_crc32_u8(crc, *p);
+		p++;
+	}
+
+	return crc;
+}
+
+/*******************************************************************
+ * pg_comp_crc32c_avx512(): compute the CRC-32C of the buffer. The
+ * AVX-512 path processes blocks of at least 256 bytes; any remaining
+ * bytes are handled by crc32c_fallback(). Based on:
+ *
+ * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ
+ * Instruction"
+ *  V. Gopal, E. Ozturk, et al., 2009,
+ *  https://www.researchgate.net/publication/263424619_Fast_CRC_computation#full-text
+ *
+ * This Function:
+ * Copyright 2017 The Chromium Authors
+ * Copyright (c) 2024, Intel(r) Corporation
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the Chromium source repository LICENSE file.
+ * https://chromium.googlesource.com/chromium/src/+/refs/heads/main/LICENSE
+ */
+pg_attribute_no_sanitize_alignment()
+inline
+pg_crc32c
+pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t length)
+{
+	static const uint64 k1k2[8] = {
+		0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4,
+		0xb9e02b86, 0xdcb17aa4, 0xb9e02b86};
+	static const uint64 k3k4[8] = {
+		0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8, 0x740eef02,
+		0x9e4addf8, 0x740eef02, 0x9e4addf8};
+	static const uint64 k9k10[8] = {
+		0x6992cea2, 0x0d3b6092, 0x6992cea2, 0x0d3b6092, 0x6992cea2,
+		0x0d3b6092, 0x6992cea2, 0x0d3b6092};
+	static const uint64 k1k4[8] = {
+		0x1c291d04, 0xddc0152b, 0x3da6d0cb, 0xba4fc28e, 0xf20c0dfe,
+		0x493c7d27, 0x00000000, 0x00000000};
+
+	const uint8 *input = (const uint8 *)data;
+	if (length >= 256)
+	{
+		uint64 val;
+		__m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
+		__m128i a1, a2;
+
+		/*
+		 * AVX-512 optimized crc32c algorithm with a minimum of 256 bytes
+		 * of input.
+		 * >>> BEGIN
+		 */
+		/*
+		 * There's at least one block of 256.
+		 */
+		x1 = _mm512_loadu_si512((__m512i *)(input + 0x00));
+		x2 = _mm512_loadu_si512((__m512i *)(input + 0x40));
+		x3 = _mm512_loadu_si512((__m512i *)(input + 0x80));
+		x4 = _mm512_loadu_si512((__m512i *)(input + 0xC0));
+
+		x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc)));
+
+		x0 = _mm512_loadu_si512((__m512i *)k1k2);
+
+		input += 256;
+		length -= 256;
+
+		/*
+		 * Parallel fold blocks of 256, if any.
+		 */
+		while (length >= 256)
+		{
+			x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
+			x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00);
+			x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00);
+			x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00);
+
+			x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
+			x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11);
+			x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11);
+			x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11);
+
+			y5 = _mm512_loadu_si512((__m512i *)(input + 0x00));
+			y6 = _mm512_loadu_si512((__m512i *)(input + 0x40));
+			y7 = _mm512_loadu_si512((__m512i *)(input + 0x80));
+			y8 = _mm512_loadu_si512((__m512i *)(input + 0xC0));
+
+			x1 = _mm512_ternarylogic_epi64(x1, x5, y5, 0x96);
+			x2 = _mm512_ternarylogic_epi64(x2, x6, y6, 0x96);
+			x3 = _mm512_ternarylogic_epi64(x3, x7, y7, 0x96);
+			x4 = _mm512_ternarylogic_epi64(x4, x8, y8, 0x96);
+
+			input += 256;
+			length -= 256;
+		}
+
+		/*
+		 * Fold 256 bytes into 64 bytes.
+		 */
+		x0 = _mm512_loadu_si512((__m512i *)k9k10);
+		x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
+		x6 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
+		x3 = _mm512_ternarylogic_epi64(x3, x5, x6, 0x96);
+
+		x7 = _mm512_clmulepi64_epi128(x2, x0, 0x00);
+		x8 = _mm512_clmulepi64_epi128(x2, x0, 0x11);
+		x4 = _mm512_ternarylogic_epi64(x4, x7, x8, 0x96);
+
+		x0 = _mm512_loadu_si512((__m512i *)k3k4);
+		y5 = _mm512_clmulepi64_epi128(x3, x0, 0x00);
+		y6 = _mm512_clmulepi64_epi128(x3, x0, 0x11);
+		x1 = _mm512_ternarylogic_epi64(x4, y5, y6, 0x96);
+
+		/*
+		 * Single fold blocks of 64, if any.
+		 */
+		while (length >= 64)
+		{
+			x2 = _mm512_loadu_si512((__m512i *)input);
+
+			x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
+			x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
+			x1 = _mm512_ternarylogic_epi64(x1, x2, x5, 0x96);
+
+			input += 64;
+			length -= 64;
+		}
+
+		/*
+		 * Fold 512-bits to 128-bits.
+		 */
+		x0 = _mm512_loadu_si512((__m512i *)k1k4);
+
+		a2 = _mm512_extracti32x4_epi32(x1, 3);
+		x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
+		x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
+		x1 = _mm512_ternarylogic_epi64(x1, x5, _mm512_castsi128_si512(a2), 0x96);
+
+		x0 = _mm512_shuffle_i64x2(x1, x1, 0x4E);
+		x0 = _mm512_xor_epi64(x1, x0);
+		a1 = _mm512_extracti32x4_epi32(x0, 1);
+		a1 = _mm_xor_epi64(a1, _mm512_castsi512_si128(x0));
+
+		/*
+		 * Fold 128-bits to 32-bits.
+		 */
+		val = _mm_crc32_u64(0, _mm_extract_epi64(a1, 0));
+		crc = (uint32)_mm_crc32_u64(val, _mm_extract_epi64(a1, 1));
+		/*
+		 * AVX-512 optimized crc32c algorithm with a minimum of 256 bytes
+		 * of input.
+		 * <<< END
+		 ******************************************************************/
+	}
+
+	/*
+	 * Finish any remaining bytes.
+	 */
+	return crc32c_fallback(crc, input, length);
+}
diff --git a/src/port/pg_crc32c_avx512_choose.c b/src/port/pg_crc32c_avx512_choose.c
new file mode 100644
index 0000000000..d5ccb69d10
--- /dev/null
+++ b/src/port/pg_crc32c_avx512_choose.c
@@ -0,0 +1,202 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_crc32c_avx512_choose.c
+ *	  Choose between Intel AVX-512 and software CRC-32C implementation.
+ *
+ * On first call, checks if the CPU we're running on supports Intel AVX-
+ * 512. If it does, use the special AVX-512 instructions for CRC-32C
+ * computation. Otherwise, fall back to the pure software implementation
+ * (slicing-by-8).
+ *
+ * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ * Portions Copyright (c) 2024, Intel(r) Corp.
+ *
+ *
+ * IDENTIFICATION
+ *	  src/port/pg_crc32c_avx512_choose.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "c.h"
+
+#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
+#include <cpuid.h>
+#endif
+
+#ifdef HAVE_XSAVE_INTRINSICS
+#include <immintrin.h>
+#endif
+
+#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX)
+#include <intrin.h>
+#endif
+
+#include "port/pg_crc32c.h"
+
+typedef unsigned int exx_t;
+
+/*
+ * Helper function.
+ * Test for a bit being set in a exx_t field.
+ */
+inline
+static
+bool
+is_bit_set(exx_t reg, int bit)
+{
+	return (reg & (1 << bit)) != 0;
+}
+
+/*
+ * Intel Platform CPUID check for Linux and Visual Studio platforms.
+ */
+inline
+static
+void
+pg_getcpuid(unsigned int leaf, exx_t *exx)
+{
+#if defined(HAVE__GET_CPUID)
+	__get_cpuid(leaf, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUID)
+	__cpuid(exx, leaf);
+#else
+#error cpuid instruction not available
+#endif
+}
+
+/*
+ * Intel Platform CPUIDEX check for Linux and Visual Studio platforms.
+ */
+inline
+static
+void
+pg_getcpuidex(unsigned int leaf, unsigned int subleaf, exx_t *exx)
+{
+#if defined(HAVE__GET_CPUID_COUNT)
+	__get_cpuid_count(leaf, subleaf, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUIDEX)
+	__cpuidex(exx, leaf, subleaf);
+#else
+#error cpuid instruction not available
+#endif
+}
+
+/*
+ * Check CPUID for SSE 4.2 support.
+ */
+inline
+static
+bool
+sse42_available(void)
+{
+	exx_t exx[4] = {0, 0, 0, 0};
+
+	pg_getcpuid(1, exx);
+	return is_bit_set(exx[2], 20); /* sse4.2 */
+}
+
+/*
+ * Check CPUID for OSXSAVE support.
+ */
+inline
+static
+bool
+osxsave_available(void)
+{
+	exx_t exx[4] = {0, 0, 0, 0};
+
+	pg_getcpuid(1, exx);
+	return is_bit_set(exx[2], 27); /* osxsave */
+}
+
+/*
+ * Check CPUID for AVX-512 F support.
+ */
+inline
+static
+bool
+avx512f_available(void)
+{
+	exx_t exx[4] = {0, 0, 0, 0};
+
+	pg_getcpuidex(7, 0, exx);
+	return is_bit_set(exx[1], 16); /* avx512-f */
+}
+
+/*
+ * Check CPUID for VPCLMULQDQ support.
+ */
+inline
+static
+bool
+vpclmulqdq_available(void)
+{
+	exx_t exx[4] = {0, 0, 0, 0};
+
+	pg_getcpuidex(7, 0, exx);
+	return is_bit_set(exx[1], 10); /* vpclmulqdq */
+}
+
+/*
+ * Check CPUID for AVX-512 VL support.
+ */
+inline
+static
+bool
+avx512vl_available(void)
+{
+	exx_t exx[4] = {0, 0, 0, 0};
+
+	pg_getcpuidex(7, 0, exx);
+	return is_bit_set(exx[1], 31); /* avx512-vl */
+}
+
+/*
+ * Does XGETBV say the ZMM registers are enabled?
+ *
+ * NB: Caller is responsible for verifying that osxsave_available() returns true
+ * before calling this.
+ */
+static inline bool
+zmm_regs_available(void)
+{
+#ifdef HAVE_XSAVE_INTRINSICS
+	return (_xgetbv(0) & 0xe6) == 0xe6;
+#else
+	return false;
+#endif
+}
+
+/*
+ * Returns true if the CPU supports the instructions required for the AVX-512
+ * pg_crc32c implementation.
+ */
+inline
+static
+bool
+pg_crc32c_avx512_available(void)
+{
+	return sse42_available() && osxsave_available() &&
+		   avx512f_available() && vpclmulqdq_available() &&
+		   avx512vl_available() && zmm_regs_available();
+}
+
+/*
+ * This gets called on the first call. It replaces the function pointer
+ * so that subsequent calls are routed directly to the chosen implementation.
+ */
+static
+pg_crc32c
+pg_comp_avx512_choose(pg_crc32c crc, const void *data, size_t len)
+{
+	if (pg_crc32c_avx512_available())
+		pg_comp_crc32c = pg_comp_crc32c_avx512;
+	else
+		pg_comp_crc32c = pg_comp_crc32c_sb8;
+
+	return pg_comp_crc32c(crc, data, len);
+}
+
+pg_crc32c	(*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_avx512_choose;
-- 
2.34.1

