From 6d3277787a6d9d6a626d87cfa37d05412bf8ef32 Mon Sep 17 00:00:00 2001
From: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
Date: Thu, 7 Nov 2024 12:16:58 -0800
Subject: [PATCH v5 1/2] Use __attribute__((target("sse4.2"))) for SSE4.2 CRC32C

Following commit f78667b, this commit builds the SSE4.2 CRC32C code using
the target function attribute instead of extra compiler flags.
---
 config/c-compiler.m4              | 30 +++++------
 configure                         | 84 +++++++++----------------------
 configure.ac                      |  8 +--
 meson.build                       | 14 ++++--
 src/port/Makefile                 |  5 --
 src/port/meson.build              |  5 +-
 src/port/pg_crc32c_sse42.c        |  5 ++
 src/port/pg_crc32c_sse42_choose.c |  5 ++
 8 files changed, 62 insertions(+), 94 deletions(-)

diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index c7eb896f14..113a5b22a1 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -608,27 +608,29 @@ fi])# PGAC_HAVE_GCC__ATOMIC_INT64_CAS
 # An optional compiler flag can be passed as argument (e.g. -msse4.2). If the
 # intrinsics are supported, sets pgac_sse42_crc32_intrinsics, and CFLAGS_CRC.
 AC_DEFUN([PGAC_SSE42_CRC32_INTRINSICS],
-[define([Ac_cachevar], [AS_TR_SH([pgac_cv_sse42_crc32_intrinsics_$1])])dnl
-AC_CACHE_CHECK([for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=$1], [Ac_cachevar],
-[pgac_save_CFLAGS=$CFLAGS
-CFLAGS="$pgac_save_CFLAGS $1"
-AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <nmmintrin.h>],
-  [unsigned int crc = 0;
-   crc = _mm_crc32_u8(crc, 0);
-   crc = _mm_crc32_u32(crc, 0);
-   /* return computed value, to prevent the above being optimized away */
-   return crc == 0;])],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_sse42_crc32_intrinsics])])dnl
+AC_CACHE_CHECK([for _mm_crc32_u8 and _mm_crc32_u32 with function attribute], [Ac_cachevar],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <nmmintrin.h>
+    #if defined(__has_attribute) && __has_attribute (target)
+    __attribute__((target("sse4.2")))
+    #endif
+    static int crc32_sse42_test(void)
+    {
+      unsigned int crc = 0;
+      crc = _mm_crc32_u8(crc, 0);
+      crc = _mm_crc32_u32(crc, 0);
+      /* return computed value, to prevent the above being optimized away */
+      return crc == 0;
+    }],
+  [return crc32_sse42_test();])],
   [Ac_cachevar=yes],
-  [Ac_cachevar=no])
-CFLAGS="$pgac_save_CFLAGS"])
+  [Ac_cachevar=no])])
 if test x"$Ac_cachevar" = x"yes"; then
-  CFLAGS_CRC="$1"
   pgac_sse42_crc32_intrinsics=yes
 fi
 undefine([Ac_cachevar])dnl
 ])# PGAC_SSE42_CRC32_INTRINSICS
 
-
 # PGAC_ARMV8_CRC32C_INTRINSICS
 # ----------------------------
 # Check if the compiler supports the CRC32C instructions using the __crc32cb,
diff --git a/configure b/configure
index 3a7332f834..bfdb46c9dc 100755
--- a/configure
+++ b/configure
@@ -17368,87 +17368,49 @@ fi
 
 # Check for Intel SSE 4.2 intrinsics to do CRC calculations.
 #
-# First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
-# with the default compiler flags. If not, check if adding the -msse4.2
-# flag helps. CFLAGS_CRC is set to -msse4.2 if that's required.
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=" >&5
-$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=... " >&6; }
-if ${pgac_cv_sse42_crc32_intrinsics_+:} false; then :
+# Check if the _mm_crc32_u8 and _mm_crc32_u32 intrinsics can be used
+# in a function marked with __attribute__((target("sse4.2"))).
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32 with function attribute" >&5
+$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32 with function attribute... " >&6; }
+if ${pgac_cv_sse42_crc32_intrinsics+:} false; then :
   $as_echo_n "(cached) " >&6
 else
-  pgac_save_CFLAGS=$CFLAGS
-CFLAGS="$pgac_save_CFLAGS "
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-#include <nmmintrin.h>
-int
-main ()
-{
-unsigned int crc = 0;
-   crc = _mm_crc32_u8(crc, 0);
-   crc = _mm_crc32_u32(crc, 0);
-   /* return computed value, to prevent the above being optimized away */
-   return crc == 0;
-  ;
-  return 0;
-}
-_ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
-  pgac_cv_sse42_crc32_intrinsics_=yes
-else
-  pgac_cv_sse42_crc32_intrinsics_=no
-fi
-rm -f core conftest.err conftest.$ac_objext \
-    conftest$ac_exeext conftest.$ac_ext
-CFLAGS="$pgac_save_CFLAGS"
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sse42_crc32_intrinsics_" >&5
-$as_echo "$pgac_cv_sse42_crc32_intrinsics_" >&6; }
-if test x"$pgac_cv_sse42_crc32_intrinsics_" = x"yes"; then
-  CFLAGS_CRC=""
-  pgac_sse42_crc32_intrinsics=yes
-fi
-
-if test x"$pgac_sse42_crc32_intrinsics" != x"yes"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=-msse4.2" >&5
-$as_echo_n "checking for _mm_crc32_u8 and _mm_crc32_u32 with CFLAGS=-msse4.2... " >&6; }
-if ${pgac_cv_sse42_crc32_intrinsics__msse4_2+:} false; then :
-  $as_echo_n "(cached) " >&6
-else
-  pgac_save_CFLAGS=$CFLAGS
-CFLAGS="$pgac_save_CFLAGS -msse4.2"
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
 #include <nmmintrin.h>
+    #if defined(__has_attribute) && __has_attribute (target)
+    __attribute__((target("sse4.2")))
+    #endif
+    static int crc32_sse42_test(void)
+    {
+      unsigned int crc = 0;
+      crc = _mm_crc32_u8(crc, 0);
+      crc = _mm_crc32_u32(crc, 0);
+      /* return computed value, to prevent the above being optimized away */
+      return crc == 0;
+    }
 int
 main ()
 {
-unsigned int crc = 0;
-   crc = _mm_crc32_u8(crc, 0);
-   crc = _mm_crc32_u32(crc, 0);
-   /* return computed value, to prevent the above being optimized away */
-   return crc == 0;
+return crc32_sse42_test();
   ;
   return 0;
 }
 _ACEOF
 if ac_fn_c_try_link "$LINENO"; then :
-  pgac_cv_sse42_crc32_intrinsics__msse4_2=yes
+  pgac_cv_sse42_crc32_intrinsics=yes
 else
-  pgac_cv_sse42_crc32_intrinsics__msse4_2=no
+  pgac_cv_sse42_crc32_intrinsics=no
 fi
 rm -f core conftest.err conftest.$ac_objext \
     conftest$ac_exeext conftest.$ac_ext
-CFLAGS="$pgac_save_CFLAGS"
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sse42_crc32_intrinsics__msse4_2" >&5
-$as_echo "$pgac_cv_sse42_crc32_intrinsics__msse4_2" >&6; }
-if test x"$pgac_cv_sse42_crc32_intrinsics__msse4_2" = x"yes"; then
-  CFLAGS_CRC="-msse4.2"
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sse42_crc32_intrinsics" >&5
+$as_echo "$pgac_cv_sse42_crc32_intrinsics" >&6; }
+if test x"$pgac_cv_sse42_crc32_intrinsics" = x"yes"; then
   pgac_sse42_crc32_intrinsics=yes
 fi
 
-fi
 
 # Are we targeting a processor that supports SSE 4.2? gcc, clang and icc all
 # define __SSE4_2__ in that case.
diff --git a/configure.ac b/configure.ac
index e7f4f0fc22..bb6dea7b1b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2066,13 +2066,9 @@ fi
 
 # Check for Intel SSE 4.2 intrinsics to do CRC calculations.
 #
-# First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
-# with the default compiler flags. If not, check if adding the -msse4.2
-# flag helps. CFLAGS_CRC is set to -msse4.2 if that's required.
+# Check if the _mm_crc32_u8 and _mm_crc32_u32 intrinsics can be used
+# in a function marked with __attribute__((target("sse4.2"))).
 PGAC_SSE42_CRC32_INTRINSICS([])
-if test x"$pgac_sse42_crc32_intrinsics" != x"yes"; then
-  PGAC_SSE42_CRC32_INTRINSICS([-msse4.2])
-fi
 
 # Are we targeting a processor that supports SSE 4.2? gcc, clang and icc all
 # define __SSE4_2__ in that case.
diff --git a/meson.build b/meson.build
index 9eddd72a27..671ac7052b 100644
--- a/meson.build
+++ b/meson.build
@@ -2234,9 +2234,13 @@ if host_cpu == 'x86' or host_cpu == 'x86_64'
     have_optimized_crc = true
   else
 
-    prog = '''
+    sse42_crc_prog = '''
 #include <nmmintrin.h>
-
+#ifdef TEST_SSE42_WITH_ATTRIBUTE
+#if defined(__has_attribute) && __has_attribute (target)
+__attribute__((target("sse4.2")))
+#endif
+#endif
 int main(void)
 {
     unsigned int crc = 0;
@@ -2247,13 +2251,13 @@ int main(void)
 }
 '''
 
-    if cc.links(prog, name: '_mm_crc32_u8 and _mm_crc32_u32 without -msse4.2',
+    if cc.links(sse42_crc_prog, name: '_mm_crc32_u8 and _mm_crc32_u32 without -msse4.2',
           args: test_c_args)
       # Use Intel SSE 4.2 unconditionally.
       cdata.set('USE_SSE42_CRC32C', 1)
       have_optimized_crc = true
-    elif cc.links(prog, name: '_mm_crc32_u8 and _mm_crc32_u32 with -msse4.2',
-          args: test_c_args + ['-msse4.2'])
+    elif cc.links(sse42_crc_prog, name: '_mm_crc32_u8 and _mm_crc32_u32 with sse4.2 function attribute',
+          args: test_c_args + ['-D TEST_SSE42_WITH_ATTRIBUTE'])
       # Use Intel SSE 4.2, with runtime check. The CPUID instruction is needed for
       # the runtime check.
       cflags_crc += '-msse4.2'
diff --git a/src/port/Makefile b/src/port/Makefile
index 366c814bd9..4c22431951 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -82,11 +82,6 @@ libpgport.a: $(OBJS)
 	rm -f $@
 	$(AR) $(AROPT) $@ $^
 
-# all versions of pg_crc32c_sse42.o need CFLAGS_CRC
-pg_crc32c_sse42.o: CFLAGS+=$(CFLAGS_CRC)
-pg_crc32c_sse42_shlib.o: CFLAGS+=$(CFLAGS_CRC)
-pg_crc32c_sse42_srv.o: CFLAGS+=$(CFLAGS_CRC)
-
 # all versions of pg_crc32c_armv8.o need CFLAGS_CRC
 pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC)
 pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC)
diff --git a/src/port/meson.build b/src/port/meson.build
index 83a0632520..37d12cbd8f 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -8,6 +8,8 @@ pgport_sources = [
   'path.c',
   'pg_bitutils.c',
   'pg_popcount_avx512.c',
+  'pg_crc32c_sse42_choose.c',
+  'pg_crc32c_sse42.c',
   'pg_strong_random.c',
   'pgcheckdir.c',
   'pgmkdirp.c',
@@ -81,9 +83,6 @@ endif
 # is true
 replace_funcs_pos = [
   # x86/x64
-  ['pg_crc32c_sse42', 'USE_SSE42_CRC32C'],
-  ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 'crc'],
-  ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
   ['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
 
   # arm / aarch64
diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c
index 7f88c11480..37693eb516 100644
--- a/src/port/pg_crc32c_sse42.c
+++ b/src/port/pg_crc32c_sse42.c
@@ -14,11 +14,15 @@
  */
 #include "c.h"
 
+#if defined(USE_SSE42_CRC32C) || defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK)
+
 #include <nmmintrin.h>
 
 #include "port/pg_crc32c.h"
 
+
 pg_attribute_no_sanitize_alignment()
+pg_attribute_target("sse4.2")
 pg_crc32c
 pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
 {
@@ -67,3 +71,4 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
 
 	return crc;
 }
+#endif /* USE_SSE42_CRC32C || USE_SSE42_CRC32C_WITH_RUNTIME_CHECK */
diff --git a/src/port/pg_crc32c_sse42_choose.c b/src/port/pg_crc32c_sse42_choose.c
index 56d600f3a9..50ae82b312 100644
--- a/src/port/pg_crc32c_sse42_choose.c
+++ b/src/port/pg_crc32c_sse42_choose.c
@@ -18,8 +18,11 @@
  *-------------------------------------------------------------------------
  */
 
+
 #include "c.h"
 
+#if defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK)
+
 #ifdef HAVE__GET_CPUID
 #include <cpuid.h>
 #endif
@@ -62,3 +65,5 @@ pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
 }
 
 pg_crc32c	(*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_crc32c_choose;
+
+#endif /* USE_SSE42_CRC32C_WITH_RUNTIME_CHECK */
-- 
2.43.0

