From 2a8a44c7fe8cfed6c7298533d633688cd2efd0b3 Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@postgresql.org>
Date: Sat, 15 Feb 2025 19:18:16 +0700
Subject: [PATCH v9 1/5] Dispatch CRC computation by branching rather than
 indirect calls

Signed-off-by: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
---
 src/backend/postmaster/postmaster.c |  4 ++
 src/include/port/pg_cpucap.h        | 25 +++++++++
 src/include/port/pg_crc32c.h        | 78 +++++++++++++++++++++--------
 src/port/Makefile                   |  1 +
 src/port/meson.build                |  4 ++
 src/port/pg_cpucap.c                | 51 +++++++++++++++++++
 src/port/pg_crc32c_armv8_choose.c   | 26 +---------
 src/port/pg_crc32c_sse42_choose.c   | 26 +---------
 8 files changed, 145 insertions(+), 70 deletions(-)
 create mode 100644 src/include/port/pg_cpucap.h
 create mode 100644 src/port/pg_cpucap.c

diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index bb22b13ade..4fa95f1d2c 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -99,6 +99,7 @@
 #include "pg_getopt.h"
 #include "pgstat.h"
 #include "port/pg_bswap.h"
+#include "port/pg_cpucap.h"
 #include "postmaster/autovacuum.h"
 #include "postmaster/bgworker_internals.h"
 #include "postmaster/pgarch.h"
@@ -1951,6 +1952,9 @@ InitProcessGlobals(void)
 #ifndef WIN32
 	srandom(pg_prng_uint32(&pg_global_prng_state));
 #endif
+
+	/* detect CPU capabilities */
+	pg_cpucap_initialize();
 }
 
 /*
diff --git a/src/include/port/pg_cpucap.h b/src/include/port/pg_cpucap.h
new file mode 100644
index 0000000000..81edfedce5
--- /dev/null
+++ b/src/include/port/pg_cpucap.h
@@ -0,0 +1,25 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_cpucap.h
+ *	  Runtime detection of CPU capabilities.
+ *
+ * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ *	  src/include/port/pg_cpucap.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PG_CPUCAP_H
+#define PG_CPUCAP_H
+
+#define PGCPUCAP_INIT           (1 << 0)
+#define PGCPUCAP_POPCNT         (1 << 1)
+#define PGCPUCAP_VPOPCNT        (1 << 2)
+#define PGCPUCAP_CRC32C         (1 << 3)
+
+extern PGDLLIMPORT uint32 pg_cpucap;
+extern void pg_cpucap_initialize(void);
+
+#endif							/* PG_CPUCAP_H */
diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h
index 65ebeacf4b..b565a0f294 100644
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -34,6 +34,7 @@
 #define PG_CRC32C_H
 
 #include "port/pg_bswap.h"
+#include "port/pg_cpucap.h"
 
 typedef uint32 pg_crc32c;
 
@@ -41,52 +42,55 @@ typedef uint32 pg_crc32c;
 #define INIT_CRC32C(crc) ((crc) = 0xFFFFFFFF)
 #define EQ_CRC32C(c1, c2) ((c1) == (c2))
 
-#if defined(USE_SSE42_CRC32C)
+#if defined(USE_SSE42_CRC32C) || defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK)
 /* Use Intel SSE4.2 instructions. */
 #define COMP_CRC32C(crc, data, len) \
+	((crc) = pg_comp_crc32c_dispatch((crc), (data), (len)))
+#define COMP_CRC32C_HW(crc, data, len) \
 	((crc) = pg_comp_crc32c_sse42((crc), (data), (len)))
 #define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
 
+#if defined(USE_SSE42_CRC32C)
+#define HAVE_CRC_COMPTIME
+#else
+#define HAVE_CRC_RUNTIME
+extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len);
+#endif
+
+extern bool pg_crc32c_sse42_available(void);
 extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len);
 
-#elif defined(USE_ARMV8_CRC32C)
+#elif defined(USE_ARMV8_CRC32C) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
 /* Use ARMv8 CRC Extension instructions. */
 
 #define COMP_CRC32C(crc, data, len)							\
+	((crc) = pg_comp_crc32c_dispatch((crc), (data), (len)))
+#define COMP_CRC32C_HW(crc, data, len)						\
 	((crc) = pg_comp_crc32c_armv8((crc), (data), (len)))
 #define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
 
+#if defined(USE_ARMV8_CRC32C)
+#define HAVE_CRC_COMPTIME
+#else
+#define HAVE_CRC_RUNTIME
+extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len);
+#endif
+
+extern bool pg_crc32c_armv8_available(void);
 extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
 
 #elif defined(USE_LOONGARCH_CRC32C)
 /* Use LoongArch CRCC instructions. */
 
 #define COMP_CRC32C(crc, data, len)							\
+	((crc) = pg_comp_crc32c_dispatch((crc), (data), (len)))
+#define COMP_CRC32C_HW(crc, data, len)						\
 	((crc) = pg_comp_crc32c_loongarch((crc), (data), (len)))
 #define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
 
+#define HAVE_CRC_COMPTIME
 extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len);
 
-#elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
-
-/*
- * Use Intel SSE 4.2 or ARMv8 instructions, but perform a runtime check first
- * to check that they are available.
- */
-#define COMP_CRC32C(crc, data, len) \
-	((crc) = pg_comp_crc32c((crc), (data), (len)))
-#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
-
-extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len);
-extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
-
-#ifdef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK
-extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len);
-#endif
-#ifdef USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK
-extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
-#endif
-
 #else
 /*
  * Use slicing-by-8 algorithm.
@@ -105,6 +109,36 @@ extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t le
 
 extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len);
 
+#endif							/* end of CPU-specific symbols */
+
+#if defined(HAVE_CRC_COMPTIME) || defined(HAVE_CRC_RUNTIME)
+/*
+ * Use special instructions for CRC-32C computation if the CPU
+ * we're running on supports them. Otherwise, fall back to the
+ * pure software implementation (slicing-by-8).
+ */
+static inline pg_crc32c
+pg_comp_crc32c_dispatch(pg_crc32c crc, const void *data, size_t len)
+{
+	/*
+	 * If this fires, a call to pg_cpucap_initialize() is missing. Frontend
+	 * programs are not checked here, but need the call in main() too; see
+	 * for example src/bin/pg_controldata/pg_controldata.c.
+	 */
+	/* WIP: how best to initialize in frontend? */
+#ifndef FRONTEND
+	Assert(pg_cpucap & PGCPUCAP_INIT);
+#endif
+
+#if defined(HAVE_CRC_COMPTIME)
+	return COMP_CRC32C_HW(crc, data, len);
+#else
+	if (pg_cpucap & PGCPUCAP_CRC32C)
+		return COMP_CRC32C_HW(crc, data, len);
+	else
+		return pg_comp_crc32c_sb8(crc, data, len);
 #endif
+}
+#endif							/* HAVE_CRC_COMPTIME || HAVE_CRC_RUNTIME */
 
 #endif							/* PG_CRC32C_H */
diff --git a/src/port/Makefile b/src/port/Makefile
index 4c22431951..5a05179e92 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -44,6 +44,7 @@ OBJS = \
 	noblock.o \
 	path.o \
 	pg_bitutils.o \
+	pg_cpucap.o \
 	pg_popcount_avx512.o \
 	pg_strong_random.o \
 	pgcheckdir.o \
diff --git a/src/port/meson.build b/src/port/meson.build
index 7fcfa728d4..e1e7ce8fb8 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -7,6 +7,7 @@ pgport_sources = [
   'noblock.c',
   'path.c',
   'pg_bitutils.c',
+  'pg_cpucap.c',
   'pg_popcount_avx512.c',
   'pg_strong_random.c',
   'pgcheckdir.c',
@@ -83,12 +84,15 @@ replace_funcs_pos = [
   # x86/x64
   ['pg_crc32c_sse42', 'USE_SSE42_CRC32C'],
   ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
+  # WIP sometime we'll need to build these based on host_cpu
+  ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C'],
   ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
   ['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
 
   # arm / aarch64
   ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C'],
   ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 'crc'],
+  ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C'],
   ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
   ['pg_crc32c_sb8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
 
diff --git a/src/port/pg_cpucap.c b/src/port/pg_cpucap.c
new file mode 100644
index 0000000000..eba6e31c63
--- /dev/null
+++ b/src/port/pg_cpucap.c
@@ -0,0 +1,51 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_cpucap.c
+ *	  Runtime detection of CPU capabilities.
+ *
+ * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ *	  src/port/pg_cpucap.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+
+#include "port/pg_cpucap.h"
+#include "port/pg_crc32c.h"
+
+
+/* starts with PGCPUCAP_INIT unset so we can detect errors of omission */
+uint32		pg_cpucap = 0;
+
+/*
+ * Check if hardware instructions for CRC computation are available.
+ */
+static void
+pg_cpucap_crc32c(void)
+{
+	/* WIP: It seems like we should use CPU arch symbols instead */
+#if defined(USE_SSE42_CRC32C) || defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK)
+	if (pg_crc32c_sse42_available())
+		pg_cpucap |= PGCPUCAP_CRC32C;
+
+#elif defined(USE_ARMV8_CRC32C) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
+	if (pg_crc32c_armv8_available())
+		pg_cpucap |= PGCPUCAP_CRC32C;
+#endif
+}
+
+/*
+ * This needs to be called in main() for every
+ * program that calls a function that dispatches
+ * according to CPU features.
+ */
+void
+pg_cpucap_initialize(void)
+{
+	pg_cpucap = PGCPUCAP_INIT;
+
+	pg_cpucap_crc32c();
+}
diff --git a/src/port/pg_crc32c_armv8_choose.c b/src/port/pg_crc32c_armv8_choose.c
index ec12be1bbc..e3654427c3 100644
--- a/src/port/pg_crc32c_armv8_choose.c
+++ b/src/port/pg_crc32c_armv8_choose.c
@@ -1,12 +1,7 @@
 /*-------------------------------------------------------------------------
  *
  * pg_crc32c_armv8_choose.c
- *	  Choose between ARMv8 and software CRC-32C implementation.
- *
- * On first call, checks if the CPU we're running on supports the ARMv8
- * CRC Extension. If it does, use the special instructions for CRC-32C
- * computation. Otherwise, fall back to the pure software implementation
- * (slicing-by-8).
+ *	  Check if the CPU we're running on supports the ARMv8 CRC Extension.
  *
  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
@@ -40,7 +35,7 @@
 
 #include "port/pg_crc32c.h"
 
-static bool
+bool
 pg_crc32c_armv8_available(void)
 {
 #if defined(HAVE_ELF_AUX_INFO)
@@ -106,20 +101,3 @@ pg_crc32c_armv8_available(void)
 	return false;
 #endif
 }
-
-/*
- * This gets called on the first call. It replaces the function pointer
- * so that subsequent calls are routed directly to the chosen implementation.
- */
-static pg_crc32c
-pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
-{
-	if (pg_crc32c_armv8_available())
-		pg_comp_crc32c = pg_comp_crc32c_armv8;
-	else
-		pg_comp_crc32c = pg_comp_crc32c_sb8;
-
-	return pg_comp_crc32c(crc, data, len);
-}
-
-pg_crc32c	(*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_crc32c_choose;
diff --git a/src/port/pg_crc32c_sse42_choose.c b/src/port/pg_crc32c_sse42_choose.c
index 65dbc4d424..f4d3215bc5 100644
--- a/src/port/pg_crc32c_sse42_choose.c
+++ b/src/port/pg_crc32c_sse42_choose.c
@@ -1,12 +1,7 @@
 /*-------------------------------------------------------------------------
  *
  * pg_crc32c_sse42_choose.c
- *	  Choose between Intel SSE 4.2 and software CRC-32C implementation.
- *
- * On first call, checks if the CPU we're running on supports Intel SSE
- * 4.2. If it does, use the special SSE instructions for CRC-32C
- * computation. Otherwise, fall back to the pure software implementation
- * (slicing-by-8).
+ *	  Check if the CPU we're running on supports SSE4.2.
  *
  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
@@ -30,7 +25,7 @@
 
 #include "port/pg_crc32c.h"
 
-static bool
+bool
 pg_crc32c_sse42_available(void)
 {
 	unsigned int exx[4] = {0, 0, 0, 0};
@@ -45,20 +40,3 @@ pg_crc32c_sse42_available(void)
 
 	return (exx[2] & (1 << 20)) != 0;	/* SSE 4.2 */
 }
-
-/*
- * This gets called on the first call. It replaces the function pointer
- * so that subsequent calls are routed directly to the chosen implementation.
- */
-static pg_crc32c
-pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
-{
-	if (pg_crc32c_sse42_available())
-		pg_comp_crc32c = pg_comp_crc32c_sse42;
-	else
-		pg_comp_crc32c = pg_comp_crc32c_sb8;
-
-	return pg_comp_crc32c(crc, data, len);
-}
-
-pg_crc32c	(*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len) = pg_comp_crc32c_choose;
-- 
2.43.0

