From 881a13883395bf386ab71707a08a34c261fbb422 Mon Sep 17 00:00:00 2001
From: Lukas Fittl <lukas@fittl.com>
Date: Wed, 11 Mar 2026 00:55:03 -0700
Subject: [PATCH v21 2/5] Allow retrieving x86 TSC frequency/flags from CPUID

This adds x86-specific CPUID checks for the flags needed to determine
whether the Time-Stamp Counter (TSC) is usable on a given system, as well
as a helper function to retrieve the TSC frequency from CPUID.

This is intended for a future patch that will utilize the TSC to lower
the overhead of timing instrumentation.

In passing, always make pg_cpuid_subleaf reset the variables used for its
result, to avoid accidentally using stale results if __get_cpuid_count
errors out.

Author: Lukas Fittl <lukas@fittl.com>
Author: David Geier <geidav.pg@gmail.com>
Author: Andres Freund <andres@anarazel.de>
Reviewed-by: Andres Freund <andres@anarazel.de>
Reviewed-by: David Geier <geidav.pg@gmail.com>
Reviewed-by: John Naylor <john.naylor@postgresql.org>
Reviewed-by: Jakub Wartak <jakub.wartak@enterprisedb.com> (in an earlier version)
Discussion: https://www.postgresql.org/message-id/flat/20200612232810.f46nbqkdhbutzqdg%40alap3.anarazel.de
---
 src/include/port/pg_cpu.h |  12 +++-
 src/port/pg_cpu_x86.c     | 132 ++++++++++++++++++++++++++++++++++++--
 2 files changed, 139 insertions(+), 5 deletions(-)

diff --git a/src/include/port/pg_cpu.h b/src/include/port/pg_cpu.h
index c5d96bb4f47..a5d42f1b68d 100644
--- a/src/include/port/pg_cpu.h
+++ b/src/include/port/pg_cpu.h
@@ -32,8 +32,16 @@ typedef enum X86FeatureId
 	PG_AVX512_VL,
 	PG_AVX512_VPCLMULQDQ,
 	PG_AVX512_VPOPCNTDQ,
+
+	/* identification */
+	PG_HYPERVISOR,
+
+	/* Time-Stamp Counter (TSC) flags */
+	PG_RDTSCP,
+	PG_TSC_INVARIANT,
+	PG_TSC_ADJUST,
 } X86FeatureId;
-#define X86FeaturesSize (PG_AVX512_VPOPCNTDQ + 1)
+#define X86FeaturesSize (PG_TSC_ADJUST + 1)
 
 extern PGDLLIMPORT bool X86Features[];
 
@@ -48,6 +56,8 @@ x86_feature_available(X86FeatureId feature)
 	return X86Features[feature];
 }
 
+extern uint32 x86_tsc_frequency_khz(void);
+
 #endif							/* defined(USE_SSE2) || defined(__i386__) */
 
 #endif							/* PG_CPU_H */
diff --git a/src/port/pg_cpu_x86.c b/src/port/pg_cpu_x86.c
index 40ff78633ca..8951e7a0811 100644
--- a/src/port/pg_cpu_x86.c
+++ b/src/port/pg_cpu_x86.c
@@ -80,13 +80,13 @@ pg_cpuid(int leaf, unsigned int *reg)
 static inline bool
 pg_cpuid_subleaf(int leaf, int subleaf, unsigned int *reg)
 {
+	memset(reg, 0, 4 * sizeof(unsigned int));	/* no stale results on failure */
 #if defined(HAVE__GET_CPUID_COUNT)
 	return __get_cpuid_count(leaf, subleaf, &reg[EAX], &reg[EBX], &reg[ECX], &reg[EDX]) == 1;
 #elif defined(HAVE__CPUIDEX)
 	__cpuidex((int *) reg, leaf, subleaf);
 	return true;
 #else
-	memset(reg, 0, 4 * sizeof(unsigned int));
 	return false;
 #endif
 }
@@ -101,19 +101,24 @@ void
 set_x86_features(void)
 {
 	unsigned int reg[4] = {0};
+	bool		have_osxsave;
 
 	pg_cpuid(0x01, reg);
 
 	X86Features[PG_SSE4_2] = reg[ECX] >> 20 & 1;
 	X86Features[PG_POPCNT] = reg[ECX] >> 23 & 1;
+	X86Features[PG_HYPERVISOR] = reg[ECX] >> 31 & 1;
+	have_osxsave = reg[ECX] >> 27 & 1;
+
+	pg_cpuid_subleaf(0x07, 0, reg);
+
+	X86Features[PG_TSC_ADJUST] = reg[EBX] >> 1 & 1;
 
 	/* leaf 7 features that depend on OSXSAVE */
-	if (reg[ECX] & (1 << 27))
+	if (have_osxsave)
 	{
 		uint32		xcr0_val = 0;
 
-		pg_cpuid_subleaf(0x07, 0, reg);
-
 #ifdef HAVE_XSAVE_INTRINSICS
 		/* get value of Extended Control Register */
 		xcr0_val = _xgetbv(0);
@@ -135,7 +140,126 @@ set_x86_features(void)
 		}
 	}
 
+	/* Check for other TSC related flags */
+	pg_cpuid(0x80000001, reg);
+	X86Features[PG_RDTSCP] = reg[EDX] >> 27 & 1;
+
+	pg_cpuid(0x80000007, reg);
+	X86Features[PG_TSC_INVARIANT] = reg[EDX] >> 8 & 1;
+
 	X86Features[INIT_PG_X86] = true;
 }
 
+/* TSC (Time-Stamp Counter) handling code */
+
+static uint32 x86_hypervisor_tsc_frequency_khz(void);
+
+/*
+ * Determine the TSC frequency of the CPU through CPUID, where supported.
+ *
+ * Needed to interpret the tick value returned by RDTSC/RDTSCP. Returns the
+ * frequency in kHz, or 0 if it was not accessible via CPUID.
+ */
+uint32
+x86_tsc_frequency_khz(void)
+{
+	unsigned int reg[4] = {0};
+
+	if (x86_feature_available(PG_HYPERVISOR))
+	{
+		uint32		freq = x86_hypervisor_tsc_frequency_khz();
+
+		if (freq > 0)
+			return freq;
+	}
+
+	/*
+	 * On modern Intel CPUs, the TSC is implemented by invariant timekeeping
+	 * hardware, the "Always Running Timer" (ART), which stays consistent even
+	 * if the CPU changes frequency due to changing power levels.
+	 *
+	 * As documented in "Determining the Processor Base Frequency" in the
+	 * "Intel® 64 and IA-32 Architectures Software Developer's Manual",
+	 * February 2026 Edition, we can get the TSC frequency as follows:
+	 *
+	 * Nominal TSC frequency = ( CPUID.15H:ECX[31:0] * CPUID.15H:EBX[31:0] ) /
+	 * CPUID.15H:EAX[31:0]
+	 *
+	 * With CPUID.15H:ECX representing the nominal core crystal clock
+	 * frequency in Hz and EBX/EAX the TSC to core crystal clock ratio, see
+	 * Chapter 20.17 "Time-Stamp Counter" of that manual.
+	 *
+	 * Older Intel CPUs and other vendors do not set CPUID.15H:ECX, and as
+	 * such we fall back to alternate approaches.
+	 */
+	pg_cpuid(0x15, reg);
+	if (reg[ECX] > 0)
+	{
+		/*
+		 * EBX not being set indicates invariant TSC is not available. Require
+		 * EAX being non-zero too, to avoid a theoretical divide by zero.
+		 */
+		if (reg[EAX] == 0 || reg[EBX] == 0)
+			return 0;
+
+		/* Multiply in 64 bits and divide last: no overflow, no early rounding */
+		return (uint32) ((uint64) reg[ECX] * reg[EBX] / reg[EAX] / 1000);
+	}
+
+	/*
+	 * Otherwise try the processor base frequency in MHz from CPUID.16H:EAX
+	 * (bits 15:0), the "Processor Frequency Information Leaf". Clear reg
+	 * first, in case pg_cpuid() leaves leaf 15H's values in place on failure.
+	 */
+	memset(reg, 0, sizeof(reg));
+	pg_cpuid(0x16, reg);
+	if ((reg[EAX] & 0xFFFF) > 0)
+		return (reg[EAX] & 0xFFFF) * 1000;
+
+	return 0;
+}
+
+/*
+ * Support for reading TSC frequency for hypervisors passing it to a guest VM.
+ *
+ * Two Hypervisors (VMware and KVM) are known to make TSC frequency in KHz
+ * available at the vendor-specific 0x40000010 leaf in the EAX register.
+ * Returns 0 if the frequency could not be determined this way.
+ *
+ * For some other Hypervisors that have an invariant TSC, e.g. Hyper-V, we
+ * would need to access a model-specific register (MSR) to get the frequency.
+ * MSRs are separate from CPUID and typically not available for unprivileged
+ * processes, so we can't get the frequency this way.
+ */
+#define CPUID_HYPERVISOR_VMWARE(r) ((r)[EBX] == 0x61774d56 && (r)[ECX] == 0x4d566572 && (r)[EDX] == 0x65726177)	/* VMwareVMware */
+#define CPUID_HYPERVISOR_KVM(r) ((r)[EBX] == 0x4b4d564b && (r)[ECX] == 0x564b4d56 && (r)[EDX] == 0x0000004d)	/* KVMKVMKVM */
+static uint32
+x86_hypervisor_tsc_frequency_khz(void)
+{
+#if defined(HAVE__CPUIDEX)
+	unsigned int reg[4] = {0};
+
+	/*
+	 * The hypervisor is determined using the 0x40000000 Hypervisor
+	 * information leaf, which requires use of __cpuidex to set ECX to 0 to
+	 * access it.
+	 *
+	 * The similar __get_cpuid_count function does not work as expected since
+	 * it contains a check for __get_cpuid_max, which has been observed to be
+	 * lower than the special Hypervisor leaf, despite it being available.
+	 */
+	__cpuidex((int *) reg, 0x40000000, 0);
+
+	if (reg[EAX] >= 0x40000010 && (CPUID_HYPERVISOR_VMWARE(reg) || CPUID_HYPERVISOR_KVM(reg)))
+	{
+		__cpuidex((int *) reg, 0x40000010, 0);
+		if (reg[EAX] > 0)
+			return reg[EAX];
+	}
+#endif							/* HAVE__CPUIDEX */
+
+	return 0;
+}
+
+
 #endif							/* defined(USE_SSE2) || defined(__i386__) */
-- 
2.47.1

