On Sat, Jan 31, 2026 at 12:11 PM Lukas Fittl <[email protected]> wrote:
> I've reworked the patch a bit more, see attached v4

And of course, I took the wrong branch when running "git format-patch"
- apologies.

See attached v5.


Thanks,
Lukas

--
Lukas Fittl
From 39b3a0abf937320458c84005a7faed0fd005e5d7 Mon Sep 17 00:00:00 2001
From: Lukas Fittl <[email protected]>
Date: Sat, 31 Jan 2026 08:49:46 -0800
Subject: [PATCH v5 1/3] Check for HAVE__CPUIDEX and HAVE__GET_CPUID_COUNT
 separately

Previously we would only check for the availability of __cpuidex if
the related __get_cpuid_count was not available on a platform. But there
are cases where we want to be able to call __cpuidex as the only viable
option, specifically, when accessing a high leaf like VM Hypervisor
information (0x40000000), which __get_cpuid_count does not allow.

This will be used in a future commit to access Hypervisor information
about the TSC frequency of x86 CPUs, where available.

Note that __cpuidex is defined in cpuid.h for GCC/clang, but in intrin.h
for MSVC. Because we now set HAVE__CPUIDEX for GCC/clang when available,
adjust existing code to check for _MSC_VER when including intrin.h.
---
 configure                         | 20 ++++++++++++--------
 configure.ac                      | 30 +++++++++++++++++-------------
 meson.build                       | 10 ++++++++--
 src/port/pg_crc32c_sse42_choose.c |  4 ++--
 src/port/pg_popcount_x86.c        |  4 ++--
 5 files changed, 41 insertions(+), 27 deletions(-)

diff --git a/configure b/configure
index a10a2c85c6a..38de88fcc50 100755
--- a/configure
+++ b/configure
@@ -17648,7 +17648,8 @@ $as_echo "#define HAVE__CPUID 1" >>confdefs.h
   fi
 fi
 
-# Check for __get_cpuid_count() and __cpuidex() in a similar fashion.
+# Check for __get_cpuid_count() and __cpuidex() separately, since we sometimes
+# need __cpuidex() even if __get_cpuid_count() is available.
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __get_cpuid_count" >&5
 $as_echo_n "checking for __get_cpuid_count... " >&6; }
 if ${pgac_cv__get_cpuid_count+:} false; then :
@@ -17681,21 +17682,25 @@ if test x"$pgac_cv__get_cpuid_count" = x"yes"; then
 
 $as_echo "#define HAVE__GET_CPUID_COUNT 1" >>confdefs.h
 
-else
-  # __cpuidex()
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __cpuidex" >&5
+fi
+# __cpuidex()
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __cpuidex" >&5
 $as_echo_n "checking for __cpuidex... " >&6; }
 if ${pgac_cv__cpuidex+:} false; then :
   $as_echo_n "(cached) " >&6
 else
   cat confdefs.h - <<_ACEOF >conftest.$ac_ext
 /* end confdefs.h.  */
-#include <intrin.h>
+#ifdef _MSC_VER
+    #include <intrin.h>
+    #else
+    #include <cpuid.h>
+    #endif
 int
 main ()
 {
 unsigned int exx[4] = {0, 0, 0, 0};
-    __cpuidex(exx, 7, 0);
+  __cpuidex(exx, 7, 0);
 
   ;
   return 0;
@@ -17711,11 +17716,10 @@ rm -f core conftest.err conftest.$ac_objext \
 fi
 { $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__cpuidex" >&5
 $as_echo "$pgac_cv__cpuidex" >&6; }
-  if test x"$pgac_cv__cpuidex" = x"yes"; then
+if test x"$pgac_cv__cpuidex" = x"yes"; then
 
 $as_echo "#define HAVE__CPUIDEX 1" >>confdefs.h
 
-  fi
 fi
 
 # Check for XSAVE intrinsics
diff --git a/configure.ac b/configure.ac
index 814e64a967e..6e174cba328 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2098,7 +2098,8 @@ else
   fi
 fi
 
-# Check for __get_cpuid_count() and __cpuidex() in a similar fashion.
+# Check for __get_cpuid_count() and __cpuidex() separately, since we sometimes
+# need __cpuidex() even if __get_cpuid_count() is available.
 AC_CACHE_CHECK([for __get_cpuid_count], [pgac_cv__get_cpuid_count],
 [AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <cpuid.h>],
   [[unsigned int exx[4] = {0, 0, 0, 0};
@@ -2108,18 +2109,21 @@ AC_CACHE_CHECK([for __get_cpuid_count], [pgac_cv__get_cpuid_count],
   [pgac_cv__get_cpuid_count="no"])])
 if test x"$pgac_cv__get_cpuid_count" = x"yes"; then
   AC_DEFINE(HAVE__GET_CPUID_COUNT, 1, [Define to 1 if you have __get_cpuid_count.])
-else
-  # __cpuidex()
-  AC_CACHE_CHECK([for __cpuidex], [pgac_cv__cpuidex],
-  [AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <intrin.h>],
-    [[unsigned int exx[4] = {0, 0, 0, 0};
-    __cpuidex(exx, 7, 0);
-    ]])],
-    [pgac_cv__cpuidex="yes"],
-    [pgac_cv__cpuidex="no"])])
-  if test x"$pgac_cv__cpuidex" = x"yes"; then
-    AC_DEFINE(HAVE__CPUIDEX, 1, [Define to 1 if you have __cpuidex.])
-  fi
+fi
+# __cpuidex()
+AC_CACHE_CHECK([for __cpuidex], [pgac_cv__cpuidex],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([#ifdef _MSC_VER
+    #include <intrin.h>
+    #else
+    #include <cpuid.h>
+    #endif],
+  [[unsigned int exx[4] = {0, 0, 0, 0};
+  __cpuidex(exx, 7, 0);
+  ]])],
+  [pgac_cv__cpuidex="yes"],
+  [pgac_cv__cpuidex="no"])])
+if test x"$pgac_cv__cpuidex" = x"yes"; then
+  AC_DEFINE(HAVE__CPUIDEX, 1, [Define to 1 if you have __cpuidex.])
 fi
 
 # Check for XSAVE intrinsics
diff --git a/meson.build b/meson.build
index df907b62da3..2b106135869 100644
--- a/meson.build
+++ b/meson.build
@@ -2078,7 +2078,8 @@ elif cc.links('''
 endif
 
 
-# Check for __get_cpuid_count() and __cpuidex() in a similar fashion.
+# Check for __get_cpuid_count() and __cpuidex() separately, since we sometimes
+# need __cpuidex() even if __get_cpuid_count() is available.
 if cc.links('''
     #include <cpuid.h>
     int main(int arg, char **argv)
@@ -2089,8 +2090,13 @@ if cc.links('''
     ''', name: '__get_cpuid_count',
     args: test_c_args)
   cdata.set('HAVE__GET_CPUID_COUNT', 1)
-elif cc.links('''
+endif
+if cc.links('''
+    #ifdef _MSC_VER
     #include <intrin.h>
+    #else
+    #include <cpuid.h>
+    #endif
     int main(int arg, char **argv)
     {
         unsigned int exx[4] = {0, 0, 0, 0};
diff --git a/src/port/pg_crc32c_sse42_choose.c b/src/port/pg_crc32c_sse42_choose.c
index f586476964f..7a75380b483 100644
--- a/src/port/pg_crc32c_sse42_choose.c
+++ b/src/port/pg_crc32c_sse42_choose.c
@@ -20,11 +20,11 @@
 
 #include "c.h"
 
-#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
+#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT) || (defined(HAVE__CPUIDEX) && !defined(_MSC_VER))
 #include <cpuid.h>
 #endif
 
-#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX)
+#if defined(HAVE__CPUID) || (defined(HAVE__CPUIDEX) && defined(_MSC_VER))
 #include <intrin.h>
 #endif
 
diff --git a/src/port/pg_popcount_x86.c b/src/port/pg_popcount_x86.c
index 245f0167d00..f8a20766f2d 100644
--- a/src/port/pg_popcount_x86.c
+++ b/src/port/pg_popcount_x86.c
@@ -14,7 +14,7 @@
 
 #ifdef HAVE_X86_64_POPCNTQ
 
-#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
+#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT) || (defined(HAVE__CPUIDEX) && !defined(_MSC_VER))
 #include <cpuid.h>
 #endif
 
@@ -22,7 +22,7 @@
 #include <immintrin.h>
 #endif
 
-#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX)
+#if defined(HAVE__CPUID) || (defined(HAVE__CPUIDEX) && defined(_MSC_VER))
 #include <intrin.h>
 #endif
 
-- 
2.47.1

From 59effa585e987c50bf11708c0d0f36165422491d Mon Sep 17 00:00:00 2001
From: Lukas Fittl <[email protected]>
Date: Fri, 25 Jul 2025 17:57:20 -0700
Subject: [PATCH v5 2/3] Use time stamp counter to measure time on Linux/x86

We switch to using the time stamp counter (TSC) instead of clock_gettime()
to reduce overhead of EXPLAIN (ANALYZE, TIME ON). Tests showed that runtime
is reduced by around 10% for queries moving lots of rows through the plan.

For now this is only enabled on Linux/x86, in case the system clocksource is
reported as TSC. Relying on the Linux kernel simplifies the logic to detect
if the present TSC is usable (frequency invariant, synchronized between
sockets, etc.). In all other cases we fall back to clock_gettime().

Note that we intentionally use RDTSC in the fast paths, rather than RDTSCP.
RDTSCP waits for outstanding instructions to retire on out-of-order CPUs.
This adds noticeable overhead for little benefit in the typical InstrStartNode() /
InstrStopNode() use case. The macro to be used in such cases is called
INSTR_TIME_SET_CURRENT_FAST(). The original macro INSTR_TIME_SET_CURRENT()
uses RDTSCP and is supposed to be used when precision is more important
than performance.

Author: David Geier <[email protected]>
Author: Andres Freund <[email protected]>
Author: Lukas Fittl <[email protected]>
Reviewed-by:
Discussion: https://www.postgresql.org/message-id/flat/20200612232810.f46nbqkdhbutzqdg%40alap3.anarazel.de
---
 src/backend/executor/instrument.c         |  12 +-
 src/backend/utils/misc/guc_parameters.dat |  10 +
 src/backend/utils/misc/guc_tables.c       |  10 +
 src/common/Makefile                       |   1 +
 src/common/instr_time.c                   | 283 ++++++++++++++++++++++
 src/common/meson.build                    |   1 +
 src/include/portability/instr_time.h      | 155 ++++++++++--
 src/include/utils/guc_hooks.h             |   2 +
 src/include/utils/guc_tables.h            |   1 +
 9 files changed, 454 insertions(+), 21 deletions(-)
 create mode 100644 src/common/instr_time.c

diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c
index edab92a0ebe..ebdad31ca3b 100644
--- a/src/backend/executor/instrument.c
+++ b/src/backend/executor/instrument.c
@@ -67,9 +67,13 @@ InstrInit(Instrumentation *instr, int instrument_options)
 void
 InstrStartNode(Instrumentation *instr)
 {
-	if (instr->need_timer &&
-		!INSTR_TIME_SET_CURRENT_LAZY(instr->starttime))
-		elog(ERROR, "InstrStartNode called twice in a row");
+	if (instr->need_timer)
+	{
+		if (!INSTR_TIME_IS_ZERO(instr->starttime))
+			elog(ERROR, "InstrStartNode called twice in a row");
+		else
+			INSTR_TIME_SET_CURRENT_FAST(instr->starttime);
+	}
 
 	/* save buffer usage totals at node entry, if needed */
 	if (instr->need_bufusage)
@@ -95,7 +99,7 @@ InstrStopNode(Instrumentation *instr, double nTuples)
 		if (INSTR_TIME_IS_ZERO(instr->starttime))
 			elog(ERROR, "InstrStopNode called without start");
 
-		INSTR_TIME_SET_CURRENT(endtime);
+		INSTR_TIME_SET_CURRENT_FAST(endtime);
 		INSTR_TIME_ACCUM_DIFF(instr->counter, endtime, instr->starttime);
 
 		INSTR_TIME_SET_ZERO(instr->starttime);
diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat
index f0260e6e412..ca6c0d2d148 100644
--- a/src/backend/utils/misc/guc_parameters.dat
+++ b/src/backend/utils/misc/guc_parameters.dat
@@ -1035,6 +1035,16 @@
   max => '3',
 },
 
+{ name => 'fast_clock_source', type => 'enum', context => 'PGC_USERSET', group => 'RESOURCES_TIME',
+  short_desc => 'Controls the use of fast clock source on architectures that support it.',
+  long_desc => 'This enables the use of the faster RDTSC instruction on x86 systems (if available) to support timing measurements during EXPLAIN and other instrumentation.',
+  variable => 'fast_clock_source',
+  boot_val => 'FAST_CLOCK_SOURCE_AUTO',
+  options => 'fast_clock_source_options',
+  check_hook => 'check_fast_clock_source',
+  assign_hook => 'assign_fast_clock_source',
+},
+
 { name => 'file_copy_method', type => 'enum', context => 'PGC_USERSET', group => 'RESOURCES_DISK',
   short_desc => 'Selects the file copy method.',
   variable => 'file_copy_method',
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 13c569d8790..f92bbef943d 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -90,6 +90,7 @@
 #include "tcop/tcopprot.h"
 #include "tsearch/ts_cache.h"
 #include "utils/builtins.h"
+#include "portability/instr_time.h"
 #include "utils/bytea.h"
 #include "utils/float.h"
 #include "utils/guc_hooks.h"
@@ -371,6 +372,15 @@ static const struct config_enum_entry huge_pages_options[] = {
 	{NULL, 0, false}
 };
 
+static const struct config_enum_entry fast_clock_source_options[] = {
+	{"auto", FAST_CLOCK_SOURCE_AUTO, false},
+	{"off", FAST_CLOCK_SOURCE_OFF, false},
+#if !defined(WIN32) && defined(__x86_64__)
+	{"rdtsc", FAST_CLOCK_SOURCE_RDTSC, false},
+#endif
+	{NULL, 0, false}
+};
+
 static const struct config_enum_entry huge_pages_status_options[] = {
 	{"off", HUGE_PAGES_OFF, false},
 	{"on", HUGE_PAGES_ON, false},
diff --git a/src/common/Makefile b/src/common/Makefile
index 2c720caa509..1a2fbbe887f 100644
--- a/src/common/Makefile
+++ b/src/common/Makefile
@@ -59,6 +59,7 @@ OBJS_COMMON = \
 	file_perm.o \
 	file_utils.o \
 	hashfn.o \
+	instr_time.o \
 	ip.o \
 	jsonapi.o \
 	keywords.o \
diff --git a/src/common/instr_time.c b/src/common/instr_time.c
new file mode 100644
index 00000000000..2e6240b4408
--- /dev/null
+++ b/src/common/instr_time.c
@@ -0,0 +1,283 @@
+/*-------------------------------------------------------------------------
+ *
+ * instr_time.c
+ *	   Non-inline parts of the portable high-precision interval timing
+ *	 implementation
+ *
+ * Portions Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/port/instr_time.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#if defined(HAVE__GET_CPUID) || (defined(HAVE__CPUIDEX) && !defined(_MSC_VER))
+#include <cpuid.h>
+#endif
+
+#if defined(HAVE__CPUID) || (defined(HAVE__CPUIDEX) && defined(_MSC_VER))
+#include <intrin.h>
+#endif
+
+#include "portability/instr_time.h"
+
+#if !defined(WIN32) && defined(__x86_64__)
+static void set_ticks_per_ns(void);
+static void pg_initialize_rdtsc(void);
+#endif
+
+#ifdef FRONTEND
+
+void
+pg_initialize_fast_clock_source()
+{
+#if !defined(WIN32) && defined(__x86_64__)
+	pg_initialize_rdtsc();
+	use_tsc = has_rdtsc && has_rdtscp;
+	set_ticks_per_ns();
+#endif
+}
+
+#else
+
+#include "utils/guc_hooks.h"
+
+int			fast_clock_source = FAST_CLOCK_SOURCE_AUTO;
+
+bool
+check_fast_clock_source(int *newval, void **extra, GucSource source)
+{
+#if !defined(WIN32) && defined(__x86_64__)
+	if (*newval == FAST_CLOCK_SOURCE_AUTO || *newval == FAST_CLOCK_SOURCE_RDTSC)
+		pg_initialize_rdtsc();
+
+	if (*newval == FAST_CLOCK_SOURCE_RDTSC && (!has_rdtsc || !has_rdtscp))
+	{
+		GUC_check_errdetail("TSC is not supported as fast clock source");
+		return false;
+	}
+#endif
+
+	return true;
+}
+
+void
+assign_fast_clock_source(int newval, void *extra)
+{
+#if !defined(WIN32) && defined(__x86_64__)
+	use_tsc = has_rdtsc && has_rdtscp && (newval == FAST_CLOCK_SOURCE_RDTSC || (newval == FAST_CLOCK_SOURCE_AUTO && pg_fast_clock_source_default()));
+
+	set_ticks_per_ns();
+#endif
+}
+
+#endif							/* FRONTEND */
+
+/*
+ * Decides whether to use TSC time source if the user did not specify it
+ * one way or the other, and it is available (checked separately).
+ *
+ * Currently only enabled by default on Linux, since Linux already does a
+ * significant amount of work to determine whether TSC is a viable clock
+ * source.
+ */
+bool
+pg_fast_clock_source_default()
+{
+#if defined(__x86_64__) && defined(__linux__)
+	FILE	   *fp = fopen("/sys/devices/system/clocksource/clocksource0/current_clocksource", "r");
+	char		buf[128];
+
+	if (!fp)
+		return false;
+
+	if (fgets(buf, sizeof(buf), fp) != NULL && strcmp(buf, "tsc\n") == 0)
+	{
+		fclose(fp);
+		return true;
+	}
+
+	fclose(fp);
+#endif
+
+	return false;
+}
+
+#ifndef WIN32
+/*
+ * Stores what the number of cycles needs to be multiplied with to end up
+ * with nanoseconds using integer math. See comment in pg_initialize_rdtsc()
+ * for more details.
+ *
+ * By default assume we are using clock_gettime() as a fallback which uses
+ * nanoseconds as ticks. Hence, we set the multiplier to the precision scalar
+ * so that the division in INSTR_TIME_GET_NANOSEC() won't change the nanoseconds.
+ *
+ * When using the RDTSC instruction directly this is filled in during initialization
+ * based on the relevant CPUID fields.
+ */
+int64		ticks_per_ns_scaled = TICKS_TO_NS_PRECISION;
+int64		max_ticks_no_overflow = PG_INT64_MAX / TICKS_TO_NS_PRECISION;
+
+#if defined(__x86_64__)
+
+bool		has_rdtsc = false;
+bool		has_rdtscp = false;
+bool		use_tsc = false;
+
+static uint32 tsc_freq = 0;
+
+static int64
+ticks_per_ns_for_system()
+{
+	return TICKS_TO_NS_PRECISION;
+}
+
+/*
+ * For TSC, the ticks to nanoseconds conversion requires floating point math
+ * because:
+ *
+ * sec = ticks / frequency_hz
+ * ns  = ticks / frequency_hz * 1,000,000,000
+ * ns  = ticks * (1,000,000,000 / frequency_hz)
+ * ns  = ticks * (1,000,000 / frequency_khz) <-- now in kilohertz
+ *
+ * Here, 'ns' is usually a floating number. For example for a 2.5 GHz CPU
+ * the scaling factor becomes 1,000,000 / 2,500,000 = 1.2.
+ *
+ * To be able to use integer math we work around the lack of precision. We
+ * first scale the integer up and after the multiplication by the number
+ * of ticks in INSTR_TIME_GET_NANOSEC() we divide again by the same value.
+ * We picked the scaler such that it provides enough precision and is a
+ * power-of-two which allows for shifting instead of doing an integer
+ * division.
+ */
+static int64
+ticks_per_ns_for_tsc()
+{
+	return INT64CONST(1000000) * TICKS_TO_NS_PRECISION / tsc_freq;
+}
+
+static void
+set_ticks_per_ns()
+{
+	if (use_tsc)
+		ticks_per_ns_scaled = ticks_per_ns_for_tsc();
+	else
+		ticks_per_ns_scaled = ticks_per_ns_for_system();
+	max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled;
+}
+
+#define CPUID_HYPERVISOR_VMWARE(words) (words[1] == 0x61774d56 && words[2] == 0x4d566572 && words[3] == 0x65726177) /* VMwareVMware */
+#define CPUID_HYPERVISOR_KVM(words) (words[1] == 0x4b4d564b && words[2] == 0x564b4d56 && words[3] == 0x0000004d)	/* KVMKVMKVM */
+
+static bool
+get_tsc_frequency_khz()
+{
+	uint32		r[4] = {0, 0, 0, 0};
+
+#if defined(HAVE__GET_CPUID)
+	__get_cpuid(0x15, &r[0] /* denominator */ , &r[1] /* numerator */ , &r[2] /* hz */ , &r[3]);
+#elif defined(HAVE__CPUID)
+	__cpuid(r, 0x15);
+#else
+#error cpuid instruction not available
+#endif
+
+	if (r[2] > 0)
+	{
+		if (r[0] == 0 || r[1] == 0)
+			return false;
+
+		tsc_freq = r[2] / 1000 * r[1] / r[0];
+		return true;
+	}
+
+	/* Some CPUs only report frequency in 16H */
+
+#if defined(HAVE__GET_CPUID)
+	__get_cpuid(0x16, &r[0] /* base_mhz */ , &r[1], &r[2], &r[3]);
+#elif defined(HAVE__CPUID)
+	__cpuid(r, 0x16);
+#else
+#error cpuid instruction not available
+#endif
+
+	if (r[0] > 0)
+	{
+		tsc_freq = r[0] * 1000;
+		return true;
+	}
+
+	/*
+	 * Check if we have a KVM or VMware Hypervisor passing down TSC frequency
+	 * to us in a guest VM
+	 *
+	 * Note that accessing the 0x40000000 leaf for Hypervisor info requires
+	 * use of __cpuidex to set ECX to 0. The similar __get_cpuid_count
+	 * function does not work as expected since it contains a check for
+	 * __get_cpuid_max, which has been observed to be lower than the special
+	 * Hypervisor leaf.
+	 */
+#if defined(HAVE__CPUIDEX)
+	__cpuidex((int32 *) r, 0x40000000, 0);
+	if (r[0] >= 0x40000010 && (CPUID_HYPERVISOR_VMWARE(r) || CPUID_HYPERVISOR_KVM(r)))
+	{
+		__cpuidex((int32 *) r, 0x40000010, 0);
+		if (r[0] > 0)
+		{
+			tsc_freq = r[0];
+			return true;
+		}
+	}
+#endif
+
+	return false;
+}
+
+static bool
+is_rdtscp_available()
+{
+	uint32		r[4] = {0, 0, 0, 0};
+
+#if defined(HAVE__GET_CPUID)
+	if (!__get_cpuid(0x80000001, &r[0], &r[1], &r[2], &r[3]))
+		return false;
+#elif defined(HAVE__CPUID)
+	__cpuid(r, 0x80000001);
+#else
+#error cpuid instruction not available
+#endif
+
+	return (r[3] & (1 << 27)) != 0;
+}
+
+/*
+ * Decide whether we use the RDTSC instruction at runtime, for Linux/x86,
+ * instead of incurring the overhead of a full clock_gettime() call.
+ *
+ * This can't be reliably determined at compile time, since the
+ * availability of an "invariant" TSC (that is not affected by CPU
+ * frequency changes) is dependent on the CPU architecture. Additionally,
+ * there are cases where TSC availability is impacted by virtualization,
+ * where a simple cpuid feature check would not be enough.
+ */
+static void
+pg_initialize_rdtsc(void)
+{
+	/*
+	 * Compute baseline CPU performance, determines speed at which RDTSC
+	 * advances.
+	 */
+	if (!get_tsc_frequency_khz())
+		return;
+
+	has_rdtsc = true;
+	has_rdtscp = is_rdtscp_available();
+}
+#endif							/* defined(__x86_64__) */
+
+#endif							/* WIN32 */
diff --git a/src/common/meson.build b/src/common/meson.build
index b757618a9c9..042edb7473a 100644
--- a/src/common/meson.build
+++ b/src/common/meson.build
@@ -13,6 +13,7 @@ common_sources = files(
   'file_perm.c',
   'file_utils.c',
   'hashfn.c',
+  'instr_time.c',
   'ip.c',
   'jsonapi.c',
   'keywords.c',
diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h
index 490593d1825..5b7eeaa84c9 100644
--- a/src/include/portability/instr_time.h
+++ b/src/include/portability/instr_time.h
@@ -4,9 +4,10 @@
  *	  portable high-precision interval timing
  *
  * This file provides an abstraction layer to hide portability issues in
- * interval timing.  On Unix we use clock_gettime(), and on Windows we use
- * QueryPerformanceCounter().  These macros also give some breathing room to
- * use other high-precision-timing APIs.
+ * interval timing. On x86 we use the RDTSC/RDTSCP instruction directly in
+ * certain cases, or alternatively clock_gettime() on Unix-like systems and
+ * QueryPerformanceCounter() on Windows. These macros also give some breathing
+ * room to use other high-precision-timing APIs.
  *
  * The basic data type is instr_time, which all callers should treat as an
  * opaque typedef.  instr_time can store either an absolute time (of
@@ -17,10 +18,11 @@
  *
  * INSTR_TIME_SET_ZERO(t)			set t to zero (memset is acceptable too)
  *
- * INSTR_TIME_SET_CURRENT(t)		set t to current time
+ * INSTR_TIME_SET_CURRENT_FAST(t)	set t to current time without waiting
+ * 									for instructions in the out-of-order window
  *
- * INSTR_TIME_SET_CURRENT_LAZY(t)	set t to current time if t is zero,
- *									evaluates to whether t changed
+ * INSTR_TIME_SET_CURRENT(t)		set t to current time while waiting for
+ * 									instructions in OOO to retire
  *
  * INSTR_TIME_ADD(x, y)				x += y
  *
@@ -78,11 +80,47 @@ typedef struct instr_time
 #define NS_PER_MS	INT64CONST(1000000)
 #define NS_PER_US	INT64CONST(1000)
 
+/*
+ * Initialize fast clock source, and enable use if available.
+ *
+ * This needs to be called by client programs before using INSTR_* macros, in
+ * order to benefit from fast clock sources when available. If not called,
+ * INSTR_* macros will utilize the regular source instead.
+ *
+ * Not used for backends, where instead the fast_clock_source GUC controls, and
+ * the GUC check/assign hooks do the necessary initialization.
+ */
+#ifdef FRONTEND
+
+extern void pg_initialize_fast_clock_source(void);
+
+#else
+
+typedef enum
+{
+	FAST_CLOCK_SOURCE_AUTO,
+	FAST_CLOCK_SOURCE_OFF,
+	FAST_CLOCK_SOURCE_RDTSC
+}			FastClockSourceType;
+
+extern int	fast_clock_source;
+
+#endif							/* FRONTEND */
+
+extern bool pg_fast_clock_source_default(void);
 
 #ifndef WIN32
 
+/*
+ * Make sure this is a power-of-two, so that the compiler can turn the
+ * multiplications and divisions into shifts.
+ */
+#define TICKS_TO_NS_PRECISION (1<<14)
 
-/* Use clock_gettime() */
+extern int64 ticks_per_ns_scaled;
+extern int64 max_ticks_no_overflow;
+
+/* Use clock_gettime() or fast clock source */
 
 #include <time.h>
 
@@ -97,6 +135,9 @@ typedef struct instr_time
  * than CLOCK_MONOTONIC.  In particular, as of macOS 10.12, Apple provides
  * CLOCK_MONOTONIC_RAW which is both faster to read and higher resolution than
  * their version of CLOCK_MONOTONIC.
+ *
+ * Note this does not get used in case the fast clock source logic is used,
+ * which directly calls architecture specific timing instructions (e.g. RDTSC).
  */
 #if defined(__darwin__) && defined(CLOCK_MONOTONIC_RAW)
 #define PG_INSTR_CLOCK	CLOCK_MONOTONIC_RAW
@@ -106,9 +147,19 @@ typedef struct instr_time
 #define PG_INSTR_CLOCK	CLOCK_REALTIME
 #endif
 
-/* helper for INSTR_TIME_SET_CURRENT */
+#if defined(__x86_64__)
+#include <x86intrin.h>
+
+/* Indicates if TSC instructions (RDTSC and RDTSCP) are available. */
+extern bool has_rdtsc;
+extern bool has_rdtscp;
+
+/* Whether to actually use RDTSC/RDTSCP based on availability and GUC settings. */
+extern bool use_tsc;
+#endif
+
 static inline instr_time
-pg_clock_gettime_ns(void)
+pg_clock_gettime(void)
 {
 	instr_time	now;
 	struct timespec tmp;
@@ -119,11 +170,83 @@ pg_clock_gettime_ns(void)
 	return now;
 }
 
+static inline instr_time
+pg_get_ticks_fast(void)
+{
+#if defined(__x86_64__)
+	if (likely(use_tsc))
+	{
+		instr_time	now;
+
+		now.ticks = __rdtsc();
+		return now;
+	}
+#endif
+
+	return pg_clock_gettime();
+}
+
+static inline instr_time
+pg_get_ticks(void)
+{
+#if defined(__x86_64__)
+	if (likely(use_tsc))
+	{
+		instr_time	now;
+		uint32		unused;
+
+		now.ticks = __rdtscp(&unused);
+		return now;
+	}
+#endif
+
+	return pg_clock_gettime();
+}
+
+static inline int64_t
+pg_ticks_to_ns(instr_time t)
+{
+	/*
+	 * Would multiplication overflow? If so perform computation in two parts.
+	 * Check overflow without actually overflowing via: a * b > max <=> a >
+	 * max / b
+	 */
+	int64		ns = 0;
+
+	if (unlikely(t.ticks > max_ticks_no_overflow))
+	{
+		/*
+		 * Compute how often the maximum number of ticks fits completely into
+		 * the number of elapsed ticks and convert that number into
+		 * nanoseconds. Then multiply by the count to arrive at the final
+		 * value. In a 2nd step we adjust the number of elapsed ticks and
+		 * convert the remaining ticks.
+		 */
+		int64		count = t.ticks / max_ticks_no_overflow;
+		int64		max_ns = max_ticks_no_overflow * ticks_per_ns_scaled / TICKS_TO_NS_PRECISION;
+
+		ns = max_ns * count;
+
+		/*
+		 * Subtract the ticks that we now already accounted for, so that they
+		 * don't get counted twice.
+		 */
+		t.ticks -= count * max_ticks_no_overflow;
+		Assert(t.ticks >= 0);
+	}
+
+	ns += t.ticks * ticks_per_ns_scaled / TICKS_TO_NS_PRECISION;
+	return ns;
+}
+
+#define INSTR_TIME_SET_CURRENT_FAST(t) \
+	((t) = pg_get_ticks_fast())
+
 #define INSTR_TIME_SET_CURRENT(t) \
-	((t) = pg_clock_gettime_ns())
+	((t) = pg_get_ticks())
 
 #define INSTR_TIME_GET_NANOSEC(t) \
-	((int64) (t).ticks)
+	pg_ticks_to_ns(t)
 
 
 #else							/* WIN32 */
@@ -131,7 +254,7 @@ pg_clock_gettime_ns(void)
 
 /* Use QueryPerformanceCounter() */
 
-/* helper for INSTR_TIME_SET_CURRENT */
+/* helper for INSTR_TIME_SET_CURRENT / INSTR_TIME_SET_CURRENT_FAST */
 static inline instr_time
 pg_query_performance_counter(void)
 {
@@ -153,6 +276,9 @@ GetTimerFrequency(void)
 	return (double) f.QuadPart;
 }
 
+#define INSTR_TIME_SET_CURRENT_FAST(t) \
+	((t) = pg_query_performance_counter())
+
 #define INSTR_TIME_SET_CURRENT(t) \
 	((t) = pg_query_performance_counter())
 
@@ -168,13 +294,8 @@ GetTimerFrequency(void)
 
 #define INSTR_TIME_IS_ZERO(t)	((t).ticks == 0)
 
-
 #define INSTR_TIME_SET_ZERO(t)	((t).ticks = 0)
 
-#define INSTR_TIME_SET_CURRENT_LAZY(t) \
-	(INSTR_TIME_IS_ZERO(t) ? INSTR_TIME_SET_CURRENT(t), true : false)
-
-
 #define INSTR_TIME_ADD(x,y) \
 	((x).ticks += (y).ticks)
 
diff --git a/src/include/utils/guc_hooks.h b/src/include/utils/guc_hooks.h
index b6ecb0e769f..4488d8579fc 100644
--- a/src/include/utils/guc_hooks.h
+++ b/src/include/utils/guc_hooks.h
@@ -62,6 +62,8 @@ extern void assign_default_text_search_config(const char *newval, void *extra);
 extern bool check_default_with_oids(bool *newval, void **extra,
 									GucSource source);
 extern const char *show_effective_wal_level(void);
+extern void assign_fast_clock_source(int newval, void *extra);
+extern bool check_fast_clock_source(int *newval, void **extra, GucSource source);
 extern bool check_huge_page_size(int *newval, void **extra, GucSource source);
 extern void assign_io_method(int newval, void *extra);
 extern bool check_io_max_concurrency(int *newval, void **extra, GucSource source);
diff --git a/src/include/utils/guc_tables.h b/src/include/utils/guc_tables.h
index 71a80161961..63440b8e36c 100644
--- a/src/include/utils/guc_tables.h
+++ b/src/include/utils/guc_tables.h
@@ -60,6 +60,7 @@ enum config_group
 	CONN_AUTH_TCP,
 	CONN_AUTH_AUTH,
 	CONN_AUTH_SSL,
+	RESOURCES_TIME,
 	RESOURCES_MEM,
 	RESOURCES_DISK,
 	RESOURCES_KERNEL,
-- 
2.47.1

From fe13fb66422d2e4e47b06e1c95d0fd4155d88a1c Mon Sep 17 00:00:00 2001
From: Lukas Fittl <[email protected]>
Date: Sun, 27 Jul 2025 08:48:48 -0700
Subject: [PATCH v5 3/3] pg_test_timing: Also test fast timing and report time
 source

In passing, also reduce the per-loop overhead caused by repeated divisions
in INSTR_TIME_GET_NANOSEC when the ticks variable has become very large:
diff first, and only then convert the difference into nanoseconds.
---
 src/bin/pg_test_timing/pg_test_timing.c | 95 ++++++++++++++++++++-----
 src/include/portability/instr_time.h    | 31 +++++---
 2 files changed, 97 insertions(+), 29 deletions(-)

diff --git a/src/bin/pg_test_timing/pg_test_timing.c b/src/bin/pg_test_timing/pg_test_timing.c
index a5621251afc..7846ad07d97 100644
--- a/src/bin/pg_test_timing/pg_test_timing.c
+++ b/src/bin/pg_test_timing/pg_test_timing.c
@@ -30,7 +30,7 @@ static long long int largest_diff_count;
 
 
 static void handle_args(int argc, char *argv[]);
-static uint64 test_timing(unsigned int duration);
+static uint64 test_timing(unsigned int duration, bool fast_timing);
 static void output(uint64 loop_count);
 
 int
@@ -43,10 +43,30 @@ main(int argc, char *argv[])
 
 	handle_args(argc, argv);
 
-	loop_count = test_timing(test_duration);
-
+	/*
+	 * First, test default (non-fast) timing code. A clock source for that is
+	 * always available. Hence, we can unconditionally output the result.
+	 */
+	loop_count = test_timing(test_duration, false);
 	output(loop_count);
 
+	/*
+	 * Second, test the fast timing code. This clock source is not always
+	 * available. In that case the loop count will be 0 and we don't print.
+	 */
+	printf("\n");
+	loop_count = test_timing(test_duration, true);
+	if (loop_count > 0)
+	{
+		output(loop_count);
+		printf("\n");
+
+		if (pg_fast_clock_source_default())
+			printf(_("Fast time source will be used by default, unless fast_clock_source is set to 'off'.\n"));
+		else
+			printf(_("Fast time source will not be used by default, unless fast_clock_source is set to 'rdtsc'.\n"));
+	}
+
 	return 0;
 }
 
@@ -78,7 +98,7 @@ handle_args(int argc, char *argv[])
 		}
 	}
 
-	while ((option = getopt_long(argc, argv, "d:c:",
+	while ((option = getopt_long(argc, argv, "d:c:f:",
 								 long_options, &optindex)) != -1)
 	{
 		switch (option)
@@ -143,23 +163,55 @@ handle_args(int argc, char *argv[])
 		exit(1);
 	}
 
-	printf(ngettext("Testing timing overhead for %u second.\n",
-					"Testing timing overhead for %u seconds.\n",
+	printf(ngettext("Testing timing overhead for %u second.\n\n",
+					"Testing timing overhead for %u seconds.\n\n",
 					test_duration),
 		   test_duration);
 }
 
 static uint64
-test_timing(unsigned int duration)
+test_timing(unsigned int duration, bool fast_timing)
 {
 	uint64		total_time;
 	int64		time_elapsed = 0;
 	uint64		loop_count = 0;
-	uint64		prev,
-				cur;
 	instr_time	start_time,
 				end_time,
-				temp;
+				prev,
+				cur;
+	char	   *time_source = NULL;
+	bool		fast_timing_available = false;
+
+	if (fast_timing)
+		pg_initialize_fast_clock_source();
+
+#if !defined(WIN32) && defined(__x86_64__)
+	if (fast_timing && has_rdtsc)
+	{
+		time_source = "RDTSC";
+		fast_timing_available = true;
+	}
+	else if (has_rdtscp)
+
+		/*
+		 * Note that we never reach RDTSCP currently, because it only gets
+		 * made available by the fast clock source initialization. In
+		 * practice, we assume its performance to be similar to the OS clock
+		 * source.
+		 */
+		time_source = "RDTSCP";
+	else
+		time_source = PG_INSTR_CLOCK_NAME;
+#else
+	time_source = PG_INSTR_CLOCK_NAME;
+#endif
+	if (fast_timing && !fast_timing_available)
+		return 0;
+
+	if (fast_timing)
+		printf(_("Fast time source: %s\n"), time_source);
+	else
+		printf(_("Time source: %s\n"), time_source);
 
 	/*
 	 * Pre-zero the statistics data structures.  They're already zero by
@@ -173,8 +225,11 @@ test_timing(unsigned int duration)
 
 	total_time = duration > 0 ? duration * INT64CONST(1000000000) : 0;
 
-	INSTR_TIME_SET_CURRENT(start_time);
-	cur = INSTR_TIME_GET_NANOSEC(start_time);
+	if (fast_timing)
+		INSTR_TIME_SET_CURRENT_FAST(start_time);
+	else
+		INSTR_TIME_SET_CURRENT(start_time);
+	cur = start_time;
 
 	while (time_elapsed < total_time)
 	{
@@ -182,9 +237,11 @@ test_timing(unsigned int duration)
 					bits;
 
 		prev = cur;
-		INSTR_TIME_SET_CURRENT(temp);
-		cur = INSTR_TIME_GET_NANOSEC(temp);
-		diff = cur - prev;
+		if (fast_timing)
+			INSTR_TIME_SET_CURRENT_FAST(cur);
+		else
+			INSTR_TIME_SET_CURRENT(cur);
+		diff = INSTR_TIME_DIFF_NANOSEC(cur, prev);
 
 		/* Did time go backwards? */
 		if (unlikely(diff < 0))
@@ -217,11 +274,13 @@ test_timing(unsigned int duration)
 			largest_diff_count++;
 
 		loop_count++;
-		INSTR_TIME_SUBTRACT(temp, start_time);
-		time_elapsed = INSTR_TIME_GET_NANOSEC(temp);
+		time_elapsed = INSTR_TIME_DIFF_NANOSEC(cur, start_time);
 	}
 
-	INSTR_TIME_SET_CURRENT(end_time);
+	if (fast_timing)
+		INSTR_TIME_SET_CURRENT_FAST(end_time);
+	else
+		INSTR_TIME_SET_CURRENT(end_time);
 
 	INSTR_TIME_SUBTRACT(end_time, start_time);
 
diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h
index 5b7eeaa84c9..b885b04a1bf 100644
--- a/src/include/portability/instr_time.h
+++ b/src/include/portability/instr_time.h
@@ -141,10 +141,13 @@ extern int64 max_ticks_no_overflow;
  */
 #if defined(__darwin__) && defined(CLOCK_MONOTONIC_RAW)
 #define PG_INSTR_CLOCK	CLOCK_MONOTONIC_RAW
+#define PG_INSTR_CLOCK_NAME	"clock_gettime (CLOCK_MONOTONIC_RAW)"
 #elif defined(CLOCK_MONOTONIC)
 #define PG_INSTR_CLOCK	CLOCK_MONOTONIC
+#define PG_INSTR_CLOCK_NAME	"clock_gettime (CLOCK_MONOTONIC)"
 #else
 #define PG_INSTR_CLOCK	CLOCK_REALTIME
+#define PG_INSTR_CLOCK_NAME	"clock_gettime (CLOCK_REALTIME)"
 #endif
 
 #if defined(__x86_64__)
@@ -204,7 +207,7 @@ pg_get_ticks(void)
 }
 
 static inline int64_t
-pg_ticks_to_ns(instr_time t)
+pg_ticks_to_ns(int64 ticks)
 {
 	/*
 	 * Would multiplication overflow? If so perform computation in two parts.
@@ -213,7 +216,7 @@ pg_ticks_to_ns(instr_time t)
 	 */
 	int64		ns = 0;
 
-	if (unlikely(t.ticks > max_ticks_no_overflow))
+	if (unlikely(ticks > max_ticks_no_overflow))
 	{
 		/*
 		 * Compute how often the maximum number of ticks fits completely into
@@ -222,7 +225,7 @@ pg_ticks_to_ns(instr_time t)
 		 * value. In a 2nd step we adjust the number of elapsed ticks and
 		 * convert the remaining ticks.
 		 */
-		int64		count = t.ticks / max_ticks_no_overflow;
+		int64		count = ticks / max_ticks_no_overflow;
 		int64		max_ns = max_ticks_no_overflow * ticks_per_ns_scaled / TICKS_TO_NS_PRECISION;
 
 		ns = max_ns * count;
@@ -231,11 +234,11 @@ pg_ticks_to_ns(instr_time t)
 		 * Subtract the ticks that we now already accounted for, so that they
 		 * don't get counted twice.
 		 */
-		t.ticks -= count * max_ticks_no_overflow;
-		Assert(t.ticks >= 0);
+		ticks -= count * max_ticks_no_overflow;
+		Assert(ticks >= 0);
 	}
 
-	ns += t.ticks * ticks_per_ns_scaled / TICKS_TO_NS_PRECISION;
+	ns += ticks * ticks_per_ns_scaled / TICKS_TO_NS_PRECISION;
 	return ns;
 }
 
@@ -245,14 +248,14 @@ pg_ticks_to_ns(instr_time t)
 #define INSTR_TIME_SET_CURRENT(t) \
 	((t) = pg_get_ticks())
 
-#define INSTR_TIME_GET_NANOSEC(t) \
-	pg_ticks_to_ns(t)
-
+#define INSTR_TIME_TICKS_TO_NANOSEC(ticks) \
+	(pg_ticks_to_ns(ticks))
 
 #else							/* WIN32 */
 
 
 /* Use QueryPerformanceCounter() */
+#define PG_INSTR_CLOCK_NAME	"QueryPerformanceCounter"
 
 /* helper for INSTR_TIME_SET_CURRENT / INSTR_TIME_SET_CURRENT_FAST */
 static inline instr_time
@@ -282,8 +285,8 @@ GetTimerFrequency(void)
 #define INSTR_TIME_SET_CURRENT(t) \
 	((t) = pg_query_performance_counter())
 
-#define INSTR_TIME_GET_NANOSEC(t) \
-	((int64) ((t).ticks * ((double) NS_PER_S / GetTimerFrequency())))
+#define INSTR_TIME_TICKS_TO_NANOSEC(ticks) \
+	((int64) ((ticks) * ((double) NS_PER_S / GetTimerFrequency())))
 
 #endif							/* WIN32 */
 
@@ -302,12 +305,18 @@ GetTimerFrequency(void)
 #define INSTR_TIME_SUBTRACT(x,y) \
 	((x).ticks -= (y).ticks)
 
+#define INSTR_TIME_DIFF_NANOSEC(x,y) \
+	(INSTR_TIME_TICKS_TO_NANOSEC((x).ticks - (y).ticks))
+
 #define INSTR_TIME_ACCUM_DIFF(x,y,z) \
 	((x).ticks += (y).ticks - (z).ticks)
 
 #define INSTR_TIME_LT(x,y) \
 	((x).ticks > (y).ticks)
 
+#define INSTR_TIME_GET_NANOSEC(t) \
+	(INSTR_TIME_TICKS_TO_NANOSEC((t).ticks))
+
 #define INSTR_TIME_GET_DOUBLE(t) \
 	((double) INSTR_TIME_GET_NANOSEC(t) / NS_PER_S)
 
-- 
2.47.1

Reply via email to