From 8bd31560afe60bed38c3f63305d924d3b629206b Mon Sep 17 00:00:00 2001
From: Lukas Fittl <lukas@fittl.com>
Date: Thu, 30 Jun 2022 17:57:37 -0700
Subject: [PATCH v2 2/2] WIP: Use cpu reference cycles, via rdtsc, to measure
 time for instrumentation.

For now this is only enabled on Linux/x86 when the system clocksource is
marked tsc as well, as determined at runtime. This way we can rely on the
Linux kernel to make a decision whether tsc is invariant and usable on the
current CPU architecture. In all other cases we continue to use the
clock_gettime() implementation like before.

Note that this intentionally uses rdtsc, not rdtscp, as rdtscp waits for
currently running CPU instructions to have finished, and that adds up to
noticeable latency for little benefit in the typical InstrStartNode() /
InstrStopNode() use case.
---
 src/backend/utils/init/postinit.c       |   3 +
 src/bin/pg_test_timing/pg_test_timing.c |   1 +
 src/bin/pgbench/pgbench.c               |   3 +
 src/bin/psql/startup.c                  |   4 +
 src/common/Makefile                     |   1 +
 src/common/instr_time.c                 | 107 ++++++++++++++++++++++++
 src/include/portability/instr_time.h    |  53 +++++++++---
 src/tools/msvc/Mkvcbuild.pm             |   2 +-
 8 files changed, 163 insertions(+), 11 deletions(-)
 create mode 100644 src/common/instr_time.c

diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index 6b9082604f..6ec0ad3271 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -735,6 +735,9 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username,
 	/* Initialize portal manager */
 	EnablePortalManager();
 
+	/* initialize high-precision interval timing */
+	INSTR_TIME_INITIALIZE();
+
 	/* Initialize status reporting */
 	pgstat_beinit();
 
diff --git a/src/bin/pg_test_timing/pg_test_timing.c b/src/bin/pg_test_timing/pg_test_timing.c
index c29d6f8762..0d667ff5a7 100644
--- a/src/bin/pg_test_timing/pg_test_timing.c
+++ b/src/bin/pg_test_timing/pg_test_timing.c
@@ -132,6 +132,7 @@ test_timing(unsigned int duration)
 
 	total_time = duration > 0 ? duration * INT64CONST(1000000) : 0;
 
+	INSTR_TIME_INITIALIZE();
 	INSTR_TIME_SET_CURRENT(start_time);
 	cur = INSTR_TIME_GET_MICROSEC(start_time);
 
diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c
index fbb74bdc4c..043e2e433d 100644
--- a/src/bin/pgbench/pgbench.c
+++ b/src/bin/pgbench/pgbench.c
@@ -7093,6 +7093,9 @@ main(int argc, char **argv)
 		initRandomState(&state[i].cs_func_rs);
 	}
 
+	/* initialize high-precision interval timing */
+	INSTR_TIME_INITIALIZE();
+
 	/* opening connection... */
 	con = doConnect();
 	if (con == NULL)
diff --git a/src/bin/psql/startup.c b/src/bin/psql/startup.c
index 7c2f555f15..da930043c0 100644
--- a/src/bin/psql/startup.c
+++ b/src/bin/psql/startup.c
@@ -24,6 +24,7 @@
 #include "help.h"
 #include "input.h"
 #include "mainloop.h"
+#include "portability/instr_time.h"
 #include "settings.h"
 
 /*
@@ -322,6 +323,9 @@ main(int argc, char *argv[])
 
 	PQsetNoticeProcessor(pset.db, NoticeProcessor, NULL);
 
+	/* initialize high-precision interval timing */
+	INSTR_TIME_INITIALIZE();
+
 	SyncVariables();
 
 	if (options.list_dbs)
diff --git a/src/common/Makefile b/src/common/Makefile
index e9af7346c9..437a018590 100644
--- a/src/common/Makefile
+++ b/src/common/Makefile
@@ -59,6 +59,7 @@ OBJS_COMMON = \
 	file_perm.o \
 	file_utils.o \
 	hashfn.o \
+	instr_time.o \
 	ip.o \
 	jsonapi.o \
 	keywords.o \
diff --git a/src/common/instr_time.c b/src/common/instr_time.c
new file mode 100644
index 0000000000..a63ef19f08
--- /dev/null
+++ b/src/common/instr_time.c
@@ -0,0 +1,107 @@
+/*-------------------------------------------------------------------------
+ *
+ * instr_time.c
+ *	   Non-inline parts of the portable high-precision interval timing
+ *	 implementation
+ *
+ * Portions Copyright (c) 2022, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ *	  src/common/instr_time.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "portability/instr_time.h"
+
+#ifdef HAVE_CLOCK_GETTIME
+
+/*
+ * Stores what the number of cycles needs to be multiplied with to end up with
+ * seconds. This indirection exists to support the rdtsc instruction.
+ *
+ * As a default, assume we are using clock_gettime() as a fallback and treat it
+ * as 1 "cycle" per nanosecond (aka 1 GHz).
+ *
+ * When using the rdtsc instruction directly this is filled in during
+ * initialization based on the relevant cpuid fields.
+ */
+double cycles_to_sec = 1.0 / NS_PER_S;
+
+/*
+ * Determines whether rdtsc is used (Linux/x86 only, when OS uses tsc clocksource)
+ */
+bool use_rdtsc = false;
+
+#if defined(__x86_64__) && defined(__linux__)
+/*
+ * Decide whether we use the rdtsc instruction at runtime, for Linux/x86,
+ * instead of incurring the overhead of a full clock_gettime() call.
+ *
+ * This can't be reliably determined at compile time, since the
+ * availability of an "invariant" TSC (that is not affected by CPU
+ * frequency changes) is dependent on the CPU architecture. Additionally,
+ * there are cases where TSC availability is impacted by virtualization,
+ * where a simple cpuid feature check would not be enough.
+ *
+ * Since Linux already does a significant amount of work to determine
+ * whether TSC is a viable clock source, decide based on that.  If the TSC
+ * frequency cannot be determined, keep using clock_gettime().
+ */
+void pg_clock_gettime_initialize_rdtsc(void)
+{
+	FILE	   *fp;
+	char		buf[128];
+	uint32		cpuinfo[4] = {0};
+	float		cpu_mhz = 0;
+
+	fp = fopen("/sys/devices/system/clocksource/clocksource0/current_clocksource", "r");
+	if (fp != NULL)
+	{
+		/* check fgets() so we never compare an uninitialized buffer */
+		if (fgets(buf, sizeof(buf), fp) != NULL && strcmp(buf, "tsc\n") == 0)
+			use_rdtsc = true;
+		fclose(fp);
+	}
+
+	if (!use_rdtsc)
+		return;
+
+	/*
+	 * Compute baseline cpu performance, determines speed at which rdtsc
+	 * advances.
+	 *
+	 * FIXME: We should probably not unnecessarily use floating point math
+	 * here. And it's likely that the numbers are small enough that we are
+	 * running into floating point inaccuracies already. Probably worthwhile
+	 * to be a good bit smarter.
+	 */
+
+	/* cpuid leaf 0x16 reports the base frequency in MHz in EAX, if supported */
+	if (__get_cpuid(0x16, cpuinfo, cpuinfo + 1, cpuinfo + 2, cpuinfo + 3) &&
+		cpuinfo[0] != 0)
+	{
+		cycles_to_sec = 1 / ((double) cpuinfo[0] * 1000 * 1000);
+		return;
+	}
+
+	/* otherwise use the first "cpu MHz" line of /proc/cpuinfo, if any */
+	fp = fopen("/proc/cpuinfo", "r");
+	if (fp != NULL)
+	{
+		while (fgets(buf, sizeof(buf), fp))
+			if (sscanf(buf, "cpu MHz                : %f", &cpu_mhz) == 1)
+				break;
+		fclose(fp);			/* only close a stream that was actually opened */
+	}
+
+	if (cpu_mhz > 0)
+		cycles_to_sec = 1 / ((double) cpu_mhz * 1000 * 1000);
+	else
+		use_rdtsc = false;	/* unknown frequency: stay on clock_gettime() */
+}
+#endif							/* defined(__x86_64__) && defined(__linux__) */
+
+#endif						    /* HAVE_CLOCK_GETTIME */
diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h
index ca260032d1..a23b88d9e6 100644
--- a/src/include/portability/instr_time.h
+++ b/src/include/portability/instr_time.h
@@ -4,10 +4,12 @@
  *	  portable high-precision interval timing
  *
  * This file provides an abstraction layer to hide portability issues in
- * interval timing.  On Unix we use clock_gettime() if available, else
- * gettimeofday().  On Windows, gettimeofday() gives a low-precision result
- * so we must use QueryPerformanceCounter() instead.  These macros also give
- * some breathing room to use other high-precision-timing APIs.
+ * interval timing.  On Linux/x86 we use the rdtsc instruction when a TSC
+ * clocksource is also used on the host OS.  Otherwise, and on other
+ * Unix-like systems we use clock_gettime() if available, else gettimeofday().
+ * On Windows, gettimeofday() gives a low-precision result so we must use
+ * QueryPerformanceCounter() instead.  These macros also give some breathing
+ * room to use other high-precision-timing APIs.
  *
  * The basic data type is instr_time, which all callers should treat as an
  * opaque typedef.  instr_time can store either an absolute time (of
@@ -59,10 +61,15 @@
 
 #ifdef HAVE_CLOCK_GETTIME
 
-/* Use clock_gettime() */
+/* Uses rdtsc on Linux/x86 if available, otherwise clock_gettime() */
 
 #include <time.h>
 
+#if defined(__x86_64__) && defined(__linux__)
+#include <x86intrin.h>
+#include <cpuid.h>
+#endif
+
 /*
  * The best clockid to use according to the POSIX spec is CLOCK_MONOTONIC,
  * since that will give reliable interval timing even in the face of changes
@@ -83,7 +90,9 @@
 #define PG_INSTR_CLOCK	CLOCK_REALTIME
 #endif
 
+/* time in cpu reference cycles (when using rdtsc), otherwise nanoseconds */
 typedef int64 instr_time;
+
 #define NS_PER_S INT64CONST(1000000000)
 #define US_PER_S INT64CONST(1000000)
 #define MS_PER_S INT64CONST(1000)
@@ -95,17 +104,37 @@ typedef int64 instr_time;
 
 #define INSTR_TIME_SET_ZERO(t)	((t) = 0)
 
-static inline instr_time pg_clock_gettime_ns(void)
+/* Defined once in src/common/instr_time.c; only declared here. */
+extern double cycles_to_sec;
+extern bool use_rdtsc;
+
+#if defined(__x86_64__) && defined(__linux__)
+extern void pg_clock_gettime_initialize_rdtsc(void);
+#endif
+
+static inline instr_time pg_clock_gettime_ref_cycles(void)
 {
 	struct timespec tmp;
 
+#if defined(__x86_64__) && defined(__linux__)
+	if (use_rdtsc)
+		return __rdtsc();
+#endif
+
 	clock_gettime(PG_INSTR_CLOCK, &tmp);
 
 	return tmp.tv_sec * NS_PER_S + tmp.tv_nsec;
 }
 
+#if defined(__x86_64__) && defined(__linux__)
+#define INSTR_TIME_INITIALIZE() \
+	pg_clock_gettime_initialize_rdtsc()
+#else
+#define INSTR_TIME_INITIALIZE()
+#endif
+
 #define INSTR_TIME_SET_CURRENT(t) \
-	(t) = pg_clock_gettime_ns()
+	(t) = pg_clock_gettime_ref_cycles()
 
 #define INSTR_TIME_ADD(x,y) \
 	do { \
@@ -123,13 +152,13 @@ static inline instr_time pg_clock_gettime_ns(void)
 	} while (0)
 
 #define INSTR_TIME_GET_DOUBLE(t) \
-	((double) (t) / NS_PER_S)
+	((double) (t) * cycles_to_sec)
 
 #define INSTR_TIME_GET_MILLISEC(t) \
-	((double) (t) / NS_PER_MS)
+	((double) (t) * (cycles_to_sec * MS_PER_S))
 
 #define INSTR_TIME_GET_MICROSEC(t) \
-	((double) (t) / NS_PER_US)
+	((uint64) (t) * (cycles_to_sec * US_PER_S))
 
 #else							/* !HAVE_CLOCK_GETTIME */
 
@@ -143,6 +172,8 @@ typedef struct timeval instr_time;
 
 #define INSTR_TIME_SET_ZERO(t)	((t).tv_sec = 0, (t).tv_usec = 0)
 
+#define INSTR_TIME_INITIALIZE()
+
 #define INSTR_TIME_SET_CURRENT(t)	gettimeofday(&(t), NULL)
 
 #define INSTR_TIME_ADD(x,y) \
@@ -207,6 +238,8 @@ typedef LARGE_INTEGER instr_time;
 
 #define INSTR_TIME_SET_ZERO(t)	((t).QuadPart = 0)
 
+#define INSTR_TIME_INITIALIZE()
+
 #define INSTR_TIME_SET_CURRENT(t)	QueryPerformanceCounter(&(t))
 
 #define INSTR_TIME_ADD(x,y) \
diff --git a/src/tools/msvc/Mkvcbuild.pm b/src/tools/msvc/Mkvcbuild.pm
index e4feda10fd..a881fcc64e 100644
--- a/src/tools/msvc/Mkvcbuild.pm
+++ b/src/tools/msvc/Mkvcbuild.pm
@@ -126,7 +126,7 @@ sub mkvcbuild
 	our @pgcommonallfiles = qw(
 	  archive.c base64.c checksum_helper.c compression.c
 	  config_info.c controldata_utils.c d2s.c encnames.c exec.c
-	  f2s.c file_perm.c file_utils.c hashfn.c ip.c jsonapi.c
+	  f2s.c file_perm.c file_utils.c hashfn.c instr_time.c ip.c jsonapi.c
 	  keywords.c kwlookup.c link-canary.c md5_common.c
 	  pg_get_line.c pg_lzcompress.c pg_prng.c pgfnames.c psprintf.c relpath.c
 	  rmtree.c saslprep.c scram-common.c string.c stringinfo.c unicode_norm.c
-- 
2.34.0

