(resending to list and other CC:s )

Hi Tom

This is my current patch which also adds running % and optionally uses
faster way to count leading zeros, though I did not  see a change from
that.

It also bucketizes first 128 ns to get better overview of exact behaviour.

We may want to put reporting this behind a flag

---
Hannu

On Wed, Jun 19, 2024 at 6:36 PM Tom Lane <t...@sss.pgh.pa.us> wrote:
>
> Peter Eisentraut <pe...@eisentraut.org> writes:
> > AFAICT, pg_test_timing doesn't use gettimeofday(), so this doesn't
> > really address the original question.
>
> It's not exactly hard to make it do so (see attached).
>
> I tried this on several different machines, and my conclusion is that
> gettimeofday() reports full microsecond precision on any platform
> anybody is likely to be running PG on today.  Even my one surviving
> pet dinosaur, NetBSD 10 on PowerPC Mac (mamba), shows results like
> this:
>
> $ ./pg_test_timing
> Testing timing overhead for 3 seconds.
> Per loop time including overhead: 901.41 ns
> Histogram of timing durations:
>   < us   % of total      count
>      1     10.46074     348148
>      2     89.51495    2979181
>      4      0.00574        191
>      8      0.00430        143
>     16      0.00691        230
>     32      0.00376        125
>     64      0.00012          4
>    128      0.00303        101
>    256      0.00027          9
>    512      0.00009          3
>   1024      0.00009          3
>
> I also modified pg_test_timing to measure nanoseconds not
> microseconds (second patch attached), and got this:
>
> $ ./pg_test_timing
> Testing timing overhead for 3 seconds.
> Per loop time including overhead: 805.50 ns
> Histogram of timing durations:
>   < ns   % of total      count
>      1     19.84234     739008
>      2      0.00000          0
>      4      0.00000          0
>      8      0.00000          0
>     16      0.00000          0
>     32      0.00000          0
>     64      0.00000          0
>    128      0.00000          0
>    256      0.00000          0
>    512      0.00000          0
>   1024     80.14013    2984739
>   2048      0.00078         29
>   4096      0.00658        245
>   8192      0.00290        108
>  16384      0.00252         94
>  32768      0.00250         93
>  65536      0.00016          6
> 131072      0.00185         69
> 262144      0.00008          3
> 524288      0.00008          3
> 1048576      0.00008          3
>
> confirming that when the result changes it generally does so by 1usec.
>
> Applying just the second patch, I find that clock_gettime on this
> old hardware seems to be limited to 1us resolution, but on my more
> modern machines (mac M1, x86_64) it can tick at 40ns or less.
> Even a raspberry pi 4 shows
>
> $ ./pg_test_timing
> Testing timing overhead for 3 seconds.
> Per loop time including overhead: 69.12 ns
> Histogram of timing durations:
>   < ns   % of total      count
>      1      0.00000          0
>      2      0.00000          0
>      4      0.00000          0
>      8      0.00000          0
>     16      0.00000          0
>     32      0.00000          0
>     64     37.59583   16317040
>    128     62.38568   27076131
>    256      0.01674       7265
>    512      0.00002          8
>   1024      0.00000          0
>   2048      0.00000          0
>   4096      0.00153        662
>   8192      0.00019         83
>  16384      0.00001          3
>  32768      0.00001          5
>
> suggesting that the clock_gettime resolution is better than 64 ns.
>
> So I concur with Hannu that it's time to adjust pg_test_timing to
> resolve nanoseconds not microseconds.  I gather he's created a
> patch that does more than mine below, so I'll wait for that.
>
>                         regards, tom lane
>
diff --git a/src/bin/pg_test_timing/pg_test_timing.c b/src/bin/pg_test_timing/pg_test_timing.c
index c29d6f8762..20b2785f50 100644
--- a/src/bin/pg_test_timing/pg_test_timing.c
+++ b/src/bin/pg_test_timing/pg_test_timing.c
@@ -19,9 +19,12 @@ static void handle_args(int argc, char *argv[]);
 static uint64 test_timing(unsigned int duration);
 static void output(uint64 loop_count);
 
-/* record duration in powers of 2 microseconds */
+/* record duration in powers of 2 nanoseconds */
 long long int histogram[32];
 
+/* record duration of first 128 ns directly */
+long long int direct_histogram[128];
+
 int
 main(int argc, char *argv[])
 {
@@ -130,10 +133,10 @@ test_timing(unsigned int duration)
 				end_time,
 				temp;
 
-	total_time = duration > 0 ? duration * INT64CONST(1000000) : 0;
+	total_time = duration > 0 ? duration * INT64CONST(1000000000) : 0;
 
 	INSTR_TIME_SET_CURRENT(start_time);
-	cur = INSTR_TIME_GET_MICROSEC(start_time);
+	cur = INSTR_TIME_GET_NANOSEC(start_time);
 
 	while (time_elapsed < total_time)
 	{
@@ -142,7 +145,7 @@ test_timing(unsigned int duration)
 
 		prev = cur;
 		INSTR_TIME_SET_CURRENT(temp);
-		cur = INSTR_TIME_GET_MICROSEC(temp);
+		cur = INSTR_TIME_GET_NANOSEC(temp);
 		diff = cur - prev;
 
 		/* Did time go backwards? */
@@ -153,19 +156,25 @@ test_timing(unsigned int duration)
 			exit(1);
 		}
 
+		if(likely(diff < 128))
+			direct_histogram[diff]++;
+
+#if defined(__has_builtin) && __has_builtin(__builtin_clz)
+		bits = 32 - __builtin_clz(diff);
+#else
 		/* What is the highest bit in the time diff? */
 		while (diff)
 		{
 			diff >>= 1;
 			bits++;
 		}
-
+#endif
 		/* Update appropriate duration bucket */
 		histogram[bits]++;
 
 		loop_count++;
 		INSTR_TIME_SUBTRACT(temp, start_time);
-		time_elapsed = INSTR_TIME_GET_MICROSEC(temp);
+		time_elapsed = INSTR_TIME_GET_NANOSEC(temp);
 	}
 
 	INSTR_TIME_SET_CURRENT(end_time);
@@ -183,26 +192,46 @@ output(uint64 loop_count)
 {
 	int64		max_bit = 31,
 				i;
-	char	   *header1 = _("< us");
+	char	   *header1 = _("<= ns");
 	char	   *header2 = /* xgettext:no-c-format */ _("% of total");
+	char	   *header2a = /* xgettext:no-c-format */ _("running %");
 	char	   *header3 = _("count");
 	int			len1 = strlen(header1);
 	int			len2 = strlen(header2);
+	int			len2a = strlen(header2a);
 	int			len3 = strlen(header3);
+	float       rprct;
 
 	/* find highest bit value */
 	while (max_bit > 0 && histogram[max_bit] == 0)
 		max_bit--;
 
 	printf(_("Histogram of timing durations:\n"));
-	printf("%*s   %*s %*s\n",
-		   Max(6, len1), header1,
+	printf("%*s   %*s %*s %*s\n",
+		   Max(8, len1), header1,
 		   Max(10, len2), header2,
+		   Max(10, len2a), header2a,
 		   Max(10, len3), header3);
 
-	for (i = 0; i <= max_bit; i++)
-		printf("%*ld    %*.5f %*lld\n",
-			   Max(6, len1), 1l << i,
-			   Max(10, len2) - 1, (double) histogram[i] * 100 / loop_count,
-			   Max(10, len3), histogram[i]);
+	for (i = 0, rprct=0; i <= max_bit; i++){
+		rprct += (double) histogram[i] * 100 / loop_count;
+		printf("%*ld    %*.4f  %*.4f %*lld\n",
+			Max(8, len1), (1l << i) - 1,
+			Max(10, len2) - 1, (double) histogram[i] * 100 / loop_count,
+			Max(10, len2a) - 1, rprct,
+			Max(10, len3), histogram[i]);
+	}
+
+    printf("First 128 nanoseconds:\n");
+	for (i = 0, rprct=0; i < 128; i++){
+		rprct += (double) direct_histogram[i] * 100 / loop_count;
+	    if (direct_histogram[i])
+			printf("%*ld    %*.4f  %*.4f %*lld\n",
+			   Max(8, len1), i,
+			   Max(10, len2) - 1, (double) direct_histogram[i] * 100 / loop_count,
+			   Max(10, len2a) - 1, rprct,
+			   Max(10, len3), direct_histogram[i]);
+	}
+
 }
+

Reply via email to