(resending to list and other CC:s )
Hi Tom
This is my current patch, which also adds a running % column and
optionally uses a faster way to count leading zeros, though I did not
see a performance change from that.
It also bucketizes the first 128 ns individually to give a better overview of the exact behaviour.
We may want to put this reporting behind a flag.
---
Hannu
On Wed, Jun 19, 2024 at 6:36 PM Tom Lane <[email protected]> wrote:
>
> Peter Eisentraut <[email protected]> writes:
> > AFAICT, pg_test_timing doesn't use gettimeofday(), so this doesn't
> > really address the original question.
>
> It's not exactly hard to make it do so (see attached).
>
> I tried this on several different machines, and my conclusion is that
> gettimeofday() reports full microsecond precision on any platform
> anybody is likely to be running PG on today. Even my one surviving
> pet dinosaur, NetBSD 10 on PowerPC Mac (mamba), shows results like
> this:
>
> $ ./pg_test_timing
> Testing timing overhead for 3 seconds.
> Per loop time including overhead: 901.41 ns
> Histogram of timing durations:
> < us % of total count
> 1 10.46074 348148
> 2 89.51495 2979181
> 4 0.00574 191
> 8 0.00430 143
> 16 0.00691 230
> 32 0.00376 125
> 64 0.00012 4
> 128 0.00303 101
> 256 0.00027 9
> 512 0.00009 3
> 1024 0.00009 3
>
> I also modified pg_test_timing to measure nanoseconds not
> microseconds (second patch attached), and got this:
>
> $ ./pg_test_timing
> Testing timing overhead for 3 seconds.
> Per loop time including overhead: 805.50 ns
> Histogram of timing durations:
> < ns % of total count
> 1 19.84234 739008
> 2 0.00000 0
> 4 0.00000 0
> 8 0.00000 0
> 16 0.00000 0
> 32 0.00000 0
> 64 0.00000 0
> 128 0.00000 0
> 256 0.00000 0
> 512 0.00000 0
> 1024 80.14013 2984739
> 2048 0.00078 29
> 4096 0.00658 245
> 8192 0.00290 108
> 16384 0.00252 94
> 32768 0.00250 93
> 65536 0.00016 6
> 131072 0.00185 69
> 262144 0.00008 3
> 524288 0.00008 3
> 1048576 0.00008 3
>
> confirming that when the result changes it generally does so by 1usec.
>
> Applying just the second patch, I find that clock_gettime on this
> old hardware seems to be limited to 1us resolution, but on my more
> modern machines (mac M1, x86_64) it can tick at 40ns or less.
> Even a raspberry pi 4 shows
>
> $ ./pg_test_timing
> Testing timing overhead for 3 seconds.
> Per loop time including overhead: 69.12 ns
> Histogram of timing durations:
> < ns % of total count
> 1 0.00000 0
> 2 0.00000 0
> 4 0.00000 0
> 8 0.00000 0
> 16 0.00000 0
> 32 0.00000 0
> 64 37.59583 16317040
> 128 62.38568 27076131
> 256 0.01674 7265
> 512 0.00002 8
> 1024 0.00000 0
> 2048 0.00000 0
> 4096 0.00153 662
> 8192 0.00019 83
> 16384 0.00001 3
> 32768 0.00001 5
>
> suggesting that the clock_gettime resolution is better than 64 ns.
>
> So I concur with Hannu that it's time to adjust pg_test_timing to
> resolve nanoseconds not microseconds. I gather he's created a
> patch that does more than mine below, so I'll wait for that.
>
> regards, tom lane
>
diff --git a/src/bin/pg_test_timing/pg_test_timing.c b/src/bin/pg_test_timing/pg_test_timing.c
index c29d6f8762..20b2785f50 100644
--- a/src/bin/pg_test_timing/pg_test_timing.c
+++ b/src/bin/pg_test_timing/pg_test_timing.c
@@ -19,9 +19,12 @@ static void handle_args(int argc, char *argv[]);
static uint64 test_timing(unsigned int duration);
static void output(uint64 loop_count);
-/* record duration in powers of 2 microseconds */
+/* record duration in powers of 2 nanoseconds */
long long int histogram[32];
+/* record duration of first 128 ns directly */
+long long int direct_histogram[128];
+
int
main(int argc, char *argv[])
{
@@ -130,10 +133,10 @@ test_timing(unsigned int duration)
end_time,
temp;
- total_time = duration > 0 ? duration * INT64CONST(1000000) : 0;
+ total_time = duration > 0 ? duration * INT64CONST(1000000000) : 0;
INSTR_TIME_SET_CURRENT(start_time);
- cur = INSTR_TIME_GET_MICROSEC(start_time);
+ cur = INSTR_TIME_GET_NANOSEC(start_time);
while (time_elapsed < total_time)
{
@@ -142,7 +145,7 @@ test_timing(unsigned int duration)
prev = cur;
INSTR_TIME_SET_CURRENT(temp);
- cur = INSTR_TIME_GET_MICROSEC(temp);
+ cur = INSTR_TIME_GET_NANOSEC(temp);
diff = cur - prev;
/* Did time go backwards? */
@@ -153,19 +156,25 @@ test_timing(unsigned int duration)
exit(1);
}
+ if(likely(diff < 128))
+ direct_histogram[diff]++;
+
+#if defined(__has_builtin) && __has_builtin(__builtin_clz)
+ bits = 32 - __builtin_clz(diff);
+#else
/* What is the highest bit in the time diff? */
while (diff)
{
diff >>= 1;
bits++;
}
-
+#endif
/* Update appropriate duration bucket */
histogram[bits]++;
loop_count++;
INSTR_TIME_SUBTRACT(temp, start_time);
- time_elapsed = INSTR_TIME_GET_MICROSEC(temp);
+ time_elapsed = INSTR_TIME_GET_NANOSEC(temp);
}
INSTR_TIME_SET_CURRENT(end_time);
@@ -183,26 +192,46 @@ output(uint64 loop_count)
{
int64 max_bit = 31,
i;
- char *header1 = _("< us");
+ char *header1 = _("<= ns");
char *header2 = /* xgettext:no-c-format */ _("% of total");
+ char *header2a = /* xgettext:no-c-format */ _("running %");
char *header3 = _("count");
int len1 = strlen(header1);
int len2 = strlen(header2);
+ int len2a = strlen(header2a);
int len3 = strlen(header3);
+ float rprct;
/* find highest bit value */
while (max_bit > 0 && histogram[max_bit] == 0)
max_bit--;
printf(_("Histogram of timing durations:\n"));
- printf("%*s %*s %*s\n",
- Max(6, len1), header1,
+ printf("%*s %*s %*s %*s\n",
+ Max(8, len1), header1,
Max(10, len2), header2,
+ Max(10, len2a), header2a,
Max(10, len3), header3);
- for (i = 0; i <= max_bit; i++)
- printf("%*ld %*.5f %*lld\n",
- Max(6, len1), 1l << i,
- Max(10, len2) - 1, (double) histogram[i] * 100 / loop_count,
- Max(10, len3), histogram[i]);
+ for (i = 0, rprct=0; i <= max_bit; i++){
+ rprct += (double) histogram[i] * 100 / loop_count;
+ printf("%*ld %*.4f %*.4f %*lld\n",
+ Max(8, len1), (1l << i) - 1,
+ Max(10, len2) - 1, (double) histogram[i] * 100 / loop_count,
+ Max(10, len2a) - 1, rprct,
+ Max(10, len3), histogram[i]);
+ }
+
+ printf("First 128 nanoseconds:\n");
+ for (i = 0, rprct=0; i < 128; i++){
+ rprct += (double) direct_histogram[i] * 100 / loop_count;
+ if (direct_histogram[i])
+ printf("%*ld %*.4f %*.4f %*lld\n",
+ Max(8, len1), i,
+ Max(10, len2) - 1, (double) direct_histogram[i] * 100 / loop_count,
+ Max(10, len2a) - 1, rprct,
+ Max(10, len3), direct_histogram[i]);
+ }
+
}
+