Hi Thomas - RE: On 15/03/2018, Thomas Gleixner <t...@linutronix.de> wrote: > Jason, > > On Thu, 15 Mar 2018, jason.vas.d...@gmail.com wrote: > >> Resent to address reviewer comments. > > I was being patient so far and tried to guide you through the patch > submission process, but unfortunately this turns out to be just waste of my > time. > > You have not addressed any of the comments I made here: > > [1] > https://lkml.kernel.org/r/alpine.deb.2.21.1803141511340.2...@nanos.tec.linutronix.de > [2] > https://lkml.kernel.org/r/alpine.deb.2.21.1803141527300.2...@nanos.tec.linutronix.de >
I'm really sorry about that - I did not see those mails , and have searched for them in my inbox - are you sure they were sent to 'linux-kernel@vger.kernel.org' ? That is the only list I am subscribed to . I clicked on the links , but the 'To:' field is just 'linux-kernel' . If I had seen those messages before I re-submitted, those issues would have been fixed. checkpatch.pl did not report them - I ran it with all patches and it reported no errors . And I did send the test results in a previous mail - $ gcc -m64 -o timer timer.c ( must be compiled in 64-bit mode). This is using the new rdtscp() function : $ ./timer -r 100 ... Total time: 0.000002806S - Average Latency: 0.000000028S N zero deltas: 0 N inconsistent deltas: 0 Average of 100 average latencies of 100 samples : 0.000000027S This is using the rdtsc_ordered() function: $ ./timer -m -r 100 Total time: 0.000005269S - Average Latency: 0.000000052S N zero deltas: 0 N inconsistent deltas: 0 Average of 100 average latencies of 100 samples : 0.000000047S timer.c is a very short program that just reads N_SAMPLES (a compile-time option) timespecs using either CLOCK_MONOTONIC_RAW (no -m) or CLOCK_MONOTONIC first parameter to clock_gettime(), then computes the deltas as long long, then averages them , counting any zero deltas, or deltas where the previous timespec is somehow greater than the current timespec, which are reported as inconsitencies (note 'inconistent deltas:0' and 'zero deltas: 0' in output). So my initial claim that rdtscp() can be twice as fast as rdtsc_ordered() was not far-fetched - this is what I am seeing . I think this is because of the explicit barrier() call in rdtsc_ordered() . This must be slower than than the internal processor pipeline "cancellation point" (barrier) used by the rdtscp instruction itself. This is the only reason for the rdtscp call - plus all modern Intel & AMD CPUs support it, and it DOES solve the ordering problem, whereby instructions in one pipeline of a task can get different rdtsc() results than instructions in another pipeline. I will document the results better in the ChangeLog , fix all issues you identified, and resend . I did not mean to ignore your comments - those mails are nowhere in my Inbox - please , confirm the actual email address they are getting sent to. Thanks & Regards, Jason
/* * Program to measure high-res timer latency. * */ #include <stdint.h> #include <stdbool.h> #include <sys/types.h> #include <unistd.h> #include <time.h> #include <errno.h> #include <alloca.h> #include <string.h> #include <stdio.h> #include <stdlib.h> #ifndef N_SAMPLES #define N_SAMPLES 100 #endif #define _STR(_S_) #_S_ #define STR(_S_) _STR(_S_) #define TS2NS(_TS_) ((((unsigned long long)(_TS_).tv_sec)*1000000000ULL) + (((unsigned long long)((_TS_).tv_nsec)))) int main(int argc, char *const* argv, char *const* envp) { struct timespec sample[N_SAMPLES+1]; unsigned int cnt=N_SAMPLES, s=0 , avg_n=0; unsigned long long deltas [ N_SAMPLES ] , t1, t2, sum=0, zd=0, ic=0, d , t_start, avg_ns, *avgs=0; clockid_t clk = CLOCK_MONOTONIC_RAW; bool do_dump = false; int argn=1, repeat=1; for(; argn < argc; argn+=1) if( argv[argn] != NULL ) if( *(argv[argn]) == '-') switch( *(argv[argn]+1) ) { case 'm': case 'M': clk = CLOCK_MONOTONIC; break; case 'd': case 'D': do_dump = true; break; case 'r': case 'R': if( (argn < argc) && (argv[argn+1] != NULL)) repeat = atoi(argv[argn+=1]); break; case '?': case 'h': case 'u': case 'U': case 'H': fprintf(stderr,"Usage: timer_latency [\n\t-m : use CLOCK_MONOTONIC clock (not CLOCK_MONOTONIC_RAW)\n\t-d : dump timespec contents. N_SAMPLES: " STR(N_SAMPLES) "\n\t" "-r <repeat count>\n]\t" "Calculates average timer latency (minimum time that can be measured) over N_SAMPLES.\n" ); return 0; } if( repeat > 1 ) { avgs=alloca(sizeof(unsigned long long) * (N_SAMPLES + 1)); if( ((unsigned long) avgs) & 7 ) avgs = ((unsigned long long*)(((unsigned char*)avgs)+(8-((unsigned long) avgs) & 7))); } do { cnt=N_SAMPLES; s=0; do { if( 0 != clock_gettime(clk, &sample[s++]) ) { fprintf(stderr,"oops, clock_gettime() failed: %d: '%s'.\n", errno, strerror(errno)); return 1; } }while( --cnt ); clock_gettime(clk, &sample[s]); for(s=1; s < (N_SAMPLES+1); s+=1) { t1 = TS2NS(sample[s-1]); t2 = TS2NS(sample[s]); if ( (t1 > t2) ||(sample[s-1].tv_sec > sample[s].tv_sec) ||((sample[s-1].tv_sec == sample[s].tv_sec) &&(sample[s-1].tv_nsec > sample[s].tv_nsec) ) ) { fprintf(stderr,"Inconsistency: %llu %llu %lu.%lu %lu.%lu\n", t1 , t2 , sample[s-1].tv_sec, sample[s-1].tv_nsec , sample[s].tv_sec, sample[s].tv_nsec ); ic+=1; continue; } d = t2 - t1; if ( d == 0 ) { zd += 1; } deltas[s-1] = d; if(do_dump) fprintf(stderr, "%lu %lu %llu\n", sample[s].tv_sec, sample[s].tv_nsec, d ); } for(s = 0, sum=0; s < N_SAMPLES; s+=1) sum += deltas[s]; fprintf(stderr,"sum: %llu\n",sum); avg_ns = sum / N_SAMPLES; t_start = TS2NS(sample[0]); t1=(t2 - t_start); printf("Total time: %1.1llu.%9.9lluS - Average Latency: %1.1llu.%9.9lluS N zero deltas: %u N inconsistent deltas: %u\n", t1 / 1000000000, t1 % 1000000000, avg_ns / 1000000000, avg_ns % 1000000000 , zd, ic ); if (avgs != ((void*)0UL)) avgs[avg_n++] = avg_ns; } while (--repeat); if (avgs != ((void*)0UL)) { for( s=0, sum=0; s < avg_n; s+=1) sum += avgs[s]; printf("Average of %u average latencies of " STR(N_SAMPLES) " samples : %1.1llu.%9.9lluS\n" , avg_n, (sum / N_SAMPLES) / 1000000000, (sum / N_SAMPLES) % 1000000000 ); } return 0; }