Hi Thomas -
RE:
On 15/03/2018, Thomas Gleixner <t...@linutronix.de> wrote:
> Jason,
>
> On Thu, 15 Mar 2018, jason.vas.d...@gmail.com wrote:
>
>>   Resent to address reviewer comments.
>
> I was being patient so far and tried to guide you through the patch
> submission process, but unfortunately this turns out to be just waste of my
> time.
>
> You have not addressed any of the comments I made here:
>
> [1]
> https://lkml.kernel.org/r/alpine.deb.2.21.1803141511340.2...@nanos.tec.linutronix.de
> [2]
> https://lkml.kernel.org/r/alpine.deb.2.21.1803141527300.2...@nanos.tec.linutronix.de
>

I'm really sorry about that - I did not see those mails ,
and have searched for them in my inbox -
are you sure they were sent to 'linux-kernel@vger.kernel.org' ?
That is the only list I am subscribed to .
I clicked on the links , but the 'To:' field is just
'linux-kernel' .

If I had seen those messages before I re-submitted,
those issues would have been fixed.

checkpatch.pl did not report them -
I ran it with all patches and it reported
no errors .

And I did send the test results in a previous mail -

$ gcc -m64 -o timer timer.c

( must be compiled in 64-bit mode).

This is using the new rdtscp() function :
$ ./timer -r 100
...
Total time: 0.000002806S - Average Latency: 0.000000028S N zero
deltas: 0 N inconsistent deltas: 0
Average of 100 average latencies of 100 samples : 0.000000027S

This is using the rdtsc_ordered() function:

$ ./timer -m -r 100
Total time: 0.000005269S - Average Latency: 0.000000052S N zero
deltas: 0 N inconsistent deltas: 0
Average of 100 average latencies of 100 samples : 0.000000047S

timer.c is a very short program that just reads N_SAMPLES (a
compile-time option)
timespecs using either CLOCK_MONOTONIC_RAW (no -m) or CLOCK_MONOTONIC
first parameter to clock_gettime(),  then
computes the deltas as long long, then averages them , counting any
zero deltas, or deltas where the previous timespec is somehow
greater than the current timespec, which are reported as
inconsitencies (note 'inconistent deltas:0' and 'zero deltas: 0' in output).

So my initial claim that rdtscp() can be twice as fast as rdtsc_ordered()
was not far-fetched - this is what I am seeing .

I think this is because of the explicit barrier() call in rdtsc_ordered() .
This must be slower than than the internal processor pipeline
"cancellation point" (barrier) used by the rdtscp instruction itself.
This is the only reason for the rdtscp call  -  plus all modern Intel
& AMD CPUs support it, and it DOES solve the ordering problem,
whereby instructions in one pipeline of a task can get different
rdtsc() results than instructions in another pipeline.

I will document the results better in the ChangeLog , fix all issues
you identified, and resend .

I did not mean to ignore your comments -
those mails are nowhere in my Inbox -
please ,  confirm the actual email address
they are getting sent to.

Thanks & Regards,
Jason
/* 
 * Program to measure high-res timer latency.
 *
 */
#include <stdint.h>
#include <stdbool.h>
#include <sys/types.h>
#include <unistd.h>
#include <time.h>
#include <errno.h>
#include <alloca.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>

#ifndef N_SAMPLES
#define N_SAMPLES 100
#endif
#define _STR(_S_) #_S_
#define STR(_S_) _STR(_S_)

#define TS2NS(_TS_) ((((unsigned long long)(_TS_).tv_sec)*1000000000ULL) + (((unsigned long long)((_TS_).tv_nsec)))) 

int main(int argc, char *const* argv, char *const* envp)
{ struct timespec sample[N_SAMPLES+1];
  unsigned int cnt=N_SAMPLES, s=0 , avg_n=0;
  unsigned long long
    deltas [ N_SAMPLES ]
    , t1, t2, sum=0, zd=0, ic=0, d
    , t_start, avg_ns, *avgs=0;
  clockid_t clk = CLOCK_MONOTONIC_RAW;
  bool do_dump = false;
  int argn=1, repeat=1;
  for(; argn < argc; argn+=1)
    if( argv[argn] != NULL )
      if( *(argv[argn]) == '-')
	switch( *(argv[argn]+1) )
	{ case 'm':
	  case 'M':
	    clk = CLOCK_MONOTONIC;
	    break;
	  case 'd':
	  case 'D':
	    do_dump = true;
	    break;
     	  case 'r':
      	  case 'R':
	    if( (argn < argc) && (argv[argn+1] != NULL))
	      repeat = atoi(argv[argn+=1]);
	    break;
	  case '?':
	  case 'h':
	  case 'u':
	  case 'U':
	  case 'H':
	    fprintf(stderr,"Usage: timer_latency [\n\t-m : use CLOCK_MONOTONIC clock (not CLOCK_MONOTONIC_RAW)\n\t-d : dump timespec contents. N_SAMPLES: " STR(N_SAMPLES) "\n\t"
	            "-r <repeat count>\n]\t" 
	            "Calculates average timer latency (minimum time that can be measured) over N_SAMPLES.\n"
	           );
	    return 0;
	}
  if( repeat > 1 )
  { avgs=alloca(sizeof(unsigned long long) * (N_SAMPLES + 1));
    if( ((unsigned long) avgs) & 7 )
      avgs = ((unsigned long long*)(((unsigned char*)avgs)+(8-((unsigned long) avgs) & 7)));
  }
  do {
    cnt=N_SAMPLES;
    s=0;
  do
  { if( 0 != clock_gettime(clk, &sample[s++]) )
    { fprintf(stderr,"oops, clock_gettime() failed: %d: '%s'.\n", errno, strerror(errno));
      return 1;
    }
  }while( --cnt );
  clock_gettime(clk, &sample[s]);
  for(s=1; s < (N_SAMPLES+1); s+=1)
  { t1 = TS2NS(sample[s-1]);
    t2 = TS2NS(sample[s]);
    if ( (t1 > t2)
       ||(sample[s-1].tv_sec > sample[s].tv_sec)
       ||((sample[s-1].tv_sec == sample[s].tv_sec)
        &&(sample[s-1].tv_nsec > sample[s].tv_nsec)
         )
       )
    { fprintf(stderr,"Inconsistency: %llu %llu %lu.%lu %lu.%lu\n", t1 , t2
            , sample[s-1].tv_sec, sample[s-1].tv_nsec
            , sample[s].tv_sec,   sample[s].tv_nsec
      );
      ic+=1;
      continue;
    }
    d = t2 - t1;
    if ( d == 0 )
    {  zd += 1;
    }
    deltas[s-1] = d;
    if(do_dump)
      fprintf(stderr, "%lu %lu %llu\n",
              sample[s].tv_sec, sample[s].tv_nsec, d
             );
  }
  for(s = 0, sum=0; s < N_SAMPLES; s+=1)
    sum += deltas[s];
  fprintf(stderr,"sum: %llu\n",sum);
  avg_ns = sum / N_SAMPLES;
  t_start = TS2NS(sample[0]);
  t1=(t2 - t_start);
  printf("Total time: %1.1llu.%9.9lluS - Average Latency: %1.1llu.%9.9lluS N zero deltas: %u N inconsistent deltas: %u\n",
          t1 / 1000000000,       t1 % 1000000000,
          avg_ns / 1000000000,   avg_ns % 1000000000 ,
          zd, ic
        );
  if (avgs != ((void*)0UL))
    avgs[avg_n++] = avg_ns;
  } while (--repeat);
  if (avgs != ((void*)0UL))
  { for( s=0, sum=0; s < avg_n; s+=1)
      sum += avgs[s];
    printf("Average of %u average latencies of " STR(N_SAMPLES) " samples : %1.1llu.%9.9lluS\n"
          , avg_n, (sum / N_SAMPLES) / 1000000000, (sum / N_SAMPLES) % 1000000000
          );
  }
  return 0;
}

Reply via email to