Re: [PATCH v4.16-rc5 (2)] x86/vdso: VDSO should handle clock_gettime(CLOCK_MONOTONIC_RAW) without syscall

2018-03-17 Thread Jason Vas Dias
Fixed a typo in timer_latency.c affecting only the -r printout:

$ gcc -DN_SAMPLES=1000 -o timer timer_latency.c
CLOCK_MONOTONIC ( using rdtscp_ordered() ) :

$ ./timer -m -r 10
sum: 67615
Total time: 0.67615S - Average Latency: 0.00067S N zero deltas: 0 N inconsistent deltas: 0
sum: 51858
Total time: 0.51858S - Average Latency: 0.00051S N zero deltas: 0 N inconsistent deltas: 0
sum: 51742
Total time: 0.51742S - Average Latency: 0.00051S N zero deltas: 0 N inconsistent deltas: 0
sum: 51944
Total time: 0.51944S - Average Latency: 0.00051S N zero deltas: 0 N inconsistent deltas: 0
sum: 51838
Total time: 0.51838S - Average Latency: 0.00051S N zero deltas: 0 N inconsistent deltas: 0
sum: 52397
Total time: 0.52397S - Average Latency: 0.00052S N zero deltas: 0 N inconsistent deltas: 0
sum: 52428
Total time: 0.52428S - Average Latency: 0.00052S N zero deltas: 0 N inconsistent deltas: 0
sum: 52135
Total time: 0.52135S - Average Latency: 0.00052S N zero deltas: 0 N inconsistent deltas: 0
sum: 52145
Total time: 0.52145S - Average Latency: 0.00052S N zero deltas: 0 N inconsistent deltas: 0
sum: 53116
Total time: 0.53116S - Average Latency: 0.00053S N zero deltas: 0 N inconsistent deltas: 0
Average of 10 average latencies of 1000 samples : 0.00053S


CLOCK_MONOTONIC_RAW ( using rdtscp() ) :

$ ./timer  -r 10
sum: 25755
Total time: 0.25755S - Average Latency: 0.00025S N zero deltas: 0 N inconsistent deltas: 0
sum: 21614
Total time: 0.21614S - Average Latency: 0.00021S N zero deltas: 0 N inconsistent deltas: 0
sum: 21616
Total time: 0.21616S - Average Latency: 0.00021S N zero deltas: 0 N inconsistent deltas: 0
sum: 21610
Total time: 0.21610S - Average Latency: 0.00021S N zero deltas: 0 N inconsistent deltas: 0
sum: 21619
Total time: 0.21619S - Average Latency: 0.00021S N zero deltas: 0 N inconsistent deltas: 0
sum: 21617
Total time: 0.21617S - Average Latency: 0.00021S N zero deltas: 0 N inconsistent deltas: 0
sum: 21610
Total time: 0.21610S - Average Latency: 0.00021S N zero deltas: 0 N inconsistent deltas: 0
sum: 16940
Total time: 0.16940S - Average Latency: 0.00016S N zero deltas: 0 N inconsistent deltas: 0
sum: 16939
Total time: 0.16939S - Average Latency: 0.00016S N zero deltas: 0 N inconsistent deltas: 0
sum: 16943
Total time: 0.16943S - Average Latency: 0.00016S N zero deltas: 0 N inconsistent deltas: 0
Average of 10 average latencies of 1000 samples : 0.00019S
/*
 * Program to measure high-res timer latency.
 *
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <string.h>
#include <errno.h>
#include <time.h>
#include <unistd.h>
#include <alloca.h>
#include <sys/types.h>

#ifndef N_SAMPLES
#define N_SAMPLES 100
#endif
#define _STR(_S_) #_S_
#define STR(_S_) _STR(_S_)

#define TS2NS(_TS_) ((((unsigned long long)(_TS_).tv_sec)*1000000000ULL) + ((unsigned long long)((_TS_).tv_nsec)))

int main(int argc, char *const* argv, char *const* envp)
{ struct timespec sample[N_SAMPLES+1];
  unsigned int cnt=N_SAMPLES, s=0 , avg_n=0;
  unsigned long long
deltas [ N_SAMPLES ]
, t1, t2, sum=0, zd=0, ic=0, d
, t_start, avg_ns, *avgs=0;
  clockid_t clk = CLOCK_MONOTONIC_RAW;
  bool do_dump = false;
  int argn=1, repeat=1;
  for(; argn < argc; argn+=1)
if( argv[argn] != NULL )
  if( *(argv[argn]) == '-')
	switch( *(argv[argn]+1) )
	{ case 'm':
	  case 'M':
	clk = CLOCK_MONOTONIC;
	break;
	  case 'd':
	  case 'D':
	do_dump = true;
	break;
 	  case 'r':
  	  case 'R':
	if( (argn < argc) && (argv[argn+1] != NULL))
	  repeat = atoi(argv[argn+=1]);
	break;
	  case '?':
	  case 'h':
	  case 'u':
	  case 'U':
	  case 'H':
	fprintf(stderr,"Usage: timer_latency [\n\t-m : use CLOCK_MONOTONIC clock (not CLOCK_MONOTONIC_RAW)\n\t-d : dump timespec contents. N_SAMPLES: " STR(N_SAMPLES) "\n\t"
	"-r \n]\t"
	"Calculates average timer latency (minimum time that can be measured) over N_SAMPLES.\n"
	   );
	return 0;
	}
  if( repeat > 1 )
  { avgs=alloca(sizeof(unsigned long long) * (N_SAMPLES + 1));
if( ((unsigned long) avgs) & 7 )
  avgs = ((unsigned long long*)(((unsigned char*)avgs)+(8-((unsigned long) avgs) & 7)));
  }
  do {
cnt=N_SAMPLES;
s=0;
  do
  { if( 0 != clock_gettime(clk, &sample[s++]) )
{ fprintf(stderr,"oops, clock_gettime() failed: %d: '%s'.\n", errno, strerror(errno));
  return 1;
}
  }while( --cnt );
  clock_gettime(clk, &sample[s]);
  for(s=1; s < (N_SAMPLES+1); s+=1)
  { t1 = TS2NS(sample[s-1]);
t2 = TS2NS(sample[s]);
if ( (t1 > t2)
   ||(sample[s-1].tv_sec > sample[s].tv_sec)
   ||((sample[s-1].tv_sec == sample[s].tv_sec)
&&(sample[s-1].tv_nsec > sample[s].tv_nsec)
 )
   )
{ fprintf(stderr,"Inconsistency: %llu %llu %lu.%lu %lu.%lu\n", t1 , t2
, sample[s-1].tv_sec, sample[s-1].tv_nsec
, sample[s].tv_sec,   sample[s].tv_nsec
  );
  ic+=1;
  


re: [PATCH v4.16-rc5 (2)] x86/vdso: VDSO should handle clock_gettime(CLOCK_MONOTONIC_RAW) without syscall

2018-03-17 Thread Jason Vas Dias
Hi -

I submitted a new version of the patch, stripped down to the bare essentials
(see the LKML emails with $subject), which passes all checkpatch.pl tests,
addresses all concerns raised by reviewers, uses only rdtsc_ordered(), and
updates only the following new fields in vsyscall_gtod_data:
u32 raw_mult,  raw_shift ; ...
gtod_long_t  monotonic_time_raw_sec   /* == tk->raw_sec */ ,
  monotonic_time_raw_nsec /* == tk->tkr_raw.nsec */;
(this is NOT the formatting used in vgtod.h - sorry about the previous
 formatting issues).

I don't see how one could present the raw timespec properly in user-space
without tk->tkr_raw.xtime_nsec and tk->raw_sec; CLOCK_MONOTONIC has
gtod->monotonic_time_sec and gtod->monotonic_time_snsec, and I am only
trying to follow exactly the existing algorithm in timekeeping.c's
getrawmonotonic64().
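
To make that arithmetic concrete, here is a minimal user-space sketch of the
getrawmonotonic64() calculation, assuming snapshots of tk->tkr_raw.mult /
.shift / .xtime_nsec, tk->raw_sec and the TSC value at the last timekeeper
update are available to the reader; the struct and function names below are
illustrative only, not the actual vgtod.h layout or the patch itself:

/* Illustrative sketch, not the actual patch: the core of the
 * getrawmonotonic64() arithmetic done from a user-space snapshot. */
#include <stdint.h>

struct raw_gtod_snapshot {       /* hypothetical names */
	uint64_t cycle_last;     /* TSC value at the last vsyscall update       */
	uint32_t raw_mult;       /* == tk->tkr_raw.mult                         */
	uint32_t raw_shift;      /* == tk->tkr_raw.shift                        */
	uint64_t raw_sec;        /* == tk->raw_sec                              */
	uint64_t raw_xtime_nsec; /* == tk->tkr_raw.xtime_nsec (ns << raw_shift) */
};

/* Scale the cycle delta by mult, add the shifted-nanosecond remainder,
 * shift down, and split into sec / nsec - the same sequence
 * getrawmonotonic64() performs with tk->tkr_raw and tk->raw_sec.
 * Assumes delta stays small enough that delta * mult cannot overflow
 * 64 bits (timekeeper updates are frequent in practice). */
static void tsc_to_raw_timespec(const struct raw_gtod_snapshot *g,
                                uint64_t tsc_now,
                                uint64_t *sec, uint64_t *nsec)
{
	uint64_t delta = tsc_now - g->cycle_last;
	uint64_t ns = (g->raw_xtime_nsec + delta * g->raw_mult) >> g->raw_shift;

	*sec  = g->raw_sec + ns / 1000000000ULL;
	*nsec = ns % 1000000000ULL;
}

In the vDSO proper this calculation would of course sit inside the usual
gtod seqcount read loop and use the vDSO's TSC read, but the scaling itself
is just the few lines above.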

When I submitted the initial version of this stripped-down patch,
I got an email back from the build robot reporting a compilation
error:

>   arch/x86/entry/vdso/vclock_gettime.o: In function `__vdso_clock_gettime':
>   vclock_gettime.c:(.text+0xf7): undefined reference to `__x86_indirect_thunk_rax'
>   /usr/bin/ld: arch/x86/entry/vdso/vclock_gettime.o: relocation R_X86_64_PC32 against undefined symbol `__x86_indirect_thunk_rax' can not be used when making a shared object; recompile with -fPIC
>   /usr/bin/ld: final link failed: Bad value
>   collect2: error: ld returned 1 exit status
> --
>   arch/x86/entry/vdso/vdso32.so.dbg: undefined symbols found
> --
>   objcopy: 'arch/x86/entry/vdso/vdso64.so.dbg': No such file
> ---


I had fixed this problem in the patch to the RHEL kernel attached to
bug #198161 (attachment #274751:
https://bugzilla.kernel.org/attachment.cgi?id=274751),
simply by reducing the number of clauses in __vdso_clock_gettime's
switch(clock) from 6 to 5, but at the cost of an extra test of clock
and a second switch(clock).

I reported this as GCC bug:
  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=84908
because I don't think GCC should cause a failure for a switch with
6 clauses but not for one with 5, but the response I got from
H.J. Lu was:

H.J. Lu wrote @ 2018-03-16 22:13:27 UTC:
>
> vDSO isn't compiled with $(KBUILD_CFLAGS).  Why does your kernel do it?
>
> Please try my kernel patch at comment 4..
>

So that patch to the arch/x86/entry/vdso/Makefile simply prevents
RETPOLINE_CFLAGS from being enabled when building the vDSO.

I defer to H.J.'s expertise on GCC + binutils and the advisability of
enabling RETPOLINE_CFLAGS in the vDSO - GCC definitely behaves strangely
for the vDSO when RETPOLINE_CFLAGS are enabled.

Please provide something like the patch in a future version of Linux,
and I suggest not compiling the vDSO with RETPOLINE_CFLAGS, as H.J.
does.

The inconsistency-check program in tools/testing/selftests/timers produces
no errors for long runs, and the timer_latency.c program (attached) also
produces no errors, with latencies of ~20ns for CLOCK_MONOTONIC_RAW
and ~40ns for CLOCK_MONOTONIC - this is, however, with the additional
rdtscp patches, under 4.15.9, as used on my system; the 4.16-rc5 version
submitted still uses barrier() + rdtsc, which has a latency of ~30ns for
CLOCK_MONOTONIC_RAW and ~40ns for CLOCK_MONOTONIC; but both are much,
much better than the 200-1000ns for CLOCK_MONOTONIC_RAW that unpatched
kernels have (all times refer to the 'Average Latency' output produced
by timer_latency.c).

I do apologize for the whitespace errors, unread emails, resends, and
confusion of previous emails - I now understand the process and standards
much better and will attempt to adhere to them more closely in future.

Thanks & Best Regards,
Jason Vas Dias
/* 
 * Program to measure high-res timer latency.
 *
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <string.h>
#include <errno.h>
#include <time.h>
#include <unistd.h>
#include <alloca.h>
#include <sys/types.h>

#ifndef N_SAMPLES
#define N_SAMPLES 100
#endif
#define _STR(_S_) #_S_
#define STR(_S_) _STR(_S_)

#define TS2NS(_TS_) ((((unsigned long long)(_TS_).tv_sec)*1000000000ULL) + ((unsigned long long)((_TS_).tv_nsec)))

int main(int argc, char *const* argv, char *const* envp)
{ struct timespec sample[N_SAMPLES+1];
  unsigned int cnt=N_SAMPLES, s=0 , avg_n=0;
  unsigned long long
deltas [ N_SAMPLES ]
, t1, t2, sum=0, zd=0, ic=0, d
, t_start, avg_ns, *avgs=0;
  clockid_t clk = CLOCK_MONOTONIC_RAW;
  bool do_dump = false;
  int argn=1, repeat=1;
  for(; argn < argc; argn+=1)
if( argv[argn] != NULL )
  if( *(argv[argn]) == '-')
	switch( *(argv[argn]+1) )
	{ case 'm':
	  case 'M':
	clk = CLOCK_MONOTONIC;
	break;
	  case 'd':
	  case 'D':
	do_dump = true;
	break;
 	  case 'r':
  	  case 'R':
	if( (argn < argc) && (argv[argn+1] != NULL))
	  repeat = atoi(argv[argn+=1]);
	break;
	  case '?':
	  case 'h':
	  case 'u':
	  case 'U':
	  case 'H':
	fprintf(stderr,"Usage: timer_latency [\n\t-m : use CLOCK_MONOTONIC clock (not 


[PATCH v4.16-rc5 (2)] x86/vdso: VDSO should handle clock_gettime(CLOCK_MONOTONIC_RAW) without syscall

2018-03-17 Thread jason . vas . dias


  Resent to address reviewer comments, and allow builds with compilers
  that support -DRETPOLINE to succeed.

  Currently, the VDSO does not handle
 clock_gettime( CLOCK_MONOTONIC_RAW, &ts )
  on Intel / AMD - it calls
 vdso_fallback_gettime()
  for this clock, which issues a syscall, having an unacceptably high
  latency (minimum measurable time or time between measurements)
  of 300-700ns on two 2.8-3.9GHz Haswell x86_64 (Family_Model: 06_3C)
  machines under various versions of Linux.

  Sometimes, particularly when correlating elapsed time to performance
  counter values, user-space  code needs to know elapsed time from the
  perspective of the CPU no matter how "hot" / fast or "cold" / slow it
  might be running wrt NTP / PTP "real" time; when code needs this,
  the latencies associated with a syscall are often unacceptably high.

  I reported this as Bug #198161 :
'https://bugzilla.kernel.org/show_bug.cgi?id=198161'
  and in previous posts with subjects matching 'CLOCK_MONOTONIC_RAW' .

  This patch handles CLOCK_MONOTONIC_RAW clock_gettime() in the VDSO ,
  by exporting the raw clock calibration, last cycles, last xtime_nsec,
  and last raw_sec value in the vsyscall_gtod_data during vsyscall_update() .
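
  As a rough illustration of that export step (with mock types, since this
  is a sketch rather than the patch itself), the mapping from the
  timekeeper's raw fields into vsyscall_gtod_data amounts to copying five
  values; the struct and function names below are placeholders, not the
  kernel's definitions:

/* Mock types for illustration only - not the kernel's definitions. */
#include <stdint.h>

struct mock_tkr_raw    { uint32_t mult, shift; uint64_t cycle_last, xtime_nsec; };
struct mock_timekeeper { struct mock_tkr_raw tkr_raw; uint64_t raw_sec; };
struct mock_gtod_data  {
	uint32_t raw_mult, raw_shift;
	uint64_t cycle_last;
	uint64_t monotonic_time_raw_sec, monotonic_time_raw_nsec;
};

/* The copies a vsyscall update would make so the vDSO can later reproduce
 * getrawmonotonic64() without a syscall (sketch only). */
static void export_raw_fields(struct mock_gtod_data *gtod,
                              const struct mock_timekeeper *tk)
{
	gtod->raw_mult   = tk->tkr_raw.mult;
	gtod->raw_shift  = tk->tkr_raw.shift;
	gtod->cycle_last = tk->tkr_raw.cycle_last;
	gtod->monotonic_time_raw_sec  = tk->raw_sec;
	gtod->monotonic_time_raw_nsec = tk->tkr_raw.xtime_nsec;
}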

  Now the new do_monotonic_raw() function in the vDSO has a latency of ~20ns
  on average, and the test program:
   tools/testing/selftests/timers/inconsistency-check.c
  succeeds with arguments: '-c 4 -t 120' or any arbitrary -t value.

  The patch is against Linus' latest 4.16-rc5 tree,
  current HEAD of :
git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
  .

  This patch affects only files:

   arch/x86/include/asm/vgtod.h
   arch/x86/entry/vdso/vclock_gettime.c
   arch/x86/entry/vsyscall/vsyscall_gtod.c
   arch/x86/entry/vdso/Makefile   

  Patches for kernels 3.10.0-21 and 4.9.65-rt23 (ARM) are attached to
  bug #198161, as is the test program, timer_latency.c, to demonstrate
  the problem.

  Before the patch a latency of 200-1000ns was measured for
clock_gettime(CLOCK_MONOTONIC_RAW, &ts)
  calls - after the patch, the same call on the same machine
  has a latency of ~20ns.


Thanks & Best Regards,
Jason Vas Dias

