Re: [patch] faster vgetcpu using sidt (take 2)

2007-01-22 Thread dean gaudet
On Thu, 18 Jan 2007, Andi Kleen wrote:

> > let me know what you think... thanks.
> 
> It's ok, although I would like to have the file in a separate directory.

cool -- do you have a directory in mind?

and would you like this change as two separate patches or one combined 
patch?

thanks
-dean


Re: [patch] faster vgetcpu using sidt (take 2)

2007-01-18 Thread Andi Kleen
> let me know what you think... thanks.

It's ok, although I would like to have the file in a separate directory.

-Andi


Re: [patch] faster vgetcpu using sidt (take 2)

2007-01-14 Thread dean gaudet
On Sat, 13 Jan 2007, dean gaudet wrote:

> ok here is the latest rev of this patch (against 2.6.20-rc4).
> 
> timings in cycles:
> 
>              baseline   patched    baseline   patched
>              no cache   no cache   cache      cache
> k8 pre-revF  21         16         14         17
> k8 revF      31         17         14         17
> core2        38         16         12         14
> p4           49         41         24         24
> 
> the degradation in cached timings appears to be due to the 16 byte stack
> frame set up for the sidt instruction.  apparently due to -mno-red-zone...
> would you accept a patch which re-enables the red-zone for vsyscalls?

here is a first stab at a patch (applied on top of my vgetcpu sidt patch) 
which enables red-zone for vsyscall.  it fixes the cache degradation 
problem above by getting rid of the stack frame setup in vgetcpu (and 
improves the no cache cases as well but i haven't run it everywhere yet).
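
as a rough illustration of the effect (a standalone userland sketch, not 
code from the patch): a leaf function with a small sidt buffer can address 
it below %rsp directly when the red zone is available, but needs the extra 
frame setup under -mno-red-zone.  compare the two with gcc -S:

/* sidt.c -- illustrative only; build both ways and diff the asm:
 *    gcc -O2 -S sidt.c
 *    gcc -O2 -mno-red-zone -S sidt.c
 */
#include <stdint.h>

uint16_t idt_limit(void)
{
	struct {
		char pad[6];		/* keep the 8-byte base field aligned */
		uint16_t size;		/* sidt stores the 16-bit limit here */
		uint64_t address;	/* ...and the base address here */
	} idt;

	/* sidt writes 10 bytes starting at &idt.size; declaring both
	 * fields as outputs tells gcc the whole buffer is written */
	asm volatile("sidt %0" : "=m" (idt.size), "=m" (idt.address));
	return idt.size;
}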

to do this i split the user-mode-only portion of vsyscall.c into 
vsyscall_user.c.  this required a couple externs in vsyscall.c and two 
extra ".globl" in the asm in vsyscall_user.c.
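
for concreteness, the externs amount to something like this in vsyscall.c 
-- my sketch, reconstructed from the two variables the diff below moves 
out; the patch's exact lines may differ:

/* definitions now live in vsyscall_user.c */
extern int __sysctl_vsyscall;
extern seqlock_t __xtime_lock;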

i'm not sure if we need the CFLAGS_vsyscall.o still or not.

let me know what you think... thanks.

-dean

Index: linux/arch/x86_64/kernel/Makefile
===================================================================
--- linux.orig/arch/x86_64/kernel/Makefile	2006-11-29 13:57:37.000000000 -0800
+++ linux/arch/x86_64/kernel/Makefile	2007-01-13 23:34:22.000000000 -0800
@@ -6,7 +6,7 @@
 EXTRA_AFLAGS   := -traditional
 obj-y  := process.o signal.o entry.o traps.o irq.o \
ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \
-   x8664_ksyms.o i387.o syscall.o vsyscall.o \
+   x8664_ksyms.o i387.o syscall.o vsyscall.o vsyscall_user.o \
setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \
pci-dma.o pci-nommu.o alternative.o
 
@@ -45,6 +45,7 @@
 obj-y  += intel_cacheinfo.o
 
 CFLAGS_vsyscall.o  := $(PROFILING) -g0
+CFLAGS_vsyscall_user.o := $(PROFILING) -g0 -mred-zone
 
 therm_throt-y   += ../../i386/kernel/cpu/mcheck/therm_throt.o
 bootflag-y += ../../i386/kernel/bootflag.o
Index: linux/arch/x86_64/kernel/vsyscall.c
===================================================================
--- linux.orig/arch/x86_64/kernel/vsyscall.c	2007-01-13 22:21:01.000000000 -0800
+++ linux/arch/x86_64/kernel/vsyscall.c	2007-01-13 23:41:08.000000000 -0800
@@ -40,161 +40,12 @@
 #include <asm/segment.h>
 #include <asm/desc.h>
 #include <asm/topology.h>
-
-#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
-#define __syscall_clobber "r11","rcx","memory"
-
-int __sysctl_vsyscall __section_sysctl_vsyscall = 1;
-seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;
-
-/* is this necessary? */
-#ifndef CONFIG_NODES_SHIFT
-#define CONFIG_NODES_SHIFT 0
-#endif
-
 #include <asm/unistd.h>
 
-static __always_inline void timeval_normalize(struct timeval * tv)
-{
-   time_t __sec;
-
-   __sec = tv->tv_usec / 1000000;
-   if (__sec) {
-   tv->tv_usec %= 1000000;
-   tv->tv_sec += __sec;
-   }
-}
-
-static __always_inline void do_vgettimeofday(struct timeval * tv)
-{
-   long sequence, t;
-   unsigned long sec, usec;
-
-   do {
-   sequence = read_seqbegin(&__xtime_lock);
-   
-   sec = __xtime.tv_sec;
-   usec = __xtime.tv_nsec / 1000;
-
-   if (__vxtime.mode != VXTIME_HPET) {
-   t = get_cycles_sync();
-   if (t < __vxtime.last_tsc)
-   t = __vxtime.last_tsc;
-   usec += ((t - __vxtime.last_tsc) *
-__vxtime.tsc_quot) >> 32;
-   /* See comment in x86_64 do_gettimeofday. */
-   } else {
-   usec += ((readl((void __iomem *)
-  fix_to_virt(VSYSCALL_HPET) + 0xf0) -
- __vxtime.last) * __vxtime.quot) >> 32;
-   }
-   } while (read_seqretry(&__xtime_lock, sequence));
-
-   tv->tv_sec = sec + usec / 1000000;
-   tv->tv_usec = usec % 1000000;
-}
-
-/* RED-PEN may want to readd seq locking, but then the variable should be write-once. */
-static __always_inline void do_get_tz(struct timezone * tz)
-{
-   *tz = __sys_tz;
-}
-
-static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
-{
-   int ret;
-   asm volatile("vsysc2: syscall"
-   : "=a" (ret)
-   : "0" (__NR_gettimeofday),"D" (tv),"S" (tz) : __syscall_clobber );
-   return ret;
-}
-
-static __always_inline long time_syscall(long *t)
-{
-   long secs;
-   asm volatile("vsysc1: syscall"
-   : "=a" (secs)
-   : "0" (__NR_time),"D" (t) : 


[patch] faster vgetcpu using sidt (take 2)

2007-01-13 Thread dean gaudet
ok here is the latest rev of this patch (against 2.6.20-rc4).

timings in cycles:

             baseline   patched    baseline   patched
             no cache   no cache   cache      cache
k8 pre-revF  21         16         14         17
k8 revF      31         17         14         17
core2        38         16         12         14
p4           49         41         24         24

the degradation in cached timings appears to be due to the 16 byte stack
frame set up for the sidt instruction.  apparently due to -mno-red-zone...
would you accept a patch which re-enables the red-zone for vsyscalls?

here is the slightly updated description:

below is a patch which improves vgetcpu latency on all x86_64 
implementations i've tested.

Nathan Laredo pointed out the sgdt/sidt/sldt instructions are 
userland-accessible and we could use their limit fields to tuck away a few 
bits of per-cpu information.

vgetcpu generally uses lsl at present, but all of sgdt/sidt/sldt are
faster than lsl on all x86_64 processors i've tested.  lsl requires
microcoded permission testing whereas s*dt are free of any such hassle.

sldt is the least expensive of the three instructions however it's a 
hassle to use because processes may want to adjust their ldt.  sidt/sgdt 
have essentially the same performance across all the major architectures 
-- however sidt has the advantage that its limit field is 16-bits, yet any 
value >= 0xfff is essentially "infinite" because there are only 256 (16 
byte) descriptors.  so sidt is probably the best choice of the three.

in benchmarking i've discovered the rdtscp implementation of vgetcpu is 
slower than even the lsl-based implementation on opteron revF.  so i've 
dropped the rdtscp implementation in this patch.  however i've left the 
rdtscp_aux register initialized because i'm sure it's the right choice for 
various proposed vgettimeofday / per-cpu tsc state improvements which need 
the atomic nature of the rdtscp instruction and i hope it'll be used in 
those situations.

at compile time this patch detects if 0x1000 + 
(CONFIG_NR_CPUS << CONFIG_NODES_SHIFT) will fit in the idt limit field and 
selects the lsl method otherwise.  i've further added a test for the 20 
bit limit of the lsl method and #error in the event it doesn't fit (we 
could fall all the way back to cpuid method if someone has a box with that 
many cpus*nodes, but i'll let someone else handle that case ;).

given this is a compile-time choice, and rdtscp is always slower than 
sidt, i've dropped the vgetcpu_mode variable.

timing tools and test case can be found at 
http://arctic.org/~dean/vgetcpu/
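
a sketch of what that compile-time selection looks like (the exact 
spelling in the patch may differ; VGETCPU_USE_SIDT is the macro the 
vgetcpu code below keys on):

/* pick sidt when the encoding fits the 16-bit idt limit field,
 * fall back to lsl otherwise; the lsl limit is only 20 bits */
#if 0x1000 + (CONFIG_NR_CPUS << CONFIG_NODES_SHIFT) <= 0xffff
# define VGETCPU_USE_SIDT
#elif (CONFIG_NR_CPUS << CONFIG_NODES_SHIFT) >= (1 << 20)
# error "cpu/node encoding does not fit the 20-bit lsl limit"
#endif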

-dean

Signed-off-by: dean gaudet <[EMAIL PROTECTED]>

Index: linux/arch/x86_64/kernel/time.c
===================================================================
--- linux.orig/arch/x86_64/kernel/time.c	2007-01-13 22:20:46.000000000 -0800
+++ linux/arch/x86_64/kernel/time.c	2007-01-13 22:21:01.000000000 -0800
@@ -957,11 +957,6 @@
if (unsynchronized_tsc())
notsc = 1;
 
-   if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
-   vgetcpu_mode = VGETCPU_RDTSCP;
-   else
-   vgetcpu_mode = VGETCPU_LSL;
-
if (vxtime.hpet_address && notsc) {
timetype = hpet_use_timer ? "HPET" : "PIT/HPET";
if (hpet_use_timer)
Index: linux/arch/x86_64/kernel/vsyscall.c
===================================================================
--- linux.orig/arch/x86_64/kernel/vsyscall.c	2007-01-13 22:20:46.000000000 -0800
+++ linux/arch/x86_64/kernel/vsyscall.c	2007-01-13 22:21:01.000000000 -0800
@@ -46,7 +46,11 @@
 
 int __sysctl_vsyscall __section_sysctl_vsyscall = 1;
 seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;
-int __vgetcpu_mode __section_vgetcpu_mode;
+
+/* is this necessary? */
+#ifndef CONFIG_NODES_SHIFT
+#define CONFIG_NODES_SHIFT 0
+#endif
 
 #include <asm/unistd.h>
 
@@ -147,11 +151,11 @@
 long __vsyscall(2)
 vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
 {
-   unsigned int dummy, p;
+   unsigned int p;
unsigned long j = 0;
 
/* Fast cache - only recompute value once per jiffies and avoid
-  relatively costly rdtscp/cpuid otherwise.
+  relatively costly lsl/sidt otherwise.
   This works because the scheduler usually keeps the process
   on the same CPU and this syscall doesn't guarantee its
   results anyways.
@@ -160,21 +164,30 @@
   If you don't like it pass NULL. */
if (tcache && tcache->blob[0] == (j = __jiffies)) {
p = tcache->blob[1];
-   } else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
-   /* Load per CPU data from RDTSCP */
-   rdtscp(dummy, dummy, p);
-   } else {
+   }
+   else {
+#ifdef VGETCPU_USE_SIDT
+struct {
+char pad[6];   /* avoid unaligned stores */
+u16 size;
+u64 address;
+} idt;
+
+asm("sidt %0" : "=m" (idt.size));
+p = idt.size - 0x1000;
+#else
/* Load per CPU data from GDT */
asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
-   }
-   if (tcache) {
-   tcache->blob[0] = j;
-   tcache->blob[1] = p;
+#endif
+   if (tcache) {
+   tcache->blob[0] = j;
+   tcache->blob[1] = p;
+   


Re: [patch] faster vgetcpu using sidt

2007-01-09 Thread Andi Kleen
> 64-bit processes can't actually use their 
> LDT can they?  

The kernel supports LDT for 64bit processes, it just is not commonly
used. 

-Andi


Re: [patch] faster vgetcpu using sidt

2007-01-08 Thread dean gaudet
On Sat, 6 Jan 2007, dean gaudet wrote:

> below is a patch which improves vgetcpu latency on all x86_64 
> implementations i've tested.
> 
> Nathan Laredo pointed out the sgdt/sidt/sldt instructions are 
> userland-accessible and we could use their limit fields to tuck away a few 
> bits of per-cpu information.
...

i got a hold of a p4 (model 4) and ran the timings there:

             baseline   patched    patched
                        no cache   cache
k8 pre-revF  21         14         16
k8 revF      31         14         17
core2        38         12         17
p4           49         24         37

not as good as i hoped... i'll have to put the cache back in just for the 
p4... so i'll respin my patch with the cache back in place.

another thought occurred to me -- 64-bit processes can't actually use their 
LDT can they?  in that case i could probably use sldt (faster than sidt) 
for 64-bit procs and fallback to sidt for 32-bit emulation (which doesn't 
exist for this vsyscall yet anyhow).
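
for what it's worth, the userland read side of that idea would be trivial 
-- sldt stores the 16-bit LDT selector, which the kernel could load with a 
per-cpu value.  illustrative sketch only, nothing in the patch does this:

static inline unsigned short read_ldt_selector(void)
{
	unsigned short sel;

	/* sldt stores the selector held in the LDTR register */
	asm volatile("sldt %0" : "=r" (sel));
	return sel;
}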

let me know if you have any other feedback.

thanks
-dean


[patch] faster vgetcpu using sidt

2007-01-06 Thread dean gaudet
below is a patch which improves vgetcpu latency on all x86_64 
implementations i've tested.

Nathan Laredo pointed out the sgdt/sidt/sldt instructions are 
userland-accessible and we could use their limit fields to tuck away a few 
bits of per-cpu information.

vgetcpu generally uses lsl at present, but all of sgdt/sidt/sldt are 
faster than lsl on all x86_64 processors i've tested.  on p4 processors 
lsl tends to be 150 cycles whereas the s*dt instructions are 15 cycles or 
less.  lsl requires microcoded permission testing whereas s*dt are free 
of any such hassle.

sldt is the least expensive of the three instructions however it's a 
hassle to use because processes may want to adjust their ldt.  sidt/sgdt 
have essentially the same performance across all the major architectures 
-- however sidt has the advantage that its limit field is 16-bits, yet any 
value >= 0xfff is essentially "infinite" because there are only 256 (16 
byte) descriptors.  so sidt is probably the best choice of the three.
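
to make the encoding concrete, here's a userland sketch of the read side.  
it assumes a kernel running this patch, mirrors the struct layout in the 
vgetcpu code below, and NODES_SHIFT is an example value standing in for 
the kernel's CONFIG_NODES_SHIFT:

#include <stdint.h>
#include <stdio.h>

#define NODES_SHIFT 6			/* example value, not from the patch */

int main(void)
{
	struct {
		char pad[6];		/* avoid unaligned stores */
		uint16_t size;
		uint64_t address;
	} idt;
	unsigned int p;

	asm volatile("sidt %0" : "=m" (idt.size), "=m" (idt.address));
	p = idt.size - 0x1000;		/* kernel stored 0x1000 + encoding */
	printf("cpu %u node %u\n",
	       p >> NODES_SHIFT, p & ((1u << NODES_SHIFT) - 1));
	return 0;
}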

in benchmarking i've discovered the rdtscp implementation of vgetcpu is 
slower than even the lsl-based implementation on opteron revF.  so i've 
dropped the rdtscp implementation in this patch.  however i've left the 
rdtscp_aux register initialized because i'm sure it's the right choice for 
various proposed vgettimeofday / per-cpu tsc state improvements which need 
the atomic nature of the rdtscp instruction and i hope it'll be used in 
those situations.

at compile time this patch detects if 0x1000 + 
(CONFIG_NR_CPUS << CONFIG_NODES_SHIFT) will fit in the idt limit field and 
selects the lsl method otherwise.  i've further added a test for the 20 
bit limit of the lsl method and #error in the event it doesn't fit (we 
could fall all the way back to cpuid method if someone has a box with that 
many cpus*nodes, but i'll let someone else handle that case ;).

given this is a compile-time choice, and rdtscp is always slower than 
sidt, i've dropped the vgetcpu_mode variable.

i've also dropped the cache support in the sidt case -- depending on the 
compiler and cpu i found it to be 1 cycle slower than the uncached case, 
and it just doesn't seem worth the potential extra L1 traffic (besides if 
you add in the implied __thread overhead it's definitely a loss).

here are the before/after results:

             baseline   patched    patched
                        no cache   cache
k8 pre-revF  21         14         16
k8 revF      31         14         17
core2        38         12         17

sorry i don't have a handy EMT p4 on which i can install a 2.6.20-rc3 
kernel...  but based on userland-only comparisons of the sidt/lsl 
instructions i'll be amazed if this isn't a huge win on p4.

timing tools and test case can be found at 
http://arctic.org/~dean/vgetcpu/

-dean

Signed-off-by: dean gaudet <[EMAIL PROTECTED]>

Index: linux/arch/x86_64/kernel/time.c
===================================================================
--- linux.orig/arch/x86_64/kernel/time.c	2007-01-06 13:31:10.000000000 -0800
+++ linux/arch/x86_64/kernel/time.c	2007-01-06 16:04:01.000000000 -0800
@@ -957,11 +957,6 @@
if (unsynchronized_tsc())
notsc = 1;
 
-   if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
-   vgetcpu_mode = VGETCPU_RDTSCP;
-   else
-   vgetcpu_mode = VGETCPU_LSL;
-
if (vxtime.hpet_address && notsc) {
timetype = hpet_use_timer ? "HPET" : "PIT/HPET";
if (hpet_use_timer)
Index: linux/arch/x86_64/kernel/vsyscall.c
===================================================================
--- linux.orig/arch/x86_64/kernel/vsyscall.c	2007-01-06 13:31:10.000000000 -0800
+++ linux/arch/x86_64/kernel/vsyscall.c	2007-01-06 17:29:36.000000000 -0800
@@ -40,13 +40,18 @@
 #include <asm/segment.h>
 #include <asm/desc.h>
 #include <asm/topology.h>
+#include <asm/desc.h>
 
 #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
 #define __syscall_clobber "r11","rcx","memory"
 
 int __sysctl_vsyscall __section_sysctl_vsyscall = 1;
 seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;
-int __vgetcpu_mode __section_vgetcpu_mode;
+
+/* is this necessary? */
+#ifndef CONFIG_NODES_SHIFT
+#define CONFIG_NODES_SHIFT 0
+#endif
 
 #include <asm/unistd.h>
 
@@ -147,11 +152,21 @@
 long __vsyscall(2)
 vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
 {
-   unsigned int dummy, p;
+   unsigned int p;
+#ifdef VGETCPU_USE_SIDT
+   struct {
+   char pad[6];/* avoid unaligned stores */
+   u16 size;
+   u64 address;
+   } idt;
+
+   asm("sidt %0" : "=m" (idt.size));
+   p = idt.size - 0x1000;
+#else
unsigned long j = 0;
 
/* Fast cache - only recompute value once per jiffies and avoid
-  relatively costly rdtscp/cpuid otherwise.
+  relatively costly lsl otherwise.
   This works because the scheduler usually keeps the process
   on the same CPU and this syscall doesn't guarantee its
   results anyways.
@@ -160,21 +175,20 @@
   If you don't like it pass NULL. */
if (tcache && tcache->blob[0] == (j = __jiffies)) {
p = tcache->blob[1];
-   } else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
-   /* Load per CPU data from RDTSCP */
-   rdtscp(dummy, dummy, p);
-   } else {
+   }
+   else {
/* Load per CPU data from GDT */
asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
+   if (tcache) {
+   tcache->blob[0] = j;
+   tcache->blob[1] = p;
+   }
}
-   if (tcache) {
-   tcache->blob[0] = j;
-   tcache->blob[1] = p;
-   }
+#endif
if (cpu)
-   *cpu = p & 0xfff;
+   *cpu = p >> CONFIG_NODES_SHIFT;
if (node)
-   *node = p >> 12;
+   *node = p & ((1<<CONFIG_NODES_SHIFT)-1);
+   /* the encoding is stored in the per-cpu GDT descriptor limit
+  ((cpu_node_encoding >> 4) << 48 carries the high bits) so it can be read
+  in user space in vgetcpu. */
+   {
+   unsigned long *d;
+   d = (unsigned long *)(cpu_gdt(cpu) + GDT_ENTRY_PER_CPU);
+   *d = 0x0f400ULL;
+   *d |= cpu_node_encoding & 0x;
+
