Re: [PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support

2014-12-01 Thread Konrad Rzeszutek Wilk
On Tue, Nov 25, 2014 at 07:33:58PM -0500, Waiman Long wrote:
 On 10/27/2014 02:02 PM, Konrad Rzeszutek Wilk wrote:
 On Mon, Oct 27, 2014 at 01:38:20PM -0400, Waiman Long wrote:
 
 My concern is that spin_unlock() can be called in many places, including
 loadable kernel modules. Can the paravirt_patch_ident_32() function patch
 all of them in a reasonable time? How about a kernel module loaded later
 at run time?
 It has to. When the modules are loaded the .paravirt symbols are exposed
 and the module loader patches that.
 
 And during bootup time (before modules are loaded) it also patches everything
 - when it only runs on one CPU.
 
 
 I have been changing the patching code to patch the unlock call sites and it
 seems to be working now. However, when I manually inserted a kernel module
 using insmod and run the code in the newly inserted module, I got memory
 access violation as follows:
 
 BUG: unable to handle kernel NULL pointer dereference at   (null)
 IP: [  (null)]   (null)
 PGD 18d62f3067 PUD 18d476f067 PMD 0
 Oops: 0010 [#1] SMP
 Modules linked in: locktest(OE) ebtable_nat ebtables xt_CHECKSUM
 iptable_mangle bridge autofs4 8021q garp stp llc ipt_REJECT
 nf_conntrack_ipv4 nf_defrag_ipv4 iptable_filter ip_tables ip6t_REJECT
 nf_conntrack_ipv6 nf_defrag_ipv6 xt_state nf_conntrack ip6table_filter
 ip6_tables ipv6 vhost_net macvtap macvlan vhost tun uinput ppdev parport_pc
 parport sg microcode pcspkr virtio_balloon snd_hda_codec_generic
 virtio_console snd_hda_intel snd_hda_controller snd_hda_codec snd_hwdep
 snd_seq snd_seq_device snd_pcm snd_timer snd soundcore virtio_net i2c_piix4
 i2c_core ext4(E) jbd2(E) mbcache(E) floppy(E) virtio_blk(E) sr_mod(E)
 cdrom(E) virtio_pci(E) virtio_ring(E) virtio(E) pata_acpi(E) ata_generic(E)
 ata_piix(E) dm_mirror(E) dm_region_hash(E) dm_log(E) dm_mod(E) [last
 unloaded: speedstep_lib]
 CPU: 1 PID: 3907 Comm: run-locktest Tainted: GW  OE  3.17.0-pvqlock
 #3
 Hardware name: Red Hat KVM, BIOS Bochs 01/01/2011
 task: 8818cc5baf90 ti: 8818b7094000 task.ti: 8818b7094000
 RIP: 0010:[]  [  (null)]   (null)
 RSP: 0018:8818b7097db0  EFLAGS: 00010246
 RAX:  RBX: 004c4b40 RCX: 
 RDX: 0001 RSI:  RDI: 8818d3f052c0
 RBP: 8818b7097dd8 R08: 80522014 R09: 
 R10: 1000 R11: 0001 R12: 0001
 R13:  R14: 0001 R15: 8818b7097ea0
 FS:  7fb828ece700() GS:88193ec2() knlGS:
 CS:  0010 DS:  ES:  CR0: 8005003b
 CR2:  CR3: 0018cc7e9000 CR4: 06e0
 Stack:
  a06ff395 8818d465e000 8164bec0 0001
  0050 8818b7097e18 a06ff785 8818b7097e38
  0246 54755e3a 39f8ba72 8818c174f000
 Call Trace:
  [a06ff395] ? test_spinlock+0x65/0x90 [locktest]
  [a06ff785] etime_show+0xd5/0x120 [locktest]
  [812a2dc6] kobj_attr_show+0x16/0x20
  [8121a7fa] sysfs_kf_seq_show+0xca/0x1b0
  [81218a13] kernfs_seq_show+0x23/0x30
  [811c82db] seq_read+0xbb/0x400
  [812197e5] kernfs_fop_read+0x35/0x40
  [811a4223] vfs_read+0xa3/0x110
  [811a47e6] SyS_read+0x56/0xd0
  [810f3e16] ? __audit_syscall_exit+0x216/0x2c0
  [815b3ca9] system_call_fastpath+0x16/0x1b
 Code:  Bad RIP value.
  RSP 8818b7097db0
 CR2: 
 ---[ end trace 69d0e259c9ec632f ]---
 
 It seems like call site patching isn't properly done or the kernel module
 that I built was missing some critical information necessary for the proper

Did the readelf give you the paravirt note section?
 linking. Anyway, I will include the unlock call patching code as a separate
 patch as it seems there may be a problem under certain circumstances.

One way to troubleshoot those is to enable the paravirt patching code to
actually print where it is patching the code. That way when you load the
module you can confirm it has done its job.

Then you can verify that the address where the code is called:

a06ff395

is indeed patched. You might as well also do a hexdump in the module loading
to confirm that the patching had been done correctly.
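
For example, the queue_unlock case in native_patch() could temporarily log each
site it rewrites. A rough sketch only, expanding the PATCH_SITE() case by hand;
the pr_info() and its wording are illustrative and not part of any posted patch:

#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUE_SPINLOCK)
	/* Debug aid: log every unlock call site as it gets patched. The
	 * start_/end_ symbols are the ones DEF_NATIVE() generates for
	 * pv_lock_ops.queue_unlock in the hunk quoted earlier. */
	case PARAVIRT_PATCH(pv_lock_ops.queue_unlock):
		pr_info("pv: patching queue_unlock call site at %p (len %u)\n",
			ibuf, len);
		start = start_pv_lock_ops_queue_unlock;
		end   = end_pv_lock_ops_queue_unlock;
		goto patch_site;
#endif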
 
 BTW, the kernel panic problem that your team reported has been fixed. The
 fix will be in the next version of the patch.
 
 -Longman


Re: [PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support

2014-11-25 Thread Waiman Long

On 10/27/2014 02:02 PM, Konrad Rzeszutek Wilk wrote:

On Mon, Oct 27, 2014 at 01:38:20PM -0400, Waiman Long wrote:


My concern is that spin_unlock() can be called in many places, including
loadable kernel modules. Can the paravirt_patch_ident_32() function patch
all of them in a reasonable time? How about a kernel module loaded later
at run time?

It has to. When the modules are loaded the .paravirt symbols are exposed
and the module loader patches that.

And during bootup time (before modules are loaded) it also patches everything
- when it only runs on one CPU.



I have been changing the patching code to patch the unlock call sites 
and it seems to be working now. However, when I manually inserted a 
kernel module using insmod and run the code in the newly inserted 
module, I got memory access violation as follows:


BUG: unable to handle kernel NULL pointer dereference at   (null)
IP: [  (null)]   (null)
PGD 18d62f3067 PUD 18d476f067 PMD 0
Oops: 0010 [#1] SMP
Modules linked in: locktest(OE) ebtable_nat ebtables xt_CHECKSUM 
iptable_mangle bridge autofs4 8021q garp stp llc ipt_REJECT 
nf_conntrack_ipv4 nf_defrag_ipv4 iptable_filter ip_tables ip6t_REJECT 
nf_conntrack_ipv6 nf_defrag_ipv6 xt_state nf_conntrack ip6table_filter 
ip6_tables ipv6 vhost_net macvtap macvlan vhost tun uinput ppdev 
parport_pc parport sg microcode pcspkr virtio_balloon 
snd_hda_codec_generic virtio_console snd_hda_intel snd_hda_controller 
snd_hda_codec snd_hwdep snd_seq snd_seq_device snd_pcm snd_timer snd 
soundcore virtio_net i2c_piix4 i2c_core ext4(E) jbd2(E) mbcache(E) 
floppy(E) virtio_blk(E) sr_mod(E) cdrom(E) virtio_pci(E) virtio_ring(E) 
virtio(E) pata_acpi(E) ata_generic(E) ata_piix(E) dm_mirror(E) 
dm_region_hash(E) dm_log(E) dm_mod(E) [last unloaded: speedstep_lib]
CPU: 1 PID: 3907 Comm: run-locktest Tainted: GW  OE  
3.17.0-pvqlock #3

Hardware name: Red Hat KVM, BIOS Bochs 01/01/2011
task: 8818cc5baf90 ti: 8818b7094000 task.ti: 8818b7094000
RIP: 0010:[]  [  (null)]   (null)
RSP: 0018:8818b7097db0  EFLAGS: 00010246
RAX:  RBX: 004c4b40 RCX: 
RDX: 0001 RSI:  RDI: 8818d3f052c0
RBP: 8818b7097dd8 R08: 80522014 R09: 
R10: 1000 R11: 0001 R12: 0001
R13:  R14: 0001 R15: 8818b7097ea0
FS:  7fb828ece700() GS:88193ec2() knlGS:
CS:  0010 DS:  ES:  CR0: 8005003b
CR2:  CR3: 0018cc7e9000 CR4: 06e0
Stack:
 a06ff395 8818d465e000 8164bec0 0001
 0050 8818b7097e18 a06ff785 8818b7097e38
 0246 54755e3a 39f8ba72 8818c174f000
Call Trace:
 [a06ff395] ? test_spinlock+0x65/0x90 [locktest]
 [a06ff785] etime_show+0xd5/0x120 [locktest]
 [812a2dc6] kobj_attr_show+0x16/0x20
 [8121a7fa] sysfs_kf_seq_show+0xca/0x1b0
 [81218a13] kernfs_seq_show+0x23/0x30
 [811c82db] seq_read+0xbb/0x400
 [812197e5] kernfs_fop_read+0x35/0x40
 [811a4223] vfs_read+0xa3/0x110
 [811a47e6] SyS_read+0x56/0xd0
 [810f3e16] ? __audit_syscall_exit+0x216/0x2c0
 [815b3ca9] system_call_fastpath+0x16/0x1b
Code:  Bad RIP value.
 RSP 8818b7097db0
CR2: 
---[ end trace 69d0e259c9ec632f ]---

It seems like call site patching isn't properly done or the kernel 
module that I built was missing some critical information necessary for 
the proper linking. Anyway, I will include the unlock call patching code 
as a separate patch as it seems there may be a problem under certain 
circumstances.


BTW, the kernel panic problem that your team reported has been fixed. 
The fix will be in the next version of the patch.


-Longman


Re: [PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support

2014-10-29 Thread Waiman Long

On 10/27/2014 05:22 PM, Waiman Long wrote:

On 10/27/2014 02:04 PM, Peter Zijlstra wrote:

On Mon, Oct 27, 2014 at 01:38:20PM -0400, Waiman Long wrote:

On 10/24/2014 04:54 AM, Peter Zijlstra wrote:

On Thu, Oct 16, 2014 at 02:10:38PM -0400, Waiman Long wrote:

Since enabling paravirt spinlock will disable unlock function 
inlining,

a jump label can be added to the unlock function without adding patch
sites all over the kernel.

But you don't have to. My patches allowed for the inline to remain,
again reducing the overhead of enabling PV spinlocks while running 
on a

real machine.

Look at:

   http://lkml.kernel.org/r/20140615130154.213923...@chello.nl

In particular this hunk:

Index: linux-2.6/arch/x86/kernel/paravirt_patch_64.c
===
--- linux-2.6.orig/arch/x86/kernel/paravirt_patch_64.c
+++ linux-2.6/arch/x86/kernel/paravirt_patch_64.c
@@ -22,6 +22,10 @@ DEF_NATIVE(pv_cpu_ops, swapgs, swapgs)
  DEF_NATIVE(, mov32, "mov %edi, %eax");
  DEF_NATIVE(, mov64, "mov %rdi, %rax");

+#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUE_SPINLOCK)
+DEF_NATIVE(pv_lock_ops, queue_unlock, "movb $0, (%rdi)");
+#endif
+
  unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
  {
 return paravirt_patch_insns(insnbuf, len,
@@ -61,6 +65,9 @@ unsigned native_patch(u8 type, u16 clobb
 PATCH_SITE(pv_cpu_ops, clts);
 PATCH_SITE(pv_mmu_ops, flush_tlb_single);
 PATCH_SITE(pv_cpu_ops, wbinvd);
+#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUE_SPINLOCK)
+   PATCH_SITE(pv_lock_ops, queue_unlock);
+#endif

 patch_site:
 ret = paravirt_patch_insns(ibuf, len, start, end);


That makes sure to overwrite the callee-saved call to the
pv_lock_ops::queue_unlock with the immediate asm movb $0, (%rdi).


Therefore you can retain the inlined unlock with hardly (there 
might be

some NOP padding) any overhead at all. On PV it reverts to a callee
saved function call.
My concern is that spin_unlock() can be called in many places, including
loadable kernel modules. Can the paravirt_patch_ident_32() function patch
all of them in a reasonable time? How about a kernel module loaded later
at run time?

modules should be fine, see arch/x86/kernel/module.c:module_finalize()
-> apply_paravirt().

Also note that the 'default' text is an indirect call into the paravirt
ops table which routes to the 'right' function, so even if the text
patching would be 'late' calls would 'work' as expected, just slower.


Thanks for letting me know about that. I have this concern because 
your patch didn't change the current configuration of disabling unlock 
inlining when paravirt_spinlock is enabled. With that, I think it is 
worthwhile to reduce the performance delta between the PV and non-PV 
kernel on bare metal.


I am sorry that the unlock call site patching code doesn't work in a 
virtual guest. Your pvqspinlock patch did unconditional patching even 
in a virtual guest. I added a check for paravirt_spinlocks_enabled, 
but it turned out that some spin_unlock() seemed to be called before 
paravirt_spinlocks_enabled is set. As a result, some call sites were 
still patched, resulting in missed wakeups and a system hang.


At this point, I am going to leave out that change from my patch set 
until we can figure out a better way of doing that.


-Longman


Re: [PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support

2014-10-29 Thread Waiman Long

On 10/29/2014 03:05 PM, Waiman Long wrote:

On 10/27/2014 05:22 PM, Waiman Long wrote:

On 10/27/2014 02:04 PM, Peter Zijlstra wrote:

On Mon, Oct 27, 2014 at 01:38:20PM -0400, Waiman Long wrote:

On 10/24/2014 04:54 AM, Peter Zijlstra wrote:

On Thu, Oct 16, 2014 at 02:10:38PM -0400, Waiman Long wrote:

Since enabling paravirt spinlock will disable unlock function 
inlining,
a jump label can be added to the unlock function without adding 
patch

sites all over the kernel.

But you don't have to. My patches allowed for the inline to remain,
again reducing the overhead of enabling PV spinlocks while running 
on a

real machine.

Look at:

   http://lkml.kernel.org/r/20140615130154.213923...@chello.nl

In particular this hunk:

Index: linux-2.6/arch/x86/kernel/paravirt_patch_64.c
===
--- linux-2.6.orig/arch/x86/kernel/paravirt_patch_64.c
+++ linux-2.6/arch/x86/kernel/paravirt_patch_64.c
@@ -22,6 +22,10 @@ DEF_NATIVE(pv_cpu_ops, swapgs, swapgs)
  DEF_NATIVE(, mov32, "mov %edi, %eax");
  DEF_NATIVE(, mov64, "mov %rdi, %rax");

+#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUE_SPINLOCK)
+DEF_NATIVE(pv_lock_ops, queue_unlock, "movb $0, (%rdi)");
+#endif
+
  unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
  {
 return paravirt_patch_insns(insnbuf, len,
@@ -61,6 +65,9 @@ unsigned native_patch(u8 type, u16 clobb
 PATCH_SITE(pv_cpu_ops, clts);
 PATCH_SITE(pv_mmu_ops, flush_tlb_single);
 PATCH_SITE(pv_cpu_ops, wbinvd);
+#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUE_SPINLOCK)
+   PATCH_SITE(pv_lock_ops, queue_unlock);
+#endif

 patch_site:
 ret = paravirt_patch_insns(ibuf, len, start, end);


That makes sure to overwrite the callee-saved call to the
pv_lock_ops::queue_unlock with the immediate asm movb $0, (%rdi).


Therefore you can retain the inlined unlock with hardly (there 
might be

some NOP padding) any overhead at all. On PV it reverts to a callee
saved function call.
My concern is that spin_unlock() can be called in many places, including
loadable kernel modules. Can the paravirt_patch_ident_32() function patch
all of them in a reasonable time? How about a kernel module loaded later
at run time?

modules should be fine, see arch/x86/kernel/module.c:module_finalize()
-> apply_paravirt().

Also note that the 'default' text is an indirect call into the paravirt
ops table which routes to the 'right' function, so even if the text
patching would be 'late' calls would 'work' as expected, just slower.


Thanks for letting me know about that. I have this concern because 
your patch didn't change the current configuration of disabling 
unlock inlining when paravirt_spinlock is enabled. With that, I think 
it is worthwhile to reduce the performance delta between the PV and 
non-PV kernel on bare metal.


I am sorry that the unlock call site patching code doesn't work in a 
virtual guest. Your pvqspinlock patch did unconditional patching even 
in a virtual guest. I added a check for paravirt_spinlocks_enabled, 
but it turned out that some spin_unlock() seemed to be called before 
paravirt_spinlocks_enabled is set. As a result, some call sites were 
still patched, resulting in missed wakeups and a system hang.


At this point, I am going to leave out that change from my patch set 
until we can figure out a better way of doing that.




Below is a partial kernel log with the unlock call site patching code in a 
KVM guest:


[0.438006] native_patch: patch out pv_queue_unlock!
[0.438565] native_patch: patch out pv_queue_unlock!
[0.439006] native_patch: patch out pv_queue_unlock!
[0.439638] native_patch: patch out pv_queue_unlock!
[0.440052] native_patch: patch out pv_queue_unlock!
[0.441006] native_patch: patch out pv_queue_unlock!
[0.441566] native_patch: patch out pv_queue_unlock!
[0.442035] ftrace: allocating 24168 entries in 95 pages
[0.451208] Switched APIC routing to physical flat.
[0.453202] ..TIMER: vector=0x30 apic1=0 pin1=2 apic2=-1 pin2=-1
[0.454002] smpboot: CPU0: Intel QEMU Virtual CPU version 1.5.3 (fam: 
06, model: 06, stepping: 03)
[0.456000] Performance Events: Broken PMU hardware detected, using 
software events only.

[0.456003] Failed to access perfctr msr (MSR c1 is 0)
[0.457151] KVM setup paravirtual spinlock
[0.460039] NMI watchdog: disabled (cpu0): hardware events not enabled

It could be seen that some unlock call sites were patched before the KVM 
setup code set the paravirt_spinlocks_enabled flag.
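
In other words (a rough sketch with hypothetical names; the real flag and patch
hook are wired differently), a check inside the patching code only helps for
call sites that get patched after the KVM setup code has run:

/* Hypothetical illustration of the boot ordering problem. */
static bool paravirt_spinlocks_enabled;	/* only set later, by the KVM setup code */

unsigned example_patch_queue_unlock(void *ibuf, unsigned len)
{
	/*
	 * Built-in call sites are patched early in boot, so the flag is
	 * still false here and the site gets the native byte store even
	 * though this guest wants to keep calling the PV unlock slowpath.
	 */
	if (!paravirt_spinlocks_enabled)
		return patch_in_native_movb(ibuf, len);	/* hypothetical helper */
	return keep_pv_unlock_call(ibuf, len);		/* hypothetical helper */
}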


-Longman



Re: [PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support

2014-10-27 Thread Mike Galbraith
On Sat, 2014-10-25 at 00:04 +0200, Peter Zijlstra wrote: 
 On Fri, Oct 24, 2014 at 04:53:27PM -0400, Waiman Long wrote:
  The additional register pressure may just cause a few more register moves
  which should be negligible in the overall performance . The additional
  icache pressure, however, may have some impact on performance. I was trying
  to balance the performance of the pv and non-pv versions so that we won't
  penalize the pv code too much for a bit more performance in the non-pv code.
  Doing it your way will add a lot of function call and register
  saving/restoring to the pv code.
 
 If people care about performance they should not be using virt crap :-)

I tried some benching recently.. where did they hide the fastpaths? :)

-Mike



Re: [PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support

2014-10-27 Thread Waiman Long

On 10/24/2014 06:04 PM, Peter Zijlstra wrote:

On Fri, Oct 24, 2014 at 04:53:27PM -0400, Waiman Long wrote:

The additional register pressure may just cause a few more register moves
which should be negligible in the overall performance . The additional
icache pressure, however, may have some impact on performance. I was trying
to balance the performance of the pv and non-pv versions so that we won't
penalize the pv code too much for a bit more performance in the non-pv code.
Doing it your way will add a lot of function call and register
saving/restoring to the pv code.

If people care about performance they should not be using virt crap :-)

I only really care about bare metal.


Yes, I am aware of that. However, the whole point of doing PV spinlock 
is to improve performance in a virtual guest.


Anyway, I had done some measurements. In my test system, the 
queue_spin_lock_slowpath() function has a text size of about 400 bytes 
without PV, but 1120 bytes with PV. I made some changes to create 
separate versions of PV and non-PV slowpath functions as shown by the 
diff below. The text size is now about 430 bytes for the non-PV version 
and 925 bytes for the PV version. The overall object size increases by a 
bit more than 200 bytes, but the icache footprint should be reduced no 
matter which version is used.


-Longman



diff --git a/arch/x86/include/asm/pvqspinlock.h b/arch/x86/include/asm/pvqspinlock.h

index d424252..241bf30 100644
--- a/arch/x86/include/asm/pvqspinlock.h
+++ b/arch/x86/include/asm/pvqspinlock.h
@@ -79,9 +79,6 @@ static inline void pv_init_node(struct mcs_spinlock *node)

BUILD_BUG_ON(sizeof(struct pv_qnode) > 5*sizeof(struct mcs_spinlock));

-   if (!pv_enabled())
-   return;
-
pn->cpustate = PV_CPU_ACTIVE;
pn->mayhalt  = false;
pn->mycpu    = smp_processor_id();
@@ -132,9 +129,6 @@ static inline bool pv_link_and_wait_node(u32 old, struct mcs

struct pv_qnode *ppn, *pn = (struct pv_qnode *)node;
unsigned int count;

-   if (!pv_enabled())
-   return false;
-
if (!(old & _Q_TAIL_MASK)) {
node->locked = true;	/* At queue head now */
goto ret;
@@ -236,9 +230,6 @@ pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *no

 {
struct pv_qnode *pn = (struct pv_qnode *)node;

-   if (!pv_enabled())
-   return smp_load_acquire(&lock->val.counter);
-
for (;;) {
unsigned int count;
s8 oldstate;
@@ -328,8 +319,6 @@ static inline void pv_wait_check(struct qspinlock *lock,
struct pv_qnode *pnxt = (struct pv_qnode *)next;
struct pv_qnode *pcur = (struct pv_qnode *)node;

-   if (!pv_enabled())
-   return;
/*
 * Clear the locked and head values of lock holder
 */
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 1662dbd..05aea57 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -16,6 +16,7 @@
  * Authors: Waiman Long waiman.l...@hp.com
  *  Peter Zijlstra pzijl...@redhat.com
  */
+#ifndef _GEN_PV_LOCK_SLOWPATH
 #include <linux/smp.h>
 #include <linux/bug.h>
 #include <linux/cpumask.h>
@@ -271,19 +272,37 @@ void queue_spin_unlock_slowpath(struct qspinlock *lock)

 }
 EXPORT_SYMBOL(queue_spin_unlock_slowpath);

-#else
+static void pv_queue_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+
+#else  /* CONFIG_PARAVIRT_SPINLOCKS */
+
+static inline void pv_queue_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+   { }

-static inline void pv_init_node(struct mcs_spinlock *node) { }
-static inline void pv_wait_check(struct qspinlock *lock,
-struct mcs_spinlock *node,
-struct mcs_spinlock *next) { }
-static inline bool pv_link_and_wait_node(u32 old, struct mcs_spinlock *node)

+#endif /* CONFIG_PARAVIRT_SPINLOCKS */
+
+/*
+ * Dummy PV functions for bare-metal slowpath code
+ */
+static inline void nopv_init_node(struct mcs_spinlock *node)   { }
+static inline void nopv_wait_check(struct qspinlock *lock,
+  struct mcs_spinlock *node,
+  struct mcs_spinlock *next)   { }
+static inline bool nopv_link_and_wait_node(u32 old, struct mcs_spinlock *node)

   { return false; }
-static inline int  pv_wait_head(struct qspinlock *lock,
+static inline int  nopv_wait_head(struct qspinlock *lock,
struct mcs_spinlock *node)
   { return smp_load_acquire(&lock->val.counter); }
+static inline bool return_true(void)   { return true;  }
+static inline bool return_false(void)  { return false; }

-#endif /* CONFIG_PARAVIRT_SPINLOCKS */
+#define pv_init_node   nopv_init_node
+#define pv_wait_check  nopv_wait_check
+#define pv_link_and_wait_node  nopv_link_and_wait_node

Re: [PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support

2014-10-27 Thread Peter Zijlstra
On Mon, Oct 27, 2014 at 01:15:53PM -0400, Waiman Long wrote:
 On 10/24/2014 06:04 PM, Peter Zijlstra wrote:
 On Fri, Oct 24, 2014 at 04:53:27PM -0400, Waiman Long wrote:
 The additional register pressure may just cause a few more register moves
 which should be negligible in the overall performance . The additional
 icache pressure, however, may have some impact on performance. I was trying
 to balance the performance of the pv and non-pv versions so that we won't
 penalize the pv code too much for a bit more performance in the non-pv code.
 Doing it your way will add a lot of function call and register
 saving/restoring to the pv code.
 If people care about performance they should not be using virt crap :-)
 
 I only really care about bare metal.
 
 Yes, I am aware of that. However, the whole point of doing PV spinlock is to
 improve performance in a virtual guest.

Anything that avoids the lock holder preemption nonsense is a _massive_
win for them, a few function calls should not even register on that
scale.

 +#ifdef _GEN_PV_LOCK_SLOWPATH
 +static void pv_queue_spin_lock_slowpath(struct qspinlock *lock, u32 val)
 +#else
  void queue_spin_lock_slowpath(struct qspinlock *lock, u32 val)
 +#endif

If you have two functions you might as well use the PV stuff to patch in
the right function call at the usage sites and avoid:

 +   if (pv_enabled()) {
 +   pv_queue_spin_lock_slowpath(lock, val);
 +   return;
 +   }

this altogether.

 this_cpu_dec(mcs_nodes[0].count);
  }
  EXPORT_SYMBOL(queue_spin_lock_slowpath);
 +
 +#if !defined(_GEN_PV_LOCK_SLOWPATH)  defined(CONFIG_PARAVIRT_SPINLOCKS)
 +/*
 + * Generate the PV version of the queue_spin_lock_slowpath function
 + */
 +#undef pv_init_node
 +#undef pv_wait_check
 +#undef pv_link_and_wait_node
 +#undef pv_wait_head
 +#undef EXPORT_SYMBOL
 +#undef in_pv_code
 +
 +#define _GEN_PV_LOCK_SLOWPATH
 +#define EXPORT_SYMBOL(x)
 +#define in_pv_code return_true
 +#define pv_enabled return_false
 +
 +#include qspinlock.c
 +
 +#endif

That's properly disgusting :-) But a lot better than actually
duplicating everything I suppose.


Re: [PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support

2014-10-27 Thread Waiman Long

On 10/24/2014 04:54 AM, Peter Zijlstra wrote:

On Thu, Oct 16, 2014 at 02:10:38PM -0400, Waiman Long wrote:


Since enabling paravirt spinlock will disable unlock function inlining,
a jump label can be added to the unlock function without adding patch
sites all over the kernel.

But you don't have to. My patches allowed for the inline to remain,
again reducing the overhead of enabling PV spinlocks while running on a
real machine.

Look at:

   http://lkml.kernel.org/r/20140615130154.213923...@chello.nl

In particular this hunk:

Index: linux-2.6/arch/x86/kernel/paravirt_patch_64.c
===
--- linux-2.6.orig/arch/x86/kernel/paravirt_patch_64.c
+++ linux-2.6/arch/x86/kernel/paravirt_patch_64.c
@@ -22,6 +22,10 @@ DEF_NATIVE(pv_cpu_ops, swapgs, swapgs)
  DEF_NATIVE(, mov32, "mov %edi, %eax");
  DEF_NATIVE(, mov64, "mov %rdi, %rax");

+#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUE_SPINLOCK)
+DEF_NATIVE(pv_lock_ops, queue_unlock, "movb $0, (%rdi)");
+#endif
+
  unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
  {
 return paravirt_patch_insns(insnbuf, len,
@@ -61,6 +65,9 @@ unsigned native_patch(u8 type, u16 clobb
 PATCH_SITE(pv_cpu_ops, clts);
 PATCH_SITE(pv_mmu_ops, flush_tlb_single);
 PATCH_SITE(pv_cpu_ops, wbinvd);
+#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUE_SPINLOCK)
+   PATCH_SITE(pv_lock_ops, queue_unlock);
+#endif

 patch_site:
 ret = paravirt_patch_insns(ibuf, len, start, end);


That makes sure to overwrite the callee-saved call to the
pv_lock_ops::queue_unlock with the immediate asm movb $0, (%rdi).


Therefore you can retain the inlined unlock with hardly (there might be
some NOP padding) any overhead at all. On PV it reverts to a callee
saved function call.


My concern is that spin_unlock() can be called in many places, including 
loadable kernel modules. Can the paravirt_patch_ident_32() function 
patch all of them in a reasonable time? How about a kernel module 
loaded later at run time?


So I think we may still need to disable unlock function inlining even if 
we use your way of kernel site patching.


Regards,
Longman


Re: [PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support

2014-10-27 Thread Konrad Rzeszutek Wilk
On Mon, Oct 27, 2014 at 01:38:20PM -0400, Waiman Long wrote:
 On 10/24/2014 04:54 AM, Peter Zijlstra wrote:
 On Thu, Oct 16, 2014 at 02:10:38PM -0400, Waiman Long wrote:
 
 Since enabling paravirt spinlock will disable unlock function inlining,
 a jump label can be added to the unlock function without adding patch
 sites all over the kernel.
 But you don't have to. My patches allowed for the inline to remain,
 again reducing the overhead of enabling PV spinlocks while running on a
 real machine.
 
 Look at:
 
http://lkml.kernel.org/r/20140615130154.213923...@chello.nl
 
 In particular this hunk:
 
 Index: linux-2.6/arch/x86/kernel/paravirt_patch_64.c
 ===
 --- linux-2.6.orig/arch/x86/kernel/paravirt_patch_64.c
 +++ linux-2.6/arch/x86/kernel/paravirt_patch_64.c
 @@ -22,6 +22,10 @@ DEF_NATIVE(pv_cpu_ops, swapgs, swapgs)
   DEF_NATIVE(, mov32, "mov %edi, %eax");
   DEF_NATIVE(, mov64, "mov %rdi, %rax");
 
 +#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUE_SPINLOCK)
 +DEF_NATIVE(pv_lock_ops, queue_unlock, "movb $0, (%rdi)");
 +#endif
 +
   unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
   {
  return paravirt_patch_insns(insnbuf, len,
 @@ -61,6 +65,9 @@ unsigned native_patch(u8 type, u16 clobb
  PATCH_SITE(pv_cpu_ops, clts);
  PATCH_SITE(pv_mmu_ops, flush_tlb_single);
  PATCH_SITE(pv_cpu_ops, wbinvd);
 +#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUE_SPINLOCK)
 +   PATCH_SITE(pv_lock_ops, queue_unlock);
 +#endif
 
  patch_site:
  ret = paravirt_patch_insns(ibuf, len, start, end);
 
 
 That makes sure to overwrite the callee-saved call to the
 pv_lock_ops::queue_unlock with the immediate asm movb $0, (%rdi).
 
 
 Therefore you can retain the inlined unlock with hardly (there might be
 some NOP padding) any overhead at all. On PV it reverts to a callee
 saved function call.
 
 My concern is that spin_unlock() can be called in many places, including
 loadable kernel modules. Can the paravirt_patch_ident_32() function patch
 all of them in a reasonable time? How about a kernel module loaded later
 at run time?

It has to. When the modules are loaded the .paravirt symbols are exposed
and the module loader patches that.

And during bootup time (before modules are loaded) it also patches everything
- when it only runs on one CPU.
 
 So I think we may still need to disable unlock function inlining even if we
 use your way of kernel site patching.

No need. Inlining should be (and is) working just fine.
 
 Regards,
 Longman


Re: [PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support

2014-10-27 Thread Peter Zijlstra
On Mon, Oct 27, 2014 at 01:38:20PM -0400, Waiman Long wrote:
 On 10/24/2014 04:54 AM, Peter Zijlstra wrote:
 On Thu, Oct 16, 2014 at 02:10:38PM -0400, Waiman Long wrote:
 
 Since enabling paravirt spinlock will disable unlock function inlining,
 a jump label can be added to the unlock function without adding patch
 sites all over the kernel.
 But you don't have to. My patches allowed for the inline to remain,
 again reducing the overhead of enabling PV spinlocks while running on a
 real machine.
 
 Look at:
 
http://lkml.kernel.org/r/20140615130154.213923...@chello.nl
 
 In particular this hunk:
 
 Index: linux-2.6/arch/x86/kernel/paravirt_patch_64.c
 ===
 --- linux-2.6.orig/arch/x86/kernel/paravirt_patch_64.c
 +++ linux-2.6/arch/x86/kernel/paravirt_patch_64.c
 @@ -22,6 +22,10 @@ DEF_NATIVE(pv_cpu_ops, swapgs, swapgs)
   DEF_NATIVE(, mov32, "mov %edi, %eax");
   DEF_NATIVE(, mov64, "mov %rdi, %rax");
 
 +#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUE_SPINLOCK)
 +DEF_NATIVE(pv_lock_ops, queue_unlock, "movb $0, (%rdi)");
 +#endif
 +
   unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
   {
  return paravirt_patch_insns(insnbuf, len,
 @@ -61,6 +65,9 @@ unsigned native_patch(u8 type, u16 clobb
  PATCH_SITE(pv_cpu_ops, clts);
  PATCH_SITE(pv_mmu_ops, flush_tlb_single);
  PATCH_SITE(pv_cpu_ops, wbinvd);
 +#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUE_SPINLOCK)
 +   PATCH_SITE(pv_lock_ops, queue_unlock);
 +#endif
 
  patch_site:
  ret = paravirt_patch_insns(ibuf, len, start, end);
 
 
 That makes sure to overwrite the callee-saved call to the
 pv_lock_ops::queue_unlock with the immediate asm movb $0, (%rdi).
 
 
 Therefore you can retain the inlined unlock with hardly (there might be
 some NOP padding) any overhead at all. On PV it reverts to a callee
 saved function call.
 
 My concern is that spin_unlock() can be called in many places, including
 loadable kernel modules. Can the paravirt_patch_ident_32() function patch
 all of them in a reasonable time? How about a kernel module loaded later
 at run time?

modules should be fine, see arch/x86/kernel/module.c:module_finalize()
-> apply_paravirt().

Also note that the 'default' text is an indirect call into the paravirt
ops table which routes to the 'right' function, so even if the text
patching would be 'late' calls would 'work' as expected, just slower.
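
Put differently, before any patching the call site just bounces through the ops
table. A self-contained sketch of that shape (hypothetical type and function
names, not the kernel's actual definitions):

typedef unsigned char u8;	/* for the standalone sketch only */

/* Unpatched behaviour: an indirect call through the ops table entry, so
 * correctness never depends on when (or whether) the site gets patched. */
struct pv_lock_ops_sketch {
	void (*queue_unlock)(u8 *lock_byte);
};

static void native_queue_unlock_sketch(u8 *lock_byte)
{
	*lock_byte = 0;		/* what the patched "movb $0, (%rdi)" does inline */
}

static struct pv_lock_ops_sketch pv_lock_ops_sketch = {
	.queue_unlock = native_queue_unlock_sketch,
};

static inline void queue_spin_unlock_sketch(u8 *lock_byte)
{
	/* Slower but correct until apply_paravirt()/native_patch() rewrites
	 * this call site into the movb instruction on bare metal. */
	pv_lock_ops_sketch.queue_unlock(lock_byte);
}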


Re: [PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support

2014-10-27 Thread Waiman Long

On 10/27/2014 01:27 PM, Peter Zijlstra wrote:

On Mon, Oct 27, 2014 at 01:15:53PM -0400, Waiman Long wrote:

On 10/24/2014 06:04 PM, Peter Zijlstra wrote:

On Fri, Oct 24, 2014 at 04:53:27PM -0400, Waiman Long wrote:

The additional register pressure may just cause a few more register moves
which should be negligible in the overall performance . The additional
icache pressure, however, may have some impact on performance. I was trying
to balance the performance of the pv and non-pv versions so that we won't
penalize the pv code too much for a bit more performance in the non-pv code.
Doing it your way will add a lot of function call and register
saving/restoring to the pv code.

If people care about performance they should not be using virt crap :-)

I only really care about bare metal.

Yes, I am aware of that. However, the whole point of doing PV spinlock is to
improve performance in a virtual guest.

Anything that avoids the lock holder preemption nonsense is a _massive_
win for them, a few function calls should not even register on that
scale.


I would say all the PV stuff is mostly useful for an over-committed 
guest where a single CPU is shared by more than one guest. When the 
guests are not overcommitted, the PV code seldom gets triggered. In those 
cases, the overhead of the extra function calls and register 
saving/restoring will show up.



+#ifdef _GEN_PV_LOCK_SLOWPATH
+static void pv_queue_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+#else
  void queue_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+#endif

If you have two functions you might as well use the PV stuff to patch in
the right function call at the usage sites and avoid:


+   if (pv_enabled()) {
+   pv_queue_spin_lock_slowpath(lock, val);
+   return;
+   }

this altogether.


Good point! I will do some investigation on how to do this kind of 
function address patching and eliminate the extra function call overhead.



 this_cpu_dec(mcs_nodes[0].count);
  }
  EXPORT_SYMBOL(queue_spin_lock_slowpath);
+
+#if !defined(_GEN_PV_LOCK_SLOWPATH)  defined(CONFIG_PARAVIRT_SPINLOCKS)
+/*
+ * Generate the PV version of the queue_spin_lock_slowpath function
+ */
+#undef pv_init_node
+#undef pv_wait_check
+#undef pv_link_and_wait_node
+#undef pv_wait_head
+#undef EXPORT_SYMBOL
+#undef in_pv_code
+
+#define _GEN_PV_LOCK_SLOWPATH
+#define EXPORT_SYMBOL(x)
+#define in_pv_code return_true
+#define pv_enabled return_false
+
+#include qspinlock.c
+
+#endif

That's properly disgusting :-) But a lot better than actually
duplicating everything I suppose.


I know you don't like this kind of preprocessor trick, but this is the 
easiest way that I can think of to generate two separate functions from 
the same source code.
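
Outside the kernel the same trick boils down to re-including the source file
with different macro definitions. A standalone toy version (the file name
slowpath.c and all identifiers here are illustrative, not from the patch):

/* slowpath.c - generate two functions from one body via self-#include */
#ifndef _GEN_PV_SLOWPATH
#include <stdio.h>

#define pv_hook()	((void)0)		/* compiled out in the native variant */
#define SLOWPATH_NAME	native_slowpath
#else					/* second pass: generate the PV variant */
#undef  pv_hook
#undef  SLOWPATH_NAME
#define pv_hook()	printf("  pv hook ran\n")
#define SLOWPATH_NAME	pv_slowpath
#endif

void SLOWPATH_NAME(void)
{
	pv_hook();			/* shared body, expanded twice */
	printf("%s finished\n", __func__);
}

#ifndef _GEN_PV_SLOWPATH
#define _GEN_PV_SLOWPATH
#include "slowpath.c"			/* emit the PV variant from the same source */

int main(void)
{
	native_slowpath();
	pv_slowpath();
	return 0;
}
#endif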


-Longman


Re: [PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support

2014-10-27 Thread Waiman Long

On 10/27/2014 02:02 PM, Konrad Rzeszutek Wilk wrote:

On Mon, Oct 27, 2014 at 01:38:20PM -0400, Waiman Long wrote:

On 10/24/2014 04:54 AM, Peter Zijlstra wrote:

On Thu, Oct 16, 2014 at 02:10:38PM -0400, Waiman Long wrote:


Since enabling paravirt spinlock will disable unlock function inlining,
a jump label can be added to the unlock function without adding patch
sites all over the kernel.

But you don't have to. My patches allowed for the inline to remain,
again reducing the overhead of enabling PV spinlocks while running on a
real machine.

Look at:

   http://lkml.kernel.org/r/20140615130154.213923...@chello.nl

In particular this hunk:

Index: linux-2.6/arch/x86/kernel/paravirt_patch_64.c
===
--- linux-2.6.orig/arch/x86/kernel/paravirt_patch_64.c
+++ linux-2.6/arch/x86/kernel/paravirt_patch_64.c
@@ -22,6 +22,10 @@ DEF_NATIVE(pv_cpu_ops, swapgs, swapgs)
  DEF_NATIVE(, mov32, "mov %edi, %eax");
  DEF_NATIVE(, mov64, "mov %rdi, %rax");

+#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUE_SPINLOCK)
+DEF_NATIVE(pv_lock_ops, queue_unlock, "movb $0, (%rdi)");
+#endif
+
  unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
  {
 return paravirt_patch_insns(insnbuf, len,
@@ -61,6 +65,9 @@ unsigned native_patch(u8 type, u16 clobb
 PATCH_SITE(pv_cpu_ops, clts);
 PATCH_SITE(pv_mmu_ops, flush_tlb_single);
 PATCH_SITE(pv_cpu_ops, wbinvd);
+#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUE_SPINLOCK)
+   PATCH_SITE(pv_lock_ops, queue_unlock);
+#endif

 patch_site:
 ret = paravirt_patch_insns(ibuf, len, start, end);


That makes sure to overwrite the callee-saved call to the
pv_lock_ops::queue_unlock with the immediate asm movb $0, (%rdi).


Therefore you can retain the inlined unlock with hardly (there might be
some NOP padding) any overhead at all. On PV it reverts to a callee
saved function call.

My concern is that spin_unlock() can be called in many places, including
loadable kernel modules. Can the paravirt_patch_ident_32() function patch
all of them in a reasonable time? How about a kernel module loaded later
at run time?

It has to. When the modules are loaded the .paravirt symbols are exposed
and the module loader patches that.

And during bootup time (before modules are loaded) it also patches everything
- when it only runs on one CPU.

So I think we may still need to disable unlock function inlining even if we
use your way of kernel site patching.

No need. Inlining should be (and is) working just fine.

Regards,
Longman


Thanks for letting me know about the paravirt patching capability 
available in the kernel. In this case, I would say we should use Peter's 
way of doing unlock without disabling unlock function inlining. That 
will further reduce the performance difference of kernels with and 
without PV.


Cheers,
Longman


Re: [PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support

2014-10-27 Thread Waiman Long

On 10/27/2014 02:04 PM, Peter Zijlstra wrote:

On Mon, Oct 27, 2014 at 01:38:20PM -0400, Waiman Long wrote:

On 10/24/2014 04:54 AM, Peter Zijlstra wrote:

On Thu, Oct 16, 2014 at 02:10:38PM -0400, Waiman Long wrote:


Since enabling paravirt spinlock will disable unlock function inlining,
a jump label can be added to the unlock function without adding patch
sites all over the kernel.

But you don't have to. My patches allowed for the inline to remain,
again reducing the overhead of enabling PV spinlocks while running on a
real machine.

Look at:

   http://lkml.kernel.org/r/20140615130154.213923...@chello.nl

In particular this hunk:

Index: linux-2.6/arch/x86/kernel/paravirt_patch_64.c
===
--- linux-2.6.orig/arch/x86/kernel/paravirt_patch_64.c
+++ linux-2.6/arch/x86/kernel/paravirt_patch_64.c
@@ -22,6 +22,10 @@ DEF_NATIVE(pv_cpu_ops, swapgs, swapgs)
  DEF_NATIVE(, mov32, "mov %edi, %eax");
  DEF_NATIVE(, mov64, "mov %rdi, %rax");

+#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUE_SPINLOCK)
+DEF_NATIVE(pv_lock_ops, queue_unlock, "movb $0, (%rdi)");
+#endif
+
  unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
  {
 return paravirt_patch_insns(insnbuf, len,
@@ -61,6 +65,9 @@ unsigned native_patch(u8 type, u16 clobb
 PATCH_SITE(pv_cpu_ops, clts);
 PATCH_SITE(pv_mmu_ops, flush_tlb_single);
 PATCH_SITE(pv_cpu_ops, wbinvd);
+#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUE_SPINLOCK)
+   PATCH_SITE(pv_lock_ops, queue_unlock);
+#endif

 patch_site:
 ret = paravirt_patch_insns(ibuf, len, start, end);


That makes sure to overwrite the callee-saved call to the
pv_lock_ops::queue_unlock with the immediate asm movb $0, (%rdi).


Therefore you can retain the inlined unlock with hardly (there might be
some NOP padding) any overhead at all. On PV it reverts to a callee
saved function call.

My concern is that spin_unlock() can be called in many places, including
loadable kernel modules. Can the paravirt_patch_ident_32() function patch
all of them in a reasonable time? How about a kernel module loaded later
at run time?

modules should be fine, see arch/x86/kernel/module.c:module_finalize()
-> apply_paravirt().

Also note that the 'default' text is an indirect call into the paravirt
ops table which routes to the 'right' function, so even if the text
patching would be 'late' calls would 'work' as expected, just slower.


Thanks for letting me know about that. I have this concern because your 
patch didn't change the current configuration of disabling unlock 
inlining when paravirt_spinlock is enabled. With that, I think it is 
worthwhile to reduce the performance delta between the PV and non-PV 
kernel on bare metal.


-Longman


Re: [PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support

2014-10-24 Thread Peter Zijlstra
On Thu, Oct 16, 2014 at 02:10:38PM -0400, Waiman Long wrote:

 Since enabling paravirt spinlock will disable unlock function inlining,
 a jump label can be added to the unlock function without adding patch
 sites all over the kernel.

But you don't have to. My patches allowed for the inline to remain,
again reducing the overhead of enabling PV spinlocks while running on a
real machine.

Look at: 

  http://lkml.kernel.org/r/20140615130154.213923...@chello.nl

In particular this hunk:

Index: linux-2.6/arch/x86/kernel/paravirt_patch_64.c
===
--- linux-2.6.orig/arch/x86/kernel/paravirt_patch_64.c
+++ linux-2.6/arch/x86/kernel/paravirt_patch_64.c
@@ -22,6 +22,10 @@ DEF_NATIVE(pv_cpu_ops, swapgs, swapgs)
 DEF_NATIVE(, mov32, "mov %edi, %eax");
 DEF_NATIVE(, mov64, "mov %rdi, %rax");

+#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUE_SPINLOCK)
+DEF_NATIVE(pv_lock_ops, queue_unlock, "movb $0, (%rdi)");
+#endif
+ 
 unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
 {
return paravirt_patch_insns(insnbuf, len,
@@ -61,6 +65,9 @@ unsigned native_patch(u8 type, u16 clobb
PATCH_SITE(pv_cpu_ops, clts);
PATCH_SITE(pv_mmu_ops, flush_tlb_single);
PATCH_SITE(pv_cpu_ops, wbinvd);
+#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUE_SPINLOCK)
+   PATCH_SITE(pv_lock_ops, queue_unlock);
+#endif

patch_site:
ret = paravirt_patch_insns(ibuf, len, start, end);


That makes sure to overwrite the callee-saved call to the
pv_lock_ops::queue_unlock with the immediate asm movb $0, (%rdi).


Therefore you can retain the inlined unlock with hardly (there might be
some NOP padding) any overhead at all. On PV it reverts to a callee
saved function call.


Re: [PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support

2014-10-24 Thread Peter Zijlstra
On Thu, Oct 16, 2014 at 02:10:38PM -0400, Waiman Long wrote:
 +static inline void pv_init_node(struct mcs_spinlock *node)
 +{
 + struct pv_qnode *pn = (struct pv_qnode *)node;
 +
 + BUILD_BUG_ON(sizeof(struct pv_qnode) > 5*sizeof(struct mcs_spinlock));
 +
 + if (!pv_enabled())
 + return;
 +
 + pn->cpustate = PV_CPU_ACTIVE;
 + pn->mayhalt  = false;
 + pn->mycpu    = smp_processor_id();
 + pn->head     = PV_INVALID_HEAD;
 +}


 @@ -333,6 +393,7 @@ queue:
   node += idx;
   node->locked = 0;
   node->next = NULL;
 + pv_init_node(node);
  
   /*
* We touched a (possibly) cold cacheline in the per-cpu queue node;


So even if !pv_enabled() the compiler will still have to emit the code
for that inline, which will generate additional register pressure,
icache pressure and lovely stuff like that.

The patch I had used pv-ops for these things that would turn into NOPs
in the regular case and callee-saved function calls for the PV case.

That still does not entirely eliminate the cost, but it does reduce it
significantly. Please consider using that.
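
Roughly, each PV hook becomes a pv_lock_ops slot instead of an inline branch:
natively the slot is an empty function and the call site can be patched down to
NOPs, while a PV guest keeps a callee-saved call. A shape sketch only, with
hypothetical names (the real wiring goes through PVOP_VCALLEE1 and the
pv_lock_ops additions in the v12 patch elsewhere in this thread):

struct pv_lock_ops_shape {
	struct paravirt_callee_save init_node;	/* callee-saved calling convention */
};

/* Native: registered as a no-op, so bare metal pays (almost) nothing. */
static void native_init_node_sketch(struct mcs_spinlock *node) { }

/* PV guest: the hypervisor-aware implementation does the real work,
 * e.g. filling in the cpustate/mayhalt/mycpu/head fields shown above. */
static void kvm_init_node_sketch(struct mcs_spinlock *node)
{
	/* ... */
}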


Re: [PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support

2014-10-24 Thread Waiman Long

On 10/24/2014 04:47 AM, Peter Zijlstra wrote:

On Thu, Oct 16, 2014 at 02:10:38PM -0400, Waiman Long wrote:

+static inline void pv_init_node(struct mcs_spinlock *node)
+{
+   struct pv_qnode *pn = (struct pv_qnode *)node;
+
+   BUILD_BUG_ON(sizeof(struct pv_qnode) > 5*sizeof(struct mcs_spinlock));
+
+   if (!pv_enabled())
+   return;
+
+   pn->cpustate = PV_CPU_ACTIVE;
+   pn->mayhalt  = false;
+   pn->mycpu    = smp_processor_id();
+   pn->head     = PV_INVALID_HEAD;
+}



@@ -333,6 +393,7 @@ queue:
node += idx;
node->locked = 0;
node->next = NULL;
+   pv_init_node(node);

/*
 * We touched a (possibly) cold cacheline in the per-cpu queue node;


So even if !pv_enabled() the compiler will still have to emit the code
for that inline, which will generate additional register pressure,
icache pressure and lovely stuff like that.

The patch I had used pv-ops for these things that would turn into NOPs
in the regular case and callee-saved function calls for the PV case.

That still does not entirely eliminate the cost, but it does reduce it 
significantly. Please consider using that.


The additional register pressure may just cause a few more register 
moves, which should be negligible in the overall performance. The 
additional icache pressure, however, may have some impact on 
performance. I was trying to balance the performance of the pv and 
non-pv versions so that we won't penalize the pv code too much for a bit 
more performance in the non-pv code. Doing it your way will add a lot of 
function calls and register saving/restoring to the pv code.


Another alternative that I can think of is to generate 2 versions of the 
slowpath code - one pv and one non-pv out of the same source code. The 
non-pv code will call into the pv code once if pv is enabled. In this 
way, it won't increase the icache and register pressure of the non-pv 
code. However, this may make the source code a bit harder to read.


Please let me know your thought on this alternate approach.

-Longman




Re: [PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support

2014-10-24 Thread Peter Zijlstra
On Fri, Oct 24, 2014 at 04:53:27PM -0400, Waiman Long wrote:
 The additional register pressure may just cause a few more register moves
 which should be negligible in the overall performance . The additional
 icache pressure, however, may have some impact on performance. I was trying
 to balance the performance of the pv and non-pv versions so that we won't
 penalize the pv code too much for a bit more performance in the non-pv code.
 Doing it your way will add a lot of function call and register
 saving/restoring to the pv code.

If people care about performance they should not be using virt crap :-)

I only really care about bare metal.


[PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support

2014-10-16 Thread Waiman Long
This patch adds para-virtualization support to the queue spinlock
code base with minimal impact to the native case. There are some
minor code changes in the generic qspinlock.c file which should be
usable in other architectures. The other code changes are specific
to x86 processors and so are all put under the arch/x86 directory.

On the lock side, there are a couple of jump labels and 2 paravirt
callee-saved calls that default to NOPs and some register move
instructions. So the performance impact should be minimal.

Since enabling paravirt spinlock will disable unlock function inlining,
a jump label can be added to the unlock function without adding patch
sites all over the kernel.

The actual paravirt code comes in 5 parts:

 - init_node; this initializes the extra data members required for PV
   state. PV state data is kept 1 cacheline ahead of the regular data.

 - link_and_wait_node; this replaces the regular MCS queuing code. CPU
   halting can happen if the wait is too long.

 - wait_head; this waits until the lock is available and the CPU will
   be halted if the wait is too long.

 - wait_check; this is called after acquiring the lock to see if the
   next queue head CPU is halted. If this is the case, the lock bit is
   changed to indicate the queue head will have to be kicked on unlock.

 - queue_unlock;  this routine has a jump label to check if paravirt
   is enabled. If yes, it has to do an atomic cmpxchg to clear the lock
   bit or call the slowpath function to kick the queue head cpu.

Tracking the head is done in two parts, firstly the pv_wait_head will
store its cpu number in whichever node is pointed to by the tail part
of the lock word. Secondly, pv_link_and_wait_node() will propagate the
existing head from the old to the new tail node.
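
For the queue_unlock part, the shape described above is roughly the following
(a sketch only: the helper name, the use of paravirt_spinlocks_enabled as a
static key, and the exact failure check are illustrative; the real code is in
the arch/x86/include/asm/qspinlock.h changes of this patch):

static inline void queue_spin_unlock_sketch(struct qspinlock *lock)
{
	barrier();
	if (static_key_false(&paravirt_spinlocks_enabled)) {	/* jump label */
		/*
		 * PV: clear the locked byte atomically; if it no longer held
		 * the plain locked value, the queue head has halted and must
		 * be kicked via the slowpath.
		 */
		if (cmpxchg((u8 *)lock, _Q_LOCKED_VAL, 0) != _Q_LOCKED_VAL)
			queue_spin_unlock_slowpath(lock);
		return;
	}
	ACCESS_ONCE(*(u8 *)lock) = 0;	/* native: plain byte store */
}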

Signed-off-by: Waiman Long waiman.l...@hp.com
---
 arch/x86/include/asm/paravirt.h   |   20 ++
 arch/x86/include/asm/paravirt_types.h |   20 ++
 arch/x86/include/asm/pvqspinlock.h|  403 +
 arch/x86/include/asm/qspinlock.h  |   44 -
 arch/x86/kernel/paravirt-spinlocks.c  |6 +
 kernel/locking/qspinlock.c|   72 ++-
 6 files changed, 558 insertions(+), 7 deletions(-)
 create mode 100644 arch/x86/include/asm/pvqspinlock.h

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index cd6e161..3b041db 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -712,6 +712,25 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
 
 #if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS)
 
+#ifdef CONFIG_QUEUE_SPINLOCK
+
+static __always_inline void pv_kick_cpu(int cpu)
+{
+   PVOP_VCALLEE1(pv_lock_ops.kick_cpu, cpu);
+}
+
+static __always_inline void
+pv_lockwait(u8 *lockbyte)
+{
+   PVOP_VCALLEE1(pv_lock_ops.lockwait, lockbyte);
+}
+
+static __always_inline void pv_lockstat(enum pv_lock_stats type)
+{
+   PVOP_VCALLEE1(pv_lock_ops.lockstat, type);
+}
+
+#else
 static __always_inline void __ticket_lock_spinning(struct arch_spinlock *lock,
__ticket_t ticket)
 {
@@ -723,6 +742,7 @@ static __always_inline void __ticket_unlock_kick(struct arch_spinlock *lock,
 {
PVOP_VCALL2(pv_lock_ops.unlock_kick, lock, ticket);
 }
+#endif
 
 #endif
 
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 7549b8b..49e4b76 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -326,6 +326,9 @@ struct pv_mmu_ops {
   phys_addr_t phys, pgprot_t flags);
 };
 
+struct mcs_spinlock;
+struct qspinlock;
+
 struct arch_spinlock;
 #ifdef CONFIG_SMP
 #include <asm/spinlock_types.h>
@@ -333,9 +336,26 @@ struct arch_spinlock;
 typedef u16 __ticket_t;
 #endif
 
+#ifdef CONFIG_QUEUE_SPINLOCK
+enum pv_lock_stats {
+   PV_HALT_QHEAD,  /* Queue head halting   */
+   PV_HALT_QNODE,  /* Other queue node halting */
+   PV_HALT_ABORT,  /* Halting aborted  */
+   PV_WAKE_KICKED, /* Wakeup by kicking*/
+   PV_WAKE_SPURIOUS,   /* Spurious wakeup  */
+   PV_KICK_NOHALT  /* Kick but CPU not halted  */
+};
+#endif
+
 struct pv_lock_ops {
+#ifdef CONFIG_QUEUE_SPINLOCK
+   struct paravirt_callee_save kick_cpu;
+   struct paravirt_callee_save lockstat;
+   struct paravirt_callee_save lockwait;
+#else
struct paravirt_callee_save lock_spinning;
void (*unlock_kick)(struct arch_spinlock *lock, __ticket_t ticket);
+#endif
 };
 
 /* This contains all the paravirt structures: we get a convenient
diff --git a/arch/x86/include/asm/pvqspinlock.h b/arch/x86/include/asm/pvqspinlock.h
new file mode 100644
index 000..d424252
--- /dev/null
+++ b/arch/x86/include/asm/pvqspinlock.h
@@ -0,0 +1,403 @@
+#ifndef _ASM_X86_PVQSPINLOCK_H
+#define _ASM_X86_PVQSPINLOCK_H
+
+/*
+ *