Re: [PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support
On Tue, Nov 25, 2014 at 07:33:58PM -0500, Waiman Long wrote: On 10/27/2014 02:02 PM, Konrad Rzeszutek Wilk wrote: On Mon, Oct 27, 2014 at 01:38:20PM -0400, Waiman Long wrote: My concern is that spin_unlock() can be called in many places, including loadable kernel modules. Can the paravirt_patch_ident_32() function able to patch all of them in reasonable time? How about a kernel module loaded later at run time? It has too. When the modules are loaded the .paravirt symbols are exposed and the module loader patches that. And during bootup time (before modules are loaded) it also patches everything - when it only runs on one CPU. I have been changing the patching code to patch the unlock call sites and it seems to be working now. However, when I manually inserted a kernel module using insmod and run the code in the newly inserted module, I got memory access violation as follows: BUG: unable to handle kernel NULL pointer dereference at (null) IP: [ (null)] (null) PGD 18d62f3067 PUD 18d476f067 PMD 0 Oops: 0010 [#1] SMP Modules linked in: locktest(OE) ebtable_nat ebtables xt_CHECKSUM iptable_mangle bridge autofs4 8021q garp stp llc ipt_REJECT nf_conntrack_ipv4 nf_defrag_ipv4 iptable_filter ip_tables ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 xt_state nf_conntrack ip6table_filter ip6_tables ipv6 vhost_net macvtap macvlan vhost tun uinput ppdev parport_pc parport sg microcode pcspkr virtio_balloon snd_hda_codec_generic virtio_console snd_hda_intel snd_hda_controller snd_hda_codec snd_hwdep snd_seq snd_seq_device snd_pcm snd_timer snd soundcore virtio_net i2c_piix4 i2c_core ext4(E) jbd2(E) mbcache(E) floppy(E) virtio_blk(E) sr_mod(E) cdrom(E) virtio_pci(E) virtio_ring(E) virtio(E) pata_acpi(E) ata_generic(E) ata_piix(E) dm_mirror(E) dm_region_hash(E) dm_log(E) dm_mod(E) [last unloaded: speedstep_lib] CPU: 1 PID: 3907 Comm: run-locktest Tainted: GW OE 3.17.0-pvqlock #3 Hardware name: Red Hat KVM, BIOS Bochs 01/01/2011 task: 8818cc5baf90 ti: 8818b7094000 task.ti: 8818b7094000 RIP: 0010:[] [ (null)] (null) RSP: 0018:8818b7097db0 EFLAGS: 00010246 RAX: RBX: 004c4b40 RCX: RDX: 0001 RSI: RDI: 8818d3f052c0 RBP: 8818b7097dd8 R08: 80522014 R09: R10: 1000 R11: 0001 R12: 0001 R13: R14: 0001 R15: 8818b7097ea0 FS: 7fb828ece700() GS:88193ec2() knlGS: CS: 0010 DS: ES: CR0: 8005003b CR2: CR3: 0018cc7e9000 CR4: 06e0 Stack: a06ff395 8818d465e000 8164bec0 0001 0050 8818b7097e18 a06ff785 8818b7097e38 0246 54755e3a 39f8ba72 8818c174f000 Call Trace: [a06ff395] ? test_spinlock+0x65/0x90 [locktest] [a06ff785] etime_show+0xd5/0x120 [locktest] [812a2dc6] kobj_attr_show+0x16/0x20 [8121a7fa] sysfs_kf_seq_show+0xca/0x1b0 [81218a13] kernfs_seq_show+0x23/0x30 [811c82db] seq_read+0xbb/0x400 [812197e5] kernfs_fop_read+0x35/0x40 [811a4223] vfs_read+0xa3/0x110 [811a47e6] SyS_read+0x56/0xd0 [810f3e16] ? __audit_syscall_exit+0x216/0x2c0 [815b3ca9] system_call_fastpath+0x16/0x1b Code: Bad RIP value. RSP 8818b7097db0 CR2: ---[ end trace 69d0e259c9ec632f ]--- It seems like call site patching isn't properly done or the kernel module that I built was missing some critical information necessary for the proper Did the readelf give you the paravirt note section? linking. Anyway, I will include the unlock call patching code as a separate patch as it seems there may be problem under certain circumstance. one way to troubleshoot those is to enable the paravirt patching code to actually print where it is patching the code. That way when you load the module you can confirm it has done its job. Then you can verify that the address where the code is called: a06ff395 is indeed patched. You might as well also do a hexdump in the module loading to confim that the patching had been done correctly. BTW, the kernel panic problem that your team reported had been fixed. The fix will be in the next version of the patch. -Longman ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support
On 10/27/2014 02:02 PM, Konrad Rzeszutek Wilk wrote: On Mon, Oct 27, 2014 at 01:38:20PM -0400, Waiman Long wrote: My concern is that spin_unlock() can be called in many places, including loadable kernel modules. Can the paravirt_patch_ident_32() function able to patch all of them in reasonable time? How about a kernel module loaded later at run time? It has too. When the modules are loaded the .paravirt symbols are exposed and the module loader patches that. And during bootup time (before modules are loaded) it also patches everything - when it only runs on one CPU. I have been changing the patching code to patch the unlock call sites and it seems to be working now. However, when I manually inserted a kernel module using insmod and run the code in the newly inserted module, I got memory access violation as follows: BUG: unable to handle kernel NULL pointer dereference at (null) IP: [ (null)] (null) PGD 18d62f3067 PUD 18d476f067 PMD 0 Oops: 0010 [#1] SMP Modules linked in: locktest(OE) ebtable_nat ebtables xt_CHECKSUM iptable_mangle bridge autofs4 8021q garp stp llc ipt_REJECT nf_conntrack_ipv4 nf_defrag_ipv4 iptable_filter ip_tables ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 xt_state nf_conntrack ip6table_filter ip6_tables ipv6 vhost_net macvtap macvlan vhost tun uinput ppdev parport_pc parport sg microcode pcspkr virtio_balloon snd_hda_codec_generic virtio_console snd_hda_intel snd_hda_controller snd_hda_codec snd_hwdep snd_seq snd_seq_device snd_pcm snd_timer snd soundcore virtio_net i2c_piix4 i2c_core ext4(E) jbd2(E) mbcache(E) floppy(E) virtio_blk(E) sr_mod(E) cdrom(E) virtio_pci(E) virtio_ring(E) virtio(E) pata_acpi(E) ata_generic(E) ata_piix(E) dm_mirror(E) dm_region_hash(E) dm_log(E) dm_mod(E) [last unloaded: speedstep_lib] CPU: 1 PID: 3907 Comm: run-locktest Tainted: GW OE 3.17.0-pvqlock #3 Hardware name: Red Hat KVM, BIOS Bochs 01/01/2011 task: 8818cc5baf90 ti: 8818b7094000 task.ti: 8818b7094000 RIP: 0010:[] [ (null)] (null) RSP: 0018:8818b7097db0 EFLAGS: 00010246 RAX: RBX: 004c4b40 RCX: RDX: 0001 RSI: RDI: 8818d3f052c0 RBP: 8818b7097dd8 R08: 80522014 R09: R10: 1000 R11: 0001 R12: 0001 R13: R14: 0001 R15: 8818b7097ea0 FS: 7fb828ece700() GS:88193ec2() knlGS: CS: 0010 DS: ES: CR0: 8005003b CR2: CR3: 0018cc7e9000 CR4: 06e0 Stack: a06ff395 8818d465e000 8164bec0 0001 0050 8818b7097e18 a06ff785 8818b7097e38 0246 54755e3a 39f8ba72 8818c174f000 Call Trace: [a06ff395] ? test_spinlock+0x65/0x90 [locktest] [a06ff785] etime_show+0xd5/0x120 [locktest] [812a2dc6] kobj_attr_show+0x16/0x20 [8121a7fa] sysfs_kf_seq_show+0xca/0x1b0 [81218a13] kernfs_seq_show+0x23/0x30 [811c82db] seq_read+0xbb/0x400 [812197e5] kernfs_fop_read+0x35/0x40 [811a4223] vfs_read+0xa3/0x110 [811a47e6] SyS_read+0x56/0xd0 [810f3e16] ? __audit_syscall_exit+0x216/0x2c0 [815b3ca9] system_call_fastpath+0x16/0x1b Code: Bad RIP value. RSP 8818b7097db0 CR2: ---[ end trace 69d0e259c9ec632f ]--- It seems like call site patching isn't properly done or the kernel module that I built was missing some critical information necessary for the proper linking. Anyway, I will include the unlock call patching code as a separate patch as it seems there may be problem under certain circumstance. BTW, the kernel panic problem that your team reported had been fixed. The fix will be in the next version of the patch. -Longman ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support
On 10/27/2014 05:22 PM, Waiman Long wrote: On 10/27/2014 02:04 PM, Peter Zijlstra wrote: On Mon, Oct 27, 2014 at 01:38:20PM -0400, Waiman Long wrote: On 10/24/2014 04:54 AM, Peter Zijlstra wrote: On Thu, Oct 16, 2014 at 02:10:38PM -0400, Waiman Long wrote: Since enabling paravirt spinlock will disable unlock function inlining, a jump label can be added to the unlock function without adding patch sites all over the kernel. But you don't have to. My patches allowed for the inline to remain, again reducing the overhead of enabling PV spinlocks while running on a real machine. Look at: http://lkml.kernel.org/r/20140615130154.213923...@chello.nl In particular this hunk: Index: linux-2.6/arch/x86/kernel/paravirt_patch_64.c === --- linux-2.6.orig/arch/x86/kernel/paravirt_patch_64.c +++ linux-2.6/arch/x86/kernel/paravirt_patch_64.c @@ -22,6 +22,10 @@ DEF_NATIVE(pv_cpu_ops, swapgs, swapgs) DEF_NATIVE(, mov32, mov %edi, %eax); DEF_NATIVE(, mov64, mov %rdi, %rax); +#if defined(CONFIG_PARAVIRT_SPINLOCKS) defined(CONFIG_QUEUE_SPINLOCK) +DEF_NATIVE(pv_lock_ops, queue_unlock, movb $0, (%rdi)); +#endif + unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) { return paravirt_patch_insns(insnbuf, len, @@ -61,6 +65,9 @@ unsigned native_patch(u8 type, u16 clobb PATCH_SITE(pv_cpu_ops, clts); PATCH_SITE(pv_mmu_ops, flush_tlb_single); PATCH_SITE(pv_cpu_ops, wbinvd); +#if defined(CONFIG_PARAVIRT_SPINLOCKS) defined(CONFIG_QUEUE_SPINLOCK) + PATCH_SITE(pv_lock_ops, queue_unlock); +#endif patch_site: ret = paravirt_patch_insns(ibuf, len, start, end); That makes sure to overwrite the callee-saved call to the pv_lock_ops::queue_unlock with the immediate asm movb $0, (%rdi). Therefore you can retain the inlined unlock with hardly (there might be some NOP padding) any overhead at all. On PV it reverts to a callee saved function call. My concern is that spin_unlock() can be called in many places, including loadable kernel modules. Can the paravirt_patch_ident_32() function able to patch all of them in reasonable time? How about a kernel module loaded later at run time? modules should be fine, see arch/x86/kernel/module.c:module_finalize() - apply_paravirt(). Also note that the 'default' text is an indirect call into the paravirt ops table which routes to the 'right' function, so even if the text patching would be 'late' calls would 'work' as expected, just slower. Thanks for letting me know about that. I have this concern because your patch didn't change the current configuration of disabling unlock inlining when paravirt_spinlock is enabled. With that, I think it is worthwhile to reduce the performance delta between the PV and non-PV kernel on bare metal. I am sorry that the unlock call sites patching code doesn't work in a virtual guest. Your pvqspinlock patch did an unconditional patching even in a virtual guest. I added check for the paravirt_spinlocks_enabled, but it turned out that some spin_unlock() seemed to be called before paravirt_spinlocks_enabled is set. As a result, some call sites were still patched resulting in missed wake up's and system hang. At this point, I am going to leave out that change from my patch set until we can figure out a better way of doing that. -Longman ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support
On 10/29/2014 03:05 PM, Waiman Long wrote: On 10/27/2014 05:22 PM, Waiman Long wrote: On 10/27/2014 02:04 PM, Peter Zijlstra wrote: On Mon, Oct 27, 2014 at 01:38:20PM -0400, Waiman Long wrote: On 10/24/2014 04:54 AM, Peter Zijlstra wrote: On Thu, Oct 16, 2014 at 02:10:38PM -0400, Waiman Long wrote: Since enabling paravirt spinlock will disable unlock function inlining, a jump label can be added to the unlock function without adding patch sites all over the kernel. But you don't have to. My patches allowed for the inline to remain, again reducing the overhead of enabling PV spinlocks while running on a real machine. Look at: http://lkml.kernel.org/r/20140615130154.213923...@chello.nl In particular this hunk: Index: linux-2.6/arch/x86/kernel/paravirt_patch_64.c === --- linux-2.6.orig/arch/x86/kernel/paravirt_patch_64.c +++ linux-2.6/arch/x86/kernel/paravirt_patch_64.c @@ -22,6 +22,10 @@ DEF_NATIVE(pv_cpu_ops, swapgs, swapgs) DEF_NATIVE(, mov32, mov %edi, %eax); DEF_NATIVE(, mov64, mov %rdi, %rax); +#if defined(CONFIG_PARAVIRT_SPINLOCKS) defined(CONFIG_QUEUE_SPINLOCK) +DEF_NATIVE(pv_lock_ops, queue_unlock, movb $0, (%rdi)); +#endif + unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) { return paravirt_patch_insns(insnbuf, len, @@ -61,6 +65,9 @@ unsigned native_patch(u8 type, u16 clobb PATCH_SITE(pv_cpu_ops, clts); PATCH_SITE(pv_mmu_ops, flush_tlb_single); PATCH_SITE(pv_cpu_ops, wbinvd); +#if defined(CONFIG_PARAVIRT_SPINLOCKS) defined(CONFIG_QUEUE_SPINLOCK) + PATCH_SITE(pv_lock_ops, queue_unlock); +#endif patch_site: ret = paravirt_patch_insns(ibuf, len, start, end); That makes sure to overwrite the callee-saved call to the pv_lock_ops::queue_unlock with the immediate asm movb $0, (%rdi). Therefore you can retain the inlined unlock with hardly (there might be some NOP padding) any overhead at all. On PV it reverts to a callee saved function call. My concern is that spin_unlock() can be called in many places, including loadable kernel modules. Can the paravirt_patch_ident_32() function able to patch all of them in reasonable time? How about a kernel module loaded later at run time? modules should be fine, see arch/x86/kernel/module.c:module_finalize() - apply_paravirt(). Also note that the 'default' text is an indirect call into the paravirt ops table which routes to the 'right' function, so even if the text patching would be 'late' calls would 'work' as expected, just slower. Thanks for letting me know about that. I have this concern because your patch didn't change the current configuration of disabling unlock inlining when paravirt_spinlock is enabled. With that, I think it is worthwhile to reduce the performance delta between the PV and non-PV kernel on bare metal. I am sorry that the unlock call sites patching code doesn't work in a virtual guest. Your pvqspinlock patch did an unconditional patching even in a virtual guest. I added check for the paravirt_spinlocks_enabled, but it turned out that some spin_unlock() seemed to be called before paravirt_spinlocks_enabled is set. As a result, some call sites were still patched resulting in missed wake up's and system hang. At this point, I am going to leave out that change from my patch set until we can figure out a better way of doing that. Below was a partial kernel log with the unlock call site patch code in a KVM guest: [0.438006] native_patch: patch out pv_queue_unlock! [0.438565] native_patch: patch out pv_queue_unlock! [0.439006] native_patch: patch out pv_queue_unlock! [0.439638] native_patch: patch out pv_queue_unlock! [0.440052] native_patch: patch out pv_queue_unlock! [0.441006] native_patch: patch out pv_queue_unlock! [0.441566] native_patch: patch out pv_queue_unlock! [0.442035] ftrace: allocating 24168 entries in 95 pages [0.451208] Switched APIC routing to physical flat. [0.453202] ..TIMER: vector=0x30 apic1=0 pin1=2 apic2=-1 pin2=-1 [0.454002] smpboot: CPU0: Intel QEMU Virtual CPU version 1.5.3 (fam: 06, model: 06, stepping: 03) [0.456000] Performance Events: Broken PMU hardware detected, using software events only. [0.456003] Failed to access perfctr msr (MSR c1 is 0) [0.457151] KVM setup paravirtual spinlock [0.460039] NMI watchdog: disabled (cpu0): hardware events not enabled It could be seen that some unlock call sites were patched before the KVM setup code set the paravirt_spinlocks_enabled flag. -Longman ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support
On Sat, 2014-10-25 at 00:04 +0200, Peter Zijlstra wrote: On Fri, Oct 24, 2014 at 04:53:27PM -0400, Waiman Long wrote: The additional register pressure may just cause a few more register moves which should be negligible in the overall performance . The additional icache pressure, however, may have some impact on performance. I was trying to balance the performance of the pv and non-pv versions so that we won't penalize the pv code too much for a bit more performance in the non-pv code. Doing it your way will add a lot of function call and register saving/restoring to the pv code. If people care about performance they should not be using virt crap :-) I tried some benching recently.. where did they hide the fastpaths? :) -Mike ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support
On 10/24/2014 06:04 PM, Peter Zijlstra wrote: On Fri, Oct 24, 2014 at 04:53:27PM -0400, Waiman Long wrote: The additional register pressure may just cause a few more register moves which should be negligible in the overall performance . The additional icache pressure, however, may have some impact on performance. I was trying to balance the performance of the pv and non-pv versions so that we won't penalize the pv code too much for a bit more performance in the non-pv code. Doing it your way will add a lot of function call and register saving/restoring to the pv code. If people care about performance they should not be using virt crap :-) I only really care about bare metal. Yes, I am aware of that. However, the whole point of doing PV spinlock is to improve performance in a virtual guest. Anyway, I had done some measurements. In my test system, the queue_spin_lock_slowpath() function has a text size of about 400 bytes without PV, but 1120 bytes with PV. I made some changes to create separate versions of PV and non-PV slowpath functions as shown by the diff below. The text size is now about 430 bytes for the non-PV version and 925 bytes for the PV version. The overall object size increases by a bit more than 200 bytes, but the icache footprint should be reduced no matter which version is used. -Longman diff --git a/arch/x86/include/asm/pvqspinlock.h b/arch/x86/include/asm/pvqspinlo index d424252..241bf30 100644 --- a/arch/x86/include/asm/pvqspinlock.h +++ b/arch/x86/include/asm/pvqspinlock.h @@ -79,9 +79,6 @@ static inline void pv_init_node(struct mcs_spinlock *node) BUILD_BUG_ON(sizeof(struct pv_qnode) 5*sizeof(struct mcs_spinlock)); - if (!pv_enabled()) - return; - pn-cpustate = PV_CPU_ACTIVE; pn-mayhalt = false; pn-mycpu= smp_processor_id(); @@ -132,9 +129,6 @@ static inline bool pv_link_and_wait_node(u32 old, struct mcs struct pv_qnode *ppn, *pn = (struct pv_qnode *)node; unsigned int count; - if (!pv_enabled()) - return false; - if (!(old _Q_TAIL_MASK)) { node-locked = true;/* At queue head now */ goto ret; @@ -236,9 +230,6 @@ pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *no { struct pv_qnode *pn = (struct pv_qnode *)node; - if (!pv_enabled()) - return smp_load_acquire(lock-val.counter); - for (;;) { unsigned int count; s8 oldstate; @@ -328,8 +319,6 @@ static inline void pv_wait_check(struct qspinlock *lock, struct pv_qnode *pnxt = (struct pv_qnode *)next; struct pv_qnode *pcur = (struct pv_qnode *)node; - if (!pv_enabled()) - return; /* * Clear the locked and head values of lock holder */ diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 1662dbd..05aea57 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -16,6 +16,7 @@ * Authors: Waiman Long waiman.l...@hp.com * Peter Zijlstra pzijl...@redhat.com */ +#ifndef _GEN_PV_LOCK_SLOWPATH #include linux/smp.h #include linux/bug.h #include linux/cpumask.h @@ -271,19 +272,37 @@ void queue_spin_unlock_slowpath(struct qspinlock *lock) } EXPORT_SYMBOL(queue_spin_unlock_slowpath); -#else +static void pv_queue_spin_lock_slowpath(struct qspinlock *lock, u32 val); + +#else /* CONFIG_PARAVIRT_SPINLOCKS */ + +static inline void pv_queue_spin_lock_slowpath(struct qspinlock *lock, u32 val) + { } -static inline void pv_init_node(struct mcs_spinlock *node) { } -static inline void pv_wait_check(struct qspinlock *lock, -struct mcs_spinlock *node, -struct mcs_spinlock *next) { } -static inline bool pv_link_and_wait_node(u32 old, struct mcs_spinlock *node) +#endif /* CONFIG_PARAVIRT_SPINLOCKS */ + +/* + * Dummy PV functions for bare-metal slowpath code + */ +static inline void nopv_init_node(struct mcs_spinlock *node) { } +static inline void nopv_wait_check(struct qspinlock *lock, + struct mcs_spinlock *node, + struct mcs_spinlock *next) { } +static inline bool nopv_link_and_wait_node(u32 old, struct mcs_spinlock *node) { return false; } -static inline int pv_wait_head(struct qspinlock *lock, +static inline int nopv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) { return smp_load_acquire(lock-val.counter); } +static inline bool return_true(void) { return true; } +static inline bool return_false(void) { return false; } -#endif /* CONFIG_PARAVIRT_SPINLOCKS */ +#define pv_init_node nopv_init_node +#define pv_wait_check nopv_wait_check +#define pv_link_and_wait_node nopv_link_and_wait_node
Re: [PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support
On Mon, Oct 27, 2014 at 01:15:53PM -0400, Waiman Long wrote: On 10/24/2014 06:04 PM, Peter Zijlstra wrote: On Fri, Oct 24, 2014 at 04:53:27PM -0400, Waiman Long wrote: The additional register pressure may just cause a few more register moves which should be negligible in the overall performance . The additional icache pressure, however, may have some impact on performance. I was trying to balance the performance of the pv and non-pv versions so that we won't penalize the pv code too much for a bit more performance in the non-pv code. Doing it your way will add a lot of function call and register saving/restoring to the pv code. If people care about performance they should not be using virt crap :-) I only really care about bare metal. Yes, I am aware of that. However, the whole point of doing PV spinlock is to improve performance in a virtual guest. Anything that avoids the lock holder preemption nonsense is a _massive_ win for them, a few function calls should not even register on that scale. +#ifdef _GEN_PV_LOCK_SLOWPATH +static void pv_queue_spin_lock_slowpath(struct qspinlock *lock, u32 val) +#else void queue_spin_lock_slowpath(struct qspinlock *lock, u32 val) +#endif If you have two functions you might as well use the PV stuff to patch in the right function call at the usage sites and avoid: + if (pv_enabled()) { + pv_queue_spin_lock_slowpath(lock, val); + return; + } this alltogether. this_cpu_dec(mcs_nodes[0].count); } EXPORT_SYMBOL(queue_spin_lock_slowpath); + +#if !defined(_GEN_PV_LOCK_SLOWPATH) defined(CONFIG_PARAVIRT_SPINLOCKS) +/* + * Generate the PV version of the queue_spin_lock_slowpath function + */ +#undef pv_init_node +#undef pv_wait_check +#undef pv_link_and_wait_node +#undef pv_wait_head +#undef EXPORT_SYMBOL +#undef in_pv_code + +#define _GEN_PV_LOCK_SLOWPATH +#define EXPORT_SYMBOL(x) +#define in_pv_code return_true +#define pv_enabled return_false + +#include qspinlock.c + +#endif That's properly disgusting :-) But a lot better than actually duplicating everything I suppose. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support
On 10/24/2014 04:54 AM, Peter Zijlstra wrote: On Thu, Oct 16, 2014 at 02:10:38PM -0400, Waiman Long wrote: Since enabling paravirt spinlock will disable unlock function inlining, a jump label can be added to the unlock function without adding patch sites all over the kernel. But you don't have to. My patches allowed for the inline to remain, again reducing the overhead of enabling PV spinlocks while running on a real machine. Look at: http://lkml.kernel.org/r/20140615130154.213923...@chello.nl In particular this hunk: Index: linux-2.6/arch/x86/kernel/paravirt_patch_64.c === --- linux-2.6.orig/arch/x86/kernel/paravirt_patch_64.c +++ linux-2.6/arch/x86/kernel/paravirt_patch_64.c @@ -22,6 +22,10 @@ DEF_NATIVE(pv_cpu_ops, swapgs, swapgs) DEF_NATIVE(, mov32, mov %edi, %eax); DEF_NATIVE(, mov64, mov %rdi, %rax); +#if defined(CONFIG_PARAVIRT_SPINLOCKS) defined(CONFIG_QUEUE_SPINLOCK) +DEF_NATIVE(pv_lock_ops, queue_unlock, movb $0, (%rdi)); +#endif + unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) { return paravirt_patch_insns(insnbuf, len, @@ -61,6 +65,9 @@ unsigned native_patch(u8 type, u16 clobb PATCH_SITE(pv_cpu_ops, clts); PATCH_SITE(pv_mmu_ops, flush_tlb_single); PATCH_SITE(pv_cpu_ops, wbinvd); +#if defined(CONFIG_PARAVIRT_SPINLOCKS) defined(CONFIG_QUEUE_SPINLOCK) + PATCH_SITE(pv_lock_ops, queue_unlock); +#endif patch_site: ret = paravirt_patch_insns(ibuf, len, start, end); That makes sure to overwrite the callee-saved call to the pv_lock_ops::queue_unlock with the immediate asm movb $0, (%rdi). Therefore you can retain the inlined unlock with hardly (there might be some NOP padding) any overhead at all. On PV it reverts to a callee saved function call. My concern is that spin_unlock() can be called in many places, including loadable kernel modules. Can the paravirt_patch_ident_32() function able to patch all of them in reasonable time? How about a kernel module loaded later at run time? So I think we may still need to disable unlock function inlining even if we used your way kernel site patching. Regards, Longman ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support
On Mon, Oct 27, 2014 at 01:38:20PM -0400, Waiman Long wrote: On 10/24/2014 04:54 AM, Peter Zijlstra wrote: On Thu, Oct 16, 2014 at 02:10:38PM -0400, Waiman Long wrote: Since enabling paravirt spinlock will disable unlock function inlining, a jump label can be added to the unlock function without adding patch sites all over the kernel. But you don't have to. My patches allowed for the inline to remain, again reducing the overhead of enabling PV spinlocks while running on a real machine. Look at: http://lkml.kernel.org/r/20140615130154.213923...@chello.nl In particular this hunk: Index: linux-2.6/arch/x86/kernel/paravirt_patch_64.c === --- linux-2.6.orig/arch/x86/kernel/paravirt_patch_64.c +++ linux-2.6/arch/x86/kernel/paravirt_patch_64.c @@ -22,6 +22,10 @@ DEF_NATIVE(pv_cpu_ops, swapgs, swapgs) DEF_NATIVE(, mov32, mov %edi, %eax); DEF_NATIVE(, mov64, mov %rdi, %rax); +#if defined(CONFIG_PARAVIRT_SPINLOCKS) defined(CONFIG_QUEUE_SPINLOCK) +DEF_NATIVE(pv_lock_ops, queue_unlock, movb $0, (%rdi)); +#endif + unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) { return paravirt_patch_insns(insnbuf, len, @@ -61,6 +65,9 @@ unsigned native_patch(u8 type, u16 clobb PATCH_SITE(pv_cpu_ops, clts); PATCH_SITE(pv_mmu_ops, flush_tlb_single); PATCH_SITE(pv_cpu_ops, wbinvd); +#if defined(CONFIG_PARAVIRT_SPINLOCKS) defined(CONFIG_QUEUE_SPINLOCK) + PATCH_SITE(pv_lock_ops, queue_unlock); +#endif patch_site: ret = paravirt_patch_insns(ibuf, len, start, end); That makes sure to overwrite the callee-saved call to the pv_lock_ops::queue_unlock with the immediate asm movb $0, (%rdi). Therefore you can retain the inlined unlock with hardly (there might be some NOP padding) any overhead at all. On PV it reverts to a callee saved function call. My concern is that spin_unlock() can be called in many places, including loadable kernel modules. Can the paravirt_patch_ident_32() function able to patch all of them in reasonable time? How about a kernel module loaded later at run time? It has too. When the modules are loaded the .paravirt symbols are exposed and the module loader patches that. And during bootup time (before modules are loaded) it also patches everything - when it only runs on one CPU. So I think we may still need to disable unlock function inlining even if we used your way kernel site patching. No need. Inline should (And is) working just fine. Regards, Longman ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support
On Mon, Oct 27, 2014 at 01:38:20PM -0400, Waiman Long wrote: On 10/24/2014 04:54 AM, Peter Zijlstra wrote: On Thu, Oct 16, 2014 at 02:10:38PM -0400, Waiman Long wrote: Since enabling paravirt spinlock will disable unlock function inlining, a jump label can be added to the unlock function without adding patch sites all over the kernel. But you don't have to. My patches allowed for the inline to remain, again reducing the overhead of enabling PV spinlocks while running on a real machine. Look at: http://lkml.kernel.org/r/20140615130154.213923...@chello.nl In particular this hunk: Index: linux-2.6/arch/x86/kernel/paravirt_patch_64.c === --- linux-2.6.orig/arch/x86/kernel/paravirt_patch_64.c +++ linux-2.6/arch/x86/kernel/paravirt_patch_64.c @@ -22,6 +22,10 @@ DEF_NATIVE(pv_cpu_ops, swapgs, swapgs) DEF_NATIVE(, mov32, mov %edi, %eax); DEF_NATIVE(, mov64, mov %rdi, %rax); +#if defined(CONFIG_PARAVIRT_SPINLOCKS) defined(CONFIG_QUEUE_SPINLOCK) +DEF_NATIVE(pv_lock_ops, queue_unlock, movb $0, (%rdi)); +#endif + unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) { return paravirt_patch_insns(insnbuf, len, @@ -61,6 +65,9 @@ unsigned native_patch(u8 type, u16 clobb PATCH_SITE(pv_cpu_ops, clts); PATCH_SITE(pv_mmu_ops, flush_tlb_single); PATCH_SITE(pv_cpu_ops, wbinvd); +#if defined(CONFIG_PARAVIRT_SPINLOCKS) defined(CONFIG_QUEUE_SPINLOCK) + PATCH_SITE(pv_lock_ops, queue_unlock); +#endif patch_site: ret = paravirt_patch_insns(ibuf, len, start, end); That makes sure to overwrite the callee-saved call to the pv_lock_ops::queue_unlock with the immediate asm movb $0, (%rdi). Therefore you can retain the inlined unlock with hardly (there might be some NOP padding) any overhead at all. On PV it reverts to a callee saved function call. My concern is that spin_unlock() can be called in many places, including loadable kernel modules. Can the paravirt_patch_ident_32() function able to patch all of them in reasonable time? How about a kernel module loaded later at run time? modules should be fine, see arch/x86/kernel/module.c:module_finalize() - apply_paravirt(). Also note that the 'default' text is an indirect call into the paravirt ops table which routes to the 'right' function, so even if the text patching would be 'late' calls would 'work' as expected, just slower. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support
On 10/27/2014 01:27 PM, Peter Zijlstra wrote: On Mon, Oct 27, 2014 at 01:15:53PM -0400, Waiman Long wrote: On 10/24/2014 06:04 PM, Peter Zijlstra wrote: On Fri, Oct 24, 2014 at 04:53:27PM -0400, Waiman Long wrote: The additional register pressure may just cause a few more register moves which should be negligible in the overall performance . The additional icache pressure, however, may have some impact on performance. I was trying to balance the performance of the pv and non-pv versions so that we won't penalize the pv code too much for a bit more performance in the non-pv code. Doing it your way will add a lot of function call and register saving/restoring to the pv code. If people care about performance they should not be using virt crap :-) I only really care about bare metal. Yes, I am aware of that. However, the whole point of doing PV spinlock is to improve performance in a virtual guest. Anything that avoids the lock holder preemption nonsense is a _massive_ win for them, a few function calls should not even register on that scale. I would say all the PV stuffs are mostly useful for a over-committed guest where a single CPU is shared in more than one guest. When the guests are not overcommitted, the PV code seldom get triggered. In those cases, the overhead of the extra function call and register saving/restoring will show up. +#ifdef _GEN_PV_LOCK_SLOWPATH +static void pv_queue_spin_lock_slowpath(struct qspinlock *lock, u32 val) +#else void queue_spin_lock_slowpath(struct qspinlock *lock, u32 val) +#endif If you have two functions you might as well use the PV stuff to patch in the right function call at the usage sites and avoid: + if (pv_enabled()) { + pv_queue_spin_lock_slowpath(lock, val); + return; + } this alltogether. Good point! I will do some investigation on how to do this kind of function address patching and eliminate the extra function call overhead. this_cpu_dec(mcs_nodes[0].count); } EXPORT_SYMBOL(queue_spin_lock_slowpath); + +#if !defined(_GEN_PV_LOCK_SLOWPATH) defined(CONFIG_PARAVIRT_SPINLOCKS) +/* + * Generate the PV version of the queue_spin_lock_slowpath function + */ +#undef pv_init_node +#undef pv_wait_check +#undef pv_link_and_wait_node +#undef pv_wait_head +#undef EXPORT_SYMBOL +#undef in_pv_code + +#define _GEN_PV_LOCK_SLOWPATH +#define EXPORT_SYMBOL(x) +#define in_pv_code return_true +#define pv_enabled return_false + +#include qspinlock.c + +#endif That's properly disgusting :-) But a lot better than actually duplicating everything I suppose. I know you don't like this kind of preprocessor trick, but this is the easiest way that I can think of to generate two separate functions from the same source code. -Longman ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support
On 10/27/2014 02:02 PM, Konrad Rzeszutek Wilk wrote: On Mon, Oct 27, 2014 at 01:38:20PM -0400, Waiman Long wrote: On 10/24/2014 04:54 AM, Peter Zijlstra wrote: On Thu, Oct 16, 2014 at 02:10:38PM -0400, Waiman Long wrote: Since enabling paravirt spinlock will disable unlock function inlining, a jump label can be added to the unlock function without adding patch sites all over the kernel. But you don't have to. My patches allowed for the inline to remain, again reducing the overhead of enabling PV spinlocks while running on a real machine. Look at: http://lkml.kernel.org/r/20140615130154.213923...@chello.nl In particular this hunk: Index: linux-2.6/arch/x86/kernel/paravirt_patch_64.c === --- linux-2.6.orig/arch/x86/kernel/paravirt_patch_64.c +++ linux-2.6/arch/x86/kernel/paravirt_patch_64.c @@ -22,6 +22,10 @@ DEF_NATIVE(pv_cpu_ops, swapgs, swapgs) DEF_NATIVE(, mov32, mov %edi, %eax); DEF_NATIVE(, mov64, mov %rdi, %rax); +#if defined(CONFIG_PARAVIRT_SPINLOCKS) defined(CONFIG_QUEUE_SPINLOCK) +DEF_NATIVE(pv_lock_ops, queue_unlock, movb $0, (%rdi)); +#endif + unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) { return paravirt_patch_insns(insnbuf, len, @@ -61,6 +65,9 @@ unsigned native_patch(u8 type, u16 clobb PATCH_SITE(pv_cpu_ops, clts); PATCH_SITE(pv_mmu_ops, flush_tlb_single); PATCH_SITE(pv_cpu_ops, wbinvd); +#if defined(CONFIG_PARAVIRT_SPINLOCKS) defined(CONFIG_QUEUE_SPINLOCK) + PATCH_SITE(pv_lock_ops, queue_unlock); +#endif patch_site: ret = paravirt_patch_insns(ibuf, len, start, end); That makes sure to overwrite the callee-saved call to the pv_lock_ops::queue_unlock with the immediate asm movb $0, (%rdi). Therefore you can retain the inlined unlock with hardly (there might be some NOP padding) any overhead at all. On PV it reverts to a callee saved function call. My concern is that spin_unlock() can be called in many places, including loadable kernel modules. Can the paravirt_patch_ident_32() function able to patch all of them in reasonable time? How about a kernel module loaded later at run time? It has too. When the modules are loaded the .paravirt symbols are exposed and the module loader patches that. And during bootup time (before modules are loaded) it also patches everything - when it only runs on one CPU. So I think we may still need to disable unlock function inlining even if we used your way kernel site patching. No need. Inline should (And is) working just fine. Regards, Longman Thanks for letting me know about the paravirt patching capability available in the kernel. In this case, I would say we should use Peter's way of doing unlock without disabling unlock function inlining. That will further reduce the performance difference of kernels with and without PV. Cheer, Longman ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support
On 10/27/2014 02:04 PM, Peter Zijlstra wrote: On Mon, Oct 27, 2014 at 01:38:20PM -0400, Waiman Long wrote: On 10/24/2014 04:54 AM, Peter Zijlstra wrote: On Thu, Oct 16, 2014 at 02:10:38PM -0400, Waiman Long wrote: Since enabling paravirt spinlock will disable unlock function inlining, a jump label can be added to the unlock function without adding patch sites all over the kernel. But you don't have to. My patches allowed for the inline to remain, again reducing the overhead of enabling PV spinlocks while running on a real machine. Look at: http://lkml.kernel.org/r/20140615130154.213923...@chello.nl In particular this hunk: Index: linux-2.6/arch/x86/kernel/paravirt_patch_64.c === --- linux-2.6.orig/arch/x86/kernel/paravirt_patch_64.c +++ linux-2.6/arch/x86/kernel/paravirt_patch_64.c @@ -22,6 +22,10 @@ DEF_NATIVE(pv_cpu_ops, swapgs, swapgs) DEF_NATIVE(, mov32, mov %edi, %eax); DEF_NATIVE(, mov64, mov %rdi, %rax); +#if defined(CONFIG_PARAVIRT_SPINLOCKS) defined(CONFIG_QUEUE_SPINLOCK) +DEF_NATIVE(pv_lock_ops, queue_unlock, movb $0, (%rdi)); +#endif + unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) { return paravirt_patch_insns(insnbuf, len, @@ -61,6 +65,9 @@ unsigned native_patch(u8 type, u16 clobb PATCH_SITE(pv_cpu_ops, clts); PATCH_SITE(pv_mmu_ops, flush_tlb_single); PATCH_SITE(pv_cpu_ops, wbinvd); +#if defined(CONFIG_PARAVIRT_SPINLOCKS) defined(CONFIG_QUEUE_SPINLOCK) + PATCH_SITE(pv_lock_ops, queue_unlock); +#endif patch_site: ret = paravirt_patch_insns(ibuf, len, start, end); That makes sure to overwrite the callee-saved call to the pv_lock_ops::queue_unlock with the immediate asm movb $0, (%rdi). Therefore you can retain the inlined unlock with hardly (there might be some NOP padding) any overhead at all. On PV it reverts to a callee saved function call. My concern is that spin_unlock() can be called in many places, including loadable kernel modules. Can the paravirt_patch_ident_32() function able to patch all of them in reasonable time? How about a kernel module loaded later at run time? modules should be fine, see arch/x86/kernel/module.c:module_finalize() - apply_paravirt(). Also note that the 'default' text is an indirect call into the paravirt ops table which routes to the 'right' function, so even if the text patching would be 'late' calls would 'work' as expected, just slower. Thanks for letting me know about that. I have this concern because your patch didn't change the current configuration of disabling unlock inlining when paravirt_spinlock is enabled. With that, I think it is worthwhile to reduce the performance delta between the PV and non-PV kernel on bare metal. -Longman ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support
On Thu, Oct 16, 2014 at 02:10:38PM -0400, Waiman Long wrote: Since enabling paravirt spinlock will disable unlock function inlining, a jump label can be added to the unlock function without adding patch sites all over the kernel. But you don't have to. My patches allowed for the inline to remain, again reducing the overhead of enabling PV spinlocks while running on a real machine. Look at: http://lkml.kernel.org/r/20140615130154.213923...@chello.nl In particular this hunk: Index: linux-2.6/arch/x86/kernel/paravirt_patch_64.c === --- linux-2.6.orig/arch/x86/kernel/paravirt_patch_64.c +++ linux-2.6/arch/x86/kernel/paravirt_patch_64.c @@ -22,6 +22,10 @@ DEF_NATIVE(pv_cpu_ops, swapgs, swapgs) DEF_NATIVE(, mov32, mov %edi, %eax); DEF_NATIVE(, mov64, mov %rdi, %rax); +#if defined(CONFIG_PARAVIRT_SPINLOCKS) defined(CONFIG_QUEUE_SPINLOCK) +DEF_NATIVE(pv_lock_ops, queue_unlock, movb $0, (%rdi)); +#endif + unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) { return paravirt_patch_insns(insnbuf, len, @@ -61,6 +65,9 @@ unsigned native_patch(u8 type, u16 clobb PATCH_SITE(pv_cpu_ops, clts); PATCH_SITE(pv_mmu_ops, flush_tlb_single); PATCH_SITE(pv_cpu_ops, wbinvd); +#if defined(CONFIG_PARAVIRT_SPINLOCKS) defined(CONFIG_QUEUE_SPINLOCK) + PATCH_SITE(pv_lock_ops, queue_unlock); +#endif patch_site: ret = paravirt_patch_insns(ibuf, len, start, end); That makes sure to overwrite the callee-saved call to the pv_lock_ops::queue_unlock with the immediate asm movb $0, (%rdi). Therefore you can retain the inlined unlock with hardly (there might be some NOP padding) any overhead at all. On PV it reverts to a callee saved function call. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support
On Thu, Oct 16, 2014 at 02:10:38PM -0400, Waiman Long wrote: +static inline void pv_init_node(struct mcs_spinlock *node) +{ + struct pv_qnode *pn = (struct pv_qnode *)node; + + BUILD_BUG_ON(sizeof(struct pv_qnode) 5*sizeof(struct mcs_spinlock)); + + if (!pv_enabled()) + return; + + pn-cpustate = PV_CPU_ACTIVE; + pn-mayhalt = false; + pn-mycpu= smp_processor_id(); + pn-head = PV_INVALID_HEAD; +} @@ -333,6 +393,7 @@ queue: node += idx; node-locked = 0; node-next = NULL; + pv_init_node(node); /* * We touched a (possibly) cold cacheline in the per-cpu queue node; So even if !pv_enabled() the compiler will still have to emit the code for that inline, which will generate additional register pressure, icache pressure and lovely stuff like that. The patch I had used pv-ops for these things that would turn into NOPs in the regular case and callee-saved function calls for the PV case. That still does not entirely eliminate cost, but does reduce it significant. Please consider using that. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support
On 10/24/2014 04:47 AM, Peter Zijlstra wrote: On Thu, Oct 16, 2014 at 02:10:38PM -0400, Waiman Long wrote: +static inline void pv_init_node(struct mcs_spinlock *node) +{ + struct pv_qnode *pn = (struct pv_qnode *)node; + + BUILD_BUG_ON(sizeof(struct pv_qnode) 5*sizeof(struct mcs_spinlock)); + + if (!pv_enabled()) + return; + + pn-cpustate = PV_CPU_ACTIVE; + pn-mayhalt = false; + pn-mycpu= smp_processor_id(); + pn-head = PV_INVALID_HEAD; +} @@ -333,6 +393,7 @@ queue: node += idx; node-locked = 0; node-next = NULL; + pv_init_node(node); /* * We touched a (possibly) cold cacheline in the per-cpu queue node; So even if !pv_enabled() the compiler will still have to emit the code for that inline, which will generate additional register pressure, icache pressure and lovely stuff like that. The patch I had used pv-ops for these things that would turn into NOPs in the regular case and callee-saved function calls for the PV case. That still does not entirely eliminate cost, but does reduce it significant. Please consider using that. The additional register pressure may just cause a few more register moves which should be negligible in the overall performance . The additional icache pressure, however, may have some impact on performance. I was trying to balance the performance of the pv and non-pv versions so that we won't penalize the pv code too much for a bit more performance in the non-pv code. Doing it your way will add a lot of function call and register saving/restoring to the pv code. Another alternative that I can think of is to generate 2 versions of the slowpath code - one pv and one non-pv out of the same source code. The non-pv code will call into the pv code once if pv is enabled. In this way, it won't increase the icache and register pressure of the non-pv code. However, this may make the source code a bit harder to read. Please let me know your thought on this alternate approach. -Longman ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
Re: [PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support
On Fri, Oct 24, 2014 at 04:53:27PM -0400, Waiman Long wrote: The additional register pressure may just cause a few more register moves which should be negligible in the overall performance . The additional icache pressure, however, may have some impact on performance. I was trying to balance the performance of the pv and non-pv versions so that we won't penalize the pv code too much for a bit more performance in the non-pv code. Doing it your way will add a lot of function call and register saving/restoring to the pv code. If people care about performance they should not be using virt crap :-) I only really care about bare metal. ___ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization
[PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support
This patch adds para-virtualization support to the queue spinlock code base with minimal impact to the native case. There are some minor code changes in the generic qspinlock.c file which should be usable in other architectures. The other code changes are specific to x86 processors and so are all put under the arch/x86 directory. On the lock side, there are a couple of jump labels and 2 paravirt callee saved calls that defaults to NOPs and some registered move instructions. So the performance impact should be minimal. Since enabling paravirt spinlock will disable unlock function inlining, a jump label can be added to the unlock function without adding patch sites all over the kernel. The actual paravirt code comes in 5 parts; - init_node; this initializes the extra data members required for PV state. PV state data is kept 1 cacheline ahead of the regular data. - link_and_wait_node; this replaces the regular MCS queuing code. CPU halting can happen if the wait is too long. - wait_head; this waits until the lock is avialable and the CPU will be halted if the wait is too long. - wait_check; this is called after acquiring the lock to see if the next queue head CPU is halted. If this is the case, the lock bit is changed to indicate the queue head will have to be kicked on unlock. - queue_unlock; this routine has a jump label to check if paravirt is enabled. If yes, it has to do an atomic cmpxchg to clear the lock bit or call the slowpath function to kick the queue head cpu. Tracking the head is done in two parts, firstly the pv_wait_head will store its cpu number in whichever node is pointed to by the tail part of the lock word. Secondly, pv_link_and_wait_node() will propagate the existing head from the old to the new tail node. Signed-off-by: Waiman Long waiman.l...@hp.com --- arch/x86/include/asm/paravirt.h | 20 ++ arch/x86/include/asm/paravirt_types.h | 20 ++ arch/x86/include/asm/pvqspinlock.h| 403 + arch/x86/include/asm/qspinlock.h | 44 - arch/x86/kernel/paravirt-spinlocks.c |6 + kernel/locking/qspinlock.c| 72 ++- 6 files changed, 558 insertions(+), 7 deletions(-) create mode 100644 arch/x86/include/asm/pvqspinlock.h diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index cd6e161..3b041db 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -712,6 +712,25 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, #if defined(CONFIG_SMP) defined(CONFIG_PARAVIRT_SPINLOCKS) +#ifdef CONFIG_QUEUE_SPINLOCK + +static __always_inline void pv_kick_cpu(int cpu) +{ + PVOP_VCALLEE1(pv_lock_ops.kick_cpu, cpu); +} + +static __always_inline void +pv_lockwait(u8 *lockbyte) +{ + PVOP_VCALLEE1(pv_lock_ops.lockwait, lockbyte); +} + +static __always_inline void pv_lockstat(enum pv_lock_stats type) +{ + PVOP_VCALLEE1(pv_lock_ops.lockstat, type); +} + +#else static __always_inline void __ticket_lock_spinning(struct arch_spinlock *lock, __ticket_t ticket) { @@ -723,6 +742,7 @@ static __always_inline void __ticket_unlock_kick(struct arch_spinlock *lock, { PVOP_VCALL2(pv_lock_ops.unlock_kick, lock, ticket); } +#endif #endif diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 7549b8b..49e4b76 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -326,6 +326,9 @@ struct pv_mmu_ops { phys_addr_t phys, pgprot_t flags); }; +struct mcs_spinlock; +struct qspinlock; + struct arch_spinlock; #ifdef CONFIG_SMP #include asm/spinlock_types.h @@ -333,9 +336,26 @@ struct arch_spinlock; typedef u16 __ticket_t; #endif +#ifdef CONFIG_QUEUE_SPINLOCK +enum pv_lock_stats { + PV_HALT_QHEAD, /* Queue head halting */ + PV_HALT_QNODE, /* Other queue node halting */ + PV_HALT_ABORT, /* Halting aborted */ + PV_WAKE_KICKED, /* Wakeup by kicking*/ + PV_WAKE_SPURIOUS, /* Spurious wakeup */ + PV_KICK_NOHALT /* Kick but CPU not halted */ +}; +#endif + struct pv_lock_ops { +#ifdef CONFIG_QUEUE_SPINLOCK + struct paravirt_callee_save kick_cpu; + struct paravirt_callee_save lockstat; + struct paravirt_callee_save lockwait; +#else struct paravirt_callee_save lock_spinning; void (*unlock_kick)(struct arch_spinlock *lock, __ticket_t ticket); +#endif }; /* This contains all the paravirt structures: we get a convenient diff --git a/arch/x86/include/asm/pvqspinlock.h b/arch/x86/include/asm/pvqspinlock.h new file mode 100644 index 000..d424252 --- /dev/null +++ b/arch/x86/include/asm/pvqspinlock.h @@ -0,0 +1,403 @@ +#ifndef _ASM_X86_PVQSPINLOCK_H +#define _ASM_X86_PVQSPINLOCK_H + +/* + *