[RFC PATCH 4/4] vhost_net: byteswap virtio_net header
Signed-off-by: Cédric Le Goater c...@fr.ibm.com --- drivers/vhost/net.c | 39 ++- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 8dae2f724a35..f2d5f585dae9 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -18,6 +18,7 @@ #include linux/file.h #include linux/slab.h #include linux/vmalloc.h +#include linux/swab.h #include linux/net.h #include linux/if_packet.h @@ -329,6 +330,14 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success) rcu_read_unlock_bh(); } +static void virtio_net_hdr_swap(struct virtio_net_hdr *hdr) +{ + hdr-hdr_len = swab16(hdr-hdr_len); + hdr-gso_size= swab16(hdr-gso_size); + hdr-csum_start = swab16(hdr-csum_start); + hdr-csum_offset = swab16(hdr-csum_offset); +} + /* Expects to be always run from workqueue - which acts as * read-size critical section for our kind of RCU. */ static void handle_tx(struct vhost_net *net) @@ -346,7 +355,7 @@ static void handle_tx(struct vhost_net *net) .msg_flags = MSG_DONTWAIT, }; size_t len, total_len = 0; - int err; + int err, has_vnet_hdr; size_t hdr_size; struct socket *sock; struct vhost_net_ubuf_ref *uninitialized_var(ubufs); @@ -359,6 +368,7 @@ static void handle_tx(struct vhost_net *net) vhost_disable_notify(net-dev, vq); + has_vnet_hdr = vhost_has_feature(vq, VHOST_NET_F_VIRTIO_NET_HDR); hdr_size = nvq-vhost_hlen; zcopy = nvq-ubufs; @@ -406,6 +416,8 @@ static void handle_tx(struct vhost_net *net) break; } + if (!has_vnet_hdr vq-byteswap) + virtio_net_hdr_swap((void *) vq-iov[0].iov_base); zcopy_used = zcopy len = VHOST_GOODCOPY_LEN (nvq-upend_idx + 1) % UIO_MAXIOV != nvq-done_idx @@ -570,7 +582,7 @@ static void handle_rx(struct vhost_net *net) .hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE }; size_t total_len = 0; - int err, mergeable; + int err, mergeable, has_vnet_hdr; s16 headcount; size_t vhost_hlen, sock_hlen; size_t vhost_len, sock_len; @@ -588,6 +600,7 @@ static void handle_rx(struct vhost_net *net) vq_log = 
unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ? vq-log : NULL; mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF); + has_vnet_hdr = vhost_has_feature(vq, VHOST_NET_F_VIRTIO_NET_HDR); while ((sock_len = peek_head_len(sock-sk))) { sock_len += sock_hlen; @@ -638,6 +651,9 @@ static void handle_rx(struct vhost_net *net) vhost_discard_vq_desc(vq, headcount); continue; } + + if (!has_vnet_hdr vq-byteswap) + virtio_net_hdr_swap((void *) vq-iov[0].iov_base); if (unlikely(vhost_hlen) memcpy_toiovecend(nvq-hdr, (unsigned char *)hdr, 0, vhost_hlen)) { @@ -646,13 +662,18 @@ static void handle_rx(struct vhost_net *net) break; } /* TODO: Should check and handle checksum. */ - if (likely(mergeable) - memcpy_toiovecend(nvq-hdr, (unsigned char *)headcount, - offsetof(typeof(hdr), num_buffers), - sizeof hdr.num_buffers)) { - vq_err(vq, Failed num_buffers write); - vhost_discard_vq_desc(vq, headcount); - break; + if (likely(mergeable)) { + __u16 tmp = headcount; + + if (vq-byteswap) + tmp = swab16(headcount); + if (memcpy_toiovecend(nvq-hdr, (unsigned char *)tmp, + offsetof(typeof(hdr), num_buffers), + sizeof(hdr.num_buffers))) { + vq_err(vq, Failed num_buffers write); + vhost_discard_vq_desc(vq, headcount); + break; + } } vhost_add_used_and_signal_n(net-dev, vq, vq-heads, headcount); -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 1/4] vhost: add VHOST_VRING_F_BYTESWAP flag
The VHOST_VRING_F_BYTESWAP flag will be used by the host to byteswap the vring data when the guest and the host have a different endian order. Signed-off-by: Cédric Le Goater c...@fr.ibm.com --- drivers/vhost/vhost.c |5 - drivers/vhost/vhost.h |1 + include/uapi/linux/vhost.h |3 +++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index c90f4374442a..72c21b790ba3 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -199,6 +199,7 @@ static void vhost_vq_reset(struct vhost_dev *dev, vq-call = NULL; vq-log_ctx = NULL; vq-memory = NULL; + vq-byteswap = 0; } static int vhost_worker(void *data) @@ -701,7 +702,8 @@ long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp) r = -EFAULT; break; } - if (a.flags ~(0x1 VHOST_VRING_F_LOG)) { + if (a.flags ~(0x1 VHOST_VRING_F_LOG | + 0x1 VHOST_VRING_F_BYTESWAP)) { r = -EOPNOTSUPP; break; } @@ -747,6 +749,7 @@ long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp) vq-avail = (void __user *)(unsigned long)a.avail_user_addr; vq-log_addr = a.log_guest_addr; vq-used = (void __user *)(unsigned long)a.used_user_addr; + vq-byteswap = !!(a.flags (0x1 VHOST_VRING_F_BYTESWAP)); break; case VHOST_SET_VRING_KICK: if (copy_from_user(f, argp, sizeof f)) { diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index 3eda654b8f5a..ab25b7d0720d 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -110,6 +110,7 @@ struct vhost_virtqueue { /* Log write descriptors */ void __user *log_base; struct vhost_log *log; + bool byteswap; }; struct vhost_dev { diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index bb6a5b4cb3c5..6a8c2b325c44 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -34,6 +34,9 @@ struct vhost_vring_addr { /* Flag values: */ /* Whether log address is valid. If set enables logging. */ #define VHOST_VRING_F_LOG 0 + /* Whether vring memory accesses should be byte-swapped. 
+* required when the guest has a different endianness */ +#define VHOST_VRING_F_BYTESWAP 1 /* Start of array of descriptors (virtually contiguous) */ __u64 desc_user_addr; -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 3/4] vhost: byteswap virtqueue attributes
The virtqueue structure shares a few attributes with the guest OS which need to be byteswapped when the endian order of the host is different. This patch uses the vq-byteswap attribute to decide whether to byteswap or not data being accessed in the guest memory. Signed-off-by: Cédric Le Goater c...@fr.ibm.com --- drivers/vhost/vhost.c | 52 + 1 file changed, 40 insertions(+), 12 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index afcb3368370c..9a538f4d2ff1 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1097,7 +1097,7 @@ static void vring_desc_swap(struct vring_desc *desc) static int vhost_update_used_flags(struct vhost_virtqueue *vq) { void __user *used; - if (__put_user(vq-used_flags, vq-used-flags) 0) + if (__vq_put_user(vq, vq-used_flags, vq-used-flags) 0) return -EFAULT; if (unlikely(vq-log_used)) { /* Make sure the flag is seen before log. */ @@ -1115,7 +1115,7 @@ static int vhost_update_used_flags(struct vhost_virtqueue *vq) static int vhost_update_avail_event(struct vhost_virtqueue *vq, u16 avail_event) { - if (__put_user(vq-avail_idx, vhost_avail_event(vq))) + if (__vq_put_user(vq, vq-avail_idx, vhost_avail_event(vq))) return -EFAULT; if (unlikely(vq-log_used)) { void __user *used; @@ -1142,7 +1142,7 @@ int vhost_init_used(struct vhost_virtqueue *vq) if (r) return r; vq-signalled_used_valid = false; - return get_user(vq-last_used_idx, vq-used-idx); + return vq_get_user(vq, vq-last_used_idx, vq-used-idx); } EXPORT_SYMBOL_GPL(vhost_init_used); @@ -1254,6 +1254,10 @@ static int get_indirect(struct vhost_virtqueue *vq, i, (size_t)indirect-addr + i * sizeof desc); return -EINVAL; } + + if (vq-byteswap) + vring_desc_swap(desc); + if (unlikely(desc.flags VRING_DESC_F_INDIRECT)) { vq_err(vq, Nested indirect descriptor: idx %d, %zx\n, i, (size_t)indirect-addr + i * sizeof desc); @@ -1309,7 +1313,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, /* Check it isn't doing very strange things with descriptor numbers. 
*/ last_avail_idx = vq-last_avail_idx; - if (unlikely(__get_user(vq-avail_idx, vq-avail-idx))) { + if (unlikely(__vq_get_user(vq, vq-avail_idx, vq-avail-idx))) { vq_err(vq, Failed to access avail idx at %p\n, vq-avail-idx); return -EFAULT; @@ -1330,7 +1334,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, /* Grab the next descriptor number they're advertising, and increment * the index we've seen. */ - if (unlikely(__get_user(head, + if (unlikely(__vq_get_user(vq, head, vq-avail-ring[last_avail_idx % vq-num]))) { vq_err(vq, Failed to read head: idx %d address %p\n, last_avail_idx, @@ -1370,6 +1374,10 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, i, vq-desc + i); return -EFAULT; } + + if (vq-byteswap) + vring_desc_swap(desc); + if (desc.flags VRING_DESC_F_INDIRECT) { ret = get_indirect(vq, iov, iov_size, out_num, in_num, @@ -1437,6 +1445,26 @@ int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len) } EXPORT_SYMBOL_GPL(vhost_add_used); +static int __copyhead_to_user(struct vhost_virtqueue *vq, + struct vring_used_elem *heads, + struct vring_used_elem __user *used, + unsigned count) +{ + int i; + + for (i = 0; i count; i++) { + if (__vq_put_user(vq, heads[i].id, used[i].id)) { + vq_err(vq, Failed to write used id); + return -EFAULT; + } + if (__vq_put_user(vq, heads[i].len, used[i].len)) { + vq_err(vq, Failed to write used len); + return -EFAULT; + } + } + return 0; +} + static int __vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads, unsigned count) @@ -1448,15 +1476,15 @@ static int __vhost_add_used_n(struct vhost_virtqueue *vq, start = vq-last_used_idx % vq-num; used = vq-used-ring + start; if (count == 1) { - if (__put_user(heads[0].id, used-id)) { + if (__vq_put_user(vq, heads[0].id, used-id)) { vq_err(vq, Failed to write used id);
[RFC PATCH 0/4] vhost_net: support for cross endian guests
This patchset adds a VHOST_VRING_F_BYTESWAP flag to inform the host to byteswap data of the vring when the guest and the host have a different endian order. The flag is stored at initialization in an attribute of the virtio queues. It is then used to byteswap, or not, the vring indexes and descriptors shared with the guest OS. The last patch adds the byteswapping of the virtio_net header as it is done in qemu. The patches apply on linux-3.18-rc2 and the tests were done on PowerPC using the following hosts : fedora21/ppc64, utopic/ppc64le with various guests : trusty/ppc64le, utopic/ppc64le, debian/ppc64le, rhel6.5/ppc64, fedora21/ppc64, debian/ppc64 Regression tests for x86_64 were done on a debian host using rhel6.6, fedora20 and debian guests. Cédric Le Goater (4): vhost: add VHOST_VRING_F_BYTESWAP flag vhost: add byteswap routines vhost: byteswap virtqueue attributes vhost_net: byteswap virtio_net header drivers/vhost/net.c| 39 +--- drivers/vhost/vhost.c | 150 drivers/vhost/vhost.h |1 + include/uapi/linux/vhost.h |3 + 4 files changed, 171 insertions(+), 22 deletions(-) -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 2/4] vhost: add byteswap routines
This patch adds a few helper routines around get_user and put_user to ease byteswapping. Signed-off-by: Cédric Le Goater c...@fr.ibm.com --- I am not sure these routines belong to this file. There is room for improvement to remove the ugly switch (sizeof(*(ptr))). Please comment ! drivers/vhost/vhost.c | 93 + 1 file changed, 93 insertions(+) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 72c21b790ba3..afcb3368370c 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -25,6 +25,7 @@ #include linux/kthread.h #include linux/cgroup.h #include linux/module.h +#include linux/swab.h #include vhost.h @@ -1001,6 +1002,98 @@ int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log, } EXPORT_SYMBOL_GPL(vhost_log_write); +#define vq_get_user(vq, x, ptr)\ +({ \ + int ret = get_user(x, ptr); \ + if (vq-byteswap) { \ + switch (sizeof(*(ptr))) { \ + case 2: \ + x = swab16(x); \ + break; \ + case 4: \ + x = swab32(x); \ + break; \ + case 8: \ + x = swab64(x); \ + break; \ + default:\ + break; \ + } \ + } \ + ret;\ +}) + +#define vq_put_user(vq, x, ptr)\ +({ \ + __typeof__(*(ptr)) y = (x); \ + if (vq-byteswap) { \ + switch (sizeof(*(ptr))) { \ + case 2: \ + y = swab16(x); \ + break; \ + case 4: \ + y = swab32(x); \ + break; \ + case 8: \ + y = swab64(x); \ + break; \ + default:\ + break; \ + } \ + } \ + put_user(y, ptr); \ +}) + +#define __vq_get_user(vq, x, ptr) \ +({ \ + int ret = __get_user(x, ptr); \ + if (vq-byteswap) { \ + switch (sizeof(*(ptr))) { \ + case 2: \ + x = swab16(x); \ + break; \ + case 4: \ + x = swab32(x); \ + break; \ + case 8: \ + x = swab64(x); \ + break; \ + default:\ + break; \ + } \ + } \ + ret;\ +}) + +#define __vq_put_user(vq, x, ptr) \ +({ \ + __typeof__(*(ptr)) y = (x); \ + if (vq-byteswap) { \ + switch (sizeof(*(ptr))) { \ + case 2:
Re: [PATCH 3.10] vhost-net: backport extend device allocation to 3.10
On Tue, Oct 28, 2014 at 07:38:30PM +, Eddie Chapman wrote: On 12/10/14 10:30, Michael S. Tsirkin wrote: On Thu, Oct 09, 2014 at 08:41:23AM +0400, Dmitry Petuhov wrote: From: Michael S. Tsirkin m...@redhat.com upstream commit 23cc5a991c7a9fb7e6d6550e65cee4f4173111c5 Michael Mueller provided a patch to reduce the size of vhost-net structure as some allocations could fail under memory pressure/fragmentation. We are still left with high order allocations though. This patch is handling the problem at the core level, allowing vhost structures to use vmalloc() if kmalloc() failed. As vmalloc() adds overhead on a critical network path, add __GFP_REPEAT to kzalloc() flags to do this fallback only when really needed. People are still looking at cleaner ways to handle the problem at the API level, probably passing in multiple iovecs. This hack seems consistent with approaches taken since then by drivers/vhost/scsi.c and net/core/dev.c Based on patch by Romain Francoise. Cc: Michael Mueller m...@linux.vnet.ibm.com Signed-off-by: Romain Francoise rom...@orebokech.com Acked-by: Michael S. Tsirkin m...@redhat.com [mityapetuhov: backport to v3.10: vhost_net_free() in one more place] Signed-off-by: Dmitry Petuhov mityapetu...@gmail.com Sounds reasonable. Acked-by: Michael S. 
Tsirkin m...@redhat.com --- diff -uprN a/drivers/vhost/net.c b/drivers/vhost/net.c --- a/drivers/vhost/net.c 2014-10-09 06:45:08.336283258 +0400 +++ b/drivers/vhost/net.c 2014-10-09 06:51:21.796266607 +0400 @@ -18,6 +18,7 @@ #include linux/rcupdate.h #include linux/file.h #include linux/slab.h +#include linux/vmalloc.h #include linux/net.h #include linux/if_packet.h @@ -707,18 +708,30 @@ static void handle_rx_net(struct vhost_w handle_rx(net); } +static void vhost_net_free(void *addr) +{ + if (is_vmalloc_addr(addr)) + vfree(addr); + else + kfree(addr); +} + static int vhost_net_open(struct inode *inode, struct file *f) { - struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL); + struct vhost_net *n; struct vhost_dev *dev; struct vhost_virtqueue **vqs; int r, i; - if (!n) - return -ENOMEM; + n = kmalloc(sizeof *n, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); + if (!n) { + n = vmalloc(sizeof *n); + if (!n) + return -ENOMEM; + } vqs = kmalloc(VHOST_NET_VQ_MAX * sizeof(*vqs), GFP_KERNEL); if (!vqs) { - kfree(n); + vhost_net_free(n); return -ENOMEM; } @@ -737,7 +750,7 @@ static int vhost_net_open(struct inode * } r = vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX); if (r 0) { - kfree(n); + vhost_net_free(n); kfree(vqs); return r; } @@ -840,7 +853,7 @@ static int vhost_net_release(struct inod * since jobs can re-queue themselves. */ vhost_net_flush(n); kfree(n-dev.vqs); - kfree(n); + vhost_net_free(n); return 0; } Hi Michael, Dmitry, I needed to apply this to a server using vanilla 3.14, so I reformatted Dmitry's 3.10 patch to make it apply cleanly to 3.14.23-rc1. It built fine without warnings, and the kernel has been running now all day with the patch applied without any problems, no errors in dmesg, module loaded fine, and qemu VMs running without any network problems, with vhost-net enabled. 
Since I don't really know what I'm doing (I just fixed up white space to make it apply, and removed the extra vhost_net_free() added by Dmitry for the 3.10 version as there was no such bit of code in 3.14) I just wanted to paste what I came up with below just to ask for your eyeballs. If you think I haven't missed anything obvious that is needed extra in 3.14, then I'll send it as a properly formatted patch for 3.14 according to submission rules, since it appears to be working (unless Dmitry prefers to submit it). thanks, Eddie Looks reasonable. Acked-by: Michael S. Tsirkin m...@redhat.com --- a/drivers/vhost/net.c 2014-03-31 04:40:15.0 +0100 +++ b/drivers/vhost/net.c 2014-10-28 19:05:21.242141453 + @@ -17,6 +17,7 @@ #include linux/workqueue.h #include linux/file.h #include linux/slab.h +#include linux/vmalloc.h #include linux/net.h #include linux/if_packet.h @@ -699,18 +700,31 @@ handle_rx(net); } +static void vhost_net_free(void *addr) +{ + if (is_vmalloc_addr(addr)) + vfree(addr); + else + kfree(addr); +} + static int vhost_net_open(struct inode *inode, struct file *f) { - struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL); + struct vhost_net *n; struct vhost_dev *dev; struct vhost_virtqueue **vqs; int i; - if (!n) - return -ENOMEM; + n = kmalloc(sizeof *n, GFP_KERNEL | __GFP_NOWARN
Re: [Xen-devel] [RFC] Hypervisor RNG and enumeration
On 29/10/14 05:19, Andy Lutomirski wrote: Here's a draft CommonHV spec. It's also on github: https://github.com/amluto/CommonHV So far, this provides a two-way RNG interface, a way to detect it, and a way to detect other hypervisor leaves. The latter is because, after both the enormous public thread and some private discussions, it seems that detection of existing CPUID paravirt leaves is annoying and inefficient. If we're going to define some cross-vendor CPUID leaves, it seems like it would be useful to offer a way to quickly enumerate other leaves. I've been told that AMD intends to update their manual to match Intel's so that hypervisors can use the entire 0x4F?? CPUID range. I have intentionally not fixed an MSR value for the RNG because the range of allowed MSRs is very small in both the Intel and AMD manuals. If any given hypervisor wants to ignore that small range and advertise a higher-numbered MSR, it is welcome to, but I don't want to codify something that doesn't comply with the manuals. Here's the draft. Comments? To the people who work on various hypervisors: Would you implement this? Do you like it? Is there anything, major or minor, that you'd like to see changed? Do you think that this is a good idea at all? As a first pass, it looks like a plausible idea. I do however have some comments. I've tried to get good coverage of various hypervisors. There are Hyper-V, VMWare, KVM, and Xen people on the cc list. Thanks, Andy CommonHV, a common hypervisor interface === This is CommonHV draft 1. The CommonHV specification is Copyright (c) 2014 Andrew Lutomirski. Licensing will be determined soon. The license is expected to be extremely liberal. I am currently leaning towards CC-BY-SA for the specification and an explicit license permitting anyone to implement the specification with no restrictions whatsoever. I have not patented, nor do I intend to patent, anything required to implement this specification. 
I am not aware of any current or future intellectual property rights that would prevent a royalty-free implementation of this specification. I would like to find a stable, neutral steward of this specification going forward. Help with this would be much appreciated. Scope - CommonHV is a simple interface for communication between hypervisors and their guests. CommonHV is intended to be very simple and to avoid interfering with existing paravirtual interfaces. To that end, its scope is limited. CommonHV does only two types of things: * It provides a way to enumerate other paravirtual interfaces. * It provides a small, extensible set of paravirtual features that do not modify or replace standard system functionality. For example, CommonHV does not and will not define anything related to interrupt handling or virtual CPU management. For now, CommonHV is only applicable to the x86 platform. Discovery - A CommonHV hypervisor MUST set the hypervisor bit (bit 31 in CPUID.1H.0H.ECX) and provide the CPUID leaf 4F00H, containing: * CPUID.4F00H.0H.EAX = max_commonhv_leaf * CPUID.4F00H.0H.EBX = 0x6D6D6F43 * CPUID.4F00H.0H.ECX = 0x56486E6F * CPUID.4F00H.0H.EDX = 0x66746e49 EBX, ECX, and EDX form the string CommonHVIntf in little-endian ASCII. While testing various nested combinations, XenServer has found that modern Windows Server versions must have the hypervisor bit hidden from them for them to be happy running HyperV, despite the fact that they will make use of the Viridian virtual extensions also provided. As a result, while it is certainly advisable for the hypervisor bit to be set, CommonHV should be available to be found by paravirtualised drivers inside an OS which can't cope with the hypervisor bit set. max_commonhv_leaf MUST be a number between 0x4F00 and 0x4FFF. It indicates the largest leaf defined in this specification that is provided. 
Any leaves described in this specification with EAX values that exceed max_commonhv_leaf MUST be handled by guests as though they contain all zeros. CPUID leaf 4F01H: hypervisor interface enumeration -- If max_commonhv_leaf = 0x4F01, CommonHV provides a list of tuples (location, signature). Each tuple indicates the presence of another paravirtual interface identified by the signature at the indicated CPUID location. It is expected that CPUID.location.0H will have (EBX, ECX, EDX) == signature, although whether this is required is left to the specification associated with the given signature. If the list contains N tuples, then, for each 0 = i N: * CPUID.4F01H.i.EBX, CPUID.4F01H.i.ECX, and CPUID.4F01H.i.EDX are the signature. * CPUID.4F01H.i.EAX is the location. CPUID with EAX = 0x4F01 and ECX = N MUST return all zeros. To the extent that the
Re: [GIT PULL 0/9] KVM: s390: Fixes and cleanups for kvm/next (3.19)
On 10/28/2014 08:39 PM, Christian Borntraeger wrote: Paolo, the first bunch on s390 change for next: The following changes since commit cac7f2429872d3733dc3f9915857b1691da2eb2f: Linux 3.18-rc2 (2014-10-26 16:48:41 -0700) are available in the git repository at: git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git tags/kvm-s390-next-20141028 for you to fetch changes up to a6cc3108567e0adc06c4a8031186f84ad1e1e194: KVM: s390: sigp: split handling of SIGP STOP (AND STORE STATUS) (2014-10-28 13:09:14 +0100) KVM: s390: Fixes and cleanups 1. A small fix regarding program check handling (cc stable as it overwrites the wrong guest memory) 2. Improve the ipte interlock scalability for older hardware 3. current-mm to mm cleanup (currently a no-op) 4. several SIGP rework patches (more to come) David Hildenbrand (6): KVM: s390: sigp: dispatch orders with one target in a separate function KVM: s390: sigp: move target cpu checks into dispatcher KVM: s390: sigp: separate preparation handlers KVM: s390: sigp: instruction counters for all sigp orders KVM: s390: sigp: inject emergency calls in a separate function KVM: s390: sigp: split handling of SIGP STOP (AND STORE STATUS) Jason J. Herne (1): KVM: s390: Cleanup usage of current-mm in set_guest_storage_key Thomas Huth (2): KVM: s390: Make the simple ipte mutex specific to a VM instead of global KVM: s390: Fix size of monitor-class number field arch/s390/include/asm/kvm_host.h | 9 ++ arch/s390/include/asm/sigp.h | 1 + arch/s390/kvm/gaccess.c | 20 ++- arch/s390/kvm/interrupt.c| 2 +- arch/s390/kvm/kvm-s390.c | 8 ++ arch/s390/kvm/sigp.c | 269 --- arch/s390/mm/pgtable.c | 2 +- 7 files changed, 168 insertions(+), 143 deletions(-) Pulled, thanks. Paolo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Benchmarking for vhost polling patch
Hi Michael, Following the polling patch thread: http://marc.info/?l=kvmm=140853271510179w=2, I changed poll_stop_idle to be counted in micro seconds, and carried out experiments using varying sizes of this value. The setup for netperf consisted of 1 vm and 1 vhost , each running on their own dedicated core. Here are the numbers for netperf (micro benchmark): polling|Send |Throughput|Utilization |S. Demand |vhost|exits|throughput|throughput mode |Msg | |Send Recv |Send Recv |util |/sec | /cpu | /cpu |Size | |local remote|local remote| | | |% change |bytes|10^6bits/s| %%|us/KB us/KB | % | | | - NoPolling 64 1054.11 99.97 3.01 7.78 3.74 38.80 92K7.60 Polling=1 64 1036.67 99.97 2.93 7.90 3.70 53.00 92K6.78 -10.78 Polling=5 64 1079.27 99.97 3.07 7.59 3.73 83.00 90K5.90 -22.35 Polling=7 64 1444.90 99.97 3.98 5.67 3.61 95.00 19.5K 7.41 -2.44 Polling=10 64 1521.70 99.97 4.21 5.38 3.63 98.00 8.5K 7.69 1.19 Polling=25 64 1534.24 99.97 4.18 5.34 3.57 99.00 8.5K 7.71 1.51 Polling=50 64 1534.24 99.97 4.18 5.34 3.57 99.00 8.5K 7.71 1.51 NoPolling 128 1577.39 99.97 4.09 5.19 3.40 54.00 113K 10.24 Polling=1 128 1596.08 99.97 4.22 5.13 3.47 71.00 120K 9.34 -8.88 Polling=5 128 2238.49 99.97 5.45 3.66 3.19 92.00 24K11.66 13.82 Polling=7 128 2330.97 99.97 5.59 3.51 3.14 95.00 19.5K 11.96 16.70 Polling=10 128 2375.78 99.97 5.69 3.45 3.14 98.00 10K12.00 17.14 Polling=25 128 2655.01 99.97 2.45 3.09 1.21 99.00 8.5K 13.34 30.25 Polling=50 128 2655.01 99.97 2.45 3.09 1.21 99.00 8.5K 13.34 30.25 NoPolling 25 2558.10 99.97 2.33 3.20 1.20 67.00 120K 15.32 Polling=1 25 2508.93 99.97 3.13 3.27 1.67 75.00 125K 14.34 -6.41 Polling=5 25 3740.34 99.97 2.70 2.19 0.95 94.00 17K19.28 25.86 Polling=7 25 3692.69 99.97 2.80 2.22 0.99 97.00 15.5K 18.75 22.37 Polling=10 25 4036.60 99.97 2.69 2.03 0.87 99.00 8.5K 20.29 32.42 Polling=25 25 3998.89 99.97 2.64 2.05 0.87 99.00 8.5K 20.10 31.18 Polling=50 25 3998.89 99.97 2.64 2.05 0.87 99.00 8.5K 20.10 31.18 NoPolling 512 4531.50 99.90 2.75 1.81 0.79 78.00 
55K25.47 Polling=1 512 4684.19 99.95 2.69 1.75 0.75 83.00 35K25.60 0.52 Polling=5 512 4932.65 99.75 2.75 1.68 0.74 91.00 12K25.86 1.52 Polling=7 512 5226.14 99.86 2.80 1.57 0.70 95.00 7.5K 26.82 5.30 Polling=10 512 5464.90 99.60 2.90 1.49 0.70 96.00 8.2K 27.94 9.69 Polling=25 512 5550.44 99.58 2.84 1.47 0.67 99.00 7.5K 27.95 9.73 Polling=50 512 5550.44 99.58 2.84 1.47 0.67 99.00 7.5K 27.95 9.73 As you can see from the last column, polling improves performance in most cases. I ran memcached (macro benchmark), where (as in the previous benchmark) the vm and vhost each get their own dedicated core. I configured memslap with C=128, T=8, as this configuration was required to produce enough load to saturate the vm. I tried several other configurations, but this one produced the maximal throughput(for the baseline). The numbers for memcached (macro benchmark): polling time TPS Netvhost vm exits TPS/cpu TPS/cpu mode rate util util /sec % change % Disabled15.9s 125819 91.5 4599 87K873.74 polling=1 15.8s 126820 92.3 6099 87K797.61 -8.71 polling=5 12.82 155799 113.4 7999 25.5K 875.280.18 polling=10 11.7s 160639 116.9 8399 16.3K 882.631.02 pollling=15 12.4s 160897 117.2 8799 15K865.04 -1.00 polling=100 11.7s 170971 124.4 9999 30 863.49 -1.17 For memcached TPS/cpu does not show a significant difference in any of the cases. However, TPS numbers did improve in up to 35%, which can be useful for under-utilized systems which have cpu time to spare for extra throughput. If it makes sense to you, I will continue with the other changes requested for the patch. Thank you, Razya -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Xen-devel] [RFC] Hypervisor RNG and enumeration
On 10/29/2014 11:37 AM, Andrew Cooper wrote: While testing various nested combinations, XenServer has found that modern Windows Server versions must have the hypervisor bit hidden from them for them to be happy running HyperV, despite the fact that they will make use of the Viridian virtual extensions also provided. Right. As a result, while it is certainly advisable for the hypervisor bit to be set, CommonHV should be available to be found by paravirtualised drivers inside an OS which can't cope with the hypervisor bit set. Microsoft should just stop putting arbitrary limitations on their software; or pay the price which, in this case, is not being able to use the features from the common specification. I guess what they'd do is reinvent the RNG as a Viridian extension (if they need it). You can certainly do CPUID(0x4F00) even if HYPERVISOR=0. What you get back is undefined, but in all likelihood it won't be the CommonHVIntf string. Paolo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Xen-devel] [RFC] Hypervisor RNG and enumeration
On 29/10/14 13:45, Paolo Bonzini wrote: On 10/29/2014 11:37 AM, Andrew Cooper wrote: While testing various nested combinations, XenServer has found that modern Windows Server versions must have the hypervisor bit hidden from them for them to be happy running HyperV, despite the fact that they will make use of the Viridian virtual extensions also provided. Right. As a result, while it is certainly advisable for the hypervisor bit to be set, CommonHV should be available to be found by paravirtualised drivers inside an OS which can't cope with the hypervisor bit set. Microsoft should just stop putting arbitrary limitations on their software; or pay the price which, in this case, is not being able to use the features from the common specification. I guess what they'd do is reinvent the RNG as a Viridian extension (if they need it). You can certainly do CPUID(0x4F00) even if HYPERVISOR=0. What you get back is undefined, but in all likelihood it won't be the CommonHVIntf string. Microsoft already has a specification to obtain a random number via an ACPI device. The VM Generation ID. http://www.microsoft.com/en-us/download/details.aspx?id=30707 David -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Xen-devel] [RFC] Hypervisor RNG and enumeration
On 10/29/2014 02:57 PM, David Vrabel wrote: Microsoft already has a specification to obtain a random number via an ACPI device. The VM Generation ID. http://www.microsoft.com/en-us/download/details.aspx?id=30707 That's a very different thing. The VM Generation ID always returns the same number if you run the VM from the same configuration file. It tells the VM when it might need a reseed, but provides neither a best effort random number like this spec does, nor an entropy source like virtio-rng does. Paolo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Xen-devel] [RFC] Hypervisor RNG and enumeration
Andy Lutomirski writes ([Xen-devel] [RFC] Hypervisor RNG and enumeration): Here's a draft CommonHV spec. It's also on github: https://github.com/amluto/CommonHV This is a worthwhile direction to investigate, and an interesting proposal. From a Xen point of view I have some concerns, though. I think in Xen we would want to implement the bulk of the provision of random numbers to guests outside the hypervisor. That is, the hypervisor itself should not have random number pool code, associated policy, and so on. We would like to avoid adding too much code to the hypervisor. That functionality should live in the lower toolstack layers. For the benefit of people who want to do radical disaggregation (for security reasons), it should be capable of being provided by a different domain to dom0. I think a fully general interface based purely on MSRs makes that difficult in a number of ways: * Currently I don't think there is any way in Xen to cause MSR accesses to be passed to toolstack support programs. * In some configurations, Xen PV domains do not have a suitable long-running service process to handle requests of this kind. * MSR reads of this kind might be expected to be fast but if they involve trapping to a service domain they might be very slow. * This interface is very x86-specific. It seems to me that the real need for this facility is to provide a good seed for the guest's own cryptographic PRNG. If we restrict ourselves to that use case, we can sidestep the difficulties. In particular, the parts of this proposal that are most difficult are: * The facility for the guest to provide random numbers back to the host. I think this can be dealt with easily by hypervisor-specific mechanisms, if it is desirable. * The implication that a hypervisor ought to be providing an unbounded stream of random numbers, rather than a fixed amount of seed. 
I think the most obvious approach would be to provide the VM, at startup, with a page containing a fixed amount of random number seed, along with some metadata. Some platform-specific way of discovering the location of the page would have to be defined. (That might be an MSR but more likely it would be Device Tree or ACPI.) After the guest has read the page, it would be free to treat it as normal memory. The metadata need do little more than specify the length and the amount of provided entropy. There should be some room for expansion. The specification should say that the provided seed MUST be cryptographically secure, MUST have as much entropy as stated and that that amount of entropy MUST be at least (say) 64 bits and SHOULD be at least (say) 256 bits. Ian. -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Xen-devel] [RFC] Hypervisor RNG and enumeration
On 10/29/2014 03:37 AM, Andrew Cooper wrote: CPUID with EAX = 0x4F01 and ECX = N MUST return all zeros. To the extent that the hypervisor prefers a given interface, it should specify that interface earlier in the list. For example, KVM might place its KVMKVMKVM signature first in the list to indicate that it should be used by guests in preference to other supported interfaces. Other hypervisors would likely use a different order. The exact semantics of the ordering of the list is beyond the scope of this specification. How do you evaluate N? It would make more sense for CPUID.4F01[ECX=0] to return N in one register, and perhaps preferred interface index in another. The signatures can then be obtained from CPUID.4F01[ECX={1 to N}]. That way, a consumer can be confident that they have found all the signatures, without relying on an unbounded loop and checking for zeroes. Yes. Specifically, it should return it in EAX. That is the preferred interface and we are trying to push for that going forward. -hpa -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Xen-devel] [RFC] Hypervisor RNG and enumeration
On 10/29/2014 03:37 AM, Andrew Cooper wrote: CPUID with EAX = 0x4F01 and ECX = N MUST return all zeros. To the extent that the hypervisor prefers a given interface, it should specify that interface earlier in the list. For example, KVM might place its KVMKVMKVM signature first in the list to indicate that it should be used by guests in preference to other supported interfaces. Other hypervisors would likely use a different order. The exact semantics of the ordering of the list is beyond the scope of this specification. How do you evaluate N? It would make more sense for CPUID.4F01[ECX=0] to return N in one register, and perhaps prefered interface index in another. The signatures can then be obtained from CPUID.4F01[ECX={1 to N}]. That way, a consumer can be confident that they have found all the signatures, without relying on an unbounded loop and checking for zeroes Yes. Specifically, it should return it in EAX. That is the preferred interface and we are trying to push for that going forward. -hpa -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Xen-devel] [RFC] Hypervisor RNG and enumeration
On 10/29/2014 03:37 AM, Andrew Cooper wrote: CPUID with EAX = 0x4F01 and ECX = N MUST return all zeros. To the extent that the hypervisor prefers a given interface, it should specify that interface earlier in the list. For example, KVM might place its KVMKVMKVM signature first in the list to indicate that it should be used by guests in preference to other supported interfaces. Other hypervisors would likely use a different order. The exact semantics of the ordering of the list is beyond the scope of this specification. How do you evaluate N? It would make more sense for CPUID.4F01[ECX=0] to return N in one register, and perhaps prefered interface index in another. The signatures can then be obtained from CPUID.4F01[ECX={1 to N}]. That way, a consumer can be confident that they have found all the signatures, without relying on an unbounded loop and checking for zeroes Yes. Specifically, it should return it in EAX. That is the preferred interface and we are trying to push for that going forward. -hpa -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Xen-devel] [RFC] Hypervisor RNG and enumeration
On Oct 29, 2014 8:17 AM, Ian Jackson ian.jack...@eu.citrix.com wrote: Andy Lutomirski writes ([Xen-devel] [RFC] Hypervisor RNG and enumeration): Here's a draft CommonHV spec. It's also on github: https://github.com/amluto/CommonHV This a worthwhile direction to investigate, and an interesting proposal. From a Xen point of view I have some concerns, though. I think in Xen we would want to implement the bulk of the provision of random numbers to guests outside the hypervisor. That is, the hypervisor itself should not have random number pool code, associated policy, and so on. We would like to avoid adding too much code to the hypervisor. That functionality should live in the lower toolstack layers. For the benefit of people who want to do radical disaggregation (for security reasons), it should be capable of being provided by a different domain to dom0. I think a fully general interface based purely on MSRs makes that difficult in a number of ways: * Currently I don't think there is any way in Xen to cause MSR accesses to be passed to toolstack support programs. * In some configurations, Xen PV domains do not have a suitable longrunning service process to handle requests of this kind. * MSR reads of this kind might be expected to be fast but if they involve trapping to a service domain they might be very slow. I have no objection to specifying that these reads may be quite slow. Guests should only use them at boot and if they have some reason to distrust their RNG pool. The latter can legitimately happen after various types of suspend or after migration (detected by VM Generation ID, for example). * This interface is very x86-specific. Agreed. Part of the motivation is to allow guests to use this mechanism very early in boot for stack canaries, ASLR, etc. I don't know a good way to do that without something very platform specific. It seems to me that the real need for this facility is to provide a good seed for the guest's own cryptographic PRNG. 
If we restrict ourselves to that use case, we can sidestep the difficulties. In particular, the parts of this proposal that are most difficult are: * The facility for the guest to provide random numbers back to the host. I think this can be dealt with easily by hypervisor-specific mechanisms, if it is desirable. Xen can implement this is a no-op. If we use feature bits, wrmsr support could be separately enumerated. * The implication that a hypervisor ought to be providing a unbounded stream of random numbers, rather than a fixed amount of seed. I don't expect hypervisors to estimate the entropy available through this mechanism. Given that, the length of the stream is largely irrelevant, except that an unbounded stream allowed reseeding after boot. I think the most obvious approach would be to provide the VM, at startup, with a page containing a fixed amount of random number seed, along with some metatdata. Some platform-specific way of discovering the location of the page would have to be defined. (That might an MSR but more likely it would be Device Tree or ACPI.) After the guest has read the page, it would be free to treat it as normal memory. The metadata need do little more than specify the length and the amount of provided entropy. There should be some room for expansion. ACPI is not useful early in boot. DT might be, but that could be a separate spec. The specification should say that the provided seed MUST be cryptographically secure, MUST have as much entropy as stated and that that amount of entropy MUST be at least (say) 64 bits and SHOULD be at least (say) 256 bits. I don't think this is practical. It will require hypervisors to throttle guest startup to ensure that they have that much entropy. I'm not fundamentally opposed to allowing hypervisors to provide more than 64 bits of data per invocation, which would help when the trap is very slow, but I don't know of a suitably simple way to do that. 
--Andy -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Xen-devel] [RFC] Hypervisor RNG and enumeration
On Wed, Oct 29, 2014 at 9:07 AM, H. Peter Anvin h...@zytor.com wrote: On 10/29/2014 03:37 AM, Andrew Cooper wrote: CPUID with EAX = 0x4F01 and ECX = N MUST return all zeros. To the extent that the hypervisor prefers a given interface, it should specify that interface earlier in the list. For example, KVM might place its KVMKVMKVM signature first in the list to indicate that it should be used by guests in preference to other supported interfaces. Other hypervisors would likely use a different order. The exact semantics of the ordering of the list is beyond the scope of this specification. How do you evaluate N? It would make more sense for CPUID.4F01[ECX=0] to return N in one register, and perhaps prefered interface index in another. The signatures can then be obtained from CPUID.4F01[ECX={1 to N}]. That way, a consumer can be confident that they have found all the signatures, without relying on an unbounded loop and checking for zeroes Yes. Specifically, it should return it in EAX. That is the preferred interface and we are trying to push for that going forward. I'm okay with that. I'm inclined to leave EBX, ECX, and EDX reserved for now, though. Barring an actual use case in which the order of the list isn't sufficient to determine preference, I don't see the need to give another preference indication. I updated the copy of github to make this change and to use an explicit feature bit for the RNG. --Andy -hpa -- Andy Lutomirski AMA Capital Management, LLC -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/4] mm: gup: add get_user_pages_locked and get_user_pages_unlocked
On Thu, Oct 09, 2014 at 12:47:23PM +0200, Peter Zijlstra wrote: On Wed, Oct 01, 2014 at 10:56:35AM +0200, Andrea Arcangeli wrote: +static inline long __get_user_pages_locked(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, + int write, int force, + struct page **pages, + struct vm_area_struct **vmas, + int *locked, + bool notify_drop) You might want to consider __always_inline to make sure it does indeed get inlined and constant propagation works for @locked and @notify_drop. Ok, that's included in the last patchset submit. Thanks, Andrea -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [Xen-devel] [RFC] Hypervisor RNG and enumeration
I have no objection to specifying that these reads may be quite slow. Guests should only use them at boot and if they have some reason to distrust their RNG pool. The latter can legitimately happen after various types of suspend or after migration (detected by VM Generation ID, for example). Just as a point of clarification, the VM Generation ID changes (at least in the Hyper-V implementation) only when the VM may have observed a different future, as when a VM backup is restored, a checkpoint is applied, etc. It does not change during migration, when the VM is suspended or when it is rebooted. I've heard anecdotes from application vendors saying that there is some other hypervisor that actually does change the ID at these moments and they wanted us to fix that, until I explained that I only control Hyper-V. -- Jake Oshins [mangled mail-footer encoding removed]
Re: [Xen-devel] [RFC] Hypervisor RNG and enumeration
On Wed, Oct 29, 2014 at 9:29 AM, Jake Oshins ja...@microsoft.com wrote: I have no objection to specifying that these reads may be quite slow. Guests should only use them at boot and if they have some reason to distrust their RNG pool. The latter can legitimately happen after various types of suspend or after migration (detected by VM Generation ID, for example). Just as a point of clarification, the VM Generation ID changes (at least in the Hyper-V implementation) only when the VM may have observed a different future, as when a VM backup is restored, a checkpoint is applied, etc. It does not change during migration, when the VM is suspended or when it is rebooted. I've heard anecdotes from application vendors saying that there is some other hypervisor that actually does change the ID at these moments and they wanted us to us to fix that, until I explained that I only control Hyper-V. Fair enough. If the VM may indeed have observed a different future, then I would argue that reseeding the RNG is very important -- more so than after a normal migration. If the VM trusts that its other history hasn't been compromised, then merely mixing in a unique value would get most of the benefit. --Andy -- Jake Oshins -- Andy Lutomirski AMA Capital Management, LLC -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] [PATCH v3 4/6] target-i386: Remove unsupported bits from all CPU models
Am 03.10.2014 um 21:39 schrieb Eduardo Habkost: The following CPU features were never supported by neither TCG or KVM, so they are useless on the CPU model definitions, today: * CPUID_DTS (DS) * CPUID_HT * CPUID_TM * CPUID_PBE * CPUID_EXT_DTES64 * CPUID_EXT_DSCPL * CPUID_EXT_EST * CPUID_EXT_TM2 * CPUID_EXT_XTPR * CPUID_EXT_PDCM * CPUID_SVM_LBRV As using enforce mode is the only way to ensure guest ABI doesn't change when moving to a different host, we should make enforce mode the default or at least encourage management software to always use it. In turn, to make enforce usable, we need CPU models that work without always requiring some features to be explicitly disabled. This patch removes the above features from all CPU model definitions. We won't need any machine-type compat code for those changes, because it is impossible to have existing VMs with those features enabled. Signed-off-by: Eduardo Habkost ehabk...@redhat.com Cc: Aurelien Jarno aurel...@aurel32.net --- Changes v1 - v2: * Trivial typo fix in comment --- target-i386/cpu.c | 33 - 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/target-i386/cpu.c b/target-i386/cpu.c index 4119fca..1e9fff9 100644 --- a/target-i386/cpu.c +++ b/target-i386/cpu.c @@ -681,10 +681,11 @@ static X86CPUDefinition builtin_x86_defs[] = { .family = 16, .model = 2, .stepping = 3, +/* Missing: CPUID_HT */ .features[FEAT_1_EDX] = PPRO_FEATURES | CPUID_MTRR | CPUID_CLFLUSH | CPUID_MCA | -CPUID_PSE36 | CPUID_VME | CPUID_HT, +CPUID_PSE36 | CPUID_VME, .features[FEAT_1_ECX] = CPUID_EXT_SSE3 | CPUID_EXT_MONITOR | CPUID_EXT_CX16 | CPUID_EXT_POPCNT, [snip] I'm okay with retaining these as comments. Anyone any objections? Andreas -- SUSE LINUX Products GmbH, Maxfeldstr. 
5, 90409 Nürnberg, Germany GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer; HRB 16746 AG Nürnberg -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/4] mm: gup: add get_user_pages_locked and get_user_pages_unlocked
On Thu, Oct 09, 2014 at 12:50:37PM +0200, Peter Zijlstra wrote: On Wed, Oct 01, 2014 at 10:56:35AM +0200, Andrea Arcangeli wrote: +static inline long __get_user_pages_locked(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, + int write, int force, + struct page **pages, + struct vm_area_struct **vmas, + int *locked, + bool notify_drop) +{ + if (notify_drop lock_dropped *locked) { + /* +* We must let the caller know we temporarily dropped the lock +* and so the critical section protected by it was lost. +*/ + up_read(mm-mmap_sem); + *locked = 0; + } + return pages_done; +} +long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + int *locked) +{ + return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, + pages, NULL, locked, true); +} +long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, +unsigned long start, unsigned long nr_pages, +int write, int force, struct page **pages) +{ + long ret; + int locked = 1; + down_read(mm-mmap_sem); + ret = __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, + pages, NULL, locked, false); + if (locked) + up_read(mm-mmap_sem); + return ret; +} long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages, struct vm_area_struct **vmas) { + return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, + pages, vmas, NULL, false); } I'm wondering about that notify_drop parameter, what's the added benefit? If you look at these 3 callers we can do away with it, since in the second called where we have locked but !notify_drop we seem to do The second (and third) caller pass notify_drop=false, so the notify_drop parameter is always a noop for them. They certainly could get away without it. the exact same thing afterwards anyway. 
It makes a difference only to the first caller, if it wasn't for the first caller notify_drop could be dropped. The first caller does this: return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, pages, NULL, locked, true, FOLL_TOUCH); ^ notify_drop = true Without notify_drop=true the first caller could make its own respective caller think the lock has never been dropped, just because it is locked by the time get_user_pages_locked returned. But the caller must be made aware that the lock has been dropped during the call and in turn any vma it got before inside the mmap_sem critical section is now stale. That's all notify_drop achieves. Thanks, Andrea -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] [PATCH v3 5/6] target-i386: Don't enable nested VMX by default
Am 03.10.2014 um 21:39 schrieb Eduardo Habkost: TCG doesn't support VMX, and nested VMX is not enabled by default on the KVM kernel module. So, there's no reason to have VMX enabled by default on the core2duo and coreduo CPU models, today. Even the newer Intel CPU model definitions don't have it enabled. In this case, we need machine-type compat code, as people may be running the older machine-types on hosts that had VMX nesting enabled. Signed-off-by: Eduardo Habkost ehabk...@redhat.com --- hw/i386/pc_piix.c | 2 ++ hw/i386/pc_q35.c | 2 ++ target-i386/cpu.c | 8 3 files changed, 8 insertions(+), 4 deletions(-) [...] diff --git a/target-i386/cpu.c b/target-i386/cpu.c index 1e9fff9..c336003 100644 --- a/target-i386/cpu.c +++ b/target-i386/cpu.c @@ -720,10 +720,10 @@ static X86CPUDefinition builtin_x86_defs[] = { CPUID_MTRR | CPUID_CLFLUSH | CPUID_MCA | CPUID_PSE36 | CPUID_VME | CPUID_ACPI | CPUID_SS, /* Missing: CPUID_EXT_DTES64, CPUID_EXT_DSCPL, CPUID_EXT_EST, - * CPUID_EXT_TM2, CPUID_EXT_XTPR, CPUID_EXT_PDCM */ + * CPUID_EXT_TM2, CPUID_EXT_XTPR, CPUID_EXT_PDCM, CPUID_EXT_VMX */ .features[FEAT_1_ECX] = CPUID_EXT_SSE3 | CPUID_EXT_MONITOR | CPUID_EXT_SSSE3 | -CPUID_EXT_VMX | CPUID_EXT_CX16, +CPUID_EXT_CX16, .features[FEAT_8000_0001_EDX] = CPUID_EXT2_LM | CPUID_EXT2_SYSCALL | CPUID_EXT2_NX, .features[FEAT_8000_0001_ECX] = [snip] Here I'm less certain what the best approach is. As you point out, there's an inconsistency that I agree should be fixed. I wonder however whether an approach similar to 3/6 for KVM only would be better? I.e., have VMX as a sometimes-KVM-supported feature be listed in the model and filter it out for accel=kvm so that -cpu enforce works, but let accel=tcg fail with features not implemented. Regards, Andreas -- SUSE LINUX Products GmbH, Maxfeldstr. 
5, 90409 Nürnberg, Germany GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer; HRB 16746 AG Nürnberg -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 00/17] RFC: userfault v2
Hi Zhanghailiang, On Mon, Oct 27, 2014 at 05:32:51PM +0800, zhanghailiang wrote: Hi Andrea, Thanks for your hard work on userfault;) This is really a useful API. I want to confirm a question: Can we support distinguishing between writing and reading memory for userfault? That is, we can decide whether writing a page, reading a page or both trigger userfault. I think this will help supporting vhost-scsi,ivshmem for migration, we can trace dirty page in userspace. Actually, i'm trying to relize live memory snapshot based on pre-copy and userfault, but reading memory from migration thread will also trigger userfault. It will be easy to implement live memory snapshot, if we support configuring userfault for writing memory only. Mail is going to be long enough already so I'll just assume tracking dirty memory in userland (instead of doing it in kernel) is worthy feature to have here. After some chat during the KVMForum I've been already thinking it could be beneficial for some usage to give userland the information about the fault being read or write, combined with the ability of mapping pages wrprotected to mcopy_atomic (that would work without false positives only with MADV_DONTFORK also set, but it's already set in qemu). That will require vma-vm_flags VM_USERFAULT to be checked also in the wrprotect faults, not just in the not present faults, but it's not a massive change. Returning the read/write information is also a not massive change. This will then payoff mostly if there's also a way to remove the memory atomically (kind of remap_anon_pages). Would that be enough? I mean are you still ok if non present read fault traps too (you'd be notified it's a read) and you get notification for both wrprotect and non present faults? The question then is how you mark the memory readonly to let the wrprotect faults trap if the memory already existed and you didn't map it yourself in the guest with mcopy_atomic with a readonly flag. 
My current plan would be: - keep MADV_USERFAULT|NOUSERFAULT just to set VM_USERFAULT for the fast path check in the not-present and wrprotect page fault - if VM_USERFAULT is set, find if there's a userfaultfd registered into that vma too if yes engage userfaultfd protocol otherwise raise SIGBUS (single threaded apps should be fine with SIGBUS and it'll avoid them to spawn a thread in order to talk the userfaultfd protocol) - if userfaultfd protocol is engaged, return read|write fault + fault address to read(ufd) syscalls - leave the userfault resolution mechanism independent of the userfaultfd protocol so we keep the two problems separated and we don't mix them in the same API which makes it even harder to finalize it. add mcopy_atomic (with a flag to map the page readonly too) The alternative would be to hide mcopy_atomic (and even remap_anon_pages in order to remove the memory atomically for the externalization into the cloud) as userfaultfd commands to write into the fd. But then there would be no much point to keep MADV_USERFAULT around if I do so and I could just remove it too or it doesn't look clean having to open the userfaultfd just to issue an hidden mcopy_atomic. So it becomes a decision if the basic SIGBUS mode for single threaded apps should be supported or not. As long as we support SIGBUS too and we don't force to use userfaultfd as the only mechanism to be notified about userfaults, having a separate mcopy_atomic syscall sounds cleaner. Perhaps mcopy_atomic could be used in other cases that may arise later that may not be connected with the userfault. Questions to double check the above plan is ok: 1) should I drop the SIGBUS behavior and MADV_USERFAULT? 2) should I hide mcopy_atomic as a write into the userfaultfd? 
NOTE: even if I hide mcopy_atomic as a userfaultfd command to write into the fd, the buffer pointer passed to write() syscall would still _not_ be pointing to the data like a regular write, but it would be a pointer to a command structure that points to the source and destination data of the hidden mcopy_atomic, the only advantage is that perhaps I could wakeup the blocked page faults without requiring an additional syscall. The standalone mcopy_atomic would still require a write into the userfaultfd as it happens now after remap_anon_pages returns, in order to wakeup the stopped page faults. 3) should I add a registration command to trap only write faults? The protocol can always be extended later anyway in a backwards compatible way but it's better if we get it fully featured from the start. For completeness, some answers for other questions I've seen floating around but that weren't posted on the list yet (you can skip reading the below part if not interested): - open(/dev/userfault) instead of sys_userfaultfd(), I don't see the benefit: userfaultfd is just like eventfd in
Re: [Qemu-devel] [PATCH 00/17] RFC: userfault v2
On 29 October 2014 17:46, Andrea Arcangeli aarca...@redhat.com wrote: After some chat during the KVMForum I've been already thinking it could be beneficial for some usage to give userland the information about the fault being read or write ...I wonder if that would let us replace the current nasty mess we use in linux-user to detect read vs write faults (which uses a bunch of architecture-specific hacks including in some cases look at the insn that triggered this SEGV and decode it to see if it was a load or a store; see the various cpu_signal_handler() implementations in user-exec.c). -- PMM -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Xen-devel] [RFC] Hypervisor RNG and enumeration
On 10/29/2014 05:29 PM, Jake Oshins wrote: Just as a point of clarification, the VM Generation ID changes (at least in the Hyper-V implementation) only when the VM may have observed a different future, as when a VM backup is restored, a checkpoint is applied, etc. It does not change during migration, when the VM is suspended or when it is rebooted. I've heard anecdotes from application vendors saying that there is some other hypervisor that actually does change the ID at these moments and they wanted us to us to fix that, until I explained that I only control Hyper-V. This is indeed the only reasonable way you can read the vmgenid spec. Paolo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Fix Penguin Penalty 17th October2014 ( mail-archive.com )
Dear Sir Did your website get hit by Google Penguin update on October 17th 2014? What basically is Google Penguin Update? It is actually a code name for Google algorithm which aims at decreasing your websites search engine rankings that violate Googles guidelines by using black hat SEO techniques to rank your webpage by giving number of spammy links to the page. We are one of those few SEO companies that can help you avoid penalties from Google Updates like Penguin and Panda. Our clients have survived all the previous and present updates with ease. They have never been hit because we use 100% white hat SEO techniques to rank Webpages. Simple thing that we do to keep websites away from any Penguin or Panda penalties is follow Google guidelines and we give Google users the best answers to their queries. If you are looking to increase the quality of your websites and to get more targeted traffic or save your websites from these Google penalties email us back with your interest. We will be glad to serve you and help you grow your business. Regards Vince G SEO Manager ( TOB ) B7 Green Avenue, Amritsar 143001 Punjab NO CLICK in the subject to STOP EMAILS -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support
On 10/27/2014 05:22 PM, Waiman Long wrote: On 10/27/2014 02:04 PM, Peter Zijlstra wrote: On Mon, Oct 27, 2014 at 01:38:20PM -0400, Waiman Long wrote: On 10/24/2014 04:54 AM, Peter Zijlstra wrote: On Thu, Oct 16, 2014 at 02:10:38PM -0400, Waiman Long wrote: Since enabling paravirt spinlock will disable unlock function inlining, a jump label can be added to the unlock function without adding patch sites all over the kernel. But you don't have to. My patches allowed for the inline to remain, again reducing the overhead of enabling PV spinlocks while running on a real machine. Look at: http://lkml.kernel.org/r/20140615130154.213923...@chello.nl In particular this hunk: Index: linux-2.6/arch/x86/kernel/paravirt_patch_64.c === --- linux-2.6.orig/arch/x86/kernel/paravirt_patch_64.c +++ linux-2.6/arch/x86/kernel/paravirt_patch_64.c @@ -22,6 +22,10 @@ DEF_NATIVE(pv_cpu_ops, swapgs, swapgs) DEF_NATIVE(, mov32, mov %edi, %eax); DEF_NATIVE(, mov64, mov %rdi, %rax); +#if defined(CONFIG_PARAVIRT_SPINLOCKS) defined(CONFIG_QUEUE_SPINLOCK) +DEF_NATIVE(pv_lock_ops, queue_unlock, movb $0, (%rdi)); +#endif + unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) { return paravirt_patch_insns(insnbuf, len, @@ -61,6 +65,9 @@ unsigned native_patch(u8 type, u16 clobb PATCH_SITE(pv_cpu_ops, clts); PATCH_SITE(pv_mmu_ops, flush_tlb_single); PATCH_SITE(pv_cpu_ops, wbinvd); +#if defined(CONFIG_PARAVIRT_SPINLOCKS) defined(CONFIG_QUEUE_SPINLOCK) + PATCH_SITE(pv_lock_ops, queue_unlock); +#endif patch_site: ret = paravirt_patch_insns(ibuf, len, start, end); That makes sure to overwrite the callee-saved call to the pv_lock_ops::queue_unlock with the immediate asm movb $0, (%rdi). Therefore you can retain the inlined unlock with hardly (there might be some NOP padding) any overhead at all. On PV it reverts to a callee saved function call. My concern is that spin_unlock() can be called in many places, including loadable kernel modules. 
Can the paravirt_patch_ident_32() function able to patch all of them in reasonable time? How about a kernel module loaded later at run time? modules should be fine, see arch/x86/kernel/module.c:module_finalize() - apply_paravirt(). Also note that the 'default' text is an indirect call into the paravirt ops table which routes to the 'right' function, so even if the text patching would be 'late' calls would 'work' as expected, just slower. Thanks for letting me know about that. I have this concern because your patch didn't change the current configuration of disabling unlock inlining when paravirt_spinlock is enabled. With that, I think it is worthwhile to reduce the performance delta between the PV and non-PV kernel on bare metal. I am sorry that the unlock call sites patching code doesn't work in a virtual guest. Your pvqspinlock patch did an unconditional patching even in a virtual guest. I added check for the paravirt_spinlocks_enabled, but it turned out that some spin_unlock() seemed to be called before paravirt_spinlocks_enabled is set. As a result, some call sites were still patched resulting in missed wake up's and system hang. At this point, I am going to leave out that change from my patch set until we can figure out a better way of doing that. -Longman -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [Qemu-devel] [PATCH v3 5/6] target-i386: Don't enable nested VMX by default
On Wed, Oct 29, 2014 at 06:40:48PM +0100, Andreas Färber wrote: Am 03.10.2014 um 21:39 schrieb Eduardo Habkost: TCG doesn't support VMX, and nested VMX is not enabled by default on the KVM kernel module. So, there's no reason to have VMX enabled by default on the core2duo and coreduo CPU models, today. Even the newer Intel CPU model definitions don't have it enabled. In this case, we need machine-type compat code, as people may be running the older machine-types on hosts that had VMX nesting enabled. Signed-off-by: Eduardo Habkost ehabk...@redhat.com --- hw/i386/pc_piix.c | 2 ++ hw/i386/pc_q35.c | 2 ++ target-i386/cpu.c | 8 3 files changed, 8 insertions(+), 4 deletions(-) [...] diff --git a/target-i386/cpu.c b/target-i386/cpu.c index 1e9fff9..c336003 100644 --- a/target-i386/cpu.c +++ b/target-i386/cpu.c @@ -720,10 +720,10 @@ static X86CPUDefinition builtin_x86_defs[] = { CPUID_MTRR | CPUID_CLFLUSH | CPUID_MCA | CPUID_PSE36 | CPUID_VME | CPUID_ACPI | CPUID_SS, /* Missing: CPUID_EXT_DTES64, CPUID_EXT_DSCPL, CPUID_EXT_EST, - * CPUID_EXT_TM2, CPUID_EXT_XTPR, CPUID_EXT_PDCM */ + * CPUID_EXT_TM2, CPUID_EXT_XTPR, CPUID_EXT_PDCM, CPUID_EXT_VMX */ .features[FEAT_1_ECX] = CPUID_EXT_SSE3 | CPUID_EXT_MONITOR | CPUID_EXT_SSSE3 | -CPUID_EXT_VMX | CPUID_EXT_CX16, +CPUID_EXT_CX16, .features[FEAT_8000_0001_EDX] = CPUID_EXT2_LM | CPUID_EXT2_SYSCALL | CPUID_EXT2_NX, .features[FEAT_8000_0001_ECX] = [snip] Here I'm less certain what the best approach is. As you point out, there's an inconsistency that I agree should be fixed. I wonder however whether an approach similar to 3/6 for KVM only would be better? I.e., have VMX as a sometimes-KVM-supported feature be listed in the model and filter it out for accel=kvm so that -cpu enforce works, but let accel=tcg fail with features not implemented. I don't mind either way, this is up for TCG maintainers. But I don't understand what would be the benefit in keeping it enabled by default for TCG. 
Fortunately the answers for KVM and TCG are completely independent from each other, now. We just need to follow this depending on the defaults we choose: | Default on | | TCG | KVM | Implementation +--+-+- A | NO | NO | Unset it on CPU model table (builtin_x86_defs) B | NO | YES | Unset on table, set on kvm_default_features C | YES | NO | Set on table, set on kvm_default_unset_features D | YES | YES | Set it on CPU model table +--+-+- This patch implements row A. If you tell me you really want VMX to be enabled by default on TCG mode, then we can implement row C like we already did for CPUID_ACPI, CPUID_EXT_MONITOR, and CPUID_EXT3_SVM. But, are you really sure you do? Considering accel=tcg behavior only (ignore KVM by now): why is this different from all the features listed on patch 1/6? VMX was never supported by TCG, so (like all the features from 1/6) it is not useful to have it enabled by default when accel=tcg. -- Eduardo -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v13 00/11] qspinlock: a 4-byte queue spinlock with PV support
v12-v13: - Change patch 9 to generate separate versions of the queue_spin_lock_slowpath functions for bare metal and PV guest. This reduces the performance impact of the PV code on bare metal systems. v11-v12: - Based on PeterZ's version of the qspinlock patch (https://lkml.org/lkml/2014/6/15/63). - Incorporated many of the review comments from Konrad Wilk and Paolo Bonzini. - The pvqspinlock code is largely from my previous version with PeterZ's way of going from queue tail to head and his idea of using callee saved calls to KVM and XEN codes. v10-v11: - Use a simple test-and-set unfair lock to simplify the code, but performance may suffer a bit for large guest with many CPUs. - Take out Raghavendra KT's test results as the unfair lock changes may render some of his results invalid. - Add PV support without increasing the size of the core queue node structure. - Other minor changes to address some of the feedback comments. v9-v10: - Make some minor changes to qspinlock.c to accommodate review feedback. - Change author to PeterZ for 2 of the patches. - Include Raghavendra KT's test results in patch 18. v8-v9: - Integrate PeterZ's version of the queue spinlock patch with some modification: http://lkml.kernel.org/r/20140310154236.038181...@infradead.org - Break the more complex patches into smaller ones to ease review effort. - Fix a racing condition in the PV qspinlock code. v7-v8: - Remove one unneeded atomic operation from the slowpath, thus improving performance. - Simplify some of the codes and add more comments. - Test for X86_FEATURE_HYPERVISOR CPU feature bit to enable/disable unfair lock. - Reduce unfair lock slowpath lock stealing frequency depending on its distance from the queue head. - Add performance data for IvyBridge-EX CPU. v6-v7: - Remove an atomic operation from the 2-task contending code - Shorten the names of some macros - Make the queue waiter to attempt to steal lock when unfair lock is enabled. 
- Remove lock holder kick from the PV code and fix a race condition - Run the unfair lock PV code on overcommitted KVM guests to collect performance data. v5-v6: - Change the optimized 2-task contending code to make it fairer at the expense of a bit of performance. - Add a patch to support unfair queue spinlock for Xen. - Modify the PV qspinlock code to follow what was done in the PV ticketlock. - Add performance data for the unfair lock as well as the PV support code. v4-v5: - Move the optimized 2-task contending code to the generic file to enable more architectures to use it without code duplication. - Address some of the style-related comments by PeterZ. - Allow the use of unfair queue spinlock in a real para-virtualized execution environment. - Add para-virtualization support to the qspinlock code by ensuring that the lock holder and queue head stay alive as much as possible. v3-v4: - Remove debugging code and fix a configuration error - Simplify the qspinlock structure and streamline the code to make it perform a bit better - Add an x86 version of asm/qspinlock.h for holding x86 specific optimization. - Add an optimized x86 code path for 2 contending tasks to improve low contention performance. v2-v3: - Simplify the code by using numerous mode only without an unfair option. - Use the latest smp_load_acquire()/smp_store_release() barriers. - Move the queue spinlock code to kernel/locking. - Make the use of queue spinlock the default for x86-64 without user configuration. - Additional performance tuning. v1-v2: - Add some more comments to document what the code does. - Add a numerous CPU mode to support = 16K CPUs - Add a configuration option to allow lock stealing which can further improve performance in many cases. - Enable wakeup of queue head CPU at unlock time for non-numerous CPU mode. 
This patch set has 3 different sections: 1) Patches 1-6: Introduces a queue-based spinlock implementation that can replace the default ticket spinlock without increasing the size of the spinlock data structure. As a result, critical kernel data structures that embed spinlock won't increase in size and break data alignments. 2) Patch 7: Enables the use of unfair lock in a virtual guest. This can resolve some of the locking related performance issues due to the fact that the next CPU to get the lock may have been scheduled out for a period of time. 3) Patches 8-11: Enable qspinlock para-virtualization support by halting the waiting CPUs after spinning for a certain amount of time. The unlock code will detect a sleeping waiter and wake it up. This is essentially the same logic as the PV ticketlock code. The queue spinlock has slightly better performance than the ticket spinlock in the uncontended case. Its performance can be much better with moderate to heavy contention. This patch has the
[PATCH v13 10/11] pvqspinlock, x86: Enable PV qspinlock for KVM
This patch adds the necessary KVM specific code to allow KVM to support the CPU halting and kicking operations needed by the queue spinlock PV code. Two KVM guests of 20 CPU cores (2 nodes) were created for performance testing in one of the following three configurations: 1) Only 1 VM is active 2) Both VMs are active and they share the same 20 physical CPUs (200% overcommit) The tests run included the disk workload of the AIM7 benchmark on both ext4 and xfs RAM disks at 3000 users on a 3.17 based kernel. The ebizzy -m test and futextest was was also run and its performance data were recorded. With two VMs running, the idle=poll kernel option was added to simulate a busy guest. If PV qspinlock is not enabled, unfairlock will be used automically in a guest. AIM7 XFS Disk Test (no overcommit) kernel JPMReal Time Sys TimeUsr Time - ---- PV ticketlock 25423737.08 98.95 5.44 PV qspinlock 25495757.06 98.63 5.40 unfairlock26162796.91 97.05 5.42 AIM7 XFS Disk Test (200% overcommit) kernel JPMReal Time Sys TimeUsr Time - ---- PV ticketlock 64446827.93 415.22 6.33 PV qspinlock 64562427.88 419.84 0.39 unfairlock69551825.88 377.40 4.09 AIM7 EXT4 Disk Test (no overcommit) kernel JPMReal Time Sys TimeUsr Time - ---- PV ticketlock 19955659.02 103.67 5.76 PV qspinlock 20111738.95 102.15 5.40 unfairlock20665908.71 98.13 5.46 AIM7 EXT4 Disk Test (200% overcommit) kernel JPMReal Time Sys TimeUsr Time - ---- PV ticketlock 47834137.63 495.81 30.78 PV qspinlock 47405837.97 475.74 30.95 unfairlock56022432.13 398.43 26.27 For the AIM7 disk workload, both PV ticketlock and qspinlock have about the same performance. The unfairlock performs slightly better than the PV lock. 
EBIZZY-m Test (no overcommit) kernelRec/s Real Time Sys TimeUsr Time - - - PV ticketlock 3255 10.00 60.65 3.62 PV qspinlock 3318 10.00 54.27 3.60 unfairlock2833 10.00 26.66 3.09 EBIZZY-m Test (200% overcommit) kernelRec/s Real Time Sys TimeUsr Time - - - PV ticketlock 841 10.00 71.03 2.37 PV qspinlock 834 10.00 68.27 2.39 unfairlock 865 10.00 27.08 1.51 futextest (no overcommit) kernel kops/s --- PV ticketlock11523 PV qspinlock 12328 unfairlock9478 futextest (200% overcommit) kernel kops/s --- PV ticketlock 7276 PV qspinlock 7095 unfairlock5614 The ebizzy and futextest have much higher spinlock contention than the AIM7 disk workload. In this case, the unfairlock performs worse than both the PV ticketlock and qspinlock. The performance of the 2 PV locks are comparable. Signed-off-by: Waiman Long waiman.l...@hp.com --- arch/x86/kernel/kvm.c | 138 - kernel/Kconfig.locks |2 +- 2 files changed, 138 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index eaa064c..1183645 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -568,7 +568,7 @@ arch_initcall(activate_jump_labels); #ifdef CONFIG_PARAVIRT_SPINLOCKS /* Kick a cpu by its apicid. Used to wake up a halted vcpu */ -static void kvm_kick_cpu(int cpu) +void kvm_kick_cpu(int cpu) { int apicid; unsigned long flags = 0; @@ -576,7 +576,9 @@ static void kvm_kick_cpu(int cpu) apicid = per_cpu(x86_cpu_to_apicid, cpu); kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid); } +PV_CALLEE_SAVE_REGS_THUNK(kvm_kick_cpu); +#ifndef CONFIG_QUEUE_SPINLOCK enum kvm_contention_stat { TAKEN_SLOW, TAKEN_SLOW_PICKUP, @@ -804,6 +806,132 @@ static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket) } } } +#else /* !CONFIG_QUEUE_SPINLOCK */ + +#ifdef CONFIG_KVM_DEBUG_FS +static struct dentry *d_spin_debug; +static struct dentry *d_kvm_debug; +static u32 kick_nohlt_stats; /* Kick but
[PATCH v13 11/11] pvqspinlock, x86: Enable PV qspinlock for XEN
This patch adds the necessary XEN specific code to allow XEN to support the CPU halting and kicking operations needed by the queue spinlock PV code. Signed-off-by: Waiman Long waiman.l...@hp.com --- arch/x86/xen/spinlock.c | 149 +-- kernel/Kconfig.locks|2 +- 2 files changed, 145 insertions(+), 6 deletions(-) diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index d332ae0..401d7a6 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -17,6 +17,12 @@ #include xen-ops.h #include debugfs.h +static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; +static DEFINE_PER_CPU(char *, irq_name); +static bool xen_pvspin = true; + +#ifndef CONFIG_QUEUE_SPINLOCK + enum xen_contention_stat { TAKEN_SLOW, TAKEN_SLOW_PICKUP, @@ -100,12 +106,9 @@ struct xen_lock_waiting { __ticket_t want; }; -static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; -static DEFINE_PER_CPU(char *, irq_name); static DEFINE_PER_CPU(struct xen_lock_waiting, lock_waiting); static cpumask_t waiting_cpus; -static bool xen_pvspin = true; __visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want) { int irq = __this_cpu_read(lock_kicker_irq); @@ -213,6 +216,118 @@ static void xen_unlock_kick(struct arch_spinlock *lock, __ticket_t next) } } +#else /* CONFIG_QUEUE_SPINLOCK */ + +#ifdef CONFIG_XEN_DEBUG_FS +static u32 kick_nohlt_stats; /* Kick but not halt count */ +static u32 halt_qhead_stats; /* Queue head halting count */ +static u32 halt_qnode_stats; /* Queue node halting count */ +static u32 halt_abort_stats; /* Halting abort count */ +static u32 wake_kick_stats;/* Wakeup by kicking count */ +static u32 wake_spur_stats;/* Spurious wakeup count*/ +static u64 time_blocked; /* Total blocking time */ + +static inline void xen_halt_stats(enum pv_lock_stats type) +{ + if (type == PV_HALT_QHEAD) + add_smp(halt_qhead_stats, 1); + else if (type == PV_HALT_QNODE) + add_smp(halt_qnode_stats, 1); + else /* type == PV_HALT_ABORT */ + add_smp(halt_abort_stats, 1); +} + +void 
xen_lock_stats(enum pv_lock_stats type) +{ + if (type == PV_WAKE_KICKED) + add_smp(wake_kick_stats, 1); + else if (type == PV_WAKE_SPURIOUS) + add_smp(wake_spur_stats, 1); + else /* type == PV_KICK_NOHALT */ + add_smp(kick_nohlt_stats, 1); +} +PV_CALLEE_SAVE_REGS_THUNK(xen_lock_stats); + +static inline u64 spin_time_start(void) +{ + return sched_clock(); +} + +static inline void spin_time_accum_blocked(u64 start) +{ + u64 delta; + + delta = sched_clock() - start; + add_smp(time_blocked, delta); +} +#else /* CONFIG_XEN_DEBUG_FS */ +static inline void xen_halt_stats(enum pv_lock_stats type) +{ +} + +static inline u64 spin_time_start(void) +{ + return 0; +} + +static inline void spin_time_accum_blocked(u64 start) +{ +} +#endif /* CONFIG_XEN_DEBUG_FS */ + +void xen_kick_cpu(int cpu) +{ + xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR); +} +PV_CALLEE_SAVE_REGS_THUNK(xen_kick_cpu); + +/* + * Halt the current CPU release it back to the host + */ +void xen_halt_cpu(u8 *lockbyte) +{ + int irq = __this_cpu_read(lock_kicker_irq); + unsigned long flags; + u64 start; + + /* If kicker interrupts not initialized yet, just spin */ + if (irq == -1) + return; + + /* +* Make sure an interrupt handler can't upset things in a +* partially setup state. +*/ + local_irq_save(flags); + start = spin_time_start(); + + xen_halt_stats(lockbyte ? PV_HALT_QHEAD : PV_HALT_QNODE); + /* clear pending */ + xen_clear_irq_pending(irq); + + /* Allow interrupts while blocked */ + local_irq_restore(flags); + /* +* Don't halt if the lock is now available +*/ + if (lockbyte !ACCESS_ONCE(*lockbyte)) { + xen_halt_stats(PV_HALT_ABORT); + return; + } + /* +* If an interrupt happens here, it will leave the wakeup irq +* pending, which will cause xen_poll_irq() to return +* immediately. 
+*/ + + /* Block until irq becomes pending (or perhaps a spurious wakeup) */ + xen_poll_irq(irq); + spin_time_accum_blocked(start); +} +PV_CALLEE_SAVE_REGS_THUNK(xen_halt_cpu); + +#endif /* CONFIG_QUEUE_SPINLOCK */ + static irqreturn_t dummy_handler(int irq, void *dev_id) { BUG(); @@ -258,7 +373,6 @@ void xen_uninit_lock_cpu(int cpu) per_cpu(irq_name, cpu) = NULL; } - /* * Our init of PV spinlocks is split in two init functions due to us * using paravirt patching and jump labels patching and having to do @@ -275,8 +389,17 @@ void __init xen_init_spinlocks(void)
[PATCH v13 09/11] pvqspinlock, x86: Add para-virtualization support
This patch adds para-virtualization support to the queue spinlock code base with minimal impact to the native case. There are some minor code changes in the generic qspinlock.c file which should be usable in other architectures. The other code changes are specific to x86 processors and so are all put under the arch/x86 directory. On the lock side, the slowpath code is split into 2 separate functions generated from the same code - one for bare metal and one for PV guest. The switching is done in the _raw_spin_lock* functions. This makes sure that the performance impact to the bare metal case is minimal, just a few NOPs in the _raw_spin_lock* functions. In the PV slowpath code, there are 2 paravirt callee saved calls that minimize register pressure. On the unlock side, however, the disabling of unlock function inlining does have some slight impact on bare metal performance. The actual paravirt code comes in 5 parts; - init_node; this initializes the extra data members required for PV state. PV state data is kept 1 cacheline ahead of the regular data. - link_and_wait_node; this replaces the regular MCS queuing code. CPU halting can happen if the wait is too long. - wait_head; this waits until the lock is available and the CPU will be halted if the wait is too long. - wait_check; this is called after acquiring the lock to see if the next queue head CPU is halted. If this is the case, the lock bit is changed to indicate the queue head will have to be kicked on unlock. - queue_unlock; this routine has a jump label to check if paravirt is enabled. If yes, it has to do an atomic cmpxchg to clear the lock bit or call the slowpath function to kick the queue head cpu. Tracking the head is done in two parts, firstly the pv_wait_head will store its cpu number in whichever node is pointed to by the tail part of the lock word. Secondly, pv_link_and_wait_node() will propagate the existing head from the old to the new tail node.
Signed-off-by: Waiman Long waiman.l...@hp.com --- arch/x86/include/asm/paravirt.h | 19 ++ arch/x86/include/asm/paravirt_types.h | 20 ++ arch/x86/include/asm/pvqspinlock.h| 411 + arch/x86/include/asm/qspinlock.h | 71 ++- arch/x86/kernel/paravirt-spinlocks.c |6 + include/asm-generic/qspinlock.h |2 + kernel/locking/qspinlock.c| 69 +- 7 files changed, 591 insertions(+), 7 deletions(-) create mode 100644 arch/x86/include/asm/pvqspinlock.h diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index cd6e161..7e296e6 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -712,6 +712,24 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, #if defined(CONFIG_SMP) defined(CONFIG_PARAVIRT_SPINLOCKS) +#ifdef CONFIG_QUEUE_SPINLOCK + +static __always_inline void pv_kick_cpu(int cpu) +{ + PVOP_VCALLEE1(pv_lock_ops.kick_cpu, cpu); +} + +static __always_inline void pv_lockwait(u8 *lockbyte) +{ + PVOP_VCALLEE1(pv_lock_ops.lockwait, lockbyte); +} + +static __always_inline void pv_lockstat(enum pv_lock_stats type) +{ + PVOP_VCALLEE1(pv_lock_ops.lockstat, type); +} + +#else static __always_inline void __ticket_lock_spinning(struct arch_spinlock *lock, __ticket_t ticket) { @@ -723,6 +741,7 @@ static __always_inline void __ticket_unlock_kick(struct arch_spinlock *lock, { PVOP_VCALL2(pv_lock_ops.unlock_kick, lock, ticket); } +#endif #endif diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 7549b8b..49e4b76 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -326,6 +326,9 @@ struct pv_mmu_ops { phys_addr_t phys, pgprot_t flags); }; +struct mcs_spinlock; +struct qspinlock; + struct arch_spinlock; #ifdef CONFIG_SMP #include asm/spinlock_types.h @@ -333,9 +336,26 @@ struct arch_spinlock; typedef u16 __ticket_t; #endif +#ifdef CONFIG_QUEUE_SPINLOCK +enum pv_lock_stats { + PV_HALT_QHEAD, /* Queue head halting */ + 
PV_HALT_QNODE, /* Other queue node halting */ + PV_HALT_ABORT, /* Halting aborted */ + PV_WAKE_KICKED, /* Wakeup by kicking*/ + PV_WAKE_SPURIOUS, /* Spurious wakeup */ + PV_KICK_NOHALT /* Kick but CPU not halted */ +}; +#endif + struct pv_lock_ops { +#ifdef CONFIG_QUEUE_SPINLOCK + struct paravirt_callee_save kick_cpu; + struct paravirt_callee_save lockstat; + struct paravirt_callee_save lockwait; +#else struct paravirt_callee_save lock_spinning; void (*unlock_kick)(struct arch_spinlock *lock, __ticket_t ticket); +#endif }; /* This contains all the paravirt structures: we get a convenient diff --git
[PATCH v13 08/11] qspinlock, x86: Rename paravirt_ticketlocks_enabled
This patch renames the paravirt_ticketlocks_enabled static key to a more generic paravirt_spinlocks_enabled name. Signed-off-by: Waiman Long waiman.l...@hp.com Signed-off-by: Peter Zijlstra pet...@infradead.org --- arch/x86/include/asm/spinlock.h |4 ++-- arch/x86/kernel/kvm.c|2 +- arch/x86/kernel/paravirt-spinlocks.c |4 ++-- arch/x86/xen/spinlock.c |2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 5899483..928751e 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -39,7 +39,7 @@ /* How long a lock should spin before we consider blocking */ #define SPIN_THRESHOLD (1 15) -extern struct static_key paravirt_ticketlocks_enabled; +extern struct static_key paravirt_spinlocks_enabled; static __always_inline bool static_key_false(struct static_key *key); #ifdef CONFIG_QUEUE_SPINLOCK @@ -150,7 +150,7 @@ static inline void __ticket_unlock_slowpath(arch_spinlock_t *lock, static __always_inline void arch_spin_unlock(arch_spinlock_t *lock) { if (TICKET_SLOWPATH_FLAG - static_key_false(paravirt_ticketlocks_enabled)) { + static_key_false(paravirt_spinlocks_enabled)) { arch_spinlock_t prev; prev = *lock; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index f6945be..eaa064c 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -827,7 +827,7 @@ static __init int kvm_spinlock_init_jump(void) if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) return 0; - static_key_slow_inc(paravirt_ticketlocks_enabled); + static_key_slow_inc(paravirt_spinlocks_enabled); printk(KERN_INFO KVM setup paravirtual spinlock\n); return 0; diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index bbb6c73..e434f24 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -16,5 +16,5 @@ struct pv_lock_ops pv_lock_ops = { }; EXPORT_SYMBOL(pv_lock_ops); -struct static_key 
paravirt_ticketlocks_enabled = STATIC_KEY_INIT_FALSE; -EXPORT_SYMBOL(paravirt_ticketlocks_enabled); +struct static_key paravirt_spinlocks_enabled = STATIC_KEY_INIT_FALSE; +EXPORT_SYMBOL(paravirt_spinlocks_enabled); diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 23b45eb..d332ae0 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -293,7 +293,7 @@ static __init int xen_init_spinlocks_jump(void) if (!xen_domain()) return 0; - static_key_slow_inc(paravirt_ticketlocks_enabled); + static_key_slow_inc(paravirt_spinlocks_enabled); return 0; } early_initcall(xen_init_spinlocks_jump); -- 1.7.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v13 04/11] qspinlock: Extract out code snippets for the next patch
This is a preparatory patch that extracts out the following 2 code snippets to prepare for the next performance optimization patch. 1) the logic for the exchange of new and previous tail code words into a new xchg_tail() function. 2) the logic for clearing the pending bit and setting the locked bit into a new clear_pending_set_locked() function. This patch also simplifies the trylock operation before queuing by calling queue_spin_trylock() directly. Signed-off-by: Waiman Long waiman.l...@hp.com Signed-off-by: Peter Zijlstra pet...@infradead.org --- include/asm-generic/qspinlock_types.h |2 + kernel/locking/qspinlock.c| 91 +--- 2 files changed, 62 insertions(+), 31 deletions(-) diff --git a/include/asm-generic/qspinlock_types.h b/include/asm-generic/qspinlock_types.h index 4196694..88d647c 100644 --- a/include/asm-generic/qspinlock_types.h +++ b/include/asm-generic/qspinlock_types.h @@ -58,6 +58,8 @@ typedef struct qspinlock { #define _Q_TAIL_CPU_BITS (32 - _Q_TAIL_CPU_OFFSET) #define _Q_TAIL_CPU_MASK _Q_SET_MASK(TAIL_CPU) +#define _Q_TAIL_MASK (_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK) + #define _Q_LOCKED_VAL (1U _Q_LOCKED_OFFSET) #define _Q_PENDING_VAL (1U _Q_PENDING_OFFSET) diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 226b11d..48bd2ad 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -95,6 +95,54 @@ static inline struct mcs_spinlock *decode_tail(u32 tail) #define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) /** + * clear_pending_set_locked - take ownership and clear the pending bit. 
+ * @lock: Pointer to queue spinlock structure + * @val : Current value of the queue spinlock 32-bit word + * + * *,1,0 - *,0,1 + */ +static __always_inline void +clear_pending_set_locked(struct qspinlock *lock, u32 val) +{ + u32 new, old; + + for (;;) { + new = (val ~_Q_PENDING_MASK) | _Q_LOCKED_VAL; + + old = atomic_cmpxchg(lock-val, val, new); + if (old == val) + break; + + val = old; + } +} + +/** + * xchg_tail - Put in the new queue tail code word retrieve previous one + * @lock : Pointer to queue spinlock structure + * @tail : The new queue tail code word + * Return: The previous queue tail code word + * + * xchg(lock, tail) + * + * p,*,* - n,*,* ; prev = xchg(lock, node) + */ +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) +{ + u32 old, new, val = atomic_read(lock-val); + + for (;;) { + new = (val _Q_LOCKED_PENDING_MASK) | tail; + old = atomic_cmpxchg(lock-val, val, new); + if (old == val) + break; + + val = old; + } + return old; +} + +/** * queue_spin_lock_slowpath - acquire the queue spinlock * @lock: Pointer to queue spinlock structure * @val: Current value of the queue spinlock 32-bit word @@ -176,15 +224,7 @@ void queue_spin_lock_slowpath(struct qspinlock *lock, u32 val) * * *,1,0 - *,0,1 */ - for (;;) { - new = (val ~_Q_PENDING_MASK) | _Q_LOCKED_VAL; - - old = atomic_cmpxchg(lock-val, val, new); - if (old == val) - break; - - val = old; - } + clear_pending_set_locked(lock, val); return; /* @@ -201,37 +241,26 @@ queue: node-next = NULL; /* -* We have already touched the queueing cacheline; don't bother with -* pending stuff. -* -* trylock || xchg(lock, node) -* -* 0,0,0 - 0,0,1 ; no tail, not locked - no tail, locked. -* p,y,x - n,y,x ; tail was p - tail is n; preserving locked. +* We touched a (possibly) cold cacheline in the per-cpu queue node; +* attempt the trylock once more in the hope someone let go while we +* weren't watching. 
*/ - for (;;) { - new = _Q_LOCKED_VAL; - if (val) - new = tail | (val _Q_LOCKED_PENDING_MASK); - - old = atomic_cmpxchg(lock-val, val, new); - if (old == val) - break; - - val = old; - } + if (queue_spin_trylock(lock)) + goto release; /* -* we won the trylock; forget about queueing. +* We have already touched the queueing cacheline; don't bother with +* pending stuff. +* +* p,*,* - n,*,* */ - if (new == _Q_LOCKED_VAL) - goto release; + old = xchg_tail(lock, tail); /* * if there was a previous node; link it and wait until reaching the * head of the waitqueue. */ - if (old ~_Q_LOCKED_PENDING_MASK) { + if (old _Q_TAIL_MASK) { prev =
[PATCH v13 07/11] qspinlock: Revert to test-and-set on hypervisors
From: Peter Zijlstra pet...@infradead.org When we detect a hypervisor (!paravirt, see qspinlock paravirt support patches), revert to a simple test-and-set lock to avoid the horrors of queue preemption. Signed-off-by: Peter Zijlstra pet...@infradead.org Signed-off-by: Waiman Long waiman.l...@hp.com --- arch/x86/include/asm/qspinlock.h | 14 ++ include/asm-generic/qspinlock.h |7 +++ kernel/locking/qspinlock.c |3 +++ 3 files changed, 24 insertions(+), 0 deletions(-) diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h index a6a8762..05a77fe 100644 --- a/arch/x86/include/asm/qspinlock.h +++ b/arch/x86/include/asm/qspinlock.h @@ -1,6 +1,7 @@ #ifndef _ASM_X86_QSPINLOCK_H #define _ASM_X86_QSPINLOCK_H +#include asm/cpufeature.h #include asm-generic/qspinlock_types.h #ifndef CONFIG_X86_PPRO_FENCE @@ -20,6 +21,19 @@ static inline void queue_spin_unlock(struct qspinlock *lock) #endif /* !CONFIG_X86_PPRO_FENCE */ +#define virt_queue_spin_lock virt_queue_spin_lock + +static inline bool virt_queue_spin_lock(struct qspinlock *lock) +{ + if (!static_cpu_has(X86_FEATURE_HYPERVISOR)) + return false; + + while (atomic_cmpxchg(lock-val, 0, _Q_LOCKED_VAL) != 0) + cpu_relax(); + + return true; +} + #include asm-generic/qspinlock.h #endif /* _ASM_X86_QSPINLOCK_H */ diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h index e8a7ae8..a53a7bb 100644 --- a/include/asm-generic/qspinlock.h +++ b/include/asm-generic/qspinlock.h @@ -98,6 +98,13 @@ static __always_inline void queue_spin_unlock(struct qspinlock *lock) } #endif +#ifndef virt_queue_spin_lock +static __always_inline bool virt_queue_spin_lock(struct qspinlock *lock) +{ + return false; +} +#endif + /* * Initializier */ diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index fb0e988..1c1926a 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -257,6 +257,9 @@ void queue_spin_lock_slowpath(struct qspinlock *lock, u32 val) 
BUILD_BUG_ON(CONFIG_NR_CPUS = (1U _Q_TAIL_CPU_BITS)); + if (virt_queue_spin_lock(lock)) + return; + /* * wait for in-progress pending-locked hand-overs * -- 1.7.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v13 05/11] qspinlock: Optimize for smaller NR_CPUS
From: Peter Zijlstra pet...@infradead.org When we allow for a max NR_CPUS 2^14 we can optimize the pending wait-acquire and the xchg_tail() operations. By growing the pending bit to a byte, we reduce the tail to 16bit. This means we can use xchg16 for the tail part and do away with all the repeated compxchg() operations. This in turn allows us to unconditionally acquire; the locked state as observed by the wait loops cannot change. And because both locked and pending are now a full byte we can use simple stores for the state transition, obviating one atomic operation entirely. This optimization is needed to make the qspinlock achieve performance parity with ticket spinlock at light load. All this is horribly broken on Alpha pre EV56 (and any other arch that cannot do single-copy atomic byte stores). Signed-off-by: Peter Zijlstra pet...@infradead.org Signed-off-by: Waiman Long waiman.l...@hp.com --- include/asm-generic/qspinlock_types.h | 13 ++ kernel/locking/qspinlock.c| 71 - 2 files changed, 83 insertions(+), 1 deletions(-) diff --git a/include/asm-generic/qspinlock_types.h b/include/asm-generic/qspinlock_types.h index 88d647c..01b46df 100644 --- a/include/asm-generic/qspinlock_types.h +++ b/include/asm-generic/qspinlock_types.h @@ -35,6 +35,14 @@ typedef struct qspinlock { /* * Bitfields in the atomic value: * + * When NR_CPUS 16K + * 0- 7: locked byte + * 8: pending + * 9-15: not used + * 16-17: tail index + * 18-31: tail cpu (+1) + * + * When NR_CPUS = 16K * 0- 7: locked byte * 8: pending * 9-10: tail index @@ -47,7 +55,11 @@ typedef struct qspinlock { #define _Q_LOCKED_MASK _Q_SET_MASK(LOCKED) #define _Q_PENDING_OFFSET (_Q_LOCKED_OFFSET + _Q_LOCKED_BITS) +#if CONFIG_NR_CPUS (1U 14) +#define _Q_PENDING_BITS8 +#else #define _Q_PENDING_BITS1 +#endif #define _Q_PENDING_MASK_Q_SET_MASK(PENDING) #define _Q_TAIL_IDX_OFFSET (_Q_PENDING_OFFSET + _Q_PENDING_BITS) @@ -58,6 +70,7 @@ typedef struct qspinlock { #define _Q_TAIL_CPU_BITS (32 - _Q_TAIL_CPU_OFFSET) #define 
_Q_TAIL_CPU_MASK _Q_SET_MASK(TAIL_CPU) +#define _Q_TAIL_OFFSET _Q_TAIL_IDX_OFFSET #define _Q_TAIL_MASK (_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK) #define _Q_LOCKED_VAL (1U _Q_LOCKED_OFFSET) diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 48bd2ad..7c127b4 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -22,6 +22,7 @@ #include linux/percpu.h #include linux/hardirq.h #include linux/mutex.h +#include asm/byteorder.h #include asm/qspinlock.h /* @@ -54,6 +55,10 @@ * node; whereby avoiding the need to carry a node from lock to unlock, and * preserving existing lock API. This also makes the unlock code simpler and * faster. + * + * N.B. The current implementation only supports architectures that allow + * atomic operations on smaller 8-bit and 16-bit data types. + * */ #include mcs_spinlock.h @@ -94,6 +99,64 @@ static inline struct mcs_spinlock *decode_tail(u32 tail) #define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) +/* + * By using the whole 2nd least significant byte for the pending bit, we + * can allow better optimization of the lock acquisition for the pending + * bit holder. + */ +#if _Q_PENDING_BITS == 8 + +struct __qspinlock { + union { + atomic_t val; + struct { +#ifdef __LITTLE_ENDIAN + u16 locked_pending; + u16 tail; +#else + u16 tail; + u16 locked_pending; +#endif + }; + }; +}; + +/** + * clear_pending_set_locked - take ownership and clear the pending bit. + * @lock: Pointer to queue spinlock structure + * @val : Current value of the queue spinlock 32-bit word + * + * *,1,0 - *,0,1 + * + * Lock stealing is not allowed if this function is used. 
+ */ +static __always_inline void +clear_pending_set_locked(struct qspinlock *lock, u32 val) +{ + struct __qspinlock *l = (void *)lock; + + ACCESS_ONCE(l-locked_pending) = _Q_LOCKED_VAL; +} + +/* + * xchg_tail - Put in the new queue tail code word retrieve previous one + * @lock : Pointer to queue spinlock structure + * @tail : The new queue tail code word + * Return: The previous queue tail code word + * + * xchg(lock, tail) + * + * p,*,* - n,*,* ; prev = xchg(lock, node) + */ +static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) +{ + struct __qspinlock *l = (void *)lock; + + return (u32)xchg(l-tail, tail _Q_TAIL_OFFSET) _Q_TAIL_OFFSET; +} + +#else /* _Q_PENDING_BITS == 8 */ + /** * clear_pending_set_locked - take ownership and clear the pending bit. * @lock: Pointer to queue spinlock structure @@ -141,6 +204,7 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) }
[PATCH v13 06/11] qspinlock: Use a simple write to grab the lock
Currently, atomic_cmpxchg() is used to get the lock. However, this is not really necessary if there is more than one task in the queue and the queue head don't need to reset the tail code. For that case, a simple write to set the lock bit is enough as the queue head will be the only one eligible to get the lock as long as it checks that both the lock and pending bits are not set. The current pending bit waiting code will ensure that the bit will not be set as soon as the tail code in the lock is set. With that change, there is some slight improvement in the performance of the queue spinlock in the 5M loop micro-benchmark run on a 4-socket Westmere-EX machine as shown in the tables below. [Standalone/Embedded - same node] # of tasksBefore patchAfter patch %Change ----- -- --- 3 2324/2321 2248/2265-3%/-2% 4 2890/2896 2819/2831-2%/-2% 5 3611/3595 3522/3512-2%/-2% 6 4281/4276 4173/4160-3%/-3% 7 5018/5001 4875/4861-3%/-3% 8 5759/5750 5563/5568-3%/-3% [Standalone/Embedded - different nodes] # of tasksBefore patchAfter patch %Change ----- -- --- 312242/12237 12087/12093 -1%/-1% 410688/10696 10507/10521 -2%/-2% It was also found that this change produced a much bigger performance improvement in the newer IvyBridge-EX chip and essentially closed the performance gap between the ticket spinlock and queue spinlock. The disk workload of the AIM7 benchmark was run on a 4-socket Westmere-EX machine with both ext4 and xfs RAM disks at 3000 users on a 3.14 based kernel. The results of the test runs were: AIM7 XFS Disk Test kernel JPMReal Time Sys TimeUsr Time - ---- ticketlock56782333.17 96.61 5.81 qspinlock 57507993.13 94.83 5.97 AIM7 EXT4 Disk Test kernel JPMReal Time Sys TimeUsr Time - ---- ticketlock1114551 16.15 509.72 7.11 qspinlock 21844668.24 232.99 6.01 The ext4 filesystem run had a much higher spinlock contention than the xfs filesystem run. 
The ebizzy -m test was also run with the following results: kernel records/s Real Time Sys TimeUsr Time -- - ticketlock 2075 10.00 216.35 3.49 qspinlock 3023 10.00 198.20 4.80 Signed-off-by: Waiman Long waiman.l...@hp.com Signed-off-by: Peter Zijlstra pet...@infradead.org --- kernel/locking/qspinlock.c | 59 1 files changed, 43 insertions(+), 16 deletions(-) diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 7c127b4..fb0e988 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -103,24 +103,33 @@ static inline struct mcs_spinlock *decode_tail(u32 tail) * By using the whole 2nd least significant byte for the pending bit, we * can allow better optimization of the lock acquisition for the pending * bit holder. + * + * This internal structure is also used by the set_locked function which + * is not restricted to _Q_PENDING_BITS == 8. */ -#if _Q_PENDING_BITS == 8 - struct __qspinlock { union { atomic_t val; - struct { #ifdef __LITTLE_ENDIAN + u8 locked; + struct { u16 locked_pending; u16 tail; + }; #else + struct { u16 tail; u16 locked_pending; -#endif }; + struct { + u8 reserved[3]; + u8 locked; + }; +#endif }; }; +#if _Q_PENDING_BITS == 8 /** * clear_pending_set_locked - take ownership and clear the pending bit. * @lock: Pointer to queue spinlock structure @@ -207,6 +216,19 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) #endif /* _Q_PENDING_BITS == 8 */ /** + * set_locked - Set the lock bit and own the lock + * @lock: Pointer to queue spinlock structure + * + * *,*,0 - *,0,1 + */ +static __always_inline void set_locked(struct qspinlock *lock) +{ + struct __qspinlock *l = (void *)lock; + + ACCESS_ONCE(l-locked) = _Q_LOCKED_VAL; +} + +/** * queue_spin_lock_slowpath - acquire the queue spinlock * @lock: Pointer to queue spinlock structure * @val: Current value of the queue
[PATCH v13 01/11] qspinlock: A simple generic 4-byte queue spinlock
This patch introduces a new generic queue spinlock implementation that can serve as an alternative to the default ticket spinlock. Compared with the ticket spinlock, this queue spinlock should be almost as fair as the ticket spinlock. It has about the same speed in single-thread and it can be much faster in high contention situations especially when the spinlock is embedded within the data structure to be protected. Only in light to moderate contention where the average queue depth is around 1-3 will this queue spinlock be potentially a bit slower due to the higher slowpath overhead. This queue spinlock is especially suited to NUMA machines with a large number of cores as the chance of spinlock contention is much higher in those machines. The cost of contention is also higher because of slower inter-node memory traffic. Due to the fact that spinlocks are acquired with preemption disabled, the process will not be migrated to another CPU while it is trying to get a spinlock. Ignoring interrupt handling, a CPU can only be contending in one spinlock at any one time. Counting soft IRQ, hard IRQ and NMI, a CPU can only have a maximum of 4 concurrent lock waiting activities. By allocating a set of per-cpu queue nodes and using them to form a waiting queue, we can encode the queue node address into a much smaller 24-bit size (including CPU number and queue node index) leaving one byte for the lock. Please note that the queue node is only needed when waiting for the lock. Once the lock is acquired, the queue node can be released to be used later. 
Signed-off-by: Waiman Long waiman.l...@hp.com Signed-off-by: Peter Zijlstra pet...@infradead.org --- include/asm-generic/qspinlock.h | 118 +++ include/asm-generic/qspinlock_types.h | 58 + kernel/Kconfig.locks |7 + kernel/locking/Makefile |1 + kernel/locking/mcs_spinlock.h |1 + kernel/locking/qspinlock.c| 207 + 6 files changed, 392 insertions(+), 0 deletions(-) create mode 100644 include/asm-generic/qspinlock.h create mode 100644 include/asm-generic/qspinlock_types.h create mode 100644 kernel/locking/qspinlock.c diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h new file mode 100644 index 000..e8a7ae8 --- /dev/null +++ b/include/asm-generic/qspinlock.h @@ -0,0 +1,118 @@ +/* + * Queue spinlock + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P. + * + * Authors: Waiman Long waiman.l...@hp.com + */ +#ifndef __ASM_GENERIC_QSPINLOCK_H +#define __ASM_GENERIC_QSPINLOCK_H + +#include asm-generic/qspinlock_types.h + +/** + * queue_spin_is_locked - is the spinlock locked? + * @lock: Pointer to queue spinlock structure + * Return: 1 if it is locked, 0 otherwise + */ +static __always_inline int queue_spin_is_locked(struct qspinlock *lock) +{ + return atomic_read(lock-val); +} + +/** + * queue_spin_value_unlocked - is the spinlock structure unlocked? + * @lock: queue spinlock structure + * Return: 1 if it is unlocked, 0 otherwise + * + * N.B. 
Whenever there are tasks waiting for the lock, it is considered + * locked wrt the lockref code to avoid lock stealing by the lockref + * code and change things underneath the lock. This also allows some + * optimizations to be applied without conflict with lockref. + */ +static __always_inline int queue_spin_value_unlocked(struct qspinlock lock) +{ + return !atomic_read(lock.val); +} + +/** + * queue_spin_is_contended - check if the lock is contended + * @lock : Pointer to queue spinlock structure + * Return: 1 if lock contended, 0 otherwise + */ +static __always_inline int queue_spin_is_contended(struct qspinlock *lock) +{ + return atomic_read(lock-val) ~_Q_LOCKED_MASK; +} +/** + * queue_spin_trylock - try to acquire the queue spinlock + * @lock : Pointer to queue spinlock structure + * Return: 1 if lock acquired, 0 if failed + */ +static __always_inline int queue_spin_trylock(struct qspinlock *lock) +{ + if (!atomic_read(lock-val) + (atomic_cmpxchg(lock-val, 0, _Q_LOCKED_VAL) == 0)) + return 1; + return 0; +} + +extern void queue_spin_lock_slowpath(struct qspinlock *lock, u32 val); + +/** + * queue_spin_lock - acquire a queue spinlock + * @lock: Pointer to queue spinlock structure + */ +static __always_inline void queue_spin_lock(struct qspinlock *lock) +{ + u32 val; + +
[PATCH v13 03/11] qspinlock: Add pending bit
From: Peter Zijlstra pet...@infradead.org Because the qspinlock needs to touch a second cacheline (the per-cpu mcs_nodes[]); add a pending bit and allow a single in-word spinner before we punt to the second cacheline. It is possible to observe the pending bit without the locked bit when the last owner has just released but the pending owner has not yet taken ownership. In this case we would normally queue -- because the pending bit is already taken. However, in this case the pending bit is guaranteed to be released 'soon', therefore wait for it and avoid queueing. Signed-off-by: Peter Zijlstra pet...@infradead.org Signed-off-by: Waiman Long waiman.l...@hp.com --- include/asm-generic/qspinlock_types.h | 12 +++- kernel/locking/qspinlock.c| 119 +++-- 2 files changed, 107 insertions(+), 24 deletions(-) diff --git a/include/asm-generic/qspinlock_types.h b/include/asm-generic/qspinlock_types.h index 67a2110..4196694 100644 --- a/include/asm-generic/qspinlock_types.h +++ b/include/asm-generic/qspinlock_types.h @@ -36,8 +36,9 @@ typedef struct qspinlock { * Bitfields in the atomic value: * * 0- 7: locked byte - * 8- 9: tail index - * 10-31: tail cpu (+1) + * 8: pending + * 9-10: tail index + * 11-31: tail cpu (+1) */ #define_Q_SET_MASK(type) (((1U _Q_ ## type ## _BITS) - 1)\ _Q_ ## type ## _OFFSET) @@ -45,7 +46,11 @@ typedef struct qspinlock { #define _Q_LOCKED_BITS 8 #define _Q_LOCKED_MASK _Q_SET_MASK(LOCKED) -#define _Q_TAIL_IDX_OFFSET (_Q_LOCKED_OFFSET + _Q_LOCKED_BITS) +#define _Q_PENDING_OFFSET (_Q_LOCKED_OFFSET + _Q_LOCKED_BITS) +#define _Q_PENDING_BITS1 +#define _Q_PENDING_MASK_Q_SET_MASK(PENDING) + +#define _Q_TAIL_IDX_OFFSET (_Q_PENDING_OFFSET + _Q_PENDING_BITS) #define _Q_TAIL_IDX_BITS 2 #define _Q_TAIL_IDX_MASK _Q_SET_MASK(TAIL_IDX) @@ -54,5 +59,6 @@ typedef struct qspinlock { #define _Q_TAIL_CPU_MASK _Q_SET_MASK(TAIL_CPU) #define _Q_LOCKED_VAL (1U _Q_LOCKED_OFFSET) +#define _Q_PENDING_VAL (1U _Q_PENDING_OFFSET) #endif /* __ASM_GENERIC_QSPINLOCK_TYPES_H */ diff 
--git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index c114076..226b11d 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -92,24 +92,28 @@ static inline struct mcs_spinlock *decode_tail(u32 tail) return per_cpu_ptr(mcs_nodes[idx], cpu); } +#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) + /** * queue_spin_lock_slowpath - acquire the queue spinlock * @lock: Pointer to queue spinlock structure * @val: Current value of the queue spinlock 32-bit word * - * (queue tail, lock value) - * - * fast :slow : unlock - *: : - * uncontended (0,0) --:-- (0,1) :-- (*,0) - *: | ^./ : - *: v \ | : - * uncontended:(n,x) --+-- (n,0) | : - * queue: | ^--' | : - *: v | : - * contended :(*,x) --+-- (*,0) - (*,1) ---' : - * queue: ^--' : + * (queue tail, pending bit, lock value) * + * fast :slow :unlock + * : : + * uncontended (0,0,0) -:-- (0,0,1) --:-- (*,*,0) + * : | ^.--. / : + * : v \ \| : + * pending :(0,1,1) +-- (0,1,0) \ | : + * : | ^--' | | : + * : v | | : + * uncontended :(n,x,y) +-- (n,0,0) --' | : + * queue : | ^--' | : + * : v | : + * contended :(*,x,y) +-- (*,0,0) --- (*,0,1) -' : + * queue : ^--' : */ void queue_spin_lock_slowpath(struct qspinlock *lock, u32 val) { @@ -119,6 +123,75 @@ void queue_spin_lock_slowpath(struct qspinlock *lock, u32 val) BUILD_BUG_ON(CONFIG_NR_CPUS = (1U _Q_TAIL_CPU_BITS)); + /* +* wait for in-progress pending-locked hand-overs +* +* 0,1,0 - 0,0,1 +*/ + if (val == _Q_PENDING_VAL) { + while ((val = atomic_read(lock-val)) == _Q_PENDING_VAL) +
[PATCH v13 02/11] qspinlock, x86: Enable x86-64 to use queue spinlock
This patch makes the necessary changes at the x86 architecture specific layer to enable the use of queue spinlock for x86-64. As x86-32 machines are typically not multi-socket. The benefit of queue spinlock may not be apparent. So queue spinlock is not enabled. Currently, there is some incompatibilities between the para-virtualized spinlock code (which hard-codes the use of ticket spinlock) and the queue spinlock. Therefore, the use of queue spinlock is disabled when the para-virtualized spinlock is enabled. The arch/x86/include/asm/qspinlock.h header file includes some x86 specific optimization which will make the queue spinlock code perform better than the generic implementation. Signed-off-by: Waiman Long waiman.l...@hp.com Signed-off-by: Peter Zijlstra pet...@infradead.org --- arch/x86/Kconfig |1 + arch/x86/include/asm/qspinlock.h | 25 + arch/x86/include/asm/spinlock.h |5 + arch/x86/include/asm/spinlock_types.h |4 4 files changed, 35 insertions(+), 0 deletions(-) create mode 100644 arch/x86/include/asm/qspinlock.h diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f2327e8..9ee8324 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -122,6 +122,7 @@ config X86 select MODULES_USE_ELF_RELA if X86_64 select CLONE_BACKWARDS if X86_32 select ARCH_USE_BUILTIN_BSWAP + select ARCH_USE_QUEUE_SPINLOCK select ARCH_USE_QUEUE_RWLOCK select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION select OLD_SIGACTION if X86_32 diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h new file mode 100644 index 000..a6a8762 --- /dev/null +++ b/arch/x86/include/asm/qspinlock.h @@ -0,0 +1,25 @@ +#ifndef _ASM_X86_QSPINLOCK_H +#define _ASM_X86_QSPINLOCK_H + +#include asm-generic/qspinlock_types.h + +#ifndef CONFIG_X86_PPRO_FENCE + +#definequeue_spin_unlock queue_spin_unlock +/** + * queue_spin_unlock - release a queue spinlock + * @lock : Pointer to queue spinlock structure + * + * An effective smp_store_release() on the least-significant byte. 
+ */ +static inline void queue_spin_unlock(struct qspinlock *lock) +{ + barrier(); + ACCESS_ONCE(*(u8 *)lock) = 0; +} + +#endif /* !CONFIG_X86_PPRO_FENCE */ + +#include asm-generic/qspinlock.h + +#endif /* _ASM_X86_QSPINLOCK_H */ diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 9295016..5899483 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -42,6 +42,10 @@ extern struct static_key paravirt_ticketlocks_enabled; static __always_inline bool static_key_false(struct static_key *key); +#ifdef CONFIG_QUEUE_SPINLOCK +#include asm/qspinlock.h +#else + #ifdef CONFIG_PARAVIRT_SPINLOCKS static inline void __ticket_enter_slowpath(arch_spinlock_t *lock) @@ -180,6 +184,7 @@ static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock, { arch_spin_lock(lock); } +#endif /* CONFIG_QUEUE_SPINLOCK */ static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) { diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h index 5f9d757..5d654a1 100644 --- a/arch/x86/include/asm/spinlock_types.h +++ b/arch/x86/include/asm/spinlock_types.h @@ -23,6 +23,9 @@ typedef u32 __ticketpair_t; #define TICKET_SHIFT (sizeof(__ticket_t) * 8) +#ifdef CONFIG_QUEUE_SPINLOCK +#include asm-generic/qspinlock_types.h +#else typedef struct arch_spinlock { union { __ticketpair_t head_tail; @@ -33,6 +36,7 @@ typedef struct arch_spinlock { } arch_spinlock_t; #define __ARCH_SPIN_LOCK_UNLOCKED { { 0 } } +#endif /* CONFIG_QUEUE_SPINLOCK */ #include asm-generic/qrwlock_types.h -- 1.7.1 -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support
On 10/29/2014 03:05 PM, Waiman Long wrote: On 10/27/2014 05:22 PM, Waiman Long wrote: On 10/27/2014 02:04 PM, Peter Zijlstra wrote: On Mon, Oct 27, 2014 at 01:38:20PM -0400, Waiman Long wrote: On 10/24/2014 04:54 AM, Peter Zijlstra wrote: On Thu, Oct 16, 2014 at 02:10:38PM -0400, Waiman Long wrote: Since enabling paravirt spinlock will disable unlock function inlining, a jump label can be added to the unlock function without adding patch sites all over the kernel. But you don't have to. My patches allowed for the inline to remain, again reducing the overhead of enabling PV spinlocks while running on a real machine. Look at: http://lkml.kernel.org/r/20140615130154.213923...@chello.nl In particular this hunk: Index: linux-2.6/arch/x86/kernel/paravirt_patch_64.c === --- linux-2.6.orig/arch/x86/kernel/paravirt_patch_64.c +++ linux-2.6/arch/x86/kernel/paravirt_patch_64.c @@ -22,6 +22,10 @@ DEF_NATIVE(pv_cpu_ops, swapgs, swapgs) DEF_NATIVE(, mov32, mov %edi, %eax); DEF_NATIVE(, mov64, mov %rdi, %rax); +#if defined(CONFIG_PARAVIRT_SPINLOCKS) defined(CONFIG_QUEUE_SPINLOCK) +DEF_NATIVE(pv_lock_ops, queue_unlock, movb $0, (%rdi)); +#endif + unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) { return paravirt_patch_insns(insnbuf, len, @@ -61,6 +65,9 @@ unsigned native_patch(u8 type, u16 clobb PATCH_SITE(pv_cpu_ops, clts); PATCH_SITE(pv_mmu_ops, flush_tlb_single); PATCH_SITE(pv_cpu_ops, wbinvd); +#if defined(CONFIG_PARAVIRT_SPINLOCKS) defined(CONFIG_QUEUE_SPINLOCK) + PATCH_SITE(pv_lock_ops, queue_unlock); +#endif patch_site: ret = paravirt_patch_insns(ibuf, len, start, end); That makes sure to overwrite the callee-saved call to the pv_lock_ops::queue_unlock with the immediate asm movb $0, (%rdi). Therefore you can retain the inlined unlock with hardly (there might be some NOP padding) any overhead at all. On PV it reverts to a callee saved function call. My concern is that spin_unlock() can be called in many places, including loadable kernel modules. 
Can the paravirt_patch_ident_32() function able to patch all of them in reasonable time? How about a kernel module loaded later at run time? modules should be fine, see arch/x86/kernel/module.c:module_finalize() - apply_paravirt(). Also note that the 'default' text is an indirect call into the paravirt ops table which routes to the 'right' function, so even if the text patching would be 'late' calls would 'work' as expected, just slower. Thanks for letting me know about that. I have this concern because your patch didn't change the current configuration of disabling unlock inlining when paravirt_spinlock is enabled. With that, I think it is worthwhile to reduce the performance delta between the PV and non-PV kernel on bare metal. I am sorry that the unlock call sites patching code doesn't work in a virtual guest. Your pvqspinlock patch did an unconditional patching even in a virtual guest. I added check for the paravirt_spinlocks_enabled, but it turned out that some spin_unlock() seemed to be called before paravirt_spinlocks_enabled is set. As a result, some call sites were still patched resulting in missed wake up's and system hang. At this point, I am going to leave out that change from my patch set until we can figure out a better way of doing that. Below was a partial kernel log with the unlock call site patch code in a KVM guest: [0.438006] native_patch: patch out pv_queue_unlock! [0.438565] native_patch: patch out pv_queue_unlock! [0.439006] native_patch: patch out pv_queue_unlock! [0.439638] native_patch: patch out pv_queue_unlock! [0.440052] native_patch: patch out pv_queue_unlock! [0.441006] native_patch: patch out pv_queue_unlock! [0.441566] native_patch: patch out pv_queue_unlock! [0.442035] ftrace: allocating 24168 entries in 95 pages [0.451208] Switched APIC routing to physical flat. 
[0.453202] ..TIMER: vector=0x30 apic1=0 pin1=2 apic2=-1 pin2=-1 [0.454002] smpboot: CPU0: Intel QEMU Virtual CPU version 1.5.3 (fam: 06, model: 06, stepping: 03) [0.456000] Performance Events: Broken PMU hardware detected, using software events only. [0.456003] Failed to access perfctr msr (MSR c1 is 0) [0.457151] KVM setup paravirtual spinlock [0.460039] NMI watchdog: disabled (cpu0): hardware events not enabled It could be seen that some unlock call sites were patched before the KVM setup code set the paravirt_spinlocks_enabled flag. -Longman -- To unsubscribe from this list: send the line unsubscribe kvm in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 3/4] vhost: byteswap virtqueue attributes
The virtqueue structure shares a few attributes with the guest OS which need to be byteswapped when the endian order of the host is different. This patch uses the vq-byteswap attribute to decide whether to byteswap or not data being accessed in the guest memory. Signed-off-by: Cédric Le Goater c...@fr.ibm.com --- drivers/vhost/vhost.c | 52 + 1 file changed, 40 insertions(+), 12 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index afcb3368370c..9a538f4d2ff1 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1097,7 +1097,7 @@ static void vring_desc_swap(struct vring_desc *desc) static int vhost_update_used_flags(struct vhost_virtqueue *vq) { void __user *used; - if (__put_user(vq-used_flags, vq-used-flags) 0) + if (__vq_put_user(vq, vq-used_flags, vq-used-flags) 0) return -EFAULT; if (unlikely(vq-log_used)) { /* Make sure the flag is seen before log. */ @@ -1115,7 +1115,7 @@ static int vhost_update_used_flags(struct vhost_virtqueue *vq) static int vhost_update_avail_event(struct vhost_virtqueue *vq, u16 avail_event) { - if (__put_user(vq-avail_idx, vhost_avail_event(vq))) + if (__vq_put_user(vq, vq-avail_idx, vhost_avail_event(vq))) return -EFAULT; if (unlikely(vq-log_used)) { void __user *used; @@ -1142,7 +1142,7 @@ int vhost_init_used(struct vhost_virtqueue *vq) if (r) return r; vq-signalled_used_valid = false; - return get_user(vq-last_used_idx, vq-used-idx); + return vq_get_user(vq, vq-last_used_idx, vq-used-idx); } EXPORT_SYMBOL_GPL(vhost_init_used); @@ -1254,6 +1254,10 @@ static int get_indirect(struct vhost_virtqueue *vq, i, (size_t)indirect-addr + i * sizeof desc); return -EINVAL; } + + if (vq-byteswap) + vring_desc_swap(desc); + if (unlikely(desc.flags VRING_DESC_F_INDIRECT)) { vq_err(vq, Nested indirect descriptor: idx %d, %zx\n, i, (size_t)indirect-addr + i * sizeof desc); @@ -1309,7 +1313,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, /* Check it isn't doing very strange things with descriptor numbers. 
*/ last_avail_idx = vq-last_avail_idx; - if (unlikely(__get_user(vq-avail_idx, vq-avail-idx))) { + if (unlikely(__vq_get_user(vq, vq-avail_idx, vq-avail-idx))) { vq_err(vq, Failed to access avail idx at %p\n, vq-avail-idx); return -EFAULT; @@ -1330,7 +1334,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, /* Grab the next descriptor number they're advertising, and increment * the index we've seen. */ - if (unlikely(__get_user(head, + if (unlikely(__vq_get_user(vq, head, vq-avail-ring[last_avail_idx % vq-num]))) { vq_err(vq, Failed to read head: idx %d address %p\n, last_avail_idx, @@ -1370,6 +1374,10 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq, i, vq-desc + i); return -EFAULT; } + + if (vq-byteswap) + vring_desc_swap(desc); + if (desc.flags VRING_DESC_F_INDIRECT) { ret = get_indirect(vq, iov, iov_size, out_num, in_num, @@ -1437,6 +1445,26 @@ int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len) } EXPORT_SYMBOL_GPL(vhost_add_used); +static int __copyhead_to_user(struct vhost_virtqueue *vq, + struct vring_used_elem *heads, + struct vring_used_elem __user *used, + unsigned count) +{ + int i; + + for (i = 0; i count; i++) { + if (__vq_put_user(vq, heads[i].id, used[i].id)) { + vq_err(vq, Failed to write used id); + return -EFAULT; + } + if (__vq_put_user(vq, heads[i].len, used[i].len)) { + vq_err(vq, Failed to write used len); + return -EFAULT; + } + } + return 0; +} + static int __vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads, unsigned count) @@ -1448,15 +1476,15 @@ static int __vhost_add_used_n(struct vhost_virtqueue *vq, start = vq-last_used_idx % vq-num; used = vq-used-ring + start; if (count == 1) { - if (__put_user(heads[0].id, used-id)) { + if (__vq_put_user(vq, heads[0].id, used-id)) { vq_err(vq, Failed to write used id);
[RFC PATCH 4/4] vhost_net: byteswap virtio_net header
Signed-off-by: Cédric Le Goater c...@fr.ibm.com --- drivers/vhost/net.c | 39 ++- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 8dae2f724a35..f2d5f585dae9 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -18,6 +18,7 @@ #include linux/file.h #include linux/slab.h #include linux/vmalloc.h +#include linux/swab.h #include linux/net.h #include linux/if_packet.h @@ -329,6 +330,14 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success) rcu_read_unlock_bh(); } +static void virtio_net_hdr_swap(struct virtio_net_hdr *hdr) +{ + hdr-hdr_len = swab16(hdr-hdr_len); + hdr-gso_size= swab16(hdr-gso_size); + hdr-csum_start = swab16(hdr-csum_start); + hdr-csum_offset = swab16(hdr-csum_offset); +} + /* Expects to be always run from workqueue - which acts as * read-size critical section for our kind of RCU. */ static void handle_tx(struct vhost_net *net) @@ -346,7 +355,7 @@ static void handle_tx(struct vhost_net *net) .msg_flags = MSG_DONTWAIT, }; size_t len, total_len = 0; - int err; + int err, has_vnet_hdr; size_t hdr_size; struct socket *sock; struct vhost_net_ubuf_ref *uninitialized_var(ubufs); @@ -359,6 +368,7 @@ static void handle_tx(struct vhost_net *net) vhost_disable_notify(net-dev, vq); + has_vnet_hdr = vhost_has_feature(vq, VHOST_NET_F_VIRTIO_NET_HDR); hdr_size = nvq-vhost_hlen; zcopy = nvq-ubufs; @@ -406,6 +416,8 @@ static void handle_tx(struct vhost_net *net) break; } + if (!has_vnet_hdr vq-byteswap) + virtio_net_hdr_swap((void *) vq-iov[0].iov_base); zcopy_used = zcopy len = VHOST_GOODCOPY_LEN (nvq-upend_idx + 1) % UIO_MAXIOV != nvq-done_idx @@ -570,7 +582,7 @@ static void handle_rx(struct vhost_net *net) .hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE }; size_t total_len = 0; - int err, mergeable; + int err, mergeable, has_vnet_hdr; s16 headcount; size_t vhost_hlen, sock_hlen; size_t vhost_len, sock_len; @@ -588,6 +600,7 @@ static void handle_rx(struct vhost_net *net) vq_log = 
unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ? vq-log : NULL; mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF); + has_vnet_hdr = vhost_has_feature(vq, VHOST_NET_F_VIRTIO_NET_HDR); while ((sock_len = peek_head_len(sock-sk))) { sock_len += sock_hlen; @@ -638,6 +651,9 @@ static void handle_rx(struct vhost_net *net) vhost_discard_vq_desc(vq, headcount); continue; } + + if (!has_vnet_hdr vq-byteswap) + virtio_net_hdr_swap((void *) vq-iov[0].iov_base); if (unlikely(vhost_hlen) memcpy_toiovecend(nvq-hdr, (unsigned char *)hdr, 0, vhost_hlen)) { @@ -646,13 +662,18 @@ static void handle_rx(struct vhost_net *net) break; } /* TODO: Should check and handle checksum. */ - if (likely(mergeable) - memcpy_toiovecend(nvq-hdr, (unsigned char *)headcount, - offsetof(typeof(hdr), num_buffers), - sizeof hdr.num_buffers)) { - vq_err(vq, Failed num_buffers write); - vhost_discard_vq_desc(vq, headcount); - break; + if (likely(mergeable)) { + __u16 tmp = headcount; + + if (vq-byteswap) + tmp = swab16(headcount); + if (memcpy_toiovecend(nvq-hdr, (unsigned char *)tmp, + offsetof(typeof(hdr), num_buffers), + sizeof(hdr.num_buffers))) { + vq_err(vq, Failed num_buffers write); + vhost_discard_vq_desc(vq, headcount); + break; + } } vhost_add_used_and_signal_n(net-dev, vq, vq-heads, headcount); -- 1.7.10.4 -- To unsubscribe from this list: send the line unsubscribe kvm-ppc in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH 2/4] vhost: add byteswap routines
This patch adds a few helper routines around get_user and put_user to ease byteswapping. Signed-off-by: Cédric Le Goater c...@fr.ibm.com --- I am not sure these routines belong to this file. There is room for improvement to remove the ugly switch (sizeof(*(ptr))). Please comment ! drivers/vhost/vhost.c | 93 + 1 file changed, 93 insertions(+) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 72c21b790ba3..afcb3368370c 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -25,6 +25,7 @@ #include linux/kthread.h #include linux/cgroup.h #include linux/module.h +#include linux/swab.h #include vhost.h @@ -1001,6 +1002,98 @@ int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log, } EXPORT_SYMBOL_GPL(vhost_log_write); +#define vq_get_user(vq, x, ptr)\ +({ \ + int ret = get_user(x, ptr); \ + if (vq-byteswap) { \ + switch (sizeof(*(ptr))) { \ + case 2: \ + x = swab16(x); \ + break; \ + case 4: \ + x = swab32(x); \ + break; \ + case 8: \ + x = swab64(x); \ + break; \ + default:\ + break; \ + } \ + } \ + ret;\ +}) + +#define vq_put_user(vq, x, ptr)\ +({ \ + __typeof__(*(ptr)) y = (x); \ + if (vq-byteswap) { \ + switch (sizeof(*(ptr))) { \ + case 2: \ + y = swab16(x); \ + break; \ + case 4: \ + y = swab32(x); \ + break; \ + case 8: \ + y = swab64(x); \ + break; \ + default:\ + break; \ + } \ + } \ + put_user(y, ptr); \ +}) + +#define __vq_get_user(vq, x, ptr) \ +({ \ + int ret = __get_user(x, ptr); \ + if (vq-byteswap) { \ + switch (sizeof(*(ptr))) { \ + case 2: \ + x = swab16(x); \ + break; \ + case 4: \ + x = swab32(x); \ + break; \ + case 8: \ + x = swab64(x); \ + break; \ + default:\ + break; \ + } \ + } \ + ret;\ +}) + +#define __vq_put_user(vq, x, ptr) \ +({ \ + __typeof__(*(ptr)) y = (x); \ + if (vq-byteswap) { \ + switch (sizeof(*(ptr))) { \ + case 2: