[RFC PATCH 4/4] vhost_net: byteswap virtio_net header

2014-10-29 Thread Cédric Le Goater
Signed-off-by: Cédric Le Goater c...@fr.ibm.com
---
 drivers/vhost/net.c |   39 ++-
 1 file changed, 30 insertions(+), 9 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 8dae2f724a35..f2d5f585dae9 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -18,6 +18,7 @@
 #include linux/file.h
 #include linux/slab.h
 #include linux/vmalloc.h
+#include linux/swab.h
 
 #include linux/net.h
 #include linux/if_packet.h
@@ -329,6 +330,14 @@ static void vhost_zerocopy_callback(struct ubuf_info 
*ubuf, bool success)
rcu_read_unlock_bh();
 }
 
+static void virtio_net_hdr_swap(struct virtio_net_hdr *hdr)
+{
+   hdr-hdr_len = swab16(hdr-hdr_len);
+   hdr-gso_size= swab16(hdr-gso_size);
+   hdr-csum_start  = swab16(hdr-csum_start);
+   hdr-csum_offset = swab16(hdr-csum_offset);
+}
+
 /* Expects to be always run from workqueue - which acts as
  * read-size critical section for our kind of RCU. */
 static void handle_tx(struct vhost_net *net)
@@ -346,7 +355,7 @@ static void handle_tx(struct vhost_net *net)
.msg_flags = MSG_DONTWAIT,
};
size_t len, total_len = 0;
-   int err;
+   int err, has_vnet_hdr;
size_t hdr_size;
struct socket *sock;
struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
@@ -359,6 +368,7 @@ static void handle_tx(struct vhost_net *net)
 
vhost_disable_notify(net-dev, vq);
 
+   has_vnet_hdr = vhost_has_feature(vq, VHOST_NET_F_VIRTIO_NET_HDR);
hdr_size = nvq-vhost_hlen;
zcopy = nvq-ubufs;
 
@@ -406,6 +416,8 @@ static void handle_tx(struct vhost_net *net)
break;
}
 
+   if (!has_vnet_hdr  vq-byteswap)
+   virtio_net_hdr_swap((void *) vq-iov[0].iov_base);
zcopy_used = zcopy  len = VHOST_GOODCOPY_LEN
(nvq-upend_idx + 1) % UIO_MAXIOV !=
  nvq-done_idx
@@ -570,7 +582,7 @@ static void handle_rx(struct vhost_net *net)
.hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE
};
size_t total_len = 0;
-   int err, mergeable;
+   int err, mergeable, has_vnet_hdr;
s16 headcount;
size_t vhost_hlen, sock_hlen;
size_t vhost_len, sock_len;
@@ -588,6 +600,7 @@ static void handle_rx(struct vhost_net *net)
vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ?
vq-log : NULL;
mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);
+   has_vnet_hdr = vhost_has_feature(vq, VHOST_NET_F_VIRTIO_NET_HDR);
 
while ((sock_len = peek_head_len(sock-sk))) {
sock_len += sock_hlen;
@@ -638,6 +651,9 @@ static void handle_rx(struct vhost_net *net)
vhost_discard_vq_desc(vq, headcount);
continue;
}
+
+   if (!has_vnet_hdr  vq-byteswap)
+   virtio_net_hdr_swap((void *) vq-iov[0].iov_base);
if (unlikely(vhost_hlen) 
memcpy_toiovecend(nvq-hdr, (unsigned char *)hdr, 0,
  vhost_hlen)) {
@@ -646,13 +662,18 @@ static void handle_rx(struct vhost_net *net)
break;
}
/* TODO: Should check and handle checksum. */
-   if (likely(mergeable) 
-   memcpy_toiovecend(nvq-hdr, (unsigned char *)headcount,
- offsetof(typeof(hdr), num_buffers),
- sizeof hdr.num_buffers)) {
-   vq_err(vq, Failed num_buffers write);
-   vhost_discard_vq_desc(vq, headcount);
-   break;
+   if (likely(mergeable)) {
+   __u16 tmp = headcount;
+
+   if (vq-byteswap)
+   tmp = swab16(headcount);
+   if (memcpy_toiovecend(nvq-hdr, (unsigned char *)tmp,
+   offsetof(typeof(hdr), num_buffers),
+ sizeof(hdr.num_buffers))) {
+   vq_err(vq, Failed num_buffers write);
+   vhost_discard_vq_desc(vq, headcount);
+   break;
+   }
}
vhost_add_used_and_signal_n(net-dev, vq, vq-heads,
headcount);
-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC PATCH 1/4] vhost: add VHOST_VRING_F_BYTESWAP flag

2014-10-29 Thread Cédric Le Goater
The VHOST_VRING_F_BYTESWAP flag will be used by the host to byteswap
the vring data when the guest and the host have a different endian
order.

Signed-off-by: Cédric Le Goater c...@fr.ibm.com
---
 drivers/vhost/vhost.c  |5 -
 drivers/vhost/vhost.h  |1 +
 include/uapi/linux/vhost.h |3 +++
 3 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index c90f4374442a..72c21b790ba3 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -199,6 +199,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
vq-call = NULL;
vq-log_ctx = NULL;
vq-memory = NULL;
+   vq-byteswap = 0;
 }
 
 static int vhost_worker(void *data)
@@ -701,7 +702,8 @@ long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void 
__user *argp)
r = -EFAULT;
break;
}
-   if (a.flags  ~(0x1  VHOST_VRING_F_LOG)) {
+   if (a.flags  ~(0x1  VHOST_VRING_F_LOG |
+   0x1  VHOST_VRING_F_BYTESWAP)) {
r = -EOPNOTSUPP;
break;
}
@@ -747,6 +749,7 @@ long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void 
__user *argp)
vq-avail = (void __user *)(unsigned long)a.avail_user_addr;
vq-log_addr = a.log_guest_addr;
vq-used = (void __user *)(unsigned long)a.used_user_addr;
+   vq-byteswap = !!(a.flags  (0x1  VHOST_VRING_F_BYTESWAP));
break;
case VHOST_SET_VRING_KICK:
if (copy_from_user(f, argp, sizeof f)) {
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 3eda654b8f5a..ab25b7d0720d 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -110,6 +110,7 @@ struct vhost_virtqueue {
/* Log write descriptors */
void __user *log_base;
struct vhost_log *log;
+   bool byteswap;
 };
 
 struct vhost_dev {
diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
index bb6a5b4cb3c5..6a8c2b325c44 100644
--- a/include/uapi/linux/vhost.h
+++ b/include/uapi/linux/vhost.h
@@ -34,6 +34,9 @@ struct vhost_vring_addr {
/* Flag values: */
/* Whether log address is valid. If set enables logging. */
 #define VHOST_VRING_F_LOG 0
+   /* Whether vring memory accesses should be byte-swapped.
+* required when the guest has a different endianness */
+#define VHOST_VRING_F_BYTESWAP 1
 
/* Start of array of descriptors (virtually contiguous) */
__u64 desc_user_addr;
-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC PATCH 3/4] vhost: byteswap virtqueue attributes

2014-10-29 Thread Cédric Le Goater
The virtqueue structure shares a few attributes with the guest OS
which need to be byteswapped when the endian order of the host is
different.

This patch uses the vq-byteswap attribute to decide whether to
byteswap or not data being accessed in the guest memory.

Signed-off-by: Cédric Le Goater c...@fr.ibm.com
---
 drivers/vhost/vhost.c |   52 +
 1 file changed, 40 insertions(+), 12 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index afcb3368370c..9a538f4d2ff1 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1097,7 +1097,7 @@ static void vring_desc_swap(struct vring_desc *desc)
 static int vhost_update_used_flags(struct vhost_virtqueue *vq)
 {
void __user *used;
-   if (__put_user(vq-used_flags, vq-used-flags)  0)
+   if (__vq_put_user(vq, vq-used_flags, vq-used-flags)  0)
return -EFAULT;
if (unlikely(vq-log_used)) {
/* Make sure the flag is seen before log. */
@@ -1115,7 +1115,7 @@ static int vhost_update_used_flags(struct vhost_virtqueue 
*vq)
 
 static int vhost_update_avail_event(struct vhost_virtqueue *vq, u16 
avail_event)
 {
-   if (__put_user(vq-avail_idx, vhost_avail_event(vq)))
+   if (__vq_put_user(vq, vq-avail_idx, vhost_avail_event(vq)))
return -EFAULT;
if (unlikely(vq-log_used)) {
void __user *used;
@@ -1142,7 +1142,7 @@ int vhost_init_used(struct vhost_virtqueue *vq)
if (r)
return r;
vq-signalled_used_valid = false;
-   return get_user(vq-last_used_idx, vq-used-idx);
+   return vq_get_user(vq, vq-last_used_idx, vq-used-idx);
 }
 EXPORT_SYMBOL_GPL(vhost_init_used);
 
@@ -1254,6 +1254,10 @@ static int get_indirect(struct vhost_virtqueue *vq,
   i, (size_t)indirect-addr + i * sizeof desc);
return -EINVAL;
}
+
+   if (vq-byteswap)
+   vring_desc_swap(desc);
+
if (unlikely(desc.flags  VRING_DESC_F_INDIRECT)) {
vq_err(vq, Nested indirect descriptor: idx %d, %zx\n,
   i, (size_t)indirect-addr + i * sizeof desc);
@@ -1309,7 +1313,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 
/* Check it isn't doing very strange things with descriptor numbers. */
last_avail_idx = vq-last_avail_idx;
-   if (unlikely(__get_user(vq-avail_idx, vq-avail-idx))) {
+   if (unlikely(__vq_get_user(vq, vq-avail_idx, vq-avail-idx))) {
vq_err(vq, Failed to access avail idx at %p\n,
   vq-avail-idx);
return -EFAULT;
@@ -1330,7 +1334,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 
/* Grab the next descriptor number they're advertising, and increment
 * the index we've seen. */
-   if (unlikely(__get_user(head,
+   if (unlikely(__vq_get_user(vq, head,
vq-avail-ring[last_avail_idx % vq-num]))) {
vq_err(vq, Failed to read head: idx %d address %p\n,
   last_avail_idx,
@@ -1370,6 +1374,10 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
   i, vq-desc + i);
return -EFAULT;
}
+
+   if (vq-byteswap)
+   vring_desc_swap(desc);
+
if (desc.flags  VRING_DESC_F_INDIRECT) {
ret = get_indirect(vq, iov, iov_size,
   out_num, in_num,
@@ -1437,6 +1445,26 @@ int vhost_add_used(struct vhost_virtqueue *vq, unsigned 
int head, int len)
 }
 EXPORT_SYMBOL_GPL(vhost_add_used);
 
+static int __copyhead_to_user(struct vhost_virtqueue *vq,
+   struct vring_used_elem *heads,
+   struct vring_used_elem __user *used,
+   unsigned count)
+{
+   int i;
+
+   for (i = 0; i  count; i++) {
+   if (__vq_put_user(vq, heads[i].id, used[i].id)) {
+   vq_err(vq, Failed to write used id);
+   return -EFAULT;
+   }
+   if (__vq_put_user(vq, heads[i].len, used[i].len)) {
+   vq_err(vq, Failed to write used len);
+   return -EFAULT;
+   }
+   }
+   return 0;
+}
+
 static int __vhost_add_used_n(struct vhost_virtqueue *vq,
struct vring_used_elem *heads,
unsigned count)
@@ -1448,15 +1476,15 @@ static int __vhost_add_used_n(struct vhost_virtqueue 
*vq,
start = vq-last_used_idx % vq-num;
used = vq-used-ring + start;
if (count == 1) {
-   if (__put_user(heads[0].id, used-id)) {
+   if (__vq_put_user(vq, heads[0].id, used-id)) {
vq_err(vq, Failed to write used id);

[RFC PATCH 0/4] vhost_net: support for cross endian guests

2014-10-29 Thread Cédric Le Goater
This patchset adds a VHOST_VRING_F_BYTESWAP flag to inform the host 
to byteswap data of the vring when the guest and the host have a 
different endian order. The flag is stored at initialization in an 
attribute of the virtio queues. It is then used to byteswap, or not, 
the vring indexes and descriptors shared with the guest OS.

The last patch adds the byteswapping of the virtio_net header as it 
is done in qemu.

The patches apply on linux-3.18-rc2 and the tests were done on PowerPC 
using the following hosts :
 
 fedora21/ppc64, utopic/ppc64le  

with various guests : 

 trusty/ppc64le, utopic/ppc64le, debian/ppc64le, 
 rhel6.5/ppc64, fedora21/ppc64, debian/ppc64

Regressions tests for x86_64 were done a debian host using rhel6.6, 
fedora20 and debian guests.


Cédric Le Goater (4):
  vhost: add VHOST_VRING_F_BYTESWAP flag
  vhost: add byteswap routines
  vhost: byteswap virtqueue attributes
  vhost_net: byteswap virtio_net header

 drivers/vhost/net.c|   39 +---
 drivers/vhost/vhost.c  |  150 
 drivers/vhost/vhost.h  |1 +
 include/uapi/linux/vhost.h |3 +
 4 files changed, 171 insertions(+), 22 deletions(-)

-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC PATCH 2/4] vhost: add byteswap routines

2014-10-29 Thread Cédric Le Goater
This patch adds a few helper routines around get_user and put_user
to ease byteswapping.

Signed-off-by: Cédric Le Goater c...@fr.ibm.com
---

I am not sure these routines belong to this file. There is room for 
improvement to remove the ugly  switch (sizeof(*(ptr))). Please 
comment !

 drivers/vhost/vhost.c |   93 +
 1 file changed, 93 insertions(+)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 72c21b790ba3..afcb3368370c 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -25,6 +25,7 @@
 #include linux/kthread.h
 #include linux/cgroup.h
 #include linux/module.h
+#include linux/swab.h
 
 #include vhost.h
 
@@ -1001,6 +1002,98 @@ int vhost_log_write(struct vhost_virtqueue *vq, struct 
vhost_log *log,
 }
 EXPORT_SYMBOL_GPL(vhost_log_write);
 
+#define vq_get_user(vq, x, ptr)\
+({ \
+   int ret = get_user(x, ptr); \
+   if (vq-byteswap) { \
+   switch (sizeof(*(ptr))) {   \
+   case 2: \
+   x = swab16(x);  \
+   break;  \
+   case 4: \
+   x = swab32(x);  \
+   break;  \
+   case 8: \
+   x = swab64(x);  \
+   break;  \
+   default:\
+   break;  \
+   }   \
+   }   \
+   ret;\
+})
+
+#define vq_put_user(vq, x, ptr)\
+({ \
+   __typeof__(*(ptr)) y = (x); \
+   if (vq-byteswap) { \
+   switch (sizeof(*(ptr))) {   \
+   case 2: \
+   y = swab16(x);  \
+   break;  \
+   case 4: \
+   y = swab32(x);  \
+   break;  \
+   case 8: \
+   y = swab64(x);  \
+   break;  \
+   default:\
+   break;  \
+   }   \
+   }   \
+   put_user(y, ptr);   \
+})
+
+#define __vq_get_user(vq, x, ptr)  \
+({ \
+   int ret = __get_user(x, ptr);   \
+   if (vq-byteswap) { \
+   switch (sizeof(*(ptr))) {   \
+   case 2: \
+   x = swab16(x);  \
+   break;  \
+   case 4: \
+   x = swab32(x);  \
+   break;  \
+   case 8: \
+   x = swab64(x);  \
+   break;  \
+   default:\
+   break;  \
+   }   \
+   }   \
+   ret;\
+})
+
+#define __vq_put_user(vq, x, ptr)  \
+({ \
+   __typeof__(*(ptr)) y = (x); \
+   if (vq-byteswap) { \
+   switch (sizeof(*(ptr))) {   \
+   case 2:

Re: [PATCH 3.10] vhost-net: backport extend device allocation to 3.10

2014-10-29 Thread Michael S. Tsirkin
On Tue, Oct 28, 2014 at 07:38:30PM +, Eddie Chapman wrote:
 On 12/10/14 10:30, Michael S. Tsirkin wrote:
  On Thu, Oct 09, 2014 at 08:41:23AM +0400, Dmitry Petuhov wrote:
  From: Michael S. Tsirkin m...@redhat.com
 
  upstream commit 23cc5a991c7a9fb7e6d6550e65cee4f4173111c5
 
  Michael Mueller provided a patch to reduce the size of
  vhost-net structure as some allocations could fail under
  memory pressure/fragmentation. We are still left with
  high order allocations though.
 
  This patch is handling the problem at the core level, allowing
  vhost structures to use vmalloc() if kmalloc() failed.
 
  As vmalloc() adds overhead on a critical network path, add __GFP_REPEAT
  to kzalloc() flags to do this fallback only when really needed.
 
  People are still looking at cleaner ways to handle the problem
  at the API level, probably passing in multiple iovecs.
  This hack seems consistent with approaches
  taken since then by drivers/vhost/scsi.c and net/core/dev.c
 
  Based on patch by Romain Francoise.
 
  Cc: Michael Mueller m...@linux.vnet.ibm.com
  Signed-off-by: Romain Francoise rom...@orebokech.com
  Acked-by: Michael S. Tsirkin m...@redhat.com
  [mityapetuhov: backport to v3.10: vhost_net_free() in one more place]
  Signed-off-by: Dmitry Petuhov mityapetu...@gmail.com
  
  Sounds reasonable.
  
  Acked-by: Michael S. Tsirkin m...@redhat.com
  
  ---
  diff -uprN a/drivers/vhost/net.c b/drivers/vhost/net.c
  --- a/drivers/vhost/net.c  2014-10-09 06:45:08.336283258 +0400
  +++ b/drivers/vhost/net.c  2014-10-09 06:51:21.796266607 +0400
  @@ -18,6 +18,7 @@
   #include linux/rcupdate.h
   #include linux/file.h
   #include linux/slab.h
  +#include linux/vmalloc.h
   
   #include linux/net.h
   #include linux/if_packet.h
  @@ -707,18 +708,30 @@ static void handle_rx_net(struct vhost_w
 handle_rx(net);
   }
   
  +static void vhost_net_free(void *addr)
  +{
  +  if (is_vmalloc_addr(addr))
  +  vfree(addr);
  +  else
  +  kfree(addr);
  +}
  +
   static int vhost_net_open(struct inode *inode, struct file *f)
   {
  -  struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL);
  +  struct vhost_net *n;
 struct vhost_dev *dev;
 struct vhost_virtqueue **vqs;
 int r, i;
   
  -  if (!n)
  -  return -ENOMEM;
  +  n = kmalloc(sizeof *n, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
  +  if (!n) {
  +  n = vmalloc(sizeof *n);
  +  if (!n)
  +  return -ENOMEM;
  +  }
 vqs = kmalloc(VHOST_NET_VQ_MAX * sizeof(*vqs), GFP_KERNEL);
 if (!vqs) {
  -  kfree(n);
  +  vhost_net_free(n);
 return -ENOMEM;
 }
   
  @@ -737,7 +750,7 @@ static int vhost_net_open(struct inode *
 }
 r = vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX);
 if (r  0) {
  -  kfree(n);
  +  vhost_net_free(n);
 kfree(vqs);
 return r;
 }
  @@ -840,7 +853,7 @@ static int vhost_net_release(struct inod
  * since jobs can re-queue themselves. */
 vhost_net_flush(n);
 kfree(n-dev.vqs);
  -  kfree(n);
  +  vhost_net_free(n);
 return 0;
   }
   
 
 
 Hi Michael, Dmitry,
 
 I needed to apply this to a server using vanilla 3.14, so I reformatted
 Dmitry's 3.10 patch to make it apply cleanly to 3.14.23-rc1. It built
 fine without warnings, and the kernel has been running now all day with
 the patch applied without any problems, no errors in dmesg, module
 loaded fine, and qemu VMs running without any network problems, with
 vhost-net enabled.
 
 Since I don't really know what I'm doing (I just fixed up white space to
 make it apply, and removed the extra vhost_net_free() added by Dmitry
 for the 3.10 version as there was no such bit of code in 3.14) I just
 wanted to paste what I came up with below just to ask for your eyeballs.
 If you think I haven't missed anything obvious that is needed extra in
 3.14, then I'll send it as a properly formatted patch for 3.14 according
 to submission rules, since it appears to be working (unless Dmitry
 prefers to submit it).
 
 thanks,
 Eddie

Looks reasonable.

Acked-by: Michael S. Tsirkin m...@redhat.com

 --- a/drivers/vhost/net.c   2014-03-31 04:40:15.0 +0100
 +++ b/drivers/vhost/net.c   2014-10-28 19:05:21.242141453 +
 @@ -17,6 +17,7 @@
  #include linux/workqueue.h
  #include linux/file.h
  #include linux/slab.h
 +#include linux/vmalloc.h
 
  #include linux/net.h
  #include linux/if_packet.h
 @@ -699,18 +700,31 @@
 handle_rx(net);
  }
 
 +static void vhost_net_free(void *addr)
 +{
 + if (is_vmalloc_addr(addr))
 + vfree(addr);
 + else
 + kfree(addr);
 +}
 +
  static int vhost_net_open(struct inode *inode, struct file *f)
  {
 - struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL);
 + struct vhost_net *n;
   struct vhost_dev *dev;
   struct vhost_virtqueue **vqs;
   int i;
 
 - if (!n)
 - return -ENOMEM;
 + n = kmalloc(sizeof *n, GFP_KERNEL | __GFP_NOWARN 

Re: [Xen-devel] [RFC] Hypervisor RNG and enumeration

2014-10-29 Thread Andrew Cooper
On 29/10/14 05:19, Andy Lutomirski wrote:
 Here's a draft CommonHV spec.  It's also on github:

 https://github.com/amluto/CommonHV

 So far, this provides a two-way RNG interface, a way to detect it, and
 a way to detect other hypervisor leaves.  The latter is because, after
 both the enormous public thread and some private discussions, it seems
 that detection of existing CPUID paravirt leaves is annoying and
 inefficient.  If we're going to define some cross-vendor CPUID leaves,
 it seems like it would be useful to offer a way to quickly enumerate
 other leaves.

 I've been told the AMD intends to update their manual to match Intel's
 so that hypervisors can use the entire 0x4F?? CPUID range.  I have
 intentionally not fixed an MSR value for the RNG because the range of
 allowed MSRs is very small in both the Intel and AMD manuals.  If any
 given hypervisor wants to ignore that small range and advertise a
 higher-numbered MSR, it is welcome to, but I don't want to codify
 something that doesn't comply with the manuals.

 Here's the draft.  Comments?  To the people who work on various
 hypervisors: Would you implement this?  Do you like it?  Is there
 anything, major or minor, that you'd like to see changed?  Do you
 think that this is a good idea at all?

As a first pass, it looks like a plausible idea.  I do however have come
comments.


 I've tried to get good coverage of various hypervisors.  There are
 Hyper-V, VMWare, KVM, and Xen people on the cc list.

 Thanks,
 Andy



 CommonHV, a common hypervisor interface
 ===

 This is CommonHV draft 1.

 The CommonHV specification is Copyright (c) 2014 Andrew Lutomirski.

 Licensing will be determined soon.  The license is expected to be extremely
 liberal.  I am currently leaning towards CC-BY-SA for the specification and
 an explicit license permitting anyone to implement the specification
 with no restrictions whatsoever.

 I have not patented, nor do I intend to patent, anything required to implement
 this specification.  I am not aware of any current or future intellectual
 property rights that would prevent a royalty-free implementation of
 this specification.

 I would like to find a stable, neutral steward of this specification
 going forward.  Help with this would be much appreciated.

 Scope
 -

 CommonHV is a simple interface for communication
 between hypervisors and their guests.

 CommonHV is intended to be very simple and to avoid interfering with
 existing paravirtual interfaces.  To that end, its scope is limited.
 CommonHV does only two types of things:

   * It provides a way to enumerate other paravirtual interfaces.
   * It provides a small, extensible set of paravirtual features that do not
 modify or replace standard system functionality.

 For example, CommonHV does not and will not define anything related to
 interrupt handling or virtual CPU management.

 For now, CommonHV is only applicable to the x86 platform.

 Discovery
 -

 A CommonHV hypervisor MUST set the hypervisor bit (bit 31 in CPUID.1H.0H.ECX)
 and provide the CPUID leaf 4F00H, containing:

   * CPUID.4F00H.0H.EAX = max_commonhv_leaf
   * CPUID.4F00H.0H.EBX = 0x6D6D6F43
   * CPUID.4F00H.0H.ECX = 0x56486E6F
   * CPUID.4F00H.0H.EDX = 0x66746e49

 EBX, ECX, and EDX form the string CommonHVIntf in little-endian ASCII.

While testing various nested combinations, XenServer has found that
modern Windows Server versions must have the hypervisor bit hidden from
them for them to be happy running HyperV, despite the fact that they
will make use of the Viridian virtual extensions also provided.

As a result, while it is certainly advisable for the hypervisor bit to
be set, CommonHV should be available to be found by paravirtualised
drivers inside an OS which can't cope with the hypervisor bit set.


 max_commonhv_leaf MUST be a number between 0x4F00 and 0x4FFF.  It
 indicates the largest leaf defined in this specification that is provided.
 Any leaves described in this specification with EAX values that exceed
 max_commonhv_leaf MUST be handled by guests as though they contain
 all zeros.

 CPUID leaf 4F01H: hypervisor interface enumeration
 --

 If max_commonhv_leaf = 0x4F01, CommonHV provides a list of tuples
 (location, signature).  Each tuple indicates the presence of another
 paravirtual interface identified by the signature at the indicated
 CPUID location.  It is expected that CPUID.location.0H will have
 (EBX, ECX, EDX) == signature, although whether this is required
 is left to the specification associated with the given signature.

 If the list contains N tuples, then, for each 0 = i  N:

   * CPUID.4F01H.i.EBX, CPUID.4F01H.i.ECX, and CPUID.4F01H.i.EDX
 are the signature.
   * CPUID.4F01H.i.EAX is the location.

 CPUID with EAX = 0x4F01 and ECX = N MUST return all zeros.

 To the extent that the 

Re: [GIT PULL 0/9] KVM: s390: Fixes and cleanups for kvm/next (3.19)

2014-10-29 Thread Paolo Bonzini
On 10/28/2014 08:39 PM, Christian Borntraeger wrote:
 Paolo,
 
 the first bunch on s390 change for next:
 
 The following changes since commit cac7f2429872d3733dc3f9915857b1691da2eb2f:
 
   Linux 3.18-rc2 (2014-10-26 16:48:41 -0700)
 
 are available in the git repository at:
 
   git://git.kernel.org/pub/scm/linux/kernel/git/kvms390/linux.git  
 tags/kvm-s390-next-20141028
 
 for you to fetch changes up to a6cc3108567e0adc06c4a8031186f84ad1e1e194:
 
   KVM: s390: sigp: split handling of SIGP STOP (AND STORE STATUS) (2014-10-28 
 13:09:14 +0100)
 
 
 KVM: s390: Fixes and cleanups
 
 1. A small fix regarding program check handling (cc stable as it
overwrites the wrong guest memory)
 2. Improve the ipte interlock scalability for older hardware
 3. current-mm to mm cleanup (currently a no-op)
 4. several SIGP rework patches (more to come)
 
 
 David Hildenbrand (6):
   KVM: s390: sigp: dispatch orders with one target in a separate function
   KVM: s390: sigp: move target cpu checks into dispatcher
   KVM: s390: sigp: separate preparation handlers
   KVM: s390: sigp: instruction counters for all sigp orders
   KVM: s390: sigp: inject emergency calls in a separate function
   KVM: s390: sigp: split handling of SIGP STOP (AND STORE STATUS)
 
 Jason J. Herne (1):
   KVM: s390: Cleanup usage of current-mm in set_guest_storage_key
 
 Thomas Huth (2):
   KVM: s390: Make the simple ipte mutex specific to a VM instead of global
   KVM: s390: Fix size of monitor-class number field
 
  arch/s390/include/asm/kvm_host.h |   9 ++
  arch/s390/include/asm/sigp.h |   1 +
  arch/s390/kvm/gaccess.c  |  20 ++-
  arch/s390/kvm/interrupt.c|   2 +-
  arch/s390/kvm/kvm-s390.c |   8 ++
  arch/s390/kvm/sigp.c | 269 
 ---
  arch/s390/mm/pgtable.c   |   2 +-
  7 files changed, 168 insertions(+), 143 deletions(-)
 

Pulled, thanks.

Paolo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Benchmarking for vhost polling patch

2014-10-29 Thread Razya Ladelsky
Hi Michael,

Following the polling patch thread: 
http://marc.info/?l=kvmm=140853271510179w=2, 
I changed poll_stop_idle to be counted in micro seconds, and carried out 
experiments using varying sizes of this value. The setup for netperf consisted 
of 
1 vm and 1 vhost , each running on their own dedicated core.
  
Here are the  numbers for netperf (micro benchmark):

polling|Send |Throughput|Utilization |S. Demand   
|vhost|exits|throughput|throughput
mode   |Msg  |  |Send  Recv  |Send  Recv  |util |/sec | /cpu |   
/cpu
   |Size |  |local remote|local remote| | |  |% 
change
   |bytes|10^6bits/s|  %%|us/KB us/KB |  %  | |  |
-
NoPolling  64   1054.11   99.97 3.01  7.78  3.74   38.80  92K7.60
Polling=1  64   1036.67   99.97 2.93  7.90  3.70   53.00  92K6.78 -10.78
Polling=5  64   1079.27   99.97 3.07  7.59  3.73   83.00  90K5.90 -22.35
Polling=7  64   1444.90   99.97 3.98  5.67  3.61   95.00  19.5K  7.41  -2.44
Polling=10 64   1521.70   99.97 4.21  5.38  3.63   98.00  8.5K   7.69   1.19
Polling=25 64   1534.24   99.97 4.18  5.34  3.57   99.00  8.5K   7.71   1.51
Polling=50 64   1534.24   99.97 4.18  5.34  3.57   99.00  8.5K   7.71   1.51
  
NoPolling  128  1577.39   99.97 4.09  5.19  3.40   54.00  113K   10.24 
Polling=1  128  1596.08   99.97 4.22  5.13  3.47   71.00  120K   9.34  -8.88
Polling=5  128  2238.49   99.97 5.45  3.66  3.19   92.00  24K11.66 13.82
Polling=7  128  2330.97   99.97 5.59  3.51  3.14   95.00  19.5K  11.96 16.70
Polling=10 128  2375.78   99.97 5.69  3.45  3.14   98.00  10K12.00 17.14
Polling=25 128  2655.01   99.97 2.45  3.09  1.21   99.00  8.5K   13.34 30.25
Polling=50 128  2655.01   99.97 2.45  3.09  1.21   99.00  8.5K   13.34 30.25
  
NoPolling  25   2558.10   99.97 2.33  3.20  1.20   67.00  120K   15.32 
Polling=1  25   2508.93   99.97 3.13  3.27  1.67   75.00  125K   14.34 -6.41
Polling=5  25   3740.34   99.97 2.70  2.19  0.95   94.00  17K19.28 25.86
Polling=7  25   3692.69   99.97 2.80  2.22  0.99   97.00  15.5K  18.75 22.37
Polling=10 25   4036.60   99.97 2.69  2.03  0.87   99.00  8.5K   20.29 32.42
Polling=25 25   3998.89   99.97 2.64  2.05  0.87   99.00  8.5K   20.10 31.18
Polling=50 25   3998.89   99.97 2.64  2.05  0.87   99.00  8.5K   20.10 31.18
  
NoPolling  512  4531.50   99.90 2.75  1.81  0.79   78.00  55K25.47 
Polling=1  512  4684.19   99.95 2.69  1.75  0.75   83.00  35K25.60  0.52
Polling=5  512  4932.65   99.75 2.75  1.68  0.74   91.00  12K25.86  1.52
Polling=7  512  5226.14   99.86 2.80  1.57  0.70   95.00  7.5K   26.82  5.30
Polling=10 512  5464.90   99.60 2.90  1.49  0.70   96.00  8.2K   27.94  9.69
Polling=25 512  5550.44   99.58 2.84  1.47  0.67   99.00  7.5K   27.95  9.73
Polling=50 512  5550.44   99.58 2.84  1.47  0.67   99.00  7.5K   27.95  9.73


As you can see from the last column, polling improves performance in most cases.

I ran memcached (macro benchmark), where (as in the previous benchmark) the vm 
and 
vhost each get their own dedicated core. I configured memslap with C=128, T=8, 
as 
this configuration was required to produce enough load to saturate the vm.
I tried several other configurations, but this one produced the maximal 
throughput(for the baseline). 
  
The numbers for memcached (macro benchmark):

polling time   TPS Netvhost vm   exits  TPS/cpu  TPS/cpu
mode   rate   util  util /sec % change
  %   
Disabled15.9s  125819  91.5   4599   87K873.74   
polling=1   15.8s  126820  92.3   6099   87K797.61   -8.71
polling=5   12.82  155799  113.4  7999   25.5K  875.280.18
polling=10  11.7s  160639  116.9  8399   16.3K  882.631.02
pollling=15 12.4s  160897  117.2  8799   15K865.04   -1.00
polling=100 11.7s  170971  124.4  9999   30 863.49   -1.17


For memcached TPS/cpu does not show a significant difference in any of the 
cases. 
However, TPS numbers did improve in up to 35%, which can be useful for 
under-utilized 
systems which have cpu time to spare for extra throughput. 

If it makes sense to you, I will continue with the other changes requested for 
the patch.

Thank you,
Razya




--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Xen-devel] [RFC] Hypervisor RNG and enumeration

2014-10-29 Thread Paolo Bonzini


On 10/29/2014 11:37 AM, Andrew Cooper wrote:
 While testing various nested combinations, XenServer has found that
 modern Windows Server versions must have the hypervisor bit hidden from
 them for them to be happy running HyperV, despite the fact that they
 will make use of the Viridian virtual extensions also provided.

Right.

 As a result, while it is certainly advisable for the hypervisor bit to
 be set, CommonHV should be available to be found by paravirtualised
 drivers inside an OS which can't cope with the hypervisor bit set.

Microsoft should just stop putting arbitrary limitations on their
software; or pay the price which, in this case, is not being able to use
the features from the common specification.  I guess what they'd do is
reinvent the RNG as a Viridian extension (if they need it).

You can certainly do CPUID(0x4F00) even if HYPERVISOR=0.  What you
get back is undefined, but in all likelihood it won't be the
CommonHVIntf string.

Paolo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Xen-devel] [RFC] Hypervisor RNG and enumeration

2014-10-29 Thread David Vrabel
On 29/10/14 13:45, Paolo Bonzini wrote:
 
 
 On 10/29/2014 11:37 AM, Andrew Cooper wrote:
 While testing various nested combinations, XenServer has found that
 modern Windows Server versions must have the hypervisor bit hidden from
 them for them to be happy running HyperV, despite the fact that they
 will make use of the Viridian virtual extensions also provided.
 
 Right.
 
 As a result, while it is certainly advisable for the hypervisor bit to
 be set, CommonHV should be available to be found by paravirtualised
 drivers inside an OS which can't cope with the hypervisor bit set.
 
 Microsoft should just stop putting arbitrary limitations on their
 software; or pay the price which, in this case, is not being able to use
 the features from the common specification.  I guess what they'd do is
 reinvent the RNG as a Viridian extension (if they need it).
 
 You can certainly do CPUID(0x4F00) even if HYPERVISOR=0.  What you
 get back is undefined, but in all likelihood it won't be the
 CommonHVIntf string.

Microsoft already has a specification to obtain a random number via an
ACPI device.  The VM Generation ID.

http://www.microsoft.com/en-us/download/details.aspx?id=30707

David
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Xen-devel] [RFC] Hypervisor RNG and enumeration

2014-10-29 Thread Paolo Bonzini
On 10/29/2014 02:57 PM, David Vrabel wrote:
 Microsoft already has a specification to obtain a random number via an
 ACPI device.  The VM Generation ID.
 
 http://www.microsoft.com/en-us/download/details.aspx?id=30707

That's a very different thing.  The VM Generation ID always returns the
same number if you run the VM from the same configuration file.

It tells the VM when it might need a reseed, but provides neither a best
effort random number like this spec does, nor an entropy source like
virtio-rng does.

Paolo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Xen-devel] [RFC] Hypervisor RNG and enumeration

2014-10-29 Thread Ian Jackson
Andy Lutomirski writes ([Xen-devel] [RFC] Hypervisor RNG and enumeration):
 Here's a draft CommonHV spec.  It's also on github:
 https://github.com/amluto/CommonHV

This a worthwhile direction to investigate, and an interesting
proposal.  From a Xen point of view I have some concerns, though.

I think in Xen we would want to implement the bulk of the provision of
random numbers to guests outside the hypervisor.  That is, the
hypervisor itself should not have random number pool code, associated
policy, and so on.  We would like to avoid adding too much code to the
hypervisor.

That functionality should live in the lower toolstack layers.  For the
benefit of people who want to do radical disaggregation (for security
reasons), it should be capable of being provided by a different domain
to dom0.

I think a fully general interface based purely on MSRs makes that
difficult in a number of ways:

 * Currently I don't think there is any way in Xen to cause MSR
   accesses to be passed to toolstack support programs.

 * In some configurations, Xen PV domains do not have a suitable
   longrunning service process to handle requests of this kind.

 * MSR reads of this kind might be expected to be fast but if they
   involve trapping to a service domain they might be very slow.

 * This interface is very x86-specific.


It seems to me that the real need for this facility is to provide a
good seed for the guest's own cryptographic PRNG.  If we restrict
ourselves to that use case, we can sidestep the difficulties.

In particular, the parts of this proposal that are most difficult are:

 * The facility for the guest to provide random numbers back to the
   host.  I think this can be dealt with easily by hypervisor-specific
   mechanisms, if it is desirable.

 * The implication that a hypervisor ought to be providing a unbounded
   stream of random numbers, rather than a fixed amount of seed.


I think the most obvious approach would be to provide the VM, at
startup, with a page containing a fixed amount of random number seed,
along with some metatdata.

Some platform-specific way of discovering the location of the page
would have to be defined.  (That might an MSR but more likely it would
be Device Tree or ACPI.)

After the guest has read the page, it would be free to treat it as
normal memory.

The metadata need do little more than specify the length and the
amount of provided entropy.  There should be some room for expansion.

The specification should say that the provided seed MUST be
cryptographically secure, MUST have as much entropy as stated and that
that amount of entropy MUST be at least (say) 64 bits and SHOULD be at
least (say) 256 bits.

Ian.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Xen-devel] [RFC] Hypervisor RNG and enumeration

2014-10-29 Thread H. Peter Anvin
On 10/29/2014 03:37 AM, Andrew Cooper wrote:

 CPUID with EAX = 0x4F01 and ECX = N MUST return all zeros.

 To the extent that the hypervisor prefers a given interface, it should
 specify that interface earlier in the list.  For example, KVM might place
 its KVMKVMKVM signature first in the list to indicate that it should be
 used by guests in preference to other supported interfaces.  Other 
 hypervisors
 would likely use a different order.

 The exact semantics of the ordering of the list is beyond the scope of
 this specification.
 
 How do you evaluate N?
 
 It would make more sense for CPUID.4F01[ECX=0] to return N in one
 register, and perhaps prefered interface index in another.  The
 signatures can then be obtained from CPUID.4F01[ECX={1 to N}].
 
 That way, a consumer can be confident that they have found all the
 signatures, without relying on an unbounded loop and checking for zeroes

Yes.  Specifically, it should return it in EAX.  That is the preferred
interface and we are trying to push for that going forward.

-hpa


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Xen-devel] [RFC] Hypervisor RNG and enumeration

2014-10-29 Thread H. Peter Anvin
On 10/29/2014 03:37 AM, Andrew Cooper wrote:

 CPUID with EAX = 0x4F01 and ECX = N MUST return all zeros.

 To the extent that the hypervisor prefers a given interface, it should
 specify that interface earlier in the list.  For example, KVM might place
 its KVMKVMKVM signature first in the list to indicate that it should be
 used by guests in preference to other supported interfaces.  Other 
 hypervisors
 would likely use a different order.

 The exact semantics of the ordering of the list is beyond the scope of
 this specification.
 
 How do you evaluate N?
 
 It would make more sense for CPUID.4F01[ECX=0] to return N in one
 register, and perhaps prefered interface index in another.  The
 signatures can then be obtained from CPUID.4F01[ECX={1 to N}].
 
 That way, a consumer can be confident that they have found all the
 signatures, without relying on an unbounded loop and checking for zeroes

Yes.  Specifically, it should return it in EAX.  That is the preferred
interface and we are trying to push for that going forward.

-hpa


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Xen-devel] [RFC] Hypervisor RNG and enumeration

2014-10-29 Thread H. Peter Anvin
On 10/29/2014 03:37 AM, Andrew Cooper wrote:

 CPUID with EAX = 0x4F01 and ECX = N MUST return all zeros.

 To the extent that the hypervisor prefers a given interface, it should
 specify that interface earlier in the list.  For example, KVM might place
 its KVMKVMKVM signature first in the list to indicate that it should be
 used by guests in preference to other supported interfaces.  Other 
 hypervisors
 would likely use a different order.

 The exact semantics of the ordering of the list is beyond the scope of
 this specification.
 
 How do you evaluate N?
 
 It would make more sense for CPUID.4F01[ECX=0] to return N in one
 register, and perhaps prefered interface index in another.  The
 signatures can then be obtained from CPUID.4F01[ECX={1 to N}].
 
 That way, a consumer can be confident that they have found all the
 signatures, without relying on an unbounded loop and checking for zeroes

Yes.  Specifically, it should return it in EAX.  That is the preferred
interface and we are trying to push for that going forward.

-hpa


--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Xen-devel] [RFC] Hypervisor RNG and enumeration

2014-10-29 Thread Andy Lutomirski
On Oct 29, 2014 8:17 AM, Ian Jackson ian.jack...@eu.citrix.com wrote:

 Andy Lutomirski writes ([Xen-devel] [RFC] Hypervisor RNG and enumeration):
  Here's a draft CommonHV spec.  It's also on github:
  https://github.com/amluto/CommonHV

 This a worthwhile direction to investigate, and an interesting
 proposal.  From a Xen point of view I have some concerns, though.

 I think in Xen we would want to implement the bulk of the provision of
 random numbers to guests outside the hypervisor.  That is, the
 hypervisor itself should not have random number pool code, associated
 policy, and so on.  We would like to avoid adding too much code to the
 hypervisor.

 That functionality should live in the lower toolstack layers.  For the
 benefit of people who want to do radical disaggregation (for security
 reasons), it should be capable of being provided by a different domain
 to dom0.

 I think a fully general interface based purely on MSRs makes that
 difficult in a number of ways:

  * Currently I don't think there is any way in Xen to cause MSR
accesses to be passed to toolstack support programs.

  * In some configurations, Xen PV domains do not have a suitable
longrunning service process to handle requests of this kind.

  * MSR reads of this kind might be expected to be fast but if they
involve trapping to a service domain they might be very slow.

I have no objection to specifying that these reads may be quite slow.
Guests should only use them at boot and if they have some reason to
distrust their RNG pool.

The latter can legitimately happen after various types of suspend or
after migration (detected by VM Generation ID, for example).


  * This interface is very x86-specific.


Agreed.

Part of the motivation is to allow guests to use this mechanism very
early in boot for stack canaries, ASLR, etc.  I don't know a good way
to do that without something very platform specific.


 It seems to me that the real need for this facility is to provide a
 good seed for the guest's own cryptographic PRNG.  If we restrict
 ourselves to that use case, we can sidestep the difficulties.

 In particular, the parts of this proposal that are most difficult are:

  * The facility for the guest to provide random numbers back to the
host.  I think this can be dealt with easily by hypervisor-specific
mechanisms, if it is desirable.

Xen can implement this is a no-op.  If we use feature bits, wrmsr
support could be separately enumerated.


  * The implication that a hypervisor ought to be providing a unbounded
stream of random numbers, rather than a fixed amount of seed.


I don't expect hypervisors to estimate the entropy available through
this mechanism.  Given that, the length of the stream is largely
irrelevant, except that an unbounded stream allowed reseeding after
boot.


 I think the most obvious approach would be to provide the VM, at
 startup, with a page containing a fixed amount of random number seed,
 along with some metatdata.

 Some platform-specific way of discovering the location of the page
 would have to be defined.  (That might an MSR but more likely it would
 be Device Tree or ACPI.)

 After the guest has read the page, it would be free to treat it as
 normal memory.

 The metadata need do little more than specify the length and the
 amount of provided entropy.  There should be some room for expansion.

ACPI is not useful early in boot.  DT might be, but that could be a
separate spec.


 The specification should say that the provided seed MUST be
 cryptographically secure, MUST have as much entropy as stated and that
 that amount of entropy MUST be at least (say) 64 bits and SHOULD be at
 least (say) 256 bits.

I don't think this is practical.  It will require hypervisors to
throttle guest startup to ensure that they have that much entropy.

I'm not fundamentally opposed to allowing hypervisors to provide more
than 64 bits of data per invocation, which would help when the trap is
very slow, but I don't know of a suitably simple way to do that.

--Andy
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Xen-devel] [RFC] Hypervisor RNG and enumeration

2014-10-29 Thread Andy Lutomirski
On Wed, Oct 29, 2014 at 9:07 AM, H. Peter Anvin h...@zytor.com wrote:
 On 10/29/2014 03:37 AM, Andrew Cooper wrote:

 CPUID with EAX = 0x4F01 and ECX = N MUST return all zeros.

 To the extent that the hypervisor prefers a given interface, it should
 specify that interface earlier in the list.  For example, KVM might place
 its KVMKVMKVM signature first in the list to indicate that it should be
 used by guests in preference to other supported interfaces.  Other 
 hypervisors
 would likely use a different order.

 The exact semantics of the ordering of the list is beyond the scope of
 this specification.

 How do you evaluate N?

 It would make more sense for CPUID.4F01[ECX=0] to return N in one
 register, and perhaps prefered interface index in another.  The
 signatures can then be obtained from CPUID.4F01[ECX={1 to N}].

 That way, a consumer can be confident that they have found all the
 signatures, without relying on an unbounded loop and checking for zeroes

 Yes.  Specifically, it should return it in EAX.  That is the preferred
 interface and we are trying to push for that going forward.


I'm okay with that.

I'm inclined to leave EBX, ECX, and EDX reserved for now, though.
Barring an actual use case in which the order of the list isn't
sufficient to determine preference, I don't see the need to give
another preference indication.

I updated the copy of github to make this change and to use an
explicit feature bit for the RNG.

--Andy

 -hpa





-- 
Andy Lutomirski
AMA Capital Management, LLC
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/4] mm: gup: add get_user_pages_locked and get_user_pages_unlocked

2014-10-29 Thread Andrea Arcangeli
On Thu, Oct 09, 2014 at 12:47:23PM +0200, Peter Zijlstra wrote:
 On Wed, Oct 01, 2014 at 10:56:35AM +0200, Andrea Arcangeli wrote:
  +static inline long __get_user_pages_locked(struct task_struct *tsk,
  +  struct mm_struct *mm,
  +  unsigned long start,
  +  unsigned long nr_pages,
  +  int write, int force,
  +  struct page **pages,
  +  struct vm_area_struct **vmas,
  +  int *locked,
  +  bool notify_drop)
 
 You might want to consider __always_inline to make sure it does indeed
 get inlined and constant propagation works for @locked and @notify_drop.

Ok, that's included in the last patchset submit.

Thanks,
Andrea
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [Xen-devel] [RFC] Hypervisor RNG and enumeration

2014-10-29 Thread Jake Oshins

I have no objection to specifying that these reads may be quite slow.
Guests should only use them at boot and if they have some reason to
distrust their RNG pool.

The latter can legitimately happen after various types of suspend or
after migration (detected by VM Generation ID, for example).

Just as a point of clarification, the VM Generation ID changes (at least in the 
Hyper-V implementation) only when the VM may have observed a different future, 
as when a VM backup is restored, a checkpoint is applied, etc.  It does not 
change during migration, when the VM is suspended or when it is rebooted.  I've 
heard anecdotes from application vendors saying that there is some other 
hypervisor that actually does change the ID at these moments and they wanted us 
to us to fix that, until I explained that I only control Hyper-V.

-- Jake Oshins

N�r��yb�X��ǧv�^�)޺{.n�+h����ܨ}���Ơz�j:+v���zZ+��+zf���h���~i���z��w���?��)ߢf

Re: [Xen-devel] [RFC] Hypervisor RNG and enumeration

2014-10-29 Thread Andy Lutomirski
On Wed, Oct 29, 2014 at 9:29 AM, Jake Oshins ja...@microsoft.com wrote:

I have no objection to specifying that these reads may be quite slow.
Guests should only use them at boot and if they have some reason to
distrust their RNG pool.

The latter can legitimately happen after various types of suspend or
after migration (detected by VM Generation ID, for example).

 Just as a point of clarification, the VM Generation ID changes (at least in 
 the Hyper-V implementation) only when the VM may have observed a different 
 future, as when a VM backup is restored, a checkpoint is applied, etc.  It 
 does not change during migration, when the VM is suspended or when it is 
 rebooted.  I've heard anecdotes from application vendors saying that there is 
 some other hypervisor that actually does change the ID at these moments and 
 they wanted us to us to fix that, until I explained that I only control 
 Hyper-V.


Fair enough.

If the VM may indeed have observed a different future, then I would
argue that reseeding the RNG is very important -- more so than after a
normal migration.

If the VM trusts that its other history hasn't been compromised, then
merely mixing in a unique value would get most of the benefit.

--Andy

 -- Jake Oshins




-- 
Andy Lutomirski
AMA Capital Management, LLC
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH v3 4/6] target-i386: Remove unsupported bits from all CPU models

2014-10-29 Thread Andreas Färber
Am 03.10.2014 um 21:39 schrieb Eduardo Habkost:
 The following CPU features were never supported by neither TCG or KVM,
 so they are useless on the CPU model definitions, today:
 
  * CPUID_DTS (DS)
  * CPUID_HT
  * CPUID_TM
  * CPUID_PBE
  * CPUID_EXT_DTES64
  * CPUID_EXT_DSCPL
  * CPUID_EXT_EST
  * CPUID_EXT_TM2
  * CPUID_EXT_XTPR
  * CPUID_EXT_PDCM
  * CPUID_SVM_LBRV
 
 As using enforce mode is the only way to ensure guest ABI doesn't
 change when moving to a different host, we should make enforce mode
 the default or at least encourage management software to always use it.
 
 In turn, to make enforce usable, we need CPU models that work without
 always requiring some features to be explicitly disabled. This patch
 removes the above features from all CPU model definitions.
 
 We won't need any machine-type compat code for those changes, because it
 is impossible to have existing VMs with those features enabled.
 
 Signed-off-by: Eduardo Habkost ehabk...@redhat.com
 Cc: Aurelien Jarno aurel...@aurel32.net
 ---
 Changes v1 - v2:
 * Trivial typo fix in comment
 ---
  target-i386/cpu.c | 33 -
  1 file changed, 20 insertions(+), 13 deletions(-)
 
 diff --git a/target-i386/cpu.c b/target-i386/cpu.c
 index 4119fca..1e9fff9 100644
 --- a/target-i386/cpu.c
 +++ b/target-i386/cpu.c
 @@ -681,10 +681,11 @@ static X86CPUDefinition builtin_x86_defs[] = {
  .family = 16,
  .model = 2,
  .stepping = 3,
 +/* Missing: CPUID_HT */
  .features[FEAT_1_EDX] =
  PPRO_FEATURES |
  CPUID_MTRR | CPUID_CLFLUSH | CPUID_MCA |
 -CPUID_PSE36 | CPUID_VME | CPUID_HT,
 +CPUID_PSE36 | CPUID_VME,
  .features[FEAT_1_ECX] =
  CPUID_EXT_SSE3 | CPUID_EXT_MONITOR | CPUID_EXT_CX16 |
  CPUID_EXT_POPCNT,
[snip]

I'm okay with retaining these as comments. Anyone any objections?

Andreas

-- 
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany
GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer; HRB 16746 AG Nürnberg
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/4] mm: gup: add get_user_pages_locked and get_user_pages_unlocked

2014-10-29 Thread Andrea Arcangeli
On Thu, Oct 09, 2014 at 12:50:37PM +0200, Peter Zijlstra wrote:
 On Wed, Oct 01, 2014 at 10:56:35AM +0200, Andrea Arcangeli wrote:
 
  +static inline long __get_user_pages_locked(struct task_struct *tsk,
  +  struct mm_struct *mm,
  +  unsigned long start,
  +  unsigned long nr_pages,
  +  int write, int force,
  +  struct page **pages,
  +  struct vm_area_struct **vmas,
  +  int *locked,
  +  bool notify_drop)
  +{
 
  +   if (notify_drop  lock_dropped  *locked) {
  +   /*
  +* We must let the caller know we temporarily dropped the lock
  +* and so the critical section protected by it was lost.
  +*/
  +   up_read(mm-mmap_sem);
  +   *locked = 0;
  +   }
  +   return pages_done;
  +}
 
  +long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm,
  +  unsigned long start, unsigned long nr_pages,
  +  int write, int force, struct page **pages,
  +  int *locked)
  +{
  +   return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
  +  pages, NULL, locked, true);
  +}
 
  +long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
  +unsigned long start, unsigned long nr_pages,
  +int write, int force, struct page **pages)
  +{
  +   long ret;
  +   int locked = 1;
  +   down_read(mm-mmap_sem);
  +   ret = __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
  + pages, NULL, locked, false);
  +   if (locked)
  +   up_read(mm-mmap_sem);
  +   return ret;
  +}
 
   long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
  unsigned long start, unsigned long nr_pages, int write,
  int force, struct page **pages, struct vm_area_struct **vmas)
   {
  +   return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
  +  pages, vmas, NULL, false);
   }
 
 I'm wondering about that notify_drop parameter, what's the added
 benefit? If you look at these 3 callers we can do away with it, since in
 the second called where we have locked but !notify_drop we seem to do

The second (and third) caller pass notify_drop=false, so the
notify_drop parameter is always a noop for them. They certainly could
get away without it.

 the exact same thing afterwards anyway.

It makes a difference only to the first caller, if it wasn't for the
first caller notify_drop could be dropped. The first caller does this:

return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
   pages, NULL, locked, true, FOLL_TOUCH);
^ notify_drop = true

Without notify_drop=true the first caller could make its own
respective caller think the lock has never been dropped, just because
it is locked by the time get_user_pages_locked returned. But the
caller must be made aware that the lock has been dropped during the
call and in turn any vma it got before inside the mmap_sem critical
section is now stale. That's all notify_drop achieves.

Thanks,
Andrea
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH v3 5/6] target-i386: Don't enable nested VMX by default

2014-10-29 Thread Andreas Färber
Am 03.10.2014 um 21:39 schrieb Eduardo Habkost:
 TCG doesn't support VMX, and nested VMX is not enabled by default on the
 KVM kernel module.
 
 So, there's no reason to have VMX enabled by default on the core2duo and
 coreduo CPU models, today. Even the newer Intel CPU model definitions
 don't have it enabled.
 
 In this case, we need machine-type compat code, as people may be running
 the older machine-types on hosts that had VMX nesting enabled.
 
 Signed-off-by: Eduardo Habkost ehabk...@redhat.com
 ---
  hw/i386/pc_piix.c | 2 ++
  hw/i386/pc_q35.c  | 2 ++
  target-i386/cpu.c | 8 
  3 files changed, 8 insertions(+), 4 deletions(-)
[...]
 diff --git a/target-i386/cpu.c b/target-i386/cpu.c
 index 1e9fff9..c336003 100644
 --- a/target-i386/cpu.c
 +++ b/target-i386/cpu.c
 @@ -720,10 +720,10 @@ static X86CPUDefinition builtin_x86_defs[] = {
  CPUID_MTRR | CPUID_CLFLUSH | CPUID_MCA |
  CPUID_PSE36 | CPUID_VME | CPUID_ACPI | CPUID_SS,
  /* Missing: CPUID_EXT_DTES64, CPUID_EXT_DSCPL, CPUID_EXT_EST,
 - * CPUID_EXT_TM2, CPUID_EXT_XTPR, CPUID_EXT_PDCM */
 + * CPUID_EXT_TM2, CPUID_EXT_XTPR, CPUID_EXT_PDCM, CPUID_EXT_VMX */
  .features[FEAT_1_ECX] =
  CPUID_EXT_SSE3 | CPUID_EXT_MONITOR | CPUID_EXT_SSSE3 |
 -CPUID_EXT_VMX | CPUID_EXT_CX16,
 +CPUID_EXT_CX16,
  .features[FEAT_8000_0001_EDX] =
  CPUID_EXT2_LM | CPUID_EXT2_SYSCALL | CPUID_EXT2_NX,
  .features[FEAT_8000_0001_ECX] =
[snip]

Here I'm less certain what the best approach is. As you point out,
there's an inconsistency that I agree should be fixed. I wonder however
whether an approach similar to 3/6 for KVM only would be better? I.e.,
have VMX as a sometimes-KVM-supported feature be listed in the model and
filter it out for accel=kvm so that -cpu enforce works, but let
accel=tcg fail with features not implemented.

Regards,
Andreas

-- 
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany
GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer; HRB 16746 AG Nürnberg
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 00/17] RFC: userfault v2

2014-10-29 Thread Andrea Arcangeli
Hi Zhanghailiang,

On Mon, Oct 27, 2014 at 05:32:51PM +0800, zhanghailiang wrote:
 Hi Andrea,
 
 Thanks for your hard work on userfault;)
 
 This is really a useful API.
 
 I want to confirm a question:
 Can we support distinguishing between writing and reading memory for 
 userfault?
 That is, we can decide whether writing a page, reading a page or both trigger 
 userfault.
 
 I think this will help supporting vhost-scsi,ivshmem for migration,
 we can trace dirty page in userspace.
 
 Actually, i'm trying to relize live memory snapshot based on pre-copy and 
 userfault,
 but reading memory from migration thread will also trigger userfault.
 It will be easy to implement live memory snapshot, if we support configuring
 userfault for writing memory only.

Mail is going to be long enough already so I'll just assume tracking
dirty memory in userland (instead of doing it in kernel) is worthy
feature to have here.

After some chat during the KVMForum I've been already thinking it
could be beneficial for some usage to give userland the information
about the fault being read or write, combined with the ability of
mapping pages wrprotected to mcopy_atomic (that would work without
false positives only with MADV_DONTFORK also set, but it's already set
in qemu). That will require vma-vm_flags  VM_USERFAULT to be
checked also in the wrprotect faults, not just in the not present
faults, but it's not a massive change. Returning the read/write
information is also a not massive change. This will then payoff mostly
if there's also a way to remove the memory atomically (kind of
remap_anon_pages).

Would that be enough? I mean are you still ok if non present read
fault traps too (you'd be notified it's a read) and you get
notification for both wrprotect and non present faults?

The question then is how you mark the memory readonly to let the
wrprotect faults trap if the memory already existed and you didn't map
it yourself in the guest with mcopy_atomic with a readonly flag.

My current plan would be:

- keep MADV_USERFAULT|NOUSERFAULT just to set VM_USERFAULT for the
  fast path check in the not-present and wrprotect page fault

- if VM_USERFAULT is set, find if there's a userfaultfd registered
  into that vma too

if yes engage userfaultfd protocol

otherwise raise SIGBUS (single threaded apps should be fine with
SIGBUS and it'll avoid them to spawn a thread in order to talk the
userfaultfd protocol)

- if userfaultfd protocol is engaged, return read|write fault + fault
  address to read(ufd) syscalls

- leave the userfault resolution mechanism independent of the
  userfaultfd protocol so we keep the two problems separated and we
  don't mix them in the same API which makes it even harder to
  finalize it.

add mcopy_atomic (with a flag to map the page readonly too)

The alternative would be to hide mcopy_atomic (and even
remap_anon_pages in order to remove the memory atomically for
the externalization into the cloud) as userfaultfd commands to
write into the fd. But then there would be no much point to keep
MADV_USERFAULT around if I do so and I could just remove it
too or it doesn't look clean having to open the userfaultfd just
to issue an hidden mcopy_atomic.

So it becomes a decision if the basic SIGBUS mode for single
threaded apps should be supported or not. As long as we support
SIGBUS too and we don't force to use userfaultfd as the only
mechanism to be notified about userfaults, having a separate
mcopy_atomic syscall sounds cleaner.
 
Perhaps mcopy_atomic could be used in other cases that may arise
later that may not be connected with the userfault.

Questions to double check the above plan is ok:

1) should I drop the SIGBUS behavior and MADV_USERFAULT?

2) should I hide mcopy_atomic as a write into the userfaultfd?

   NOTE: even if I hide mcopy_atomic as a userfaultfd command to write
   into the fd, the buffer pointer passed to write() syscall would
   still _not_ be pointing to the data like a regular write, but it
   would be a pointer to a command structure that points to the source
   and destination data of the hidden mcopy_atomic, the only
   advantage is that perhaps I could wakeup the blocked page faults
   without requiring an additional syscall.

   The standalone mcopy_atomic would still require a write into the
   userfaultfd as it happens now after remap_anon_pages returns, in
   order to wakeup the stopped page faults.

3) should I add a registration command to trap only write faults?

   The protocol can always be extended later anyway in a backwards
   compatible way but it's better if we get it fully featured from the
   start.

For completeness, some answers for other questions I've seen floating
around but that weren't posted on the list yet (you can skip reading
the below part if not interested):

- open(/dev/userfault) instead of sys_userfaultfd(), I don't see the
  benefit: userfaultfd is just like eventfd in 

Re: [Qemu-devel] [PATCH 00/17] RFC: userfault v2

2014-10-29 Thread Peter Maydell
On 29 October 2014 17:46, Andrea Arcangeli aarca...@redhat.com wrote:
 After some chat during the KVMForum I've been already thinking it
 could be beneficial for some usage to give userland the information
 about the fault being read or write

...I wonder if that would let us replace the current nasty
mess we use in linux-user to detect read vs write faults
(which uses a bunch of architecture-specific hacks including
in some cases look at the insn that triggered this SEGV and
decode it to see if it was a load or a store; see the
various cpu_signal_handler() implementations in user-exec.c).

-- PMM
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Xen-devel] [RFC] Hypervisor RNG and enumeration

2014-10-29 Thread Paolo Bonzini
On 10/29/2014 05:29 PM, Jake Oshins wrote:
 Just as a point of clarification, the VM Generation ID changes (at
 least in the Hyper-V implementation) only when the VM may have
 observed a different future, as when a VM backup is restored, a
 checkpoint is applied, etc.  It does not change during migration,
 when the VM is suspended or when it is rebooted.  I've heard
 anecdotes from application vendors saying that there is some other
 hypervisor that actually does change the ID at these moments and they
 wanted us to us to fix that, until I explained that I only control
 Hyper-V.

This is indeed the only reasonable way you can read the vmgenid spec.

Paolo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Fix Penguin Penalty 17th October2014 ( mail-archive.com )

2014-10-29 Thread centroids86916
Dear Sir

Did your website get hit by Google Penguin update on October 17th 2014? What 
basically is Google Penguin Update? It is actually a code name for Google 
algorithm which aims at decreasing your websites search engine rankings that 
violate Google’s guidelines by using black hat SEO techniques to rank your 
webpage by giving number of spammy links to the page.
 
We are one of those few SEO companies that can help you avoid penalties from 
Google Updates like Penguin and Panda. Our clients have survived all the 
previous and present updates with ease. They have never been hit because we use 
100% white hat SEO techniques to rank Webpages.  Simple thing that we do to 
keep websites away from any Penguin or Panda penalties is follow Google 
guidelines and we give Google users the best answers to their queries.

If you are looking to increase the quality of your websites and to get more 
targeted traffic or save your websites from these Google penalties email us 
back with your interest. 

We will be glad to serve you and help you grow your business.

Regards

Vince G

SEO Manager ( TOB )
B7 Green Avenue, Amritsar 143001 Punjab

NO CLICK in the subject to STOP EMAILS
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support

2014-10-29 Thread Waiman Long

On 10/27/2014 05:22 PM, Waiman Long wrote:

On 10/27/2014 02:04 PM, Peter Zijlstra wrote:

On Mon, Oct 27, 2014 at 01:38:20PM -0400, Waiman Long wrote:

On 10/24/2014 04:54 AM, Peter Zijlstra wrote:

On Thu, Oct 16, 2014 at 02:10:38PM -0400, Waiman Long wrote:

Since enabling paravirt spinlock will disable unlock function 
inlining,

a jump label can be added to the unlock function without adding patch
sites all over the kernel.

But you don't have to. My patches allowed for the inline to remain,
again reducing the overhead of enabling PV spinlocks while running 
on a

real machine.

Look at:

   http://lkml.kernel.org/r/20140615130154.213923...@chello.nl

In particular this hunk:

Index: linux-2.6/arch/x86/kernel/paravirt_patch_64.c
===
--- linux-2.6.orig/arch/x86/kernel/paravirt_patch_64.c
+++ linux-2.6/arch/x86/kernel/paravirt_patch_64.c
@@ -22,6 +22,10 @@ DEF_NATIVE(pv_cpu_ops, swapgs, swapgs)
  DEF_NATIVE(, mov32, mov %edi, %eax);
  DEF_NATIVE(, mov64, mov %rdi, %rax);

+#if defined(CONFIG_PARAVIRT_SPINLOCKS)   
defined(CONFIG_QUEUE_SPINLOCK)

+DEF_NATIVE(pv_lock_ops, queue_unlock, movb $0, (%rdi));
+#endif
+
  unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
  {
 return paravirt_patch_insns(insnbuf, len,
@@ -61,6 +65,9 @@ unsigned native_patch(u8 type, u16 clobb
 PATCH_SITE(pv_cpu_ops, clts);
 PATCH_SITE(pv_mmu_ops, flush_tlb_single);
 PATCH_SITE(pv_cpu_ops, wbinvd);
+#if defined(CONFIG_PARAVIRT_SPINLOCKS)   
defined(CONFIG_QUEUE_SPINLOCK)

+   PATCH_SITE(pv_lock_ops, queue_unlock);
+#endif

 patch_site:
 ret = paravirt_patch_insns(ibuf, len, start, end);


That makes sure to overwrite the callee-saved call to the
pv_lock_ops::queue_unlock with the immediate asm movb $0, (%rdi).


Therefore you can retain the inlined unlock with hardly (there 
might be

some NOP padding) any overhead at all. On PV it reverts to a callee
saved function call.
My concern is that spin_unlock() can be called in many places, 
including
loadable kernel modules. Can the paravirt_patch_ident_32() function 
able to
patch all of them in reasonable time? How about a kernel module 
loaded later

at run time?

modules should be fine, see arch/x86/kernel/module.c:module_finalize()
-  apply_paravirt().

Also note that the 'default' text is an indirect call into the paravirt
ops table which routes to the 'right' function, so even if the text
patching would be 'late' calls would 'work' as expected, just slower.


Thanks for letting me know about that. I have this concern because 
your patch didn't change the current configuration of disabling unlock 
inlining when paravirt_spinlock is enabled. With that, I think it is 
worthwhile to reduce the performance delta between the PV and non-PV 
kernel on bare metal.


I am sorry that the unlock call sites patching code doesn't work in a 
virtual guest. Your pvqspinlock patch did an unconditional patching even 
in a virtual guest. I added check for the paravirt_spinlocks_enabled, 
but it turned out that some spin_unlock() seemed to be called before 
paravirt_spinlocks_enabled is set. As a result, some call sites were 
still patched resulting in missed wake up's and system hang.


At this point, I am going to leave out that change from my patch set 
until we can figure out a better way of doing that.


-Longman
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Qemu-devel] [PATCH v3 5/6] target-i386: Don't enable nested VMX by default

2014-10-29 Thread Eduardo Habkost
On Wed, Oct 29, 2014 at 06:40:48PM +0100, Andreas Färber wrote:
 Am 03.10.2014 um 21:39 schrieb Eduardo Habkost:
  TCG doesn't support VMX, and nested VMX is not enabled by default on the
  KVM kernel module.
  
  So, there's no reason to have VMX enabled by default on the core2duo and
  coreduo CPU models, today. Even the newer Intel CPU model definitions
  don't have it enabled.
  
  In this case, we need machine-type compat code, as people may be running
  the older machine-types on hosts that had VMX nesting enabled.
  
  Signed-off-by: Eduardo Habkost ehabk...@redhat.com
  ---
   hw/i386/pc_piix.c | 2 ++
   hw/i386/pc_q35.c  | 2 ++
   target-i386/cpu.c | 8 
   3 files changed, 8 insertions(+), 4 deletions(-)
 [...]
  diff --git a/target-i386/cpu.c b/target-i386/cpu.c
  index 1e9fff9..c336003 100644
  --- a/target-i386/cpu.c
  +++ b/target-i386/cpu.c
  @@ -720,10 +720,10 @@ static X86CPUDefinition builtin_x86_defs[] = {
   CPUID_MTRR | CPUID_CLFLUSH | CPUID_MCA |
   CPUID_PSE36 | CPUID_VME | CPUID_ACPI | CPUID_SS,
   /* Missing: CPUID_EXT_DTES64, CPUID_EXT_DSCPL, CPUID_EXT_EST,
  - * CPUID_EXT_TM2, CPUID_EXT_XTPR, CPUID_EXT_PDCM */
  + * CPUID_EXT_TM2, CPUID_EXT_XTPR, CPUID_EXT_PDCM, CPUID_EXT_VMX */
   .features[FEAT_1_ECX] =
   CPUID_EXT_SSE3 | CPUID_EXT_MONITOR | CPUID_EXT_SSSE3 |
  -CPUID_EXT_VMX | CPUID_EXT_CX16,
  +CPUID_EXT_CX16,
   .features[FEAT_8000_0001_EDX] =
   CPUID_EXT2_LM | CPUID_EXT2_SYSCALL | CPUID_EXT2_NX,
   .features[FEAT_8000_0001_ECX] =
 [snip]
 
 Here I'm less certain what the best approach is. As you point out,
 there's an inconsistency that I agree should be fixed. I wonder however
 whether an approach similar to 3/6 for KVM only would be better? I.e.,
 have VMX as a sometimes-KVM-supported feature be listed in the model and
 filter it out for accel=kvm so that -cpu enforce works, but let
 accel=tcg fail with features not implemented.

I don't mind either way, this is up for TCG maintainers. But I don't
understand what would be the benefit in keeping it enabled by default
for TCG.

Fortunately the answer for KVM and TCG are completely independent from
each other, now. We just need to follow this depending on the defaults
we choose:

  | Default on |
  |  TCG | KVM | Implementation
  +--+-+-
A |  NO  | NO  | Unset it on CPU model table (builtin_x86_defs)
B |  NO  | YES | Unset on table, set on kvm_default_features
C |  YES | NO  | Set on table, set on kvm_default_unset_features
B |  YES | YES | Set it on CPU model table
  +--+-+-

This patch implements row A.

If you tell me you really want VMX to be enabled by default on TCG mode,
then we can implement row C like we already did for CPUID_ACPI,
CPUID_EXT_MONITOR, and CPUID_EXT3_SVM.

But, are you really sure you do?

Considering accel=tcg behavior only (ignore KVM by now): why is this
different from all the features listed on patch 1/6? VMX was never
supported by TCG, so (like all the features from 1/6) it is not useful
to have it enabled by default when accel=tcg.

-- 
Eduardo
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v13 00/11] qspinlock: a 4-byte queue spinlock with PV support

2014-10-29 Thread Waiman Long
v12-v13:
 - Change patch 9 to generate separate versions of the
   queue_spin_lock_slowpath functions for bare metal and PV guest. This
   reduces the performance impact of the PV code on bare metal systems.

v11-v12:
 - Based on PeterZ's version of the qspinlock patch
   (https://lkml.org/lkml/2014/6/15/63).
 - Incorporated many of the review comments from Konrad Wilk and
   Paolo Bonzini.
 - The pvqspinlock code is largely from my previous version with
   PeterZ's way of going from queue tail to head and his idea of
   using callee saved calls to KVM and XEN codes.

v10-v11:
  - Use a simple test-and-set unfair lock to simplify the code,
but performance may suffer a bit for large guest with many CPUs.
  - Take out Raghavendra KT's test results as the unfair lock changes
may render some of his results invalid.
  - Add PV support without increasing the size of the core queue node
structure.
  - Other minor changes to address some of the feedback comments.

v9-v10:
  - Make some minor changes to qspinlock.c to accommodate review feedback.
  - Change author to PeterZ for 2 of the patches.
  - Include Raghavendra KT's test results in patch 18.

v8-v9:
  - Integrate PeterZ's version of the queue spinlock patch with some
modification:
http://lkml.kernel.org/r/20140310154236.038181...@infradead.org
  - Break the more complex patches into smaller ones to ease review effort.
  - Fix a racing condition in the PV qspinlock code.

v7-v8:
  - Remove one unneeded atomic operation from the slowpath, thus
improving performance.
  - Simplify some of the codes and add more comments.
  - Test for X86_FEATURE_HYPERVISOR CPU feature bit to enable/disable
unfair lock.
  - Reduce unfair lock slowpath lock stealing frequency depending
on its distance from the queue head.
  - Add performance data for IvyBridge-EX CPU.

v6-v7:
  - Remove an atomic operation from the 2-task contending code
  - Shorten the names of some macros
  - Make the queue waiter to attempt to steal lock when unfair lock is
enabled.
  - Remove lock holder kick from the PV code and fix a race condition
  - Run the unfair lock  PV code on overcommitted KVM guests to collect
performance data.

v5-v6:
 - Change the optimized 2-task contending code to make it fairer at the
   expense of a bit of performance.
 - Add a patch to support unfair queue spinlock for Xen.
 - Modify the PV qspinlock code to follow what was done in the PV
   ticketlock.
 - Add performance data for the unfair lock as well as the PV
   support code.

v4-v5:
 - Move the optimized 2-task contending code to the generic file to
   enable more architectures to use it without code duplication.
 - Address some of the style-related comments by PeterZ.
 - Allow the use of unfair queue spinlock in a real para-virtualized
   execution environment.
 - Add para-virtualization support to the qspinlock code by ensuring
   that the lock holder and queue head stay alive as much as possible.

v3-v4:
 - Remove debugging code and fix a configuration error
 - Simplify the qspinlock structure and streamline the code to make it
   perform a bit better
 - Add an x86 version of asm/qspinlock.h for holding x86 specific
   optimization.
 - Add an optimized x86 code path for 2 contending tasks to improve
   low contention performance.

v2-v3:
 - Simplify the code by using numerous mode only without an unfair option.
 - Use the latest smp_load_acquire()/smp_store_release() barriers.
 - Move the queue spinlock code to kernel/locking.
 - Make the use of queue spinlock the default for x86-64 without user
   configuration.
 - Additional performance tuning.

v1-v2:
 - Add some more comments to document what the code does.
 - Add a numerous CPU mode to support = 16K CPUs
 - Add a configuration option to allow lock stealing which can further
   improve performance in many cases.
 - Enable wakeup of queue head CPU at unlock time for non-numerous
   CPU mode.

This patch set has 3 different sections:
 1) Patches 1-6: Introduces a queue-based spinlock implementation that
can replace the default ticket spinlock without increasing the
size of the spinlock data structure. As a result, critical kernel
data structures that embed spinlock won't increase in size and
break data alignments.
 2) Patch 7: Enables the use of unfair lock in a virtual guest. This
can resolve some of the locking related performance issues due to
the fact that the next CPU to get the lock may have been scheduled
out for a period of time.
 3) Patches 8-11: Enable qspinlock para-virtualization support by
halting the waiting CPUs after spinning for a certain amount of
time. The unlock code will detect the a sleeping waiter and wake it
up. This is essentially the same logic as the PV ticketlock code.

The queue spinlock has slightly better performance than the ticket
spinlock in uncontended case. Its performance can be much better
with moderate to heavy contention.  This patch has the 

[PATCH v13 10/11] pvqspinlock, x86: Enable PV qspinlock for KVM

2014-10-29 Thread Waiman Long
This patch adds the necessary KVM specific code to allow KVM to
support the CPU halting and kicking operations needed by the queue
spinlock PV code.

Two KVM guests of 20 CPU cores (2 nodes) were created for performance
testing in one of the following three configurations:
 1) Only 1 VM is active
 2) Both VMs are active and they share the same 20 physical CPUs
(200% overcommit)

The tests run included the disk workload of the AIM7 benchmark on
both ext4 and xfs RAM disks at 3000 users on a 3.17 based kernel. The
ebizzy -m test and futextest was was also run and its performance
data were recorded.  With two VMs running, the idle=poll kernel
option was added to simulate a busy guest. If PV qspinlock is not
enabled, unfairlock will be used automically in a guest.

AIM7 XFS Disk Test (no overcommit)
  kernel JPMReal Time   Sys TimeUsr Time
  -  ----   
  PV ticketlock 25423737.08   98.95   5.44
  PV qspinlock  25495757.06   98.63   5.40
  unfairlock26162796.91   97.05   5.42

AIM7 XFS Disk Test (200% overcommit)
  kernel JPMReal Time   Sys TimeUsr Time
  -  ----   
  PV ticketlock 64446827.93  415.22   6.33
  PV qspinlock  64562427.88  419.84   0.39
  unfairlock69551825.88  377.40   4.09

AIM7 EXT4 Disk Test (no overcommit)
  kernel JPMReal Time   Sys TimeUsr Time
  -  ----   
  PV ticketlock 19955659.02  103.67   5.76
  PV qspinlock  20111738.95  102.15   5.40
  unfairlock20665908.71   98.13   5.46

AIM7 EXT4 Disk Test (200% overcommit)
  kernel JPMReal Time   Sys TimeUsr Time
  -  ----   
  PV ticketlock 47834137.63  495.81  30.78
  PV qspinlock  47405837.97  475.74  30.95
  unfairlock56022432.13  398.43  26.27

For the AIM7 disk workload, both PV ticketlock and qspinlock have
about the same performance. The unfairlock performs slightly better
than the PV lock.

EBIZZY-m Test (no overcommit)
  kernelRec/s   Real Time   Sys TimeUsr Time
  - -   -   
  PV ticketlock 3255  10.00   60.65   3.62
  PV qspinlock  3318  10.00   54.27   3.60
  unfairlock2833  10.00   26.66   3.09

EBIZZY-m Test (200% overcommit)
  kernelRec/s   Real Time   Sys TimeUsr Time
  - -   -   
  PV ticketlock  841  10.00   71.03   2.37
  PV qspinlock   834  10.00   68.27   2.39
  unfairlock 865  10.00   27.08   1.51

  futextest (no overcommit)
  kernel   kops/s
  ---
  PV ticketlock11523
  PV qspinlock 12328
  unfairlock9478

  futextest (200% overcommit)
  kernel   kops/s
  ---
  PV ticketlock 7276
  PV qspinlock  7095
  unfairlock5614

The ebizzy and futextest have much higher spinlock contention than
the AIM7 disk workload. In this case, the unfairlock performs worse
than both the PV ticketlock and qspinlock. The performance of the 2
PV locks are comparable.

Signed-off-by: Waiman Long waiman.l...@hp.com
---
 arch/x86/kernel/kvm.c |  138 -
 kernel/Kconfig.locks  |2 +-
 2 files changed, 138 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index eaa064c..1183645 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -568,7 +568,7 @@ arch_initcall(activate_jump_labels);
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 
 /* Kick a cpu by its apicid. Used to wake up a halted vcpu */
-static void kvm_kick_cpu(int cpu)
+void kvm_kick_cpu(int cpu)
 {
int apicid;
unsigned long flags = 0;
@@ -576,7 +576,9 @@ static void kvm_kick_cpu(int cpu)
apicid = per_cpu(x86_cpu_to_apicid, cpu);
kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);
 }
+PV_CALLEE_SAVE_REGS_THUNK(kvm_kick_cpu);
 
+#ifndef CONFIG_QUEUE_SPINLOCK
 enum kvm_contention_stat {
TAKEN_SLOW,
TAKEN_SLOW_PICKUP,
@@ -804,6 +806,132 @@ static void kvm_unlock_kick(struct arch_spinlock *lock, 
__ticket_t ticket)
}
}
 }
+#else /* !CONFIG_QUEUE_SPINLOCK */
+
+#ifdef CONFIG_KVM_DEBUG_FS
+static struct dentry *d_spin_debug;
+static struct dentry *d_kvm_debug;
+static u32 kick_nohlt_stats;   /* Kick but 

[PATCH v13 11/11] pvqspinlock, x86: Enable PV qspinlock for XEN

2014-10-29 Thread Waiman Long
This patch adds the necessary XEN specific code to allow XEN to
support the CPU halting and kicking operations needed by the queue
spinlock PV code.

Signed-off-by: Waiman Long waiman.l...@hp.com
---
 arch/x86/xen/spinlock.c |  149 +--
 kernel/Kconfig.locks|2 +-
 2 files changed, 145 insertions(+), 6 deletions(-)

diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index d332ae0..401d7a6 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -17,6 +17,12 @@
 #include xen-ops.h
 #include debugfs.h
 
+static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
+static DEFINE_PER_CPU(char *, irq_name);
+static bool xen_pvspin = true;
+
+#ifndef CONFIG_QUEUE_SPINLOCK
+
 enum xen_contention_stat {
TAKEN_SLOW,
TAKEN_SLOW_PICKUP,
@@ -100,12 +106,9 @@ struct xen_lock_waiting {
__ticket_t want;
 };
 
-static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
-static DEFINE_PER_CPU(char *, irq_name);
 static DEFINE_PER_CPU(struct xen_lock_waiting, lock_waiting);
 static cpumask_t waiting_cpus;
 
-static bool xen_pvspin = true;
 __visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
 {
int irq = __this_cpu_read(lock_kicker_irq);
@@ -213,6 +216,118 @@ static void xen_unlock_kick(struct arch_spinlock *lock, 
__ticket_t next)
}
 }
 
+#else /* CONFIG_QUEUE_SPINLOCK */
+
+#ifdef CONFIG_XEN_DEBUG_FS
+static u32 kick_nohlt_stats;   /* Kick but not halt count  */
+static u32 halt_qhead_stats;   /* Queue head halting count */
+static u32 halt_qnode_stats;   /* Queue node halting count */
+static u32 halt_abort_stats;   /* Halting abort count  */
+static u32 wake_kick_stats;/* Wakeup by kicking count  */
+static u32 wake_spur_stats;/* Spurious wakeup count*/
+static u64 time_blocked;   /* Total blocking time  */
+
+static inline void xen_halt_stats(enum pv_lock_stats type)
+{
+   if (type == PV_HALT_QHEAD)
+   add_smp(halt_qhead_stats, 1);
+   else if (type == PV_HALT_QNODE)
+   add_smp(halt_qnode_stats, 1);
+   else /* type == PV_HALT_ABORT */
+   add_smp(halt_abort_stats, 1);
+}
+
+void xen_lock_stats(enum pv_lock_stats type)
+{
+   if (type == PV_WAKE_KICKED)
+   add_smp(wake_kick_stats, 1);
+   else if (type == PV_WAKE_SPURIOUS)
+   add_smp(wake_spur_stats, 1);
+   else /* type == PV_KICK_NOHALT */
+   add_smp(kick_nohlt_stats, 1);
+}
+PV_CALLEE_SAVE_REGS_THUNK(xen_lock_stats);
+
+static inline u64 spin_time_start(void)
+{
+   return sched_clock();
+}
+
+static inline void spin_time_accum_blocked(u64 start)
+{
+   u64 delta;
+
+   delta = sched_clock() - start;
+   add_smp(time_blocked, delta);
+}
+#else /* CONFIG_XEN_DEBUG_FS */
+static inline void xen_halt_stats(enum pv_lock_stats type)
+{
+}
+
+static inline u64 spin_time_start(void)
+{
+   return 0;
+}
+
+static inline void spin_time_accum_blocked(u64 start)
+{
+}
+#endif /* CONFIG_XEN_DEBUG_FS */
+
+void xen_kick_cpu(int cpu)
+{
+   xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
+}
+PV_CALLEE_SAVE_REGS_THUNK(xen_kick_cpu);
+
+/*
+ * Halt the current CPU  release it back to the host
+ */
+void xen_halt_cpu(u8 *lockbyte)
+{
+   int irq = __this_cpu_read(lock_kicker_irq);
+   unsigned long flags;
+   u64 start;
+
+   /* If kicker interrupts not initialized yet, just spin */
+   if (irq == -1)
+   return;
+
+   /*
+* Make sure an interrupt handler can't upset things in a
+* partially setup state.
+*/
+   local_irq_save(flags);
+   start = spin_time_start();
+
+   xen_halt_stats(lockbyte ? PV_HALT_QHEAD : PV_HALT_QNODE);
+   /* clear pending */
+   xen_clear_irq_pending(irq);
+
+   /* Allow interrupts while blocked */
+   local_irq_restore(flags);
+   /*
+* Don't halt if the lock is now available
+*/
+   if (lockbyte  !ACCESS_ONCE(*lockbyte)) {
+   xen_halt_stats(PV_HALT_ABORT);
+   return;
+   }
+   /*
+* If an interrupt happens here, it will leave the wakeup irq
+* pending, which will cause xen_poll_irq() to return
+* immediately.
+*/
+
+   /* Block until irq becomes pending (or perhaps a spurious wakeup) */
+   xen_poll_irq(irq);
+   spin_time_accum_blocked(start);
+}
+PV_CALLEE_SAVE_REGS_THUNK(xen_halt_cpu);
+
+#endif /* CONFIG_QUEUE_SPINLOCK */
+
 static irqreturn_t dummy_handler(int irq, void *dev_id)
 {
BUG();
@@ -258,7 +373,6 @@ void xen_uninit_lock_cpu(int cpu)
per_cpu(irq_name, cpu) = NULL;
 }
 
-
 /*
  * Our init of PV spinlocks is split in two init functions due to us
  * using paravirt patching and jump labels patching and having to do
@@ -275,8 +389,17 @@ void __init xen_init_spinlocks(void)

[PATCH v13 09/11] pvqspinlock, x86: Add para-virtualization support

2014-10-29 Thread Waiman Long
This patch adds para-virtualization support to the queue spinlock
code base with minimal impact to the native case. There are some
minor code changes in the generic qspinlock.c file which should be
usable in other architectures. The other code changes are specific
to x86 processors and so are all put under the arch/x86 directory.

On the lock side, the slowpath code is split into 2 separate functions
generated from the same code - one for bare metal and one for PV guest.
The switching is done in the _raw_spin_lock* functions. This makes
sure that the performance impact to the bare metal case is minimal,
just a few NOPs in the _raw_spin_lock* functions. In the PV slowpath
code, there are 2 paravirt callee saved calls that minimize register
pressure.

On the unlock side, however, the disabling of unlock function inlining
does have some slight impact on bare metal performance.

The actual paravirt code comes in 5 parts;

 - init_node; this initializes the extra data members required for PV
   state. PV state data is kept 1 cacheline ahead of the regular data.

 - link_and_wait_node; this replaces the regular MCS queuing code. CPU
   halting can happen if the wait is too long.

 - wait_head; this waits until the lock is avialable and the CPU will
   be halted if the wait is too long.

 - wait_check; this is called after acquiring the lock to see if the
   next queue head CPU is halted. If this is the case, the lock bit is
   changed to indicate the queue head will have to be kicked on unlock.

 - queue_unlock;  this routine has a jump label to check if paravirt
   is enabled. If yes, it has to do an atomic cmpxchg to clear the lock
   bit or call the slowpath function to kick the queue head cpu.

Tracking the head is done in two parts, firstly the pv_wait_head will
store its cpu number in whichever node is pointed to by the tail part
of the lock word. Secondly, pv_link_and_wait_node() will propagate the
existing head from the old to the new tail node.

Signed-off-by: Waiman Long waiman.l...@hp.com
---
 arch/x86/include/asm/paravirt.h   |   19 ++
 arch/x86/include/asm/paravirt_types.h |   20 ++
 arch/x86/include/asm/pvqspinlock.h|  411 +
 arch/x86/include/asm/qspinlock.h  |   71 ++-
 arch/x86/kernel/paravirt-spinlocks.c  |6 +
 include/asm-generic/qspinlock.h   |2 +
 kernel/locking/qspinlock.c|   69 +-
 7 files changed, 591 insertions(+), 7 deletions(-)
 create mode 100644 arch/x86/include/asm/pvqspinlock.h

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index cd6e161..7e296e6 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -712,6 +712,24 @@ static inline void __set_fixmap(unsigned /* enum 
fixed_addresses */ idx,
 
 #if defined(CONFIG_SMP)  defined(CONFIG_PARAVIRT_SPINLOCKS)
 
+#ifdef CONFIG_QUEUE_SPINLOCK
+
+static __always_inline void pv_kick_cpu(int cpu)
+{
+   PVOP_VCALLEE1(pv_lock_ops.kick_cpu, cpu);
+}
+
+static __always_inline void pv_lockwait(u8 *lockbyte)
+{
+   PVOP_VCALLEE1(pv_lock_ops.lockwait, lockbyte);
+}
+
+static __always_inline void pv_lockstat(enum pv_lock_stats type)
+{
+   PVOP_VCALLEE1(pv_lock_ops.lockstat, type);
+}
+
+#else
 static __always_inline void __ticket_lock_spinning(struct arch_spinlock *lock,
__ticket_t ticket)
 {
@@ -723,6 +741,7 @@ static __always_inline void __ticket_unlock_kick(struct 
arch_spinlock *lock,
 {
PVOP_VCALL2(pv_lock_ops.unlock_kick, lock, ticket);
 }
+#endif
 
 #endif
 
diff --git a/arch/x86/include/asm/paravirt_types.h 
b/arch/x86/include/asm/paravirt_types.h
index 7549b8b..49e4b76 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -326,6 +326,9 @@ struct pv_mmu_ops {
   phys_addr_t phys, pgprot_t flags);
 };
 
+struct mcs_spinlock;
+struct qspinlock;
+
 struct arch_spinlock;
 #ifdef CONFIG_SMP
 #include asm/spinlock_types.h
@@ -333,9 +336,26 @@ struct arch_spinlock;
 typedef u16 __ticket_t;
 #endif
 
+#ifdef CONFIG_QUEUE_SPINLOCK
+enum pv_lock_stats {
+   PV_HALT_QHEAD,  /* Queue head halting   */
+   PV_HALT_QNODE,  /* Other queue node halting */
+   PV_HALT_ABORT,  /* Halting aborted  */
+   PV_WAKE_KICKED, /* Wakeup by kicking*/
+   PV_WAKE_SPURIOUS,   /* Spurious wakeup  */
+   PV_KICK_NOHALT  /* Kick but CPU not halted  */
+};
+#endif
+
 struct pv_lock_ops {
+#ifdef CONFIG_QUEUE_SPINLOCK
+   struct paravirt_callee_save kick_cpu;
+   struct paravirt_callee_save lockstat;
+   struct paravirt_callee_save lockwait;
+#else
struct paravirt_callee_save lock_spinning;
void (*unlock_kick)(struct arch_spinlock *lock, __ticket_t ticket);
+#endif
 };
 
 /* This contains all the paravirt structures: we get a convenient
diff --git 

[PATCH v13 08/11] qspinlock, x86: Rename paravirt_ticketlocks_enabled

2014-10-29 Thread Waiman Long
This patch renames the paravirt_ticketlocks_enabled static key to a
more generic paravirt_spinlocks_enabled name.

Signed-off-by: Waiman Long waiman.l...@hp.com
Signed-off-by: Peter Zijlstra pet...@infradead.org
---
 arch/x86/include/asm/spinlock.h  |4 ++--
 arch/x86/kernel/kvm.c|2 +-
 arch/x86/kernel/paravirt-spinlocks.c |4 ++--
 arch/x86/xen/spinlock.c  |2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 5899483..928751e 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -39,7 +39,7 @@
 /* How long a lock should spin before we consider blocking */
 #define SPIN_THRESHOLD (1  15)
 
-extern struct static_key paravirt_ticketlocks_enabled;
+extern struct static_key paravirt_spinlocks_enabled;
 static __always_inline bool static_key_false(struct static_key *key);
 
 #ifdef CONFIG_QUEUE_SPINLOCK
@@ -150,7 +150,7 @@ static inline void __ticket_unlock_slowpath(arch_spinlock_t 
*lock,
 static __always_inline void arch_spin_unlock(arch_spinlock_t *lock)
 {
if (TICKET_SLOWPATH_FLAG 
-   static_key_false(paravirt_ticketlocks_enabled)) {
+   static_key_false(paravirt_spinlocks_enabled)) {
arch_spinlock_t prev;
 
prev = *lock;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index f6945be..eaa064c 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -827,7 +827,7 @@ static __init int kvm_spinlock_init_jump(void)
if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
return 0;
 
-   static_key_slow_inc(paravirt_ticketlocks_enabled);
+   static_key_slow_inc(paravirt_spinlocks_enabled);
printk(KERN_INFO KVM setup paravirtual spinlock\n);
 
return 0;
diff --git a/arch/x86/kernel/paravirt-spinlocks.c 
b/arch/x86/kernel/paravirt-spinlocks.c
index bbb6c73..e434f24 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -16,5 +16,5 @@ struct pv_lock_ops pv_lock_ops = {
 };
 EXPORT_SYMBOL(pv_lock_ops);
 
-struct static_key paravirt_ticketlocks_enabled = STATIC_KEY_INIT_FALSE;
-EXPORT_SYMBOL(paravirt_ticketlocks_enabled);
+struct static_key paravirt_spinlocks_enabled = STATIC_KEY_INIT_FALSE;
+EXPORT_SYMBOL(paravirt_spinlocks_enabled);
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 23b45eb..d332ae0 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -293,7 +293,7 @@ static __init int xen_init_spinlocks_jump(void)
if (!xen_domain())
return 0;
 
-   static_key_slow_inc(paravirt_ticketlocks_enabled);
+   static_key_slow_inc(paravirt_spinlocks_enabled);
return 0;
 }
 early_initcall(xen_init_spinlocks_jump);
-- 
1.7.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v13 04/11] qspinlock: Extract out code snippets for the next patch

2014-10-29 Thread Waiman Long
This is a preparatory patch that extracts out the following 2 code
snippets to prepare for the next performance optimization patch.

 1) the logic for the exchange of new and previous tail code words
into a new xchg_tail() function.
 2) the logic for clearing the pending bit and setting the locked bit
into a new clear_pending_set_locked() function.

This patch also simplifies the trylock operation before queuing by
calling queue_spin_trylock() directly.

Signed-off-by: Waiman Long waiman.l...@hp.com
Signed-off-by: Peter Zijlstra pet...@infradead.org
---
 include/asm-generic/qspinlock_types.h |2 +
 kernel/locking/qspinlock.c|   91 +---
 2 files changed, 62 insertions(+), 31 deletions(-)

diff --git a/include/asm-generic/qspinlock_types.h 
b/include/asm-generic/qspinlock_types.h
index 4196694..88d647c 100644
--- a/include/asm-generic/qspinlock_types.h
+++ b/include/asm-generic/qspinlock_types.h
@@ -58,6 +58,8 @@ typedef struct qspinlock {
 #define _Q_TAIL_CPU_BITS   (32 - _Q_TAIL_CPU_OFFSET)
 #define _Q_TAIL_CPU_MASK   _Q_SET_MASK(TAIL_CPU)
 
+#define _Q_TAIL_MASK   (_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK)
+
 #define _Q_LOCKED_VAL  (1U  _Q_LOCKED_OFFSET)
 #define _Q_PENDING_VAL (1U  _Q_PENDING_OFFSET)
 
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 226b11d..48bd2ad 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -95,6 +95,54 @@ static inline struct mcs_spinlock *decode_tail(u32 tail)
 #define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)
 
 /**
+ * clear_pending_set_locked - take ownership and clear the pending bit.
+ * @lock: Pointer to queue spinlock structure
+ * @val : Current value of the queue spinlock 32-bit word
+ *
+ * *,1,0 - *,0,1
+ */
+static __always_inline void
+clear_pending_set_locked(struct qspinlock *lock, u32 val)
+{
+   u32 new, old;
+
+   for (;;) {
+   new = (val  ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;
+
+   old = atomic_cmpxchg(lock-val, val, new);
+   if (old == val)
+   break;
+
+   val = old;
+   }
+}
+
+/**
+ * xchg_tail - Put in the new queue tail code word  retrieve previous one
+ * @lock : Pointer to queue spinlock structure
+ * @tail : The new queue tail code word
+ * Return: The previous queue tail code word
+ *
+ * xchg(lock, tail)
+ *
+ * p,*,* - n,*,* ; prev = xchg(lock, node)
+ */
+static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
+{
+   u32 old, new, val = atomic_read(lock-val);
+
+   for (;;) {
+   new = (val  _Q_LOCKED_PENDING_MASK) | tail;
+   old = atomic_cmpxchg(lock-val, val, new);
+   if (old == val)
+   break;
+
+   val = old;
+   }
+   return old;
+}
+
+/**
  * queue_spin_lock_slowpath - acquire the queue spinlock
  * @lock: Pointer to queue spinlock structure
  * @val: Current value of the queue spinlock 32-bit word
@@ -176,15 +224,7 @@ void queue_spin_lock_slowpath(struct qspinlock *lock, u32 
val)
 *
 * *,1,0 - *,0,1
 */
-   for (;;) {
-   new = (val  ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;
-
-   old = atomic_cmpxchg(lock-val, val, new);
-   if (old == val)
-   break;
-
-   val = old;
-   }
+   clear_pending_set_locked(lock, val);
return;
 
/*
@@ -201,37 +241,26 @@ queue:
node-next = NULL;
 
/*
-* We have already touched the queueing cacheline; don't bother with
-* pending stuff.
-*
-* trylock || xchg(lock, node)
-*
-* 0,0,0 - 0,0,1 ; no tail, not locked - no tail, locked.
-* p,y,x - n,y,x ; tail was p - tail is n; preserving locked.
+* We touched a (possibly) cold cacheline in the per-cpu queue node;
+* attempt the trylock once more in the hope someone let go while we
+* weren't watching.
 */
-   for (;;) {
-   new = _Q_LOCKED_VAL;
-   if (val)
-   new = tail | (val  _Q_LOCKED_PENDING_MASK);
-
-   old = atomic_cmpxchg(lock-val, val, new);
-   if (old == val)
-   break;
-
-   val = old;
-   }
+   if (queue_spin_trylock(lock))
+   goto release;
 
/*
-* we won the trylock; forget about queueing.
+* We have already touched the queueing cacheline; don't bother with
+* pending stuff.
+*
+* p,*,* - n,*,*
 */
-   if (new == _Q_LOCKED_VAL)
-   goto release;
+   old = xchg_tail(lock, tail);
 
/*
 * if there was a previous node; link it and wait until reaching the
 * head of the waitqueue.
 */
-   if (old  ~_Q_LOCKED_PENDING_MASK) {
+   if (old  _Q_TAIL_MASK) {
prev = 

[PATCH v13 07/11] qspinlock: Revert to test-and-set on hypervisors

2014-10-29 Thread Waiman Long
From: Peter Zijlstra pet...@infradead.org

When we detect a hypervisor (!paravirt, see qspinlock paravirt support
patches), revert to a simple test-and-set lock to avoid the horrors
of queue preemption.

Signed-off-by: Peter Zijlstra pet...@infradead.org
Signed-off-by: Waiman Long waiman.l...@hp.com
---
 arch/x86/include/asm/qspinlock.h |   14 ++
 include/asm-generic/qspinlock.h  |7 +++
 kernel/locking/qspinlock.c   |3 +++
 3 files changed, 24 insertions(+), 0 deletions(-)

diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
index a6a8762..05a77fe 100644
--- a/arch/x86/include/asm/qspinlock.h
+++ b/arch/x86/include/asm/qspinlock.h
@@ -1,6 +1,7 @@
 #ifndef _ASM_X86_QSPINLOCK_H
 #define _ASM_X86_QSPINLOCK_H
 
+#include asm/cpufeature.h
 #include asm-generic/qspinlock_types.h
 
 #ifndef CONFIG_X86_PPRO_FENCE
@@ -20,6 +21,19 @@ static inline void queue_spin_unlock(struct qspinlock *lock)
 
 #endif /* !CONFIG_X86_PPRO_FENCE */
 
+#define virt_queue_spin_lock virt_queue_spin_lock
+
+static inline bool virt_queue_spin_lock(struct qspinlock *lock)
+{
+   if (!static_cpu_has(X86_FEATURE_HYPERVISOR))
+   return false;
+
+   while (atomic_cmpxchg(lock-val, 0, _Q_LOCKED_VAL) != 0)
+   cpu_relax();
+
+   return true;
+}
+
 #include asm-generic/qspinlock.h
 
 #endif /* _ASM_X86_QSPINLOCK_H */
diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h
index e8a7ae8..a53a7bb 100644
--- a/include/asm-generic/qspinlock.h
+++ b/include/asm-generic/qspinlock.h
@@ -98,6 +98,13 @@ static __always_inline void queue_spin_unlock(struct 
qspinlock *lock)
 }
 #endif
 
+#ifndef virt_queue_spin_lock
+static __always_inline bool virt_queue_spin_lock(struct qspinlock *lock)
+{
+   return false;
+}
+#endif
+
 /*
  * Initializier
  */
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index fb0e988..1c1926a 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -257,6 +257,9 @@ void queue_spin_lock_slowpath(struct qspinlock *lock, u32 
val)
 
BUILD_BUG_ON(CONFIG_NR_CPUS = (1U  _Q_TAIL_CPU_BITS));
 
+   if (virt_queue_spin_lock(lock))
+   return;
+
/*
 * wait for in-progress pending-locked hand-overs
 *
-- 
1.7.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v13 05/11] qspinlock: Optimize for smaller NR_CPUS

2014-10-29 Thread Waiman Long
From: Peter Zijlstra pet...@infradead.org

When we allow for a max NR_CPUS  2^14 we can optimize the pending
wait-acquire and the xchg_tail() operations.

By growing the pending bit to a byte, we reduce the tail to 16bit.
This means we can use xchg16 for the tail part and do away with all
the repeated compxchg() operations.

This in turn allows us to unconditionally acquire; the locked state
as observed by the wait loops cannot change. And because both locked
and pending are now a full byte we can use simple stores for the
state transition, obviating one atomic operation entirely.

This optimization is needed to make the qspinlock achieve performance
parity with ticket spinlock at light load.

All this is horribly broken on Alpha pre EV56 (and any other arch that
cannot do single-copy atomic byte stores).

Signed-off-by: Peter Zijlstra pet...@infradead.org
Signed-off-by: Waiman Long waiman.l...@hp.com
---
 include/asm-generic/qspinlock_types.h |   13 ++
 kernel/locking/qspinlock.c|   71 -
 2 files changed, 83 insertions(+), 1 deletions(-)

diff --git a/include/asm-generic/qspinlock_types.h 
b/include/asm-generic/qspinlock_types.h
index 88d647c..01b46df 100644
--- a/include/asm-generic/qspinlock_types.h
+++ b/include/asm-generic/qspinlock_types.h
@@ -35,6 +35,14 @@ typedef struct qspinlock {
 /*
  * Bitfields in the atomic value:
  *
+ * When NR_CPUS  16K
+ *  0- 7: locked byte
+ * 8: pending
+ *  9-15: not used
+ * 16-17: tail index
+ * 18-31: tail cpu (+1)
+ *
+ * When NR_CPUS = 16K
  *  0- 7: locked byte
  * 8: pending
  *  9-10: tail index
@@ -47,7 +55,11 @@ typedef struct qspinlock {
 #define _Q_LOCKED_MASK _Q_SET_MASK(LOCKED)
 
 #define _Q_PENDING_OFFSET  (_Q_LOCKED_OFFSET + _Q_LOCKED_BITS)
+#if CONFIG_NR_CPUS  (1U  14)
+#define _Q_PENDING_BITS8
+#else
 #define _Q_PENDING_BITS1
+#endif
 #define _Q_PENDING_MASK_Q_SET_MASK(PENDING)
 
 #define _Q_TAIL_IDX_OFFSET (_Q_PENDING_OFFSET + _Q_PENDING_BITS)
@@ -58,6 +70,7 @@ typedef struct qspinlock {
 #define _Q_TAIL_CPU_BITS   (32 - _Q_TAIL_CPU_OFFSET)
 #define _Q_TAIL_CPU_MASK   _Q_SET_MASK(TAIL_CPU)
 
+#define _Q_TAIL_OFFSET _Q_TAIL_IDX_OFFSET
 #define _Q_TAIL_MASK   (_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK)
 
 #define _Q_LOCKED_VAL  (1U  _Q_LOCKED_OFFSET)
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 48bd2ad..7c127b4 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -22,6 +22,7 @@
 #include linux/percpu.h
 #include linux/hardirq.h
 #include linux/mutex.h
+#include asm/byteorder.h
 #include asm/qspinlock.h
 
 /*
@@ -54,6 +55,10 @@
  * node; whereby avoiding the need to carry a node from lock to unlock, and
  * preserving existing lock API. This also makes the unlock code simpler and
  * faster.
+ *
+ * N.B. The current implementation only supports architectures that allow
+ *  atomic operations on smaller 8-bit and 16-bit data types.
+ *
  */
 
 #include mcs_spinlock.h
@@ -94,6 +99,64 @@ static inline struct mcs_spinlock *decode_tail(u32 tail)
 
 #define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)
 
+/*
+ * By using the whole 2nd least significant byte for the pending bit, we
+ * can allow better optimization of the lock acquisition for the pending
+ * bit holder.
+ */
+#if _Q_PENDING_BITS == 8
+
+struct __qspinlock {
+   union {
+   atomic_t val;
+   struct {
+#ifdef __LITTLE_ENDIAN
+   u16 locked_pending;
+   u16 tail;
+#else
+   u16 tail;
+   u16 locked_pending;
+#endif
+   };
+   };
+};
+
+/**
+ * clear_pending_set_locked - take ownership and clear the pending bit.
+ * @lock: Pointer to queue spinlock structure
+ * @val : Current value of the queue spinlock 32-bit word
+ *
+ * *,1,0 - *,0,1
+ *
+ * Lock stealing is not allowed if this function is used.
+ */
+static __always_inline void
+clear_pending_set_locked(struct qspinlock *lock, u32 val)
+{
+   struct __qspinlock *l = (void *)lock;
+
+   ACCESS_ONCE(l-locked_pending) = _Q_LOCKED_VAL;
+}
+
+/*
+ * xchg_tail - Put in the new queue tail code word  retrieve previous one
+ * @lock : Pointer to queue spinlock structure
+ * @tail : The new queue tail code word
+ * Return: The previous queue tail code word
+ *
+ * xchg(lock, tail)
+ *
+ * p,*,* - n,*,* ; prev = xchg(lock, node)
+ */
+static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
+{
+   struct __qspinlock *l = (void *)lock;
+
+   return (u32)xchg(l-tail, tail  _Q_TAIL_OFFSET)  _Q_TAIL_OFFSET;
+}
+
+#else /* _Q_PENDING_BITS == 8 */
+
 /**
  * clear_pending_set_locked - take ownership and clear the pending bit.
  * @lock: Pointer to queue spinlock structure
@@ -141,6 +204,7 @@ static __always_inline u32 xchg_tail(struct qspinlock 
*lock, u32 tail)
}
   

[PATCH v13 06/11] qspinlock: Use a simple write to grab the lock

2014-10-29 Thread Waiman Long
Currently, atomic_cmpxchg() is used to get the lock. However, this
is not really necessary if there is more than one task in the queue
and the queue head don't need to reset the tail code. For that case,
a simple write to set the lock bit is enough as the queue head will
be the only one eligible to get the lock as long as it checks that
both the lock and pending bits are not set. The current pending bit
waiting code will ensure that the bit will not be set as soon as the
tail code in the lock is set.

With that change, the are some slight improvement in the performance
of the queue spinlock in the 5M loop micro-benchmark run on a 4-socket
Westere-EX machine as shown in the tables below.

[Standalone/Embedded - same node]
  # of tasksBefore patchAfter patch %Change
  ----- --  ---
   3 2324/2321  2248/2265-3%/-2%
   4 2890/2896  2819/2831-2%/-2%
   5 3611/3595  3522/3512-2%/-2%
   6 4281/4276  4173/4160-3%/-3%
   7 5018/5001  4875/4861-3%/-3%
   8 5759/5750  5563/5568-3%/-3%

[Standalone/Embedded - different nodes]
  # of tasksBefore patchAfter patch %Change
  ----- --  ---
   312242/12237 12087/12093  -1%/-1%
   410688/10696 10507/10521  -2%/-2%

It was also found that this change produced a much bigger performance
improvement in the newer IvyBridge-EX chip and was essentially to close
the performance gap between the ticket spinlock and queue spinlock.

The disk workload of the AIM7 benchmark was run on a 4-socket
Westmere-EX machine with both ext4 and xfs RAM disks at 3000 users
on a 3.14 based kernel. The results of the test runs were:

AIM7 XFS Disk Test
  kernel JPMReal Time   Sys TimeUsr Time
  -  ----   
  ticketlock56782333.17   96.61   5.81
  qspinlock 57507993.13   94.83   5.97

AIM7 EXT4 Disk Test
  kernel JPMReal Time   Sys TimeUsr Time
  -  ----   
  ticketlock1114551   16.15  509.72   7.11
  qspinlock 21844668.24  232.99   6.01

The ext4 filesystem run had a much higher spinlock contention than
the xfs filesystem run.

The ebizzy -m test was also run with the following results:

  kernel   records/s  Real Time   Sys TimeUsr Time
  --  -   
  ticketlock 2075   10.00  216.35   3.49
  qspinlock  3023   10.00  198.20   4.80

Signed-off-by: Waiman Long waiman.l...@hp.com
Signed-off-by: Peter Zijlstra pet...@infradead.org
---
 kernel/locking/qspinlock.c |   59 
 1 files changed, 43 insertions(+), 16 deletions(-)

diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 7c127b4..fb0e988 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -103,24 +103,33 @@ static inline struct mcs_spinlock *decode_tail(u32 tail)
  * By using the whole 2nd least significant byte for the pending bit, we
  * can allow better optimization of the lock acquisition for the pending
  * bit holder.
+ *
+ * This internal structure is also used by the set_locked function which
+ * is not restricted to _Q_PENDING_BITS == 8.
  */
-#if _Q_PENDING_BITS == 8
-
 struct __qspinlock {
union {
atomic_t val;
-   struct {
 #ifdef __LITTLE_ENDIAN
+   u8   locked;
+   struct {
u16 locked_pending;
u16 tail;
+   };
 #else
+   struct {
u16 tail;
u16 locked_pending;
-#endif
};
+   struct {
+   u8  reserved[3];
+   u8  locked;
+   };
+#endif
};
 };
 
+#if _Q_PENDING_BITS == 8
 /**
  * clear_pending_set_locked - take ownership and clear the pending bit.
  * @lock: Pointer to queue spinlock structure
@@ -207,6 +216,19 @@ static __always_inline u32 xchg_tail(struct qspinlock 
*lock, u32 tail)
 #endif /* _Q_PENDING_BITS == 8 */
 
 /**
+ * set_locked - Set the lock bit and own the lock
+ * @lock: Pointer to queue spinlock structure
+ *
+ * *,*,0 - *,0,1
+ */
+static __always_inline void set_locked(struct qspinlock *lock)
+{
+   struct __qspinlock *l = (void *)lock;
+
+   ACCESS_ONCE(l-locked) = _Q_LOCKED_VAL;
+}
+
+/**
  * queue_spin_lock_slowpath - acquire the queue spinlock
  * @lock: Pointer to queue spinlock structure
  * @val: Current value of the queue 

[PATCH v13 01/11] qspinlock: A simple generic 4-byte queue spinlock

2014-10-29 Thread Waiman Long
This patch introduces a new generic queue spinlock implementation that
can serve as an alternative to the default ticket spinlock. Compared
with the ticket spinlock, this queue spinlock should be almost as fair
as the ticket spinlock. It has about the same speed in single-thread
and it can be much faster in high contention situations especially when
the spinlock is embedded within the data structure to be protected.

Only in light to moderate contention where the average queue depth
is around 1-3 will this queue spinlock be potentially a bit slower
due to the higher slowpath overhead.

This queue spinlock is especially suit to NUMA machines with a large
number of cores as the chance of spinlock contention is much higher
in those machines. The cost of contention is also higher because of
slower inter-node memory traffic.

Due to the fact that spinlocks are acquired with preemption disabled,
the process will not be migrated to another CPU while it is trying
to get a spinlock. Ignoring interrupt handling, a CPU can only be
contending in one spinlock at any one time. Counting soft IRQ, hard
IRQ and NMI, a CPU can only have a maximum of 4 concurrent lock waiting
activities.  By allocating a set of per-cpu queue nodes and used them
to form a waiting queue, we can encode the queue node address into a
much smaller 24-bit size (including CPU number and queue node index)
leaving one byte for the lock.

Please note that the queue node is only needed when waiting for the
lock. Once the lock is acquired, the queue node can be released to
be used later.

Signed-off-by: Waiman Long waiman.l...@hp.com
Signed-off-by: Peter Zijlstra pet...@infradead.org
---
 include/asm-generic/qspinlock.h   |  118 +++
 include/asm-generic/qspinlock_types.h |   58 +
 kernel/Kconfig.locks  |7 +
 kernel/locking/Makefile   |1 +
 kernel/locking/mcs_spinlock.h |1 +
 kernel/locking/qspinlock.c|  207 +
 6 files changed, 392 insertions(+), 0 deletions(-)
 create mode 100644 include/asm-generic/qspinlock.h
 create mode 100644 include/asm-generic/qspinlock_types.h
 create mode 100644 kernel/locking/qspinlock.c

diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h
new file mode 100644
index 000..e8a7ae8
--- /dev/null
+++ b/include/asm-generic/qspinlock.h
@@ -0,0 +1,118 @@
+/*
+ * Queue spinlock
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P.
+ *
+ * Authors: Waiman Long waiman.l...@hp.com
+ */
+#ifndef __ASM_GENERIC_QSPINLOCK_H
+#define __ASM_GENERIC_QSPINLOCK_H
+
+#include asm-generic/qspinlock_types.h
+
+/**
+ * queue_spin_is_locked - is the spinlock locked?
+ * @lock: Pointer to queue spinlock structure
+ * Return: 1 if it is locked, 0 otherwise
+ */
+static __always_inline int queue_spin_is_locked(struct qspinlock *lock)
+{
+   return atomic_read(lock-val);
+}
+
+/**
+ * queue_spin_value_unlocked - is the spinlock structure unlocked?
+ * @lock: queue spinlock structure
+ * Return: 1 if it is unlocked, 0 otherwise
+ *
+ * N.B. Whenever there are tasks waiting for the lock, it is considered
+ *  locked wrt the lockref code to avoid lock stealing by the lockref
+ *  code and change things underneath the lock. This also allows some
+ *  optimizations to be applied without conflict with lockref.
+ */
+static __always_inline int queue_spin_value_unlocked(struct qspinlock lock)
+{
+   return !atomic_read(lock.val);
+}
+
+/**
+ * queue_spin_is_contended - check if the lock is contended
+ * @lock : Pointer to queue spinlock structure
+ * Return: 1 if lock contended, 0 otherwise
+ */
+static __always_inline int queue_spin_is_contended(struct qspinlock *lock)
+{
+   return atomic_read(lock-val)  ~_Q_LOCKED_MASK;
+}
+/**
+ * queue_spin_trylock - try to acquire the queue spinlock
+ * @lock : Pointer to queue spinlock structure
+ * Return: 1 if lock acquired, 0 if failed
+ */
+static __always_inline int queue_spin_trylock(struct qspinlock *lock)
+{
+   if (!atomic_read(lock-val) 
+  (atomic_cmpxchg(lock-val, 0, _Q_LOCKED_VAL) == 0))
+   return 1;
+   return 0;
+}
+
+extern void queue_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+
+/**
+ * queue_spin_lock - acquire a queue spinlock
+ * @lock: Pointer to queue spinlock structure
+ */
+static __always_inline void queue_spin_lock(struct qspinlock *lock)
+{
+   u32 val;
+
+   

[PATCH v13 03/11] qspinlock: Add pending bit

2014-10-29 Thread Waiman Long
From: Peter Zijlstra pet...@infradead.org

Because the qspinlock needs to touch a second cacheline (the per-cpu
mcs_nodes[]); add a pending bit and allow a single in-word spinner
before we punt to the second cacheline.

It is possible so observe the pending bit without the locked bit when
the last owner has just released but the pending owner has not yet
taken ownership.

In this case we would normally queue -- because the pending bit is
already taken. However, in this case the pending bit is guaranteed
to be released 'soon', therefore wait for it and avoid queueing.

Signed-off-by: Peter Zijlstra pet...@infradead.org
Signed-off-by: Waiman Long waiman.l...@hp.com
---
 include/asm-generic/qspinlock_types.h |   12 +++-
 kernel/locking/qspinlock.c|  119 +++--
 2 files changed, 107 insertions(+), 24 deletions(-)

diff --git a/include/asm-generic/qspinlock_types.h 
b/include/asm-generic/qspinlock_types.h
index 67a2110..4196694 100644
--- a/include/asm-generic/qspinlock_types.h
+++ b/include/asm-generic/qspinlock_types.h
@@ -36,8 +36,9 @@ typedef struct qspinlock {
  * Bitfields in the atomic value:
  *
  *  0- 7: locked byte
- *  8- 9: tail index
- * 10-31: tail cpu (+1)
+ * 8: pending
+ *  9-10: tail index
+ * 11-31: tail cpu (+1)
  */
 #define_Q_SET_MASK(type)   (((1U  _Q_ ## type ## _BITS) - 1)\
   _Q_ ## type ## _OFFSET)
@@ -45,7 +46,11 @@ typedef struct qspinlock {
 #define _Q_LOCKED_BITS 8
 #define _Q_LOCKED_MASK _Q_SET_MASK(LOCKED)
 
-#define _Q_TAIL_IDX_OFFSET (_Q_LOCKED_OFFSET + _Q_LOCKED_BITS)
+#define _Q_PENDING_OFFSET  (_Q_LOCKED_OFFSET + _Q_LOCKED_BITS)
+#define _Q_PENDING_BITS1
+#define _Q_PENDING_MASK_Q_SET_MASK(PENDING)
+
+#define _Q_TAIL_IDX_OFFSET (_Q_PENDING_OFFSET + _Q_PENDING_BITS)
 #define _Q_TAIL_IDX_BITS   2
 #define _Q_TAIL_IDX_MASK   _Q_SET_MASK(TAIL_IDX)
 
@@ -54,5 +59,6 @@ typedef struct qspinlock {
 #define _Q_TAIL_CPU_MASK   _Q_SET_MASK(TAIL_CPU)
 
 #define _Q_LOCKED_VAL  (1U  _Q_LOCKED_OFFSET)
+#define _Q_PENDING_VAL (1U  _Q_PENDING_OFFSET)
 
 #endif /* __ASM_GENERIC_QSPINLOCK_TYPES_H */
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index c114076..226b11d 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -92,24 +92,28 @@ static inline struct mcs_spinlock *decode_tail(u32 tail)
return per_cpu_ptr(mcs_nodes[idx], cpu);
 }
 
+#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)
+
 /**
  * queue_spin_lock_slowpath - acquire the queue spinlock
  * @lock: Pointer to queue spinlock structure
  * @val: Current value of the queue spinlock 32-bit word
  *
- * (queue tail, lock value)
- *
- *  fast  :slow  :
unlock
- *:  :
- * uncontended  (0,0)   --:-- (0,1) :-- (*,0)
- *:   | ^./  :
- *:   v   \   |  :
- * uncontended:(n,x) --+-- (n,0) |  :
- *   queue:   | ^--'  |  :
- *:   v   |  :
- * contended  :(*,x) --+-- (*,0) - (*,1) ---'  :
- *   queue: ^--' :
+ * (queue tail, pending bit, lock value)
  *
+ *  fast :slow  :unlock
+ *   :  :
+ * uncontended  (0,0,0) -:-- (0,0,1) --:-- 
(*,*,0)
+ *   :   | ^.--. /  :
+ *   :   v   \  \|  :
+ * pending   :(0,1,1) +-- (0,1,0)   \   |  :
+ *   :   | ^--'  |   |  :
+ *   :   v   |   |  :
+ * uncontended   :(n,x,y) +-- (n,0,0) --'   |  :
+ *   queue   :   | ^--'  |  :
+ *   :   v   |  :
+ * contended :(*,x,y) +-- (*,0,0) --- (*,0,1) -'  :
+ *   queue   : ^--' :
  */
 void queue_spin_lock_slowpath(struct qspinlock *lock, u32 val)
 {
@@ -119,6 +123,75 @@ void queue_spin_lock_slowpath(struct qspinlock *lock, u32 
val)
 
BUILD_BUG_ON(CONFIG_NR_CPUS = (1U  _Q_TAIL_CPU_BITS));
 
+   /*
+* wait for in-progress pending-locked hand-overs
+*
+* 0,1,0 - 0,0,1
+*/
+   if (val == _Q_PENDING_VAL) {
+   while ((val = atomic_read(lock-val)) == _Q_PENDING_VAL)
+   

[PATCH v13 02/11] qspinlock, x86: Enable x86-64 to use queue spinlock

2014-10-29 Thread Waiman Long
This patch makes the necessary changes at the x86 architecture
specific layer to enable the use of queue spinlock for x86-64. As
x86-32 machines are typically not multi-socket. The benefit of queue
spinlock may not be apparent. So queue spinlock is not enabled.

Currently, there is some incompatibilities between the para-virtualized
spinlock code (which hard-codes the use of ticket spinlock) and the
queue spinlock. Therefore, the use of queue spinlock is disabled when
the para-virtualized spinlock is enabled.

The arch/x86/include/asm/qspinlock.h header file includes some x86
specific optimization which will make the queue spinlock code perform
better than the generic implementation.

Signed-off-by: Waiman Long waiman.l...@hp.com
Signed-off-by: Peter Zijlstra pet...@infradead.org
---
 arch/x86/Kconfig  |1 +
 arch/x86/include/asm/qspinlock.h  |   25 +
 arch/x86/include/asm/spinlock.h   |5 +
 arch/x86/include/asm/spinlock_types.h |4 
 4 files changed, 35 insertions(+), 0 deletions(-)
 create mode 100644 arch/x86/include/asm/qspinlock.h

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f2327e8..9ee8324 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -122,6 +122,7 @@ config X86
select MODULES_USE_ELF_RELA if X86_64
select CLONE_BACKWARDS if X86_32
select ARCH_USE_BUILTIN_BSWAP
+   select ARCH_USE_QUEUE_SPINLOCK
select ARCH_USE_QUEUE_RWLOCK
select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION
select OLD_SIGACTION if X86_32
diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
new file mode 100644
index 000..a6a8762
--- /dev/null
+++ b/arch/x86/include/asm/qspinlock.h
@@ -0,0 +1,25 @@
+#ifndef _ASM_X86_QSPINLOCK_H
+#define _ASM_X86_QSPINLOCK_H
+
+#include asm-generic/qspinlock_types.h
+
+#ifndef CONFIG_X86_PPRO_FENCE
+
+#definequeue_spin_unlock queue_spin_unlock
+/**
+ * queue_spin_unlock - release a queue spinlock
+ * @lock : Pointer to queue spinlock structure
+ *
+ * An effective smp_store_release() on the least-significant byte.
+ */
+static inline void queue_spin_unlock(struct qspinlock *lock)
+{
+   barrier();
+   ACCESS_ONCE(*(u8 *)lock) = 0;
+}
+
+#endif /* !CONFIG_X86_PPRO_FENCE */
+
+#include asm-generic/qspinlock.h
+
+#endif /* _ASM_X86_QSPINLOCK_H */
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 9295016..5899483 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -42,6 +42,10 @@
 extern struct static_key paravirt_ticketlocks_enabled;
 static __always_inline bool static_key_false(struct static_key *key);
 
+#ifdef CONFIG_QUEUE_SPINLOCK
+#include asm/qspinlock.h
+#else
+
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 
 static inline void __ticket_enter_slowpath(arch_spinlock_t *lock)
@@ -180,6 +184,7 @@ static __always_inline void 
arch_spin_lock_flags(arch_spinlock_t *lock,
 {
arch_spin_lock(lock);
 }
+#endif /* CONFIG_QUEUE_SPINLOCK */
 
 static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
 {
diff --git a/arch/x86/include/asm/spinlock_types.h 
b/arch/x86/include/asm/spinlock_types.h
index 5f9d757..5d654a1 100644
--- a/arch/x86/include/asm/spinlock_types.h
+++ b/arch/x86/include/asm/spinlock_types.h
@@ -23,6 +23,9 @@ typedef u32 __ticketpair_t;
 
 #define TICKET_SHIFT   (sizeof(__ticket_t) * 8)
 
+#ifdef CONFIG_QUEUE_SPINLOCK
+#include asm-generic/qspinlock_types.h
+#else
 typedef struct arch_spinlock {
union {
__ticketpair_t head_tail;
@@ -33,6 +36,7 @@ typedef struct arch_spinlock {
 } arch_spinlock_t;
 
 #define __ARCH_SPIN_LOCK_UNLOCKED  { { 0 } }
+#endif /* CONFIG_QUEUE_SPINLOCK */
 
 #include asm-generic/qrwlock_types.h
 
-- 
1.7.1

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v12 09/11] pvqspinlock, x86: Add para-virtualization support

2014-10-29 Thread Waiman Long

On 10/29/2014 03:05 PM, Waiman Long wrote:

On 10/27/2014 05:22 PM, Waiman Long wrote:

On 10/27/2014 02:04 PM, Peter Zijlstra wrote:

On Mon, Oct 27, 2014 at 01:38:20PM -0400, Waiman Long wrote:

On 10/24/2014 04:54 AM, Peter Zijlstra wrote:

On Thu, Oct 16, 2014 at 02:10:38PM -0400, Waiman Long wrote:

Since enabling paravirt spinlock will disable unlock function 
inlining,
a jump label can be added to the unlock function without adding 
patch

sites all over the kernel.

But you don't have to. My patches allowed for the inline to remain,
again reducing the overhead of enabling PV spinlocks while running 
on a

real machine.

Look at:

   http://lkml.kernel.org/r/20140615130154.213923...@chello.nl

In particular this hunk:

Index: linux-2.6/arch/x86/kernel/paravirt_patch_64.c
===
--- linux-2.6.orig/arch/x86/kernel/paravirt_patch_64.c
+++ linux-2.6/arch/x86/kernel/paravirt_patch_64.c
@@ -22,6 +22,10 @@ DEF_NATIVE(pv_cpu_ops, swapgs, swapgs)
  DEF_NATIVE(, mov32, mov %edi, %eax);
  DEF_NATIVE(, mov64, mov %rdi, %rax);

+#if defined(CONFIG_PARAVIRT_SPINLOCKS)   
defined(CONFIG_QUEUE_SPINLOCK)

+DEF_NATIVE(pv_lock_ops, queue_unlock, movb $0, (%rdi));
+#endif
+
  unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
  {
 return paravirt_patch_insns(insnbuf, len,
@@ -61,6 +65,9 @@ unsigned native_patch(u8 type, u16 clobb
 PATCH_SITE(pv_cpu_ops, clts);
 PATCH_SITE(pv_mmu_ops, flush_tlb_single);
 PATCH_SITE(pv_cpu_ops, wbinvd);
+#if defined(CONFIG_PARAVIRT_SPINLOCKS)   
defined(CONFIG_QUEUE_SPINLOCK)

+   PATCH_SITE(pv_lock_ops, queue_unlock);
+#endif

 patch_site:
 ret = paravirt_patch_insns(ibuf, len, start, end);


That makes sure to overwrite the callee-saved call to the
pv_lock_ops::queue_unlock with the immediate asm movb $0, (%rdi).


Therefore you can retain the inlined unlock with hardly (there 
might be

some NOP padding) any overhead at all. On PV it reverts to a callee
saved function call.
My concern is that spin_unlock() can be called in many places, 
including
loadable kernel modules. Can the paravirt_patch_ident_32() function 
able to
patch all of them in reasonable time? How about a kernel module 
loaded later

at run time?

modules should be fine, see arch/x86/kernel/module.c:module_finalize()
-  apply_paravirt().

Also note that the 'default' text is an indirect call into the paravirt
ops table which routes to the 'right' function, so even if the text
patching would be 'late' calls would 'work' as expected, just slower.


Thanks for letting me know about that. I have this concern because 
your patch didn't change the current configuration of disabling 
unlock inlining when paravirt_spinlock is enabled. With that, I think 
it is worthwhile to reduce the performance delta between the PV and 
non-PV kernel on bare metal.


I am sorry that the unlock call sites patching code doesn't work in a 
virtual guest. Your pvqspinlock patch did an unconditional patching 
even in a virtual guest. I added check for the 
paravirt_spinlocks_enabled, but it turned out that some spin_unlock() 
seemed to be called before paravirt_spinlocks_enabled is set. As a 
result, some call sites were still patched resulting in missed wake 
up's and system hang.


At this point, I am going to leave out that change from my patch set 
until we can figure out a better way of doing that.




Below was a partial kernel log with the unlock call site patch code in a 
KVM guest:


[0.438006] native_patch: patch out pv_queue_unlock!
[0.438565] native_patch: patch out pv_queue_unlock!
[0.439006] native_patch: patch out pv_queue_unlock!
[0.439638] native_patch: patch out pv_queue_unlock!
[0.440052] native_patch: patch out pv_queue_unlock!
[0.441006] native_patch: patch out pv_queue_unlock!
[0.441566] native_patch: patch out pv_queue_unlock!
[0.442035] ftrace: allocating 24168 entries in 95 pages
[0.451208] Switched APIC routing to physical flat.
[0.453202] ..TIMER: vector=0x30 apic1=0 pin1=2 apic2=-1 pin2=-1
[0.454002] smpboot: CPU0: Intel QEMU Virtual CPU version 1.5.3 (fam: 
06, model: 06, stepping: 03)
[0.456000] Performance Events: Broken PMU hardware detected, using 
software events only.

[0.456003] Failed to access perfctr msr (MSR c1 is 0)
[0.457151] KVM setup paravirtual spinlock
[0.460039] NMI watchdog: disabled (cpu0): hardware events not enabled

It could be seen that some unlock call sites were patched before the KVM 
setup code set the paravirt_spinlocks_enabled flag.


-Longman

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC PATCH 3/4] vhost: byteswap virtqueue attributes

2014-10-29 Thread Cédric Le Goater
The virtqueue structure shares a few attributes with the guest OS
which need to be byteswapped when the endian order of the host is
different.

This patch uses the vq-byteswap attribute to decide whether to
byteswap or not data being accessed in the guest memory.

Signed-off-by: Cédric Le Goater c...@fr.ibm.com
---
 drivers/vhost/vhost.c |   52 +
 1 file changed, 40 insertions(+), 12 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index afcb3368370c..9a538f4d2ff1 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1097,7 +1097,7 @@ static void vring_desc_swap(struct vring_desc *desc)
 static int vhost_update_used_flags(struct vhost_virtqueue *vq)
 {
void __user *used;
-   if (__put_user(vq-used_flags, vq-used-flags)  0)
+   if (__vq_put_user(vq, vq-used_flags, vq-used-flags)  0)
return -EFAULT;
if (unlikely(vq-log_used)) {
/* Make sure the flag is seen before log. */
@@ -1115,7 +1115,7 @@ static int vhost_update_used_flags(struct vhost_virtqueue 
*vq)
 
 static int vhost_update_avail_event(struct vhost_virtqueue *vq, u16 
avail_event)
 {
-   if (__put_user(vq-avail_idx, vhost_avail_event(vq)))
+   if (__vq_put_user(vq, vq-avail_idx, vhost_avail_event(vq)))
return -EFAULT;
if (unlikely(vq-log_used)) {
void __user *used;
@@ -1142,7 +1142,7 @@ int vhost_init_used(struct vhost_virtqueue *vq)
if (r)
return r;
vq-signalled_used_valid = false;
-   return get_user(vq-last_used_idx, vq-used-idx);
+   return vq_get_user(vq, vq-last_used_idx, vq-used-idx);
 }
 EXPORT_SYMBOL_GPL(vhost_init_used);
 
@@ -1254,6 +1254,10 @@ static int get_indirect(struct vhost_virtqueue *vq,
   i, (size_t)indirect-addr + i * sizeof desc);
return -EINVAL;
}
+
+   if (vq-byteswap)
+   vring_desc_swap(desc);
+
if (unlikely(desc.flags  VRING_DESC_F_INDIRECT)) {
vq_err(vq, Nested indirect descriptor: idx %d, %zx\n,
   i, (size_t)indirect-addr + i * sizeof desc);
@@ -1309,7 +1313,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 
/* Check it isn't doing very strange things with descriptor numbers. */
last_avail_idx = vq-last_avail_idx;
-   if (unlikely(__get_user(vq-avail_idx, vq-avail-idx))) {
+   if (unlikely(__vq_get_user(vq, vq-avail_idx, vq-avail-idx))) {
vq_err(vq, Failed to access avail idx at %p\n,
   vq-avail-idx);
return -EFAULT;
@@ -1330,7 +1334,7 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 
/* Grab the next descriptor number they're advertising, and increment
 * the index we've seen. */
-   if (unlikely(__get_user(head,
+   if (unlikely(__vq_get_user(vq, head,
vq-avail-ring[last_avail_idx % vq-num]))) {
vq_err(vq, Failed to read head: idx %d address %p\n,
   last_avail_idx,
@@ -1370,6 +1374,10 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
   i, vq-desc + i);
return -EFAULT;
}
+
+   if (vq-byteswap)
+   vring_desc_swap(desc);
+
if (desc.flags  VRING_DESC_F_INDIRECT) {
ret = get_indirect(vq, iov, iov_size,
   out_num, in_num,
@@ -1437,6 +1445,26 @@ int vhost_add_used(struct vhost_virtqueue *vq, unsigned 
int head, int len)
 }
 EXPORT_SYMBOL_GPL(vhost_add_used);
 
+static int __copyhead_to_user(struct vhost_virtqueue *vq,
+   struct vring_used_elem *heads,
+   struct vring_used_elem __user *used,
+   unsigned count)
+{
+   int i;
+
+   for (i = 0; i  count; i++) {
+   if (__vq_put_user(vq, heads[i].id, used[i].id)) {
+   vq_err(vq, Failed to write used id);
+   return -EFAULT;
+   }
+   if (__vq_put_user(vq, heads[i].len, used[i].len)) {
+   vq_err(vq, Failed to write used len);
+   return -EFAULT;
+   }
+   }
+   return 0;
+}
+
 static int __vhost_add_used_n(struct vhost_virtqueue *vq,
struct vring_used_elem *heads,
unsigned count)
@@ -1448,15 +1476,15 @@ static int __vhost_add_used_n(struct vhost_virtqueue 
*vq,
start = vq-last_used_idx % vq-num;
used = vq-used-ring + start;
if (count == 1) {
-   if (__put_user(heads[0].id, used-id)) {
+   if (__vq_put_user(vq, heads[0].id, used-id)) {
vq_err(vq, Failed to write used id);

[RFC PATCH 4/4] vhost_net: byteswap virtio_net header

2014-10-29 Thread Cédric Le Goater
Signed-off-by: Cédric Le Goater c...@fr.ibm.com
---
 drivers/vhost/net.c |   39 ++-
 1 file changed, 30 insertions(+), 9 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 8dae2f724a35..f2d5f585dae9 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -18,6 +18,7 @@
 #include linux/file.h
 #include linux/slab.h
 #include linux/vmalloc.h
+#include linux/swab.h
 
 #include linux/net.h
 #include linux/if_packet.h
@@ -329,6 +330,14 @@ static void vhost_zerocopy_callback(struct ubuf_info 
*ubuf, bool success)
rcu_read_unlock_bh();
 }
 
+static void virtio_net_hdr_swap(struct virtio_net_hdr *hdr)
+{
+   hdr-hdr_len = swab16(hdr-hdr_len);
+   hdr-gso_size= swab16(hdr-gso_size);
+   hdr-csum_start  = swab16(hdr-csum_start);
+   hdr-csum_offset = swab16(hdr-csum_offset);
+}
+
 /* Expects to be always run from workqueue - which acts as
  * read-size critical section for our kind of RCU. */
 static void handle_tx(struct vhost_net *net)
@@ -346,7 +355,7 @@ static void handle_tx(struct vhost_net *net)
.msg_flags = MSG_DONTWAIT,
};
size_t len, total_len = 0;
-   int err;
+   int err, has_vnet_hdr;
size_t hdr_size;
struct socket *sock;
struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
@@ -359,6 +368,7 @@ static void handle_tx(struct vhost_net *net)
 
vhost_disable_notify(net-dev, vq);
 
+   has_vnet_hdr = vhost_has_feature(vq, VHOST_NET_F_VIRTIO_NET_HDR);
hdr_size = nvq-vhost_hlen;
zcopy = nvq-ubufs;
 
@@ -406,6 +416,8 @@ static void handle_tx(struct vhost_net *net)
break;
}
 
+   if (!has_vnet_hdr  vq-byteswap)
+   virtio_net_hdr_swap((void *) vq-iov[0].iov_base);
zcopy_used = zcopy  len = VHOST_GOODCOPY_LEN
(nvq-upend_idx + 1) % UIO_MAXIOV !=
  nvq-done_idx
@@ -570,7 +582,7 @@ static void handle_rx(struct vhost_net *net)
.hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE
};
size_t total_len = 0;
-   int err, mergeable;
+   int err, mergeable, has_vnet_hdr;
s16 headcount;
size_t vhost_hlen, sock_hlen;
size_t vhost_len, sock_len;
@@ -588,6 +600,7 @@ static void handle_rx(struct vhost_net *net)
vq_log = unlikely(vhost_has_feature(vq, VHOST_F_LOG_ALL)) ?
vq-log : NULL;
mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);
+   has_vnet_hdr = vhost_has_feature(vq, VHOST_NET_F_VIRTIO_NET_HDR);
 
while ((sock_len = peek_head_len(sock-sk))) {
sock_len += sock_hlen;
@@ -638,6 +651,9 @@ static void handle_rx(struct vhost_net *net)
vhost_discard_vq_desc(vq, headcount);
continue;
}
+
+   if (!has_vnet_hdr  vq-byteswap)
+   virtio_net_hdr_swap((void *) vq-iov[0].iov_base);
if (unlikely(vhost_hlen) 
memcpy_toiovecend(nvq-hdr, (unsigned char *)hdr, 0,
  vhost_hlen)) {
@@ -646,13 +662,18 @@ static void handle_rx(struct vhost_net *net)
break;
}
/* TODO: Should check and handle checksum. */
-   if (likely(mergeable) 
-   memcpy_toiovecend(nvq-hdr, (unsigned char *)headcount,
- offsetof(typeof(hdr), num_buffers),
- sizeof hdr.num_buffers)) {
-   vq_err(vq, Failed num_buffers write);
-   vhost_discard_vq_desc(vq, headcount);
-   break;
+   if (likely(mergeable)) {
+   __u16 tmp = headcount;
+
+   if (vq-byteswap)
+   tmp = swab16(headcount);
+   if (memcpy_toiovecend(nvq-hdr, (unsigned char *)tmp,
+   offsetof(typeof(hdr), num_buffers),
+ sizeof(hdr.num_buffers))) {
+   vq_err(vq, Failed num_buffers write);
+   vhost_discard_vq_desc(vq, headcount);
+   break;
+   }
}
vhost_add_used_and_signal_n(net-dev, vq, vq-heads,
headcount);
-- 
1.7.10.4

--
To unsubscribe from this list: send the line unsubscribe kvm-ppc in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC PATCH 2/4] vhost: add byteswap routines

2014-10-29 Thread Cédric Le Goater
This patch adds a few helper routines around get_user and put_user
to ease byteswapping.

Signed-off-by: Cédric Le Goater c...@fr.ibm.com
---

I am not sure these routines belong to this file. There is room for 
improvement to remove the ugly  switch (sizeof(*(ptr))). Please 
comment !

 drivers/vhost/vhost.c |   93 +
 1 file changed, 93 insertions(+)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 72c21b790ba3..afcb3368370c 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -25,6 +25,7 @@
 #include linux/kthread.h
 #include linux/cgroup.h
 #include linux/module.h
+#include linux/swab.h
 
 #include vhost.h
 
@@ -1001,6 +1002,98 @@ int vhost_log_write(struct vhost_virtqueue *vq, struct 
vhost_log *log,
 }
 EXPORT_SYMBOL_GPL(vhost_log_write);
 
+#define vq_get_user(vq, x, ptr)\
+({ \
+   int ret = get_user(x, ptr); \
+   if (vq-byteswap) { \
+   switch (sizeof(*(ptr))) {   \
+   case 2: \
+   x = swab16(x);  \
+   break;  \
+   case 4: \
+   x = swab32(x);  \
+   break;  \
+   case 8: \
+   x = swab64(x);  \
+   break;  \
+   default:\
+   break;  \
+   }   \
+   }   \
+   ret;\
+})
+
+#define vq_put_user(vq, x, ptr)\
+({ \
+   __typeof__(*(ptr)) y = (x); \
+   if (vq-byteswap) { \
+   switch (sizeof(*(ptr))) {   \
+   case 2: \
+   y = swab16(x);  \
+   break;  \
+   case 4: \
+   y = swab32(x);  \
+   break;  \
+   case 8: \
+   y = swab64(x);  \
+   break;  \
+   default:\
+   break;  \
+   }   \
+   }   \
+   put_user(y, ptr);   \
+})
+
+#define __vq_get_user(vq, x, ptr)  \
+({ \
+   int ret = __get_user(x, ptr);   \
+   if (vq-byteswap) { \
+   switch (sizeof(*(ptr))) {   \
+   case 2: \
+   x = swab16(x);  \
+   break;  \
+   case 4: \
+   x = swab32(x);  \
+   break;  \
+   case 8: \
+   x = swab64(x);  \
+   break;  \
+   default:\
+   break;  \
+   }   \
+   }   \
+   ret;\
+})
+
+#define __vq_put_user(vq, x, ptr)  \
+({ \
+   __typeof__(*(ptr)) y = (x); \
+   if (vq-byteswap) { \
+   switch (sizeof(*(ptr))) {   \
+   case 2: