Re: [PATCH net-next v4 3/4] net: vhost: factor out busy polling logic to vhost_net_busy_poll()

2018-07-02 Thread Jason Wang



On 2018年07月03日 13:48, Tonghao Zhang wrote:

On Tue, Jul 3, 2018 at 10:12 AM Jason Wang  wrote:



On 2018年07月02日 20:57, xiangxia.m@gmail.com wrote:

From: Tonghao Zhang 

Factor out the generic busy polling logic so that it can be
used in the tx path in the next patch. With this patch,
qemu can set a different busyloop_timeout for the rx queue.

Signed-off-by: Tonghao Zhang 
---
   drivers/vhost/net.c | 94 
+++--
   1 file changed, 55 insertions(+), 39 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 62bb8e8..2790959 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -429,6 +429,52 @@ static int vhost_net_enable_vq(struct vhost_net *n,
   return vhost_poll_start(poll, sock->file);
   }

+static int sk_has_rx_data(struct sock *sk)
+{
+ struct socket *sock = sk->sk_socket;
+
+ if (sock->ops->peek_len)
+ return sock->ops->peek_len(sock);
+
+ return skb_queue_empty(&sk->sk_receive_queue);
+}
+
+static void vhost_net_busy_poll(struct vhost_net *net,
+ struct vhost_virtqueue *rvq,
+ struct vhost_virtqueue *tvq,
+ bool rx)
+{
+ unsigned long uninitialized_var(endtime);
+ unsigned long busyloop_timeout;
+ struct socket *sock;
+ struct vhost_virtqueue *vq = rx ? tvq : rvq;
+
+ mutex_lock_nested(&vq->mutex, rx ? VHOST_NET_VQ_TX: VHOST_NET_VQ_RX);
+
+ vhost_disable_notify(&net->dev, vq);
+ sock = rvq->private_data;
+ busyloop_timeout = rx ? rvq->busyloop_timeout : tvq->busyloop_timeout;
+
+ preempt_disable();
+ endtime = busy_clock() + busyloop_timeout;
+ while (vhost_can_busy_poll(tvq->dev, endtime) &&
+!(sock && sk_has_rx_data(sock->sk)) &&
+vhost_vq_avail_empty(tvq->dev, tvq))
+ cpu_relax();
+ preempt_enable();
+
+ if ((rx && !vhost_vq_avail_empty(&net->dev, vq)) ||
+ (!rx && (sock && sk_has_rx_data(sock->sk)))) {
+ vhost_poll_queue(&vq->poll);
+ } else if (unlikely(vhost_enable_notify(&net->dev, vq))) {

One last question, do we need this for rx? This check will be always
true under light or medium load.

will remove the 'unlikely'


Not only the unlikely. We only want rx kick when packet is pending at 
socket but we're out of available buffers. This is not the case here 
(you have polled the vq above).


So we probably want to do this only when rx == true.

Thanks
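
A minimal sketch of the change being suggested here (illustrative only,
not the final patch; it reuses the names from the quoted hunk above):

	if ((rx && !vhost_vq_avail_empty(&net->dev, vq)) ||
	    (!rx && (sock && sk_has_rx_data(sock->sk)))) {
		vhost_poll_queue(&vq->poll);
	} else if (rx && vhost_enable_notify(&net->dev, vq)) {
		/* rx == true means vq is the tx virtqueue here; fall back to
		 * guest notification, and catch buffers added between the
		 * poll above and enabling notification by queueing the
		 * handler. */
		vhost_disable_notify(&net->dev, vq);
		vhost_poll_queue(&vq->poll);
	}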




Thanks


+ vhost_disable_notify(&net->dev, vq);
+ vhost_poll_queue(&vq->poll);
+ }
+
+ mutex_unlock(&vq->mutex);
+}
+
+
   static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
   struct vhost_virtqueue *vq,
   struct iovec iov[], unsigned int iov_size,
@@ -621,16 +667,6 @@ static int peek_head_len(struct vhost_net_virtqueue *rvq, 
struct sock *sk)
   return len;
   }

-static int sk_has_rx_data(struct sock *sk)
-{
- struct socket *sock = sk->sk_socket;
-
- if (sock->ops->peek_len)
- return sock->ops->peek_len(sock);
-
- return skb_queue_empty(&sk->sk_receive_queue);
-}
-
   static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
   {
   struct vhost_virtqueue *vq = &nvq->vq;
@@ -645,39 +681,19 @@ static void vhost_rx_signal_used(struct 
vhost_net_virtqueue *nvq)

   static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
   {
- struct vhost_net_virtqueue *rvq = &net->vqs[VHOST_NET_VQ_RX];
- struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
- struct vhost_virtqueue *vq = &nvq->vq;
- unsigned long uninitialized_var(endtime);
- int len = peek_head_len(rvq, sk);
+ struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
+ struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];

- if (!len && vq->busyloop_timeout) {
- /* Flush batched heads first */
- vhost_rx_signal_used(rvq);
- /* Both tx vq and rx socket were polled here */
- mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
- vhost_disable_notify(&net->dev, vq);
+ int len = peek_head_len(rnvq, sk);

- preempt_disable();
- endtime = busy_clock() + vq->busyloop_timeout;
-
- while (vhost_can_busy_poll(&net->dev, endtime) &&
-!sk_has_rx_data(sk) &&
-vhost_vq_avail_empty(&net->dev, vq))
- cpu_relax();
-
- preempt_enable();
-
- if (!vhost_vq_avail_empty(&net->dev, vq))
- vhost_poll_queue(&vq->poll);
- else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
- vhost_disable_notify(&net->dev, vq);
- vhost_poll_queue(&vq->poll);
- }
+ if (!len && rnvq->vq.busyloop_timeout) {
+ /* Flush batched heads first */
+ vhost_rx_signal_used(rnvq);

- mutex_unlock(&vq->mutex);
+ /* Both tx vq and rx 

Re: [PATCH net-next v4 3/4] net: vhost: factor out busy polling logic to vhost_net_busy_poll()

2018-07-02 Thread Tonghao Zhang
On Tue, Jul 3, 2018 at 10:12 AM Jason Wang  wrote:
>
>
>
> On 2018年07月02日 20:57, xiangxia.m@gmail.com wrote:
> > From: Tonghao Zhang 
> >
> > Factor out the generic busy polling logic so that it can be
> > used in the tx path in the next patch. With this patch,
> > qemu can set a different busyloop_timeout for the rx queue.
> >
> > Signed-off-by: Tonghao Zhang 
> > ---
> >   drivers/vhost/net.c | 94 
> > +++--
> >   1 file changed, 55 insertions(+), 39 deletions(-)
> >
> > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > index 62bb8e8..2790959 100644
> > --- a/drivers/vhost/net.c
> > +++ b/drivers/vhost/net.c
> > @@ -429,6 +429,52 @@ static int vhost_net_enable_vq(struct vhost_net *n,
> >   return vhost_poll_start(poll, sock->file);
> >   }
> >
> > +static int sk_has_rx_data(struct sock *sk)
> > +{
> > + struct socket *sock = sk->sk_socket;
> > +
> > + if (sock->ops->peek_len)
> > + return sock->ops->peek_len(sock);
> > +
> > + return skb_queue_empty(&sk->sk_receive_queue);
> > +}
> > +
> > +static void vhost_net_busy_poll(struct vhost_net *net,
> > + struct vhost_virtqueue *rvq,
> > + struct vhost_virtqueue *tvq,
> > + bool rx)
> > +{
> > + unsigned long uninitialized_var(endtime);
> > + unsigned long busyloop_timeout;
> > + struct socket *sock;
> > + struct vhost_virtqueue *vq = rx ? tvq : rvq;
> > +
> > + mutex_lock_nested(&vq->mutex, rx ? VHOST_NET_VQ_TX: VHOST_NET_VQ_RX);
> > +
> > + vhost_disable_notify(&net->dev, vq);
> > + sock = rvq->private_data;
> > + busyloop_timeout = rx ? rvq->busyloop_timeout : tvq->busyloop_timeout;
> > +
> > + preempt_disable();
> > + endtime = busy_clock() + busyloop_timeout;
> > + while (vhost_can_busy_poll(tvq->dev, endtime) &&
> > +!(sock && sk_has_rx_data(sock->sk)) &&
> > +vhost_vq_avail_empty(tvq->dev, tvq))
> > + cpu_relax();
> > + preempt_enable();
> > +
> > + if ((rx && !vhost_vq_avail_empty(&net->dev, vq)) ||
> > + (!rx && (sock && sk_has_rx_data(sock->sk)))) {
> > + vhost_poll_queue(&vq->poll);
> > + } else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
>
> One last question, do we need this for rx? This check will be always
> true under light or medium load.
will remove the 'unlikely'

> Thanks
>
> > + vhost_disable_notify(&net->dev, vq);
> > + vhost_poll_queue(&vq->poll);
> > + }
> > +
> > + mutex_unlock(&vq->mutex);
> > +}
> > +
> > +
> >   static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
> >   struct vhost_virtqueue *vq,
> >   struct iovec iov[], unsigned int iov_size,
> > @@ -621,16 +667,6 @@ static int peek_head_len(struct vhost_net_virtqueue 
> > *rvq, struct sock *sk)
> >   return len;
> >   }
> >
> > -static int sk_has_rx_data(struct sock *sk)
> > -{
> > - struct socket *sock = sk->sk_socket;
> > -
> > - if (sock->ops->peek_len)
> > - return sock->ops->peek_len(sock);
> > -
> > - return skb_queue_empty(&sk->sk_receive_queue);
> > -}
> > -
> >   static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
> >   {
> >   struct vhost_virtqueue *vq = &nvq->vq;
> > @@ -645,39 +681,19 @@ static void vhost_rx_signal_used(struct 
> > vhost_net_virtqueue *nvq)
> >
> >   static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock 
> > *sk)
> >   {
> > - struct vhost_net_virtqueue *rvq = &net->vqs[VHOST_NET_VQ_RX];
> > - struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
> > - struct vhost_virtqueue *vq = &nvq->vq;
> > - unsigned long uninitialized_var(endtime);
> > - int len = peek_head_len(rvq, sk);
> > + struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
> > + struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
> >
> > - if (!len && vq->busyloop_timeout) {
> > - /* Flush batched heads first */
> > - vhost_rx_signal_used(rvq);
> > - /* Both tx vq and rx socket were polled here */
> > - mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
> > - vhost_disable_notify(&net->dev, vq);
> > + int len = peek_head_len(rnvq, sk);
> >
> > - preempt_disable();
> > - endtime = busy_clock() + vq->busyloop_timeout;
> > -
> > - while (vhost_can_busy_poll(&net->dev, endtime) &&
> > -!sk_has_rx_data(sk) &&
> > -vhost_vq_avail_empty(&net->dev, vq))
> > - cpu_relax();
> > -
> > - preempt_enable();
> > -
> > - if (!vhost_vq_avail_empty(&net->dev, vq))
> > - vhost_poll_queue(&vq->poll);
> > - else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
> > - vhost_disable_notify(&net->dev, vq);
> > - vhost_poll_queue(&vq->poll);
> > - }
> > +

[PATCH net-next 8/8] vhost: event suppression for packed ring

2018-07-02 Thread Jason Wang
This patch introduces support for event suppression. This is done by
adding two areas: a device area and a driver area. One side can then
ask the other side to disable or enable (delayed) notifications by
using a boolean hint or the event index interface in these areas.

For more information, please refer to the Virtio spec.
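
As a rough illustration (simplified sketch, not taken from this patch;
endianness conversion is omitted and the helper name is made up), the
device side consults the driver area before raising an interrupt:

	/* Layout of each area, as defined by patch 6/8 in this series: */
	struct vring_packed_desc_event {
		__virtio16 off_wrap;	/* event offset + wrap counter */
		__virtio16 flags;	/* ENABLE / DISABLE / DESC */
	};

	static bool device_should_notify(struct vring_packed_desc_event *driver)
	{
		if (driver->flags == RING_EVENT_FLAGS_DISABLE)
			return false;
		if (driver->flags == RING_EVENT_FLAGS_ENABLE)
			return true;
		/* RING_EVENT_FLAGS_DESC: only notify once the descriptor
		 * named by off_wrap has been used (event index mode);
		 * event_idx_reached() is a placeholder for that check. */
		return event_idx_reached(driver->off_wrap);
	}

The driver area is read-only for the device (hence the VERIFY_READ/RO
checks below), while the device area is written by the device.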

Signed-off-by: Jason Wang 
---
 drivers/vhost/vhost.c | 191 ++
 drivers/vhost/vhost.h |  10 ++-
 2 files changed, 185 insertions(+), 16 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 0f3f07c..cccbc82 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1115,10 +1115,15 @@ static int vq_access_ok_packed(struct vhost_virtqueue 
*vq, unsigned int num,
   struct vring_used __user *used)
 {
struct vring_desc_packed *packed = (struct vring_desc_packed *)desc;
+   struct vring_packed_desc_event *driver_event =
+   (struct vring_packed_desc_event *)avail;
+   struct vring_packed_desc_event *device_event =
+   (struct vring_packed_desc_event *)used;
 
-   /* TODO: check device area and driver area */
return access_ok(VERIFY_READ, packed, num * sizeof(*packed)) &&
-  access_ok(VERIFY_WRITE, packed, num * sizeof(*packed));
+  access_ok(VERIFY_WRITE, packed, num * sizeof(*packed)) &&
+  access_ok(VERIFY_READ, driver_event, sizeof(*driver_event)) &&
+  access_ok(VERIFY_WRITE, device_event, sizeof(*device_event));
 }
 
 static int vq_access_ok_split(struct vhost_virtqueue *vq, unsigned int num,
@@ -1193,14 +1198,27 @@ static bool iotlb_access_ok(struct vhost_virtqueue *vq,
return true;
 }
 
-int vq_iotlb_prefetch(struct vhost_virtqueue *vq)
+int vq_iotlb_prefetch_packed(struct vhost_virtqueue *vq)
+{
+   int num = vq->num;
+
+   return iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->desc,
+  num * sizeof(*vq->desc), VHOST_ADDR_DESC) &&
+  iotlb_access_ok(vq, VHOST_ACCESS_WO, (u64)(uintptr_t)vq->desc,
+  num * sizeof(*vq->desc), VHOST_ADDR_DESC) &&
+  iotlb_access_ok(vq, VHOST_ACCESS_RO,
+  (u64)(uintptr_t)vq->driver_event,
+  sizeof(*vq->driver_event), VHOST_ADDR_AVAIL) &&
+  iotlb_access_ok(vq, VHOST_ACCESS_WO,
+  (u64)(uintptr_t)vq->device_event,
+  sizeof(*vq->device_event), VHOST_ADDR_USED);
+}
+
+int vq_iotlb_prefetch_split(struct vhost_virtqueue *vq)
 {
size_t s = vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
unsigned int num = vq->num;
 
-   if (!vq->iotlb)
-   return 1;
-
return iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->desc,
   num * sizeof(*vq->desc), VHOST_ADDR_DESC) &&
   iotlb_access_ok(vq, VHOST_ACCESS_RO, (u64)(uintptr_t)vq->avail,
@@ -1212,6 +1230,17 @@ int vq_iotlb_prefetch(struct vhost_virtqueue *vq)
   num * sizeof(*vq->used->ring) + s,
   VHOST_ADDR_USED);
 }
+
+int vq_iotlb_prefetch(struct vhost_virtqueue *vq)
+{
+   if (!vq->iotlb)
+   return 1;
+
+   if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED))
+   return vq_iotlb_prefetch_packed(vq);
+   else
+   return vq_iotlb_prefetch_split(vq);
+}
 EXPORT_SYMBOL_GPL(vq_iotlb_prefetch);
 
 /* Can we log writes? */
@@ -1771,6 +1800,50 @@ static int vhost_update_used_flags(struct 
vhost_virtqueue *vq)
return 0;
 }
 
+static int vhost_update_device_flags(struct vhost_virtqueue *vq,
+__virtio16 device_flags)
+{
+   void __user *flags;
+
+   if (vhost_put_user(vq, device_flags, &vq->device_event->flags,
+  VHOST_ADDR_USED) < 0)
+   return -EFAULT;
+   if (unlikely(vq->log_used)) {
+   /* Make sure the flag is seen before log. */
+   smp_wmb();
+   /* Log used flag write. */
+   flags = &vq->device_event->flags;
+   log_write(vq->log_base, vq->log_addr +
+ (flags - (void __user *)vq->device_event),
+ sizeof(vq->device_event->flags));
+   if (vq->log_ctx)
+   eventfd_signal(vq->log_ctx, 1);
+   }
+   return 0;
+}
+
+static int vhost_update_device_off_wrap(struct vhost_virtqueue *vq,
+   __virtio16 device_off_wrap)
+{
+   void __user *off_wrap;
+
+   if (vhost_put_user(vq, device_off_wrap, &vq->device_event->off_wrap,
+  VHOST_ADDR_USED) < 0)
+   return -EFAULT;
+   if (unlikely(vq->log_used)) {
+   /* Make sure the flag is seen before log. */
+   smp_wmb();
+   /* 

[PATCH net-next 7/8] vhost: packed ring support

2018-07-02 Thread Jason Wang
This patch introduces basic support for the packed ring. The idea
behind the packed ring is to use a single descriptor ring instead of
three different rings (avail, used and descriptor). This could help to
reduce cache contention and PCI transactions, so it was designed to
improve performance for both software and hardware implementations.

The implementation is straightforward: packed versions of the vhost
core helpers (whose names carry a "packed" suffix) were introduced, and
the previous helpers were renamed with a "split" suffix. The exported
helpers then just switch to the correct internal helpers.

Event suppression (the device area and driver area) is not implemented
here; it is added on top by another patch.

For more information on the packed ring, please refer to the Virtio
spec.
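
To make the layout concrete, a minimal sketch (not part of the patch;
endianness conversion omitted) of how a device-side implementation can
tell whether a packed descriptor is available, using the flag bits and
wrap counter introduced later in this series:

	/* VRING_DESC_F_AVAIL (7) and VRING_DESC_F_USED (15) are bit
	 * positions within desc->flags. */
	static bool desc_is_avail(const struct vring_desc_packed *desc,
				  bool wrap_counter)
	{
		bool avail = desc->flags & (1 << VRING_DESC_F_AVAIL);
		bool used = desc->flags & (1 << VRING_DESC_F_USED);

		/* A descriptor is available to the device when its AVAIL
		 * bit matches the driver's wrap counter and its USED bit
		 * does not. */
		return avail == wrap_counter && used != wrap_counter;
	}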

Signed-off-by: Jason Wang 
---
 drivers/vhost/net.c|  15 +-
 drivers/vhost/vhost.c  | 655 ++---
 drivers/vhost/vhost.h  |  13 +-
 include/uapi/linux/vhost.h |   7 +
 4 files changed, 644 insertions(+), 46 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index eada5a6..90d9efb 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -74,7 +74,8 @@ enum {
VHOST_NET_FEATURES = VHOST_FEATURES |
 (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) |
 (1ULL << VIRTIO_NET_F_MRG_RXBUF) |
-(1ULL << VIRTIO_F_IOMMU_PLATFORM)
+(1ULL << VIRTIO_F_IOMMU_PLATFORM) |
+(1ULL << VIRTIO_F_RING_PACKED)
 };
 
 enum {
@@ -583,7 +584,7 @@ static void handle_tx(struct vhost_net *net)
nvq->upend_idx = ((unsigned)nvq->upend_idx - 1)
% UIO_MAXIOV;
}
-   vhost_discard_vq_desc(vq, 1);
+   vhost_discard_vq_desc(vq, &used, 1);
vhost_net_enable_vq(net, vq);
break;
}
@@ -736,10 +737,12 @@ static void handle_rx(struct vhost_net *net)
mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);
 
while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk))) {
+   struct vhost_used_elem *used = vq->heads + nvq->done_idx;
+
sock_len += sock_hlen;
vhost_len = sock_len + vhost_hlen;
-   err = vhost_get_bufs(vq, vq->heads + nvq->done_idx,
-vhost_len, &in, vq_log, &log,
+   err = vhost_get_bufs(vq, used, vhost_len,
+&in, vq_log, &log,
 likely(mergeable) ? UIO_MAXIOV : 1,
 &headcount);
/* OK, now we need to know about added descriptors. */
@@ -784,7 +787,7 @@ static void handle_rx(struct vhost_net *net)
if (unlikely(err != sock_len)) {
pr_debug("Discarded rx packet: "
 " len %d, expected %zd\n", err, sock_len);
-   vhost_discard_vq_desc(vq, headcount);
+   vhost_discard_vq_desc(vq, used, 1);
continue;
}
/* Supply virtio_net_hdr if VHOST_NET_F_VIRTIO_NET_HDR */
@@ -808,7 +811,7 @@ static void handle_rx(struct vhost_net *net)
copy_to_iter(&num_buffers, sizeof num_buffers,
 &fixup) != sizeof num_buffers) {
vq_err(vq, "Failed num_buffers write");
-   vhost_discard_vq_desc(vq, headcount);
+   vhost_discard_vq_desc(vq, used, 1);
goto out;
}
nvq->done_idx += headcount;
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 060a431..0f3f07c 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -323,6 +323,9 @@ static void vhost_vq_reset(struct vhost_dev *dev,
vhost_reset_is_le(vq);
vhost_disable_cross_endian(vq);
vq->busyloop_timeout = 0;
+   vq->last_used_wrap_counter = true;
+   vq->last_avail_wrap_counter = true;
+   vq->avail_wrap_counter = true;
vq->umem = NULL;
vq->iotlb = NULL;
__vhost_vq_meta_reset(vq);
@@ -1106,11 +1109,22 @@ static int vhost_iotlb_miss(struct vhost_virtqueue *vq, 
u64 iova, int access)
return 0;
 }
 
-static bool vq_access_ok(struct vhost_virtqueue *vq, unsigned int num,
-struct vring_desc __user *desc,
-struct vring_avail __user *avail,
-struct vring_used __user *used)
+static int vq_access_ok_packed(struct vhost_virtqueue *vq, unsigned int num,
+  struct vring_desc __user *desc,
+  struct vring_avail __user *avail,
+  struct vring_used __user *used)
+{
+   struct 

[PATCH net-next 6/8] virtio: introduce packed ring defines

2018-07-02 Thread Jason Wang
Signed-off-by: Jason Wang 
---
 include/uapi/linux/virtio_config.h |  2 ++
 include/uapi/linux/virtio_ring.h   | 32 
 2 files changed, 34 insertions(+)

diff --git a/include/uapi/linux/virtio_config.h 
b/include/uapi/linux/virtio_config.h
index 449132c..947f6a3 100644
--- a/include/uapi/linux/virtio_config.h
+++ b/include/uapi/linux/virtio_config.h
@@ -75,6 +75,8 @@
  */
 #define VIRTIO_F_IOMMU_PLATFORM33
 
+#define VIRTIO_F_RING_PACKED   34
+
 /*
  * Does the device support Single Root I/O Virtualization?
  */
diff --git a/include/uapi/linux/virtio_ring.h b/include/uapi/linux/virtio_ring.h
index 6d5d5fa..71c7a46 100644
--- a/include/uapi/linux/virtio_ring.h
+++ b/include/uapi/linux/virtio_ring.h
@@ -43,6 +43,8 @@
 #define VRING_DESC_F_WRITE 2
 /* This means the buffer contains a list of buffer descriptors. */
 #define VRING_DESC_F_INDIRECT  4
+#define VRING_DESC_F_AVAIL  7
+#define VRING_DESC_F_USED  15
 
 /* The Host uses this in used->flags to advise the Guest: don't kick me when
  * you add a buffer.  It's unreliable, so it's simply an optimization.  Guest
@@ -62,6 +64,36 @@
  * at the end of the used ring. Guest should ignore the used->flags field. */
 #define VIRTIO_RING_F_EVENT_IDX29
 
+struct vring_desc_packed {
+   /* Buffer Address. */
+   __virtio64 addr;
+   /* Buffer Length. */
+   __virtio32 len;
+   /* Buffer ID. */
+   __virtio16 id;
+   /* The flags depending on descriptor type. */
+   __virtio16 flags;
+};
+
+/* Enable events */
+#define RING_EVENT_FLAGS_ENABLE 0x0
+/* Disable events */
+#define RING_EVENT_FLAGS_DISABLE 0x1
+/*
+ * Enable events for a specific descriptor
+ * (as specified by Descriptor Ring Change Event Offset/Wrap Counter).
+ * Only valid if VIRTIO_F_RING_EVENT_IDX has been negotiated.
+ */
+#define RING_EVENT_FLAGS_DESC 0x2
+/* The value 0x3 is reserved */
+
+struct vring_packed_desc_event {
+   /* Descriptor Ring Change Event Offset and Wrap Counter */
+   __virtio16 off_wrap;
+   /* Descriptor Ring Change Event Flags */
+   __virtio16 flags;
+};
+
 /* Virtio ring descriptors: 16 bytes.  These can chain together via "next". */
 struct vring_desc {
/* Address (guest-physical). */
-- 
2.7.4



[PATCH net-next 5/8] vhost: vhost_put_user() can accept metadata type

2018-07-02 Thread Jason Wang
We used to assume that the used ring update is the only user of
vhost_put_user(). This may not be the case for the incoming packed
ring, which may update the descriptor ring for used descriptors. So
introduce a new type parameter.

Signed-off-by: Jason Wang 
---
 drivers/vhost/vhost.c | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index af15bec..060a431 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -814,7 +814,7 @@ static inline void __user *__vhost_get_user(struct 
vhost_virtqueue *vq,
return __vhost_get_user_slow(vq, addr, size, type);
 }
 
-#define vhost_put_user(vq, x, ptr) \
+#define vhost_put_user(vq, x, ptr, type)   \
 ({ \
int ret = -EFAULT; \
if (!vq->iotlb) { \
@@ -822,7 +822,7 @@ static inline void __user *__vhost_get_user(struct 
vhost_virtqueue *vq,
} else { \
__typeof__(ptr) to = \
(__typeof__(ptr)) __vhost_get_user(vq, ptr, \
- sizeof(*ptr), VHOST_ADDR_USED); \
+ sizeof(*ptr), type); \
if (to != NULL) \
ret = __put_user(x, to); \
else \
@@ -1687,7 +1687,7 @@ static int vhost_update_used_flags(struct vhost_virtqueue 
*vq)
 {
void __user *used;
if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
-  &vq->used->flags) < 0)
+  &vq->used->flags, VHOST_ADDR_USED) < 0)
return -EFAULT;
if (unlikely(vq->log_used)) {
/* Make sure the flag is seen before log. */
@@ -1706,7 +1706,7 @@ static int vhost_update_used_flags(struct vhost_virtqueue 
*vq)
 static int vhost_update_avail_event(struct vhost_virtqueue *vq, u16 
avail_event)
 {
if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
-  vhost_avail_event(vq)))
+  vhost_avail_event(vq), VHOST_ADDR_USED))
return -EFAULT;
if (unlikely(vq->log_used)) {
void __user *used;
@@ -2189,12 +2189,12 @@ static int __vhost_add_used_n(struct vhost_virtqueue 
*vq,
used = vq->used->ring + start;
for (i = 0; i < count; i++) {
if (unlikely(vhost_put_user(vq, heads[i].elem.id,
-   &used[i].id))) {
+   &used[i].id, VHOST_ADDR_USED))) {
vq_err(vq, "Failed to write used id");
return -EFAULT;
}
if (unlikely(vhost_put_user(vq, heads[i].elem.len,
-   &used[i].len))) {
+   &used[i].len, VHOST_ADDR_USED))) {
vq_err(vq, "Failed to write used len");
return -EFAULT;
}
@@ -2240,7 +2240,7 @@ int vhost_add_used_n(struct vhost_virtqueue *vq, struct 
vhost_used_elem *heads,
/* Make sure buffer is written before we update index. */
smp_wmb();
if (vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
-  &vq->used->idx)) {
+  &vq->used->idx, VHOST_ADDR_USED)) {
vq_err(vq, "Failed to increment used idx");
return -EFAULT;
}
-- 
2.7.4



[PATCH net-next 4/8] vhost_net: do not explicitly manipulate vhost_used_elem

2018-07-02 Thread Jason Wang
Two helpers for setting/getting the used length are introduced to
avoid explicitly manipulating vhost_used_elem in the zerocopy code.
This helps hide the used_elem internals and simplifies the packed ring
implementation.

Signed-off-by: Jason Wang 
---
 drivers/vhost/net.c   | 11 +--
 drivers/vhost/vhost.c | 12 ++--
 drivers/vhost/vhost.h |  5 +
 3 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index d109649..eada5a6 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -348,9 +348,10 @@ static void vhost_zerocopy_signal_used(struct vhost_net 
*net,
int j = 0;
 
for (i = nvq->done_idx; i != nvq->upend_idx; i = (i + 1) % UIO_MAXIOV) {
-   if (vq->heads[i].elem.len == VHOST_DMA_FAILED_LEN)
+   if (vhost_get_used_len(vq, &vq->heads[i]) ==
+   VHOST_DMA_FAILED_LEN)
vhost_net_tx_err(net);
-   if (VHOST_DMA_IS_DONE(vq->heads[i].elem.len)) {
+   if (VHOST_DMA_IS_DONE(vhost_get_used_len(vq, &vq->heads[i]))) {
vq->heads[i].elem.len = VHOST_DMA_CLEAR_LEN;
++j;
} else
@@ -549,10 +550,8 @@ static void handle_tx(struct vhost_net *net)
struct ubuf_info *ubuf;
ubuf = nvq->ubuf_info + nvq->upend_idx;
 
-   vq->heads[nvq->upend_idx].elem.id =
-   cpu_to_vhost32(vq, used.elem.id);
-   vq->heads[nvq->upend_idx].elem.len =
-   VHOST_DMA_IN_PROGRESS;
+   vhost_set_used_len(vq, &used, VHOST_DMA_IN_PROGRESS);
+   vq->heads[nvq->upend_idx] = used;
ubuf->callback = vhost_zerocopy_callback;
ubuf->ctx = nvq->ubufs;
ubuf->desc = nvq->upend_idx;
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 641f4c6..af15bec 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2071,11 +2071,19 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 }
 EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
 
-static void vhost_set_used_len(struct vhost_virtqueue *vq,
-  struct vhost_used_elem *used, int len)
+void vhost_set_used_len(struct vhost_virtqueue *vq,
+   struct vhost_used_elem *used, int len)
 {
used->elem.len = cpu_to_vhost32(vq, len);
 }
+EXPORT_SYMBOL_GPL(vhost_set_used_len);
+
+int vhost_get_used_len(struct vhost_virtqueue *vq,
+  struct vhost_used_elem *used)
+{
+   return vhost32_to_cpu(vq, used->elem.len);
+}
+EXPORT_SYMBOL_GPL(vhost_get_used_len);
 
 /* This is a multi-buffer version of vhost_get_desc, that works if
  * vq has read descriptors only.
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 8dea44b..604821b 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -198,6 +198,11 @@ int vhost_get_bufs(struct vhost_virtqueue *vq,
   unsigned *log_num,
   unsigned int quota,
   s16 *count);
+void vhost_set_used_len(struct vhost_virtqueue *vq,
+   struct vhost_used_elem *used,
+   int len);
+int vhost_get_used_len(struct vhost_virtqueue *vq,
+  struct vhost_used_elem *used);
 void vhost_discard_vq_desc(struct vhost_virtqueue *, int n);
 
 int vhost_vq_init_access(struct vhost_virtqueue *);
-- 
2.7.4



[PATCH net-next 3/8] vhost: do not use vring_used_elem

2018-07-02 Thread Jason Wang
Instead of depending on the exported vring_used_elem, this patch
switches to a new internal structure, vhost_used_elem, which embeds
vring_used_elem. This lets vhost record extra metadata for the
incoming packed ring layout.
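
For reference, the wrapper is essentially the following (sketch only;
the exact definition lives in drivers/vhost/vhost.h and gains extra
packed-ring metadata in later patches of this series):

	struct vhost_used_elem {
		struct vring_used_elem elem;
		/* room for packed-ring bookkeeping added later */
	};

Callers below therefore switch from heads[i].id/len to
heads[i].elem.id/len.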

Signed-off-by: Jason Wang 
---
 drivers/vhost/net.c   | 19 +++---
 drivers/vhost/scsi.c  | 10 
 drivers/vhost/vhost.c | 68 ---
 drivers/vhost/vhost.h | 18 --
 drivers/vhost/vsock.c |  6 ++---
 5 files changed, 45 insertions(+), 76 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 449f793..d109649 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -348,10 +348,10 @@ static void vhost_zerocopy_signal_used(struct vhost_net 
*net,
int j = 0;
 
for (i = nvq->done_idx; i != nvq->upend_idx; i = (i + 1) % UIO_MAXIOV) {
-   if (vq->heads[i].len == VHOST_DMA_FAILED_LEN)
+   if (vq->heads[i].elem.len == VHOST_DMA_FAILED_LEN)
vhost_net_tx_err(net);
-   if (VHOST_DMA_IS_DONE(vq->heads[i].len)) {
-   vq->heads[i].len = VHOST_DMA_CLEAR_LEN;
+   if (VHOST_DMA_IS_DONE(vq->heads[i].elem.len)) {
+   vq->heads[i].elem.len = VHOST_DMA_CLEAR_LEN;
++j;
} else
break;
@@ -374,7 +374,7 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, 
bool success)
rcu_read_lock_bh();
 
/* set len to mark this desc buffers done DMA */
-   vq->heads[ubuf->desc].len = success ?
+   vq->heads[ubuf->desc].elem.len = success ?
VHOST_DMA_DONE_LEN : VHOST_DMA_FAILED_LEN;
cnt = vhost_net_ubuf_put(ubufs);
 
@@ -433,7 +433,7 @@ static int vhost_net_enable_vq(struct vhost_net *n,
 
 static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
struct vhost_virtqueue *vq,
-   struct vring_used_elem *used_elem,
+   struct vhost_used_elem *used_elem,
struct iovec iov[], unsigned int iov_size,
unsigned int *out_num, unsigned int *in_num)
 {
@@ -484,7 +484,7 @@ static void handle_tx(struct vhost_net *net)
size_t hdr_size;
struct socket *sock;
struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
-   struct vring_used_elem used;
+   struct vhost_used_elem used;
bool zcopy, zcopy_used;
int sent_pkts = 0;
 
@@ -549,9 +549,10 @@ static void handle_tx(struct vhost_net *net)
struct ubuf_info *ubuf;
ubuf = nvq->ubuf_info + nvq->upend_idx;
 
-   vq->heads[nvq->upend_idx].id =
-   cpu_to_vhost32(vq, used.id);
-   vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS;
+   vq->heads[nvq->upend_idx].elem.id =
+   cpu_to_vhost32(vq, used.elem.id);
+   vq->heads[nvq->upend_idx].elem.len =
+   VHOST_DMA_IN_PROGRESS;
ubuf->callback = vhost_zerocopy_callback;
ubuf->ctx = nvq->ubufs;
ubuf->desc = nvq->upend_idx;
diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 013464c..149c38c 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -67,7 +67,7 @@ struct vhost_scsi_inflight {
 
 struct vhost_scsi_cmd {
/* Descriptor from vhost_get_vq_desc() for virt_queue segment */
-   struct vring_used_elem tvc_vq_used;
+   struct vhost_used_elem tvc_vq_used;
/* virtio-scsi initiator task attribute */
int tvc_task_attr;
/* virtio-scsi response incoming iovecs */
@@ -441,7 +441,7 @@ vhost_scsi_do_evt_work(struct vhost_scsi *vs, struct 
vhost_scsi_evt *evt)
struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT].vq;
struct virtio_scsi_event *event = &evt->event;
struct virtio_scsi_event __user *eventp;
-   struct vring_used_elem used;
+   struct vhost_used_elem used;
unsigned out, in;
int ret;
 
@@ -785,7 +785,7 @@ static void vhost_scsi_submission_work(struct work_struct 
*work)
 static void
 vhost_scsi_send_bad_target(struct vhost_scsi *vs,
   struct vhost_virtqueue *vq,
-  struct vring_used_elem *used, unsigned out)
+  struct vhost_used_elem *used, unsigned out)
 {
struct virtio_scsi_cmd_resp __user *resp;
struct virtio_scsi_cmd_resp rsp;
@@ -808,7 +808,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct 
vhost_virtqueue *vq)
struct virtio_scsi_cmd_req v_req;
struct virtio_scsi_cmd_req_pi v_req_pi;
struct vhost_scsi_cmd *cmd;
-   struct vring_used_elem used;
+   struct 

[PATCH net-next 2/8] vhost: hide used ring layout from device

2018-07-02 Thread Jason Wang
We used to return the descriptor head from vhost_get_vq_desc() to the
device and pass it back to vhost_add_used() and its friends. This
exposes the internal used ring layout to the device, which makes it
hard to extend for e.g. the packed ring layout.

So this patch tries to hide the used ring layout by

- letting vhost_get_vq_desc() return pointer to struct vring_used_elem
- accepting pointer to struct vring_used_elem in vhost_add_used() and
  vhost_add_used_and_signal()

This could help to hide used ring layout and make it easier to
implement packed ring on top.
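
With these changes a device-side caller looks roughly like this
(sketch only, following the conventions visible in the diff below;
error handling trimmed):

	struct vring_used_elem used;
	unsigned int out, in;
	int err;

	err = vhost_get_vq_desc(vq, &used, vq->iov, ARRAY_SIZE(vq->iov),
				&out, &in, NULL, NULL);
	if (err == -ENOSPC) {
		/* Ring empty: re-enable guest notification and wait. */
	} else if (err >= 0) {
		/* Consume the buffers described by vq->iov, then complete
		 * them by handing &used (instead of a descriptor head
		 * index) to vhost_add_used() / vhost_add_used_and_signal(). */
	}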

Signed-off-by: Jason Wang 
---
 drivers/vhost/net.c   | 46 +-
 drivers/vhost/scsi.c  | 62 +++
 drivers/vhost/vhost.c | 52 +-
 drivers/vhost/vhost.h |  9 +---
 drivers/vhost/vsock.c | 42 +-
 5 files changed, 112 insertions(+), 99 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 712b134..449f793 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -433,22 +433,24 @@ static int vhost_net_enable_vq(struct vhost_net *n,
 
 static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
struct vhost_virtqueue *vq,
+   struct vring_used_elem *used_elem,
struct iovec iov[], unsigned int iov_size,
unsigned int *out_num, unsigned int *in_num)
 {
unsigned long uninitialized_var(endtime);
-   int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
+   int r = vhost_get_vq_desc(vq, used_elem, vq->iov, ARRAY_SIZE(vq->iov),
  out_num, in_num, NULL, NULL);
 
-   if (r == vq->num && vq->busyloop_timeout) {
+   if (r == -ENOSPC && vq->busyloop_timeout) {
preempt_disable();
endtime = busy_clock() + vq->busyloop_timeout;
while (vhost_can_busy_poll(vq->dev, endtime) &&
   vhost_vq_avail_empty(vq->dev, vq))
cpu_relax();
preempt_enable();
-   r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
- out_num, in_num, NULL, NULL);
+   r = vhost_get_vq_desc(vq, used_elem, vq->iov,
+ ARRAY_SIZE(vq->iov), out_num, in_num,
+ NULL, NULL);
}
 
return r;
@@ -470,7 +472,6 @@ static void handle_tx(struct vhost_net *net)
struct vhost_net_virtqueue *nvq = >vqs[VHOST_NET_VQ_TX];
struct vhost_virtqueue *vq = >vq;
unsigned out, in;
-   int head;
struct msghdr msg = {
.msg_name = NULL,
.msg_namelen = 0,
@@ -483,6 +484,7 @@ static void handle_tx(struct vhost_net *net)
size_t hdr_size;
struct socket *sock;
struct vhost_net_ubuf_ref *uninitialized_var(ubufs);
+   struct vring_used_elem used;
bool zcopy, zcopy_used;
int sent_pkts = 0;
 
@@ -506,20 +508,20 @@ static void handle_tx(struct vhost_net *net)
vhost_zerocopy_signal_used(net, vq);
 
 
-   head = vhost_net_tx_get_vq_desc(net, vq, vq->iov,
-   ARRAY_SIZE(vq->iov),
-   &out, &in);
-   /* On error, stop handling until the next kick. */
-   if (unlikely(head < 0))
-   break;
+   err = vhost_net_tx_get_vq_desc(net, vq, &used, vq->iov,
+  ARRAY_SIZE(vq->iov),
+  &out, &in);
/* Nothing new?  Wait for eventfd to tell us they refilled. */
-   if (head == vq->num) {
+   if (err == -ENOSPC) {
if (unlikely(vhost_enable_notify(&net->dev, vq))) {
vhost_disable_notify(&net->dev, vq);
continue;
}
break;
}
+   /* On error, stop handling until the next kick. */
+   if (unlikely(err < 0))
+   break;
if (in) {
vq_err(vq, "Unexpected descriptor format for TX: "
   "out %d, int %d\n", out, in);
@@ -547,7 +549,8 @@ static void handle_tx(struct vhost_net *net)
struct ubuf_info *ubuf;
ubuf = nvq->ubuf_info + nvq->upend_idx;
 
-   vq->heads[nvq->upend_idx].id = cpu_to_vhost32(vq, head);
+   vq->heads[nvq->upend_idx].id =
+   cpu_to_vhost32(vq, used.id);
vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS;
ubuf->callback = 

[PATCH net-next 1/8] vhost: move get_rx_bufs to vhost.c

2018-07-02 Thread Jason Wang
Move get_rx_bufs() to vhost.c and rename it to
vhost_get_bufs(). This helps to hide vring internal layout from
specific device implementation. Packed ring implementation will
benefit from this.

Signed-off-by: Jason Wang 
---
 drivers/vhost/net.c   | 77 --
 drivers/vhost/vhost.c | 78 +++
 drivers/vhost/vhost.h |  7 +
 3 files changed, 85 insertions(+), 77 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 29756d8..712b134 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -685,83 +685,6 @@ static int vhost_net_rx_peek_head_len(struct vhost_net 
*net, struct sock *sk)
return len;
 }
 
-/* This is a multi-buffer version of vhost_get_desc, that works if
- * vq has read descriptors only.
- * @vq - the relevant virtqueue
- * @datalen- data length we'll be reading
- * @iovcount   - returned count of io vectors we fill
- * @log- vhost log
- * @log_num- log offset
- * @quota   - headcount quota, 1 for big buffer
- * returns number of buffer heads allocated, negative on error
- */
-static int get_rx_bufs(struct vhost_virtqueue *vq,
-  struct vring_used_elem *heads,
-  int datalen,
-  unsigned *iovcount,
-  struct vhost_log *log,
-  unsigned *log_num,
-  unsigned int quota)
-{
-   unsigned int out, in;
-   int seg = 0;
-   int headcount = 0;
-   unsigned d;
-   int r, nlogs = 0;
-   /* len is always initialized before use since we are always called with
-* datalen > 0.
-*/
-   u32 uninitialized_var(len);
-
-   while (datalen > 0 && headcount < quota) {
-   if (unlikely(seg >= UIO_MAXIOV)) {
-   r = -ENOBUFS;
-   goto err;
-   }
-   r = vhost_get_vq_desc(vq, vq->iov + seg,
- ARRAY_SIZE(vq->iov) - seg, &out,
- &in, log, log_num);
-   if (unlikely(r < 0))
-   goto err;
-
-   d = r;
-   if (d == vq->num) {
-   r = 0;
-   goto err;
-   }
-   if (unlikely(out || in <= 0)) {
-   vq_err(vq, "unexpected descriptor format for RX: "
-   "out %d, in %d\n", out, in);
-   r = -EINVAL;
-   goto err;
-   }
-   if (unlikely(log)) {
-   nlogs += *log_num;
-   log += *log_num;
-   }
-   heads[headcount].id = cpu_to_vhost32(vq, d);
-   len = iov_length(vq->iov + seg, in);
-   heads[headcount].len = cpu_to_vhost32(vq, len);
-   datalen -= len;
-   ++headcount;
-   seg += in;
-   }
-   heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen);
-   *iovcount = seg;
-   if (unlikely(log))
-   *log_num = nlogs;
-
-   /* Detect overrun */
-   if (unlikely(datalen > 0)) {
-   r = UIO_MAXIOV + 1;
-   goto err;
-   }
-   return headcount;
-err:
-   vhost_discard_vq_desc(vq, headcount);
-   return r;
-}
-
 /* Expects to be always run from workqueue - which acts as
  * read-size critical section for our kind of RCU. */
 static void handle_rx(struct vhost_net *net)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index a502f1a..8814e5b 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2104,6 +2104,84 @@ int vhost_get_vq_desc(struct vhost_virtqueue *vq,
 }
 EXPORT_SYMBOL_GPL(vhost_get_vq_desc);
 
+/* This is a multi-buffer version of vhost_get_desc, that works if
+ * vq has read descriptors only.
+ * @vq - the relevant virtqueue
+ * @datalen- data length we'll be reading
+ * @iovcount   - returned count of io vectors we fill
+ * @log- vhost log
+ * @log_num- log offset
+ * @quota   - headcount quota, 1 for big buffer
+ * returns number of buffer heads allocated, negative on error
+ */
+int vhost_get_bufs(struct vhost_virtqueue *vq,
+  struct vring_used_elem *heads,
+  int datalen,
+  unsigned *iovcount,
+  struct vhost_log *log,
+  unsigned *log_num,
+  unsigned int quota)
+{
+   unsigned int out, in;
+   int seg = 0;
+   int headcount = 0;
+   unsigned d;
+   int r, nlogs = 0;
+   /* len is always initialized before use since we are always called with
+* datalen > 0.
+*/
+   u32 uninitialized_var(len);
+
+   while (datalen > 0 && headcount < quota) {
+   if (unlikely(seg >= UIO_MAXIOV)) {
+  

[PATCH net-next 0/8] Packed virtqueue for vhost

2018-07-02 Thread Jason Wang
Hi all:

This series implements packed virtqueues. The code was tested with
Tiwei's RFC V6 at https://lkml.org/lkml/2018/6/5/120.

A pktgen test for both RX and TX does not show an obvious difference
from split virtqueues. The main bottleneck is the guest Linux driver,
since it cannot stress vhost to 100% CPU utilization. A full TCP
benchmark is ongoing. The virtio-net pmd will be tested as well when
it is ready.

This version were tested with:

- Zerocopy (Out of Order) support
- vIOMMU support
- mergeable buffer on/off
- busy polling on/off

Changes from RFC V5:

- save unnecessary barriers during vhost_add_used_packed_n()
- more compact math for event idx
- fix failure of SET_VRING_BASE when avail_wrap_counter is true
- fix not copy avail_wrap_counter during GET_VRING_BASE
- introduce SET_VRING_USED_BASE/GET_VRING_USED_BASE for syncing last_used_idx
- rename used_wrap_counter to last_used_wrap_counter
- rebase to net-next

Changes from RFC V4:

- fix signalled_used index recording
- track avail index correctly
- various minor fixes

Changes from RFC V3:

- Fix math on event idx checking
- Sync last avail wrap counter through GET/SET_VRING_BASE
- remove desc_event prefix in the driver/device structure

Changes from RFC V2:

- do not use & in checking desc_event_flags
- off should be most significant bit
- remove the workaround of mergeable buffer for dpdk prototype
- id should be in the last descriptor in the chain
- keep _F_WRITE for write descriptor when adding used
- device flags updating should use ADDR_USED type
- return error on unexpected unavail descriptor in a chain
- return false in vhost_vq_avail_empty() if a descriptor is available
- track last seen avail_wrap_counter
- correctly examine available descriptor in get_indirect_packed()
- vhost_idx_diff should return u16 instead of bool

Changes from RFC V1:

- Refactor vhost used elem code to avoid open coding on used elem
- Event suppression support (compile test only).
- Indirect descriptor support (compile test only).
- Zerocopy support.
- vIOMMU support.
- SCSI/VSOCK support (compile test only).
- Fix several bugs

Jason Wang (8):
  vhost: move get_rx_bufs to vhost.c
  vhost: hide used ring layout from device
  vhost: do not use vring_used_elem
  vhost_net: do not explicitly manipulate vhost_used_elem
  vhost: vhost_put_user() can accept metadata type
  virtio: introduce packed ring defines
  vhost: packed ring support
  vhost: event suppression for packed ring

 drivers/vhost/net.c| 144 ++
 drivers/vhost/scsi.c   |  62 +--
 drivers/vhost/vhost.c  | 996 +
 drivers/vhost/vhost.h  |  52 +-
 drivers/vhost/vsock.c  |  42 +-
 include/uapi/linux/vhost.h |   7 +
 include/uapi/linux/virtio_config.h |   2 +
 include/uapi/linux/virtio_ring.h   |  32 ++
 8 files changed, 1070 insertions(+), 267 deletions(-)

-- 
2.7.4



Re: [PATCH 0/9] Fix references for some missing documentation files

2018-07-02 Thread Jonathan Corbet
On Tue, 26 Jun 2018 06:49:02 -0300
Mauro Carvalho Chehab  wrote:

> Having nothing to do while waiting for my plane to arrive while
> returning back from Japan, I ended by writing a small series of 
> patches meant to reduce the number of bad Documentation/* 
> links that are detected by:
>   ./scripts/documentation-file-ref-check

I've applied everything except the two networking patches, since I expect
those to go through Dave's tree.

Thanks,

jon


Re: [PATCH] x86-64: use RIP-relative calls for paravirt indirect ones

2018-07-02 Thread Juergen Gross
On 25/06/18 12:29, Jan Beulich wrote:
> This saves one insn byte per instance, summing up to a savings of over
> 4k in my (stripped down) configuration. No variant of to be patched in
> replacement code relies on the one byte larger size.
> 
> Signed-off-by: Jan Beulich 

Reviewed-by: Juergen Gross 


Juergen


[PATCH net-next v4 4/4] net: vhost: add rx busy polling in tx path

2018-07-02 Thread xiangxia . m . yue
From: Tonghao Zhang 

This patch improves the guest receive and transmit performance.
On the handle_tx side, we poll the sock receive queue at the
same time. handle_rx does the same.

We set poll-us=100us and use iperf3 to test bandwidth, and
netperf to test throughput and mean latency. When running the
tests, the vhost-net kthread of that VM is always at 100% CPU.
The commands are shown below.

iperf3  -s -D
iperf3  -c IP -i 1 -P 1 -t 20 -M 1400

or
netserver
netperf -H IP -t TCP_RR -l 20 -- -O "THROUGHPUT,MEAN_LATENCY"

host -> guest:
iperf3:
* With the patch: 27.0 Gbits/sec
* Without the patch:  14.4 Gbits/sec

netperf (TCP_RR):
* With the patch: 48039.56 trans/s, 20.64us mean latency
* Without the patch:  46027.07 trans/s, 21.58us mean latency

This patch also improves the guest transmit performance.

guest -> host:
iperf3:
* With the patch: 27.2 Gbits/sec
* Without the patch:  24.4 Gbits/sec

netperf (TCP_RR):
* With the patch: 47963.25 trans/s, 20.71us mean latency
* Without the patch:  45796.70 trans/s, 21.68us mean latency

Signed-off-by: Tonghao Zhang 
---
 drivers/vhost/net.c | 10 +++---
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 2790959..3f26547 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -480,17 +480,13 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
struct iovec iov[], unsigned int iov_size,
unsigned int *out_num, unsigned int *in_num)
 {
-   unsigned long uninitialized_var(endtime);
+   struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
  out_num, in_num, NULL, NULL);
 
if (r == vq->num && vq->busyloop_timeout) {
-   preempt_disable();
-   endtime = busy_clock() + vq->busyloop_timeout;
-   while (vhost_can_busy_poll(vq->dev, endtime) &&
-  vhost_vq_avail_empty(vq->dev, vq))
-   cpu_relax();
-   preempt_enable();
+   vhost_net_busy_poll(net, &rnvq->vq, vq, false);
+
r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
  out_num, in_num, NULL, NULL);
}
-- 
1.8.3.1



[PATCH net-next v4 3/4] net: vhost: factor out busy polling logic to vhost_net_busy_poll()

2018-07-02 Thread xiangxia . m . yue
From: Tonghao Zhang 

Factor out the generic busy polling logic so that it can be
used in the tx path in the next patch. With this patch,
qemu can set a different busyloop_timeout for the rx queue.

Signed-off-by: Tonghao Zhang 
---
 drivers/vhost/net.c | 94 +++--
 1 file changed, 55 insertions(+), 39 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 62bb8e8..2790959 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -429,6 +429,52 @@ static int vhost_net_enable_vq(struct vhost_net *n,
return vhost_poll_start(poll, sock->file);
 }
 
+static int sk_has_rx_data(struct sock *sk)
+{
+   struct socket *sock = sk->sk_socket;
+
+   if (sock->ops->peek_len)
+   return sock->ops->peek_len(sock);
+
+   return skb_queue_empty(&sk->sk_receive_queue);
+}
+
+static void vhost_net_busy_poll(struct vhost_net *net,
+   struct vhost_virtqueue *rvq,
+   struct vhost_virtqueue *tvq,
+   bool rx)
+{
+   unsigned long uninitialized_var(endtime);
+   unsigned long busyloop_timeout;
+   struct socket *sock;
+   struct vhost_virtqueue *vq = rx ? tvq : rvq;
+
+   mutex_lock_nested(&vq->mutex, rx ? VHOST_NET_VQ_TX: VHOST_NET_VQ_RX);
+
+   vhost_disable_notify(&net->dev, vq);
+   sock = rvq->private_data;
+   busyloop_timeout = rx ? rvq->busyloop_timeout : tvq->busyloop_timeout;
+
+   preempt_disable();
+   endtime = busy_clock() + busyloop_timeout;
+   while (vhost_can_busy_poll(tvq->dev, endtime) &&
+  !(sock && sk_has_rx_data(sock->sk)) &&
+  vhost_vq_avail_empty(tvq->dev, tvq))
+   cpu_relax();
+   preempt_enable();
+
+   if ((rx && !vhost_vq_avail_empty(&net->dev, vq)) ||
+   (!rx && (sock && sk_has_rx_data(sock->sk)))) {
+   vhost_poll_queue(&vq->poll);
+   } else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
+   vhost_disable_notify(&net->dev, vq);
+   vhost_poll_queue(&vq->poll);
+   }
+
+   mutex_unlock(&vq->mutex);
+}
+
+
 static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
struct vhost_virtqueue *vq,
struct iovec iov[], unsigned int iov_size,
@@ -621,16 +667,6 @@ static int peek_head_len(struct vhost_net_virtqueue *rvq, 
struct sock *sk)
return len;
 }
 
-static int sk_has_rx_data(struct sock *sk)
-{
-   struct socket *sock = sk->sk_socket;
-
-   if (sock->ops->peek_len)
-   return sock->ops->peek_len(sock);
-
-   return skb_queue_empty(&sk->sk_receive_queue);
-}
-
 static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
 {
struct vhost_virtqueue *vq = &nvq->vq;
@@ -645,39 +681,19 @@ static void vhost_rx_signal_used(struct 
vhost_net_virtqueue *nvq)
 
 static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
 {
-   struct vhost_net_virtqueue *rvq = &net->vqs[VHOST_NET_VQ_RX];
-   struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
-   struct vhost_virtqueue *vq = &nvq->vq;
-   unsigned long uninitialized_var(endtime);
-   int len = peek_head_len(rvq, sk);
+   struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
+   struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
 
-   if (!len && vq->busyloop_timeout) {
-   /* Flush batched heads first */
-   vhost_rx_signal_used(rvq);
-   /* Both tx vq and rx socket were polled here */
-   mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
-   vhost_disable_notify(&net->dev, vq);
+   int len = peek_head_len(rnvq, sk);
 
-   preempt_disable();
-   endtime = busy_clock() + vq->busyloop_timeout;
-
-   while (vhost_can_busy_poll(&net->dev, endtime) &&
-  !sk_has_rx_data(sk) &&
-  vhost_vq_avail_empty(&net->dev, vq))
-   cpu_relax();
-
-   preempt_enable();
-
-   if (!vhost_vq_avail_empty(&net->dev, vq))
-   vhost_poll_queue(&vq->poll);
-   else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
-   vhost_disable_notify(&net->dev, vq);
-   vhost_poll_queue(&vq->poll);
-   }
+   if (!len && rnvq->vq.busyloop_timeout) {
+   /* Flush batched heads first */
+   vhost_rx_signal_used(rnvq);
 
-   mutex_unlock(&vq->mutex);
+   /* Both tx vq and rx socket were polled here */
+   vhost_net_busy_poll(net, &rnvq->vq, &tnvq->vq, true);
 
-   len = peek_head_len(rvq, sk);
+   len = peek_head_len(rnvq, sk);
}
 
return len;
-- 
1.8.3.1



[PATCH net-next v4 2/4] net: vhost: replace magic number of lock annotation

2018-07-02 Thread xiangxia . m . yue
From: Tonghao Zhang 

Use the VHOST_NET_VQ_XXX as a subclass for mutex_lock_nested.

Signed-off-by: Tonghao Zhang 
Acked-by: Jason Wang 
---
 drivers/vhost/net.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index e7cf7d2..62bb8e8 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -484,7 +484,7 @@ static void handle_tx(struct vhost_net *net)
bool zcopy, zcopy_used;
int sent_pkts = 0;
 
-   mutex_lock(&vq->mutex);
+   mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
sock = vq->private_data;
if (!sock)
goto out;
@@ -655,7 +655,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net 
*net, struct sock *sk)
/* Flush batched heads first */
vhost_rx_signal_used(rvq);
/* Both tx vq and rx socket were polled here */
-   mutex_lock_nested(&vq->mutex, 1);
+   mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
vhost_disable_notify(&net->dev, vq);
 
preempt_disable();
@@ -789,7 +789,7 @@ static void handle_rx(struct vhost_net *net)
__virtio16 num_buffers;
int recv_pkts = 0;
 
-   mutex_lock_nested(&vq->mutex, 0);
+   mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_RX);
sock = vq->private_data;
if (!sock)
goto out;
-- 
1.8.3.1



[PATCH net-next v4 1/4] vhost: lock the vqs one by one

2018-07-02 Thread xiangxia . m . yue
From: Tonghao Zhang 

This patch changes the locking so that, instead of
taking all vq locks at the same time, they are taken
one by one. This will be used by the next patch to
avoid a deadlock.

Signed-off-by: Tonghao Zhang 
Acked-by: Jason Wang 
Signed-off-by: Jason Wang 
---
 drivers/vhost/vhost.c | 24 +++-
 1 file changed, 7 insertions(+), 17 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 895eaa2..4ca9383 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -294,8 +294,11 @@ static void vhost_vq_meta_reset(struct vhost_dev *d)
 {
int i;
 
-   for (i = 0; i < d->nvqs; ++i)
+   for (i = 0; i < d->nvqs; ++i) {
+   mutex_lock(&d->vqs[i]->mutex);
__vhost_vq_meta_reset(d->vqs[i]);
+   mutex_unlock(&d->vqs[i]->mutex);
+   }
 }
 
 static void vhost_vq_reset(struct vhost_dev *dev,
@@ -887,20 +890,6 @@ static inline void __user *__vhost_get_user(struct 
vhost_virtqueue *vq,
 #define vhost_get_used(vq, x, ptr) \
vhost_get_user(vq, x, ptr, VHOST_ADDR_USED)
 
-static void vhost_dev_lock_vqs(struct vhost_dev *d)
-{
-   int i = 0;
-   for (i = 0; i < d->nvqs; ++i)
-   mutex_lock_nested(&d->vqs[i]->mutex, i);
-}
-
-static void vhost_dev_unlock_vqs(struct vhost_dev *d)
-{
-   int i = 0;
-   for (i = 0; i < d->nvqs; ++i)
-   mutex_unlock(&d->vqs[i]->mutex);
-}
-
 static int vhost_new_umem_range(struct vhost_umem *umem,
u64 start, u64 size, u64 end,
u64 userspace_addr, int perm)
@@ -950,7 +939,10 @@ static void vhost_iotlb_notify_vq(struct vhost_dev *d,
if (msg->iova <= vq_msg->iova &&
msg->iova + msg->size - 1 > vq_msg->iova &&
vq_msg->type == VHOST_IOTLB_MISS) {
+   mutex_lock(&node->vq->mutex);
vhost_poll_queue(&node->vq->poll);
+   mutex_unlock(&node->vq->mutex);
+
list_del(&node->node);
kfree(node);
}
@@ -982,7 +974,6 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev,
int ret = 0;
 
mutex_lock(&dev->mutex);
-   vhost_dev_lock_vqs(dev);
switch (msg->type) {
case VHOST_IOTLB_UPDATE:
if (!dev->iotlb) {
@@ -1016,7 +1007,6 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev,
break;
}
 
-   vhost_dev_unlock_vqs(dev);
mutex_unlock(&dev->mutex);
 
return ret;
-- 
1.8.3.1



[PATCH net-next v4 0/4] net: vhost: improve performance when enable busyloop

2018-07-02 Thread xiangxia . m . yue
From: Tonghao Zhang 

These patches improve the guest receive and transmit performance.
On the handle_tx side, we poll the sock receive queue at the same time.
handle_rx does the same.

For more performance report, see patch 4.

v3 -> v4:
fix some issues

v2 -> v3:
This patches are splited from previous big patch:
http://patchwork.ozlabs.org/patch/934673/

Tonghao Zhang (4):
  vhost: lock the vqs one by one
  net: vhost: replace magic number of lock annotation
  net: vhost: factor out busy polling logic to vhost_net_busy_poll()
  net: vhost: add rx busy polling in tx path

 drivers/vhost/net.c   | 108 --
 drivers/vhost/vhost.c |  24 ---
 2 files changed, 67 insertions(+), 65 deletions(-)

-- 
1.8.3.1



Re: [PATCH vhost] vhost_net: Fix too many vring kick on busypoll

2018-07-02 Thread Jason Wang



On 2018年07月02日 16:05, Toshiaki Makita wrote:

On 2018/07/02 16:52, Jason Wang wrote:

On 2018年07月02日 15:11, Toshiaki Makita wrote:

On 2018/07/02 15:17, Jason Wang wrote:

On 2018年07月02日 12:37, Toshiaki Makita wrote:

On 2018/07/02 11:54, Jason Wang wrote:

On 2018年07月02日 10:45, Toshiaki Makita wrote:

Hi Jason,

On 2018/06/29 18:30, Jason Wang wrote:

On 2018年06月29日 16:09, Toshiaki Makita wrote:

...

To fix this, poll the work instead of enabling notification when
busypoll is interrupted by something. IMHO signal_pending() and
vhost_has_work() are kind of interruptions rather than signals to
completely cancel the busypoll, so let's run busypoll after the
necessary work is done. In order to avoid too long busyloop due to
interruption, save the endtime in vq field and use it when
reentering
the work function.

I think we don't care long busyloop unless e.g tx can starve rx?

I just want to keep it user-controllable. Unless memorizing it
busypoll
can run unexpectedly long.

I think the total amount of time for busy polling is bounded. If I was
wrong, it should be a bug somewhere.

Consider this kind of scenario:
0. Set 100us busypoll for example.
1. handle_tx() runs busypoll.
2. Something like zerocopy queues tx_work within 100us.
3. busypoll exits and call handle_tx() again.
4. Repeat 1-3.

In this case handle_tx() does not process packets but busypoll
essentially runs beyond 100us without endtime memorized. This may be
just a theoretical problem, but I was worried that more code to poll tx
queue can be added in the future and it becomes realistic.

Yes, but consider zerocopy tends to batch 16 used packets and we will
finally finish all processing of packets. The above won't be endless, so
it was probably tolerable.

Right. So endtime memorization is more like a future-proof thing.
Would you like to keep it or change something?

I think we'd better introduce it only when we meet real bugs.

I'll change it to a flag to indicate the previous busypoll is interrupted.


Performance numbers:

- Bulk transfer from guest to external physical server.
     [Guest]->vhost_net->tap--(XDP_REDIRECT)-->i40e --(wire)-->
[Server]

Just to confirm in this case since zerocopy is enabled, we are in
fact
use the generic XDP datapath?

For some reason zerocopy was not applied for most packets, so in most
cases driver XDP was used. I was going to dig into it but not yet.

Right, just to confirm this. This is expected.

In tuntap, we do native XDP only for small and non zerocopy
packets. See
tun_can_build_skb(). The reason is XDP may adjust packet header
which is
not supported by zercopy. We can only use XDP generic for zerocopy in
this case.

I think I understand when driver XDP can be used. What I'm not sure and
was going to narrow down is why zerocopy is mostly not applied.


I see, any touch to the zerocopy packet (clone, header expansion or
segmentation) that lead a userspace copy will increase the error counter
in vhost_net. Then vhost_net_tx_select_zcopy() may choose not to use
zerocopy. So it was probably something in your setup or a bug somewhere.

Thanks for the hint!

Seems zerocopy packets are always nonlinear and
netif_receive_generic_xdp() calls skb_linearize() in which
__pskb_pull_tail() calls skb_zcopy_clear(). Looks like tx_zcopy_err is
always counted when zerocopy is used with XDP in my env.



Right, since it needs do copy there, so we tend to disable zerocopy in 
this case.


Thanks


Re: [PATCH vhost] vhost_net: Fix too many vring kick on busypoll

2018-07-02 Thread Toshiaki Makita
On 2018/07/02 16:52, Jason Wang wrote:
> On 2018年07月02日 15:11, Toshiaki Makita wrote:
>> On 2018/07/02 15:17, Jason Wang wrote:
>>> On 2018年07月02日 12:37, Toshiaki Makita wrote:
 On 2018/07/02 11:54, Jason Wang wrote:
> On 2018年07月02日 10:45, Toshiaki Makita wrote:
>> Hi Jason,
>>
>> On 2018/06/29 18:30, Jason Wang wrote:
>>> On 2018年06月29日 16:09, Toshiaki Makita wrote:
>> ...
 To fix this, poll the work instead of enabling notification when
 busypoll is interrupted by something. IMHO signal_pending() and
 vhost_has_work() are kind of interruptions rather than signals to
 completely cancel the busypoll, so let's run busypoll after the
 necessary work is done. In order to avoid too long busyloop due to
 interruption, save the endtime in vq field and use it when
 reentering
 the work function.
>>> I think we don't care about a long busyloop unless e.g. tx can starve rx?
>> I just want to keep it user-controllable. Unless we memorize the endtime,
>> busypoll can run unexpectedly long.
> I think the total amount of time for busy polling is bounded. If I was
> wrong, it should be a bug somewhere.
 Consider this kind of scenario:
 0. Set 100us busypoll for example.
 1. handle_tx() runs busypoll.
 2. Something like zerocopy queues tx_work within 100us.
 3. busypoll exits and calls handle_tx() again.
 4. Repeat 1-3.

 In this case handle_tx() does not process packets, but busypoll
 essentially runs beyond 100us without the endtime memorized. This may be
 just a theoretical problem, but I was worried that more code to poll the tx
 queue could be added in the future, making it realistic.
>>> Yes, but consider that zerocopy tends to batch 16 used packets, and we will
>>> eventually finish processing all of the packets. The above won't be endless, so
>>> it is probably tolerable.
>> Right. So endtime memorization is more like a future-proof thing.
>> Would you like to keep it or change something?
> 
> I think we'd better introduce it only when we meet real bugs.

I'll change it to a flag to indicate the previous busypoll is interrupted.

 Performance numbers:

 - Bulk transfer from guest to external physical server.
     [Guest]->vhost_net->tap--(XDP_REDIRECT)-->i40e --(wire)-->
 [Server]
>>> Just to confirm: in this case, since zerocopy is enabled, do we in fact
>>> use the generic XDP datapath?
>> For some reason zerocopy was not applied for most packets, so in most
>> cases driver XDP was used. I was going to dig into it but not yet.
> Right, just to confirm this. This is expected.
>
> In tuntap, we do native XDP only for small and non-zerocopy packets; see
> tun_can_build_skb(). The reason is that XDP may adjust the packet header,
> which is not supported by zerocopy. We can only use generic XDP for
> zerocopy in this case.
 I think I understand when driver XDP can be used. What I'm not sure about,
 and was going to narrow down, is why zerocopy is mostly not applied.

>>> I see; any touch to the zerocopy packet (clone, header expansion or
>>> segmentation) that leads to a userspace copy will increase the error counter
>>> in vhost_net. Then vhost_net_tx_select_zcopy() may choose not to use
>>> zerocopy. So it was probably something in your setup or a bug somewhere.
>> Thanks for the hint!

It seems zerocopy packets are always nonlinear, and
netif_receive_generic_xdp() calls skb_linearize(), in which
__pskb_pull_tail() calls skb_zcopy_clear(). So it looks like tx_zcopy_err is
always counted when zerocopy is used with XDP in my environment.

-- 
Toshiaki Makita


Re: [PATCH vhost] vhost_net: Fix too many vring kick on busypoll

2018-07-02 Thread Jason Wang



On 2018年07月02日 15:11, Toshiaki Makita wrote:

On 2018/07/02 15:17, Jason Wang wrote:

On 2018年07月02日 12:37, Toshiaki Makita wrote:

On 2018/07/02 11:54, Jason Wang wrote:

On 2018年07月02日 10:45, Toshiaki Makita wrote:

Hi Jason,

On 2018/06/29 18:30, Jason Wang wrote:

On 2018年06月29日 16:09, Toshiaki Makita wrote:

...

To fix this, poll the work instead of enabling notification when
busypoll is interrupted by something. IMHO signal_pending() and
vhost_has_work() are kind of interruptions rather than signals to
completely cancel the busypoll, so let's run busypoll after the
necessary work is done. In order to avoid too long busyloop due to
interruption, save the endtime in vq field and use it when reentering
the work function.

I think we don't care about a long busyloop unless e.g. tx can starve rx?

I just want to keep it user-controllable. Unless we memorize the endtime,
busypoll can run unexpectedly long.

I think the total amount of time for busy polling is bounded. If I was
wrong, it should be a bug somewhere.

Consider this kind of scenario:
0. Set 100us busypoll for example.
1. handle_tx() runs busypoll.
2. Something like zerocopy queues tx_work within 100us.
3. busypoll exits and calls handle_tx() again.
4. Repeat 1-3.

In this case handle_tx() does not process packets, but busypoll
essentially runs beyond 100us without the endtime memorized. This may be
just a theoretical problem, but I was worried that more code to poll the tx
queue could be added in the future, making it realistic.

Yes, but consider that zerocopy tends to batch 16 used packets, and we will
eventually finish processing all of the packets. The above won't be endless, so
it is probably tolerable.

Right. So endtime memorization is more like a future-proof thing.
Would you like to keep it or change something?


I think we'd better introduce it only when we meet real bugs.

Thanks




Performance numbers:

- Bulk transfer from guest to external physical server.
    [Guest]->vhost_net->tap--(XDP_REDIRECT)-->i40e --(wire)-->
[Server]

Just to confirm: in this case, since zerocopy is enabled, do we in fact
use the generic XDP datapath?

For some reason zerocopy was not applied for most packets, so in most
cases driver XDP was used. I was going to dig into it but not yet.

Right, just to confirm this. This is expected.

In tuntap, we do native XDP only for small and non-zerocopy packets; see
tun_can_build_skb(). The reason is that XDP may adjust the packet header,
which is not supported by zerocopy. We can only use generic XDP for
zerocopy in this case.

I think I understand when driver XDP can be used. What I'm not sure about,
and was going to narrow down, is why zerocopy is mostly not applied.


I see; any touch to the zerocopy packet (clone, header expansion or
segmentation) that leads to a userspace copy will increase the error counter
in vhost_net. Then vhost_net_tx_select_zcopy() may choose not to use
zerocopy. So it was probably something in your setup or a bug somewhere.

Thanks for the hint!




Re: [PATCH vhost] vhost_net: Fix too many vring kick on busypoll

2018-07-02 Thread Jason Wang



On 2018年07月02日 12:37, Toshiaki Makita wrote:

On 2018/07/02 11:54, Jason Wang wrote:

On 2018年07月02日 10:45, Toshiaki Makita wrote:

Hi Jason,

On 2018/06/29 18:30, Jason Wang wrote:

On 2018年06月29日 16:09, Toshiaki Makita wrote:

...

To fix this, poll the work instead of enabling notification when
busypoll is interrupted by something. IMHO signal_pending() and
vhost_has_work() are kind of interruptions rather than signals to
completely cancel the busypoll, so let's run busypoll after the
necessary work is done. In order to avoid too long busyloop due to
interruption, save the endtime in vq field and use it when reentering
the work function.

I think we don't care about a long busyloop unless e.g. tx can starve rx?

I just want to keep it user-controllable. Unless we memorize the endtime,
busypoll can run unexpectedly long.

I think the total amount of time for busy polling is bounded. If I was
wrong, it should be a bug somewhere.

Consider this kind of scenario:
0. Set 100us busypoll for example.
1. handle_tx() runs busypoll.
2. Something like zerocopy queues tx_work within 100us.
3. busypoll exits and calls handle_tx() again.
4. Repeat 1-3.

In this case handle_tx() does not process packets, but busypoll
essentially runs beyond 100us without the endtime memorized. This may be
just a theoretical problem, but I was worried that more code to poll the tx
queue could be added in the future, making it realistic.


Yes, but consider that zerocopy tends to batch 16 used packets, and we will
eventually finish processing all of the packets. The above won't be endless, so
it is probably tolerable.





Performance numbers:

- Bulk transfer from guest to external physical server.
   [Guest]->vhost_net->tap--(XDP_REDIRECT)-->i40e --(wire)-->
[Server]

Just to confirm: in this case, since zerocopy is enabled, do we in fact
use the generic XDP datapath?

For some reason zerocopy was not applied for most packets, so in most
cases driver XDP was used. I was going to dig into it but not yet.

Right, just to confirm this. This is expected.

In tuntap, we do native XDP only for small and non-zerocopy packets; see
tun_can_build_skb(). The reason is that XDP may adjust the packet header,
which is not supported by zerocopy. We can only use generic XDP for
zerocopy in this case.

I think I understand when driver XDP can be used. What I'm not sure about,
and was going to narrow down, is why zerocopy is mostly not applied.



I see; any touch to the zerocopy packet (clone, header expansion or
segmentation) that leads to a userspace copy will increase the error counter
in vhost_net. Then vhost_net_tx_select_zcopy() may choose not to use
zerocopy. So it was probably something in your setup or a bug somewhere.


Thanks