Re:[PATCH v1 2/3] Provides multiple submits and asynchronous notifications.

2010-04-08 Thread xiaohui . xin
From: Xin Xiaohui xiaohui@intel.com

---
Michael,
This is a small patch for the write logging issue with async queue.
I have made a __vhost_get_vq_desc() func which may compute the log
info with any valid buffer index. The __vhost_get_vq_desc() is 
coming from the code in vhost_get_vq_desc().
And I use it to recompute the log info when logging is enabled.

Thanks
Xiaohui

 drivers/vhost/net.c   |   27 ---
 drivers/vhost/vhost.c |  115 -
 drivers/vhost/vhost.h |5 ++
 3 files changed, 90 insertions(+), 57 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 2aafd90..00a45ef 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -115,7 +115,8 @@ static void handle_async_rx_events_notify(struct vhost_net 
*net,
struct kiocb *iocb = NULL;
struct vhost_log *vq_log = NULL;
int rx_total_len = 0;
-   int log, size;
+   unsigned int head, log, in, out;
+   int size;
 
if (vq-link_state != VHOST_VQ_LINK_ASYNC)
return;
@@ -130,14 +131,25 @@ static void handle_async_rx_events_notify(struct 
vhost_net *net,
iocb-ki_pos, iocb-ki_nbytes);
log = (int)iocb-ki_user_data;
size = iocb-ki_nbytes;
+   head = iocb-ki_pos;
rx_total_len += iocb-ki_nbytes;
 
if (iocb-ki_dtor)
iocb-ki_dtor(iocb);
kmem_cache_free(net-cache, iocb);
 
-   if (unlikely(vq_log))
+   /* when log is enabled, recomputing the log info is needed,
+* since these buffers are in async queue, and may not get
+* the log info before.
+*/
+   if (unlikely(vq_log)) {
+   if (!log)
+   __vhost_get_vq_desc(net-dev, vq, vq-iov,
+   ARRAY_SIZE(vq-iov),
+   out, in, vq_log,
+   log, head);
vhost_log_write(vq, vq_log, log, size);
+   }
if (unlikely(rx_total_len = VHOST_NET_WEIGHT)) {
vhost_poll_queue(vq-poll);
break;
@@ -313,14 +325,13 @@ static void handle_rx(struct vhost_net *net)
vhost_disable_notify(vq);
hdr_size = vq-hdr_size;
 
-   /* In async cases, for write logging, the simple way is to get
-* the log info always, and really logging is decided later.
-* Thus, when logging enabled, we can get log, and when logging
-* disabled, we can get log disabled accordingly.
+   /* In async cases, when write log is enabled, in case the submitted
+* buffers did not get log info before the log enabling, so we'd
+* better recompute the log info when needed. We do this in
+* handle_async_rx_events_notify().
 */
 
-   vq_log = unlikely(vhost_has_feature(net-dev, VHOST_F_LOG_ALL)) |
-   (vq-link_state == VHOST_VQ_LINK_ASYNC) ?
+   vq_log = unlikely(vhost_has_feature(net-dev, VHOST_F_LOG_ALL)) ?
vq-log : NULL;
 
handle_async_rx_events_notify(net, vq);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 97233d5..53dab80 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -715,66 +715,21 @@ static unsigned get_indirect(struct vhost_dev *dev, 
struct vhost_virtqueue *vq,
return 0;
 }
 
-/* This looks in the virtqueue and for the first available buffer, and converts
- * it to an iovec for convenient access.  Since descriptors consist of some
- * number of output then some number of input descriptors, it's actually two
- * iovecs, but we pack them into one and note how many of each there were.
- *
- * This function returns the descriptor number found, or vq-num (which
- * is never a valid descriptor number) if none was found. */
-unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
+unsigned __vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
   struct iovec iov[], unsigned int iov_size,
   unsigned int *out_num, unsigned int *in_num,
-  struct vhost_log *log, unsigned int *log_num)
+  struct vhost_log *log, unsigned int *log_num,
+  unsigned int head)
 {
struct vring_desc desc;
-   unsigned int i, head, found = 0;
-   u16 last_avail_idx;
+   unsigned int i = head, found = 0;
int ret;
 
-   /* Check it isn't doing very strange things with descriptor numbers. */
-   last_avail_idx = vq-last_avail_idx;
-   if (get_user(vq-avail_idx, vq-avail-idx)) {
-   vq_err(vq, Failed to access avail idx at %p\n,
-  vq-avail-idx);
-   return 

Re: [PATCH v1 2/3] Provides multiple submits and asynchronous notifications.

2010-04-07 Thread Michael S. Tsirkin
On Wed, Apr 07, 2010 at 09:36:36AM +0800, Xin, Xiaohui wrote:
 Michael,
For the write logging, do you have a function in hand that we can
recompute the log? If that, I think I can use it to recompute the
   log info when the logging is suddenly enabled.
For the outstanding requests, do you mean all the user buffers have
   submitted before the logging ioctl changed? That may be a lot, and
some of them are still in NIC ring descriptors. Waiting them to be
   finished may be need some time. I think when logging ioctl changed,
then the logging is changed just after that is also reasonable.
 
   The key point is that after the logging ioctl returns, any
   subsequent change to memory must be logged. It does not
   matter when was the request submitted, otherwise we will
   get memory corruption on migration.
 
   The change to memory happens when vhost_add_used_and_signal(), right?
   So after ioctl returns, just recompute the log info to the events in 
   the async queue,
   is ok. Since the ioctl and write log operations are all protected by 
   vq-mutex.
 
   Thanks
   Xiaohui
 
  Yes, I think this will work.
 
  Thanks, so do you have the function to recompute the log info in your hand 
  that I can
 use? I have weakly remembered that you have noticed it before some time.
 
 Doesn't just rerunning vhost_get_vq_desc work?
 
 Am I missing something here?
 The vhost_get_vq_desc() looks in vq, and finds the first available buffers, 
 and converts it
 to an iovec. I think the first available buffer is not the buffers in the 
 async queue, so I
 think rerunning vhost_get_vq_desc() cannot work.
 
 Thanks
 Xiaohui

Right, but we can move the head back, so we'll find the same buffers
again, or add a variant of vhost_get_vq_desc that will process
descriptors already consumed.

Thanks
Xiaohui
   
 drivers/vhost/net.c   |  189 
+++--
 drivers/vhost/vhost.h |   10 +++
 2 files changed, 192 insertions(+), 7 deletions(-)
   
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 22d5fef..2aafd90 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -17,11 +17,13 @@
 #include linux/workqueue.h
 #include linux/rcupdate.h
 #include linux/file.h
+#include linux/aio.h
   
 #include linux/net.h
 #include linux/if_packet.h
 #include linux/if_arp.h
 #include linux/if_tun.h
+#include linux/mpassthru.h
   
 #include net/sock.h
   
@@ -47,6 +49,7 @@ struct vhost_net {
  struct vhost_dev dev;
  struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
  struct vhost_poll poll[VHOST_NET_VQ_MAX];
+ struct kmem_cache   *cache;
  /* Tells us whether we are polling a socket for TX.
   * We only do this when socket buffer fills up.
   * Protected by tx vq lock. */
@@ -91,11 +94,88 @@ static void tx_poll_start(struct vhost_net *net, 
struct socket *sock)
  net-tx_poll_state = VHOST_NET_POLL_STARTED;
 }
   
+struct kiocb *notify_dequeue(struct vhost_virtqueue *vq)
+{
+ struct kiocb *iocb = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(vq-notify_lock, flags);
+ if (!list_empty(vq-notifier)) {
+ iocb = list_first_entry(vq-notifier,
+ struct kiocb, ki_list);
+ list_del(iocb-ki_list);
+ }
+ spin_unlock_irqrestore(vq-notify_lock, flags);
+ return iocb;
+}
+
+static void handle_async_rx_events_notify(struct vhost_net *net,
+ struct vhost_virtqueue *vq)
+{
+ struct kiocb *iocb = NULL;
+ struct vhost_log *vq_log = NULL;
+ int rx_total_len = 0;
+ int log, size;
+
+ if (vq-link_state != VHOST_VQ_LINK_ASYNC)
+ return;
+
+ if (vq-receiver)
+ vq-receiver(vq);
+
+ vq_log = unlikely(vhost_has_feature(
+ net-dev, VHOST_F_LOG_ALL)) ? vq-log : NULL;
+ while ((iocb = notify_dequeue(vq)) != NULL) {
+ vhost_add_used_and_signal(net-dev, vq,
+ iocb-ki_pos, iocb-ki_nbytes);
+ log = (int)iocb-ki_user_data;
+ size = iocb-ki_nbytes;
+ rx_total_len += iocb-ki_nbytes;
+
+ if (iocb-ki_dtor)
+ iocb-ki_dtor(iocb);
+ kmem_cache_free(net-cache, iocb);
+
+ if (unlikely(vq_log))
+ vhost_log_write(vq, vq_log, log, size);
+ if (unlikely(rx_total_len = VHOST_NET_WEIGHT)) {
+ vhost_poll_queue(vq-poll);
+ break;
+ }
+ }
+}
+
+static void handle_async_tx_events_notify(struct vhost_net *net,
+ struct vhost_virtqueue *vq)
+{
+ struct kiocb *iocb = NULL;
+ int tx_total_len = 0;
+
+ if (vq-link_state != VHOST_VQ_LINK_ASYNC)
+ return;
+
+ 

Re: [PATCH v1 2/3] Provides multiple submits and asynchronous notifications.

2010-04-06 Thread Michael S. Tsirkin
On Tue, Apr 06, 2010 at 01:46:56PM +0800, Xin, Xiaohui wrote:
 Michael,
   For the write logging, do you have a function in hand that we can
   recompute the log? If that, I think I can use it to recompute the
  log info when the logging is suddenly enabled.
   For the outstanding requests, do you mean all the user buffers have
  submitted before the logging ioctl changed? That may be a lot, and
   some of them are still in NIC ring descriptors. Waiting them to be
  finished may be need some time. I think when logging ioctl changed,
   then the logging is changed just after that is also reasonable.
  
  The key point is that after the logging ioctl returns, any
  subsequent change to memory must be logged. It does not
  matter when was the request submitted, otherwise we will
  get memory corruption on migration.
 
  The change to memory happens when vhost_add_used_and_signal(), right?
  So after ioctl returns, just recompute the log info to the events in the 
  async queue,
  is ok. Since the ioctl and write log operations are all protected by 
  vq-mutex.
  
  Thanks
  Xiaohui
 
 Yes, I think this will work.
 
 Thanks, so do you have the function to recompute the log info in your hand 
 that I can 
 use? I have weakly remembered that you have noticed it before some time.

Doesn't just rerunning vhost_get_vq_desc work?

   Thanks
   Xiaohui
   
drivers/vhost/net.c   |  189 
   +++--
drivers/vhost/vhost.h |   10 +++
2 files changed, 192 insertions(+), 7 deletions(-)
   
   diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
   index 22d5fef..2aafd90 100644
   --- a/drivers/vhost/net.c
   +++ b/drivers/vhost/net.c
   @@ -17,11 +17,13 @@
#include linux/workqueue.h
#include linux/rcupdate.h
#include linux/file.h
   +#include linux/aio.h

#include linux/net.h
#include linux/if_packet.h
#include linux/if_arp.h
#include linux/if_tun.h
   +#include linux/mpassthru.h

#include net/sock.h

   @@ -47,6 +49,7 @@ struct vhost_net {
 struct vhost_dev dev;
 struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
 struct vhost_poll poll[VHOST_NET_VQ_MAX];
   + struct kmem_cache   *cache;
 /* Tells us whether we are polling a socket for TX.
  * We only do this when socket buffer fills up.
  * Protected by tx vq lock. */
   @@ -91,11 +94,88 @@ static void tx_poll_start(struct vhost_net *net, 
   struct socket *sock)
 net-tx_poll_state = VHOST_NET_POLL_STARTED;
}

   +struct kiocb *notify_dequeue(struct vhost_virtqueue *vq)
   +{
   + struct kiocb *iocb = NULL;
   + unsigned long flags;
   +
   + spin_lock_irqsave(vq-notify_lock, flags);
   + if (!list_empty(vq-notifier)) {
   + iocb = list_first_entry(vq-notifier,
   + struct kiocb, ki_list);
   + list_del(iocb-ki_list);
   + }
   + spin_unlock_irqrestore(vq-notify_lock, flags);
   + return iocb;
   +}
   +
   +static void handle_async_rx_events_notify(struct vhost_net *net,
   + struct vhost_virtqueue *vq)
   +{
   + struct kiocb *iocb = NULL;
   + struct vhost_log *vq_log = NULL;
   + int rx_total_len = 0;
   + int log, size;
   +
   + if (vq-link_state != VHOST_VQ_LINK_ASYNC)
   + return;
   +
   + if (vq-receiver)
   + vq-receiver(vq);
   +
   + vq_log = unlikely(vhost_has_feature(
   + net-dev, VHOST_F_LOG_ALL)) ? vq-log : NULL;
   + while ((iocb = notify_dequeue(vq)) != NULL) {
   + vhost_add_used_and_signal(net-dev, vq,
   + iocb-ki_pos, iocb-ki_nbytes);
   + log = (int)iocb-ki_user_data;
   + size = iocb-ki_nbytes;
   + rx_total_len += iocb-ki_nbytes;
   +
   + if (iocb-ki_dtor)
   + iocb-ki_dtor(iocb);
   + kmem_cache_free(net-cache, iocb);
   +
   + if (unlikely(vq_log))
   + vhost_log_write(vq, vq_log, log, size);
   + if (unlikely(rx_total_len = VHOST_NET_WEIGHT)) {
   + vhost_poll_queue(vq-poll);
   + break;
   + }
   + }
   +}
   +
   +static void handle_async_tx_events_notify(struct vhost_net *net,
   + struct vhost_virtqueue *vq)
   +{
   + struct kiocb *iocb = NULL;
   + int tx_total_len = 0;
   +
   + if (vq-link_state != VHOST_VQ_LINK_ASYNC)
   + return;
   +
   + while ((iocb = notify_dequeue(vq)) != NULL) {
   + vhost_add_used_and_signal(net-dev, vq,
   + iocb-ki_pos, 0);
   + tx_total_len += iocb-ki_nbytes;
   +
   + if (iocb-ki_dtor)
   + iocb-ki_dtor(iocb);
   +
   + kmem_cache_free(net-cache, iocb);
   + if (unlikely(tx_total_len = VHOST_NET_WEIGHT)) {
   + vhost_poll_queue(vq-poll);
   + break;
   + }
   + }
   +}
   +
/* Expects to be always run from workqueue - which acts as
 * read-size 

RE: [PATCH v1 2/3] Provides multiple submits and asynchronous notifications.

2010-04-06 Thread Xin, Xiaohui
Michael,
   For the write logging, do you have a function in hand that we can
   recompute the log? If that, I think I can use it to recompute the
  log info when the logging is suddenly enabled.
   For the outstanding requests, do you mean all the user buffers have
  submitted before the logging ioctl changed? That may be a lot, and
   some of them are still in NIC ring descriptors. Waiting them to be
  finished may be need some time. I think when logging ioctl changed,
   then the logging is changed just after that is also reasonable.

  The key point is that after the logging ioctl returns, any
  subsequent change to memory must be logged. It does not
  matter when was the request submitted, otherwise we will
  get memory corruption on migration.

  The change to memory happens when vhost_add_used_and_signal(), right?
  So after ioctl returns, just recompute the log info to the events in the 
  async queue,
  is ok. Since the ioctl and write log operations are all protected by 
  vq-mutex.

  Thanks
  Xiaohui

 Yes, I think this will work.

 Thanks, so do you have the function to recompute the log info in your hand 
 that I can
use? I have weakly remembered that you have noticed it before some time.

Doesn't just rerunning vhost_get_vq_desc work?

Am I missing something here?
The vhost_get_vq_desc() looks in vq, and finds the first available buffers, and 
converts it
to an iovec. I think the first available buffer is not the buffers in the async 
queue, so I
think rerunning vhost_get_vq_desc() cannot work.

Thanks
Xiaohui

   Thanks
   Xiaohui
  
drivers/vhost/net.c   |  189 
   +++--
drivers/vhost/vhost.h |   10 +++
2 files changed, 192 insertions(+), 7 deletions(-)
  
   diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
   index 22d5fef..2aafd90 100644
   --- a/drivers/vhost/net.c
   +++ b/drivers/vhost/net.c
   @@ -17,11 +17,13 @@
#include linux/workqueue.h
#include linux/rcupdate.h
#include linux/file.h
   +#include linux/aio.h
  
#include linux/net.h
#include linux/if_packet.h
#include linux/if_arp.h
#include linux/if_tun.h
   +#include linux/mpassthru.h
  
#include net/sock.h
  
   @@ -47,6 +49,7 @@ struct vhost_net {
 struct vhost_dev dev;
 struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
 struct vhost_poll poll[VHOST_NET_VQ_MAX];
   + struct kmem_cache   *cache;
 /* Tells us whether we are polling a socket for TX.
  * We only do this when socket buffer fills up.
  * Protected by tx vq lock. */
   @@ -91,11 +94,88 @@ static void tx_poll_start(struct vhost_net *net, 
   struct socket *sock)
 net-tx_poll_state = VHOST_NET_POLL_STARTED;
}
  
   +struct kiocb *notify_dequeue(struct vhost_virtqueue *vq)
   +{
   + struct kiocb *iocb = NULL;
   + unsigned long flags;
   +
   + spin_lock_irqsave(vq-notify_lock, flags);
   + if (!list_empty(vq-notifier)) {
   + iocb = list_first_entry(vq-notifier,
   + struct kiocb, ki_list);
   + list_del(iocb-ki_list);
   + }
   + spin_unlock_irqrestore(vq-notify_lock, flags);
   + return iocb;
   +}
   +
   +static void handle_async_rx_events_notify(struct vhost_net *net,
   + struct vhost_virtqueue *vq)
   +{
   + struct kiocb *iocb = NULL;
   + struct vhost_log *vq_log = NULL;
   + int rx_total_len = 0;
   + int log, size;
   +
   + if (vq-link_state != VHOST_VQ_LINK_ASYNC)
   + return;
   +
   + if (vq-receiver)
   + vq-receiver(vq);
   +
   + vq_log = unlikely(vhost_has_feature(
   + net-dev, VHOST_F_LOG_ALL)) ? vq-log : NULL;
   + while ((iocb = notify_dequeue(vq)) != NULL) {
   + vhost_add_used_and_signal(net-dev, vq,
   + iocb-ki_pos, iocb-ki_nbytes);
   + log = (int)iocb-ki_user_data;
   + size = iocb-ki_nbytes;
   + rx_total_len += iocb-ki_nbytes;
   +
   + if (iocb-ki_dtor)
   + iocb-ki_dtor(iocb);
   + kmem_cache_free(net-cache, iocb);
   +
   + if (unlikely(vq_log))
   + vhost_log_write(vq, vq_log, log, size);
   + if (unlikely(rx_total_len = VHOST_NET_WEIGHT)) {
   + vhost_poll_queue(vq-poll);
   + break;
   + }
   + }
   +}
   +
   +static void handle_async_tx_events_notify(struct vhost_net *net,
   + struct vhost_virtqueue *vq)
   +{
   + struct kiocb *iocb = NULL;
   + int tx_total_len = 0;
   +
   + if (vq-link_state != VHOST_VQ_LINK_ASYNC)
   + return;
   +
   + while ((iocb = notify_dequeue(vq)) != NULL) {
   + vhost_add_used_and_signal(net-dev, vq,
   + iocb-ki_pos, 0);
   + tx_total_len += iocb-ki_nbytes;
   +
   + if (iocb-ki_dtor)
   + iocb-ki_dtor(iocb);
   +
   + kmem_cache_free(net-cache, iocb);
   + if (unlikely(tx_total_len = 

RE: [PATCH v1 2/3] Provides multiple submits and asynchronous notifications.

2010-04-05 Thread Xin, Xiaohui
Michael,
  For the write logging, do you have a function in hand that we can
  recompute the log? If that, I think I can use it to recompute the
 log info when the logging is suddenly enabled.
  For the outstanding requests, do you mean all the user buffers have
 submitted before the logging ioctl changed? That may be a lot, and
  some of them are still in NIC ring descriptors. Waiting them to be
 finished may be need some time. I think when logging ioctl changed,
  then the logging is changed just after that is also reasonable.
 
 The key point is that after the logging ioctl returns, any
 subsequent change to memory must be logged. It does not
 matter when was the request submitted, otherwise we will
 get memory corruption on migration.

 The change to memory happens when vhost_add_used_and_signal(), right?
 So after ioctl returns, just recompute the log info to the events in the 
 async queue,
 is ok. Since the ioctl and write log operations are all protected by 
 vq-mutex.
 
 Thanks
 Xiaohui

Yes, I think this will work.

Thanks, so do you have the function to recompute the log info in your hand that 
I can 
use? I have weakly remembered that you have noticed it before some time.

  Thanks
  Xiaohui
  
   drivers/vhost/net.c   |  189 
  +++--
   drivers/vhost/vhost.h |   10 +++
   2 files changed, 192 insertions(+), 7 deletions(-)
  
  diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
  index 22d5fef..2aafd90 100644
  --- a/drivers/vhost/net.c
  +++ b/drivers/vhost/net.c
  @@ -17,11 +17,13 @@
   #include linux/workqueue.h
   #include linux/rcupdate.h
   #include linux/file.h
  +#include linux/aio.h
   
   #include linux/net.h
   #include linux/if_packet.h
   #include linux/if_arp.h
   #include linux/if_tun.h
  +#include linux/mpassthru.h
   
   #include net/sock.h
   
  @@ -47,6 +49,7 @@ struct vhost_net {
  struct vhost_dev dev;
  struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
  struct vhost_poll poll[VHOST_NET_VQ_MAX];
  +   struct kmem_cache   *cache;
  /* Tells us whether we are polling a socket for TX.
   * We only do this when socket buffer fills up.
   * Protected by tx vq lock. */
  @@ -91,11 +94,88 @@ static void tx_poll_start(struct vhost_net *net, struct 
  socket *sock)
  net-tx_poll_state = VHOST_NET_POLL_STARTED;
   }
   
  +struct kiocb *notify_dequeue(struct vhost_virtqueue *vq)
  +{
  +   struct kiocb *iocb = NULL;
  +   unsigned long flags;
  +
  +   spin_lock_irqsave(vq-notify_lock, flags);
  +   if (!list_empty(vq-notifier)) {
  +   iocb = list_first_entry(vq-notifier,
  +   struct kiocb, ki_list);
  +   list_del(iocb-ki_list);
  +   }
  +   spin_unlock_irqrestore(vq-notify_lock, flags);
  +   return iocb;
  +}
  +
  +static void handle_async_rx_events_notify(struct vhost_net *net,
  +   struct vhost_virtqueue *vq)
  +{
  +   struct kiocb *iocb = NULL;
  +   struct vhost_log *vq_log = NULL;
  +   int rx_total_len = 0;
  +   int log, size;
  +
  +   if (vq-link_state != VHOST_VQ_LINK_ASYNC)
  +   return;
  +
  +   if (vq-receiver)
  +   vq-receiver(vq);
  +
  +   vq_log = unlikely(vhost_has_feature(
  +   net-dev, VHOST_F_LOG_ALL)) ? vq-log : NULL;
  +   while ((iocb = notify_dequeue(vq)) != NULL) {
  +   vhost_add_used_and_signal(net-dev, vq,
  +   iocb-ki_pos, iocb-ki_nbytes);
  +   log = (int)iocb-ki_user_data;
  +   size = iocb-ki_nbytes;
  +   rx_total_len += iocb-ki_nbytes;
  +
  +   if (iocb-ki_dtor)
  +   iocb-ki_dtor(iocb);
  +   kmem_cache_free(net-cache, iocb);
  +
  +   if (unlikely(vq_log))
  +   vhost_log_write(vq, vq_log, log, size);
  +   if (unlikely(rx_total_len = VHOST_NET_WEIGHT)) {
  +   vhost_poll_queue(vq-poll);
  +   break;
  +   }
  +   }
  +}
  +
  +static void handle_async_tx_events_notify(struct vhost_net *net,
  +   struct vhost_virtqueue *vq)
  +{
  +   struct kiocb *iocb = NULL;
  +   int tx_total_len = 0;
  +
  +   if (vq-link_state != VHOST_VQ_LINK_ASYNC)
  +   return;
  +
  +   while ((iocb = notify_dequeue(vq)) != NULL) {
  +   vhost_add_used_and_signal(net-dev, vq,
  +   iocb-ki_pos, 0);
  +   tx_total_len += iocb-ki_nbytes;
  +
  +   if (iocb-ki_dtor)
  +   iocb-ki_dtor(iocb);
  +
  +   kmem_cache_free(net-cache, iocb);
  +   if (unlikely(tx_total_len = VHOST_NET_WEIGHT)) {
  +   vhost_poll_queue(vq-poll);
  +   break;
  +   }
  +   }
  +}
  +
   /* Expects to be always run from workqueue - which acts as
* read-size critical section for our kind of RCU. */
   static void handle_tx(struct vhost_net *net)
   {
  struct vhost_virtqueue *vq = 

Re:[PATCH v1 2/3] Provides multiple submits and asynchronous notifications.

2010-04-01 Thread Xin Xiaohui
The vhost-net backend now only supports synchronous send/recv
operations. The patch provides multiple submits and asynchronous
notifications. This is needed for zero-copy case.

Signed-off-by: Xin Xiaohui xiaohui@intel.com
---

Michael,
Now, I made vhost to alloc/destroy the kiocb, and transfer it from 
sendmsg/recvmsg. I did not remove vq-receiver, since what the
callback does is related to the structures owned by mp device,
and I think isolating them from vhost is a good thing for us all.
And it will not prevent mp device to be independent of vhost 
in future. Later, when mp device can be a real device which
provides asynchronous read/write operations and not just report
proto_ops, it will use another callback function which is not
related to vhost at all.

For the write logging, do you have a function in hand that we can
recompute the log? If that, I think I can use it to recompute the
log info when the logging is suddenly enabled.
For the outstanding requests, do you mean all the user buffers have
submitted before the logging ioctl changed? That may be a lot, and
some of them are still in NIC ring descriptors. Waiting for them to be
finished may need some time. I think when the logging ioctl changed,
then the logging is changed just after that is also reasonable.

Thanks
Xiaohui

 drivers/vhost/net.c   |  189 +++--
 drivers/vhost/vhost.h |   10 +++
 2 files changed, 192 insertions(+), 7 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 22d5fef..2aafd90 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -17,11 +17,13 @@
 #include linux/workqueue.h
 #include linux/rcupdate.h
 #include linux/file.h
+#include linux/aio.h
 
 #include linux/net.h
 #include linux/if_packet.h
 #include linux/if_arp.h
 #include linux/if_tun.h
+#include linux/mpassthru.h
 
 #include net/sock.h
 
@@ -47,6 +49,7 @@ struct vhost_net {
struct vhost_dev dev;
struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
struct vhost_poll poll[VHOST_NET_VQ_MAX];
+   struct kmem_cache   *cache;
/* Tells us whether we are polling a socket for TX.
 * We only do this when socket buffer fills up.
 * Protected by tx vq lock. */
@@ -91,11 +94,88 @@ static void tx_poll_start(struct vhost_net *net, struct 
socket *sock)
net-tx_poll_state = VHOST_NET_POLL_STARTED;
 }
 
+struct kiocb *notify_dequeue(struct vhost_virtqueue *vq)
+{
+   struct kiocb *iocb = NULL;
+   unsigned long flags;
+
+   spin_lock_irqsave(vq-notify_lock, flags);
+   if (!list_empty(vq-notifier)) {
+   iocb = list_first_entry(vq-notifier,
+   struct kiocb, ki_list);
+   list_del(iocb-ki_list);
+   }
+   spin_unlock_irqrestore(vq-notify_lock, flags);
+   return iocb;
+}
+
+static void handle_async_rx_events_notify(struct vhost_net *net,
+   struct vhost_virtqueue *vq)
+{
+   struct kiocb *iocb = NULL;
+   struct vhost_log *vq_log = NULL;
+   int rx_total_len = 0;
+   int log, size;
+
+   if (vq-link_state != VHOST_VQ_LINK_ASYNC)
+   return;
+
+   if (vq-receiver)
+   vq-receiver(vq);
+
+   vq_log = unlikely(vhost_has_feature(
+   net-dev, VHOST_F_LOG_ALL)) ? vq-log : NULL;
+   while ((iocb = notify_dequeue(vq)) != NULL) {
+   vhost_add_used_and_signal(net-dev, vq,
+   iocb-ki_pos, iocb-ki_nbytes);
+   log = (int)iocb-ki_user_data;
+   size = iocb-ki_nbytes;
+   rx_total_len += iocb-ki_nbytes;
+
+   if (iocb-ki_dtor)
+   iocb-ki_dtor(iocb);
+   kmem_cache_free(net-cache, iocb);
+
+   if (unlikely(vq_log))
+   vhost_log_write(vq, vq_log, log, size);
+   if (unlikely(rx_total_len = VHOST_NET_WEIGHT)) {
+   vhost_poll_queue(vq-poll);
+   break;
+   }
+   }
+}
+
+static void handle_async_tx_events_notify(struct vhost_net *net,
+   struct vhost_virtqueue *vq)
+{
+   struct kiocb *iocb = NULL;
+   int tx_total_len = 0;
+
+   if (vq-link_state != VHOST_VQ_LINK_ASYNC)
+   return;
+
+   while ((iocb = notify_dequeue(vq)) != NULL) {
+   vhost_add_used_and_signal(net-dev, vq,
+   iocb-ki_pos, 0);
+   tx_total_len += iocb-ki_nbytes;
+
+   if (iocb-ki_dtor)
+   iocb-ki_dtor(iocb);
+
+   kmem_cache_free(net-cache, iocb);
+   if (unlikely(tx_total_len = VHOST_NET_WEIGHT)) {
+   vhost_poll_queue(vq-poll);
+   break;
+   }
+   }
+}
+
 /* Expects to be always run from workqueue - which acts as
  * read-size critical section for our kind of 

Re: [PATCH v1 2/3] Provides multiple submits and asynchronous notifications.

2010-04-01 Thread Michael S. Tsirkin
On Thu, Apr 01, 2010 at 05:14:56PM +0800, Xin Xiaohui wrote:
 The vhost-net backend now only supports synchronous send/recv
 operations. The patch provides multiple submits and asynchronous
 notifications. This is needed for zero-copy case.
 
 Signed-off-by: Xin Xiaohui xiaohui@intel.com
 ---
 
 Michael,
 Now, I made vhost to alloc/destroy the kiocb, and transfer it from 
 sendmsg/recvmsg. I did not remove vq-receiver, since what the
 callback does is related to the structures owned by mp device,
 and I think isolation them to vhost is a good thing to us all.
 And it will not prevent mp device to be independent of vhost 
 in future. Later, when mp device can be a real device which
 provides asynchronous read/write operations and not just report
 proto_ops, it will use another callback function which is not
 related to vhost at all.

Thanks, I'll look at the code!

 For the write logging, do you have a function in hand that we can
 recompute the log? If that, I think I can use it to recompute the
 log info when the logging is suddenly enabled.
 For the outstanding requests, do you mean all the user buffers have
 submitted before the logging ioctl changed? That may be a lot, and
 some of them are still in NIC ring descriptors. Waiting them to be
 finished may be need some time. I think when logging ioctl changed,
 then the logging is changed just after that is also reasonable.

The key point is that after the logging ioctl returns, any
subsequent change to memory must be logged. It does not
matter when was the request submitted, otherwise we will
get memory corruption on migration.

 Thanks
 Xiaohui
 
  drivers/vhost/net.c   |  189 
 +++--
  drivers/vhost/vhost.h |   10 +++
  2 files changed, 192 insertions(+), 7 deletions(-)
 
 diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
 index 22d5fef..2aafd90 100644
 --- a/drivers/vhost/net.c
 +++ b/drivers/vhost/net.c
 @@ -17,11 +17,13 @@
  #include linux/workqueue.h
  #include linux/rcupdate.h
  #include linux/file.h
 +#include linux/aio.h
  
  #include linux/net.h
  #include linux/if_packet.h
  #include linux/if_arp.h
  #include linux/if_tun.h
 +#include linux/mpassthru.h
  
  #include net/sock.h
  
 @@ -47,6 +49,7 @@ struct vhost_net {
   struct vhost_dev dev;
   struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
   struct vhost_poll poll[VHOST_NET_VQ_MAX];
 + struct kmem_cache   *cache;
   /* Tells us whether we are polling a socket for TX.
* We only do this when socket buffer fills up.
* Protected by tx vq lock. */
 @@ -91,11 +94,88 @@ static void tx_poll_start(struct vhost_net *net, struct 
 socket *sock)
   net-tx_poll_state = VHOST_NET_POLL_STARTED;
  }
  
 +struct kiocb *notify_dequeue(struct vhost_virtqueue *vq)
 +{
 + struct kiocb *iocb = NULL;
 + unsigned long flags;
 +
 + spin_lock_irqsave(vq-notify_lock, flags);
 + if (!list_empty(vq-notifier)) {
 + iocb = list_first_entry(vq-notifier,
 + struct kiocb, ki_list);
 + list_del(iocb-ki_list);
 + }
 + spin_unlock_irqrestore(vq-notify_lock, flags);
 + return iocb;
 +}
 +
 +static void handle_async_rx_events_notify(struct vhost_net *net,
 + struct vhost_virtqueue *vq)
 +{
 + struct kiocb *iocb = NULL;
 + struct vhost_log *vq_log = NULL;
 + int rx_total_len = 0;
 + int log, size;
 +
 + if (vq-link_state != VHOST_VQ_LINK_ASYNC)
 + return;
 +
 + if (vq-receiver)
 + vq-receiver(vq);
 +
 + vq_log = unlikely(vhost_has_feature(
 + net-dev, VHOST_F_LOG_ALL)) ? vq-log : NULL;
 + while ((iocb = notify_dequeue(vq)) != NULL) {
 + vhost_add_used_and_signal(net-dev, vq,
 + iocb-ki_pos, iocb-ki_nbytes);
 + log = (int)iocb-ki_user_data;
 + size = iocb-ki_nbytes;
 + rx_total_len += iocb-ki_nbytes;
 +
 + if (iocb-ki_dtor)
 + iocb-ki_dtor(iocb);
 + kmem_cache_free(net-cache, iocb);
 +
 + if (unlikely(vq_log))
 + vhost_log_write(vq, vq_log, log, size);
 + if (unlikely(rx_total_len = VHOST_NET_WEIGHT)) {
 + vhost_poll_queue(vq-poll);
 + break;
 + }
 + }
 +}
 +
 +static void handle_async_tx_events_notify(struct vhost_net *net,
 + struct vhost_virtqueue *vq)
 +{
 + struct kiocb *iocb = NULL;
 + int tx_total_len = 0;
 +
 + if (vq-link_state != VHOST_VQ_LINK_ASYNC)
 + return;
 +
 + while ((iocb = notify_dequeue(vq)) != NULL) {
 + vhost_add_used_and_signal(net-dev, vq,
 + iocb-ki_pos, 0);
 + tx_total_len += iocb-ki_nbytes;
 +
 + if (iocb-ki_dtor)
 + iocb-ki_dtor(iocb);
 +
 + 

RE: [PATCH v1 2/3] Provides multiple submits and asynchronous notifications.

2010-04-01 Thread Xin, Xiaohui

 For the write logging, do you have a function in hand that we can
 recompute the log? If that, I think I can use it to recompute the
log info when the logging is suddenly enabled.
 For the outstanding requests, do you mean all the user buffers have
submitted before the logging ioctl changed? That may be a lot, and
 some of them are still in NIC ring descriptors. Waiting them to be
finished may be need some time. I think when logging ioctl changed,
 then the logging is changed just after that is also reasonable.

The key point is that after the logging ioctl returns, any
subsequent change to memory must be logged. It does not
matter when was the request submitted, otherwise we will
get memory corruption on migration.

The change to memory happens when vhost_add_used_and_signal(), right?
So after ioctl returns, just recompute the log info to the events in the async 
queue,
is ok. Since the ioctl and write log operations are all protected by vq-mutex.

Thanks
Xiaohui

 Thanks
 Xiaohui
 
  drivers/vhost/net.c   |  189 
 +++--
  drivers/vhost/vhost.h |   10 +++
  2 files changed, 192 insertions(+), 7 deletions(-)
 
 diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
 index 22d5fef..2aafd90 100644
 --- a/drivers/vhost/net.c
 +++ b/drivers/vhost/net.c
 @@ -17,11 +17,13 @@
  #include linux/workqueue.h
  #include linux/rcupdate.h
  #include linux/file.h
 +#include linux/aio.h
  
  #include linux/net.h
  #include linux/if_packet.h
  #include linux/if_arp.h
  #include linux/if_tun.h
 +#include linux/mpassthru.h
  
  #include net/sock.h
  
 @@ -47,6 +49,7 @@ struct vhost_net {
   struct vhost_dev dev;
   struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
   struct vhost_poll poll[VHOST_NET_VQ_MAX];
 + struct kmem_cache   *cache;
   /* Tells us whether we are polling a socket for TX.
* We only do this when socket buffer fills up.
* Protected by tx vq lock. */
 @@ -91,11 +94,88 @@ static void tx_poll_start(struct vhost_net *net, struct 
 socket *sock)
   net-tx_poll_state = VHOST_NET_POLL_STARTED;
  }
  
 +struct kiocb *notify_dequeue(struct vhost_virtqueue *vq)
 +{
 + struct kiocb *iocb = NULL;
 + unsigned long flags;
 +
 + spin_lock_irqsave(vq-notify_lock, flags);
 + if (!list_empty(vq-notifier)) {
 + iocb = list_first_entry(vq-notifier,
 + struct kiocb, ki_list);
 + list_del(iocb-ki_list);
 + }
 + spin_unlock_irqrestore(vq-notify_lock, flags);
 + return iocb;
 +}
 +
 +static void handle_async_rx_events_notify(struct vhost_net *net,
 + struct vhost_virtqueue *vq)
 +{
 + struct kiocb *iocb = NULL;
 + struct vhost_log *vq_log = NULL;
 + int rx_total_len = 0;
 + int log, size;
 +
 + if (vq-link_state != VHOST_VQ_LINK_ASYNC)
 + return;
 +
 + if (vq-receiver)
 + vq-receiver(vq);
 +
 + vq_log = unlikely(vhost_has_feature(
 + net-dev, VHOST_F_LOG_ALL)) ? vq-log : NULL;
 + while ((iocb = notify_dequeue(vq)) != NULL) {
 + vhost_add_used_and_signal(net-dev, vq,
 + iocb-ki_pos, iocb-ki_nbytes);
 + log = (int)iocb-ki_user_data;
 + size = iocb-ki_nbytes;
 + rx_total_len += iocb-ki_nbytes;
 +
 + if (iocb-ki_dtor)
 + iocb-ki_dtor(iocb);
 + kmem_cache_free(net-cache, iocb);
 +
 + if (unlikely(vq_log))
 + vhost_log_write(vq, vq_log, log, size);
 + if (unlikely(rx_total_len = VHOST_NET_WEIGHT)) {
 + vhost_poll_queue(vq-poll);
 + break;
 + }
 + }
 +}
 +
 +static void handle_async_tx_events_notify(struct vhost_net *net,
 + struct vhost_virtqueue *vq)
 +{
 + struct kiocb *iocb = NULL;
 + int tx_total_len = 0;
 +
 + if (vq-link_state != VHOST_VQ_LINK_ASYNC)
 + return;
 +
 + while ((iocb = notify_dequeue(vq)) != NULL) {
 + vhost_add_used_and_signal(net-dev, vq,
 + iocb-ki_pos, 0);
 + tx_total_len += iocb-ki_nbytes;
 +
 + if (iocb-ki_dtor)
 + iocb-ki_dtor(iocb);
 +
 + kmem_cache_free(net-cache, iocb);
 + if (unlikely(tx_total_len = VHOST_NET_WEIGHT)) {
 + vhost_poll_queue(vq-poll);
 + break;
 + }
 + }
 +}
 +
  /* Expects to be always run from workqueue - which acts as
   * read-size critical section for our kind of RCU. */
  static void handle_tx(struct vhost_net *net)
  {
   struct vhost_virtqueue *vq = net-dev.vqs[VHOST_NET_VQ_TX];
 + struct kiocb *iocb = NULL;
   unsigned head, out, in, s;
   struct msghdr msg = {
   .msg_name = NULL,
 @@ -124,6 +204,8 @@ static void handle_tx(struct 

RE: [PATCH v1 2/3] Provides multiple submits and asynchronous notifications.

2010-03-17 Thread Xin, Xiaohui
 Michael,
 I don't use the kiocb that comes from the sendmsg/recvmsg,
 since I have embedded the kiocb in the page_info structure,
 and allocate it when page_info allocated.

So what I suggested was that vhost allocates and tracks the iocbs, and
passes them to your device with sendmsg/ recvmsg calls. This way your
device won't need to share structures and locking strategy with vhost:
you get an iocb, handle it, invoke a callback to notify vhost about
completion.

This also gets rid of the 'receiver' callback

I'm not sure receiver callback can be removed here:
The patch describes a work flow like this:
netif_receive_skb() gets the packet, it does nothing but just queue the skb
and wakeup the handle_rx() of vhost. handle_rx() then calls the receiver 
callback
to deal with the skb and get the necessary notify info into a list, vhost owns 
the 
list and in the same handle_rx() context use it to complete.

We use the receiver callback here because only handle_rx() is woken up from
netif_receive_skb(), and we need mp device context to deal with the skb and
notify info attached to it. We also have some lock in the callback function.

If I remove the receiver callback, I can only deal with the skb and notify
info in netif_receive_skb(), but this function is in an interrupt context,
which I think lock is not allowed there. But I cannot remove the lock there.


 Please have a review and thanks for the instruction
 for replying email which helps me a lot.
 
 Thanks,
 Xiaohui
 
  drivers/vhost/net.c   |  159 
  +++--
  drivers/vhost/vhost.h |   12 
  2 files changed, 166 insertions(+), 5 deletions(-)
 
 diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
 index 22d5fef..5483848 100644
 --- a/drivers/vhost/net.c
 +++ b/drivers/vhost/net.c
 @@ -17,11 +17,13 @@
  #include linux/workqueue.h
  #include linux/rcupdate.h
  #include linux/file.h
 +#include linux/aio.h
  
  #include linux/net.h
  #include linux/if_packet.h
  #include linux/if_arp.h
  #include linux/if_tun.h
 +#include linux/mpassthru.h
  
  #include net/sock.h
  
 @@ -91,6 +93,12 @@ static void tx_poll_start(struct vhost_net *net, struct 
 socket *sock)
  net-tx_poll_state = VHOST_NET_POLL_STARTED;
  }
  
 +static void handle_async_rx_events_notify(struct vhost_net *net,
 +struct vhost_virtqueue *vq);
 +
 +static void handle_async_tx_events_notify(struct vhost_net *net,
 +struct vhost_virtqueue *vq);
 +

A couple of style comments:

- It's better to arrange functions in such order that forward declarations
aren't necessary.  Since we don't have recursion, this should always be
possible.

- continuation lines should be indented at least at the position of '('
on the previous line.

Thanks. I'd correct that.

  /* Expects to be always run from workqueue - which acts as
   * read-size critical section for our kind of RCU. */
  static void handle_tx(struct vhost_net *net)
 @@ -124,6 +132,8 @@ static void handle_tx(struct vhost_net *net)
  tx_poll_stop(net);
  hdr_size = vq-hdr_size;
  
 +handle_async_tx_events_notify(net, vq);
 +
  for (;;) {
  head = vhost_get_vq_desc(net-dev, vq, vq-iov,
   ARRAY_SIZE(vq-iov),
 @@ -151,6 +161,12 @@ static void handle_tx(struct vhost_net *net)
  /* Skip header. TODO: support TSO. */
  s = move_iovec_hdr(vq-iov, vq-hdr, hdr_size, out);
  msg.msg_iovlen = out;
 +
 +if (vq-link_state == VHOST_VQ_LINK_ASYNC) {
 +vq-head = head;
 +msg.msg_control = (void *)vq;

So here a device gets a pointer to vhost_virtqueue structure. If it gets
an iocb and invokes a callback, it would not care about vhost internals.

 +}
 +
  len = iov_length(vq-iov, out);
  /* Sanity check */
  if (!len) {
 @@ -166,6 +182,10 @@ static void handle_tx(struct vhost_net *net)
  tx_poll_start(net, sock);
  break;
  }
 +
 +if (vq-link_state == VHOST_VQ_LINK_ASYNC)
 +continue;
+
  if (err != len)
  pr_err(Truncated TX packet: 
  len %d != %zd\n, err, len);
 @@ -177,6 +197,8 @@ static void handle_tx(struct vhost_net *net)
  }
  }
  
 +handle_async_tx_events_notify(net, vq);
 +
  mutex_unlock(vq-mutex);
  unuse_mm(net-dev.mm);
  }
@@ -206,7 +228,8 @@ static void handle_rx(struct vhost_net *net)
  int err;
  size_t hdr_size;
  struct socket *sock = rcu_dereference(vq-private_data);
 -if (!sock || skb_queue_empty(sock-sk-sk_receive_queue))
 +if (!sock || (skb_queue_empty(sock-sk-sk_receive_queue) 
 +vq-link_state == VHOST_VQ_LINK_SYNC))
  return;
  
  use_mm(net-dev.mm);
 @@ -214,9 +237,18 @@ static void handle_rx(struct vhost_net 

Re: [PATCH v1 2/3] Provides multiple submits and asynchronous notifications.

2010-03-17 Thread Michael S. Tsirkin
On Wed, Mar 17, 2010 at 05:48:10PM +0800, Xin, Xiaohui wrote:
  Michael,
  I don't use the kiocb that comes from the sendmsg/recvmsg,
  since I have embedded the kiocb in the page_info structure,
  and allocate it when page_info allocated.
 
 So what I suggested was that vhost allocates and tracks the iocbs, and
 passes them to your device with sendmsg/ recvmsg calls. This way your
 device won't need to share structures and locking strategy with vhost:
 you get an iocb, handle it, invoke a callback to notify vhost about
 completion.
 
 This also gets rid of the 'receiver' callback
 
 I'm not sure receiver callback can be removed here:
 The patch describes a work flow like this:
 netif_receive_skb() gets the packet, it does nothing but just queue the skb
 and wakeup the handle_rx() of vhost. handle_rx() then calls the receiver 
 callback
 to deal with the skb and get the necessary notify info into a list, vhost 
 owns the 
 list and in the same handle_rx() context use it to complete.
 
 We use the receiver callback here because only handle_rx() is woken up from
 netif_receive_skb(), and we need mp device context to deal with the skb and
 notify info attached to it. We also have some lock in the callback function.
 
 If I remove the receiver callback, I can only deal with the skb and notify
 info in netif_receive_skb(), but this function is in an interrupt context,
 which I think lock is not allowed there. But I cannot remove the lock there.
 

The basic idea is that vhost passes iocb to recvmsg and backend
completes the iocb to signal that data is ready. That completion could
be in interrupt context and so we need to switch to workqueue to handle
the event, it is true, but the code to do this would live in vhost.c or
net.c.

With this structure your device won't depend on
vhost, and can go under drivers/net/, opening up possibility
to use it for zero copy without vhost in the future.



  Please have a review and thanks for the instruction
  for replying email which helps me a lot.
  
  Thanks,
  Xiaohui
  
   drivers/vhost/net.c   |  159 
   +++--
   drivers/vhost/vhost.h |   12 
   2 files changed, 166 insertions(+), 5 deletions(-)
  
  diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
  index 22d5fef..5483848 100644
  --- a/drivers/vhost/net.c
  +++ b/drivers/vhost/net.c
  @@ -17,11 +17,13 @@
   #include linux/workqueue.h
   #include linux/rcupdate.h
   #include linux/file.h
  +#include linux/aio.h
   
   #include linux/net.h
   #include linux/if_packet.h
   #include linux/if_arp.h
   #include linux/if_tun.h
  +#include linux/mpassthru.h
   
   #include net/sock.h
   
  @@ -91,6 +93,12 @@ static void tx_poll_start(struct vhost_net *net, struct 
  socket *sock)
 net-tx_poll_state = VHOST_NET_POLL_STARTED;
   }
   
  +static void handle_async_rx_events_notify(struct vhost_net *net,
  +  struct vhost_virtqueue *vq);
  +
  +static void handle_async_tx_events_notify(struct vhost_net *net,
  +  struct vhost_virtqueue *vq);
  +
 
 A couple of style comments:
 
 - It's better to arrange functions in such order that forward declarations
 aren't necessary.  Since we don't have recursion, this should always be
 possible.
 
 - continuation lines should be indented at least at the position of '('
 on the previous line.
 
 Thanks. I'd correct that.
 
   /* Expects to be always run from workqueue - which acts as
* read-size critical section for our kind of RCU. */
   static void handle_tx(struct vhost_net *net)
  @@ -124,6 +132,8 @@ static void handle_tx(struct vhost_net *net)
 tx_poll_stop(net);
 hdr_size = vq-hdr_size;
   
  +  handle_async_tx_events_notify(net, vq);
  +
 for (;;) {
 head = vhost_get_vq_desc(net-dev, vq, vq-iov,
  ARRAY_SIZE(vq-iov),
  @@ -151,6 +161,12 @@ static void handle_tx(struct vhost_net *net)
 /* Skip header. TODO: support TSO. */
 s = move_iovec_hdr(vq-iov, vq-hdr, hdr_size, out);
 msg.msg_iovlen = out;
  +
  +  if (vq-link_state == VHOST_VQ_LINK_ASYNC) {
  +  vq-head = head;
  +  msg.msg_control = (void *)vq;
 
 So here a device gets a pointer to vhost_virtqueue structure. If it gets
 an iocb and invokes a callback, it would not care about vhost internals.
 
  +  }
  +
 len = iov_length(vq-iov, out);
 /* Sanity check */
 if (!len) {
  @@ -166,6 +182,10 @@ static void handle_tx(struct vhost_net *net)
 tx_poll_start(net, sock);
 break;
 }
  +
  +  if (vq-link_state == VHOST_VQ_LINK_ASYNC)
  +  continue;
 +
 if (err != len)
 pr_err(Truncated TX packet: 
 len %d != %zd\n, err, len);
  @@ -177,6 +197,8 @@ static void handle_tx(struct vhost_net *net)
 }
  

Re:[PATCH v1 2/3] Provides multiple submits and asynchronous notifications.

2010-03-16 Thread Xin Xiaohui
The vhost-net backend now only supports synchronous send/recv
operations. The patch provides multiple submits and asynchronous
notifications. This is needed for zero-copy case.

Signed-off-by: Xin Xiaohui xiaohui@intel.com
---

Michael,
I don't use the kiocb that comes from the sendmsg/recvmsg,
since I have embedded the kiocb in the page_info structure,
and allocate it when page_info allocated.
Please have a review and thanks for the instruction
for replying email which helps me a lot.

Thanks,
Xiaohui

 drivers/vhost/net.c   |  159 +++--
 drivers/vhost/vhost.h |   12 
 2 files changed, 166 insertions(+), 5 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 22d5fef..5483848 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -17,11 +17,13 @@
 #include linux/workqueue.h
 #include linux/rcupdate.h
 #include linux/file.h
+#include linux/aio.h
 
 #include linux/net.h
 #include linux/if_packet.h
 #include linux/if_arp.h
 #include linux/if_tun.h
+#include linux/mpassthru.h
 
 #include net/sock.h
 
@@ -91,6 +93,12 @@ static void tx_poll_start(struct vhost_net *net, struct 
socket *sock)
net-tx_poll_state = VHOST_NET_POLL_STARTED;
 }
 
+static void handle_async_rx_events_notify(struct vhost_net *net,
+   struct vhost_virtqueue *vq);
+
+static void handle_async_tx_events_notify(struct vhost_net *net,
+   struct vhost_virtqueue *vq);
+
 /* Expects to be always run from workqueue - which acts as
  * read-size critical section for our kind of RCU. */
 static void handle_tx(struct vhost_net *net)
@@ -124,6 +132,8 @@ static void handle_tx(struct vhost_net *net)
tx_poll_stop(net);
hdr_size = vq-hdr_size;
 
+   handle_async_tx_events_notify(net, vq);
+
for (;;) {
head = vhost_get_vq_desc(net-dev, vq, vq-iov,
 ARRAY_SIZE(vq-iov),
@@ -151,6 +161,12 @@ static void handle_tx(struct vhost_net *net)
/* Skip header. TODO: support TSO. */
s = move_iovec_hdr(vq-iov, vq-hdr, hdr_size, out);
msg.msg_iovlen = out;
+
+   if (vq-link_state == VHOST_VQ_LINK_ASYNC) {
+   vq-head = head;
+   msg.msg_control = (void *)vq;
+   }
+
len = iov_length(vq-iov, out);
/* Sanity check */
if (!len) {
@@ -166,6 +182,10 @@ static void handle_tx(struct vhost_net *net)
tx_poll_start(net, sock);
break;
}
+
+   if (vq-link_state == VHOST_VQ_LINK_ASYNC)
+   continue;
+
if (err != len)
pr_err(Truncated TX packet: 
len %d != %zd\n, err, len);
@@ -177,6 +197,8 @@ static void handle_tx(struct vhost_net *net)
}
}
 
+   handle_async_tx_events_notify(net, vq);
+
mutex_unlock(vq-mutex);
unuse_mm(net-dev.mm);
 }
@@ -206,7 +228,8 @@ static void handle_rx(struct vhost_net *net)
int err;
size_t hdr_size;
struct socket *sock = rcu_dereference(vq-private_data);
-   if (!sock || skb_queue_empty(sock-sk-sk_receive_queue))
+   if (!sock || (skb_queue_empty(sock-sk-sk_receive_queue) 
+   vq-link_state == VHOST_VQ_LINK_SYNC))
return;
 
use_mm(net-dev.mm);
@@ -214,9 +237,18 @@ static void handle_rx(struct vhost_net *net)
vhost_disable_notify(vq);
hdr_size = vq-hdr_size;
 
-   vq_log = unlikely(vhost_has_feature(net-dev, VHOST_F_LOG_ALL)) ?
+   /* In async cases, for write logging, the simple way is to get
+* the log info always, and really logging is decided later.
+* Thus, when logging enabled, we can get log, and when logging
+* disabled, we can get log disabled accordingly.
+*/
+
+   vq_log = unlikely(vhost_has_feature(net-dev, VHOST_F_LOG_ALL)) |
+   (vq-link_state == VHOST_VQ_LINK_ASYNC) ?
vq-log : NULL;
 
+   handle_async_rx_events_notify(net, vq);
+
for (;;) {
head = vhost_get_vq_desc(net-dev, vq, vq-iov,
 ARRAY_SIZE(vq-iov),
@@ -245,6 +277,11 @@ static void handle_rx(struct vhost_net *net)
s = move_iovec_hdr(vq-iov, vq-hdr, hdr_size, in);
msg.msg_iovlen = in;
len = iov_length(vq-iov, in);
+   if (vq-link_state == VHOST_VQ_LINK_ASYNC) {
+   vq-head = head;
+   vq-_log = log;
+   msg.msg_control = (void *)vq;
+   }
/* Sanity check */
if (!len) {
vq_err(vq, Unexpected header len for RX: 
@@ -259,6 +296,10 @@ static void handle_rx(struct vhost_net 

Re: [PATCH v1 2/3] Provides multiple submits and asynchronous notifications.

2010-03-16 Thread Michael S. Tsirkin
On Tue, Mar 16, 2010 at 05:32:17PM +0800, Xin Xiaohui wrote:
 The vhost-net backend now only supports synchronous send/recv
 operations. The patch provides multiple submits and asynchronous
 notifications. This is needed for zero-copy case.
 
 Signed-off-by: Xin Xiaohui xiaohui@intel.com
 ---
 
 Michael,
 I don't use the kiocb that comes from the sendmsg/recvmsg,
 since I have embedded the kiocb in the page_info structure,
 and allocate it when page_info allocated.

So what I suggested was that vhost allocates and tracks the iocbs, and
passes them to your device with sendmsg/ recvmsg calls. This way your
device won't need to share structures and locking strategy with vhost:
you get an iocb, handle it, invoke a callback to notify vhost about
completion.

This also gets rid of the 'receiver' callback.

 Please have a review and thanks for the instruction
 for replying email which helps me a lot.
 
 Thanks,
 Xiaohui
 
  drivers/vhost/net.c   |  159 
 +++--
  drivers/vhost/vhost.h |   12 
  2 files changed, 166 insertions(+), 5 deletions(-)
 
 diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
 index 22d5fef..5483848 100644
 --- a/drivers/vhost/net.c
 +++ b/drivers/vhost/net.c
 @@ -17,11 +17,13 @@
  #include linux/workqueue.h
  #include linux/rcupdate.h
  #include linux/file.h
 +#include linux/aio.h
  
  #include linux/net.h
  #include linux/if_packet.h
  #include linux/if_arp.h
  #include linux/if_tun.h
 +#include linux/mpassthru.h
  
  #include net/sock.h
  
 @@ -91,6 +93,12 @@ static void tx_poll_start(struct vhost_net *net, struct 
 socket *sock)
   net-tx_poll_state = VHOST_NET_POLL_STARTED;
  }
  
 +static void handle_async_rx_events_notify(struct vhost_net *net,
 + struct vhost_virtqueue *vq);
 +
 +static void handle_async_tx_events_notify(struct vhost_net *net,
 + struct vhost_virtqueue *vq);
 +

A couple of style comments:

- It's better to arrange functions in such order that forward declarations
aren't necessary.  Since we don't have recursion, this should always be
possible.

- continuation lines should be indented at least at the position of '('
on the previous line.

  /* Expects to be always run from workqueue - which acts as
   * read-size critical section for our kind of RCU. */
  static void handle_tx(struct vhost_net *net)
 @@ -124,6 +132,8 @@ static void handle_tx(struct vhost_net *net)
   tx_poll_stop(net);
   hdr_size = vq-hdr_size;
  
 + handle_async_tx_events_notify(net, vq);
 +
   for (;;) {
   head = vhost_get_vq_desc(net-dev, vq, vq-iov,
ARRAY_SIZE(vq-iov),
 @@ -151,6 +161,12 @@ static void handle_tx(struct vhost_net *net)
   /* Skip header. TODO: support TSO. */
   s = move_iovec_hdr(vq-iov, vq-hdr, hdr_size, out);
   msg.msg_iovlen = out;
 +
 + if (vq-link_state == VHOST_VQ_LINK_ASYNC) {
 + vq-head = head;
 + msg.msg_control = (void *)vq;

So here a device gets a pointer to vhost_virtqueue structure. If it gets
an iocb and invokes a callback, it would not care about vhost internals.

 + }
 +
   len = iov_length(vq-iov, out);
   /* Sanity check */
   if (!len) {
 @@ -166,6 +182,10 @@ static void handle_tx(struct vhost_net *net)
   tx_poll_start(net, sock);
   break;
   }
 +
 + if (vq-link_state == VHOST_VQ_LINK_ASYNC)
 + continue;
 +
   if (err != len)
   pr_err(Truncated TX packet: 
   len %d != %zd\n, err, len);
 @@ -177,6 +197,8 @@ static void handle_tx(struct vhost_net *net)
   }
   }
  
 + handle_async_tx_events_notify(net, vq);
 +
   mutex_unlock(vq-mutex);
   unuse_mm(net-dev.mm);
  }
 @@ -206,7 +228,8 @@ static void handle_rx(struct vhost_net *net)
   int err;
   size_t hdr_size;
   struct socket *sock = rcu_dereference(vq-private_data);
 - if (!sock || skb_queue_empty(sock-sk-sk_receive_queue))
 + if (!sock || (skb_queue_empty(sock-sk-sk_receive_queue) 
 + vq-link_state == VHOST_VQ_LINK_SYNC))
   return;
  
   use_mm(net-dev.mm);
 @@ -214,9 +237,18 @@ static void handle_rx(struct vhost_net *net)
   vhost_disable_notify(vq);
   hdr_size = vq-hdr_size;
  
 - vq_log = unlikely(vhost_has_feature(net-dev, VHOST_F_LOG_ALL)) ?
 + /* In async cases, for write logging, the simple way is to get
 +  * the log info always, and really logging is decided later.
 +  * Thus, when logging enabled, we can get log, and when logging
 +  * disabled, we can get log disabled accordingly.
 +  */
 +


This adds overhead and might be one of the reasons
your patch does not perform that well. A better way
would be to 

RE: [PATCH v1 2/3] Provides multiple submits and asynchronous notifications.

2010-03-15 Thread Xin, Xiaohui
 +/* The structure to notify the virtqueue for async socket */
 +struct vhost_notifier {
 +struct list_head list;
 +struct vhost_virtqueue *vq;
 +int head;
 +int size;
 +int log;
 +void *ctrl;
 +void (*dtor)(struct vhost_notifier *);
 +};
 +

So IMO, this is not the best interface between vhost
and your driver, exposing them to each other unnecessarily.

If you think about it, your driver should not care about this structure.
It could get e.g. a kiocb (sendmsg already gets one), and call ki_dtor
on completion.  vhost could save it's state in ki_user_data.  If your
driver needs to add more data to do more tracking, I think it can put
skb pointer in the private pointer.

Then if I remove the struct vhost_notifier, and just use struct kiocb, but 
don't use the one got from sendmsg or recvmsg, but allocated within the 
page_info structure, and don't implement any aio logic related to it, is that 
ok?

Sorry, I made a patch, but don't know how to reply mail with a good formatted 
patch here

Thanks
Xiaohui
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v1 2/3] Provides multiple submits and asynchronous notifications.

2010-03-15 Thread Michael S. Tsirkin
On Mon, Mar 15, 2010 at 04:46:50PM +0800, Xin, Xiaohui wrote:
  +/* The structure to notify the virtqueue for async socket */
  +struct vhost_notifier {
  +  struct list_head list;
  +  struct vhost_virtqueue *vq;
  +  int head;
  +  int size;
  +  int log;
  +  void *ctrl;
  +  void (*dtor)(struct vhost_notifier *);
  +};
  +
 
 So IMO, this is not the best interface between vhost
 and your driver, exposing them to each other unnecessarily.
 
 If you think about it, your driver should not care about this structure.
 It could get e.g. a kiocb (sendmsg already gets one), and call ki_dtor
 on completion.  vhost could save it's state in ki_user_data.  If your
 driver needs to add more data to do more tracking, I think it can put
 skb pointer in the private pointer.
 
 Then if I remove the struct vhost_notifier, and just use struct kiocb, but 
 don't use the one got from sendmsg or recvmsg, but allocated within the 
 page_info structure, and don't implement any aio logic related to it, is that 
 ok?

Hmm, not sure I understand.  It seems both cleaner and easier to use the
iocb passed to sendmsg/recvmsg. No? I am not saying you necessarily must
implement full aio directly.

 Sorry, I made a patch, but don't know how to reply mail with a good formatted 
 patch here
 
 Thanks
 Xiaohui

Maybe Documentation/email-clients.txt will help?
Generally you do it like this (at start of mail):

Subject: one line patch summary (overrides mail subject)

multilie patch description

Signed-off-by: ...

---

Free text comes after the --- delimiter, before the patch.

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index a140dad..e830b30 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -106,22 +106,41 @@ static void handle_tx(struct vhost_net *net)




-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v1 2/3] Provides multiple submits and asynchronous notifications.

2010-03-07 Thread Michael S. Tsirkin
 +/* The structure to notify the virtqueue for async socket */
 +struct vhost_notifier {
 + struct list_head list;
 + struct vhost_virtqueue *vq;
 + int head;
 + int size;
 + int log;
 + void *ctrl;
 + void (*dtor)(struct vhost_notifier *);
 +};
 +

So IMO, this is not the best interface between vhost
and your driver, exposing them to each other unnecessarily.

If you think about it, your driver should not care about this structure.
It could get e.g. a kiocb (sendmsg already gets one), and call ki_dtor
on completion.  vhost could save it's state in ki_user_data.  If your
driver needs to add more data to do more tracking, I think it can put
skb pointer in the private pointer.

-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v1 2/3] Provides multiple submits and asynchronous notifications.

2010-03-06 Thread xiaohui . xin
From: Xin Xiaohui xiaohui@intel.com

The vhost-net backend now only supports synchronous send/recv
operations. The patch provides multiple submits and asynchronous
notifications. This is needed for zero-copy case.

Signed-off-by: Xin Xiaohui xiaohui@intel.com
---
 drivers/vhost/net.c   |  156 +++--
 drivers/vhost/vhost.h |   23 +++
 2 files changed, 174 insertions(+), 5 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 22d5fef..24a6c3d 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -22,6 +22,7 @@
 #include linux/if_packet.h
 #include linux/if_arp.h
 #include linux/if_tun.h
+#include linux/mpassthru.h
 
 #include net/sock.h
 
@@ -91,6 +92,12 @@ static void tx_poll_start(struct vhost_net *net, struct 
socket *sock)
net-tx_poll_state = VHOST_NET_POLL_STARTED;
 }
 
+static void handle_async_rx_events_notify(struct vhost_net *net,
+   struct vhost_virtqueue *vq);
+
+static void handle_async_tx_events_notify(struct vhost_net *net,
+   struct vhost_virtqueue *vq);
+
 /* Expects to be always run from workqueue - which acts as
  * read-size critical section for our kind of RCU. */
 static void handle_tx(struct vhost_net *net)
@@ -124,6 +131,8 @@ static void handle_tx(struct vhost_net *net)
tx_poll_stop(net);
hdr_size = vq-hdr_size;
 
+   handle_async_tx_events_notify(net, vq);
+
for (;;) {
head = vhost_get_vq_desc(net-dev, vq, vq-iov,
 ARRAY_SIZE(vq-iov),
@@ -151,6 +160,12 @@ static void handle_tx(struct vhost_net *net)
/* Skip header. TODO: support TSO. */
s = move_iovec_hdr(vq-iov, vq-hdr, hdr_size, out);
msg.msg_iovlen = out;
+
+   if (vq-link_state == VHOST_VQ_LINK_ASYNC) {
+   vq-head = head;
+   msg.msg_control = (void *)vq;
+   }
+
len = iov_length(vq-iov, out);
/* Sanity check */
if (!len) {
@@ -166,6 +181,10 @@ static void handle_tx(struct vhost_net *net)
tx_poll_start(net, sock);
break;
}
+
+   if (vq-link_state == VHOST_VQ_LINK_ASYNC)
+   continue;
+
if (err != len)
pr_err(Truncated TX packet: 
len %d != %zd\n, err, len);
@@ -177,6 +196,8 @@ static void handle_tx(struct vhost_net *net)
}
}
 
+   handle_async_tx_events_notify(net, vq);
+
mutex_unlock(vq-mutex);
unuse_mm(net-dev.mm);
 }
@@ -206,7 +227,8 @@ static void handle_rx(struct vhost_net *net)
int err;
size_t hdr_size;
struct socket *sock = rcu_dereference(vq-private_data);
-   if (!sock || skb_queue_empty(sock-sk-sk_receive_queue))
+   if (!sock || (skb_queue_empty(sock-sk-sk_receive_queue) 
+   vq-link_state == VHOST_VQ_LINK_SYNC))
return;
 
use_mm(net-dev.mm);
@@ -214,9 +236,18 @@ static void handle_rx(struct vhost_net *net)
vhost_disable_notify(vq);
hdr_size = vq-hdr_size;
 
-   vq_log = unlikely(vhost_has_feature(net-dev, VHOST_F_LOG_ALL)) ?
+   /* In async cases, for write logging, the simple way is to get
+* the log info always, and really logging is decided later.
+* Thus, when logging enabled, we can get log, and when logging
+* disabled, we can get log disabled accordingly.
+*/
+
+   vq_log = unlikely(vhost_has_feature(net-dev, VHOST_F_LOG_ALL)) |
+   (vq-link_state == VHOST_VQ_LINK_ASYNC) ?
vq-log : NULL;
 
+   handle_async_rx_events_notify(net, vq);
+
for (;;) {
head = vhost_get_vq_desc(net-dev, vq, vq-iov,
 ARRAY_SIZE(vq-iov),
@@ -245,6 +276,11 @@ static void handle_rx(struct vhost_net *net)
s = move_iovec_hdr(vq-iov, vq-hdr, hdr_size, in);
msg.msg_iovlen = in;
len = iov_length(vq-iov, in);
+   if (vq-link_state == VHOST_VQ_LINK_ASYNC) {
+   vq-head = head;
+   vq-_log = log;
+   msg.msg_control = (void *)vq;
+   }
/* Sanity check */
if (!len) {
vq_err(vq, Unexpected header len for RX: 
@@ -259,6 +295,10 @@ static void handle_rx(struct vhost_net *net)
vhost_discard_vq_desc(vq);
break;
}
+
+   if (vq-link_state == VHOST_VQ_LINK_ASYNC)
+   continue;
+
/* TODO: Should check and handle checksum. */
if (err  len) {
pr_err(Discarded truncated rx packet: