Re: [PATCH v1 2/3] Provides multiple submits and asynchronous notifications.

2010-04-08 Thread xiaohui . xin
From: Xin Xiaohui 

---
Michael,
This is a small patch for the write logging issue with the async queue.
I have added a __vhost_get_vq_desc() function which can compute the log
info for any valid buffer index. The __vhost_get_vq_desc() code is
derived from the code in vhost_get_vq_desc().
And I use it to recompute the log info when logging is enabled.

Thanks
Xiaohui

 drivers/vhost/net.c   |   27 ---
 drivers/vhost/vhost.c |  115 -
 drivers/vhost/vhost.h |5 ++
 3 files changed, 90 insertions(+), 57 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 2aafd90..00a45ef 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -115,7 +115,8 @@ static void handle_async_rx_events_notify(struct vhost_net 
*net,
struct kiocb *iocb = NULL;
struct vhost_log *vq_log = NULL;
int rx_total_len = 0;
-   int log, size;
+   unsigned int head, log, in, out;
+   int size;
 
if (vq->link_state != VHOST_VQ_LINK_ASYNC)
return;
@@ -130,14 +131,25 @@ static void handle_async_rx_events_notify(struct 
vhost_net *net,
iocb->ki_pos, iocb->ki_nbytes);
log = (int)iocb->ki_user_data;
size = iocb->ki_nbytes;
+   head = iocb->ki_pos;
rx_total_len += iocb->ki_nbytes;
 
if (iocb->ki_dtor)
iocb->ki_dtor(iocb);
kmem_cache_free(net->cache, iocb);
 
-   if (unlikely(vq_log))
+   /* When logging is enabled, the log info needs to be recomputed,
+* since these buffers were sitting in the async queue and may
+* not have picked up the log info earlier.
+*/
+   if (unlikely(vq_log)) {
+   if (!log)
+   __vhost_get_vq_desc(&net->dev, vq, vq->iov,
+   ARRAY_SIZE(vq->iov),
+   &out, &in, vq_log,
+   &log, head);
vhost_log_write(vq, vq_log, log, size);
+   }
if (unlikely(rx_total_len >= VHOST_NET_WEIGHT)) {
vhost_poll_queue(&vq->poll);
break;
@@ -313,14 +325,13 @@ static void handle_rx(struct vhost_net *net)
vhost_disable_notify(vq);
hdr_size = vq->hdr_size;
 
-   /* In async cases, for write logging, the simple way is to get
-* the log info always, and really logging is decided later.
-* Thus, when logging enabled, we can get log, and when logging
-* disabled, we can get log disabled accordingly.
+   /* In the async case, when write logging is enabled, buffers that
+* were submitted before logging was enabled may lack log info, so
+* we recompute the log info when needed. We do this in
+* handle_async_rx_events_notify().
 
-   vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) |
-   (vq->link_state == VHOST_VQ_LINK_ASYNC) ?
+   vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
vq->log : NULL;
 
handle_async_rx_events_notify(net, vq);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 97233d5..53dab80 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -715,66 +715,21 @@ static unsigned get_indirect(struct vhost_dev *dev, 
struct vhost_virtqueue *vq,
return 0;
 }
 
-/* This looks in the virtqueue and for the first available buffer, and converts
- * it to an iovec for convenient access.  Since descriptors consist of some
- * number of output then some number of input descriptors, it's actually two
- * iovecs, but we pack them into one and note how many of each there were.
- *
- * This function returns the descriptor number found, or vq->num (which
- * is never a valid descriptor number) if none was found. */
-unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
+unsigned __vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
   struct iovec iov[], unsigned int iov_size,
   unsigned int *out_num, unsigned int *in_num,
-  struct vhost_log *log, unsigned int *log_num)
+  struct vhost_log *log, unsigned int *log_num,
+  unsigned int head)
 {
struct vring_desc desc;
-   unsigned int i, head, found = 0;
-   u16 last_avail_idx;
+   unsigned int i = head, found = 0;
int ret;
 
-   /* Check it isn't doing very strange things with descriptor numbers. */
-   last_avail_idx = vq->last_avail_idx;
-   if (get_user(vq->avail_idx, &vq->avail->idx)) {
-   vq_err(vq, "Failed to access avail idx at %p\n",
-  &vq->avail->idx);
-

Re: [PATCH v1 2/3] Provides multiple submits and asynchronous notifications.

2010-04-07 Thread Michael S. Tsirkin
On Wed, Apr 07, 2010 at 09:36:36AM +0800, Xin, Xiaohui wrote:
> Michael,
> > >  For the write logging, do you have a function in hand that we can
> > >  recompute the log? If that, I think I can use it to recompute the
> > > log info when the logging is suddenly enabled.
> > >  For the outstanding requests, do you mean all the user buffers have
> > > submitted before the logging ioctl changed? That may be a lot, and
> > > >> >>some of them are still in NIC ring descriptors. Waiting them to be
> > > finished may be need some time. I think when logging ioctl changed,
> > > >> >>then the logging is changed just after that is also reasonable.
> 
> > > >>>The key point is that after loggin ioctl returns, any
> > > >>>subsequent change to memory must be logged. It does not
> > > >>>matter when was the request submitted, otherwise we will
> > > >>>get memory corruption on migration.
> 
> > > >>The change to memory happens when vhost_add_used_and_signal(), right?
> > > >>So after ioctl returns, just recompute the log info to the events in 
> > > >>the async queue,
> > > >>is ok. Since the ioctl and write log operations are all protected by 
> > > >>vq->mutex.
> 
> > >>> Thanks
> > >> >Xiaohui
> 
> > >>Yes, I think this will work.
> 
> >> Thanks, so do you have the function to recompute the log info in your hand 
> >> that I can
> >>use? I have weakly remembered that you have noticed it before some time.
> 
> >Doesn't just rerunning vhost_get_vq_desc work?
> 
> Am I missing something here?
> The vhost_get_vq_desc() looks in vq, and finds the first available buffers, 
> and converts it
> to an iovec. I think the first available buffer is not the buffers in the 
> async queue, so I
> think rerunning vhost_get_vq_desc() cannot work.
> 
> Thanks
> Xiaohui

Right, but we can move the head back, so we'll find the same buffers
again, or add a variant of vhost_get_vq_desc that will process
descriptors already consumed.

> > > > Thanks
> > > > Xiaohui
> > > >
> > > >  drivers/vhost/net.c   |  189 
> > > > +++--
> > > >  drivers/vhost/vhost.h |   10 +++
> > > >  2 files changed, 192 insertions(+), 7 deletions(-)
> > > >
> > > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > > > index 22d5fef..2aafd90 100644
> > > > --- a/drivers/vhost/net.c
> > > > +++ b/drivers/vhost/net.c
> > > > @@ -17,11 +17,13 @@
> > > >  #include 
> > > >  #include 
> > > >  #include 
> > > > +#include 
> > > >
> > > >  #include 
> > > >  #include 
> > > >  #include 
> > > >  #include 
> > > > +#include 
> > > >
> > > >  #include 
> > > >
> > > > @@ -47,6 +49,7 @@ struct vhost_net {
> > > >   struct vhost_dev dev;
> > > >   struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
> > > >   struct vhost_poll poll[VHOST_NET_VQ_MAX];
> > > > + struct kmem_cache   *cache;
> > > >   /* Tells us whether we are polling a socket for TX.
> > > >* We only do this when socket buffer fills up.
> > > >* Protected by tx vq lock. */
> > > > @@ -91,11 +94,88 @@ static void tx_poll_start(struct vhost_net *net, 
> > > > struct socket *sock)
> > > >   net->tx_poll_state = VHOST_NET_POLL_STARTED;
> > > >  }
> > > >
> > > > +struct kiocb *notify_dequeue(struct vhost_virtqueue *vq)
> > > > +{
> > > > + struct kiocb *iocb = NULL;
> > > > + unsigned long flags;
> > > > +
> > > > + spin_lock_irqsave(&vq->notify_lock, flags);
> > > > + if (!list_empty(&vq->notifier)) {
> > > > + iocb = list_first_entry(&vq->notifier,
> > > > + struct kiocb, ki_list);
> > > > + list_del(&iocb->ki_list);
> > > > + }
> > > > + spin_unlock_irqrestore(&vq->notify_lock, flags);
> > > > + return iocb;
> > > > +}
> > > > +
> > > > +static void handle_async_rx_events_notify(struct vhost_net *net,
> > > > + struct vhost_virtqueue *vq)
> > > > +{
> > > > + struct kiocb *iocb = NULL;
> > > > + struct vhost_log *vq_log = NULL;
> > > > + int rx_total_len = 0;
> > > > + int log, size;
> > > > +
> > > > + if (vq->link_state != VHOST_VQ_LINK_ASYNC)
> > > > + return;
> > > > +
> > > > + if (vq->receiver)
> > > > + vq->receiver(vq);
> > > > +
> > > > + vq_log = unlikely(vhost_has_feature(
> > > > + &net->dev, VHOST_F_LOG_ALL)) ? vq->log : NULL;
> > > > + while ((iocb = notify_dequeue(vq)) != NULL) {
> > > > + vhost_add_used_and_signal(&net->dev, vq,
> > > > + iocb->ki_pos, iocb->ki_nbytes);
> > > > + log = (int)iocb->ki_user_data;
> > > > + size = iocb->ki_nbytes;
> > > > + rx_total_len += iocb->ki_nbytes;
> > > > +
> > > > + if (iocb->ki_dtor)
> > > > + iocb->ki_dtor(iocb);
> > > > + kmem_cache_free(net->cache, iocb);
> > > > +
> > > > + if (unlikely(vq_log))
> > > > + vhost_log_write(vq, vq_log, log, size);
> > > > + if (unlikely(rx_total_len >= VHOST_NET_WEIGHT)) {
> > > > + vhost_poll_q

RE: [PATCH v1 2/3] Provides multiple submits and asynchronous notifications.

2010-04-06 Thread Xin, Xiaohui
Michael,
> >  For the write logging, do you have a function in hand that we can
> >  recompute the log? If that, I think I can use it to recompute the
> > log info when the logging is suddenly enabled.
> >  For the outstanding requests, do you mean all the user buffers have
> > submitted before the logging ioctl changed? That may be a lot, and
> > >> >>some of them are still in NIC ring descriptors. Waiting them to be
> > finished may be need some time. I think when logging ioctl changed,
> > >> >>then the logging is changed just after that is also reasonable.

> > >>>The key point is that after loggin ioctl returns, any
> > >>>subsequent change to memory must be logged. It does not
> > >>>matter when was the request submitted, otherwise we will
> > >>>get memory corruption on migration.

> > >>The change to memory happens when vhost_add_used_and_signal(), right?
> > >>So after ioctl returns, just recompute the log info to the events in the 
> > >>async queue,
> > >>is ok. Since the ioctl and write log operations are all protected by 
> > >>vq->mutex.

> >>> Thanks
> >> >Xiaohui

> >>Yes, I think this will work.

>> Thanks, so do you have the function to recompute the log info in your hand 
>> that I can
>>use? I have weakly remembered that you have noticed it before some time.

>Doesn't just rerunning vhost_get_vq_desc work?

Am I missing something here?
The vhost_get_vq_desc() looks in vq, and finds the first available buffers, and 
converts it
to an iovec. I think the first available buffer is not the buffers in the async 
queue, so I
think rerunning vhost_get_vq_desc() cannot work.

Thanks
Xiaohui

> > > Thanks
> > > Xiaohui
> > >
> > >  drivers/vhost/net.c   |  189 
> > > +++--
> > >  drivers/vhost/vhost.h |   10 +++
> > >  2 files changed, 192 insertions(+), 7 deletions(-)
> > >
> > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > > index 22d5fef..2aafd90 100644
> > > --- a/drivers/vhost/net.c
> > > +++ b/drivers/vhost/net.c
> > > @@ -17,11 +17,13 @@
> > >  #include 
> > >  #include 
> > >  #include 
> > > +#include 
> > >
> > >  #include 
> > >  #include 
> > >  #include 
> > >  #include 
> > > +#include 
> > >
> > >  #include 
> > >
> > > @@ -47,6 +49,7 @@ struct vhost_net {
> > >   struct vhost_dev dev;
> > >   struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
> > >   struct vhost_poll poll[VHOST_NET_VQ_MAX];
> > > + struct kmem_cache   *cache;
> > >   /* Tells us whether we are polling a socket for TX.
> > >* We only do this when socket buffer fills up.
> > >* Protected by tx vq lock. */
> > > @@ -91,11 +94,88 @@ static void tx_poll_start(struct vhost_net *net, 
> > > struct socket *sock)
> > >   net->tx_poll_state = VHOST_NET_POLL_STARTED;
> > >  }
> > >
> > > +struct kiocb *notify_dequeue(struct vhost_virtqueue *vq)
> > > +{
> > > + struct kiocb *iocb = NULL;
> > > + unsigned long flags;
> > > +
> > > + spin_lock_irqsave(&vq->notify_lock, flags);
> > > + if (!list_empty(&vq->notifier)) {
> > > + iocb = list_first_entry(&vq->notifier,
> > > + struct kiocb, ki_list);
> > > + list_del(&iocb->ki_list);
> > > + }
> > > + spin_unlock_irqrestore(&vq->notify_lock, flags);
> > > + return iocb;
> > > +}
> > > +
> > > +static void handle_async_rx_events_notify(struct vhost_net *net,
> > > + struct vhost_virtqueue *vq)
> > > +{
> > > + struct kiocb *iocb = NULL;
> > > + struct vhost_log *vq_log = NULL;
> > > + int rx_total_len = 0;
> > > + int log, size;
> > > +
> > > + if (vq->link_state != VHOST_VQ_LINK_ASYNC)
> > > + return;
> > > +
> > > + if (vq->receiver)
> > > + vq->receiver(vq);
> > > +
> > > + vq_log = unlikely(vhost_has_feature(
> > > + &net->dev, VHOST_F_LOG_ALL)) ? vq->log : NULL;
> > > + while ((iocb = notify_dequeue(vq)) != NULL) {
> > > + vhost_add_used_and_signal(&net->dev, vq,
> > > + iocb->ki_pos, iocb->ki_nbytes);
> > > + log = (int)iocb->ki_user_data;
> > > + size = iocb->ki_nbytes;
> > > + rx_total_len += iocb->ki_nbytes;
> > > +
> > > + if (iocb->ki_dtor)
> > > + iocb->ki_dtor(iocb);
> > > + kmem_cache_free(net->cache, iocb);
> > > +
> > > + if (unlikely(vq_log))
> > > + vhost_log_write(vq, vq_log, log, size);
> > > + if (unlikely(rx_total_len >= VHOST_NET_WEIGHT)) {
> > > + vhost_poll_queue(&vq->poll);
> > > + break;
> > > + }
> > > + }
> > > +}
> > > +
> > > +static void handle_async_tx_events_notify(struct vhost_net *net,
> > > + struct vhost_virtqueue *vq)
> > > +{
> > > + struct kiocb *iocb = NULL;
> > > + int tx_total_len = 0;
> > > +
> > > + if (vq->link_state != VHOST_VQ_LINK_ASYNC)
> > > + return;
> > > +
> > > + while ((iocb = notify_dequeue(vq)) != NULL) {
> > > + vhost_add

Re: [PATCH v1 2/3] Provides multiple submits and asynchronous notifications.

2010-04-06 Thread Michael S. Tsirkin
On Tue, Apr 06, 2010 at 01:46:56PM +0800, Xin, Xiaohui wrote:
> Michael,
> > >>> For the write logging, do you have a function in hand that we can
> > >>> recompute the log? If that, I think I can use it to recompute the
> > >>>log info when the logging is suddenly enabled.
> > >>> For the outstanding requests, do you mean all the user buffers have
> > >>>submitted before the logging ioctl changed? That may be a lot, and
> > >> >some of them are still in NIC ring descriptors. Waiting them to be
> > >>>finished may be need some time. I think when logging ioctl changed,
> > >> >then the logging is changed just after that is also reasonable.
>  
> > >>The key point is that after loggin ioctl returns, any
> > >>subsequent change to memory must be logged. It does not
> > >>matter when was the request submitted, otherwise we will
> > >>get memory corruption on migration.
> 
> > >The change to memory happens when vhost_add_used_and_signal(), right?
> > >So after ioctl returns, just recompute the log info to the events in the 
> > >async queue,
> > >is ok. Since the ioctl and write log operations are all protected by 
> > >vq->mutex.
>  
> >> Thanks
> >> Xiaohui
> 
> >Yes, I think this will work.
> 
> Thanks, so do you have the function to recompute the log info in your hand 
> that I can 
> use? I have weakly remembered that you have noticed it before some time.

Doesn't just rerunning vhost_get_vq_desc work?

> > > Thanks
> > > Xiaohui
> > > 
> > >  drivers/vhost/net.c   |  189 
> > > +++--
> > >  drivers/vhost/vhost.h |   10 +++
> > >  2 files changed, 192 insertions(+), 7 deletions(-)
> > > 
> > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > > index 22d5fef..2aafd90 100644
> > > --- a/drivers/vhost/net.c
> > > +++ b/drivers/vhost/net.c
> > > @@ -17,11 +17,13 @@
> > >  #include 
> > >  #include 
> > >  #include 
> > > +#include 
> > >  
> > >  #include 
> > >  #include 
> > >  #include 
> > >  #include 
> > > +#include 
> > >  
> > >  #include 
> > >  
> > > @@ -47,6 +49,7 @@ struct vhost_net {
> > >   struct vhost_dev dev;
> > >   struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
> > >   struct vhost_poll poll[VHOST_NET_VQ_MAX];
> > > + struct kmem_cache   *cache;
> > >   /* Tells us whether we are polling a socket for TX.
> > >* We only do this when socket buffer fills up.
> > >* Protected by tx vq lock. */
> > > @@ -91,11 +94,88 @@ static void tx_poll_start(struct vhost_net *net, 
> > > struct socket *sock)
> > >   net->tx_poll_state = VHOST_NET_POLL_STARTED;
> > >  }
> > >  
> > > +struct kiocb *notify_dequeue(struct vhost_virtqueue *vq)
> > > +{
> > > + struct kiocb *iocb = NULL;
> > > + unsigned long flags;
> > > +
> > > + spin_lock_irqsave(&vq->notify_lock, flags);
> > > + if (!list_empty(&vq->notifier)) {
> > > + iocb = list_first_entry(&vq->notifier,
> > > + struct kiocb, ki_list);
> > > + list_del(&iocb->ki_list);
> > > + }
> > > + spin_unlock_irqrestore(&vq->notify_lock, flags);
> > > + return iocb;
> > > +}
> > > +
> > > +static void handle_async_rx_events_notify(struct vhost_net *net,
> > > + struct vhost_virtqueue *vq)
> > > +{
> > > + struct kiocb *iocb = NULL;
> > > + struct vhost_log *vq_log = NULL;
> > > + int rx_total_len = 0;
> > > + int log, size;
> > > +
> > > + if (vq->link_state != VHOST_VQ_LINK_ASYNC)
> > > + return;
> > > +
> > > + if (vq->receiver)
> > > + vq->receiver(vq);
> > > +
> > > + vq_log = unlikely(vhost_has_feature(
> > > + &net->dev, VHOST_F_LOG_ALL)) ? vq->log : NULL;
> > > + while ((iocb = notify_dequeue(vq)) != NULL) {
> > > + vhost_add_used_and_signal(&net->dev, vq,
> > > + iocb->ki_pos, iocb->ki_nbytes);
> > > + log = (int)iocb->ki_user_data;
> > > + size = iocb->ki_nbytes;
> > > + rx_total_len += iocb->ki_nbytes;
> > > +
> > > + if (iocb->ki_dtor)
> > > + iocb->ki_dtor(iocb);
> > > + kmem_cache_free(net->cache, iocb);
> > > +
> > > + if (unlikely(vq_log))
> > > + vhost_log_write(vq, vq_log, log, size);
> > > + if (unlikely(rx_total_len >= VHOST_NET_WEIGHT)) {
> > > + vhost_poll_queue(&vq->poll);
> > > + break;
> > > + }
> > > + }
> > > +}
> > > +
> > > +static void handle_async_tx_events_notify(struct vhost_net *net,
> > > + struct vhost_virtqueue *vq)
> > > +{
> > > + struct kiocb *iocb = NULL;
> > > + int tx_total_len = 0;
> > > +
> > > + if (vq->link_state != VHOST_VQ_LINK_ASYNC)
> > > + return;
> > > +
> > > + while ((iocb = notify_dequeue(vq)) != NULL) {
> > > + vhost_add_used_and_signal(&net->dev, vq,
> > > + iocb->ki_pos, 0);
> > > + tx_total_len += iocb->ki_nbytes;
> > > +
> > > + if (iocb->ki_dtor)
> > > + iocb->ki_dtor(iocb)

RE: [PATCH v1 2/3] Provides multiple submits and asynchronous notifications.

2010-04-05 Thread Xin, Xiaohui
Michael,
> >>> For the write logging, do you have a function in hand that we can
> >>> recompute the log? If that, I think I can use it to recompute the
> >>>log info when the logging is suddenly enabled.
> >>> For the outstanding requests, do you mean all the user buffers have
> >>>submitted before the logging ioctl changed? That may be a lot, and
> >> >some of them are still in NIC ring descriptors. Waiting them to be
> >>>finished may be need some time. I think when logging ioctl changed,
> >> >then the logging is changed just after that is also reasonable.
 
> >>The key point is that after loggin ioctl returns, any
> >>subsequent change to memory must be logged. It does not
> >>matter when was the request submitted, otherwise we will
> >>get memory corruption on migration.

> >The change to memory happens when vhost_add_used_and_signal(), right?
> >So after ioctl returns, just recompute the log info to the events in the 
> >async queue,
> >is ok. Since the ioctl and write log operations are all protected by 
> >vq->mutex.
 
>> Thanks
>> Xiaohui

>Yes, I think this will work.

Thanks, so do you have the function to recompute the log info at hand that
I can
use? I vaguely remember that you mentioned it some time ago.

> > Thanks
> > Xiaohui
> > 
> >  drivers/vhost/net.c   |  189 
> > +++--
> >  drivers/vhost/vhost.h |   10 +++
> >  2 files changed, 192 insertions(+), 7 deletions(-)
> > 
> > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > index 22d5fef..2aafd90 100644
> > --- a/drivers/vhost/net.c
> > +++ b/drivers/vhost/net.c
> > @@ -17,11 +17,13 @@
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> >  
> >  #include 
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> >  
> >  #include 
> >  
> > @@ -47,6 +49,7 @@ struct vhost_net {
> > struct vhost_dev dev;
> > struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
> > struct vhost_poll poll[VHOST_NET_VQ_MAX];
> > +   struct kmem_cache   *cache;
> > /* Tells us whether we are polling a socket for TX.
> >  * We only do this when socket buffer fills up.
> >  * Protected by tx vq lock. */
> > @@ -91,11 +94,88 @@ static void tx_poll_start(struct vhost_net *net, struct 
> > socket *sock)
> > net->tx_poll_state = VHOST_NET_POLL_STARTED;
> >  }
> >  
> > +struct kiocb *notify_dequeue(struct vhost_virtqueue *vq)
> > +{
> > +   struct kiocb *iocb = NULL;
> > +   unsigned long flags;
> > +
> > +   spin_lock_irqsave(&vq->notify_lock, flags);
> > +   if (!list_empty(&vq->notifier)) {
> > +   iocb = list_first_entry(&vq->notifier,
> > +   struct kiocb, ki_list);
> > +   list_del(&iocb->ki_list);
> > +   }
> > +   spin_unlock_irqrestore(&vq->notify_lock, flags);
> > +   return iocb;
> > +}
> > +
> > +static void handle_async_rx_events_notify(struct vhost_net *net,
> > +   struct vhost_virtqueue *vq)
> > +{
> > +   struct kiocb *iocb = NULL;
> > +   struct vhost_log *vq_log = NULL;
> > +   int rx_total_len = 0;
> > +   int log, size;
> > +
> > +   if (vq->link_state != VHOST_VQ_LINK_ASYNC)
> > +   return;
> > +
> > +   if (vq->receiver)
> > +   vq->receiver(vq);
> > +
> > +   vq_log = unlikely(vhost_has_feature(
> > +   &net->dev, VHOST_F_LOG_ALL)) ? vq->log : NULL;
> > +   while ((iocb = notify_dequeue(vq)) != NULL) {
> > +   vhost_add_used_and_signal(&net->dev, vq,
> > +   iocb->ki_pos, iocb->ki_nbytes);
> > +   log = (int)iocb->ki_user_data;
> > +   size = iocb->ki_nbytes;
> > +   rx_total_len += iocb->ki_nbytes;
> > +
> > +   if (iocb->ki_dtor)
> > +   iocb->ki_dtor(iocb);
> > +   kmem_cache_free(net->cache, iocb);
> > +
> > +   if (unlikely(vq_log))
> > +   vhost_log_write(vq, vq_log, log, size);
> > +   if (unlikely(rx_total_len >= VHOST_NET_WEIGHT)) {
> > +   vhost_poll_queue(&vq->poll);
> > +   break;
> > +   }
> > +   }
> > +}
> > +
> > +static void handle_async_tx_events_notify(struct vhost_net *net,
> > +   struct vhost_virtqueue *vq)
> > +{
> > +   struct kiocb *iocb = NULL;
> > +   int tx_total_len = 0;
> > +
> > +   if (vq->link_state != VHOST_VQ_LINK_ASYNC)
> > +   return;
> > +
> > +   while ((iocb = notify_dequeue(vq)) != NULL) {
> > +   vhost_add_used_and_signal(&net->dev, vq,
> > +   iocb->ki_pos, 0);
> > +   tx_total_len += iocb->ki_nbytes;
> > +
> > +   if (iocb->ki_dtor)
> > +   iocb->ki_dtor(iocb);
> > +
> > +   kmem_cache_free(net->cache, iocb);
> > +   if (unlikely(tx_total_len >= VHOST_NET_WEIGHT)) {
> > +   vhost_poll_queue(&vq->poll);
> > +   break;
> > +   }
> > +   }
> > +}
> > +
> >  /* Expects to

Re: [PATCH v1 2/3] Provides multiple submits and asynchronous notifications.

2010-04-04 Thread Michael S. Tsirkin
On Fri, Apr 02, 2010 at 10:16:16AM +0800, Xin, Xiaohui wrote:
> 
> >> For the write logging, do you have a function in hand that we can
> >> recompute the log? If that, I think I can use it to recompute the
> >>log info when the logging is suddenly enabled.
> >> For the outstanding requests, do you mean all the user buffers have
> >>submitted before the logging ioctl changed? That may be a lot, and
> >> some of them are still in NIC ring descriptors. Waiting them to be
> >>finished may be need some time. I think when logging ioctl changed,
> >> then the logging is changed just after that is also reasonable.
> 
> >The key point is that after loggin ioctl returns, any
> >subsequent change to memory must be logged. It does not
> >matter when was the request submitted, otherwise we will
> >get memory corruption on migration.
> 
> The change to memory happens when vhost_add_used_and_signal(), right?
> So after ioctl returns, just recompute the log info to the events in the 
> async queue,
> is ok. Since the ioctl and write log operations are all protected by 
> vq->mutex.
> 
> Thanks
> Xiaohui

Yes, I think this will work.

> > Thanks
> > Xiaohui
> > 
> >  drivers/vhost/net.c   |  189 
> > +++--
> >  drivers/vhost/vhost.h |   10 +++
> >  2 files changed, 192 insertions(+), 7 deletions(-)
> > 
> > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > index 22d5fef..2aafd90 100644
> > --- a/drivers/vhost/net.c
> > +++ b/drivers/vhost/net.c
> > @@ -17,11 +17,13 @@
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> >  
> >  #include 
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> >  
> >  #include 
> >  
> > @@ -47,6 +49,7 @@ struct vhost_net {
> > struct vhost_dev dev;
> > struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
> > struct vhost_poll poll[VHOST_NET_VQ_MAX];
> > +   struct kmem_cache   *cache;
> > /* Tells us whether we are polling a socket for TX.
> >  * We only do this when socket buffer fills up.
> >  * Protected by tx vq lock. */
> > @@ -91,11 +94,88 @@ static void tx_poll_start(struct vhost_net *net, struct 
> > socket *sock)
> > net->tx_poll_state = VHOST_NET_POLL_STARTED;
> >  }
> >  
> > +struct kiocb *notify_dequeue(struct vhost_virtqueue *vq)
> > +{
> > +   struct kiocb *iocb = NULL;
> > +   unsigned long flags;
> > +
> > +   spin_lock_irqsave(&vq->notify_lock, flags);
> > +   if (!list_empty(&vq->notifier)) {
> > +   iocb = list_first_entry(&vq->notifier,
> > +   struct kiocb, ki_list);
> > +   list_del(&iocb->ki_list);
> > +   }
> > +   spin_unlock_irqrestore(&vq->notify_lock, flags);
> > +   return iocb;
> > +}
> > +
> > +static void handle_async_rx_events_notify(struct vhost_net *net,
> > +   struct vhost_virtqueue *vq)
> > +{
> > +   struct kiocb *iocb = NULL;
> > +   struct vhost_log *vq_log = NULL;
> > +   int rx_total_len = 0;
> > +   int log, size;
> > +
> > +   if (vq->link_state != VHOST_VQ_LINK_ASYNC)
> > +   return;
> > +
> > +   if (vq->receiver)
> > +   vq->receiver(vq);
> > +
> > +   vq_log = unlikely(vhost_has_feature(
> > +   &net->dev, VHOST_F_LOG_ALL)) ? vq->log : NULL;
> > +   while ((iocb = notify_dequeue(vq)) != NULL) {
> > +   vhost_add_used_and_signal(&net->dev, vq,
> > +   iocb->ki_pos, iocb->ki_nbytes);
> > +   log = (int)iocb->ki_user_data;
> > +   size = iocb->ki_nbytes;
> > +   rx_total_len += iocb->ki_nbytes;
> > +
> > +   if (iocb->ki_dtor)
> > +   iocb->ki_dtor(iocb);
> > +   kmem_cache_free(net->cache, iocb);
> > +
> > +   if (unlikely(vq_log))
> > +   vhost_log_write(vq, vq_log, log, size);
> > +   if (unlikely(rx_total_len >= VHOST_NET_WEIGHT)) {
> > +   vhost_poll_queue(&vq->poll);
> > +   break;
> > +   }
> > +   }
> > +}
> > +
> > +static void handle_async_tx_events_notify(struct vhost_net *net,
> > +   struct vhost_virtqueue *vq)
> > +{
> > +   struct kiocb *iocb = NULL;
> > +   int tx_total_len = 0;
> > +
> > +   if (vq->link_state != VHOST_VQ_LINK_ASYNC)
> > +   return;
> > +
> > +   while ((iocb = notify_dequeue(vq)) != NULL) {
> > +   vhost_add_used_and_signal(&net->dev, vq,
> > +   iocb->ki_pos, 0);
> > +   tx_total_len += iocb->ki_nbytes;
> > +
> > +   if (iocb->ki_dtor)
> > +   iocb->ki_dtor(iocb);
> > +
> > +   kmem_cache_free(net->cache, iocb);
> > +   if (unlikely(tx_total_len >= VHOST_NET_WEIGHT)) {
> > +   vhost_poll_queue(&vq->poll);
> > +   break;
> > +   }
> > +   }
> > +}
> > +
> >  /* Expects to be always run from workqueue - which acts as
> >   * read-size critical section for our kind of RCU. */
> >  static void 

RE: [PATCH v1 2/3] Provides multiple submits and asynchronous notifications.

2010-04-01 Thread Xin, Xiaohui

>> For the write logging, do you have a function in hand that we can
>> recompute the log? If that, I think I can use it to recompute the
>>log info when the logging is suddenly enabled.
>> For the outstanding requests, do you mean all the user buffers have
>>submitted before the logging ioctl changed? That may be a lot, and
>> some of them are still in NIC ring descriptors. Waiting them to be
>>finished may be need some time. I think when logging ioctl changed,
>> then the logging is changed just after that is also reasonable.

>The key point is that after loggin ioctl returns, any
>subsequent change to memory must be logged. It does not
>matter when was the request submitted, otherwise we will
>get memory corruption on migration.

The change to memory happens in vhost_add_used_and_signal(), right?
So after the ioctl returns, it is ok to just recompute the log info for the
events in the async queue,
since the ioctl and write log operations are all protected by vq->mutex.

Thanks
Xiaohui

> Thanks
> Xiaohui
> 
>  drivers/vhost/net.c   |  189 
> +++--
>  drivers/vhost/vhost.h |   10 +++
>  2 files changed, 192 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 22d5fef..2aafd90 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -17,11 +17,13 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include 
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include 
>  
> @@ -47,6 +49,7 @@ struct vhost_net {
>   struct vhost_dev dev;
>   struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
>   struct vhost_poll poll[VHOST_NET_VQ_MAX];
> + struct kmem_cache   *cache;
>   /* Tells us whether we are polling a socket for TX.
>* We only do this when socket buffer fills up.
>* Protected by tx vq lock. */
> @@ -91,11 +94,88 @@ static void tx_poll_start(struct vhost_net *net, struct 
> socket *sock)
>   net->tx_poll_state = VHOST_NET_POLL_STARTED;
>  }
>  
> +struct kiocb *notify_dequeue(struct vhost_virtqueue *vq)
> +{
> + struct kiocb *iocb = NULL;
> + unsigned long flags;
> +
> + spin_lock_irqsave(&vq->notify_lock, flags);
> + if (!list_empty(&vq->notifier)) {
> + iocb = list_first_entry(&vq->notifier,
> + struct kiocb, ki_list);
> + list_del(&iocb->ki_list);
> + }
> + spin_unlock_irqrestore(&vq->notify_lock, flags);
> + return iocb;
> +}
> +
> +static void handle_async_rx_events_notify(struct vhost_net *net,
> + struct vhost_virtqueue *vq)
> +{
> + struct kiocb *iocb = NULL;
> + struct vhost_log *vq_log = NULL;
> + int rx_total_len = 0;
> + int log, size;
> +
> + if (vq->link_state != VHOST_VQ_LINK_ASYNC)
> + return;
> +
> + if (vq->receiver)
> + vq->receiver(vq);
> +
> + vq_log = unlikely(vhost_has_feature(
> + &net->dev, VHOST_F_LOG_ALL)) ? vq->log : NULL;
> + while ((iocb = notify_dequeue(vq)) != NULL) {
> + vhost_add_used_and_signal(&net->dev, vq,
> + iocb->ki_pos, iocb->ki_nbytes);
> + log = (int)iocb->ki_user_data;
> + size = iocb->ki_nbytes;
> + rx_total_len += iocb->ki_nbytes;
> +
> + if (iocb->ki_dtor)
> + iocb->ki_dtor(iocb);
> + kmem_cache_free(net->cache, iocb);
> +
> + if (unlikely(vq_log))
> + vhost_log_write(vq, vq_log, log, size);
> + if (unlikely(rx_total_len >= VHOST_NET_WEIGHT)) {
> + vhost_poll_queue(&vq->poll);
> + break;
> + }
> + }
> +}
> +
> +static void handle_async_tx_events_notify(struct vhost_net *net,
> + struct vhost_virtqueue *vq)
> +{
> + struct kiocb *iocb = NULL;
> + int tx_total_len = 0;
> +
> + if (vq->link_state != VHOST_VQ_LINK_ASYNC)
> + return;
> +
> + while ((iocb = notify_dequeue(vq)) != NULL) {
> + vhost_add_used_and_signal(&net->dev, vq,
> + iocb->ki_pos, 0);
> + tx_total_len += iocb->ki_nbytes;
> +
> + if (iocb->ki_dtor)
> + iocb->ki_dtor(iocb);
> +
> + kmem_cache_free(net->cache, iocb);
> + if (unlikely(tx_total_len >= VHOST_NET_WEIGHT)) {
> + vhost_poll_queue(&vq->poll);
> + break;
> + }
> + }
> +}
> +
>  /* Expects to be always run from workqueue - which acts as
>   * read-size critical section for our kind of RCU. */
>  static void handle_tx(struct vhost_net *net)
>  {
>   struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
> + struct kiocb *iocb = NULL;
>   unsigned head, out, in, s;
>   struct msghdr msg = {
>   .msg_name = NULL,
> @@ -124,

Re: [PATCH v1 2/3] Provides multiple submits and asynchronous notifications.

2010-04-01 Thread Michael S. Tsirkin
On Thu, Apr 01, 2010 at 05:14:56PM +0800, Xin Xiaohui wrote:
> The vhost-net backend now only supports synchronous send/recv
> operations. The patch provides multiple submits and asynchronous
> notifications. This is needed for zero-copy case.
> 
> Signed-off-by: Xin Xiaohui 
> ---
> 
> Michael,
> Now, I made vhost to alloc/destroy the kiocb, and transfer it from 
> sendmsg/recvmsg. I did not remove vq->receiver, since what the
> callback does is related to the structures owned by mp device,
> and I think isolation them to vhost is a good thing to us all.
> And it will not prevent mp device to be independent of vhost 
> in future. Later, when mp device can be a real device which
> provides asynchronous read/write operations and not just report
> proto_ops, it will use another callback function which is not
> related to vhost at all.

Thanks, I'll look at the code!

> For the write logging, do you have a function in hand that we can
> recompute the log? If that, I think I can use it to recompute the
> log info when the logging is suddenly enabled.
> For the outstanding requests, do you mean all the user buffers have
> submitted before the logging ioctl changed? That may be a lot, and
> some of them are still in NIC ring descriptors. Waiting them to be
> finished may be need some time. I think when logging ioctl changed,
> then the logging is changed just after that is also reasonable.

The key point is that after the logging ioctl returns, any
subsequent change to memory must be logged. It does not
matter when the request was submitted; otherwise we will
get memory corruption on migration.

> Thanks
> Xiaohui
> 
>  drivers/vhost/net.c   |  189 
> +++--
>  drivers/vhost/vhost.h |   10 +++
>  2 files changed, 192 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 22d5fef..2aafd90 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -17,11 +17,13 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include 
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include 
>  
> @@ -47,6 +49,7 @@ struct vhost_net {
>   struct vhost_dev dev;
>   struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
>   struct vhost_poll poll[VHOST_NET_VQ_MAX];
> + struct kmem_cache   *cache;
>   /* Tells us whether we are polling a socket for TX.
>* We only do this when socket buffer fills up.
>* Protected by tx vq lock. */
> @@ -91,11 +94,88 @@ static void tx_poll_start(struct vhost_net *net, struct 
> socket *sock)
>   net->tx_poll_state = VHOST_NET_POLL_STARTED;
>  }
>  
> +struct kiocb *notify_dequeue(struct vhost_virtqueue *vq)
> +{
> + struct kiocb *iocb = NULL;
> + unsigned long flags;
> +
> + spin_lock_irqsave(&vq->notify_lock, flags);
> + if (!list_empty(&vq->notifier)) {
> + iocb = list_first_entry(&vq->notifier,
> + struct kiocb, ki_list);
> + list_del(&iocb->ki_list);
> + }
> + spin_unlock_irqrestore(&vq->notify_lock, flags);
> + return iocb;
> +}
> +
> +static void handle_async_rx_events_notify(struct vhost_net *net,
> + struct vhost_virtqueue *vq)
> +{
> + struct kiocb *iocb = NULL;
> + struct vhost_log *vq_log = NULL;
> + int rx_total_len = 0;
> + int log, size;
> +
> + if (vq->link_state != VHOST_VQ_LINK_ASYNC)
> + return;
> +
> + if (vq->receiver)
> + vq->receiver(vq);
> +
> + vq_log = unlikely(vhost_has_feature(
> + &net->dev, VHOST_F_LOG_ALL)) ? vq->log : NULL;
> + while ((iocb = notify_dequeue(vq)) != NULL) {
> + vhost_add_used_and_signal(&net->dev, vq,
> + iocb->ki_pos, iocb->ki_nbytes);
> + log = (int)iocb->ki_user_data;
> + size = iocb->ki_nbytes;
> + rx_total_len += iocb->ki_nbytes;
> +
> + if (iocb->ki_dtor)
> + iocb->ki_dtor(iocb);
> + kmem_cache_free(net->cache, iocb);
> +
> + if (unlikely(vq_log))
> + vhost_log_write(vq, vq_log, log, size);
> + if (unlikely(rx_total_len >= VHOST_NET_WEIGHT)) {
> + vhost_poll_queue(&vq->poll);
> + break;
> + }
> + }
> +}
> +
> +static void handle_async_tx_events_notify(struct vhost_net *net,
> + struct vhost_virtqueue *vq)
> +{
> + struct kiocb *iocb = NULL;
> + int tx_total_len = 0;
> +
> + if (vq->link_state != VHOST_VQ_LINK_ASYNC)
> + return;
> +
> + while ((iocb = notify_dequeue(vq)) != NULL) {
> + vhost_add_used_and_signal(&net->dev, vq,
> + iocb->ki_pos, 0);
> + tx_total_len += iocb->ki_nbytes;
> +
> + if (iocb->ki_dtor)
> + iocb->ki_dtor(iocb);
> +
> +  

Re:[PATCH v1 2/3] Provides multiple submits and asynchronous notifications.

2010-04-01 Thread Xin Xiaohui
The vhost-net backend now only supports synchronous send/recv
operations. The patch provides multiple submits and asynchronous
notifications. This is needed for zero-copy case.

Signed-off-by: Xin Xiaohui 
---

Michael,
Now, I made vhost to alloc/destroy the kiocb, and transfer it from 
sendmsg/recvmsg. I did not remove vq->receiver, since what the
callback does is related to the structures owned by mp device,
and I think isolating them from vhost is a good thing for us all.
And it will not prevent the mp device from being independent of vhost 
in future. Later, when mp device can be a real device which
provides asynchronous read/write operations and not just report
proto_ops, it will use another callback function which is not
related to vhost at all.

For the write logging, do you have a function in hand that we can
recompute the log? If so, I think I can use it to recompute the
log info when the logging is suddenly enabled.
For the outstanding requests, do you mean all the user buffers have
submitted before the logging ioctl changed? That may be a lot, and
some of them are still in NIC ring descriptors. Waiting them to be
finished may need some time. I think that when the logging ioctl changes,
having the logging behavior change just after that is also reasonable.

Thanks
Xiaohui

 drivers/vhost/net.c   |  189 +++--
 drivers/vhost/vhost.h |   10 +++
 2 files changed, 192 insertions(+), 7 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 22d5fef..2aafd90 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -17,11 +17,13 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -47,6 +49,7 @@ struct vhost_net {
struct vhost_dev dev;
struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
struct vhost_poll poll[VHOST_NET_VQ_MAX];
+   struct kmem_cache   *cache;
/* Tells us whether we are polling a socket for TX.
 * We only do this when socket buffer fills up.
 * Protected by tx vq lock. */
@@ -91,11 +94,88 @@ static void tx_poll_start(struct vhost_net *net, struct 
socket *sock)
net->tx_poll_state = VHOST_NET_POLL_STARTED;
 }
 
+struct kiocb *notify_dequeue(struct vhost_virtqueue *vq)
+{
+   struct kiocb *iocb = NULL;
+   unsigned long flags;
+
+   spin_lock_irqsave(&vq->notify_lock, flags);
+   if (!list_empty(&vq->notifier)) {
+   iocb = list_first_entry(&vq->notifier,
+   struct kiocb, ki_list);
+   list_del(&iocb->ki_list);
+   }
+   spin_unlock_irqrestore(&vq->notify_lock, flags);
+   return iocb;
+}
+
+static void handle_async_rx_events_notify(struct vhost_net *net,
+   struct vhost_virtqueue *vq)
+{
+   struct kiocb *iocb = NULL;
+   struct vhost_log *vq_log = NULL;
+   int rx_total_len = 0;
+   int log, size;
+
+   if (vq->link_state != VHOST_VQ_LINK_ASYNC)
+   return;
+
+   if (vq->receiver)
+   vq->receiver(vq);
+
+   vq_log = unlikely(vhost_has_feature(
+   &net->dev, VHOST_F_LOG_ALL)) ? vq->log : NULL;
+   while ((iocb = notify_dequeue(vq)) != NULL) {
+   vhost_add_used_and_signal(&net->dev, vq,
+   iocb->ki_pos, iocb->ki_nbytes);
+   log = (int)iocb->ki_user_data;
+   size = iocb->ki_nbytes;
+   rx_total_len += iocb->ki_nbytes;
+
+   if (iocb->ki_dtor)
+   iocb->ki_dtor(iocb);
+   kmem_cache_free(net->cache, iocb);
+
+   if (unlikely(vq_log))
+   vhost_log_write(vq, vq_log, log, size);
+   if (unlikely(rx_total_len >= VHOST_NET_WEIGHT)) {
+   vhost_poll_queue(&vq->poll);
+   break;
+   }
+   }
+}
+
+static void handle_async_tx_events_notify(struct vhost_net *net,
+   struct vhost_virtqueue *vq)
+{
+   struct kiocb *iocb = NULL;
+   int tx_total_len = 0;
+
+   if (vq->link_state != VHOST_VQ_LINK_ASYNC)
+   return;
+
+   while ((iocb = notify_dequeue(vq)) != NULL) {
+   vhost_add_used_and_signal(&net->dev, vq,
+   iocb->ki_pos, 0);
+   tx_total_len += iocb->ki_nbytes;
+
+   if (iocb->ki_dtor)
+   iocb->ki_dtor(iocb);
+
+   kmem_cache_free(net->cache, iocb);
+   if (unlikely(tx_total_len >= VHOST_NET_WEIGHT)) {
+   vhost_poll_queue(&vq->poll);
+   break;
+   }
+   }
+}
+
 /* Expects to be always run from workqueue - which acts as
  * read-size critical section for our kind of RCU. */
 static void handle_tx(struct vhost_net *net)
 {
struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_

Re: [PATCH v1 2/3] Provides multiple submits and asynchronous notifications.

2010-03-17 Thread Michael S. Tsirkin
On Wed, Mar 17, 2010 at 05:48:10PM +0800, Xin, Xiaohui wrote:
> >> Michael,
> >> I don't use the kiocb comes from the sendmsg/recvmsg,
> > >since I have embeded the kiocb in page_info structure,
> > >and allocate it when page_info allocated.
> 
> >So what I suggested was that vhost allocates and tracks the iocbs, and
> >passes them to your device with sendmsg/ recvmsg calls. This way your
> >device won't need to share structures and locking strategy with vhost:
> >you get an iocb, handle it, invoke a callback to notify vhost about
> >completion.
> 
> >This also gets rid of the 'receiver' callback
> 
> I'm not sure receiver callback can be removed here:
> The patch describes a work flow like this:
> netif_receive_skb() gets the packet, it does nothing but just queue the skb
> and wakeup the handle_rx() of vhost. handle_rx() then calls the receiver 
> callback
> to deal with skb and and get the necessary notify info into a list, vhost 
> owns the 
> list and in the same handle_rx() context use it to complete.
> 
> We use "receiver" callback here is because only handle_rx() is waked up from
> netif_receive_skb(), and we need mp device context to deal with the skb and
> notify info attached to it. We also have some lock in the callback function.
> 
> If I remove the receiver callback, I can only deal with the skb and notify
> info in netif_receive_skb(), but this function is in an interrupt context,
> which I think lock is not allowed there. But I cannot remove the lock there.
> 

The basic idea is that vhost passes iocb to recvmsg and backend
completes the iocb to signal that data is ready. That completion could
be in interrupt context and so we need to switch to workqueue to handle
the event, it is true, but the code to do this would live in vhost.c or
net.c.

With this structure your device won't depend on
vhost, and can go under drivers/net/, opening up possibility
to use it for zero copy without vhost in the future.



> >> Please have a review and thanks for the instruction
> >> for replying email which helps me a lot.
> >> 
> > >Thanks,
> > >Xiaohui
> > >
> > > drivers/vhost/net.c   |  159 
> > > +++--
> >>  drivers/vhost/vhost.h |   12 
> >>  2 files changed, 166 insertions(+), 5 deletions(-)
> >> 
> >> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > >index 22d5fef..5483848 100644
> > >--- a/drivers/vhost/net.c
> > >+++ b/drivers/vhost/net.c
> > >@@ -17,11 +17,13 @@
> > > #include 
> > > #include 
> > > #include 
> > >+#include 
> > > 
> > > #include 
> > > #include 
> > > #include 
> > > #include 
> > >+#include 
> > > 
> > > #include 
> > > 
> > >@@ -91,6 +93,12 @@ static void tx_poll_start(struct vhost_net *net, struct 
> > >socket *sock)
> > >   net->tx_poll_state = VHOST_NET_POLL_STARTED;
> > > }
> > > 
> > >+static void handle_async_rx_events_notify(struct vhost_net *net,
> > >+  struct vhost_virtqueue *vq);
> > >+
> > >+static void handle_async_tx_events_notify(struct vhost_net *net,
> > >+  struct vhost_virtqueue *vq);
> > >+
> 
> >A couple of style comments:
> >
> >- It's better to arrange functions in such order that forward declarations
> >aren't necessary.  Since we don't have recursion, this should always be
> >possible.
> 
> >- continuation lines should be idented at least at the position of '('
> >on the previous line.
> 
> Thanks. I'd correct that.
> 
> >>  /* Expects to be always run from workqueue - which acts as
> >>   * read-size critical section for our kind of RCU. */
> >>  static void handle_tx(struct vhost_net *net)
> >> @@ -124,6 +132,8 @@ static void handle_tx(struct vhost_net *net)
> >>tx_poll_stop(net);
> >>hdr_size = vq->hdr_size;
> >>  
> >> +  handle_async_tx_events_notify(net, vq);
> > >+
> >>for (;;) {
> >>head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
> >> ARRAY_SIZE(vq->iov),
> > >@@ -151,6 +161,12 @@ static void handle_tx(struct vhost_net *net)
> >>/* Skip header. TODO: support TSO. */
> >>s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
> >>msg.msg_iovlen = out;
> > >+
> > >+  if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
> > >+  vq->head = head;
> > >+  msg.msg_control = (void *)vq;
> 
> >So here a device gets a pointer to vhost_virtqueue structure. If it gets
> >an iocb and invokes a callback, it would not care about vhost internals.
> 
> >> +  }
> >> +
> >>len = iov_length(vq->iov, out);
> >>/* Sanity check */
> >>if (!len) {
> >> @@ -166,6 +182,10 @@ static void handle_tx(struct vhost_net *net)
> >>tx_poll_start(net, sock);
> >>break;
> >>}
> >> +
> >> +  if (vq->link_state == VHOST_VQ_LINK_ASYNC)
> >> +  continue;
> >>+
> >>if (err != len)
> >>p

RE: [PATCH v1 2/3] Provides multiple submits and asynchronous notifications.

2010-03-17 Thread Xin, Xiaohui
>> Michael,
>> I don't use the kiocb comes from the sendmsg/recvmsg,
> >since I have embeded the kiocb in page_info structure,
> >and allocate it when page_info allocated.

>So what I suggested was that vhost allocates and tracks the iocbs, and
>passes them to your device with sendmsg/ recvmsg calls. This way your
>device won't need to share structures and locking strategy with vhost:
>you get an iocb, handle it, invoke a callback to notify vhost about
>completion.

>This also gets rid of the 'receiver' callback

I'm not sure receiver callback can be removed here:
The patch describes a work flow like this:
netif_receive_skb() gets the packet, it does nothing but just queue the skb
and wakeup the handle_rx() of vhost. handle_rx() then calls the receiver 
callback
to deal with the skb and get the necessary notify info into a list; vhost owns 
the 
list and in the same handle_rx() context use it to complete.

We use the "receiver" callback here because only handle_rx() is woken up from
netif_receive_skb(), and we need mp device context to deal with the skb and
notify info attached to it. We also have some lock in the callback function.

If I remove the receiver callback, I can only deal with the skb and notify
info in netif_receive_skb(), but this function runs in interrupt context,
where I think taking the lock is not allowed. But I cannot remove the lock there.


>> Please have a review and thanks for the instruction
>> for replying email which helps me a lot.
>> 
> >Thanks,
> >Xiaohui
> >
> > drivers/vhost/net.c   |  159 
> > +++--
>>  drivers/vhost/vhost.h |   12 
>>  2 files changed, 166 insertions(+), 5 deletions(-)
>> 
>> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> >index 22d5fef..5483848 100644
> >--- a/drivers/vhost/net.c
> >+++ b/drivers/vhost/net.c
> >@@ -17,11 +17,13 @@
> > #include 
> > #include 
> > #include 
> >+#include 
> > 
> > #include 
> > #include 
> > #include 
> > #include 
> >+#include 
> > 
> > #include 
> > 
> >@@ -91,6 +93,12 @@ static void tx_poll_start(struct vhost_net *net, struct 
> >socket *sock)
> > net->tx_poll_state = VHOST_NET_POLL_STARTED;
> > }
> > 
> >+static void handle_async_rx_events_notify(struct vhost_net *net,
> >+struct vhost_virtqueue *vq);
> >+
> >+static void handle_async_tx_events_notify(struct vhost_net *net,
> >+struct vhost_virtqueue *vq);
> >+

>A couple of style comments:
>
>- It's better to arrange functions in such order that forward declarations
>aren't necessary.  Since we don't have recursion, this should always be
>possible.

>- continuation lines should be idented at least at the position of '('
>on the previous line.

Thanks. I'd correct that.

>>  /* Expects to be always run from workqueue - which acts as
>>   * read-size critical section for our kind of RCU. */
>>  static void handle_tx(struct vhost_net *net)
>> @@ -124,6 +132,8 @@ static void handle_tx(struct vhost_net *net)
>>  tx_poll_stop(net);
>>  hdr_size = vq->hdr_size;
>>  
>> +handle_async_tx_events_notify(net, vq);
> >+
>>  for (;;) {
>>  head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
>>   ARRAY_SIZE(vq->iov),
> >@@ -151,6 +161,12 @@ static void handle_tx(struct vhost_net *net)
>>  /* Skip header. TODO: support TSO. */
>>  s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
>>  msg.msg_iovlen = out;
> >+
> >+if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
> >+vq->head = head;
> >+msg.msg_control = (void *)vq;

>So here a device gets a pointer to vhost_virtqueue structure. If it gets
>an iocb and invokes a callback, it would not care about vhost internals.

>> +}
>> +
>>  len = iov_length(vq->iov, out);
>>  /* Sanity check */
>>  if (!len) {
>> @@ -166,6 +182,10 @@ static void handle_tx(struct vhost_net *net)
>>  tx_poll_start(net, sock);
>>  break;
>>  }
>> +
>> +if (vq->link_state == VHOST_VQ_LINK_ASYNC)
>> +continue;
>>+
>>  if (err != len)
>>  pr_err("Truncated TX packet: "
>> " len %d != %zd\n", err, len);
>> @@ -177,6 +197,8 @@ static void handle_tx(struct vhost_net *net)
>>  }
>>  }
>>  
>> +handle_async_tx_events_notify(net, vq);
>> +
>>  mutex_unlock(&vq->mutex);
>>  unuse_mm(net->dev.mm);
>>  }
>>@@ -206,7 +228,8 @@ static void handle_rx(struct vhost_net *net)
>>  int err;
>>  size_t hdr_size;
>>  struct socket *sock = rcu_dereference(vq->private_data);
>> -if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
>> +if (!sock || (skb_queue_empty(&sock->sk->sk_receive_queue) &&
>> +vq->link_state == VHOST_VQ_LINK_SYNC))
>>   

Re: [PATCH v1 2/3] Provides multiple submits and asynchronous notifications.

2010-03-16 Thread Michael S. Tsirkin
On Tue, Mar 16, 2010 at 05:32:17PM +0800, Xin Xiaohui wrote:
> The vhost-net backend now only supports synchronous send/recv
> operations. The patch provides multiple submits and asynchronous
> notifications. This is needed for zero-copy case.
> 
> Signed-off-by: Xin Xiaohui 
> ---
> 
> Michael,
> I don't use the kiocb comes from the sendmsg/recvmsg,
> since I have embeded the kiocb in page_info structure,
> and allocate it when page_info allocated.

So what I suggested was that vhost allocates and tracks the iocbs, and
passes them to your device with sendmsg/ recvmsg calls. This way your
device won't need to share structures and locking strategy with vhost:
you get an iocb, handle it, invoke a callback to notify vhost about
completion.

This also gets rid of the 'receiver' callback.

> Please have a review and thanks for the instruction
> for replying email which helps me a lot.
> 
> Thanks,
> Xiaohui
> 
>  drivers/vhost/net.c   |  159 
> +++--
>  drivers/vhost/vhost.h |   12 
>  2 files changed, 166 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index 22d5fef..5483848 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c
> @@ -17,11 +17,13 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include 
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include 
>  
> @@ -91,6 +93,12 @@ static void tx_poll_start(struct vhost_net *net, struct 
> socket *sock)
>   net->tx_poll_state = VHOST_NET_POLL_STARTED;
>  }
>  
> +static void handle_async_rx_events_notify(struct vhost_net *net,
> + struct vhost_virtqueue *vq);
> +
> +static void handle_async_tx_events_notify(struct vhost_net *net,
> + struct vhost_virtqueue *vq);
> +

A couple of style comments:

- It's better to arrange functions in such order that forward declarations
aren't necessary.  Since we don't have recursion, this should always be
possible.

- continuation lines should be indented at least to the position of '('
on the previous line.

>  /* Expects to be always run from workqueue - which acts as
>   * read-size critical section for our kind of RCU. */
>  static void handle_tx(struct vhost_net *net)
> @@ -124,6 +132,8 @@ static void handle_tx(struct vhost_net *net)
>   tx_poll_stop(net);
>   hdr_size = vq->hdr_size;
>  
> + handle_async_tx_events_notify(net, vq);
> +
>   for (;;) {
>   head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
>ARRAY_SIZE(vq->iov),
> @@ -151,6 +161,12 @@ static void handle_tx(struct vhost_net *net)
>   /* Skip header. TODO: support TSO. */
>   s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
>   msg.msg_iovlen = out;
> +
> + if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
> + vq->head = head;
> + msg.msg_control = (void *)vq;

So here a device gets a pointer to vhost_virtqueue structure. If it gets
an iocb and invokes a callback, it would not care about vhost internals.

> + }
> +
>   len = iov_length(vq->iov, out);
>   /* Sanity check */
>   if (!len) {
> @@ -166,6 +182,10 @@ static void handle_tx(struct vhost_net *net)
>   tx_poll_start(net, sock);
>   break;
>   }
> +
> + if (vq->link_state == VHOST_VQ_LINK_ASYNC)
> + continue;
> +
>   if (err != len)
>   pr_err("Truncated TX packet: "
>  " len %d != %zd\n", err, len);
> @@ -177,6 +197,8 @@ static void handle_tx(struct vhost_net *net)
>   }
>   }
>  
> + handle_async_tx_events_notify(net, vq);
> +
>   mutex_unlock(&vq->mutex);
>   unuse_mm(net->dev.mm);
>  }
> @@ -206,7 +228,8 @@ static void handle_rx(struct vhost_net *net)
>   int err;
>   size_t hdr_size;
>   struct socket *sock = rcu_dereference(vq->private_data);
> - if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
> + if (!sock || (skb_queue_empty(&sock->sk->sk_receive_queue) &&
> + vq->link_state == VHOST_VQ_LINK_SYNC))
>   return;
>  
>   use_mm(net->dev.mm);
> @@ -214,9 +237,18 @@ static void handle_rx(struct vhost_net *net)
>   vhost_disable_notify(vq);
>   hdr_size = vq->hdr_size;
>  
> - vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
> + /* In async cases, for write logging, the simple way is to get
> +  * the log info always, and really logging is decided later.
> +  * Thus, when logging enabled, we can get log, and when logging
> +  * disabled, we can get log disabled accordingly.
> +  */
> +


This adds overhead and might be one of the reasons
your patch does not perform that well. A better way
would be to flush outsta

Re:[PATCH v1 2/3] Provides multiple submits and asynchronous notifications.

2010-03-16 Thread Xin Xiaohui
The vhost-net backend now only supports synchronous send/recv
operations. The patch provides multiple submits and asynchronous
notifications. This is needed for zero-copy case.

Signed-off-by: Xin Xiaohui 
---

Michael,
I don't use the kiocb comes from the sendmsg/recvmsg,
since I have embeded the kiocb in page_info structure,
and allocate it when page_info allocated.
Please have a review and thanks for the instruction
for replying email which helps me a lot.

Thanks,
Xiaohui

 drivers/vhost/net.c   |  159 +++--
 drivers/vhost/vhost.h |   12 
 2 files changed, 166 insertions(+), 5 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 22d5fef..5483848 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -17,11 +17,13 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -91,6 +93,12 @@ static void tx_poll_start(struct vhost_net *net, struct 
socket *sock)
net->tx_poll_state = VHOST_NET_POLL_STARTED;
 }
 
+static void handle_async_rx_events_notify(struct vhost_net *net,
+   struct vhost_virtqueue *vq);
+
+static void handle_async_tx_events_notify(struct vhost_net *net,
+   struct vhost_virtqueue *vq);
+
 /* Expects to be always run from workqueue - which acts as
  * read-size critical section for our kind of RCU. */
 static void handle_tx(struct vhost_net *net)
@@ -124,6 +132,8 @@ static void handle_tx(struct vhost_net *net)
tx_poll_stop(net);
hdr_size = vq->hdr_size;
 
+   handle_async_tx_events_notify(net, vq);
+
for (;;) {
head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
 ARRAY_SIZE(vq->iov),
@@ -151,6 +161,12 @@ static void handle_tx(struct vhost_net *net)
/* Skip header. TODO: support TSO. */
s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
msg.msg_iovlen = out;
+
+   if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
+   vq->head = head;
+   msg.msg_control = (void *)vq;
+   }
+
len = iov_length(vq->iov, out);
/* Sanity check */
if (!len) {
@@ -166,6 +182,10 @@ static void handle_tx(struct vhost_net *net)
tx_poll_start(net, sock);
break;
}
+
+   if (vq->link_state == VHOST_VQ_LINK_ASYNC)
+   continue;
+
if (err != len)
pr_err("Truncated TX packet: "
   " len %d != %zd\n", err, len);
@@ -177,6 +197,8 @@ static void handle_tx(struct vhost_net *net)
}
}
 
+   handle_async_tx_events_notify(net, vq);
+
mutex_unlock(&vq->mutex);
unuse_mm(net->dev.mm);
 }
@@ -206,7 +228,8 @@ static void handle_rx(struct vhost_net *net)
int err;
size_t hdr_size;
struct socket *sock = rcu_dereference(vq->private_data);
-   if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
+   if (!sock || (skb_queue_empty(&sock->sk->sk_receive_queue) &&
+   vq->link_state == VHOST_VQ_LINK_SYNC))
return;
 
use_mm(net->dev.mm);
@@ -214,9 +237,18 @@ static void handle_rx(struct vhost_net *net)
vhost_disable_notify(vq);
hdr_size = vq->hdr_size;
 
-   vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
+   /* In async cases, for write logging, the simple way is to get
+* the log info always, and really logging is decided later.
+* Thus, when logging enabled, we can get log, and when logging
+* disabled, we can get log disabled accordingly.
+*/
+
+   vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) |
+   (vq->link_state == VHOST_VQ_LINK_ASYNC) ?
vq->log : NULL;
 
+   handle_async_rx_events_notify(net, vq);
+
for (;;) {
head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
 ARRAY_SIZE(vq->iov),
@@ -245,6 +277,11 @@ static void handle_rx(struct vhost_net *net)
s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
msg.msg_iovlen = in;
len = iov_length(vq->iov, in);
+   if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
+   vq->head = head;
+   vq->_log = log;
+   msg.msg_control = (void *)vq;
+   }
/* Sanity check */
if (!len) {
vq_err(vq, "Unexpected header len for RX: "
@@ -259,6 +296,10 @@ static void handle_rx(struct vhost_net *net)
vhost_discard_vq_desc(vq);
break;
}
+
+  

Re: [PATCH v1 2/3] Provides multiple submits and asynchronous notifications.

2010-03-15 Thread Michael S. Tsirkin
On Mon, Mar 15, 2010 at 04:46:50PM +0800, Xin, Xiaohui wrote:
> >> +/* The structure to notify the virtqueue for async socket */
> >> +struct vhost_notifier {
> >> +  struct list_head list;
> > >+  struct vhost_virtqueue *vq;
> > >+  int head;
> > >+  int size;
> > >+  int log;
> > >+  void *ctrl;
> > >+  void (*dtor)(struct vhost_notifier *);
> > >+};
> > >+
> 
> >So IMO, this is not the best interface between vhost
> >and your driver, exposing them to each other unnecessarily.
> >
> >If you think about it, your driver should not care about this structure.
> >It could get e.g. a kiocb (sendmsg already gets one), and call ki_dtor
> >on completion.  vhost could save it's state in ki_user_data.  If your
> >driver needs to add more data to do more tracking, I think it can put
> >skb pointer in the private pointer.
> 
> Then if I remove the struct vhost_notifier, and just use struct kiocb, but 
> don't use the one got from sendmsg or recvmsg, but allocated within the 
> page_info structure, and don't implement any aio logic related to it, is that 
> ok?

Hmm, not sure I understand.  It seems both cleaner and easier to use the
iocb passed to sendmsg/recvmsg. No? I am not saying you necessarily must
implement full aio directly.

> Sorry, I made a patch, but don't know how to reply mail with a good formatted 
> patch here
> 
> Thanks
> Xiaohui

Maybe Documentation/email-clients.txt will help?
Generally you do it like this (at start of mail):

Subject: one line patch summary (overrides mail subject)

multiline patch description

Signed-off-by: <...>

---

Free text comes after the --- delimiter, before the patch.

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index a140dad..e830b30 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -106,22 +106,41 @@ static void handle_tx(struct vhost_net *net)




-- 
MST
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH v1 2/3] Provides multiple submits and asynchronous notifications.

2010-03-15 Thread Xin, Xiaohui
>> +/* The structure to notify the virtqueue for async socket */
>> +struct vhost_notifier {
>> +struct list_head list;
> >+struct vhost_virtqueue *vq;
> >+int head;
> >+int size;
> >+int log;
> >+void *ctrl;
> >+void (*dtor)(struct vhost_notifier *);
> >+};
> >+

>So IMO, this is not the best interface between vhost
>and your driver, exposing them to each other unnecessarily.
>
>If you think about it, your driver should not care about this structure.
>It could get e.g. a kiocb (sendmsg already gets one), and call ki_dtor
>on completion.  vhost could save it's state in ki_user_data.  If your
>driver needs to add more data to do more tracking, I think it can put
>skb pointer in the private pointer.

Then if I remove the struct vhost_notifier, and just use struct kiocb, but 
don't use the one got from sendmsg or recvmsg, but allocated within the 
page_info structure, and don't implement any aio logic related to it, is that 
ok?

Sorry, I made a patch, but don't know how to reply mail with a good formatted 
patch here

Thanks
Xiaohui
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v1 2/3] Provides multiple submits and asynchronous notifications.

2010-03-07 Thread Michael S. Tsirkin
> +/* The structure to notify the virtqueue for async socket */
> +struct vhost_notifier {
> + struct list_head list;
> + struct vhost_virtqueue *vq;
> + int head;
> + int size;
> + int log;
> + void *ctrl;
> + void (*dtor)(struct vhost_notifier *);
> +};
> +

So IMO, this is not the best interface between vhost
and your driver, exposing them to each other unnecessarily.

If you think about it, your driver should not care about this structure.
It could get e.g. a kiocb (sendmsg already gets one), and call ki_dtor
on completion.  vhost could save it's state in ki_user_data.  If your
driver needs to add more data to do more tracking, I think it can put
skb pointer in the private pointer.

-- 
MST
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v1 2/3] Provides multiple submits and asynchronous notifications.

2010-03-06 Thread xiaohui . xin
From: Xin Xiaohui 

The vhost-net backend now only supports synchronous send/recv
operations. The patch provides multiple submits and asynchronous
notifications. This is needed for zero-copy case.

Signed-off-by: Xin Xiaohui 
---
 drivers/vhost/net.c   |  156 +++--
 drivers/vhost/vhost.h |   23 +++
 2 files changed, 174 insertions(+), 5 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 22d5fef..24a6c3d 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -91,6 +92,12 @@ static void tx_poll_start(struct vhost_net *net, struct 
socket *sock)
net->tx_poll_state = VHOST_NET_POLL_STARTED;
 }
 
+static void handle_async_rx_events_notify(struct vhost_net *net,
+   struct vhost_virtqueue *vq);
+
+static void handle_async_tx_events_notify(struct vhost_net *net,
+   struct vhost_virtqueue *vq);
+
 /* Expects to be always run from workqueue - which acts as
  * read-size critical section for our kind of RCU. */
 static void handle_tx(struct vhost_net *net)
@@ -124,6 +131,8 @@ static void handle_tx(struct vhost_net *net)
tx_poll_stop(net);
hdr_size = vq->hdr_size;
 
+   handle_async_tx_events_notify(net, vq);
+
for (;;) {
head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
 ARRAY_SIZE(vq->iov),
@@ -151,6 +160,12 @@ static void handle_tx(struct vhost_net *net)
/* Skip header. TODO: support TSO. */
s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
msg.msg_iovlen = out;
+
+   if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
+   vq->head = head;
+   msg.msg_control = (void *)vq;
+   }
+
len = iov_length(vq->iov, out);
/* Sanity check */
if (!len) {
@@ -166,6 +181,10 @@ static void handle_tx(struct vhost_net *net)
tx_poll_start(net, sock);
break;
}
+
+   if (vq->link_state == VHOST_VQ_LINK_ASYNC)
+   continue;
+
if (err != len)
pr_err("Truncated TX packet: "
   " len %d != %zd\n", err, len);
@@ -177,6 +196,8 @@ static void handle_tx(struct vhost_net *net)
}
}
 
+   handle_async_tx_events_notify(net, vq);
+
mutex_unlock(&vq->mutex);
unuse_mm(net->dev.mm);
 }
@@ -206,7 +227,8 @@ static void handle_rx(struct vhost_net *net)
int err;
size_t hdr_size;
struct socket *sock = rcu_dereference(vq->private_data);
-   if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
+   if (!sock || (skb_queue_empty(&sock->sk->sk_receive_queue) &&
+   vq->link_state == VHOST_VQ_LINK_SYNC))
return;
 
use_mm(net->dev.mm);
@@ -214,9 +236,18 @@ static void handle_rx(struct vhost_net *net)
vhost_disable_notify(vq);
hdr_size = vq->hdr_size;
 
-   vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
+   /* In async cases, for write logging, the simple way is to get
+* the log info always, and really logging is decided later.
+* Thus, when logging enabled, we can get log, and when logging
+* disabled, we can get log disabled accordingly.
+*/
+
+   vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) |
+   (vq->link_state == VHOST_VQ_LINK_ASYNC) ?
vq->log : NULL;
 
+   handle_async_rx_events_notify(net, vq);
+
for (;;) {
head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
 ARRAY_SIZE(vq->iov),
@@ -245,6 +276,11 @@ static void handle_rx(struct vhost_net *net)
s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
msg.msg_iovlen = in;
len = iov_length(vq->iov, in);
+   if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
+   vq->head = head;
+   vq->_log = log;
+   msg.msg_control = (void *)vq;
+   }
/* Sanity check */
if (!len) {
vq_err(vq, "Unexpected header len for RX: "
@@ -259,6 +295,10 @@ static void handle_rx(struct vhost_net *net)
vhost_discard_vq_desc(vq);
break;
}
+
+   if (vq->link_state == VHOST_VQ_LINK_ASYNC)
+   continue;
+
/* TODO: Should check and handle checksum. */
if (err > len) {
pr_err("Discarded truncated rx packet: "
@@ -284,10 +324,83 @@ static void handle_rx(struct vhost_net