Re:[RFC][PATCH v3 1/3] A device for zero-copy based on KVM virtio-net.

2010-04-22 Thread xiaohui . xin
From: Xin Xiaohui 

Add a device to utilize the vhost-net backend driver for
copy-less data transfer between the guest frontend and the host NIC.
It pins the guest user-space buffers into host memory and
provides proto_ops such as sendmsg/recvmsg to vhost-net.

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---

Michael,
Thanks. I have updated the patch with your suggestions.
It looks much cleaner now. Please review.

Thanks
Xiaohui

 drivers/vhost/Kconfig |   10 +
 drivers/vhost/Makefile|2 +
 drivers/vhost/mpassthru.c | 1239 +
 include/linux/mpassthru.h |   29 +
 4 files changed, 1280 insertions(+), 0 deletions(-)
 create mode 100644 drivers/vhost/mpassthru.c
 create mode 100644 include/linux/mpassthru.h

diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index 9f409f4..91806b1 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -9,3 +9,13 @@ config VHOST_NET
  To compile this driver as a module, choose M here: the module will
  be called vhost_net.
 
+config MEDIATE_PASSTHRU
+   tristate "mediate passthru network driver (EXPERIMENTAL)"
+   depends on VHOST_NET
+   ---help---
+ zero-copy network I/O support. We call it mediate passthru to
+ distinguish it from hardware passthru.
+
+ To compile this driver as a module, choose M here: the module will
+ be called mpassthru.
+
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index 72dd020..c18b9fc 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -1,2 +1,4 @@
 obj-$(CONFIG_VHOST_NET) += vhost_net.o
 vhost_net-y := vhost.o net.o
+
+obj-$(CONFIG_MEDIATE_PASSTHRU) += mpassthru.o
diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
new file mode 100644
index 000..cc99b14
--- /dev/null
+++ b/drivers/vhost/mpassthru.c
@@ -0,0 +1,1239 @@
+/*
+ *  MPASSTHRU - Mediate passthrough device.
+ *  Copyright (C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ */
+
+#define DRV_NAME"mpassthru"
+#define DRV_DESCRIPTION "Mediate passthru device driver"
+#define DRV_COPYRIGHT   "(C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G"
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+/* Uncomment to enable debugging */
+/* #define MPASSTHRU_DEBUG 1 */
+
+#ifdef MPASSTHRU_DEBUG
+static int debug;
+
+#define DBG  if (mp->debug) printk
+#define DBG1 if (debug == 2) printk
+#else
+#define DBG(a...)
+#define DBG1(a...)
+#endif
+
+#define COPY_THRESHOLD (L1_CACHE_BYTES * 4)
+#define COPY_HDR_LEN   (L1_CACHE_BYTES < 64 ? 64 : L1_CACHE_BYTES)
+
+struct frag {
+   u16 offset;
+   u16 size;
+};
+
+struct page_ctor {
+   struct list_headreadq;
+   int w_len;
+   int r_len;
+   spinlock_t  read_lock;
+   struct kmem_cache   *cache;
+   /* record the locked pages */
+   int lock_pages;
+   struct rlimit   o_rlim;
+   struct net_device   *dev;
+   struct mpassthru_port   port;
+};
+
+struct page_info {
+   struct list_headlist;
+   int header;
+   /* indicate the actual length of bytes
+* sent/received in the user-space buffers
+*/
+   int total;
+   int offset;
+   struct page *pages[MAX_SKB_FRAGS+1];
+   struct skb_frag_struct  frag[MAX_SKB_FRAGS+1];
+   struct sk_buff  *skb;
+   struct page_ctor*ctor;
+
+   /* The pointer relayed to the skb, to indicate
+* whether it is backed by user-space or kernel buffers
+*/
+   struct skb_user_pageuser;
+   struct skb_shared_info  ushinfo;
+
+#define INFO_READ  0
+#define INFO_WRITE 1
+   unsignedflags;
+   unsignedpnum;
+
+   /* It's meaningful for receive, means
+* the max length allowed
+*/
+   size_t  len;
+
+   /* The fields below are for the backend
+* driver, currently vhost-net.
+*/
+
+   struct kiocb*iocb;
+  

Re:[RFC][PATCH v3 2/3] Provides multiple submits and asynchronous notifications.

2010-04-22 Thread xiaohui . xin
From: Xin Xiaohui 

The vhost-net backend now only supports synchronous send/recv
operations. The patch provides multiple submits and asynchronous
notifications. This is needed for the zero-copy case.

Signed-off-by: Xin Xiaohui 
---

Michael,

>Can't vhost supply a kiocb completion callback that will handle the list?

Yes, thanks. And with it I also remove the vq->receiver finally.

Thanks
Xiaohui

 drivers/vhost/net.c   |  227 +++--
 drivers/vhost/vhost.c |  115 ++---
 drivers/vhost/vhost.h |   14 +++
 3 files changed, 301 insertions(+), 55 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 22d5fef..4a70f66 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -17,11 +17,13 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -47,6 +49,7 @@ struct vhost_net {
struct vhost_dev dev;
struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
struct vhost_poll poll[VHOST_NET_VQ_MAX];
+   struct kmem_cache   *cache;
/* Tells us whether we are polling a socket for TX.
 * We only do this when socket buffer fills up.
 * Protected by tx vq lock. */
@@ -91,11 +94,132 @@ static void tx_poll_start(struct vhost_net *net, struct socket *sock)
net->tx_poll_state = VHOST_NET_POLL_STARTED;
 }
 
+struct kiocb *notify_dequeue(struct vhost_virtqueue *vq)
+{
+   struct kiocb *iocb = NULL;
+   unsigned long flags;
+
+   spin_lock_irqsave(&vq->notify_lock, flags);
+   if (!list_empty(&vq->notifier)) {
+   iocb = list_first_entry(&vq->notifier,
+   struct kiocb, ki_list);
+   list_del(&iocb->ki_list);
+   }
+   spin_unlock_irqrestore(&vq->notify_lock, flags);
+   return iocb;
+}
+
+static void handle_iocb(struct kiocb *iocb)
+{
+   struct vhost_virtqueue *vq = iocb->private;
+   unsigned long flags;
+
+   spin_lock_irqsave(&vq->notify_lock, flags);
+   list_add_tail(&iocb->ki_list, &vq->notifier);
+   spin_unlock_irqrestore(&vq->notify_lock, flags);
+}
+
+static void handle_async_rx_events_notify(struct vhost_net *net,
+struct vhost_virtqueue *vq,
+struct socket *sock)
+{
+   struct kiocb *iocb = NULL;
+   struct vhost_log *vq_log = NULL;
+   int rx_total_len = 0;
+   unsigned int head, log, in, out;
+   int size;
+
+   if (vq->link_state != VHOST_VQ_LINK_ASYNC)
+   return;
+
+   if (sock->sk->sk_data_ready)
+   sock->sk->sk_data_ready(sock->sk, 0);
+
+   vq_log = unlikely(vhost_has_feature(
+   &net->dev, VHOST_F_LOG_ALL)) ? vq->log : NULL;
+   while ((iocb = notify_dequeue(vq)) != NULL) {
+   vhost_add_used_and_signal(&net->dev, vq,
+   iocb->ki_pos, iocb->ki_nbytes);
+   log = (int)(iocb->ki_user_data >> 32);
+   size = iocb->ki_nbytes;
+   head = iocb->ki_pos;
+   rx_total_len += iocb->ki_nbytes;
+
+   if (iocb->ki_dtor)
+   iocb->ki_dtor(iocb);
+   kmem_cache_free(net->cache, iocb);
+
+   /* when logging is enabled, the log info must be recomputed,
+* since these buffers sat in the async queue and may not have
+* obtained the log info earlier.
+*/
+   if (unlikely(vq_log)) {
+   if (!log)
+   __vhost_get_vq_desc(&net->dev, vq, vq->iov,
+   ARRAY_SIZE(vq->iov),
+   &out, &in, vq_log,
+   &log, head);
+   vhost_log_write(vq, vq_log, log, size);
+   }
+   if (unlikely(rx_total_len >= VHOST_NET_WEIGHT)) {
+   vhost_poll_queue(&vq->poll);
+   break;
+   }
+   }
+}
+
+static void handle_async_tx_events_notify(struct vhost_net *net,
+   struct vhost_virtqueue *vq)
+{
+   struct kiocb *iocb = NULL;
+   int tx_total_len = 0;
+
+   if (vq->link_state != VHOST_VQ_LINK_ASYNC)
+   return;
+
+   while ((iocb = notify_dequeue(vq)) != NULL) {
+   vhost_add_used_and_signal(&net->dev, vq,
+   iocb->ki_pos, 0);
+   tx_total_len += iocb->ki_nbytes;
+
+   if (iocb->ki_dtor)
+   iocb->ki_dtor(iocb);
+
+   kmem_cache_free(net->cache, iocb);
+   if (unlikely(tx_total_len >= VHOST_NET_WEIGHT)) {
+   vhost_poll_queue(&vq->poll);
+   break;
+   }
+   }
+}
+
+static struct kiocb *

Re:[RFC][PATCH v3 2/3] Provides multiple submits and asynchronous notifications.

2010-04-23 Thread xiaohui . xin
From: Xin Xiaohui 

The vhost-net backend now only supports synchronous send/recv
operations. The patch provides multiple submits and asynchronous
notifications. This is needed for the zero-copy case.

Signed-off-by: Xin Xiaohui 
---

Michael,
>>>Can't vhost supply a kiocb completion callback that will handle the list?
>>Yes, thanks. And with it I also remove the vq->receiver finally.
>>Thanks
>>Xiaohui

>Nice progress. I commented on some minor issues below.
>Thanks!

The updated patch addressed your comments on the minor issues.
Thanks!

Thanks
Xiaohui  

 drivers/vhost/net.c   |  236 +++-
 drivers/vhost/vhost.c |  120 ++---
 drivers/vhost/vhost.h |   14 +++
 3 files changed, 314 insertions(+), 56 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 38989d1..18f6c41 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -23,6 +23,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include 
 
@@ -48,6 +50,7 @@ struct vhost_net {
struct vhost_dev dev;
struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
struct vhost_poll poll[VHOST_NET_VQ_MAX];
+   struct kmem_cache   *cache;
/* Tells us whether we are polling a socket for TX.
 * We only do this when socket buffer fills up.
 * Protected by tx vq lock. */
@@ -92,11 +95,138 @@ static void tx_poll_start(struct vhost_net *net, struct socket *sock)
net->tx_poll_state = VHOST_NET_POLL_STARTED;
 }
 
+struct kiocb *notify_dequeue(struct vhost_virtqueue *vq)
+{
+   struct kiocb *iocb = NULL;
+   unsigned long flags;
+
+   spin_lock_irqsave(&vq->notify_lock, flags);
+   if (!list_empty(&vq->notifier)) {
+   iocb = list_first_entry(&vq->notifier,
+   struct kiocb, ki_list);
+   list_del(&iocb->ki_list);
+   }
+   spin_unlock_irqrestore(&vq->notify_lock, flags);
+   return iocb;
+}
+
+static void handle_iocb(struct kiocb *iocb)
+{
+   struct vhost_virtqueue *vq = iocb->private;
+   unsigned long flags;
+
+   spin_lock_irqsave(&vq->notify_lock, flags);
+   list_add_tail(&iocb->ki_list, &vq->notifier);
+   spin_unlock_irqrestore(&vq->notify_lock, flags);
+}
+
+static int is_async_vq(struct vhost_virtqueue *vq)
+{
+   return (vq->link_state == VHOST_VQ_LINK_ASYNC);
+}
+
+static void handle_async_rx_events_notify(struct vhost_net *net,
+ struct vhost_virtqueue *vq,
+ struct socket *sock)
+{
+   struct kiocb *iocb = NULL;
+   struct vhost_log *vq_log = NULL;
+   int rx_total_len = 0;
+   unsigned int head, log, in, out;
+   int size;
+
+   if (!is_async_vq(vq))
+   return;
+
+   if (sock->sk->sk_data_ready)
+   sock->sk->sk_data_ready(sock->sk, 0);
+
+   vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
+   vq->log : NULL;
+
+   while ((iocb = notify_dequeue(vq)) != NULL) {
+   vhost_add_used_and_signal(&net->dev, vq,
+   iocb->ki_pos, iocb->ki_nbytes);
+   size = iocb->ki_nbytes;
+   head = iocb->ki_pos;
+   rx_total_len += iocb->ki_nbytes;
+
+   if (iocb->ki_dtor)
+   iocb->ki_dtor(iocb);
+   kmem_cache_free(net->cache, iocb);
+
+   /* when logging is enabled, the log info must be recomputed,
+* since these buffers sat in the async queue and may not have
+* obtained the log info earlier.
+*/
+   if (unlikely(vq_log)) {
+   if (!log)
+   __vhost_get_vq_desc(&net->dev, vq, vq->iov,
+   ARRAY_SIZE(vq->iov),
+   &out, &in, vq_log,
+   &log, head);
+   vhost_log_write(vq, vq_log, log, size);
+   }
+   if (unlikely(rx_total_len >= VHOST_NET_WEIGHT)) {
+   vhost_poll_queue(&vq->poll);
+   break;
+   }
+   }
+}
+
+static void handle_async_tx_events_notify(struct vhost_net *net,
+ struct vhost_virtqueue *vq)
+{
+   struct kiocb *iocb = NULL;
+   int tx_total_len = 0;
+
+   if (!is_async_vq(vq))
+   return;
+
+   while ((iocb = notify_dequeue(vq)) != NULL) {
+   vhost_add_used_and_signal(&net->dev, vq,
+   iocb->ki_pos, 0);
+   tx_total_len += iocb->ki_nbytes;
+
+   if (iocb->ki_dtor)
+   iocb->ki_dtor(iocb);
+
+   kmem_cache_free(net->cache, iocb);
+   if (unlikely(tx_total_len >= VHOST_NET_WEIGHT)) {
+  

[RFC][PATCH v4 01/18] Add a new struct for device to manipulate external buffer.

2010-04-25 Thread xiaohui . xin
From: Xin Xiaohui 

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 include/linux/netdevice.h |   19 ++-
 1 files changed, 18 insertions(+), 1 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c79a88b..bf79756 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -530,6 +530,22 @@ struct netdev_queue {
unsigned long   tx_dropped;
 } cacheline_aligned_in_smp;
 
+/* Add a new field, mp_port, to struct net_device, for mediate
+ * passthru (zero-copy). It holds the capabilities reported by the
+ * net device driver, a socket, and an external buffer constructor;
+ * external means the skb buffers attached to the device may not be
+ * allocated from kernel space.
+ */
+struct mpassthru_port  {
+   int hdr_len;
+   int data_len;
+   int npages;
+   unsignedflags;
+   struct socket   *sock;
+   struct skb_external_page *(*ctor)(struct mpassthru_port *,
+   struct sk_buff *, int);
+};
 
 /*
  * This structure defines the management hooks for network devices.
@@ -952,7 +968,8 @@ struct net_device {
struct macvlan_port *macvlan_port;
/* GARP */
struct garp_port*garp_port;
-
+   /* mpassthru */
+   struct mpassthru_port   *mp_port;
/* class/net/name entry */
struct device   dev;
/* space for optional device, statistics, and wireless sysfs groups */
-- 
1.5.4.4
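To make the ctor contract above concrete: given an skb and a page count, the
callback hands back a struct skb_external_page describing externally owned
memory (start, size, a frag array, a saved shinfo pointer and a dtor), whose
definition is added elsewhere in this series. Below is a minimal sketch of
such a constructor/destructor pair; it backs the buffer with alloc_page()
only to stay self-contained, whereas the real constructor in the mp device
(patches 14/15) hands out pinned guest pages and also clears skb->head in its
destructor before the core frees the skb. Names prefixed example_ are
illustrative and not part of the series.

#include <linux/gfp.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/slab.h>

struct example_buf {
        struct skb_external_page ext;      /* descriptor handed back to the core */
        struct skb_shared_info   shinfo;   /* __alloc_skb() snapshots shinfo here */
};

static void example_dtor(struct skb_external_page *ext)
{
        struct example_buf *buf = container_of(ext, struct example_buf, ext);

        put_page(virt_to_page(ext->start));
        kfree(buf);
}

static struct skb_external_page *example_ctor(struct mpassthru_port *port,
                                              struct sk_buff *skb, int npages)
{
        struct example_buf *buf;
        struct page *page;

        if (npages != 1)                   /* keep the sketch to one page */
                return NULL;

        buf = kzalloc(sizeof(*buf), GFP_ATOMIC);
        if (!buf)
                return NULL;

        page = alloc_page(GFP_ATOMIC);
        if (!page) {
                kfree(buf);
                return NULL;
        }

        buf->ext.start   = page_address(page);
        buf->ext.size    = PAGE_SIZE;
        buf->ext.ushinfo = &buf->shinfo;
        buf->ext.dtor    = example_dtor;
        return &buf->ext;
}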



[RFC][PATCH v4 03/18] Add a ndo_mp_port_prep pointer to net_device_ops.

2010-04-25 Thread xiaohui . xin
From: Xin Xiaohui 

If the driver wants to allocate external buffers,
it can export its capabilities, such as the skb
buffer header length, the page length that can be DMA'd, etc.
The external buffer owner may utilize this.

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 include/linux/netdevice.h |4 
 1 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5c473fb..3a1583b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -707,6 +707,10 @@ struct net_device_ops {
int (*ndo_fcoe_get_wwn)(struct net_device *dev,
u64 *wwn, int type);
 #endif
+#if defined(CONFIG_MEDIATE_PASSTHRU) || defined(CONFIG_MEDIATE_PASSTHRU_MODULE)
+   int (*ndo_mp_port_prep)(struct net_device *dev,
+   struct mpassthru_port *port);
+#endif
 };
 
 /*
-- 
1.5.4.4
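For illustration, a packet-split-capable NIC driver might implement the new
hook roughly as below. The numbers are made up and the example_ names are not
part of the series; the core's fallback defaults appear in the next patch.

#include <linux/netdevice.h>

/* Hypothetical driver callback: report how the hardware splits packets.
 * hdr_len is the header area the HW writes, data_len the payload bytes one
 * rx descriptor can carry, npages how many pages back one payload buffer. */
static int example_mp_port_prep(struct net_device *dev,
                                struct mpassthru_port *port)
{
        port->hdr_len  = 128;
        port->data_len = 4096;
        port->npages   = 1;
        return 0;
}

/* The driver would then wire this up in its net_device_ops table, e.g.
 *        .ndo_mp_port_prep = example_mp_port_prep,
 */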



[RFC][PATCH v4 04/18] Add a function for the external buffer owner to query capabilities.

2010-04-25 Thread xiaohui . xin
From: Xin Xiaohui 

The external buffer owner can use this function to query
the capabilities of the underlying NIC driver.

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 include/linux/netdevice.h |2 +
 net/core/dev.c|   51 +
 2 files changed, 53 insertions(+), 0 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 3a1583b..2f9a4f2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1599,6 +1599,8 @@ extern gro_result_t   napi_gro_frags(struct napi_struct *napi);
 extern int netdev_mp_port_attach(struct net_device *dev,
 struct mpassthru_port *port);
 extern void netdev_mp_port_detach(struct net_device *dev);
+int netdev_mp_port_prep(struct net_device *dev,
+   struct mpassthru_port *port);
 
 static inline void napi_free_frags(struct napi_struct *napi)
 {
diff --git a/net/core/dev.c b/net/core/dev.c
index 6a73fc7..4972bc4 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2492,6 +2492,57 @@ void netdev_mp_port_detach(struct net_device *dev)
 }
 EXPORT_SYMBOL(netdev_mp_port_detach);
 
+/* To support mediate passthru (zero-copy) with a NIC driver,
+ * we had better query the NIC driver for the capabilities it can
+ * provide, especially for packet-split mode. For now we only
+ * query the header size and the payload size a descriptor
+ * may carry. If a driver does not use the API to export them,
+ * we fall back to default values; currently the defaults are
+ * taken from the IGB driver. For now this is only called by
+ * the mpassthru device.
+ */
+#if defined(CONFIG_MEDIATE_PASSTHRU) || defined(CONFIG_MEDIATE_PASSTHRU_MODULE)
+int netdev_mp_port_prep(struct net_device *dev,
+   struct mpassthru_port *port)
+{
+   int rc;
+   int npages, data_len;
+   const struct net_device_ops *ops = dev->netdev_ops;
+
+   /* needed by packet split */
+
+   if (ops->ndo_mp_port_prep) {
+   rc = ops->ndo_mp_port_prep(dev, port);
+   if (rc)
+   return rc;
+   } else {
+   /* If the NIC driver did not report this,
+* then we try to use default value.
+*/
+   port->hdr_len = 128;
+   port->data_len = 2048;
+   port->npages = 1;
+   }
+
+   if (port->hdr_len <= 0)
+   goto err;
+
+   npages = port->npages;
+   data_len = port->data_len;
+   if (npages <= 0 || npages > MAX_SKB_FRAGS ||
+   (data_len < PAGE_SIZE * (npages - 1) ||
+data_len > PAGE_SIZE * npages))
+   goto err;
+
+   return 0;
+err:
+   dev_warn(&dev->dev, "invalid page constructor parameters\n");
+
+   return -EINVAL;
+}
+EXPORT_SYMBOL(netdev_mp_port_prep);
+#endif
+
 /**
  * netif_receive_skb - process receive buffer from network
  * @skb: buffer to process
-- 
1.5.4.4
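For concreteness, the sanity check above accepts PAGE_SIZE * (npages - 1) <=
data_len <= PAGE_SIZE * npages: with 4 KiB pages and npages = 2, any data_len
from 4096 to 8192 passes, and the IGB-derived fallback (hdr_len 128, data_len
2048, npages 1) passes as well, since 0 <= 2048 <= 4096.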



[RFC][PATCH v4 05/18] Add a function to indicate whether a device uses external buffers.

2010-04-25 Thread xiaohui . xin
From: Xin Xiaohui 

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 include/linux/netdevice.h |7 +++
 1 files changed, 7 insertions(+), 0 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2f9a4f2..a1a2aaf 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1602,6 +1602,13 @@ extern void netdev_mp_port_detach(struct net_device *dev);
 int netdev_mp_port_prep(struct net_device *dev,
struct mpassthru_port *port);
 
+static inline int dev_is_mpassthru(struct net_device *dev)
+{
+   if (dev && dev->mp_port)
+   return 1;
+   return 0;
+}
+
 static inline void napi_free_frags(struct napi_struct *napi)
 {
kfree_skb(napi->skb);
-- 
1.5.4.4



[RFC][PATCH v4 02/18] Export two functions for a device to assign/deassign the new structure.

2010-04-25 Thread xiaohui . xin
From: Xin Xiaohui 

Export two functions for a device to assign/deassign the new structure

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 include/linux/netdevice.h |3 +++
 net/core/dev.c|   28 
 2 files changed, 31 insertions(+), 0 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index bf79756..5c473fb 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1592,6 +1592,9 @@ extern gro_result_t   napi_frags_finish(struct napi_struct *napi,
  gro_result_t ret);
 extern struct sk_buff *napi_frags_skb(struct napi_struct *napi);
 extern gro_result_tnapi_gro_frags(struct napi_struct *napi);
+extern int netdev_mp_port_attach(struct net_device *dev,
+struct mpassthru_port *port);
+extern void netdev_mp_port_detach(struct net_device *dev);
 
 static inline void napi_free_frags(struct napi_struct *napi)
 {
diff --git a/net/core/dev.c b/net/core/dev.c
index e5972f7..6a73fc7 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2464,6 +2464,34 @@ void netif_nit_deliver(struct sk_buff *skb)
rcu_read_unlock();
 }
 
+/* Export two functions to assign/de-assign mp_port pointer
+ * to a net device.
+ */
+
+int netdev_mp_port_attach(struct net_device *dev,
+   struct mpassthru_port *port)
+{
+   /* locked by mp_mutex */
+   if (rcu_dereference(dev->mp_port))
+   return -EBUSY;
+
+   rcu_assign_pointer(dev->mp_port, port);
+
+   return 0;
+}
+EXPORT_SYMBOL(netdev_mp_port_attach);
+
+void netdev_mp_port_detach(struct net_device *dev)
+{
+   /* locked by mp_mutex */
+   if (!rcu_dereference(dev->mp_port))
+   return;
+
+   rcu_assign_pointer(dev->mp_port, NULL);
+   synchronize_rcu();
+}
+EXPORT_SYMBOL(netdev_mp_port_detach);
+
 /**
  * netif_receive_skb - process receive buffer from network
  * @skb: buffer to process
-- 
1.5.4.4
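A sketch of how the mp device is expected to use this attach/detach pair when
binding and unbinding a NIC. The real call sites are in page_ctor_attach() and
page_ctor_detach() (patch 14); the mutex and error handling below are
illustrative only, assuming the same mp_mutex the comments above refer to.

#include <linux/mutex.h>
#include <linux/netdevice.h>

static DEFINE_MUTEX(mp_mutex);

static int example_bind(struct net_device *dev, struct mpassthru_port *port)
{
        int rc;

        mutex_lock(&mp_mutex);
        rc = netdev_mp_port_attach(dev, port);  /* -EBUSY if already bound */
        mutex_unlock(&mp_mutex);
        return rc;
}

static void example_unbind(struct net_device *dev)
{
        mutex_lock(&mp_mutex);
        netdev_mp_port_detach(dev);             /* waits for RCU readers */
        mutex_unlock(&mp_mutex);
}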



[RFC][PATCH v4 10/18] Use callback to deal with skb_release_data() specially.

2010-04-25 Thread xiaohui . xin
From: Xin Xiaohui 

If the buffer is external, use the callback to destroy
the buffers.

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 net/core/skbuff.c |   11 +++
 1 files changed, 11 insertions(+), 0 deletions(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 169f22c..5d93b2d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -385,6 +385,11 @@ static void skb_clone_fraglist(struct sk_buff *skb)
 
 static void skb_release_data(struct sk_buff *skb)
 {
+   /* Check whether the skb has external buffers; destructor_arg
+* is used here as the indicator.
+*/
+   struct skb_external_page *ext_page = skb_shinfo(skb)->destructor_arg;
+
if (!skb->cloned ||
!atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
   &skb_shinfo(skb)->dataref)) {
@@ -397,6 +402,12 @@ static void skb_release_data(struct sk_buff *skb)
if (skb_has_frags(skb))
skb_drop_fraglist(skb);
 
+   /* If the skb has external buffers, call the destructor here:
+* skb->head is about to be passed to kfree() below, and a head
+* that came from an external buffer must not be freed that way.
+*/
+   if (dev_is_mpassthru(skb->dev) && ext_page && ext_page->dtor)
+   ext_page->dtor(ext_page);
kfree(skb->head);
}
 }
-- 
1.5.4.4



[RFC][PATCH v4 11/18] Add a hook to intercept external buffers from NIC driver.

2010-04-25 Thread xiaohui . xin
From: Xin Xiaohui 

The hook is called in netif_receive_skb().
Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 net/core/dev.c |   35 +++
 1 files changed, 35 insertions(+), 0 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 4972bc4..3e3ad23 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2543,6 +2543,37 @@ err:
 EXPORT_SYMBOL(netdev_mp_port_prep);
 #endif
 
+#if defined(CONFIG_MEDIATE_PASSTHRU) || defined(CONFIG_MEDIATE_PASSTHRU_MODULE)
+/* Add a hook to intercept mediate passthru (zero-copy) packets
+ * and queue them on the socket owned by the mp_port.
+ */
+static inline struct sk_buff *handle_mpassthru(struct sk_buff *skb,
+  struct packet_type **pt_prev,
+  int *ret,
+  struct net_device *orig_dev)
+{
+   struct mpassthru_port *mp_port = NULL;
+   struct sock *sk = NULL;
+
+   if (!dev_is_mpassthru(skb->dev))
+   return skb;
+   mp_port = skb->dev->mp_port;
+
+   if (*pt_prev) {
+   *ret = deliver_skb(skb, *pt_prev, orig_dev);
+   *pt_prev = NULL;
+   }
+
+   sk = mp_port->sock->sk;
+   skb_queue_tail(&sk->sk_receive_queue, skb);
+   sk->sk_state_change(sk);
+
+   return NULL;
+}
+#else
+#define handle_mpassthru(skb, pt_prev, ret, orig_dev) (skb)
+#endif
+
 /**
  * netif_receive_skb - process receive buffer from network
  * @skb: buffer to process
@@ -2622,6 +2653,10 @@ int netif_receive_skb(struct sk_buff *skb)
 ncls:
 #endif
 
+   /* To intercept mediate passthru(zero-copy) packets here */
+   skb = handle_mpassthru(skb, &pt_prev, &ret, orig_dev);
+   if (!skb)
+   goto out;
skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
if (!skb)
goto out;
-- 
1.5.4.4



[RFC][PATCH v4 07/18] Make __alloc_skb() able to get an external buffer.

2010-04-25 Thread xiaohui . xin
From: Xin Xiaohui 

Add a dev parameter to __alloc_skb(). When an external
buffer is used, skb->data points to that buffer, skb->head is
recomputed, the shinfo of the external buffer is maintained, and
the external buffer info is recorded in the destructor_arg field.

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---

__alloc_skb() cleanup by

Jeff Dike 

 include/linux/skbuff.h |7 ---
 net/core/skbuff.c  |   43 +--
 2 files changed, 41 insertions(+), 9 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 96799f5..8949b15 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -448,17 +448,18 @@ extern void kfree_skb(struct sk_buff *skb);
 extern void consume_skb(struct sk_buff *skb);
 extern void   __kfree_skb(struct sk_buff *skb);
 extern struct sk_buff *__alloc_skb(unsigned int size,
-  gfp_t priority, int fclone, int node);
+  gfp_t priority, int fclone,
+  int node, struct net_device *dev);
 static inline struct sk_buff *alloc_skb(unsigned int size,
gfp_t priority)
 {
-   return __alloc_skb(size, priority, 0, -1);
+   return __alloc_skb(size, priority, 0, -1, NULL);
 }
 
 static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
   gfp_t priority)
 {
-   return __alloc_skb(size, priority, 1, -1);
+   return __alloc_skb(size, priority, 1, -1, NULL);
 }
 
 extern int skb_recycle_check(struct sk_buff *skb, int skb_size);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 6345acc..ae223d2 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -161,7 +161,8 @@ EXPORT_SYMBOL(skb_under_panic);
  * @fclone: allocate from fclone cache instead of head cache
  * and allocate a cloned (child) skb
  * @node: numa node to allocate memory on
- *
+ * @dev: the device that owns the skb if the skb should try to get an
+ * external buffer; otherwise %NULL.
  * Allocate a new &sk_buff. The returned buffer has no headroom and a
  * tail room of size bytes. The object has a reference count of one.
  * The return is the buffer. On a failure the return is %NULL.
@@ -170,12 +171,13 @@ EXPORT_SYMBOL(skb_under_panic);
  * %GFP_ATOMIC.
  */
 struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
-   int fclone, int node)
+   int fclone, int node, struct net_device *dev)
 {
struct kmem_cache *cache;
struct skb_shared_info *shinfo;
struct sk_buff *skb;
-   u8 *data;
+   u8 *data = NULL;
+   struct skb_external_page *ext_page = NULL;
 
cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
 
@@ -185,8 +187,23 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
goto out;
 
size = SKB_DATA_ALIGN(size);
-   data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
-   gfp_mask, node);
+
+   /* If the device wants to do mediate passthru (zero-copy),
+* the skb may try to get external buffers from outside.
+* If that fails, fall back to allocating buffers from the kernel.
+*/
+   if (dev && dev->mp_port) {
+   ext_page = netdev_alloc_external_page(dev, skb, size);
+   if (ext_page) {
+   data = ext_page->start;
+   size = ext_page->size;
+   }
+   }
+
+   if (!data)
+   data = kmalloc_node_track_caller(
+   size + sizeof(struct skb_shared_info),
+   gfp_mask, node);
if (!data)
goto nodata;
 
@@ -208,6 +225,15 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
skb->mac_header = ~0U;
 #endif
 
+   /* If the skb got external buffers successfully, save a copy of
+* the shinfo (which sits at the end of the buffer) so that it
+* can be restored later when needed.
+*/
+   if (ext_page) {
+   skb->head = skb->data - NET_IP_ALIGN - NET_SKB_PAD;
+   memcpy(ext_page->ushinfo, skb_shinfo(skb),
+  sizeof(struct skb_shared_info));
+   }
/* make sure we initialize shinfo sequentially */
shinfo = skb_shinfo(skb);
atomic_set(&shinfo->dataref, 1);
@@ -231,6 +257,11 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 
child->fclone = SKB_FCLONE_UNAVAILABLE;
}
+   /* Record the external buffer info in this field. Not ideal,
+* but there is no better place readily available.
+*/
+   shinfo->destructor_arg = ext_page;
+
 out:
return skb;
 nodata:
@@ -259,7 +290,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
int node = dev->dev.parent ? dev_to_node(

[RFC][PATCH v4 06/18] Add interface to get external buffers.

2010-04-25 Thread xiaohui . xin
From: Xin Xiaohui 

Currently, external buffers can only be obtained from the mp device.

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 include/linux/skbuff.h |   12 
 net/core/skbuff.c  |   16 
 2 files changed, 28 insertions(+), 0 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 3104e7d..96799f5 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1525,6 +1525,18 @@ static inline void netdev_free_page(struct net_device *dev, struct page *page)
__free_page(page);
 }
 
+extern struct skb_external_page *netdev_alloc_external_pages(
+   struct net_device *dev,
+   struct sk_buff *skb, int npages);
+
+static inline struct skb_external_page *netdev_alloc_external_page(
+   struct net_device *dev,
+   struct sk_buff *skb, unsigned int size)
+{
+   return netdev_alloc_external_pages(dev, skb,
+  DIV_ROUND_UP(size, PAGE_SIZE));
+}
+
 /**
  * skb_clone_writable - is the header of a clone writable
  * @skb: buffer to check
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 93c4e06..6345acc 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -278,6 +278,22 @@ struct page *__netdev_alloc_page(struct net_device *dev, gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(__netdev_alloc_page);
 
+struct skb_external_page *netdev_alloc_external_pages(struct net_device *dev,
+   struct sk_buff *skb, int npages)
+{
+   struct mpassthru_port *port;
+   struct skb_external_page *ext_page = NULL;
+
+   port = rcu_dereference(dev->mp_port);
+   if (!port)
+   goto out;
+   BUG_ON(npages > port->npages);
+   ext_page = port->ctor(port, skb, npages);
+out:
+   return ext_page;
+}
+EXPORT_SYMBOL(netdev_alloc_external_pages);
+
 void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
int size)
 {
-- 
1.5.4.4



[RFC][PATCH v4 08/18] Ignore skb_reserve() when device is using external buffer.

2010-04-25 Thread xiaohui . xin
From: Xin Xiaohui 

To keep skb->data and skb->head consistent when they come from an
external buffer, ignore the headroom the driver reserves
for kernel skbs.

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 include/linux/skbuff.h |9 +
 1 files changed, 9 insertions(+), 0 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 8949b15..4458dc8 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1206,6 +1206,15 @@ static inline int skb_tailroom(const struct sk_buff *skb)
  */
 static inline void skb_reserve(struct sk_buff *skb, int len)
 {
+   /* skb_reserve() is only meant for an empty buffer. When the
+* skb is backed by an external buffer, we cannot guarantee it
+* has the same reserved headroom that a kernel-allocated skb
+* would have, so the reservation has to be ignored. The
+* external buffer info is recorded in the destructor_arg
+* field, so use that as the indicator.
+*/
+   if (skb_shinfo(skb)->destructor_arg)
+   return;
skb->data += len;
skb->tail += len;
 }
-- 
1.5.4.4



[RFC][PATCH v4 09/18] Don't recycle the skb if the device uses external buffers.

2010-04-25 Thread xiaohui . xin
From: Xin Xiaohui 

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 net/core/skbuff.c |6 ++
 1 files changed, 6 insertions(+), 0 deletions(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index ae223d2..169f22c 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -553,6 +553,12 @@ int skb_recycle_check(struct sk_buff *skb, int skb_size)
if (skb_shared(skb) || skb_cloned(skb))
return 0;
 
+   /* if the device wants to do mediate passthru, the skb may
+* carry an external buffer, so don't recycle it
+*/
+   if (dev_is_mpassthru(skb->dev))
+   return 0;
+
skb_release_head_state(skb);
shinfo = skb_shinfo(skb);
atomic_set(&shinfo->dataref, 1);
-- 
1.5.4.4



[RFC][PATCH v4 13/18] Add header file for mp device.

2010-04-25 Thread xiaohui . xin
From: Xin Xiaohui 

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---

 include/linux/mpassthru.h |   29 +
 1 files changed, 29 insertions(+), 0 deletions(-)
 create mode 100644 include/linux/mpassthru.h

diff --git a/include/linux/mpassthru.h b/include/linux/mpassthru.h
new file mode 100644
index 000..e3983d3
--- /dev/null
+++ b/include/linux/mpassthru.h
@@ -0,0 +1,29 @@
+#ifndef __MPASSTHRU_H
+#define __MPASSTHRU_H
+
+#include 
+#include 
+
+/* ioctl defines */
+#define MPASSTHRU_BINDDEV  _IOW('M', 213, int)
+#define MPASSTHRU_UNBINDDEV_IOW('M', 214, int)
+
+/* MPASSTHRU ifc flags */
+#define IFF_MPASSTHRU  0x0001
+#define IFF_MPASSTHRU_EXCL 0x0002
+
+#ifdef __KERNEL__
+#if defined(CONFIG_MEDIATE_PASSTHRU) || defined(CONFIG_MEDIATE_PASSTHRU_MODULE)
+struct socket *mp_get_socket(struct file *);
+#else
+#include 
+#include 
+struct file;
+struct socket;
+static inline struct socket *mp_get_socket(struct file *f)
+{
+   return ERR_PTR(-EINVAL);
+}
+#endif /* CONFIG_MEDIATE_PASSTHRU */
+#endif /* __KERNEL__ */
+#endif /* __MPASSTHRU_H */
-- 
1.5.4.4
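mp_get_socket() is the hand-off point to the backend. A hedged sketch of how
vhost-net might turn a file descriptor passed in from user space into the mp
device's socket, mirroring the way it already resolves tap fds; the fd
plumbing and error handling here are illustrative, not taken from the series.

#include <linux/err.h>
#include <linux/file.h>
#include <linux/mpassthru.h>
#include <linux/net.h>

static struct socket *example_get_mp_socket(int fd)
{
        struct file *file = fget(fd);
        struct socket *sock;

        if (!file)
                return ERR_PTR(-EBADF);

        sock = mp_get_socket(file);     /* ERR_PTR(-EINVAL) if not an mp file */
        if (IS_ERR(sock))
                fput(file);             /* only keep the reference on success */
        return sock;
}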



[RFC][PATCH v4 14/18] Add basic func and special ioctl to mp device.

2010-04-25 Thread xiaohui . xin
From: Xin Xiaohui 

The ioctl is used by the mp device to bind an underlying
NIC; it queries the hardware capabilities and declares that the
NIC will use external buffers.

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---

memory leak fixed,
kconfig made,
do_unbind() made,
mp_chr_ioctl() cleanup

by Jeff Dike 

 drivers/vhost/mpassthru.c |  671 +
 1 files changed, 671 insertions(+), 0 deletions(-)
 create mode 100644 drivers/vhost/mpassthru.c

diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
new file mode 100644
index 000..c5ede17
--- /dev/null
+++ b/drivers/vhost/mpassthru.c
@@ -0,0 +1,671 @@
+/*
+ *  MPASSTHRU - Mediate passthrough device.
+ *  Copyright (C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ */
+
+#define DRV_NAME"mpassthru"
+#define DRV_DESCRIPTION "Mediate passthru device driver"
+#define DRV_COPYRIGHT   "(C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G"
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+/* Uncomment to enable debugging */
+/* #define MPASSTHRU_DEBUG 1 */
+
+#ifdef MPASSTHRU_DEBUG
+static int debug;
+
+#define DBG  if (mp->debug) printk
+#define DBG1 if (debug == 2) printk
+#else
+#define DBG(a...)
+#define DBG1(a...)
+#endif
+
+#define COPY_THRESHOLD (L1_CACHE_BYTES * 4)
+#define COPY_HDR_LEN   (L1_CACHE_BYTES < 64 ? 64 : L1_CACHE_BYTES)
+
+struct frag {
+   u16 offset;
+   u16 size;
+};
+
+struct page_info {
+   struct list_headlist;
+   int header;
+   /* indicate the actual length of bytes
+* sent/received in the external buffers
+*/
+   int total;
+   int offset;
+   struct page *pages[MAX_SKB_FRAGS+1];
+   struct skb_frag_struct  frag[MAX_SKB_FRAGS+1];
+   struct sk_buff  *skb;
+   struct page_ctor*ctor;
+
+   /* The pointer relayed to the skb, to indicate
+* whether it is backed by external or kernel buffers
+*/
+   struct skb_external_pageext_page;
+   struct skb_shared_info  ushinfo;
+
+#define INFO_READ  0
+#define INFO_WRITE 1
+   unsignedflags;
+   unsignedpnum;
+
+   /* It's meaningful for receive, means
+* the max length allowed
+*/
+   size_t  len;
+
+   /* The fields below are for the backend
+* driver, currently vhost-net.
+*/
+
+   struct kiocb*iocb;
+   unsigned intdesc_pos;
+   struct iovechdr[MAX_SKB_FRAGS + 2];
+   struct ioveciov[MAX_SKB_FRAGS + 2];
+};
+
+struct page_ctor {
+   struct list_headreadq;
+   int w_len;
+   int r_len;
+   spinlock_t  read_lock;
+   struct kmem_cache   *cache;
+   /* record the locked pages */
+   int lock_pages;
+   struct rlimit   o_rlim;
+   struct net_device   *dev;
+   struct mpassthru_port   port;
+};
+
+struct mp_struct {
+   struct mp_file  *mfile;
+   struct net_device   *dev;
+   struct page_ctor*ctor;
+   struct socket   socket;
+
+#ifdef MPASSTHRU_DEBUG
+   int debug;
+#endif
+};
+
+struct mp_file {
+   atomic_t count;
+   struct mp_struct *mp;
+   struct net *net;
+};
+
+struct mp_sock {
+   struct sock sk;
+   struct mp_struct*mp;
+};
+
+static int mp_dev_change_flags(struct net_device *dev, unsigned flags)
+{
+   int ret = 0;
+
+   rtnl_lock();
+   ret = dev_change_flags(dev, flags);
+   rtnl_unlock();
+
+   if (ret < 0)
+   printk(KERN_ERR "failed to change dev state of %s", dev->name);
+
+   return ret;
+}
+
+static int page_ctor_attach(struct mp_struct *mp)
+{
+   int rc;
+   struct page_ctor *ctor;
+   struct net_device *dev = mp->dev;
+
+   /* locked by mp_mutex */
+   if (rcu_dereference(mp->ctor))
+   return -EBUSY;
+
+   ctor = kza

[RFC][PATCH v4 12/18] Skip GRO if the buffer is external.

2010-04-25 Thread xiaohui . xin
From: Xin Xiaohui 

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 net/core/dev.c |4 
 1 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 3e3ad23..f275c44 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2780,6 +2780,10 @@ enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
if (skb_is_gso(skb) || skb_has_frags(skb))
goto normal;
 
+   /* currently GRO is not supported by mediate passthru */
+   if (dev_is_mpassthru(skb->dev))
+   goto normal;
+
rcu_read_lock();
list_for_each_entry_rcu(ptype, head, list) {
if (ptype->type != type || ptype->dev || !ptype->gro_receive)
-- 
1.5.4.4



[RFC][PATCH v4 16/18] Export proto_ops to vhost-net driver.

2010-04-25 Thread xiaohui . xin
From: Xin Xiaohui 

Currently, vhost-net is the only user of the mp device.

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 drivers/vhost/mpassthru.c |  321 -
 1 files changed, 317 insertions(+), 4 deletions(-)

diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
index b171f21..0ac1a71 100644
--- a/drivers/vhost/mpassthru.c
+++ b/drivers/vhost/mpassthru.c
@@ -563,8 +563,321 @@ failed:
return NULL;
 }
 
+static void mp_sock_destruct(struct sock *sk)
+{
+   struct mp_struct *mp = container_of(sk, struct mp_sock, sk)->mp;
+   kfree(mp);
+}
+
+static void mp_sock_state_change(struct sock *sk)
+{
+   if (sk_has_sleeper(sk))
+   wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN);
+}
+
+static void mp_sock_write_space(struct sock *sk)
+{
+   if (sk_has_sleeper(sk))
+   wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT);
+}
+
+static void mp_sock_data_ready(struct sock *sk, int coming)
+{
+   struct mp_struct *mp = container_of(sk, struct mp_sock, sk)->mp;
+   struct page_ctor *ctor = NULL;
+   struct sk_buff *skb = NULL;
+   struct page_info *info = NULL;
+   struct ethhdr *eth;
+   struct kiocb *iocb = NULL;
+   int len, i;
+
+   struct virtio_net_hdr hdr = {
+   .flags = 0,
+   .gso_type = VIRTIO_NET_HDR_GSO_NONE
+   };
+
+   ctor = rcu_dereference(mp->ctor);
+   if (!ctor)
+   return;
+
+   while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+   if (skb_shinfo(skb)->destructor_arg) {
+   info = container_of(skb_shinfo(skb)->destructor_arg,
+   struct page_info, ext_page);
+   info->skb = skb;
+   if (skb->len > info->len) {
+   mp->dev->stats.rx_dropped++;
+   DBG(KERN_INFO "Discarded truncated rx packet: "
+   " len %d > %zd\n", skb->len, info->len);
+   info->total = skb->len;
+   goto clean;
+   } else {
+   int i;
+   struct skb_shared_info *gshinfo =
+   (struct skb_shared_info *)
+   (&info->ushinfo);
+   struct skb_shared_info *hshinfo =
+   skb_shinfo(skb);
+
+   if (gshinfo->nr_frags < hshinfo->nr_frags)
+   goto clean;
+   eth = eth_hdr(skb);
+   skb_push(skb, ETH_HLEN);
+
+   hdr.hdr_len = skb_headlen(skb);
+   info->total = skb->len;
+
+   for (i = 0; i < gshinfo->nr_frags; i++)
+   gshinfo->frags[i].size = 0;
+   for (i = 0; i < hshinfo->nr_frags; i++)
+   gshinfo->frags[i].size =
+   hshinfo->frags[i].size;
+   memcpy(skb_shinfo(skb), &info->ushinfo,
+   sizeof(struct skb_shared_info));
+   }
+   } else {
+   /* The skb was composed from kernel buffers
+* because the external buffers were not sufficient.
+* This case should be rare.
+*/
+   unsigned long flags;
+   int i;
+   struct skb_shared_info *gshinfo = NULL;
+
+   info = NULL;
+
+   spin_lock_irqsave(&ctor->read_lock, flags);
+   if (!list_empty(&ctor->readq)) {
+   info = list_first_entry(&ctor->readq,
+   struct page_info, list);
+   list_del(&info->list);
+   }
+   spin_unlock_irqrestore(&ctor->read_lock, flags);
+   if (!info) {
+   DBG(KERN_INFO
+   "No external buffer avaliable %p\n",
+   skb);
+   skb_queue_head(&sk->sk_receive_queue,
+   skb);
+   break;
+   }
+   info->skb = skb;
+   /* compute the guest skb frags info */
+   gshinfo = (struct skb_shared_info *)
+ (info->ext_page.start +
+ SKB_DATA_ALIGN(info->ext_pa

[RFC][PATCH v4 17/18] Add Kconfig and Makefile entries for the mp device.

2010-04-25 Thread xiaohui . xin
From: Xin Xiaohui 

Signed-off-by: Xin Xiaohui 
Reviewed-by: Jeff Dike 
---
 drivers/vhost/Kconfig  |   10 ++
 drivers/vhost/Makefile |2 ++
 2 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index e4e2fd1..a6b8cbf 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -9,3 +9,13 @@ config VHOST_NET
  To compile this driver as a module, choose M here: the module will
  be called vhost_net.
 
+config MEDIATE_PASSTHRU
+   tristate "mediate passthru network driver (EXPERIMENTAL)"
+   depends on VHOST_NET
+   ---help---
+ zero-copy network I/O support. We call it mediate passthru to
+ distinguish it from hardware passthru.
+
+ To compile this driver as a module, choose M here: the module will
+ be called mpassthru.
+
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index 72dd020..c18b9fc 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -1,2 +1,4 @@
 obj-$(CONFIG_VHOST_NET) += vhost_net.o
 vhost_net-y := vhost.o net.o
+
+obj-$(CONFIG_MEDIATE_PASSTHRU) += mpassthru.o
-- 
1.5.4.4



[RFC][PATCH v4 15/18] Manipulate external buffers in mp device.

2010-04-25 Thread xiaohui . xin
From: Xin Xiaohui 

Describes where external buffers come from and how they are destroyed.

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 drivers/vhost/mpassthru.c |  237 -
 1 files changed, 236 insertions(+), 1 deletions(-)

diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
index c5ede17..b171f21 100644
--- a/drivers/vhost/mpassthru.c
+++ b/drivers/vhost/mpassthru.c
@@ -159,6 +159,39 @@ static int mp_dev_change_flags(struct net_device *dev, unsigned flags)
return ret;
 }
 
+/* The main function to allocate external buffers */
+static struct skb_external_page *page_ctor(struct mpassthru_port *port,
+   struct sk_buff *skb, int npages)
+{
+   int i;
+   unsigned long flags;
+   struct page_ctor *ctor;
+   struct page_info *info = NULL;
+
+   ctor = container_of(port, struct page_ctor, port);
+
+   spin_lock_irqsave(&ctor->read_lock, flags);
+   if (!list_empty(&ctor->readq)) {
+   info = list_first_entry(&ctor->readq, struct page_info, list);
+   list_del(&info->list);
+   }
+   spin_unlock_irqrestore(&ctor->read_lock, flags);
+   if (!info)
+   return NULL;
+
+   for (i = 0; i < info->pnum; i++) {
+   get_page(info->pages[i]);
+   info->frag[i].page = info->pages[i];
+   info->frag[i].page_offset = i ? 0 : info->offset;
+   info->frag[i].size = port->npages > 1 ? PAGE_SIZE :
+   port->data_len;
+   }
+   info->skb = skb;
+   info->ext_page.frags = info->frag;
+   info->ext_page.ushinfo = &info->ushinfo;
+   return &info->ext_page;
+}
+
 static int page_ctor_attach(struct mp_struct *mp)
 {
int rc;
@@ -191,7 +224,7 @@ static int page_ctor_attach(struct mp_struct *mp)
 
dev_hold(dev);
ctor->dev = dev;
-   ctor->port.ctor = NULL;
+   ctor->port.ctor = page_ctor;
ctor->port.sock = &mp->socket;
ctor->lock_pages = 0;
rc = netdev_mp_port_attach(dev, &ctor->port);
@@ -258,6 +291,52 @@ static int set_memlock_rlimit(struct page_ctor *ctor, int resource,
task_unlock(current->group_leader);
return 0;
 }
+static void mp_ki_dtor(struct kiocb *iocb)
+{
+   struct page_info *info = (struct page_info *)(iocb->private);
+   int i;
+
+   if (info->flags == INFO_READ) {
+   for (i = 0; i < info->pnum; i++) {
+   if (info->pages[i]) {
+   set_page_dirty_lock(info->pages[i]);
+   put_page(info->pages[i]);
+   }
+   }
+   skb_shinfo(info->skb)->destructor_arg = &info->ext_page;
+   info->skb->destructor = NULL;
+   kfree_skb(info->skb);
+   }
+   /* Decrement the number of locked pages */
+   info->ctor->lock_pages -= info->pnum;
+   kmem_cache_free(info->ctor->cache, info);
+
+   return;
+}
+
+static struct kiocb *create_iocb(struct page_info *info, int size)
+{
+   struct kiocb *iocb = NULL;
+
+   iocb = info->iocb;
+   if (!iocb)
+   return iocb;
+   iocb->ki_flags = 0;
+   iocb->ki_users = 1;
+   iocb->ki_key = 0;
+   iocb->ki_ctx = NULL;
+   iocb->ki_cancel = NULL;
+   iocb->ki_retry = NULL;
+   iocb->ki_iovec = NULL;
+   iocb->ki_eventfd = NULL;
+   iocb->ki_pos = info->desc_pos;
+   iocb->ki_nbytes = size;
+   iocb->ki_dtor(iocb);
+   iocb->private = (void *)info;
+   iocb->ki_dtor = mp_ki_dtor;
+
+   return iocb;
+}
 
 static int page_ctor_detach(struct mp_struct *mp)
 {
@@ -275,6 +354,7 @@ static int page_ctor_detach(struct mp_struct *mp)
for (i = 0; i < info->pnum; i++)
if (info->pages[i])
put_page(info->pages[i]);
+   create_iocb(info, 0);
kmem_cache_free(ctor->cache, info);
}
set_memlock_rlimit(ctor, RLIMIT_MEMLOCK,
@@ -328,6 +408,161 @@ static void mp_put(struct mp_file *mfile)
mp_detach(mfile->mp);
 }
 
+/* The callback to destruct the external buffers or skb */
+static void page_dtor(struct skb_external_page *ext_page)
+{
+   struct page_info *info;
+   struct page_ctor *ctor;
+   struct sock *sk;
+   struct sk_buff *skb;
+   struct kiocb *iocb = NULL;
+   unsigned long flags;
+
+   if (!ext_page)
+   return;
+   info = container_of(ext_page, struct page_info, ext_page);
+   if (!info)
+   return;
+   ctor = info->ctor;
+   skb = info->skb;
+
+   if ((info->flags == INFO_READ) && info->skb)
+   info->skb->head = NULL;
+
+   /* If the info->total is 0, make it to be reused */
+   if (!info->total) {
+   spin_lock_irqsave(&ctor->read_lock, flags);
+   list_add(&info->list, &ctor->re

[RFC][PATCH v4 18/18] Provides multiple submits and async notifications

2010-04-25 Thread xiaohui . xin
From: Xin Xiaohui 

Provides multiple submits and asynchronous notifications.

The vhost-net backend now only supports synchronous send/recv
operations. The patch provides multiple submits and asynchronous
notifications. This is needed for the zero-copy case.

Signed-off-by: Xin Xiaohui 
---
 drivers/vhost/net.c   |  236 +++-
 drivers/vhost/vhost.c |  120 ++---
 drivers/vhost/vhost.h |   14 +++
 3 files changed, 314 insertions(+), 56 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 38989d1..18f6c41 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -23,6 +23,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include 
 
@@ -48,6 +50,7 @@ struct vhost_net {
struct vhost_dev dev;
struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
struct vhost_poll poll[VHOST_NET_VQ_MAX];
+   struct kmem_cache   *cache;
/* Tells us whether we are polling a socket for TX.
 * We only do this when socket buffer fills up.
 * Protected by tx vq lock. */
@@ -92,11 +95,138 @@ static void tx_poll_start(struct vhost_net *net, struct socket *sock)
net->tx_poll_state = VHOST_NET_POLL_STARTED;
 }
 
+struct kiocb *notify_dequeue(struct vhost_virtqueue *vq)
+{
+   struct kiocb *iocb = NULL;
+   unsigned long flags;
+
+   spin_lock_irqsave(&vq->notify_lock, flags);
+   if (!list_empty(&vq->notifier)) {
+   iocb = list_first_entry(&vq->notifier,
+   struct kiocb, ki_list);
+   list_del(&iocb->ki_list);
+   }
+   spin_unlock_irqrestore(&vq->notify_lock, flags);
+   return iocb;
+}
+
+static void handle_iocb(struct kiocb *iocb)
+{
+   struct vhost_virtqueue *vq = iocb->private;
+   unsigned long flags;
+
+   spin_lock_irqsave(&vq->notify_lock, flags);
+   list_add_tail(&iocb->ki_list, &vq->notifier);
+   spin_unlock_irqrestore(&vq->notify_lock, flags);
+}
+
+static int is_async_vq(struct vhost_virtqueue *vq)
+{
+   return (vq->link_state == VHOST_VQ_LINK_ASYNC);
+}
+
+static void handle_async_rx_events_notify(struct vhost_net *net,
+ struct vhost_virtqueue *vq,
+ struct socket *sock)
+{
+   struct kiocb *iocb = NULL;
+   struct vhost_log *vq_log = NULL;
+   int rx_total_len = 0;
+   unsigned int head, log, in, out;
+   int size;
+
+   if (!is_async_vq(vq))
+   return;
+
+   if (sock->sk->sk_data_ready)
+   sock->sk->sk_data_ready(sock->sk, 0);
+
+   vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
+   vq->log : NULL;
+
+   while ((iocb = notify_dequeue(vq)) != NULL) {
+   vhost_add_used_and_signal(&net->dev, vq,
+   iocb->ki_pos, iocb->ki_nbytes);
+   size = iocb->ki_nbytes;
+   head = iocb->ki_pos;
+   rx_total_len += iocb->ki_nbytes;
+
+   if (iocb->ki_dtor)
+   iocb->ki_dtor(iocb);
+   kmem_cache_free(net->cache, iocb);
+
+   /* when logging is enabled, the log info must be recomputed,
+* since these buffers sat in the async queue and may not have
+* obtained the log info earlier.
+*/
+   if (unlikely(vq_log)) {
+   if (!log)
+   __vhost_get_vq_desc(&net->dev, vq, vq->iov,
+   ARRAY_SIZE(vq->iov),
+   &out, &in, vq_log,
+   &log, head);
+   vhost_log_write(vq, vq_log, log, size);
+   }
+   if (unlikely(rx_total_len >= VHOST_NET_WEIGHT)) {
+   vhost_poll_queue(&vq->poll);
+   break;
+   }
+   }
+}
+
+static void handle_async_tx_events_notify(struct vhost_net *net,
+ struct vhost_virtqueue *vq)
+{
+   struct kiocb *iocb = NULL;
+   int tx_total_len = 0;
+
+   if (!is_async_vq(vq))
+   return;
+
+   while ((iocb = notify_dequeue(vq)) != NULL) {
+   vhost_add_used_and_signal(&net->dev, vq,
+   iocb->ki_pos, 0);
+   tx_total_len += iocb->ki_nbytes;
+
+   if (iocb->ki_dtor)
+   iocb->ki_dtor(iocb);
+
+   kmem_cache_free(net->cache, iocb);
+   if (unlikely(tx_total_len >= VHOST_NET_WEIGHT)) {
+   vhost_poll_queue(&vq->poll);
+   break;
+   }
+   }
+}
+
+static struct kiocb *create_iocb(struct vhost_net *net,
+struct vhost_virtqueue *vq,
+unsig
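The hunk is truncated here in the archive. As a hedged sketch only, inferred
from handle_iocb() and the notify handlers above and not taken from the
author's code, a kiocb constructor of this shape plausibly does something like
the following, assuming net->cache allocates struct kiocb objects that the mp
device completes by invoking ki_dtor:

static struct kiocb *example_create_iocb(struct vhost_net *net,
                                         struct vhost_virtqueue *vq,
                                         unsigned head)
{
        struct kiocb *iocb;

        if (!is_async_vq(vq))
                return NULL;

        iocb = kmem_cache_zalloc(net->cache, GFP_KERNEL);
        if (!iocb)
                return NULL;

        iocb->private = vq;             /* read back by handle_iocb()           */
        iocb->ki_pos  = head;           /* descriptor index, used on completion */
        iocb->ki_dtor = handle_iocb;    /* completion queues us on vq->notifier */
        return iocb;
}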

[RFC][PATCH v4 00/18] Provide a zero-copy method on KVM virtio-net.

2010-04-25 Thread xiaohui . xin
We provide a zero-copy method with which the driver side may get external
buffers to DMA into. Here external means the driver does not use kernel space
to allocate skb buffers. Currently the external buffers come from the
guest virtio-net driver.

The idea is simple: pin the guest VM user space and then
let the host NIC driver DMA to it directly.
The patches are based on the vhost-net backend driver. We add a device
which provides proto_ops such as sendmsg/recvmsg to vhost-net to
send/recv directly to/from the NIC driver. A KVM guest that uses the
vhost-net backend may bind any ethX interface on the host side to
get copy-less data transfer through the guest virtio-net frontend.
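As a rough illustration of that binding step from user space (for example from
qemu): open the mp character device, bind the chosen host NIC to it with
MPASSTHRU_BINDDEV, and hand the resulting fd to vhost-net as the backend. The
device node path and the assumption that BINDDEV takes the target ifindex are
illustrative; the exact argument convention is defined by the mp device's
ioctl handler (patch 14), while VHOST_NET_SET_BACKEND is the existing vhost
ioctl.

#include <fcntl.h>
#include <net/if.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/mpassthru.h>
#include <linux/vhost.h>

static int bind_mp_backend(int vhost_fd, const char *ifname, int vq_index)
{
        struct vhost_vring_file backend;
        int mp_fd, ifindex;

        mp_fd = open("/dev/mpassthru", O_RDWR);   /* hypothetical node name */
        if (mp_fd < 0)
                return -1;

        ifindex = if_nametoindex(ifname);         /* e.g. "eth0" */
        if (!ifindex || ioctl(mp_fd, MPASSTHRU_BINDDEV, &ifindex) < 0)
                goto fail;

        backend.index = vq_index;                 /* rx or tx virtqueue */
        backend.fd = mp_fd;
        if (ioctl(vhost_fd, VHOST_NET_SET_BACKEND, &backend) < 0)
                goto fail;
        return mp_fd;
fail:
        close(mp_fd);
        return -1;
}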

patch 01-12:net core changes.
patch 13-17:new device as interface to manipulate external buffers.
patch 18:   for vhost-net.

The guest virtio-net driver submits multiple requests through the vhost-net
backend driver to the kernel. The requests are queued and then
completed after the corresponding actions in h/w are done.

For read, user-space buffers are dispensed to the NIC driver for rx when
a page constructor API is invoked, meaning NICs can allocate user buffers
from a page constructor. We add a hook in the netif_receive_skb() function
to intercept the incoming packets and notify the zero-copy device.

For write, the zero-copy device allocates a new host skb, puts the
payload on skb_shinfo(skb)->frags, and copies the header to skb->data.
The request remains pending until the skb is transmitted by h/w.
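A hedged sketch of that write path: the host skb carries only the copied
header in its linear area, while the pinned guest pages are attached as page
fragments. Function and variable names are illustrative, and the caller is
assumed to already hold references on the pages; the mp device's sendmsg path
(patch 16) is the real implementation.

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/string.h>

static struct sk_buff *example_build_tx_skb(struct net_device *dev,
                                            struct page **pages, int npages,
                                            const void *hdr, int hdr_len,
                                            int payload_len)
{
        struct sk_buff *skb = netdev_alloc_skb(dev, hdr_len);
        int i;

        if (!skb)
                return NULL;

        /* the only copy: protocol headers into the linear area */
        memcpy(skb_put(skb, hdr_len), hdr, hdr_len);

        /* the payload stays in the pinned guest pages */
        for (i = 0; i < npages && payload_len > 0; i++) {
                int chunk = min_t(int, payload_len, PAGE_SIZE);

                skb_fill_page_desc(skb, i, pages[i], 0, chunk);
                skb->len      += chunk;
                skb->data_len += chunk;
                skb->truesize += chunk;
                payload_len   -= chunk;
        }
        return skb;
}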

We have considered two ways to utilize the page constructor
API to dispense the user buffers.

One:Modify the __alloc_skb() function a bit: it only allocates the
sk_buff structure, and the data pointer points to a
user buffer which comes from a page constructor API.
The shinfo of the skb then also comes from the guest.
When a packet is received from hardware, skb->data is filled
directly by h/w. This is the way we have implemented it.

Pros:   We can avoid any copy here.
Cons:   The guest virtio-net driver needs to allocate the skb in almost
the same way as the host NIC drivers, i.e. the size used by
netdev_alloc_skb() and the same reserved space in the
head of the skb. Many NIC drivers match the guest and are
fine with this. But some of the latest NIC drivers reserve special
room in the skb head. To deal with that, we suggest providing
a method in the guest virtio-net driver to ask the NIC driver for the
parameters we are interested in once we know which device
we have bound for zero-copy. Then we ask the guest to do so.
Is that reasonable?

Two:Modify the driver to get user buffers allocated from a page constructor
API (substituting alloc_page()); the user buffers are used as payload
buffers and filled by h/w directly when a packet is received. The driver
should associate the pages with the skb (skb_shinfo(skb)->frags). For
the head buffer, let the host allocate the skb and let h/w fill it.
After that, the data filled into the host skb header is copied into
the guest header buffer, which is submitted together with the payload buffer.

Pros:   We care less about how the guest or host allocates its
buffers.
Cons:   We still need a small copy here for the skb header.

We are not sure which way is better here. This is the first thing we want
to get comments on from the community. We hope the modification to the network
part will be generic, not used only by the vhost-net backend, so that a user
application may use it as well once the zero-copy device provides async
read/write operations later.

Please give comments, especially on the network part modifications.


We provide multiple submits and asynchronous notification to
vhost-net too.

Our goal is to improve the bandwidth and reduce the CPU usage.
Exact performance data will be provided later. But in a simple
test with netperf, we found that bandwidth goes up and CPU % goes up too,
but the bandwidth increase ratio is much larger than the CPU % increase ratio.

What we have not done yet:
packet split support
GRO support
performance tuning

what we have done in v1:
polish the RCU usage
deal with write logging in asynchronous mode in vhost
add a notifier block for the mp device
rename page_ctor to mp_port in netdevice.h to make it look generic
add mp_dev_change_flags() for the mp device to change NIC state
add CONFIG_VHOST_MPASSTHRU to limit the usage when the module is not loaded
a small fix for a missing dev_put on failure
use a dynamic minor instead of a static minor number
add a __KERNEL__ guard to mp_get_sock()

what we have done in v2:

remove most of the RCU usage, since the ctor pointer is only
changed by the BIND/UNBIND ioctl, and during that time the NIC will be
stopped to get good cl

[RFC][PATCH v5 00/19] Provide a zero-copy method on KVM virtio-net.

2010-05-07 Thread xiaohui . xin
We provide a zero-copy method with which the driver side may get external
buffers to DMA into. Here "external" means the driver does not use kernel
space to allocate skb buffers. Currently the external buffers come from
the guest virtio-net driver.

The idea is simple: pin the guest VM user space and then
let the host NIC driver DMA to it directly.
The patches are based on the vhost-net backend driver. We add a device
which provides proto_ops (sendmsg/recvmsg) to vhost-net to
send/recv directly to/from the NIC driver. A KVM guest that uses the
vhost-net backend may bind any ethX interface on the host side to
get copy-less data transfer through the guest virtio-net frontend.
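
Just to illustrate how the pieces are meant to fit together (this is
only a sketch, not the actual vhost-net code; the fd handling and the
helper name are assumptions), the backend would pick up the socket
exported by the mp device with mp_get_socket() and then drive its
proto_ops for the zero-copy I/O:

#include <linux/err.h>
#include <linux/file.h>
#include <linux/net.h>
#include <linux/mpassthru.h>

/* Hypothetical helper on the backend side: resolve the mp device fd
 * passed in from userspace into the socket whose sendmsg/recvmsg do
 * the zero-copy transfers. Only mp_get_socket() is from this series.
 */
static struct socket *backend_get_mp_socket(int fd)
{
	struct file *file = fget(fd);
	struct socket *sock;

	if (!file)
		return ERR_PTR(-EBADF);

	sock = mp_get_socket(file);
	if (IS_ERR(sock))
		fput(file);
	/* on success, keep the file reference as long as the socket is used */
	return sock;
}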

patch 01-13: net core changes.
patch 14-18: new device as an interface to manipulate external buffers.
patch 19:    for vhost-net.

The guest virtio-net driver submits multiple requests through the vhost-net
backend driver to the kernel. The requests are queued and then
completed after the corresponding actions in h/w are done.

For read, user space buffers are dispensed to the NIC driver for rx when
a page constructor API is invoked; that is, NICs can allocate user buffers
from a page constructor. We add a hook in the netif_receive_skb() function
to intercept the incoming packets and notify the zero-copy device.

For write, the zero-copy device allocates a new host skb, puts the
payload on skb_shinfo(skb)->frags, and copies the header to skb->data.
The request remains pending until the skb is transmitted by h/w.
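
Roughly, the write path described above amounts to something like the
sketch below (all identifiers are placeholders, the header is assumed
to have been copied into kernel memory already, and this is not the mp
device code itself):

#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/string.h>

/* Illustrative only: copy just the header into skb->data and attach the
 * pinned guest pages as frags, so the payload is never copied.
 */
static struct sk_buff *build_zcopy_tx_skb(struct net_device *dev,
					  void *hdr, int hdr_len,
					  struct page **pages, int npages,
					  int *off, int *size)
{
	struct sk_buff *skb = netdev_alloc_skb(dev, hdr_len);
	int i;

	if (!skb)
		return NULL;
	memcpy(skb_put(skb, hdr_len), hdr, hdr_len);	/* small header copy */
	for (i = 0; i < npages; i++) {
		/* payload stays in the pinned guest pages */
		skb_fill_page_desc(skb, i, pages[i], off[i], size[i]);
		skb->len      += size[i];
		skb->data_len += size[i];
		skb->truesize += size[i];
	}
	return skb;
}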

We have considered two ways to utilize the page constructor
API to dispense the user buffers.

One:    Modify the __alloc_skb() function a bit: it only allocates the
sk_buff structure, while the data pointer points to a
user buffer which comes from a page constructor API.
The shinfo of the skb then also comes from the guest.
When a packet is received from hardware, skb->data is filled
directly by h/w. This is the way we have implemented it.

Pros:   We can avoid any copy here.
Cons:   The guest virtio-net driver needs to allocate the skb in almost
the same way as the host NIC drivers do, e.g. the size used by
netdev_alloc_skb() and the same reserved space at the
head of the skb. Many NIC drivers match the guest and are
fine with this. But some of the latest NIC drivers reserve special
room in the skb head. To deal with that, we suggest providing
a method in the guest virtio-net driver to ask the NIC driver
for the parameters we are interested in, once we know which device
we have bound for zero-copy. Then we ask the guest to do so.


Two:    Modify the driver to get user buffers allocated from a page constructor
API (substituting alloc_page()); the user buffers are used as payload
buffers and filled by h/w directly when a packet is received. The driver
should associate the pages with the skb (skb_shinfo(skb)->frags). For
the head buffer side, let the host allocate the skb, and h/w fills it.
After that, the data filled into the host skb header will be copied into
the guest header buffer, which is submitted together with the payload buffer.

Pros:   We care less about the way the guest or host allocates their
buffers.
Cons:   We still need a small copy here for the skb header.
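
For option Two, the driver-side change would look roughly like the
fragment below (rx_refill_page() is a made-up helper; only
netdev_alloc_external_page() and struct skb_external_page are from
this series):

#include <linux/gfp.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Hypothetical RX refill: ask the page constructor for a guest-backed
 * page instead of alloc_page(), and fall back to a kernel page when
 * no external buffer is available.
 */
static struct page *rx_refill_page(struct net_device *dev,
				   struct sk_buff *skb, int rx_buf_len)
{
	struct skb_external_page *ext;

	ext = netdev_alloc_external_page(dev, skb, rx_buf_len);
	if (ext)
		return ext->frags[0].page;	/* guest-backed payload page */

	return alloc_page(GFP_ATOMIC);		/* ordinary kernel page */
}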

We are not sure which way is better here. This is the first thing we want
to get comments on from the community. We wish the modification to the
network part to be generic, used not only by the vhost-net backend, but
also by a user application, once the zero-copy device provides async
read/write operations later.

We have got comments from Michael. He said the first method will break
the compatibility of the virtio-net driver and may complicate qemu live
migration. Currently, we try to ignore the skb_reserve() if the device
is doing zero-copy. Then the guest virtio-net driver will not be changed.
So we now continue to go with the first way.
But comments about the two ways are still appreciated.

We provide multiple submits and asynchronous notification to
vhost-net too.

Our goal is to improve the bandwidth and reduce the CPU usage.
Exact performance data will be provided later. But in a simple
test with netperf, we found that bandwidth goes up and CPU % goes up too,
but the bandwidth increase ratio is much larger than the CPU % increase ratio.

What we have not done yet:
packet split support
GRO support
performance tuning

what we have done in v1:
polish the RCU usage
deal with write logging in asynchronous mode in vhost
add a notifier block for the mp device
rename page_ctor to mp_port in netdevice.h to make it look generic
add mp_dev_change_flags() for the mp device to change NIC state
add CONFIG_VHOST_MPASSTHRU to limit the usage when the module is not loaded
a small fix for a missing dev_put on failure
usin

[RFC][PATCH v5 02/19] Add a new struct for device to manipulate external buffer.

2010-05-07 Thread xiaohui . xin
From: Xin Xiaohui 

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 include/linux/netdevice.h |   19 ++-
 1 files changed, 18 insertions(+), 1 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index fa8b476..bae725c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -530,6 +530,22 @@ struct netdev_queue {
unsigned long   tx_dropped;
 } cacheline_aligned_in_smp;
 
+/* Add a new structure to struct net_device; the new field is
+ * named mp_port. It is for mediate passthru (zero-copy).
+ * It contains the capability of the net device driver,
+ * a socket, and an external buffer creator; "external" means
+ * the skb buffers belonging to the device may not be allocated
+ * from kernel space.
+ */
+struct mpassthru_port  {
+   int hdr_len;
+   int data_len;
+   int npages;
+   unsignedflags;
+   struct socket   *sock;
+   struct skb_external_page *(*ctor)(struct mpassthru_port *,
+   struct sk_buff *, int);
+};
 
 /*
  * This structure defines the management hooks for network devices.
@@ -952,7 +968,8 @@ struct net_device {
struct macvlan_port *macvlan_port;
/* GARP */
struct garp_port*garp_port;
-
+   /* mpassthru */
+   struct mpassthru_port   *mp_port;
/* class/net/name entry */
struct device   dev;
/* space for optional device, statistics, and wireless sysfs groups */
-- 
1.5.4.4



[RFC][PATCH v5 05/19] Add a function for the external buffer owner to query capability.

2010-05-07 Thread xiaohui . xin
From: Xin Xiaohui 

The external buffer owner can use this function to get
the capabilities of the underlying NIC driver.

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 include/linux/netdevice.h |2 +
 net/core/dev.c|   51 +
 2 files changed, 53 insertions(+), 0 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 183c786..31d9c4a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1599,6 +1599,8 @@ extern gro_result_t   napi_gro_frags(struct 
napi_struct *napi);
 extern int netdev_mp_port_attach(struct net_device *dev,
 struct mpassthru_port *port);
 extern void netdev_mp_port_detach(struct net_device *dev);
+extern int netdev_mp_port_prep(struct net_device *dev,
+   struct mpassthru_port *port);
 
 static inline void napi_free_frags(struct napi_struct *napi)
 {
diff --git a/net/core/dev.c b/net/core/dev.c
index ecbb6b1..37b389a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2497,6 +2497,57 @@ void netdev_mp_port_detach(struct net_device *dev)
 }
 EXPORT_SYMBOL(netdev_mp_port_detach);
 
+/* To support mediate passthru (zero-copy) with a NIC driver,
+ * we'd better query the NIC driver for the capability it can
+ * provide, especially for packet split mode. For now we only
+ * query the header size and the payload size a descriptor
+ * may carry. If a driver does not use the API to export
+ * these, then we try to use default values; currently
+ * we use the default values from the IGB driver. For now,
+ * this is only called by the mpassthru device.
+ */
+#if defined(CONFIG_MEDIATE_PASSTHRU) || defined(CONFIG_MEDIATE_PASSTHRU_MODULE)
+int netdev_mp_port_prep(struct net_device *dev,
+   struct mpassthru_port *port)
+{
+   int rc;
+   int npages, data_len;
+   const struct net_device_ops *ops = dev->netdev_ops;
+
+   /* needed by packet split */
+
+   if (ops->ndo_mp_port_prep) {
+   rc = ops->ndo_mp_port_prep(dev, port);
+   if (rc)
+   return rc;
+   } else {
+   /* If the NIC driver did not report this,
+* then we try to use default value.
+*/
+   port->hdr_len = 128;
+   port->data_len = 2048;
+   port->npages = 1;
+   }
+
+   if (port->hdr_len <= 0)
+   goto err;
+
+   npages = port->npages;
+   data_len = port->data_len;
+   if (npages <= 0 || npages > MAX_SKB_FRAGS ||
+   (data_len < PAGE_SIZE * (npages - 1) ||
+data_len > PAGE_SIZE * npages))
+   goto err;
+
+   return 0;
+err:
+   dev_warn(&dev->dev, "invalid page constructor parameters\n");
+
+   return -EINVAL;
+}
+EXPORT_SYMBOL(netdev_mp_port_prep);
+#endif
+
 /**
  * netif_receive_skb - process receive buffer from network
  * @skb: buffer to process
-- 
1.5.4.4
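
For reference, the intended calling order on the mp device side is
roughly the following (example_bind() is a placeholder; the real code
holds mp_mutex around this and lives in the mp device patches):

#include <linux/netdevice.h>

/* Sketch only: query the NIC's capabilities, then publish the port so
 * that __alloc_skb() can start handing out external buffers.
 */
static int example_bind(struct net_device *dev, struct mpassthru_port *port)
{
	int rc;

	/* fills port->hdr_len/data_len/npages, either from the driver's
	 * ndo_mp_port_prep() or from the IGB-based defaults */
	rc = netdev_mp_port_prep(dev, port);
	if (rc)
		return rc;

	return netdev_mp_port_attach(dev, port);	/* sets dev->mp_port */
}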



[RFC][PATCH v5 08/19] Make __alloc_skb() to get external buffer.

2010-05-07 Thread xiaohui . xin
From: Xin Xiaohui 

Add a dev parameter to __alloc_skb(). When an external buffer
is used, skb->data points to the external buffer, skb->head is
recomputed, the shinfo of the external buffer is maintained, and
the external buffer info is recorded in the destructor_arg field.

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---

__alloc_skb() cleanup by 

Jeff Dike 

 include/linux/skbuff.h |7 ---
 net/core/skbuff.c  |   43 +--
 2 files changed, 41 insertions(+), 9 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 281a1c0..5ff8c27 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -442,17 +442,18 @@ extern void kfree_skb(struct sk_buff *skb);
 extern void consume_skb(struct sk_buff *skb);
 extern void   __kfree_skb(struct sk_buff *skb);
 extern struct sk_buff *__alloc_skb(unsigned int size,
-  gfp_t priority, int fclone, int node);
+  gfp_t priority, int fclone,
+  int node, struct net_device *dev);
 static inline struct sk_buff *alloc_skb(unsigned int size,
gfp_t priority)
 {
-   return __alloc_skb(size, priority, 0, -1);
+   return __alloc_skb(size, priority, 0, -1, NULL);
 }
 
 static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
   gfp_t priority)
 {
-   return __alloc_skb(size, priority, 1, -1);
+   return __alloc_skb(size, priority, 1, -1, NULL);
 }
 
 extern int skb_recycle_check(struct sk_buff *skb, int skb_size);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 6345acc..ae223d2 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -161,7 +161,8 @@ EXPORT_SYMBOL(skb_under_panic);
  * @fclone: allocate from fclone cache instead of head cache
  * and allocate a cloned (child) skb
  * @node: numa node to allocate memory on
- *
+ * @dev: the device that owns the skb if the skb tries to get an external
+ * buffer; otherwise NULL.
  * Allocate a new &sk_buff. The returned buffer has no headroom and a
  * tail room of size bytes. The object has a reference count of one.
  * The return is the buffer. On a failure the return is %NULL.
@@ -170,12 +171,13 @@ EXPORT_SYMBOL(skb_under_panic);
  * %GFP_ATOMIC.
  */
 struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
-   int fclone, int node)
+   int fclone, int node, struct net_device *dev)
 {
struct kmem_cache *cache;
struct skb_shared_info *shinfo;
struct sk_buff *skb;
-   u8 *data;
+   u8 *data = NULL;
+   struct skb_external_page *ext_page = NULL;
 
cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
 
@@ -185,8 +187,23 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t 
gfp_mask,
goto out;
 
size = SKB_DATA_ALIGN(size);
-   data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
-   gfp_mask, node);
+
+   /* If the device wants to do mediate passthru (zero-copy),
+* the skb may try to get external buffers from outside.
+* If that fails, fall back to allocating buffers from the kernel.
+*/
+   if (dev && dev->mp_port) {
+   ext_page = netdev_alloc_external_page(dev, skb, size);
+   if (ext_page) {
+   data = ext_page->start;
+   size = ext_page->size;
+   }
+   }
+
+   if (!data)
+   data = kmalloc_node_track_caller(
+   size + sizeof(struct skb_shared_info),
+   gfp_mask, node);
if (!data)
goto nodata;
 
@@ -208,6 +225,15 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t 
gfp_mask,
skb->mac_header = ~0U;
 #endif
 
+   /* If the skb got external buffers successfully: since the shinfo
+* is at the end of the buffer, keep a copy of the shinfo in case
+* we need it later.
+*/
+   if (ext_page) {
+   skb->head = skb->data - NET_IP_ALIGN - NET_SKB_PAD;
+   memcpy(ext_page->ushinfo, skb_shinfo(skb),
+  sizeof(struct skb_shared_info));
+   }
/* make sure we initialize shinfo sequentially */
shinfo = skb_shinfo(skb);
atomic_set(&shinfo->dataref, 1);
@@ -231,6 +257,11 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t 
gfp_mask,
 
child->fclone = SKB_FCLONE_UNAVAILABLE;
}
+   /* Record the external buffer info in this field. It's not so good,
+* but we cannot find another place easily.
+*/
+   shinfo->destructor_arg = ext_page;
+
 out:
return skb;
 nodata:
@@ -259,7 +290,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
int node = dev->dev.parent ? dev_

[RFC][PATCH v5 07/19] Add interface to get external buffers.

2010-05-07 Thread xiaohui . xin
From: Xin Xiaohui 

Currently, it gets external buffers from the mp device.

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 include/linux/skbuff.h |   12 
 net/core/skbuff.c  |   16 
 2 files changed, 28 insertions(+), 0 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index cf309c9..281a1c0 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1519,6 +1519,18 @@ static inline void netdev_free_page(struct net_device 
*dev, struct page *page)
__free_page(page);
 }
 
+extern struct skb_external_page *netdev_alloc_external_pages(
+   struct net_device *dev,
+   struct sk_buff *skb, int npages);
+
+static inline struct skb_external_page *netdev_alloc_external_page(
+   struct net_device *dev,
+   struct sk_buff *skb, unsigned int size)
+{
+   return netdev_alloc_external_pages(dev, skb,
+  DIV_ROUND_UP(size, PAGE_SIZE));
+}
+
 /**
  * skb_clone_writable - is the header of a clone writable
  * @skb: buffer to check
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 93c4e06..6345acc 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -278,6 +278,22 @@ struct page *__netdev_alloc_page(struct net_device *dev, 
gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(__netdev_alloc_page);
 
+struct skb_external_page *netdev_alloc_external_pages(struct net_device *dev,
+   struct sk_buff *skb, int npages)
+{
+   struct mpassthru_port *port;
+   struct skb_external_page *ext_page = NULL;
+
+   port = rcu_dereference(dev->mp_port);
+   if (!port)
+   goto out;
+   BUG_ON(npages > port->npages);
+   ext_page = port->ctor(port, skb, npages);
+out:
+   return ext_page;
+}
+EXPORT_SYMBOL(netdev_alloc_external_pages);
+
 void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
int size)
 {
-- 
1.5.4.4



[RFC][PATCH v5 09/19] Ignore skb_reserve() room when device is using external buffer.

2010-05-07 Thread xiaohui . xin
From: Xin Xiaohui 

To make skb->data and skb->head from an external buffer
consistent, we ignore the room reserved by the driver
for a kernel skb.

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 include/linux/skbuff.h |9 +
 1 files changed, 9 insertions(+), 0 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 5ff8c27..193b259 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1200,6 +1200,15 @@ static inline int skb_tailroom(const struct sk_buff *skb)
  */
 static inline void skb_reserve(struct sk_buff *skb, int len)
 {
+   /* skb_reserve() is only for an empty buffer. When the skb
+* is using an external buffer, we cannot guarantee that the
+* external buffer has the same reserved space in its header
+* as a kernel-allocated skb has, so we have to ignore the
+* reserve. We have recorded the external buffer info in the
+* destructor_arg field, so use it as the indicator.
+*/
+   if (skb_shinfo(skb)->destructor_arg)
+   return;
skb->data += len;
skb->tail += len;
 }
-- 
1.5.4.4



[RFC][PATCH v5 14/19] Add header file for mp device.

2010-05-07 Thread xiaohui . xin
From: Xin Xiaohui 

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 include/linux/mpassthru.h |   25 +
 1 files changed, 25 insertions(+), 0 deletions(-)
 create mode 100644 include/linux/mpassthru.h

diff --git a/include/linux/mpassthru.h b/include/linux/mpassthru.h
new file mode 100644
index 000..ba8f320
--- /dev/null
+++ b/include/linux/mpassthru.h
@@ -0,0 +1,25 @@
+#ifndef __MPASSTHRU_H
+#define __MPASSTHRU_H
+
+#include 
+#include 
+
+/* ioctl defines */
+#define MPASSTHRU_BINDDEV  _IOW('M', 213, int)
+#define MPASSTHRU_UNBINDDEV_IO('M', 214)
+
+#ifdef __KERNEL__
+#if defined(CONFIG_MEDIATE_PASSTHRU) || defined(CONFIG_MEDIATE_PASSTHRU_MODULE)
+struct socket *mp_get_socket(struct file *);
+#else
+#include 
+#include 
+struct file;
+struct socket;
+static inline struct socket *mp_get_socket(struct file *f)
+{
+   return ERR_PTR(-EINVAL);
+}
+#endif /* CONFIG_MEDIATE_PASSTHRU */
+#endif /* __KERNEL__ */
+#endif /* __MPASSTHRU_H */
-- 
1.5.4.4
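
For completeness, user space would drive these ioctls along the lines
of the sketch below. The device node name and the ioctl argument layout
are assumptions here (the define above only fixes the ioctl number; the
real format is whatever mp_chr_ioctl() expects):

#include <fcntl.h>
#include <net/if.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/mpassthru.h>

int main(void)
{
	struct ifreq ifr;
	int fd = open("/dev/mpassthru", O_RDWR);	/* node name assumed */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);	/* NIC to bind */
	if (ioctl(fd, MPASSTHRU_BINDDEV, &ifr) < 0)	/* argument layout assumed */
		perror("MPASSTHRU_BINDDEV");

	/* ... hand the fd over to the vhost-net backend here ... */

	ioctl(fd, MPASSTHRU_UNBINDDEV);
	close(fd);
	return 0;
}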



[RFC][PATCH v5 16/19] Manipulate external buffers in mp device.

2010-05-07 Thread xiaohui . xin
From: Xin Xiaohui 

Describes where external buffers come from and how they are destroyed.

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 drivers/vhost/mpassthru.c |  254 -
 1 files changed, 251 insertions(+), 3 deletions(-)

diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
index 33cc123..8538a87 100644
--- a/drivers/vhost/mpassthru.c
+++ b/drivers/vhost/mpassthru.c
@@ -160,6 +160,39 @@ static int mp_dev_change_flags(struct net_device *dev, 
unsigned flags)
return ret;
 }
 
+/* The main function to allocate external buffers */
+static struct skb_external_page *page_ctor(struct mpassthru_port *port,
+   struct sk_buff *skb, int npages)
+{
+   int i;
+   unsigned long flags;
+   struct page_ctor *ctor;
+   struct page_info *info = NULL;
+
+   ctor = container_of(port, struct page_ctor, port);
+
+   spin_lock_irqsave(&ctor->read_lock, flags);
+   if (!list_empty(&ctor->readq)) {
+   info = list_first_entry(&ctor->readq, struct page_info, list);
+   list_del(&info->list);
+   }
+   spin_unlock_irqrestore(&ctor->read_lock, flags);
+   if (!info)
+   return NULL;
+
+   for (i = 0; i < info->pnum; i++) {
+   get_page(info->pages[i]);
+   info->frag[i].page = info->pages[i];
+   info->frag[i].page_offset = i ? 0 : info->offset;
+   info->frag[i].size = port->npages > 1 ? PAGE_SIZE :
+   port->data_len;
+   }
+   info->skb = skb;
+   info->ext_page.frags = info->frag;
+   info->ext_page.ushinfo = &info->ushinfo;
+   return &info->ext_page;
+}
+
 static int page_ctor_attach(struct mp_struct *mp)
 {
int rc;
@@ -192,7 +225,7 @@ static int page_ctor_attach(struct mp_struct *mp)
 
dev_hold(dev);
ctor->dev = dev;
-   ctor->port.ctor = NULL;
+   ctor->port.ctor = page_ctor;
ctor->port.sock = &mp->socket;
ctor->lock_pages = 0;
rc = netdev_mp_port_attach(dev, &ctor->port);
@@ -260,11 +293,66 @@ static int set_memlock_rlimit(struct page_ctor *ctor, int 
resource,
return 0;
 }
 
+static void relinquish_resource(struct page_ctor *ctor)
+{
+   if (!(ctor->dev->flags & IFF_UP) &&
+   !(ctor->wq_len + ctor->rq_len))
+   kmem_cache_destroy(ctor->cache);
+}
+
+static void mp_ki_dtor(struct kiocb *iocb)
+{
+   struct page_info *info = (struct page_info *)(iocb->private);
+   int i;
+
+   if (info->flags == INFO_READ) {
+   for (i = 0; i < info->pnum; i++) {
+   if (info->pages[i]) {
+   set_page_dirty_lock(info->pages[i]);
+   put_page(info->pages[i]);
+   }
+   }
+   info->skb->destructor = NULL;
+   kfree_skb(info->skb);
+   info->ctor->rq_len--;
+   } else
+   info->ctor->wq_len--;
+   /* Decrement the number of locked pages */
+   info->ctor->lock_pages -= info->pnum;
+   kmem_cache_free(info->ctor->cache, info);
+   relinquish_resource(info->ctor);
+
+   return;
+}
+
+static struct kiocb *create_iocb(struct page_info *info, int size)
+{
+   struct kiocb *iocb = NULL;
+
+   iocb = info->iocb;
+   if (!iocb)
+   return iocb;
+   iocb->ki_flags = 0;
+   iocb->ki_users = 1;
+   iocb->ki_key = 0;
+   iocb->ki_ctx = NULL;
+   iocb->ki_cancel = NULL;
+   iocb->ki_retry = NULL;
+   iocb->ki_iovec = NULL;
+   iocb->ki_eventfd = NULL;
+   iocb->ki_pos = info->desc_pos;
+   iocb->ki_nbytes = size;
+   iocb->ki_dtor(iocb);
+   iocb->private = (void *)info;
+   iocb->ki_dtor = mp_ki_dtor;
+
+   return iocb;
+}
+
 static int page_ctor_detach(struct mp_struct *mp)
 {
struct page_ctor *ctor;
struct page_info *info;
-   struct kiocb *iocb = NULL;
int i;
 
/* locked by mp_mutex */
@@ -276,12 +364,17 @@ static int page_ctor_detach(struct mp_struct *mp)
for (i = 0; i < info->pnum; i++)
if (info->pages[i])
put_page(info->pages[i]);
+   create_iocb(info, 0);
+   ctor->rq_len--;
kmem_cache_free(ctor->cache, info);
}
+
+   relinquish_resource(ctor);
+
set_memlock_rlimit(ctor, RLIMIT_MEMLOCK,
   ctor->o_rlim.rlim_cur,
   ctor->o_rlim.rlim_max);
-   kmem_cache_destroy(ctor->cache);
+
netdev_mp_port_detach(ctor->dev);
dev_put(ctor->dev);
 
@@ -329,6 +422,161 @@ static void mp_put(struct mp_file *mfile)
mp_detach(mfile->mp);
 }
 
+/* The callback to destruct the external buffers or skb */
+static void page_dtor(struct skb_external_page *ext_page)
+{
+   struct page_info 

[RFC][PATCH v5 17/19] Export proto_ops to vhost-net driver.

2010-05-07 Thread xiaohui . xin
From: Xin Xiaohui 

Currently, vhost-net is the only user of the mp device.

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 drivers/vhost/mpassthru.c |  322 -
 1 files changed, 318 insertions(+), 4 deletions(-)

diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
index 8538a87..96b314a 100644
--- a/drivers/vhost/mpassthru.c
+++ b/drivers/vhost/mpassthru.c
@@ -577,8 +577,322 @@ failed:
return NULL;
 }
 
+static void mp_sock_destruct(struct sock *sk)
+{
+   struct mp_struct *mp = container_of(sk, struct mp_sock, sk)->mp;
+   kfree(mp);
+}
+
+static void mp_sock_state_change(struct sock *sk)
+{
+   if (sk_has_sleeper(sk))
+   wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN);
+}
+
+static void mp_sock_write_space(struct sock *sk)
+{
+   if (sk_has_sleeper(sk))
+   wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT);
+}
+
+static void mp_sock_data_ready(struct sock *sk, int coming)
+{
+   struct mp_struct *mp = container_of(sk, struct mp_sock, sk)->mp;
+   struct page_ctor *ctor = NULL;
+   struct sk_buff *skb = NULL;
+   struct page_info *info = NULL;
+   struct ethhdr *eth;
+   struct kiocb *iocb = NULL;
+   int len, i;
+
+   struct virtio_net_hdr hdr = {
+   .flags = 0,
+   .gso_type = VIRTIO_NET_HDR_GSO_NONE
+   };
+
+   ctor = rcu_dereference(mp->ctor);
+   if (!ctor)
+   return;
+
+   while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+   if (skb_shinfo(skb)->destructor_arg) {
+   info = container_of(skb_shinfo(skb)->destructor_arg,
+   struct page_info, ext_page);
+   info->skb = skb;
+   if (skb->len > info->len) {
+   mp->dev->stats.rx_dropped++;
+   DBG(KERN_INFO "Discarded truncated rx packet: "
+   " len %d > %zd\n", skb->len, info->len);
+   info->total = skb->len;
+   goto clean;
+   } else {
+   int i;
+   struct skb_shared_info *gshinfo =
+   (struct skb_shared_info *)
+   (&info->ushinfo);
+   struct skb_shared_info *hshinfo =
+   skb_shinfo(skb);
+
+   if (gshinfo->nr_frags < hshinfo->nr_frags)
+   goto clean;
+   eth = eth_hdr(skb);
+   skb_push(skb, ETH_HLEN);
+
+   hdr.hdr_len = skb_headlen(skb);
+   info->total = skb->len;
+
+   for (i = 0; i < gshinfo->nr_frags; i++)
+   gshinfo->frags[i].size = 0;
+   for (i = 0; i < hshinfo->nr_frags; i++)
+   gshinfo->frags[i].size =
+   hshinfo->frags[i].size;
+   }
+   } else {
+   /* The skb was composed from kernel buffers
+* because external buffers were not sufficient.
+* This case should be rare.
+*/
+   unsigned long flags;
+   int i;
+   struct skb_shared_info *gshinfo = NULL;
+
+   info = NULL;
+
+   spin_lock_irqsave(&ctor->read_lock, flags);
+   if (!list_empty(&ctor->readq)) {
+   info = list_first_entry(&ctor->readq,
+   struct page_info, list);
+   list_del(&info->list);
+   }
+   spin_unlock_irqrestore(&ctor->read_lock, flags);
+   if (!info) {
+   DBG(KERN_INFO
+   "No external buffer avaliable %p\n",
+   skb);
+   skb_queue_head(&sk->sk_receive_queue,
+   skb);
+   break;
+   }
+   info->skb = skb;
+   /* compute the guest skb frags info */
+   gshinfo = (struct skb_shared_info *)
+ (info->ext_page.start +
+ SKB_DATA_ALIGN(info->ext_page.size));
+
+   if (gshinfo->nr_frags < skb_shinfo(skb)->nr_frags)
+   goto clean;
+
+  

[RFC][PATCH v5 18/19] Add a kconfig entry and make entry for mp device.

2010-05-07 Thread xiaohui . xin
From: Xin Xiaohui 

Signed-off-by: Xin Xiaohui 
Reviewed-by: Jeff Dike 
---
 drivers/vhost/Kconfig  |   10 ++
 drivers/vhost/Makefile |2 ++
 2 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index e4e2fd1..a6b8cbf 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -9,3 +9,13 @@ config VHOST_NET
  To compile this driver as a module, choose M here: the module will
  be called vhost_net.
 
+config MEDIATE_PASSTHRU
+   tristate "mediate passthru network driver (EXPERIMENTAL)"
+   depends on VHOST_NET
+   ---help---
+ zerocopy network I/O support; we call it mediate passthru to
+ distinguish it from hardware passthru.
+
+ To compile this driver as a module, choose M here: the module will
+ be called mpassthru.
+
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index 72dd020..c18b9fc 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -1,2 +1,4 @@
 obj-$(CONFIG_VHOST_NET) += vhost_net.o
 vhost_net-y := vhost.o net.o
+
+obj-$(CONFIG_MEDIATE_PASSTHRU) += mpassthru.o
-- 
1.5.4.4



[RFC][PATCH v5 19/19] Provides multiple submits and asynchronous notifications.

2010-05-07 Thread xiaohui . xin
From: Xin Xiaohui 

The vhost-net backend now only supports synchronous send/recv
operations. The patch provides multiple submits and asynchronous
notifications. This is needed for the zero-copy case.

Signed-off-by: Xin Xiaohui 
---
 drivers/vhost/net.c   |  240 +++-
 drivers/vhost/vhost.c |  120 ++---
 drivers/vhost/vhost.h |   14 +++
 3 files changed, 318 insertions(+), 56 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 9777583..b3171ed 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -24,6 +24,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include 
 
@@ -49,6 +51,7 @@ struct vhost_net {
struct vhost_dev dev;
struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
struct vhost_poll poll[VHOST_NET_VQ_MAX];
+   struct kmem_cache   *cache;
/* Tells us whether we are polling a socket for TX.
 * We only do this when socket buffer fills up.
 * Protected by tx vq lock. */
@@ -93,11 +96,138 @@ static void tx_poll_start(struct vhost_net *net, struct 
socket *sock)
net->tx_poll_state = VHOST_NET_POLL_STARTED;
 }
 
+struct kiocb *notify_dequeue(struct vhost_virtqueue *vq)
+{
+   struct kiocb *iocb = NULL;
+   unsigned long flags;
+
+   spin_lock_irqsave(&vq->notify_lock, flags);
+   if (!list_empty(&vq->notifier)) {
+   iocb = list_first_entry(&vq->notifier,
+   struct kiocb, ki_list);
+   list_del(&iocb->ki_list);
+   }
+   spin_unlock_irqrestore(&vq->notify_lock, flags);
+   return iocb;
+}
+
+static void handle_iocb(struct kiocb *iocb)
+{
+   struct vhost_virtqueue *vq = iocb->private;
+   unsigned long flags;
+
+   spin_lock_irqsave(&vq->notify_lock, flags);
+   list_add_tail(&iocb->ki_list, &vq->notifier);
+   spin_unlock_irqrestore(&vq->notify_lock, flags);
+}
+
+static int is_async_vq(struct vhost_virtqueue *vq)
+{
+   return (vq->link_state == VHOST_VQ_LINK_ASYNC);
+}
+
+static void handle_async_rx_events_notify(struct vhost_net *net,
+ struct vhost_virtqueue *vq,
+ struct socket *sock)
+{
+   struct kiocb *iocb = NULL;
+   struct vhost_log *vq_log = NULL;
+   int rx_total_len = 0;
+   unsigned int head, log, in, out;
+   int size;
+
+   if (!is_async_vq(vq))
+   return;
+
+   if (sock->sk->sk_data_ready)
+   sock->sk->sk_data_ready(sock->sk, 0);
+
+   vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
+   vq->log : NULL;
+
+   while ((iocb = notify_dequeue(vq)) != NULL) {
+   vhost_add_used_and_signal(&net->dev, vq,
+   iocb->ki_pos, iocb->ki_nbytes);
+   size = iocb->ki_nbytes;
+   head = iocb->ki_pos;
+   rx_total_len += iocb->ki_nbytes;
+
+   if (iocb->ki_dtor)
+   iocb->ki_dtor(iocb);
+   kmem_cache_free(net->cache, iocb);
+
+   /* when log is enabled, recomputing the log info is needed,
+* since these buffers are in async queue, and may not get
+* the log info before.
+*/
+   if (unlikely(vq_log)) {
+   if (!log)
+   __vhost_get_vq_desc(&net->dev, vq, vq->iov,
+   ARRAY_SIZE(vq->iov),
+   &out, &in, vq_log,
+   &log, head);
+   vhost_log_write(vq, vq_log, log, size);
+   }
+   if (unlikely(rx_total_len >= VHOST_NET_WEIGHT)) {
+   vhost_poll_queue(&vq->poll);
+   break;
+   }
+   }
+}
+
+static void handle_async_tx_events_notify(struct vhost_net *net,
+ struct vhost_virtqueue *vq)
+{
+   struct kiocb *iocb = NULL;
+   int tx_total_len = 0;
+
+   if (!is_async_vq(vq))
+   return;
+
+   while ((iocb = notify_dequeue(vq)) != NULL) {
+   vhost_add_used_and_signal(&net->dev, vq,
+   iocb->ki_pos, 0);
+   tx_total_len += iocb->ki_nbytes;
+
+   if (iocb->ki_dtor)
+   iocb->ki_dtor(iocb);
+
+   kmem_cache_free(net->cache, iocb);
+   if (unlikely(tx_total_len >= VHOST_NET_WEIGHT)) {
+   vhost_poll_queue(&vq->poll);
+   break;
+   }
+   }
+}
+
+static struct kiocb *create_iocb(struct vhost_net *net,
+struct vhost_virtqueue *vq,
+unsigned head)
+{
+   struct kiocb *iocb = NULL;
+
+   if (!is_async_vq(vq))

[RFC][PATCH v5 15/19] Add basic funcs and ioctl to mp device.

2010-05-07 Thread xiaohui . xin
From: Xin Xiaohui 

The ioctl is used by the mp device to bind an underlying
NIC; it will query the hardware capability and declare
that the NIC uses external buffers.

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---

memory leak fixed,
kconfig made,
do_unbind() made,
mp_chr_ioctl() cleanup

by Jeff Dike 

 drivers/vhost/mpassthru.c |  682 +
 1 files changed, 682 insertions(+), 0 deletions(-)
 create mode 100644 drivers/vhost/mpassthru.c

diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
new file mode 100644
index 000..33cc123
--- /dev/null
+++ b/drivers/vhost/mpassthru.c
@@ -0,0 +1,682 @@
+/*
+ *  MPASSTHRU - Mediate passthrough device.
+ *  Copyright (C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ */
+
+#define DRV_NAME"mpassthru"
+#define DRV_DESCRIPTION "Mediate passthru device driver"
+#define DRV_COPYRIGHT   "(C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G"
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+/* Uncomment to enable debugging */
+/* #define MPASSTHRU_DEBUG 1 */
+
+#ifdef MPASSTHRU_DEBUG
+static int debug;
+
+#define DBG  if (mp->debug) printk
+#define DBG1 if (debug == 2) printk
+#else
+#define DBG(a...)
+#define DBG1(a...)
+#endif
+
+#define COPY_THRESHOLD (L1_CACHE_BYTES * 4)
+#define COPY_HDR_LEN   (L1_CACHE_BYTES < 64 ? 64 : L1_CACHE_BYTES)
+
+struct frag {
+   u16 offset;
+   u16 size;
+};
+
+struct page_info {
+   struct list_headlist;
+   int header;
+   /* indicates the actual number of bytes
+* sent/received in the external buffers
+*/
+   int total;
+   int offset;
+   struct page *pages[MAX_SKB_FRAGS+1];
+   struct skb_frag_struct  frag[MAX_SKB_FRAGS+1];
+   struct sk_buff  *skb;
+   struct page_ctor*ctor;
+
+   /* The pointer relayed to the skb, to indicate
+* whether it is an externally allocated skb or a kernel one
+*/
+   struct skb_external_pageext_page;
+   struct skb_shared_info  ushinfo;
+
+#define INFO_READ  0
+#define INFO_WRITE 1
+   unsignedflags;
+   unsignedpnum;
+
+   /* Only meaningful for receive; it means
+* the max length allowed
+*/
+   size_t  len;
+
+   /* The fields after this are for the backend
+* driver, currently vhost-net.
+*/
+
+   struct kiocb*iocb;
+   unsigned intdesc_pos;
+   struct iovechdr[MAX_SKB_FRAGS + 2];
+   struct ioveciov[MAX_SKB_FRAGS + 2];
+};
+
+struct page_ctor {
+   struct list_headreadq;
+   int wq_len;
+   int rq_len;
+   spinlock_t  read_lock;
+   struct kmem_cache   *cache;
+   /* record the locked pages */
+   int lock_pages;
+   struct rlimit   o_rlim;
+   struct net_device   *dev;
+   struct mpassthru_port   port;
+};
+
+struct mp_struct {
+   struct mp_file  *mfile;
+   struct net_device   *dev;
+   struct page_ctor*ctor;
+   struct socket   socket;
+
+#ifdef MPASSTHRU_DEBUG
+   int debug;
+#endif
+};
+
+struct mp_file {
+   atomic_t count;
+   struct mp_struct *mp;
+   struct net *net;
+};
+
+struct mp_sock {
+   struct sock sk;
+   struct mp_struct*mp;
+};
+
+static int mp_dev_change_flags(struct net_device *dev, unsigned flags)
+{
+   int ret = 0;
+
+   rtnl_lock();
+   ret = dev_change_flags(dev, flags);
+   rtnl_unlock();
+
+   if (ret < 0)
+   printk(KERN_ERR "failed to change dev state of %s", dev->name);
+
+   return ret;
+}
+
+static int page_ctor_attach(struct mp_struct *mp)
+{
+   int rc;
+   struct page_ctor *ctor;
+   struct net_device *dev = mp->dev;
+
+   /* locked by mp_mutex */
+   if (rcu_dereference(mp->ctor))
+   return -EBUSY;
+
+

[RFC][PATCH v5 13/19] To skip GRO if buffer is external currently.

2010-05-07 Thread xiaohui . xin
From: Xin Xiaohui 

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 net/core/dev.c |4 
 1 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index dc2f225..6c6b2fe 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2787,6 +2787,10 @@ enum gro_result dev_gro_receive(struct napi_struct 
*napi, struct sk_buff *skb)
if (skb_is_gso(skb) || skb_has_frags(skb))
goto normal;
 
+   /* currently GRO is not supported by mediate passthru */
+   if (dev_is_mpassthru(skb->dev))
+   goto normal;
+
rcu_read_lock();
list_for_each_entry_rcu(ptype, head, list) {
if (ptype->type != type || ptype->dev || !ptype->gro_receive)
-- 
1.5.4.4



[RFC][PATCH v5 12/19] Add a hook to intercept external buffers from NIC driver.

2010-05-07 Thread xiaohui . xin
From: Xin Xiaohui 

The hook is called in netif_receive_skb().
Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 net/core/dev.c |   35 +++
 1 files changed, 35 insertions(+), 0 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 37b389a..dc2f225 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2548,6 +2548,37 @@ err:
 EXPORT_SYMBOL(netdev_mp_port_prep);
 #endif
 
+#if defined(CONFIG_MEDIATE_PASSTHRU) || defined(CONFIG_MEDIATE_PASSTHRU_MODULE)
+/* Add a hook to intercept mediate passthru (zero-copy) packets
+ * and insert them into the socket queue owned by the mp_port.
+ */
+static inline struct sk_buff *handle_mpassthru(struct sk_buff *skb,
+  struct packet_type **pt_prev,
+  int *ret,
+  struct net_device *orig_dev)
+{
+   struct mpassthru_port *mp_port = NULL;
+   struct sock *sk = NULL;
+
+   if (!dev_is_mpassthru(skb->dev))
+   return skb;
+   mp_port = skb->dev->mp_port;
+
+   if (*pt_prev) {
+   *ret = deliver_skb(skb, *pt_prev, orig_dev);
+   *pt_prev = NULL;
+   }
+
+   sk = mp_port->sock->sk;
+   skb_queue_tail(&sk->sk_receive_queue, skb);
+   sk->sk_state_change(sk);
+
+   return NULL;
+}
+#else
+#define handle_mpassthru(skb, pt_prev, ret, orig_dev) (skb)
+#endif
+
 /**
  * netif_receive_skb - process receive buffer from network
  * @skb: buffer to process
@@ -2629,6 +2660,10 @@ int netif_receive_skb(struct sk_buff *skb)
 ncls:
 #endif
 
+   /* To intercept mediate passthru(zero-copy) packets here */
+   skb = handle_mpassthru(skb, &pt_prev, &ret, orig_dev);
+   if (!skb)
+   goto out;
skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
if (!skb)
goto out;
-- 
1.5.4.4



[RFC][PATCH v5 11/19] Use callback to deal with skb_release_data() specially.

2010-05-07 Thread xiaohui . xin
From: Xin Xiaohui 

If the buffer is external, then use the callback to destruct
the buffers.

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 net/core/skbuff.c |   11 +++
 1 files changed, 11 insertions(+), 0 deletions(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 169f22c..5d93b2d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -385,6 +385,11 @@ static void skb_clone_fraglist(struct sk_buff *skb)
 
 static void skb_release_data(struct sk_buff *skb)
 {
+   /* Check if the skb has external buffers; we use destructor_arg
+* here as the indicator.
+*/
+   struct skb_external_page *ext_page = skb_shinfo(skb)->destructor_arg;
+
if (!skb->cloned ||
!atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
   &skb_shinfo(skb)->dataref)) {
@@ -397,6 +402,12 @@ static void skb_release_data(struct sk_buff *skb)
if (skb_has_frags(skb))
skb_drop_fraglist(skb);
 
+   /* If the skb has external buffers, use the destructor here,
+* since skb->head will be kfree'd after this, and a skb->head
+* from an external buffer cannot be destroyed with kfree.
+*/
+   if (dev_is_mpassthru(skb->dev) && ext_page && ext_page->dtor)
+   ext_page->dtor(ext_page);
kfree(skb->head);
}
 }
-- 
1.5.4.4



[RFC][PATCH v5 10/19] Don't do skb recycle, if device use external buffer.

2010-05-07 Thread xiaohui . xin
From: Xin Xiaohui 

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 net/core/skbuff.c |6 ++
 1 files changed, 6 insertions(+), 0 deletions(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index ae223d2..169f22c 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -553,6 +553,12 @@ int skb_recycle_check(struct sk_buff *skb, int skb_size)
if (skb_shared(skb) || skb_cloned(skb))
return 0;
 
+   /* if the device wants to do mediate passthru, the skb may
+* get external buffer, so don't recycle
+*/
+   if (dev_is_mpassthru(skb->dev))
+   return 0;
+
skb_release_head_state(skb);
shinfo = skb_shinfo(skb);
atomic_set(&shinfo->dataref, 1);
-- 
1.5.4.4



[RFC][PATCH v5 06/19] Add a function to indicate if device use external buffer.

2010-05-07 Thread xiaohui . xin
From: Xin Xiaohui 

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 include/linux/netdevice.h |5 +
 1 files changed, 5 insertions(+), 0 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 31d9c4a..0cb78f4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1602,6 +1602,11 @@ extern void netdev_mp_port_detach(struct net_device 
*dev);
 extern int netdev_mp_port_prep(struct net_device *dev,
struct mpassthru_port *port);
 
+static inline bool dev_is_mpassthru(struct net_device *dev)
+{
+   return (dev && dev->mp_port);
+}
+
 static inline void napi_free_frags(struct napi_struct *napi)
 {
kfree_skb(napi->skb);
-- 
1.5.4.4



[RFC][PATCH v5 04/19] Add a ndo_mp_port_prep pointer to net_device_ops.

2010-05-07 Thread xiaohui . xin
From: Xin Xiaohui 

If the driver wants to allocate external buffers,
then it can export its capabilities, such as the skb
buffer header length, the page length that can be DMA'd, etc.
The owner of the external buffers may utilize this.

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 include/linux/netdevice.h |4 
 1 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index efb575a..183c786 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -707,6 +707,10 @@ struct net_device_ops {
int (*ndo_fcoe_get_wwn)(struct net_device *dev,
u64 *wwn, int type);
 #endif
+#if defined(CONFIG_MEDIATE_PASSTHRU) || defined(CONFIG_MEDIATE_PASSTHRU_MODULE)
+   int (*ndo_mp_port_prep)(struct net_device *dev,
+   struct mpassthru_port *port);
+#endif
 };
 
 /*
-- 
1.5.4.4
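
To illustrate, a NIC driver that wants to report its layout could
implement the hook as below; the numbers simply mirror the IGB-style
defaults that netdev_mp_port_prep() falls back to, and the function
name is a placeholder:

#include <linux/netdevice.h>

/* Hypothetical driver implementation of the new hook. A real driver
 * would report its actual descriptor layout here.
 */
static int example_ndo_mp_port_prep(struct net_device *dev,
				    struct mpassthru_port *port)
{
	port->hdr_len  = 128;	/* bytes DMA'd into the head buffer */
	port->data_len = 2048;	/* payload bytes one descriptor carries */
	port->npages   = 1;	/* pages backing one payload buffer */
	return 0;
}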



[RFC][PATCH v5 03/19] Export 2 functions for a device to assign/deassign the new structure

2010-05-07 Thread xiaohui . xin
From: Xin Xiaohui 

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 include/linux/netdevice.h |3 +++
 net/core/dev.c|   28 
 2 files changed, 31 insertions(+), 0 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index bae725c..efb575a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1592,6 +1592,9 @@ extern gro_result_t   napi_frags_finish(struct 
napi_struct *napi,
  gro_result_t ret);
 extern struct sk_buff *napi_frags_skb(struct napi_struct *napi);
 extern gro_result_tnapi_gro_frags(struct napi_struct *napi);
+extern int netdev_mp_port_attach(struct net_device *dev,
+struct mpassthru_port *port);
+extern void netdev_mp_port_detach(struct net_device *dev);
 
 static inline void napi_free_frags(struct napi_struct *napi)
 {
diff --git a/net/core/dev.c b/net/core/dev.c
index f769098..ecbb6b1 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2469,6 +2469,34 @@ void netif_nit_deliver(struct sk_buff *skb)
rcu_read_unlock();
 }
 
+/* Export two functions to assign/de-assign mp_port pointer
+ * to a net device.
+ */
+
+int netdev_mp_port_attach(struct net_device *dev,
+   struct mpassthru_port *port)
+{
+   /* locked by mp_mutex */
+   if (rcu_dereference(dev->mp_port))
+   return -EBUSY;
+
+   rcu_assign_pointer(dev->mp_port, port);
+
+   return 0;
+}
+EXPORT_SYMBOL(netdev_mp_port_attach);
+
+void netdev_mp_port_detach(struct net_device *dev)
+{
+   /* locked by mp_mutex */
+   if (!rcu_dereference(dev->mp_port))
+   return;
+
+   rcu_assign_pointer(dev->mp_port, NULL);
+   synchronize_rcu();
+}
+EXPORT_SYMBOL(netdev_mp_port_detach);
+
 /**
  * netif_receive_skb - process receive buffer from network
  * @skb: buffer to process
-- 
1.5.4.4



[RFC][PATCH v5 01/19] Add a new structure for skb buffer from external.

2010-05-07 Thread xiaohui . xin
From: Xin Xiaohui 

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 include/linux/skbuff.h |   12 
 1 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 124f90c..cf309c9 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -203,6 +203,18 @@ struct skb_shared_info {
void *  destructor_arg;
 };
 
+/* This structure is for an skb whose skb->data may point to
+ * an external buffer, which is not allocated from kernel space.
+ * Since the buffer is external, the shinfo and frags are
+ * external too. It also contains a destructor for itself.
+ */
+struct skb_external_page {
+   u8  *start;
+   int size;
+   struct skb_frag_struct *frags;
+   struct skb_shared_info *ushinfo;
+   void(*dtor)(struct skb_external_page *);
+};
 /* We divide dataref into two halves.  The higher 16 bits hold references
  * to the payload part of skb->data.  The lower 16 bits hold references to
  * the entire skb->data.  A clone of a headerless skb holds the length of
-- 
1.5.4.4



[RFC][PATCH v6 01/19] Add a new structure for skb buffer from external.

2010-05-21 Thread xiaohui . xin
From: Xin Xiaohui 

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 include/linux/skbuff.h |   12 
 1 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 124f90c..cf309c9 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -203,6 +203,18 @@ struct skb_shared_info {
void *  destructor_arg;
 };
 
+/* This structure is for an skb whose skb->data may point to
+ * an external buffer, which is not allocated from kernel space.
+ * Since the buffer is external, the shinfo and frags are
+ * external too. It also contains a destructor for itself.
+ */
+struct skb_external_page {
+   u8  *start;
+   int size;
+   struct skb_frag_struct *frags;
+   struct skb_shared_info *ushinfo;
+   void(*dtor)(struct skb_external_page *);
+};
 /* We divide dataref into two halves.  The higher 16 bits hold references
  * to the payload part of skb->data.  The lower 16 bits hold references to
  * the entire skb->data.  A clone of a headerless skb holds the length of
-- 
1.5.4.4



[RFC][PATCH v6 04/19] Add a ndo_mp_port_prep pointer to net_device_ops.

2010-05-21 Thread xiaohui . xin
From: Xin Xiaohui 

If the driver wants to allocate external buffers,
then it can export its capabilities, such as the skb
buffer header length, the page length that can be DMA'd, etc.
The owner of the external buffers may utilize this.

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 include/linux/netdevice.h |4 
 1 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index efb575a..183c786 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -707,6 +707,10 @@ struct net_device_ops {
int (*ndo_fcoe_get_wwn)(struct net_device *dev,
u64 *wwn, int type);
 #endif
+#if defined(CONFIG_MEDIATE_PASSTHRU) || defined(CONFIG_MEDIATE_PASSTHRU_MODULE)
+   int (*ndo_mp_port_prep)(struct net_device *dev,
+   struct mpassthru_port *port);
+#endif
 };
 
 /*
-- 
1.5.4.4



[RFC][PATCH v6 07/19] Add interface to get external buffers.

2010-05-21 Thread xiaohui . xin
From: Xin Xiaohui 

Currently, it gets external buffers from the mp device.

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 include/linux/skbuff.h |   12 
 net/core/skbuff.c  |   16 
 2 files changed, 28 insertions(+), 0 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index cf309c9..281a1c0 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1519,6 +1519,18 @@ static inline void netdev_free_page(struct net_device 
*dev, struct page *page)
__free_page(page);
 }
 
+extern struct skb_external_page *netdev_alloc_external_pages(
+   struct net_device *dev,
+   struct sk_buff *skb, int npages);
+
+static inline struct skb_external_page *netdev_alloc_external_page(
+   struct net_device *dev,
+   struct sk_buff *skb, unsigned int size)
+{
+   return netdev_alloc_external_pages(dev, skb,
+  DIV_ROUND_UP(size, PAGE_SIZE));
+}
+
 /**
  * skb_clone_writable - is the header of a clone writable
  * @skb: buffer to check
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 93c4e06..fbdb1f1 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -278,6 +278,22 @@ struct page *__netdev_alloc_page(struct net_device *dev, 
gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(__netdev_alloc_page);
 
+struct skb_external_page *netdev_alloc_external_pages(struct net_device *dev,
+   struct sk_buff *skb, int npages)
+{
+   struct mpassthru_port *port;
+   struct skb_external_page *ext_page = NULL;
+
+   port = rcu_dereference(dev->mp_port);
+   if (!port)
+   goto out;
+   WARN_ON(npages > port->npages);
+   ext_page = port->ctor(port, skb, npages);
+out:
+   return ext_page;
+}
+EXPORT_SYMBOL(netdev_alloc_external_pages);
+
 void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
int size)
 {
-- 
1.5.4.4



[RFC][PATCH v6 06/19] Add a function to indicate if device use external buffer.

2010-05-21 Thread xiaohui . xin
From: Xin Xiaohui 

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 include/linux/netdevice.h |5 +
 1 files changed, 5 insertions(+), 0 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 31d9c4a..0cb78f4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1602,6 +1602,11 @@ extern void netdev_mp_port_detach(struct net_device 
*dev);
 extern int netdev_mp_port_prep(struct net_device *dev,
struct mpassthru_port *port);
 
+static inline bool dev_is_mpassthru(struct net_device *dev)
+{
+   return (dev && dev->mp_port);
+}
+
 static inline void napi_free_frags(struct napi_struct *napi)
 {
kfree_skb(napi->skb);
-- 
1.5.4.4



[RFC][PATCH v6 11/19] Use callback to deal with skb_release_data() specially.

2010-05-21 Thread xiaohui . xin
From: Xin Xiaohui 

If the buffer is external, then use the callback to destruct
the buffers.

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 net/core/skbuff.c |   11 +++
 1 files changed, 11 insertions(+), 0 deletions(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 37587f0..418457c 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -385,6 +385,11 @@ static void skb_clone_fraglist(struct sk_buff *skb)
 
 static void skb_release_data(struct sk_buff *skb)
 {
+   /* Check if the skb has external buffers; we use destructor_arg
+* here as the indicator.
+*/
+   struct skb_external_page *ext_page = skb_shinfo(skb)->destructor_arg;
+
if (!skb->cloned ||
!atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
   &skb_shinfo(skb)->dataref)) {
@@ -397,6 +402,12 @@ static void skb_release_data(struct sk_buff *skb)
if (skb_has_frags(skb))
skb_drop_fraglist(skb);
 
+   /* If the skb has external buffers, use the destructor here,
+* since skb->head will be kfree'd after this, and a skb->head
+* from an external buffer cannot be destroyed with kfree.
+*/
+   if (dev_is_mpassthru(skb->dev) && ext_page && ext_page->dtor)
+   ext_page->dtor(ext_page);
kfree(skb->head);
}
 }
-- 
1.5.4.4



[RFC][PATCH v6 13/19] To skip GRO if buffer is external currently.

2010-05-21 Thread xiaohui . xin
From: Xin Xiaohui 

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 net/core/dev.c |4 
 1 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index dc2f225..6c6b2fe 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2787,6 +2787,10 @@ enum gro_result dev_gro_receive(struct napi_struct 
*napi, struct sk_buff *skb)
if (skb_is_gso(skb) || skb_has_frags(skb))
goto normal;
 
+   /* currently GRO is not supported by mediate passthru */
+   if (dev_is_mpassthru(skb->dev))
+   goto normal;
+
rcu_read_lock();
list_for_each_entry_rcu(ptype, head, list) {
if (ptype->type != type || ptype->dev || !ptype->gro_receive)
-- 
1.5.4.4



[RFC][PATCH v6 14/19] Add header file for mp device.

2010-05-21 Thread xiaohui . xin
From: Xin Xiaohui 

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 include/linux/mpassthru.h |   25 +
 1 files changed, 25 insertions(+), 0 deletions(-)
 create mode 100644 include/linux/mpassthru.h

diff --git a/include/linux/mpassthru.h b/include/linux/mpassthru.h
new file mode 100644
index 000..ba8f320
--- /dev/null
+++ b/include/linux/mpassthru.h
@@ -0,0 +1,25 @@
+#ifndef __MPASSTHRU_H
+#define __MPASSTHRU_H
+
+#include 
+#include 
+
+/* ioctl defines */
+#define MPASSTHRU_BINDDEV  _IOW('M', 213, int)
+#define MPASSTHRU_UNBINDDEV_IO('M', 214)
+
+#ifdef __KERNEL__
+#if defined(CONFIG_MEDIATE_PASSTHRU) || defined(CONFIG_MEDIATE_PASSTHRU_MODULE)
+struct socket *mp_get_socket(struct file *);
+#else
+#include 
+#include 
+struct file;
+struct socket;
+static inline struct socket *mp_get_socket(struct file *f)
+{
+   return ERR_PTR(-EINVAL);
+}
+#endif /* CONFIG_MEDIATE_PASSTHRU */
+#endif /* __KERNEL__ */
+#endif /* __MPASSTHRU_H */
-- 
1.5.4.4



[RFC][PATCH v6 18/19] Add a kconfig entry and make entry for mp device.

2010-05-21 Thread xiaohui . xin
From: Xin Xiaohui 

Signed-off-by: Xin Xiaohui 
Reviewed-by: Jeff Dike 
---
 drivers/vhost/Kconfig  |   10 ++
 drivers/vhost/Makefile |2 ++
 2 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index e4e2fd1..a6b8cbf 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -9,3 +9,13 @@ config VHOST_NET
  To compile this driver as a module, choose M here: the module will
  be called vhost_net.
 
+config MEDIATE_PASSTHRU
+   tristate "mediate passthru network driver (EXPERIMENTAL)"
+   depends on VHOST_NET
+   ---help---
+ Zero-copy network I/O support. We call it mediate passthru to
+ distinguish it from hardware passthru.
+
+ To compile this driver as a module, choose M here: the module will
+ be called mpassthru.
+
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index 72dd020..c18b9fc 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -1,2 +1,4 @@
 obj-$(CONFIG_VHOST_NET) += vhost_net.o
 vhost_net-y := vhost.o net.o
+
+obj-$(CONFIG_MEDIATE_PASSTHRU) += mpassthru.o
-- 
1.5.4.4



[PATCH v6 17/19] Export proto_ops to vhost-net driver.

2010-05-21 Thread xiaohui . xin
From: Xin Xiaohui 

Currently, vhost-net is the only user of the mp device.

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
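
As a rough illustration (not part of this patch), a backend such as
vhost-net might resolve an mp device fd into the exported socket along
the lines below; mp_get_socket() is from include/linux/mpassthru.h in
this series, while the fget()/fput() handling around it is an
assumption modeled on how vhost-net deals with tap sockets.

/* Sketch only: turn a user-supplied mp device fd into its socket. */
static struct socket *get_mp_socket_sketch(int fd)
{
        struct file *file = fget(fd);
        struct socket *sock;

        if (!file)
                return ERR_PTR(-EBADF);
        sock = mp_get_socket(file);
        if (IS_ERR(sock))
                fput(file);     /* not an mp device fd, drop the reference */
        return sock;
}
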
 drivers/vhost/mpassthru.c |  330 -
 1 files changed, 325 insertions(+), 5 deletions(-)

diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
index de07f1e..d0df691 100644
--- a/drivers/vhost/mpassthru.c
+++ b/drivers/vhost/mpassthru.c
@@ -414,6 +414,11 @@ static void mp_put(struct mp_file *mfile)
mp_detach(mfile->mp);
 }
 
+static void iocb_tag(struct kiocb *iocb)
+{
+   iocb->ki_flags = 1;
+}
+
 /* The callback to destruct the external buffers or skb */
 static void page_dtor(struct skb_external_page *ext_page)
 {
@@ -449,7 +454,7 @@ static void page_dtor(struct skb_external_page *ext_page)
 * Queue the notifier to wake up the backend driver
 */
 
-   create_iocb(info, info->total);
+   iocb_tag(info->iocb);
 
sk = ctor->port.sock->sk;
sk->sk_write_space(sk);
@@ -569,8 +574,323 @@ failed:
return NULL;
 }
 
+static void mp_sock_destruct(struct sock *sk)
+{
+   struct mp_struct *mp = container_of(sk, struct mp_sock, sk)->mp;
+   kfree(mp);
+}
+
+static void mp_sock_state_change(struct sock *sk)
+{
+   if (sk_has_sleeper(sk))
+   wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN);
+}
+
+static void mp_sock_write_space(struct sock *sk)
+{
+   if (sk_has_sleeper(sk))
+   wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT);
+}
+
+static void mp_sock_data_ready(struct sock *sk, int coming)
+{
+   struct mp_struct *mp = container_of(sk, struct mp_sock, sk)->mp;
+   struct page_ctor *ctor = NULL;
+   struct sk_buff *skb = NULL;
+   struct page_info *info = NULL;
+   struct ethhdr *eth;
+   struct kiocb *iocb = NULL;
+   int len, i;
+
+   struct virtio_net_hdr hdr = {
+   .flags = 0,
+   .gso_type = VIRTIO_NET_HDR_GSO_NONE
+   };
+
+   ctor = rcu_dereference(mp->ctor);
+   if (!ctor)
+   return;
+
+   while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+   if (skb_shinfo(skb)->destructor_arg) {
+   info = container_of(skb_shinfo(skb)->destructor_arg,
+   struct page_info, ext_page);
+   info->skb = skb;
+   if (skb->len > info->len) {
+   mp->dev->stats.rx_dropped++;
+   DBG(KERN_INFO "Discarded truncated rx packet: "
+   " len %d > %zd\n", skb->len, info->len);
+   info->total = skb->len;
+   goto clean;
+   } else {
+   int i;
+   struct skb_shared_info *gshinfo =
+   (struct skb_shared_info *)
+   (&info->ushinfo);
+   struct skb_shared_info *hshinfo =
+   skb_shinfo(skb);
+
+   if (gshinfo->nr_frags < hshinfo->nr_frags)
+   goto clean;
+   eth = eth_hdr(skb);
+   skb_push(skb, ETH_HLEN);
+
+   hdr.hdr_len = skb_headlen(skb);
+   info->total = skb->len;
+
+   for (i = 0; i < gshinfo->nr_frags; i++)
+   gshinfo->frags[i].size = 0;
+   for (i = 0; i < hshinfo->nr_frags; i++)
+   gshinfo->frags[i].size =
+   hshinfo->frags[i].size;
+   }
+   } else {
+   /* The skb was composed from kernel buffers
+* because the external buffers were not sufficient.
+* This case should be rare.
+*/
+   unsigned long flags;
+   int i;
+   struct skb_shared_info *gshinfo = NULL;
+
+   info = NULL;
+
+   spin_lock_irqsave(&ctor->read_lock, flags);
+   if (!list_empty(&ctor->readq)) {
+   info = list_first_entry(&ctor->readq,
+   struct page_info, list);
+   list_del(&info->list);
+   }
+   spin_unlock_irqrestore(&ctor->read_lock, flags);
+   if (!info) {
+   DBG(KERN_INFO
+   "No external buffer avaliable %p\n",
+   skb);
+ 

[RFC][PATCH v6 19/19] Provides multiple submits and asynchronous notifications.

2010-05-21 Thread xiaohui . xin
From: Xin Xiaohui 

The vhost-net backend now only supports synchronous send/recv
operations. The patch provides multiple submits and asynchronous
notifications. This is needed for zero-copy case.

Signed-off-by: Xin Xiaohui 
---
 drivers/vhost/net.c   |  255 -
 drivers/vhost/vhost.c |  120 +--
 drivers/vhost/vhost.h |   14 +++
 3 files changed, 333 insertions(+), 56 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 9777583..9a0d162 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -24,6 +24,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include 
 
@@ -45,10 +47,13 @@ enum vhost_net_poll_state {
VHOST_NET_POLL_STOPPED = 2,
 };
 
+static struct kmem_cache *notify_cache;
+
 struct vhost_net {
struct vhost_dev dev;
struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
struct vhost_poll poll[VHOST_NET_VQ_MAX];
+   struct kmem_cache   *cache;
/* Tells us whether we are polling a socket for TX.
 * We only do this when socket buffer fills up.
 * Protected by tx vq lock. */
@@ -93,11 +98,146 @@ static void tx_poll_start(struct vhost_net *net, struct 
socket *sock)
net->tx_poll_state = VHOST_NET_POLL_STARTED;
 }
 
+struct kiocb *notify_dequeue(struct vhost_virtqueue *vq)
+{
+   struct kiocb *iocb = NULL;
+   unsigned long flags;
+
+   spin_lock_irqsave(&vq->notify_lock, flags);
+   if (!list_empty(&vq->notifier)) {
+   iocb = list_first_entry(&vq->notifier,
+   struct kiocb, ki_list);
+   list_del(&iocb->ki_list);
+   }
+   spin_unlock_irqrestore(&vq->notify_lock, flags);
+   return iocb;
+}
+
+static void handle_iocb(struct kiocb *iocb)
+{
+   struct vhost_virtqueue *vq = iocb->private;
+   unsigned long flags;
+
+   spin_lock_irqsave(&vq->notify_lock, flags);
+   list_add_tail(&iocb->ki_list, &vq->notifier);
+   spin_unlock_irqrestore(&vq->notify_lock, flags);
+}
+
+static int is_async_vq(struct vhost_virtqueue *vq)
+{
+   return (vq->link_state == VHOST_VQ_LINK_ASYNC);
+}
+
+static void handle_async_rx_events_notify(struct vhost_net *net,
+ struct vhost_virtqueue *vq,
+ struct socket *sock)
+{
+   struct kiocb *iocb = NULL;
+   struct vhost_log *vq_log = NULL;
+   int rx_total_len = 0;
+   unsigned int head, log, in, out;
+   int size;
+
+   if (!is_async_vq(vq))
+   return;
+
+   if (sock->sk->sk_data_ready)
+   sock->sk->sk_data_ready(sock->sk, 0);
+
+   vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
+   vq->log : NULL;
+
+   while ((iocb = notify_dequeue(vq)) != NULL) {
+   vhost_add_used_and_signal(&net->dev, vq,
+   iocb->ki_pos, iocb->ki_nbytes);
+   size = iocb->ki_nbytes;
+   head = iocb->ki_pos;
+   rx_total_len += iocb->ki_nbytes;
+
+   if (iocb->ki_dtor)
+   iocb->ki_dtor(iocb);
+   kmem_cache_free(net->cache, iocb);
+
+   /* When logging is enabled, the log info needs to be recomputed,
+* since these buffers were in the async queue and may not have
+* picked up the log info before.
+*/
+   if (unlikely(vq_log)) {
+   if (!log)
+   __vhost_get_vq_desc(&net->dev, vq, vq->iov,
+   ARRAY_SIZE(vq->iov),
+   &out, &in, vq_log,
+   &log, head);
+   vhost_log_write(vq, vq_log, log, size);
+   }
+   if (unlikely(rx_total_len >= VHOST_NET_WEIGHT)) {
+   vhost_poll_queue(&vq->poll);
+   break;
+   }
+   }
+}
+
+static void handle_async_tx_events_notify(struct vhost_net *net,
+ struct vhost_virtqueue *vq)
+{
+   struct kiocb *iocb = NULL;
+   struct list_head *entry, *tmp;
+   unsigned long flags;
+   int tx_total_len = 0;
+
+   if (!is_async_vq(vq))
+   return;
+   spin_lock_irqsave(&vq->notify_lock, flags);
+   list_for_each_safe(entry, tmp, &vq->notifier) {
+   iocb = list_entry(entry,
+struct kiocb, ki_list);
+   if (!iocb->ki_flags)
+   continue;
+   list_del(&iocb->ki_list);   
+   vhost_add_used_and_signal(&net->dev, vq,
+   iocb->ki_pos, 0);
+   tx_total_len += iocb->ki_nbytes;
+
+   if (iocb->ki_dtor)
+   iocb->ki_dtor(iocb);
+

[RFC][PATCH v6 16/19] Manipulate external buffers in mp device.

2010-05-21 Thread xiaohui . xin
From: Xin, Xiaohui

How the external buffers are obtained, and how they are destroyed.

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 drivers/vhost/mpassthru.c |  253 -
 1 files changed, 251 insertions(+), 2 deletions(-)

diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
index 25e2f3e..de07f1e 100644
--- a/drivers/vhost/mpassthru.c
+++ b/drivers/vhost/mpassthru.c
@@ -161,6 +161,39 @@ static int mp_dev_change_flags(struct net_device *dev, 
unsigned flags)
return ret;
 }
 
+/* The main function to allocate external buffers */
+static struct skb_external_page *page_ctor(struct mpassthru_port *port,
+   struct sk_buff *skb, int npages)
+{
+   int i;
+   unsigned long flags;
+   struct page_ctor *ctor;
+   struct page_info *info = NULL;
+
+   ctor = container_of(port, struct page_ctor, port);
+
+   spin_lock_irqsave(&ctor->read_lock, flags);
+   if (!list_empty(&ctor->readq)) {
+   info = list_first_entry(&ctor->readq, struct page_info, list);
+   list_del(&info->list);
+   }
+   spin_unlock_irqrestore(&ctor->read_lock, flags);
+   if (!info)
+   return NULL;
+
+   for (i = 0; i < info->pnum; i++) {
+   get_page(info->pages[i]);
+   info->frag[i].page = info->pages[i];
+   info->frag[i].page_offset = i ? 0 : info->offset;
+   info->frag[i].size = port->npages > 1 ? PAGE_SIZE :
+   port->data_len;
+   }
+   info->skb = skb;
+   info->ext_page.frags = info->frag;
+   info->ext_page.ushinfo = &info->ushinfo;
+   return &info->ext_page;
+}
+
 static int page_ctor_attach(struct mp_struct *mp)
 {
int rc;
@@ -186,7 +219,7 @@ static int page_ctor_attach(struct mp_struct *mp)
 
dev_hold(dev);
ctor->dev = dev;
-   ctor->port.ctor = NULL;
+   ctor->port.ctor = page_ctor;
ctor->port.sock = &mp->socket;
ctor->lock_pages = 0;
rc = netdev_mp_port_attach(dev, &ctor->port);
@@ -252,11 +285,66 @@ static int set_memlock_rlimit(struct page_ctor *ctor, int 
resource,
return 0;
 }
 
+static void relinquish_resource(struct page_ctor *ctor)
+{
+   if (!(ctor->dev->flags & IFF_UP) &&
+   !(ctor->wq_len + ctor->rq_len))
+   printk(KERN_INFO "relinquish_resource\n");
+}
+
+static void mp_ki_dtor(struct kiocb *iocb)
+{
+   struct page_info *info = (struct page_info *)(iocb->private);
+   int i;
+
+   if (info->flags == INFO_READ) {
+   for (i = 0; i < info->pnum; i++) {
+   if (info->pages[i]) {
+   set_page_dirty_lock(info->pages[i]);
+   put_page(info->pages[i]);
+   }
+   }
+   info->skb->destructor = NULL;
+   kfree_skb(info->skb);
+   info->ctor->rq_len--;
+   } else
+   info->ctor->wq_len--;
+   /* Decrement the number of locked pages */
+   info->ctor->lock_pages -= info->pnum;
+   kmem_cache_free(ext_page_info_cache, info);
+   relinquish_resource(info->ctor);
+
+   return;
+}
+
+static struct kiocb *create_iocb(struct page_info *info, int size)
+{
+   struct kiocb *iocb = NULL;
+
+   iocb = info->iocb;
+   if (!iocb)
+   return iocb;
+   iocb->ki_flags = 0;
+   iocb->ki_users = 1;
+   iocb->ki_key = 0;
+   iocb->ki_ctx = NULL;
+   iocb->ki_cancel = NULL;
+   iocb->ki_retry = NULL;
+   iocb->ki_iovec = NULL;
+   iocb->ki_eventfd = NULL;
+   iocb->ki_pos = info->desc_pos;
+   iocb->ki_nbytes = size;
+   iocb->ki_dtor(iocb);
+   iocb->private = (void *)info;
+   iocb->ki_dtor = mp_ki_dtor;
+
+   return iocb;
+}
+
 static int page_ctor_detach(struct mp_struct *mp)
 {
struct page_ctor *ctor;
struct page_info *info;
-   struct kiocb *iocb = NULL;
int i;
 
/* locked by mp_mutex */
@@ -268,11 +356,17 @@ static int page_ctor_detach(struct mp_struct *mp)
for (i = 0; i < info->pnum; i++)
if (info->pages[i])
put_page(info->pages[i]);
+   create_iocb(info, 0);
+   ctor->rq_len--;
kmem_cache_free(ext_page_info_cache, info);
}
+
+   relinquish_resource(ctor);
+
set_memlock_rlimit(ctor, RLIMIT_MEMLOCK,
   ctor->o_rlim.rlim_cur,
   ctor->o_rlim.rlim_max);
+
netdev_mp_port_detach(ctor->dev);
dev_put(ctor->dev);
 
@@ -320,6 +414,161 @@ static void mp_put(struct mp_file *mfile)
mp_detach(mfile->mp);
 }
 
+/* The callback to destruct the external buffers or skb */
+static void page_dtor(struct skb_external_page *ext_page)
+{
+   struct page_info *info;
+   struct

[RFC][PATCH v6 15/19] Add basic funcs and ioctl to mp device.

2010-05-21 Thread xiaohui . xin
From: Xin Xiaohui 

The ioctl is used by the mp device to bind an underlying
NIC; it queries the hardware capability and declares that
the NIC uses external buffers.

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---

memory leak fixed,
kconfig made,
do_unbind() made,
mp_chr_ioctl() cleanup

by Jeff Dike 
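
As a rough user-space usage sketch (not part of the patch): the device
node name and the exact layout of the MPASSTHRU_BINDDEV argument below
are assumptions, see mp_chr_ioctl() in this patch for the real ABI.

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/mpassthru.h>

/* Hypothetical: open the mp char device and bind it to a NIC. */
int mp_bind_nic_sketch(int ifindex)
{
        int fd = open("/dev/mpassthru", O_RDWR);   /* node name assumed */

        if (fd < 0)
                return -1;
        if (ioctl(fd, MPASSTHRU_BINDDEV, &ifindex) < 0) {
                close(fd);
                return -1;
        }
        return fd;      /* ioctl(fd, MPASSTHRU_UNBINDDEV) undoes the bind */
}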

 drivers/vhost/mpassthru.c |  681 +
 1 files changed, 681 insertions(+), 0 deletions(-)
 create mode 100644 drivers/vhost/mpassthru.c

diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
new file mode 100644
index 000..25e2f3e
--- /dev/null
+++ b/drivers/vhost/mpassthru.c
@@ -0,0 +1,681 @@
+/*
+ *  MPASSTHRU - Mediate passthrough device.
+ *  Copyright (C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ */
+
+#define DRV_NAME"mpassthru"
+#define DRV_DESCRIPTION "Mediate passthru device driver"
+#define DRV_COPYRIGHT   "(C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G"
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+/* Uncomment to enable debugging */
+/* #define MPASSTHRU_DEBUG 1 */
+
+#ifdef MPASSTHRU_DEBUG
+static int debug;
+
+#define DBG  if (mp->debug) printk
+#define DBG1 if (debug == 2) printk
+#else
+#define DBG(a...)
+#define DBG1(a...)
+#endif
+
+#define COPY_THRESHOLD (L1_CACHE_BYTES * 4)
+#define COPY_HDR_LEN   (L1_CACHE_BYTES < 64 ? 64 : L1_CACHE_BYTES)
+
+struct frag {
+   u16 offset;
+   u16 size;
+};
+
+struct page_info {
+   struct list_headlist;
+   int header;
+   /* indicate the actual number of bytes
+* sent/received in the external buffers
+*/
+   int total;
+   int offset;
+   struct page *pages[MAX_SKB_FRAGS+1];
+   struct skb_frag_struct  frag[MAX_SKB_FRAGS+1];
+   struct sk_buff  *skb;
+   struct page_ctor*ctor;
+
+   /* The pointer relayed to the skb, to indicate
+* whether it is an externally allocated skb or a kernel one
+*/
+   struct skb_external_pageext_page;
+   struct skb_shared_info  ushinfo;
+
+#define INFO_READ  0
+#define INFO_WRITE 1
+   unsignedflags;
+   unsignedpnum;
+
+   /* Only meaningful for receive; it means
+* the max length allowed
+*/
+   size_t  len;
+
+   /* The fields after this are for the backend
+* driver, currently vhost-net.
+*/
+
+   struct kiocb*iocb;
+   unsigned intdesc_pos;
+   struct iovechdr[MAX_SKB_FRAGS + 2];
+   struct ioveciov[MAX_SKB_FRAGS + 2];
+};
+
+static struct kmem_cache *ext_page_info_cache;
+
+struct page_ctor {
+   struct list_headreadq;
+   int wq_len;
+   int rq_len;
+   spinlock_t  read_lock;
+   /* record the locked pages */
+   int lock_pages;
+   struct rlimit   o_rlim;
+   struct net_device   *dev;
+   struct mpassthru_port   port;
+};
+
+struct mp_struct {
+   struct mp_file  *mfile;
+   struct net_device   *dev;
+   struct page_ctor*ctor;
+   struct socket   socket;
+
+#ifdef MPASSTHRU_DEBUG
+   int debug;
+#endif
+};
+
+struct mp_file {
+   atomic_t count;
+   struct mp_struct *mp;
+   struct net *net;
+};
+
+struct mp_sock {
+   struct sock sk;
+   struct mp_struct*mp;
+};
+
+static int mp_dev_change_flags(struct net_device *dev, unsigned flags)
+{
+   int ret = 0;
+
+   rtnl_lock();
+   ret = dev_change_flags(dev, flags);
+   rtnl_unlock();
+
+   if (ret < 0)
+   printk(KERN_ERR "failed to change dev state of %s", dev->name);
+
+   return ret;
+}
+
+static int page_ctor_attach(struct mp_struct *mp)
+{
+   int rc;
+   struct page_ctor *ctor;
+   struct net_device *dev = mp->dev;
+
+   /* locked by mp_mutex */
+   if (rcu_dereference(mp->ctor))
+   return -EBUS

[RFC][PATCH v6 12/19] Add a hook to intercept external buffers from NIC driver.

2010-05-21 Thread xiaohui . xin
From: Xin Xiaohui 

The hook is called in netif_receive_skb().
Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 net/core/dev.c |   35 +++
 1 files changed, 35 insertions(+), 0 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 37b389a..dc2f225 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2548,6 +2548,37 @@ err:
 EXPORT_SYMBOL(netdev_mp_port_prep);
 #endif
 
+#if defined(CONFIG_MEDIATE_PASSTHRU) || defined(CONFIG_MEDIATE_PASSTHRU_MODULE)
+/* Add a hook to intercept mediate passthru (zero-copy) packets
+ * and insert them into the socket queue owned by the mp_port.
+ */
+static inline struct sk_buff *handle_mpassthru(struct sk_buff *skb,
+  struct packet_type **pt_prev,
+  int *ret,
+  struct net_device *orig_dev)
+{
+   struct mpassthru_port *mp_port = NULL;
+   struct sock *sk = NULL;
+
+   if (!dev_is_mpassthru(skb->dev))
+   return skb;
+   mp_port = skb->dev->mp_port;
+
+   if (*pt_prev) {
+   *ret = deliver_skb(skb, *pt_prev, orig_dev);
+   *pt_prev = NULL;
+   }
+
+   sk = mp_port->sock->sk;
+   skb_queue_tail(&sk->sk_receive_queue, skb);
+   sk->sk_state_change(sk);
+
+   return NULL;
+}
+#else
+#define handle_mpassthru(skb, pt_prev, ret, orig_dev) (skb)
+#endif
+
 /**
  * netif_receive_skb - process receive buffer from network
  * @skb: buffer to process
@@ -2629,6 +2660,10 @@ int netif_receive_skb(struct sk_buff *skb)
 ncls:
 #endif
 
+   /* To intercept mediate passthru(zero-copy) packets here */
+   skb = handle_mpassthru(skb, &pt_prev, &ret, orig_dev);
+   if (!skb)
+   goto out;
skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
if (!skb)
goto out;
-- 
1.5.4.4



[RFC][PATCH v6 10/19] Don't do skb recycle, if device use external buffer.

2010-05-21 Thread xiaohui . xin
From: Xin Xiaohui 

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 net/core/skbuff.c |6 ++
 1 files changed, 6 insertions(+), 0 deletions(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 38d19d0..37587f0 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -553,6 +553,12 @@ int skb_recycle_check(struct sk_buff *skb, int skb_size)
if (skb_shared(skb) || skb_cloned(skb))
return 0;
 
+   /* If the device wants to do mediate passthru, the skb may
+* carry an external buffer, so don't recycle it.
+*/
+   if (dev_is_mpassthru(skb->dev))
+   return 0;
+
skb_release_head_state(skb);
shinfo = skb_shinfo(skb);
atomic_set(&shinfo->dataref, 1);
-- 
1.5.4.4



[RFC][PATCH v6 09/19] Ignore room skb_reserve() when device is using external buffer.

2010-05-21 Thread xiaohui . xin
From: Xin Xiaohui 

To keep skb->data and skb->head from an external buffer
consistent, we ignore the headroom reserved by the driver
for a kernel skb.

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 include/linux/skbuff.h |9 +
 1 files changed, 9 insertions(+), 0 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 5ff8c27..193b259 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1200,6 +1200,15 @@ static inline int skb_tailroom(const struct sk_buff *skb)
  */
 static inline void skb_reserve(struct sk_buff *skb, int len)
 {
+   /* skb_reserve() is only meant for an empty buffer. When the
+* skb uses an external buffer, we cannot guarantee that the
+* buffer has the same reserved headroom as a kernel-allocated
+* skb, so this reservation has to be ignored. The external
+* buffer info is recorded in the destructor_arg field, so use
+* it as the indicator.
+*/
+   if (skb_shinfo(skb)->destructor_arg)
+   return;
skb->data += len;
skb->tail += len;
 }
-- 
1.5.4.4



[RFC][PATCH v6 08/19] Make __alloc_skb() to get external buffer.

2010-05-21 Thread xiaohui . xin
From: Xin Xiaohui 

Add a dev parameter to __alloc_skb(). When an external
buffer is used, skb->data points to it, skb->head is
recomputed, the shinfo of the external buffer is maintained,
and the external buffer info is recorded in the
destructor_arg field.

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---

__alloc_skb() cleanup by

Jeff Dike 
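
For illustration only (not part of the patch), a NIC driver wired up
for mediate passthru could simply pass its net_device when allocating
rx skbs; the wrapper name below is made up.

/* Sketch: passing dev lets __alloc_skb() try the external (guest)
 * buffer path when dev->mp_port is set, falling back to kmalloc()
 * otherwise. fclone=0 and node=-1 match what alloc_skb() uses.
 */
static struct sk_buff *mp_alloc_rx_skb_sketch(struct net_device *dev,
                                              unsigned int len)
{
        return __alloc_skb(len, GFP_ATOMIC, 0, -1, dev);
}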

 include/linux/skbuff.h |7 ---
 net/core/skbuff.c  |   43 +--
 2 files changed, 41 insertions(+), 9 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 281a1c0..5ff8c27 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -442,17 +442,18 @@ extern void kfree_skb(struct sk_buff *skb);
 extern void consume_skb(struct sk_buff *skb);
 extern void   __kfree_skb(struct sk_buff *skb);
 extern struct sk_buff *__alloc_skb(unsigned int size,
-  gfp_t priority, int fclone, int node);
+  gfp_t priority, int fclone,
+  int node, struct net_device *dev);
 static inline struct sk_buff *alloc_skb(unsigned int size,
gfp_t priority)
 {
-   return __alloc_skb(size, priority, 0, -1);
+   return __alloc_skb(size, priority, 0, -1, NULL);
 }
 
 static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
   gfp_t priority)
 {
-   return __alloc_skb(size, priority, 1, -1);
+   return __alloc_skb(size, priority, 1, -1, NULL);
 }
 
 extern int skb_recycle_check(struct sk_buff *skb, int skb_size);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index fbdb1f1..38d19d0 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -161,7 +161,8 @@ EXPORT_SYMBOL(skb_under_panic);
  * @fclone: allocate from fclone cache instead of head cache
  * and allocate a cloned (child) skb
  * @node: numa node to allocate memory on
- *
+ * @dev: the device that owns the skb if the skb tries to get an
+ * external buffer; otherwise NULL.
  * Allocate a new &sk_buff. The returned buffer has no headroom and a
  * tail room of size bytes. The object has a reference count of one.
  * The return is the buffer. On a failure the return is %NULL.
@@ -170,12 +171,13 @@ EXPORT_SYMBOL(skb_under_panic);
  * %GFP_ATOMIC.
  */
 struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
-   int fclone, int node)
+   int fclone, int node, struct net_device *dev)
 {
struct kmem_cache *cache;
struct skb_shared_info *shinfo;
struct sk_buff *skb;
-   u8 *data;
+   u8 *data = NULL;
+   struct skb_external_page *ext_page = NULL;
 
cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
 
@@ -185,8 +187,23 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t 
gfp_mask,
goto out;
 
size = SKB_DATA_ALIGN(size);
-   data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
-   gfp_mask, node);
+
+   /* If the device wants to do mediate passthru (zero-copy),
+* the skb may try to get external buffers from outside.
+* If that fails, fall back to allocating buffers from the kernel.
+*/
+   if (dev && dev->mp_port) {
+   ext_page = netdev_alloc_external_page(dev, skb, size);
+   if (ext_page) {
+   data = ext_page->start;
+   size = ext_page->size;
+   }
+   }
+
+   if (!data)
+   data = kmalloc_node_track_caller(
+   size + sizeof(struct skb_shared_info),
+   gfp_mask, node);
if (!data)
goto nodata;
 
@@ -208,6 +225,15 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t 
gfp_mask,
skb->mac_header = ~0U;
 #endif
 
+   /* If the skb got external buffers successfully, save a copy of
+* the shinfo (which sits at the end of the buffer) so that it
+* can be restored later when needed.
+*/
+   if (ext_page) {
+   skb->head = skb->data - NET_IP_ALIGN - NET_SKB_PAD;
+   memcpy(ext_page->ushinfo, skb_shinfo(skb),
+  sizeof(struct skb_shared_info));
+   }
/* make sure we initialize shinfo sequentially */
shinfo = skb_shinfo(skb);
atomic_set(&shinfo->dataref, 1);
@@ -231,6 +257,11 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t 
gfp_mask,
 
child->fclone = SKB_FCLONE_UNAVAILABLE;
}
+   /* Record the external buffer info in this field. It's not so good,
+* but we cannot find another place easily.
+*/
+   shinfo->destructor_arg = ext_page;
+
 out:
return skb;
 nodata:
@@ -259,7 +290,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
int node = dev->dev.parent ? dev_to_node(

[RFC][PATCH v6 05/19] Add a function make external buffer owner to query capability.

2010-05-21 Thread xiaohui . xin
From: Xin Xiaohui 

The external buffer owner can use these functions to query
the capabilities of the underlying NIC driver.

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
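
As an illustration (not part of the patch), a NIC driver that wants to
report its own packet-split layout instead of relying on the IGB-based
defaults might implement the hook roughly as below; the driver name
and the values are hypothetical.

/* Sketch of a driver-side ndo_mp_port_prep implementation. */
static int foo_mp_port_prep(struct net_device *dev,
                            struct mpassthru_port *port)
{
        port->hdr_len  = 128;           /* header bytes per rx descriptor */
        port->data_len = PAGE_SIZE;     /* payload bytes per rx descriptor */
        port->npages   = 1;             /* pages backing one payload buffer */
        return 0;
}
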
 include/linux/netdevice.h |2 +
 net/core/dev.c|   51 +
 2 files changed, 53 insertions(+), 0 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 183c786..31d9c4a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1599,6 +1599,8 @@ extern gro_result_t   napi_gro_frags(struct 
napi_struct *napi);
 extern int netdev_mp_port_attach(struct net_device *dev,
 struct mpassthru_port *port);
 extern void netdev_mp_port_detach(struct net_device *dev);
+extern int netdev_mp_port_prep(struct net_device *dev,
+   struct mpassthru_port *port);
 
 static inline void napi_free_frags(struct napi_struct *napi)
 {
diff --git a/net/core/dev.c b/net/core/dev.c
index ecbb6b1..37b389a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2497,6 +2497,57 @@ void netdev_mp_port_detach(struct net_device *dev)
 }
 EXPORT_SYMBOL(netdev_mp_port_detach);
 
+/* To support mediate passthru (zero-copy) with a NIC driver,
+ * we should query the NIC driver for the capability it can
+ * provide, especially for packet split mode. For now we only
+ * query the header size and the payload size a descriptor
+ * may carry. If a driver does not export these through the API,
+ * we fall back to default values, currently taken from the IGB
+ * driver. For now it is only called by the mpassthru device.
+ */
+#if defined(CONFIG_MEDIATE_PASSTHRU) || defined(CONFIG_MEDIATE_PASSTHRU_MODULE)
+int netdev_mp_port_prep(struct net_device *dev,
+   struct mpassthru_port *port)
+{
+   int rc;
+   int npages, data_len;
+   const struct net_device_ops *ops = dev->netdev_ops;
+
+   /* needed by packet split */
+
+   if (ops->ndo_mp_port_prep) {
+   rc = ops->ndo_mp_port_prep(dev, port);
+   if (rc)
+   return rc;
+   } else {
+   /* If the NIC driver did not report this,
+* then we try to use default value.
+*/
+   port->hdr_len = 128;
+   port->data_len = 2048;
+   port->npages = 1;
+   }
+
+   if (port->hdr_len <= 0)
+   goto err;
+
+   npages = port->npages;
+   data_len = port->data_len;
+   if (npages <= 0 || npages > MAX_SKB_FRAGS ||
+   (data_len < PAGE_SIZE * (npages - 1) ||
+data_len > PAGE_SIZE * npages))
+   goto err;
+
+   return 0;
+err:
+   dev_warn(&dev->dev, "invalid page constructor parameters\n");
+
+   return -EINVAL;
+}
+EXPORT_SYMBOL(netdev_mp_port_prep);
+#endif
+
 /**
  * netif_receive_skb - process receive buffer from network
  * @skb: buffer to process
-- 
1.5.4.4



[RFC][PATCH v6 03/19] Export 2 func for device to assign/deassign new strucure

2010-05-21 Thread xiaohui . xin
From: Xin Xiaohui 

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
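
For context, a sketch (not from this patch) of the intended pairing as
the mp device uses it later in the series; mp_mutex here stands for the
mp device's own lock referred to by the "locked by mp_mutex" comments.

static DEFINE_MUTEX(mp_mutex);  /* name assumed from the comments */

/* Sketch: attach on bind, detach on unbind, under the same mutex. */
static int bind_port_sketch(struct net_device *dev,
                            struct mpassthru_port *port)
{
        int rc;

        mutex_lock(&mp_mutex);
        rc = netdev_mp_port_attach(dev, port);
        mutex_unlock(&mp_mutex);
        return rc;
}
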
 include/linux/netdevice.h |3 +++
 net/core/dev.c|   28 
 2 files changed, 31 insertions(+), 0 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index bae725c..efb575a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1592,6 +1592,9 @@ extern gro_result_t   napi_frags_finish(struct 
napi_struct *napi,
  gro_result_t ret);
 extern struct sk_buff *napi_frags_skb(struct napi_struct *napi);
 extern gro_result_tnapi_gro_frags(struct napi_struct *napi);
+extern int netdev_mp_port_attach(struct net_device *dev,
+struct mpassthru_port *port);
+extern void netdev_mp_port_detach(struct net_device *dev);
 
 static inline void napi_free_frags(struct napi_struct *napi)
 {
diff --git a/net/core/dev.c b/net/core/dev.c
index f769098..ecbb6b1 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2469,6 +2469,34 @@ void netif_nit_deliver(struct sk_buff *skb)
rcu_read_unlock();
 }
 
+/* Export two functions to assign/de-assign mp_port pointer
+ * to a net device.
+ */
+
+int netdev_mp_port_attach(struct net_device *dev,
+   struct mpassthru_port *port)
+{
+   /* locked by mp_mutex */
+   if (rcu_dereference(dev->mp_port))
+   return -EBUSY;
+
+   rcu_assign_pointer(dev->mp_port, port);
+
+   return 0;
+}
+EXPORT_SYMBOL(netdev_mp_port_attach);
+
+void netdev_mp_port_detach(struct net_device *dev)
+{
+   /* locked by mp_mutex */
+   if (!rcu_dereference(dev->mp_port))
+   return;
+
+   rcu_assign_pointer(dev->mp_port, NULL);
+   synchronize_rcu();
+}
+EXPORT_SYMBOL(netdev_mp_port_detach);
+
 /**
  * netif_receive_skb - process receive buffer from network
  * @skb: buffer to process
-- 
1.5.4.4



[RFC][PATCH v6 02/19] Add a new struct for device to manipulate external buffer.

2010-05-21 Thread xiaohui . xin
From: Xin Xiaohui 

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 include/linux/netdevice.h |   19 ++-
 1 files changed, 18 insertions(+), 1 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index fa8b476..bae725c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -530,6 +530,22 @@ struct netdev_queue {
unsigned long   tx_dropped;
 } cacheline_aligned_in_smp;
 
+/* Add a new field, mp_port, to struct net_device. It is for
+ * mediate passthru (zero-copy). It holds the capabilities of the
+ * net device driver, a socket, and an external buffer constructor;
+ * external means the skb buffers belonging to the device may not
+ * be allocated from kernel space.
+ */
+struct mpassthru_port  {
+   int hdr_len;
+   int data_len;
+   int npages;
+   unsignedflags;
+   struct socket   *sock;
+   struct skb_external_page *(*ctor)(struct mpassthru_port *,
+   struct sk_buff *, int);
+};
 
 /*
  * This structure defines the management hooks for network devices.
@@ -952,7 +968,8 @@ struct net_device {
struct macvlan_port *macvlan_port;
/* GARP */
struct garp_port*garp_port;
-
+   /* mpassthru */
+   struct mpassthru_port   *mp_port;
/* class/net/name entry */
struct device   dev;
/* space for optional device, statistics, and wireless sysfs groups */
-- 
1.5.4.4



[RFC][PATCH v6 00/19] Provide a zero-copy method on KVM virtio-net.

2010-05-21 Thread xiaohui . xin
We provide a zero-copy method with which the driver side may get
external buffers to DMA into. Here external means the driver does
not use kernel space to allocate skb buffers. Currently the external
buffers can come from the guest virtio-net driver.

The idea is simple: pin the guest VM user space and then give the
host NIC driver the chance to DMA to it directly.
The patches are based on the vhost-net backend driver. We add a
device which provides proto_ops such as sendmsg/recvmsg to vhost-net
to send/recv directly to/from the NIC driver. A KVM guest that uses
the vhost-net backend may bind any ethX interface on the host side
to get copy-less data transfer through the guest virtio-net frontend.

patch 01-13:net core changes.
patch 14-18:new device as interface to manipulate external buffers.
patch 19:   for vhost-net.

The guest virtio-net driver submits multiple requests through the
vhost-net backend driver to the kernel. The requests are queued and
then completed after the corresponding actions in h/w are done.

For read, user space buffers are dispensed to the NIC driver for rx
when a page constructor API is invoked, which means NICs can allocate
user buffers from a page constructor. We add a hook in the
netif_receive_skb() function to intercept the incoming packets and
notify the zero-copy device.

For write, the zero-copy device may allocate a new host skb, put the
payload on skb_shinfo(skb)->frags, and copy the header to skb->data.
The request remains pending until the skb is transmitted by h/w.

We have considered 2 ways to utilize the page constructor
API to dispense the user buffers.

One:Modify the __alloc_skb() function a bit: it only allocates the
sk_buff structure, and the data pointer points to a
user buffer which comes from a page constructor API.
The shinfo of the skb then also comes from the guest.
When a packet is received from hardware, skb->data is filled
directly by h/w. This is the way we have implemented.

Pros:   We can avoid any copy here.
Cons:   The guest virtio-net driver needs to allocate skbs in almost
the same way as the host NIC drivers, say the size used by
netdev_alloc_skb() and the same reserved space in the
head of the skb. Many NIC drivers match the guest and are
ok for this. But some of the latest NIC drivers reserve
special room in the skb head. To deal with it, we suggest
providing a method in the guest virtio-net driver to ask for
the parameters we are interested in from the NIC driver once
we know which device we have bound for zero-copy. Then we
ask the guest to do so.


Two:Modify the driver to get user buffers allocated from a page
constructor API (substituting alloc_page()); the user buffers are
used as payload buffers and filled by h/w directly when a packet is
received. The driver should associate the pages with the skb
(skb_shinfo(skb)->frags). For the head buffer side, let the host
allocate the skb, and h/w fills it. After that, the data filled into
the host skb header will be copied into the guest header buffer,
which is submitted together with the payload buffer.

Pros:   We care less about the way the guest or host allocates
their buffers.
Cons:   We still need a small copy here for the skb header.

We are not sure which way is better here. This is the first thing we
want to get comments on from the community. We wish the modification
to the network part to be generic, used not only by the vhost-net
backend but also by a user application, once the zero-copy device
provides async read/write operations later.

We have got comments from Michael. He said the first method will
break the compatibility of the virtio-net driver and may complicate
qemu live migration. Currently, we try to ignore the skb_reserve()
if the device is doing zero-copy. Then the guest virtio-net driver
will not be changed. So we now continue to go with the first way.
But comments about the two ways are still appreciated.

We provide multiple submits and asynchronous notification to
vhost-net too.

Our goal is to improve the bandwidth and reduce the CPU usage.
Exact performance data will be provided later. But in a simple
test with netperf, we found bandwidth up and CPU % up too, though
the bandwidth increase ratio is much larger than the CPU % increase
ratio.

What we have not done yet:
packet split support
To support GRO
Performance tuning

what we have done in v1:
polish the RCU usage
deal with write logging in asynchronous mode in vhost
add notifier block for mp device
rename page_ctor to mp_port in netdevice.h to make it look generic
add mp_dev_change_flags() for mp device to change NIC state
add CONFIG_VHOST_MPASSTHRU to limit the usage when the module is not loaded
a small fix for missing dev_put on failure
usin

[RFC PATCH v7 01/19] Add a new structure for skb buffer from external.

2010-06-05 Thread xiaohui . xin
From: Xin Xiaohui 

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
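
To make the fields concrete, here is a rough provider-side sketch (not
part of the patch); struct my_ext_buf and its fields are purely
illustrative, the mp device's page_info later in the series plays this
role for real.

/* Illustrative wrapper a buffer provider might keep per buffer. */
struct my_ext_buf {
        u8                              *vaddr; /* start of the pinned user buffer */
        int                             len;
        struct skb_frag_struct          frags[MAX_SKB_FRAGS];
        struct skb_shared_info          shinfo; /* shadow shinfo for the provider */
        struct skb_external_page        ext;
};

static void my_ext_buf_dtor(struct skb_external_page *ext)
{
        /* unpin pages / hand the buffer back to its owner here */
}

static struct skb_external_page *my_describe_buf(struct my_ext_buf *b)
{
        b->ext.start   = b->vaddr;        /* becomes skb->data in __alloc_skb() */
        b->ext.size    = b->len;
        b->ext.frags   = b->frags;
        b->ext.ushinfo = &b->shinfo;
        b->ext.dtor    = my_ext_buf_dtor; /* called from skb_release_data() */
        return &b->ext;
}
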
 include/linux/skbuff.h |   12 
 1 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 124f90c..cf309c9 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -203,6 +203,18 @@ struct skb_shared_info {
void *  destructor_arg;
 };
 
+/* This structure is for an skb whose skb->data may point to an
+ * external buffer, i.e. one not allocated from kernel space.
+ * Since the buffer is external, its shinfo and frags are external
+ * too. The structure also carries a destructor for itself.
+ */
+struct skb_external_page {
+   u8  *start;
+   int size;
+   struct skb_frag_struct *frags;
+   struct skb_shared_info *ushinfo;
+   void(*dtor)(struct skb_external_page *);
+};
 /* We divide dataref into two halves.  The higher 16 bits hold references
  * to the payload part of skb->data.  The lower 16 bits hold references to
  * the entire skb->data.  A clone of a headerless skb holds the length of
-- 
1.5.4.4



[RFC PATCH v7 03/19] Export 2 func for device to assign/deassign new strucure

2010-06-05 Thread xiaohui . xin
From: Xin Xiaohui 

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 include/linux/netdevice.h |3 +++
 net/core/dev.c|   28 
 2 files changed, 31 insertions(+), 0 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index bae725c..efb575a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1592,6 +1592,9 @@ extern gro_result_t   napi_frags_finish(struct 
napi_struct *napi,
  gro_result_t ret);
 extern struct sk_buff *napi_frags_skb(struct napi_struct *napi);
 extern gro_result_tnapi_gro_frags(struct napi_struct *napi);
+extern int netdev_mp_port_attach(struct net_device *dev,
+struct mpassthru_port *port);
+extern void netdev_mp_port_detach(struct net_device *dev);
 
 static inline void napi_free_frags(struct napi_struct *napi)
 {
diff --git a/net/core/dev.c b/net/core/dev.c
index f769098..ecbb6b1 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2469,6 +2469,34 @@ void netif_nit_deliver(struct sk_buff *skb)
rcu_read_unlock();
 }
 
+/* Export two functions to assign/de-assign mp_port pointer
+ * to a net device.
+ */
+
+int netdev_mp_port_attach(struct net_device *dev,
+   struct mpassthru_port *port)
+{
+   /* locked by mp_mutex */
+   if (rcu_dereference(dev->mp_port))
+   return -EBUSY;
+
+   rcu_assign_pointer(dev->mp_port, port);
+
+   return 0;
+}
+EXPORT_SYMBOL(netdev_mp_port_attach);
+
+void netdev_mp_port_detach(struct net_device *dev)
+{
+   /* locked by mp_mutex */
+   if (!rcu_dereference(dev->mp_port))
+   return;
+
+   rcu_assign_pointer(dev->mp_port, NULL);
+   synchronize_rcu();
+}
+EXPORT_SYMBOL(netdev_mp_port_detach);
+
 /**
  * netif_receive_skb - process receive buffer from network
  * @skb: buffer to process
-- 
1.5.4.4



[RFC PATCH v7 02/19] Add a new struct for device to manipulate external buffer.

2010-06-05 Thread xiaohui . xin
From: Xin Xiaohui 

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 include/linux/netdevice.h |   19 ++-
 1 files changed, 18 insertions(+), 1 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index fa8b476..bae725c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -530,6 +530,22 @@ struct netdev_queue {
unsigned long   tx_dropped;
 } cacheline_aligned_in_smp;
 
+/* Add a new field, mp_port, to struct net_device. It is for
+ * mediate passthru (zero-copy). It holds the capabilities of the
+ * net device driver, a socket, and an external buffer constructor;
+ * external means the skb buffers belonging to the device may not
+ * be allocated from kernel space.
+ */
+struct mpassthru_port  {
+   int hdr_len;
+   int data_len;
+   int npages;
+   unsignedflags;
+   struct socket   *sock;
+   struct skb_external_page *(*ctor)(struct mpassthru_port *,
+   struct sk_buff *, int);
+};
 
 /*
  * This structure defines the management hooks for network devices.
@@ -952,7 +968,8 @@ struct net_device {
struct macvlan_port *macvlan_port;
/* GARP */
struct garp_port*garp_port;
-
+   /* mpassthru */
+   struct mpassthru_port   *mp_port;
/* class/net/name entry */
struct device   dev;
/* space for optional device, statistics, and wireless sysfs groups */
-- 
1.5.4.4



[RFC PATCH v7 07/19] Add interface to get external buffers.

2010-06-05 Thread xiaohui . xin
From: Xin Xiaohui 

Currently, it can get external buffers from the mp device.

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 include/linux/skbuff.h |   12 
 net/core/skbuff.c  |   16 
 2 files changed, 28 insertions(+), 0 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index cf309c9..281a1c0 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1519,6 +1519,18 @@ static inline void netdev_free_page(struct net_device 
*dev, struct page *page)
__free_page(page);
 }
 
+extern struct skb_external_page *netdev_alloc_external_pages(
+   struct net_device *dev,
+   struct sk_buff *skb, int npages);
+
+static inline struct skb_external_page *netdev_alloc_external_page(
+   struct net_device *dev,
+   struct sk_buff *skb, unsigned int size)
+{
+   return netdev_alloc_external_pages(dev, skb,
+  DIV_ROUND_UP(size, PAGE_SIZE));
+}
+
 /**
  * skb_clone_writable - is the header of a clone writable
  * @skb: buffer to check
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 93c4e06..fbdb1f1 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -278,6 +278,22 @@ struct page *__netdev_alloc_page(struct net_device *dev, 
gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(__netdev_alloc_page);
 
+struct skb_external_page *netdev_alloc_external_pages(struct net_device *dev,
+   struct sk_buff *skb, int npages)
+{
+   struct mpassthru_port *port;
+   struct skb_external_page *ext_page = NULL;
+
+   port = rcu_dereference(dev->mp_port);
+   if (!port)
+   goto out;
+   WARN_ON(npages > port->npages);
+   ext_page = port->ctor(port, skb, npages);
+out:
+   return ext_page;
+}
+EXPORT_SYMBOL(netdev_alloc_external_pages);
+
 void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
int size)
 {
-- 
1.5.4.4



[RFC PATCH v7 11/19] Use callback to deal with skb_release_data() specially.

2010-06-05 Thread xiaohui . xin
From: Xin Xiaohui 

If the buffer is external, use the callback to destruct
the buffers.

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 net/core/skbuff.c |   11 +++
 1 files changed, 11 insertions(+), 0 deletions(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 37587f0..418457c 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -385,6 +385,11 @@ static void skb_clone_fraglist(struct sk_buff *skb)
 
 static void skb_release_data(struct sk_buff *skb)
 {
+   /* Check whether the skb has external buffers; destructor_arg is
+* used here as the indicator.
+*/
+   struct skb_external_page *ext_page = skb_shinfo(skb)->destructor_arg;
+
if (!skb->cloned ||
!atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
   &skb_shinfo(skb)->dataref)) {
@@ -397,6 +402,12 @@ static void skb_release_data(struct sk_buff *skb)
if (skb_has_frags(skb))
skb_drop_fraglist(skb);
 
+   /* If the skb has external buffers, call their destructor here,
+* since skb->head is about to be kfree()'d and a head from an
+* external buffer must not be destroyed with kfree().
+*/
+   if (dev_is_mpassthru(skb->dev) && ext_page && ext_page->dtor)
+   ext_page->dtor(ext_page);
kfree(skb->head);
}
 }
-- 
1.5.4.4



[RFC PATCH v7 14/19] Add header file for mp device.

2010-06-05 Thread xiaohui . xin
From: Xin Xiaohui 

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 include/linux/mpassthru.h |   25 +
 1 files changed, 25 insertions(+), 0 deletions(-)
 create mode 100644 include/linux/mpassthru.h

diff --git a/include/linux/mpassthru.h b/include/linux/mpassthru.h
new file mode 100644
index 000..ba8f320
--- /dev/null
+++ b/include/linux/mpassthru.h
@@ -0,0 +1,25 @@
+#ifndef __MPASSTHRU_H
+#define __MPASSTHRU_H
+
+#include 
+#include 
+
+/* ioctl defines */
+#define MPASSTHRU_BINDDEV  _IOW('M', 213, int)
+#define MPASSTHRU_UNBINDDEV_IO('M', 214)
+
+#ifdef __KERNEL__
+#if defined(CONFIG_MEDIATE_PASSTHRU) || defined(CONFIG_MEDIATE_PASSTHRU_MODULE)
+struct socket *mp_get_socket(struct file *);
+#else
+#include 
+#include 
+struct file;
+struct socket;
+static inline struct socket *mp_get_socket(struct file *f)
+{
+   return ERR_PTR(-EINVAL);
+}
+#endif /* CONFIG_MEDIATE_PASSTHRU */
+#endif /* __KERNEL__ */
+#endif /* __MPASSTHRU_H */
-- 
1.5.4.4



[RFC PATCH v7 18/19] Add a kconfig entry and make entry for mp device.

2010-06-05 Thread xiaohui . xin
From: Xin Xiaohui 

Signed-off-by: Xin Xiaohui 
Reviewed-by: Jeff Dike 
---
 drivers/vhost/Kconfig  |   10 ++
 drivers/vhost/Makefile |2 ++
 2 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index e4e2fd1..a6b8cbf 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -9,3 +9,13 @@ config VHOST_NET
  To compile this driver as a module, choose M here: the module will
  be called vhost_net.
 
+config MEDIATE_PASSTHRU
+   tristate "mediate passthru network driver (EXPERIMENTAL)"
+   depends on VHOST_NET
+   ---help---
+ Zero-copy network I/O support. We call it mediate passthru to
+ distinguish it from hardware passthru.
+
+ To compile this driver as a module, choose M here: the module will
+ be called mpassthru.
+
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index 72dd020..c18b9fc 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -1,2 +1,4 @@
 obj-$(CONFIG_VHOST_NET) += vhost_net.o
 vhost_net-y := vhost.o net.o
+
+obj-$(CONFIG_MEDIATE_PASSTHRU) += mpassthru.o
-- 
1.5.4.4



[RFC PATCH v7 19/19] Provides multiple submits and asynchronous notifications.

2010-06-05 Thread xiaohui . xin
From: Xin Xiaohui 

The vhost-net backend now only supports synchronous send/recv
operations. The patch provides multiple submits and asynchronous
notifications. This is needed for zero-copy case.

Signed-off-by: Xin Xiaohui 
---
 drivers/vhost/net.c   |  255 -
 drivers/vhost/vhost.c |  120 +--
 drivers/vhost/vhost.h |   14 +++
 3 files changed, 333 insertions(+), 56 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 9777583..9a0d162 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -24,6 +24,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include 
 
@@ -45,10 +47,13 @@ enum vhost_net_poll_state {
VHOST_NET_POLL_STOPPED = 2,
 };
 
+static struct kmem_cache *notify_cache;
+
 struct vhost_net {
struct vhost_dev dev;
struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
struct vhost_poll poll[VHOST_NET_VQ_MAX];
+   struct kmem_cache   *cache;
/* Tells us whether we are polling a socket for TX.
 * We only do this when socket buffer fills up.
 * Protected by tx vq lock. */
@@ -93,11 +98,146 @@ static void tx_poll_start(struct vhost_net *net, struct 
socket *sock)
net->tx_poll_state = VHOST_NET_POLL_STARTED;
 }
 
+struct kiocb *notify_dequeue(struct vhost_virtqueue *vq)
+{
+   struct kiocb *iocb = NULL;
+   unsigned long flags;
+
+   spin_lock_irqsave(&vq->notify_lock, flags);
+   if (!list_empty(&vq->notifier)) {
+   iocb = list_first_entry(&vq->notifier,
+   struct kiocb, ki_list);
+   list_del(&iocb->ki_list);
+   }
+   spin_unlock_irqrestore(&vq->notify_lock, flags);
+   return iocb;
+}
+
+static void handle_iocb(struct kiocb *iocb)
+{
+   struct vhost_virtqueue *vq = iocb->private;
+   unsigned long flags;
+
+   spin_lock_irqsave(&vq->notify_lock, flags);
+   list_add_tail(&iocb->ki_list, &vq->notifier);
+   spin_unlock_irqrestore(&vq->notify_lock, flags);
+}
+
+static int is_async_vq(struct vhost_virtqueue *vq)
+{
+   return (vq->link_state == VHOST_VQ_LINK_ASYNC);
+}
+
+static void handle_async_rx_events_notify(struct vhost_net *net,
+ struct vhost_virtqueue *vq,
+ struct socket *sock)
+{
+   struct kiocb *iocb = NULL;
+   struct vhost_log *vq_log = NULL;
+   int rx_total_len = 0;
+   unsigned int head, log, in, out;
+   int size;
+
+   if (!is_async_vq(vq))
+   return;
+
+   if (sock->sk->sk_data_ready)
+   sock->sk->sk_data_ready(sock->sk, 0);
+
+   vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
+   vq->log : NULL;
+
+   while ((iocb = notify_dequeue(vq)) != NULL) {
+   vhost_add_used_and_signal(&net->dev, vq,
+   iocb->ki_pos, iocb->ki_nbytes);
+   size = iocb->ki_nbytes;
+   head = iocb->ki_pos;
+   rx_total_len += iocb->ki_nbytes;
+
+   if (iocb->ki_dtor)
+   iocb->ki_dtor(iocb);
+   kmem_cache_free(net->cache, iocb);
+
+   /* When logging is enabled, the log info needs to be recomputed,
+* since these buffers were in the async queue and may not have
+* picked up the log info before.
+*/
+   if (unlikely(vq_log)) {
+   if (!log)
+   __vhost_get_vq_desc(&net->dev, vq, vq->iov,
+   ARRAY_SIZE(vq->iov),
+   &out, &in, vq_log,
+   &log, head);
+   vhost_log_write(vq, vq_log, log, size);
+   }
+   if (unlikely(rx_total_len >= VHOST_NET_WEIGHT)) {
+   vhost_poll_queue(&vq->poll);
+   break;
+   }
+   }
+}
+
+static void handle_async_tx_events_notify(struct vhost_net *net,
+ struct vhost_virtqueue *vq)
+{
+   struct kiocb *iocb = NULL;
+   struct list_head *entry, *tmp;
+   unsigned long flags;
+   int tx_total_len = 0;
+
+   if (!is_async_vq(vq))
+   return;
+   spin_lock_irqsave(&vq->notify_lock, flags);
+   list_for_each_safe(entry, tmp, &vq->notifier) {
+   iocb = list_entry(entry,
+struct kiocb, ki_list);
+   if (!iocb->ki_flags)
+   continue;
+   list_del(&iocb->ki_list);   
+   vhost_add_used_and_signal(&net->dev, vq,
+   iocb->ki_pos, 0);
+   tx_total_len += iocb->ki_nbytes;
+
+   if (iocb->ki_dtor)
+   iocb->ki_dtor(iocb);
+
+   

[RFC PATCH v7 00/19] Provide a zero-copy method on KVM virtio-net.

2010-06-05 Thread xiaohui . xin
We provide a zero-copy method with which the driver side may get
external buffers to DMA into. Here external means the driver does
not use kernel space to allocate skb buffers. Currently the external
buffers can come from the guest virtio-net driver.

The idea is simple: pin the guest VM user space and then give the
host NIC driver the chance to DMA to it directly.
The patches are based on the vhost-net backend driver. We add a
device which provides proto_ops such as sendmsg/recvmsg to vhost-net
to send/recv directly to/from the NIC driver. A KVM guest that uses
the vhost-net backend may bind any ethX interface on the host side
to get copy-less data transfer through the guest virtio-net frontend.

patch 01-13:net core changes.
patch 14-18:new device as interface to manipulate external buffers.
patch 19:   for vhost-net.

The guest virtio-net driver submits multiple requests through the
vhost-net backend driver to the kernel. The requests are queued and
then completed after the corresponding actions in h/w are done.

For read, user space buffers are dispensed to the NIC driver for rx
when a page constructor API is invoked, which means NICs can allocate
user buffers from a page constructor. We add a hook in the
netif_receive_skb() function to intercept the incoming packets and
notify the zero-copy device.

For write, the zero-copy device may allocate a new host skb, put the
payload on skb_shinfo(skb)->frags, and copy the header to skb->data.
The request remains pending until the skb is transmitted by h/w.

We have considered 2 ways to utilize the page constructor
API to dispense the user buffers.

One:Modify the __alloc_skb() function a bit: it only allocates the
sk_buff structure, and the data pointer points to a
user buffer which comes from a page constructor API.
The shinfo of the skb then also comes from the guest.
When a packet is received from hardware, skb->data is filled
directly by h/w. This is the way we have implemented.

Pros:   We can avoid any copy here.
Cons:   The guest virtio-net driver needs to allocate skbs in almost
the same way as the host NIC drivers, say the size used by
netdev_alloc_skb() and the same reserved space in the
head of the skb. Many NIC drivers match the guest and are
ok for this. But some of the latest NIC drivers reserve
special room in the skb head. To deal with it, we suggest
providing a method in the guest virtio-net driver to ask for
the parameters we are interested in from the NIC driver once
we know which device we have bound for zero-copy. Then we
ask the guest to do so.


Two:Modify the driver to get user buffers allocated from a page
constructor API (substituting alloc_page()); the user buffers are
used as payload buffers and filled by h/w directly when a packet is
received. The driver should associate the pages with the skb
(skb_shinfo(skb)->frags). For the head buffer side, let the host
allocate the skb, and h/w fills it. After that, the data filled into
the host skb header will be copied into the guest header buffer,
which is submitted together with the payload buffer.

Pros:   We care less about the way the guest or host allocates
their buffers.
Cons:   We still need a small copy here for the skb header.

We are not sure which way is better here. This is the first thing we
want to get comments on from the community. We wish the modification
to the network part to be generic, used not only by the vhost-net
backend but also by a user application, once the zero-copy device
provides async read/write operations later.

We have got comments from Michael. He said the first method will
break the compatibility of the virtio-net driver and may complicate
qemu live migration. Currently, we try to ignore the skb_reserve()
if the device is doing zero-copy. Then the guest virtio-net driver
will not be changed. So we now continue to go with the first way.
But comments about the two ways are still appreciated.

We provide multiple submits and asynchronous notification to
vhost-net too.

Our goal is to improve the bandwidth and reduce the CPU usage.
Exact performance data will be provided later. But in a simple
test with netperf, we found bandwidth up and CPU % up too, though
the bandwidth increase ratio is much larger than the CPU % increase
ratio.

What we have not done yet:
packet split support
To support GRO
Performance tuning

what we have done in v1:
polish the RCU usage
deal with write logging in asynchronous mode in vhost
add notifier block for mp device
rename page_ctor to mp_port in netdevice.h to make it look generic
add mp_dev_change_flags() for mp device to change NIC state
add CONFIG_VHOST_MPASSTHRU to limit the usage when the module is not loaded
a small fix for missing dev_put when fail
using dynamic minor instead of static minor number

[RFC PATCH v7 17/19] Export proto_ops to vhost-net driver.

2010-06-05 Thread xiaohui . xin
From: Xin Xiaohui 

Currently, vhost-net is the only user of the mp device.
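
The proto_ops table itself is not visible in the quoted hunk; the following
is only a sketch of what the export presumably boils down to, with
mp_sendmsg()/mp_recvmsg() standing in for the handlers this patch wires up
(the names, the prototypes here, and the omission of the remaining ops are
assumptions):

#include <linux/net.h>

static int mp_sendmsg(struct kiocb *iocb, struct socket *sock,
                      struct msghdr *m, size_t total_len);
static int mp_recvmsg(struct kiocb *iocb, struct socket *sock,
                      struct msghdr *m, size_t total_len, int flags);

/* Sketch only -- the remaining ops would be the usual sock_no_*() stubs. */
static const struct proto_ops mp_socket_ops = {
        .family  = AF_UNSPEC,
        .sendmsg = mp_sendmsg,          /* called from vhost-net handle_tx() */
        .recvmsg = mp_recvmsg,          /* called from vhost-net handle_rx() */
};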

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 drivers/vhost/mpassthru.c |  295 -
 1 files changed, 290 insertions(+), 5 deletions(-)

diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
index 8c48898..23755ba 100644
--- a/drivers/vhost/mpassthru.c
+++ b/drivers/vhost/mpassthru.c
@@ -414,6 +414,11 @@ static void mp_put(struct mp_file *mfile)
mp_detach(mfile->mp);
 }
 
+static void iocb_tag(struct kiocb *iocb)
+{
+   iocb->ki_flags = 1;
+}
+
 /* The callback to destruct the external buffers or skb */
 static void page_dtor(struct skb_external_page *ext_page)
 {
@@ -449,7 +454,7 @@ static void page_dtor(struct skb_external_page *ext_page)
 * Queue the notifier to wake up the backend driver
 */
 
-   create_iocb(info, info->total);
+   iocb_tag(info->iocb);
 
sk = ctor->port.sock->sk;
sk->sk_write_space(sk);
@@ -569,8 +574,288 @@ failed:
return NULL;
 }
 
+static void mp_sock_destruct(struct sock *sk)
+{
+   struct mp_struct *mp = container_of(sk, struct mp_sock, sk)->mp;
+   kfree(mp);
+}
+
+static void mp_sock_state_change(struct sock *sk)
+{
+   if (sk_has_sleeper(sk))
+   wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN);
+}
+
+static void mp_sock_write_space(struct sock *sk)
+{
+   if (sk_has_sleeper(sk))
+   wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT);
+}
+
+static void mp_sock_data_ready(struct sock *sk, int coming)
+{
+   struct mp_struct *mp = container_of(sk, struct mp_sock, sk)->mp;
+   struct page_ctor *ctor = NULL;
+   struct sk_buff *skb = NULL;
+   struct page_info *info = NULL;
+   struct ethhdr *eth;
+   struct kiocb *iocb = NULL;
+   int len, i;
+
+   struct virtio_net_hdr hdr = {
+   .flags = 0,
+   .gso_type = VIRTIO_NET_HDR_GSO_NONE
+   };
+
+   ctor = rcu_dereference(mp->ctor);
+   if (!ctor)
+   return;
+
+   while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+   if (skb_shinfo(skb)->destructor_arg) {
+   info = container_of(skb_shinfo(skb)->destructor_arg,
+   struct page_info, ext_page);
+   info->skb = skb;
+   if (skb->len > info->len) {
+   mp->dev->stats.rx_dropped++;
+   DBG(KERN_INFO "Discarded truncated rx packet: "
+   " len %d > %zd\n", skb->len, info->len);
+   info->total = skb->len;
+   goto clean;
+   } else {
+   eth = eth_hdr(skb);
+   skb_push(skb, ETH_HLEN);
+   info->total = skb->len;
+   }
+   } else {
+   /* The skb is composed of kernel buffers
+* in case external buffers are not sufficient.
+* The case should be rare.
+*/
+   unsigned long flags;
+   int i;
+   info = NULL;
+
+   spin_lock_irqsave(&ctor->read_lock, flags);
+   if (!list_empty(&ctor->readq)) {
+   info = list_first_entry(&ctor->readq,
+   struct page_info, list);
+   list_del(&info->list);
+   }
+   spin_unlock_irqrestore(&ctor->read_lock, flags);
+   if (!info) {
+   DBG(KERN_INFO
+   "No external buffer avaliable %p\n",
+   skb);
+   skb_queue_head(&sk->sk_receive_queue,
+   skb);
+   break;
+   }
+   info->skb = skb;
+   eth = eth_hdr(skb);
+   skb_push(skb, ETH_HLEN);
+   info->total = skb->len;
+   skb_copy_datagram_iovec(skb, 0, info->iov, skb->len);
+   }
+
+   len = memcpy_toiovec(info->hdr, (unsigned char *)&hdr,
+   sizeof hdr);
+   if (len) {
+   DBG(KERN_INFO
+   "Unable to write vnet_hdr at addr %p: %d\n",
+   info->hdr->iov_base, len);
+   goto clean;
+   }
+
+   iocb = create_iocb(info, skb->len + sizeof(hdr));
+   continue;
+
+clean:
+   kfree_skb(skb);
+   for

[RFC PATCH v7 16/19] Manipulate external buffers in mp device.

2010-06-05 Thread xiaohui . xin
From: Xiaohui Xin

Where the external buffers come from, and how they are destroyed.

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 drivers/vhost/mpassthru.c |  253 -
 1 files changed, 251 insertions(+), 2 deletions(-)

diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
index 25e2f3e..8c48898 100644
--- a/drivers/vhost/mpassthru.c
+++ b/drivers/vhost/mpassthru.c
@@ -161,6 +161,39 @@ static int mp_dev_change_flags(struct net_device *dev, 
unsigned flags)
return ret;
 }
 
+/* The main function to allocate external buffers */
+static struct skb_external_page *page_ctor(struct mpassthru_port *port,
+   struct sk_buff *skb, int npages)
+{
+   int i;
+   unsigned long flags;
+   struct page_ctor *ctor;
+   struct page_info *info = NULL;
+
+   ctor = container_of(port, struct page_ctor, port);
+
+   spin_lock_irqsave(&ctor->read_lock, flags);
+   if (!list_empty(&ctor->readq)) {
+   info = list_first_entry(&ctor->readq, struct page_info, list);
+   list_del(&info->list);
+   }
+   spin_unlock_irqrestore(&ctor->read_lock, flags);
+   if (!info)
+   return NULL;
+
+   for (i = 0; i < info->pnum; i++) {
+   get_page(info->pages[i]);
+   info->frag[i].page = info->pages[i];
+   info->frag[i].page_offset = i ? 0 : info->offset;
+   info->frag[i].size = port->npages > 1 ? PAGE_SIZE :
+   port->data_len;
+   }
+   info->skb = skb;
+   info->ext_page.frags = info->frag;
+   info->ext_page.ushinfo = &info->ushinfo;
+   return &info->ext_page;
+}
+
 static int page_ctor_attach(struct mp_struct *mp)
 {
int rc;
@@ -186,7 +219,7 @@ static int page_ctor_attach(struct mp_struct *mp)
 
dev_hold(dev);
ctor->dev = dev;
-   ctor->port.ctor = NULL;
+   ctor->port.ctor = page_ctor;
ctor->port.sock = &mp->socket;
ctor->lock_pages = 0;
rc = netdev_mp_port_attach(dev, &ctor->port);
@@ -252,11 +285,66 @@ static int set_memlock_rlimit(struct page_ctor *ctor, int 
resource,
return 0;
 }
 
+static void relinquish_resource(struct page_ctor *ctor)
+{
+   if (!(ctor->dev->flags & IFF_UP) &&
+   !(ctor->wq_len + ctor->rq_len))
+   printk(KERN_INFO "relinquish_resource\n");
+}
+
+static void mp_ki_dtor(struct kiocb *iocb)
+{
+   struct page_info *info = (struct page_info *)(iocb->private);
+   int i;
+
+   if (info->flags == INFO_READ) {
+   for (i = 0; i < info->pnum; i++) {
+   if (info->pages[i]) {
+   set_page_dirty_lock(info->pages[i]);
+   put_page(info->pages[i]);
+   }
+   }
+   info->skb->destructor = NULL;
+   kfree_skb(info->skb);
+   info->ctor->rq_len--;
+   } else
+   info->ctor->wq_len--;
+   /* Decrement the number of locked pages */
+   info->ctor->lock_pages -= info->pnum;
+   kmem_cache_free(ext_page_info_cache, info);
+   relinquish_resource(info->ctor);
+
+   return;
+}
+
+static struct kiocb *create_iocb(struct page_info *info, int size)
+{
+   struct kiocb *iocb = NULL;
+
+   iocb = info->iocb;
+   if (!iocb)
+   return iocb;
+   iocb->ki_flags = 0;
+   iocb->ki_users = 1;
+   iocb->ki_key = 0;
+   iocb->ki_ctx = NULL;
+   iocb->ki_cancel = NULL;
+   iocb->ki_retry = NULL;
+   iocb->ki_iovec = NULL;
+   iocb->ki_eventfd = NULL;
+   iocb->ki_pos = info->desc_pos;
+   iocb->ki_nbytes = size;
+   iocb->ki_dtor(iocb);
+   iocb->private = (void *)info;
+   iocb->ki_dtor = mp_ki_dtor;
+
+   return iocb;
+}
+
 static int page_ctor_detach(struct mp_struct *mp)
 {
struct page_ctor *ctor;
struct page_info *info;
-   struct kiocb *iocb = NULL;
int i;
 
/* locked by mp_mutex */
@@ -268,11 +356,17 @@ static int page_ctor_detach(struct mp_struct *mp)
for (i = 0; i < info->pnum; i++)
if (info->pages[i])
put_page(info->pages[i]);
+   create_iocb(info, 0);
+   ctor->rq_len--;
kmem_cache_free(ext_page_info_cache, info);
}
+
+   relinquish_resource(ctor);
+
set_memlock_rlimit(ctor, RLIMIT_MEMLOCK,
   ctor->o_rlim.rlim_cur,
   ctor->o_rlim.rlim_max);
+
netdev_mp_port_detach(ctor->dev);
dev_put(ctor->

[RFC PATCH v7 15/19] Add basic funcs and ioctl to mp device.

2010-06-05 Thread xiaohui . xin
From: Xin Xiaohui 

The ioctl is used by the mp device to bind an underlying
NIC; it queries the hardware capability and declares that
the NIC will use external buffers.
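
For orientation, a sketch of how a backend could drive this bind from user
space. The MPASSTHRU_BINDDEV name and the ifreq-style argument are
assumptions taken from the description; the real definitions live in
include/linux/mpassthru.h, which is not quoted here:

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <net/if.h>
#include <linux/mpassthru.h>    /* from this patch series */

/* Hypothetical user-space usage sketch, not the kernel-side handler. */
static int bind_mp_device(const char *ifname)
{
        struct ifreq ifr;
        int fd = open("/dev/mpassthru", O_RDWR);

        if (fd < 0)
                return -1;
        memset(&ifr, 0, sizeof(ifr));
        strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);
        /* Assumed ioctl: binds the NIC and flips it into
         * external-buffer (zero-copy) mode. */
        return ioctl(fd, MPASSTHRU_BINDDEV, &ifr);
}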

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---

memory leak fixed,
kconfig made,
do_unbind() made,
mp_chr_ioctl() cleanup

by Jeff Dike 


 drivers/vhost/mpassthru.c |  681 +
 1 files changed, 681 insertions(+), 0 deletions(-)
 create mode 100644 drivers/vhost/mpassthru.c

diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
new file mode 100644
index 000..25e2f3e
--- /dev/null
+++ b/drivers/vhost/mpassthru.c
@@ -0,0 +1,681 @@
+/*
+ *  MPASSTHRU - Mediate passthrough device.
+ *  Copyright (C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ */
+
+#define DRV_NAME"mpassthru"
+#define DRV_DESCRIPTION "Mediate passthru device driver"
+#define DRV_COPYRIGHT   "(C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G"
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+/* Uncomment to enable debugging */
+/* #define MPASSTHRU_DEBUG 1 */
+
+#ifdef MPASSTHRU_DEBUG
+static int debug;
+
+#define DBG  if (mp->debug) printk
+#define DBG1 if (debug == 2) printk
+#else
+#define DBG(a...)
+#define DBG1(a...)
+#endif
+
+#define COPY_THRESHOLD (L1_CACHE_BYTES * 4)
+#define COPY_HDR_LEN   (L1_CACHE_BYTES < 64 ? 64 : L1_CACHE_BYTES)
+
+struct frag {
+   u16 offset;
+   u16 size;
+};
+
+struct page_info {
+   struct list_headlist;
+   int header;
+   /* indicate the actual length of bytes
+* send/recv in the external buffers
+*/
+   int total;
+   int offset;
+   struct page *pages[MAX_SKB_FRAGS+1];
+   struct skb_frag_struct  frag[MAX_SKB_FRAGS+1];
+   struct sk_buff  *skb;
+   struct page_ctor*ctor;
+
+   /* The pointer relayed to skb, to indicate
+* it's a external allocated skb or kernel
+*/
+   struct skb_external_pageext_page;
+   struct skb_shared_info  ushinfo;
+
+#define INFO_READ  0
+#define INFO_WRITE 1
+   unsignedflags;
+   unsignedpnum;
+
+   /* It's meaningful for receive, means
+* the max length allowed
+*/
+   size_t  len;
+
+   /* The fields after that is for backend
+* driver, now for vhost-net.
+*/
+
+   struct kiocb*iocb;
+   unsigned intdesc_pos;
+   struct iovechdr[MAX_SKB_FRAGS + 2];
+   struct ioveciov[MAX_SKB_FRAGS + 2];
+};
+
+static struct kmem_cache *ext_page_info_cache;
+
+struct page_ctor {
+   struct list_headreadq;
+   int wq_len;
+   int rq_len;
+   spinlock_t  read_lock;
+   /* record the locked pages */
+   int lock_pages;
+   struct rlimit   o_rlim;
+   struct net_device   *dev;
+   struct mpassthru_port   port;
+};
+
+struct mp_struct {
+   struct mp_file  *mfile;
+   struct net_device   *dev;
+   struct page_ctor*ctor;
+   struct socket   socket;
+
+#ifdef MPASSTHRU_DEBUG
+   int debug;
+#endif
+};
+
+struct mp_file {
+   atomic_t count;
+   struct mp_struct *mp;
+   struct net *net;
+};
+
+struct mp_sock {
+   struct sock sk;
+   struct mp_struct*mp;
+};
+
+static int mp_dev_change_flags(struct net_device *dev, unsigned flags)
+{
+   int ret = 0;
+
+   rtnl_lock();
+   ret = dev_change_flags(dev, flags);
+   rtnl_unlock();
+
+   if (ret < 0)
+   printk(KERN_ERR "failed to change dev state of %s", dev->name);
+
+   return ret;
+}
+
+static int page_ctor_attach(struct mp_struct *mp)
+{
+   int rc;
+   struct page_ctor *ctor;
+   struct net_device *dev = mp->dev;
+
+   /* locked by mp_mutex */
+   if (rcu_dereference(mp->ctor))
+   return -EBU

[RFC PATCH v7 13/19] To skip GRO if buffer is external currently.

2010-06-05 Thread xiaohui . xin
From: Xin Xiaohui 

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 net/core/dev.c |4 
 1 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index dc2f225..6c6b2fe 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2787,6 +2787,10 @@ enum gro_result dev_gro_receive(struct napi_struct 
*napi, struct sk_buff *skb)
if (skb_is_gso(skb) || skb_has_frags(skb))
goto normal;
 
+   /* currently GRO is not supported by mediate passthru */
+   if (dev_is_mpassthru(skb->dev))
+   goto normal;
+
rcu_read_lock();
list_for_each_entry_rcu(ptype, head, list) {
if (ptype->type != type || ptype->dev || !ptype->gro_receive)
-- 
1.5.4.4



[RFC PATCH v7 12/19] Add a hook to intercept external buffers from NIC driver.

2010-06-05 Thread xiaohui . xin
From: Xin Xiaohui 

The hook is called in netif_receive_skb().
Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 net/core/dev.c |   35 +++
 1 files changed, 35 insertions(+), 0 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 37b389a..dc2f225 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2548,6 +2548,37 @@ err:
 EXPORT_SYMBOL(netdev_mp_port_prep);
 #endif
 
+#if defined(CONFIG_MEDIATE_PASSTHRU) || defined(CONFIG_MEDIATE_PASSTHRU_MODULE)
+/* Add a hook to intercept mediate passthru (zero-copy) packets,
+ * and insert them into the socket queue owned by the mp_port.
+ */
+static inline struct sk_buff *handle_mpassthru(struct sk_buff *skb,
+  struct packet_type **pt_prev,
+  int *ret,
+  struct net_device *orig_dev)
+{
+   struct mpassthru_port *mp_port = NULL;
+   struct sock *sk = NULL;
+
+   if (!dev_is_mpassthru(skb->dev))
+   return skb;
+   mp_port = skb->dev->mp_port;
+
+   if (*pt_prev) {
+   *ret = deliver_skb(skb, *pt_prev, orig_dev);
+   *pt_prev = NULL;
+   }
+
+   sk = mp_port->sock->sk;
+   skb_queue_tail(&sk->sk_receive_queue, skb);
+   sk->sk_state_change(sk);
+
+   return NULL;
+}
+#else
+#define handle_mpassthru(skb, pt_prev, ret, orig_dev) (skb)
+#endif
+
 /**
  * netif_receive_skb - process receive buffer from network
  * @skb: buffer to process
@@ -2629,6 +2660,10 @@ int netif_receive_skb(struct sk_buff *skb)
 ncls:
 #endif
 
+   /* To intercept mediate passthru(zero-copy) packets here */
+   skb = handle_mpassthru(skb, &pt_prev, &ret, orig_dev);
+   if (!skb)
+   goto out;
skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
if (!skb)
goto out;
-- 
1.5.4.4



[RFC PATCH v7 10/19] Don't do skb recycle, if device use external buffer.

2010-06-05 Thread xiaohui . xin
From: Xin Xiaohui 

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 net/core/skbuff.c |6 ++
 1 files changed, 6 insertions(+), 0 deletions(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 38d19d0..37587f0 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -553,6 +553,12 @@ int skb_recycle_check(struct sk_buff *skb, int skb_size)
if (skb_shared(skb) || skb_cloned(skb))
return 0;
 
+   /* if the device wants to do mediate passthru, the skb may
+* get external buffer, so don't recycle
+*/
+   if (dev_is_mpassthru(skb->dev))
+   return 0;
+
skb_release_head_state(skb);
shinfo = skb_shinfo(skb);
atomic_set(&shinfo->dataref, 1);
-- 
1.5.4.4



[RFC PATCH v7 09/19] Ignore room skb_reserve() when device is using external buffer.

2010-06-05 Thread xiaohui . xin
From: Xin Xiaohui 

To make skb->data and skb->head consistent when they come
from an external buffer, we ignore the room the driver
reserves for a kernel skb.
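
From a NIC driver's point of view nothing changes at the call site; the
reserve simply becomes a no-op when the skb carries an external buffer.
A small illustrative sketch (the buffer size is made up):

#include <linux/netdevice.h>
#include <linux/skbuff.h>

static struct sk_buff *rx_refill_example(struct net_device *dev)
{
        /* made-up buffer size, just for illustration */
        struct sk_buff *skb = netdev_alloc_skb(dev, 2048 + NET_IP_ALIGN);

        if (!skb)
                return NULL;
        /* With this patch, if __alloc_skb() attached an external (guest)
         * buffer, destructor_arg is set and the reserve below is a no-op,
         * so skb->data keeps pointing at the start of the guest buffer.
         * For ordinary kernel skbs the behaviour is unchanged.
         */
        skb_reserve(skb, NET_IP_ALIGN);
        return skb;
}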

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 include/linux/skbuff.h |9 +
 1 files changed, 9 insertions(+), 0 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 5ff8c27..193b259 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1200,6 +1200,15 @@ static inline int skb_tailroom(const struct sk_buff *skb)
  */
 static inline void skb_reserve(struct sk_buff *skb, int len)
 {
+   /* skb_reserve() is only meaningful for an empty buffer.
+* When the skb gets an external buffer, we cannot guarantee
+* that buffer has the same reserved space in its head as a
+* kernel-allocated skb, so the reserve has to be ignored.
+* The external buffer info has been recorded in the
+* destructor_arg field, so use it as the indicator.
+*/
+   if (skb_shinfo(skb)->destructor_arg)
+   return;
skb->data += len;
skb->tail += len;
 }
-- 
1.5.4.4



[RFC PATCH v7 08/19] Make __alloc_skb() to get external buffer.

2010-06-05 Thread xiaohui . xin
From: Xin Xiaohui 

Add a dev parameter to __alloc_skb(), skb->data
points to external buffer, recompute skb->head,
maintain shinfo of the external buffer, record
external buffer info into destructor_arg field.
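
A sketch of what the new parameter means for callers; the hunk that adapts
__netdev_alloc_skb() is truncated in this mail, so the exact shape there is
an assumption, but the intent is that only an rx path which knows its
device passes it down:

#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Sketch: only an rx-refill path that knows its device passes it on, so
 * __alloc_skb() can try the external (guest) buffer first; everything
 * else keeps passing NULL and behaves exactly as before.
 */
static struct sk_buff *alloc_rx_skb(struct net_device *dev, unsigned int size)
{
        if (dev && dev->mp_port)
                return __alloc_skb(size, GFP_ATOMIC, 0, -1, dev);
        return alloc_skb(size, GFP_ATOMIC);
}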

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---

__alloc_skb() cleanup by

Jeff Dike 

 include/linux/skbuff.h |7 ---
 net/core/skbuff.c  |   43 +--
 2 files changed, 41 insertions(+), 9 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 281a1c0..5ff8c27 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -442,17 +442,18 @@ extern void kfree_skb(struct sk_buff *skb);
 extern void consume_skb(struct sk_buff *skb);
 extern void   __kfree_skb(struct sk_buff *skb);
 extern struct sk_buff *__alloc_skb(unsigned int size,
-  gfp_t priority, int fclone, int node);
+  gfp_t priority, int fclone,
+  int node, struct net_device *dev);
 static inline struct sk_buff *alloc_skb(unsigned int size,
gfp_t priority)
 {
-   return __alloc_skb(size, priority, 0, -1);
+   return __alloc_skb(size, priority, 0, -1, NULL);
 }
 
 static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
   gfp_t priority)
 {
-   return __alloc_skb(size, priority, 1, -1);
+   return __alloc_skb(size, priority, 1, -1, NULL);
 }
 
 extern int skb_recycle_check(struct sk_buff *skb, int skb_size);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index fbdb1f1..38d19d0 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -161,7 +161,8 @@ EXPORT_SYMBOL(skb_under_panic);
  * @fclone: allocate from fclone cache instead of head cache
  * and allocate a cloned (child) skb
  * @node: numa node to allocate memory on
- *
+ * @dev: the device that owns the skb if the skb tries to get an
+ * external buffer; otherwise NULL.
  * Allocate a new &sk_buff. The returned buffer has no headroom and a
  * tail room of size bytes. The object has a reference count of one.
  * The return is the buffer. On a failure the return is %NULL.
@@ -170,12 +171,13 @@ EXPORT_SYMBOL(skb_under_panic);
  * %GFP_ATOMIC.
  */
 struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
-   int fclone, int node)
+   int fclone, int node, struct net_device *dev)
 {
struct kmem_cache *cache;
struct skb_shared_info *shinfo;
struct sk_buff *skb;
-   u8 *data;
+   u8 *data = NULL;
+   struct skb_external_page *ext_page = NULL;
 
cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
 
@@ -185,8 +187,23 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t 
gfp_mask,
goto out;
 
size = SKB_DATA_ALIGN(size);
-   data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
-   gfp_mask, node);
+
+   /* If the device wants to do mediate passthru (zero-copy),
+* the skb may try to get external buffers from outside.
+* If that fails, fall back to allocating buffers from the kernel.
+*/
+   if (dev && dev->mp_port) {
+   ext_page = netdev_alloc_external_page(dev, skb, size);
+   if (ext_page) {
+   data = ext_page->start;
+   size = ext_page->size;
+   }
+   }
+
+   if (!data)
+   data = kmalloc_node_track_caller(
+   size + sizeof(struct skb_shared_info),
+   gfp_mask, node);
if (!data)
goto nodata;
 
@@ -208,6 +225,15 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t 
gfp_mask,
skb->mac_header = ~0U;
 #endif
 
+   /* If the skb got external buffers successfully, keep a copy of the
+* shinfo (which sits at the end of the buffer) so that it can be
+* restored when we need it later.
+*/
+   if (ext_page) {
+   skb->head = skb->data - NET_IP_ALIGN - NET_SKB_PAD;
+   memcpy(ext_page->ushinfo, skb_shinfo(skb),
+  sizeof(struct skb_shared_info));
+   }
/* make sure we initialize shinfo sequentially */
shinfo = skb_shinfo(skb);
atomic_set(&shinfo->dataref, 1);
@@ -231,6 +257,11 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t 
gfp_mask,
 
child->fclone = SKB_FCLONE_UNAVAILABLE;
}
+   /* Record the external buffer info in this field. It's not so good,
+* but we cannot find another place easily.
+*/
+   shinfo->destructor_arg = ext_page;
+
 out:
return skb;
 nodata:
@@ -259,7 +290,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
int node = dev->dev.parent ? dev_to_node(

[RFC PATCH v7 06/19] Add a function to indicate if device use external buffer.

2010-06-05 Thread xiaohui . xin
From: Xin Xiaohui 

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 include/linux/netdevice.h |5 +
 1 files changed, 5 insertions(+), 0 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 31d9c4a..0cb78f4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1602,6 +1602,11 @@ extern void netdev_mp_port_detach(struct net_device 
*dev);
 extern int netdev_mp_port_prep(struct net_device *dev,
struct mpassthru_port *port);
 
+static inline bool dev_is_mpassthru(struct net_device *dev)
+{
+   return (dev && dev->mp_port);
+}
+
 static inline void napi_free_frags(struct napi_struct *napi)
 {
kfree_skb(napi->skb);
-- 
1.5.4.4



[RFC PATCH v7 05/19] Add a function make external buffer owner to query capability.

2010-06-05 Thread xiaohui . xin
From: Xin Xiaohui 

The external buffer owner can use the functions to get
the capability of the underlying NIC driver.
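
The intended caller is the mp device's bind path, which appears later in
the series and is not quoted here; roughly, the port is prepared first and
only attached if the reported parameters are sane. A sketch under that
assumption:

#include <linux/netdevice.h>

/* Sketch of the expected usage from the mp device's bind path. */
static int example_bind(struct net_device *dev, struct mpassthru_port *port)
{
        int err;

        err = netdev_mp_port_prep(dev, port);   /* query hdr_len/data_len/npages */
        if (err)
                return err;                     /* NIC cannot do sane packet split */

        return netdev_mp_port_attach(dev, port); /* publish dev->mp_port */
}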

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 include/linux/netdevice.h |2 +
 net/core/dev.c|   51 +
 2 files changed, 53 insertions(+), 0 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 183c786..31d9c4a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1599,6 +1599,8 @@ extern gro_result_t   napi_gro_frags(struct 
napi_struct *napi);
 extern int netdev_mp_port_attach(struct net_device *dev,
 struct mpassthru_port *port);
 extern void netdev_mp_port_detach(struct net_device *dev);
+extern int netdev_mp_port_prep(struct net_device *dev,
+   struct mpassthru_port *port);
 
 static inline void napi_free_frags(struct napi_struct *napi)
 {
diff --git a/net/core/dev.c b/net/core/dev.c
index ecbb6b1..37b389a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2497,6 +2497,57 @@ void netdev_mp_port_detach(struct net_device *dev)
 }
 EXPORT_SYMBOL(netdev_mp_port_detach);
 
+/* To support mediate passthru (zero-copy) with a NIC driver,
+ * we'd better query the NIC driver for the capability it can
+ * provide, especially for packet split mode. For now we only
+ * query for the header size and the payload a descriptor
+ * may carry. If a driver does not use the API to export
+ * them, we fall back to default values; currently these are
+ * the defaults from the IGB driver. For now this is only
+ * called by the mpassthru device.
+ */
+#if defined(CONFIG_MEDIATE_PASSTHRU) || defined(CONFIG_MEDIATE_PASSTHRU_MODULE)
+int netdev_mp_port_prep(struct net_device *dev,
+   struct mpassthru_port *port)
+{
+   int rc;
+   int npages, data_len;
+   const struct net_device_ops *ops = dev->netdev_ops;
+
+   /* needed by packet split */
+
+   if (ops->ndo_mp_port_prep) {
+   rc = ops->ndo_mp_port_prep(dev, port);
+   if (rc)
+   return rc;
+   } else {
+   /* If the NIC driver did not report this,
+* then we try to use default value.
+*/
+   port->hdr_len = 128;
+   port->data_len = 2048;
+   port->npages = 1;
+   }
+
+   if (port->hdr_len <= 0)
+   goto err;
+
+   npages = port->npages;
+   data_len = port->data_len;
+   if (npages <= 0 || npages > MAX_SKB_FRAGS ||
+   (data_len < PAGE_SIZE * (npages - 1) ||
+data_len > PAGE_SIZE * npages))
+   goto err;
+
+   return 0;
+err:
+   dev_warn(&dev->dev, "invalid page constructor parameters\n");
+
+   return -EINVAL;
+}
+EXPORT_SYMBOL(netdev_mp_port_prep);
+#endif
+
 /**
  * netif_receive_skb - process receive buffer from network
  * @skb: buffer to process
-- 
1.5.4.4



[RFC PATCH v7 04/19] Add a ndo_mp_port_prep pointer to net_device_ops.

2010-06-05 Thread xiaohui . xin
From: Xin Xiaohui 

If the driver wants to allocate external buffers,
it can export its capability, such as the skb
buffer header length, the page length that can be DMAed, etc.
The owner of the external buffers may utilize this.
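
A hypothetical driver-side implementation of the new hook, only to show
what "export its capability" means; the values mirror the IGB-style
defaults that netdev_mp_port_prep() falls back to later in this series,
and the foo_* names are made up:

#include <linux/netdevice.h>

/* Hypothetical packet-split NIC driver exporting its rx buffer layout. */
static int foo_ndo_mp_port_prep(struct net_device *dev,
                                struct mpassthru_port *port)
{
        port->hdr_len  = 128;   /* header buffer a descriptor carries */
        port->data_len = 2048;  /* payload bytes per descriptor */
        port->npages   = 1;     /* pages backing one payload buffer */
        return 0;
}

static const struct net_device_ops foo_netdev_ops = {
        /* ... the driver's existing ops ... */
#if defined(CONFIG_MEDIATE_PASSTHRU) || defined(CONFIG_MEDIATE_PASSTHRU_MODULE)
        .ndo_mp_port_prep = foo_ndo_mp_port_prep,
#endif
};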

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 include/linux/netdevice.h |4 
 1 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index efb575a..183c786 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -707,6 +707,10 @@ struct net_device_ops {
int (*ndo_fcoe_get_wwn)(struct net_device *dev,
u64 *wwn, int type);
 #endif
+#if defined(CONFIG_MEDIATE_PASSTHRU) || defined(CONFIG_MEDIATE_PASSTHRU_MODULE)
+   int (*ndo_mp_port_prep)(struct net_device *dev,
+   struct mpassthru_port *port);
+#endif
 };
 
 /*
-- 
1.5.4.4



[PATCH v1 0/3] Provide a zero-copy method on KVM virtio-net.

2010-03-06 Thread xiaohui . xin
The idea is simple: just pin the guest VM user space and then
let the host NIC driver have the chance to directly DMA to it.
The patches are based on the vhost-net backend driver. We add a device
which provides proto_ops as sendmsg/recvmsg to vhost-net to
send/recv directly to/from the NIC driver. A KVM guest that uses the
vhost-net backend may bind any ethX interface on the host side to
get copyless data transfer thru the guest virtio-net frontend.
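
Concretely, vhost-net then only has to call the socket ops of this device
instead of a tap or raw socket. A simplified sketch of the receive side,
mirroring the handle_rx() changes in patch 2/3 below; the msg_control
trick for passing the virtqueue is part of this series, while the error
handling and the exact flags here are simplified assumptions:

#include <linux/net.h>
#include <linux/socket.h>
#include "vhost.h"      /* struct vhost_virtqueue, link_state (this series) */

/* Simplified: how vhost-net hands one descriptor's iovec to the mp socket. */
static int mp_recv_one(struct socket *sock, struct vhost_virtqueue *vq,
                       struct iovec *iov, int in, size_t len)
{
        struct msghdr msg = {
                .msg_iov    = iov,
                .msg_iovlen = in,
        };

        if (vq->link_state == VHOST_VQ_LINK_ASYNC)
                msg.msg_control = (void *)vq;   /* async completion context */

        return sock->ops->recvmsg(NULL, sock, &msg, len, MSG_DONTWAIT);
}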

We provide multiple submits and asynchronous notification to
vhost-net too.

Our goal is to improve the bandwidth and reduce the CPU usage.
Exact performance data will be provided later. But for a simple
test with netperf, we found bandwidth up and CPU % up too,
but the bandwidth up ratio is much more than the CPU % up ratio.

What we have not done yet:
packet split support
To support GRO
Performance tuning

what we have done in v1:
polish the RCU usage
deal with write logging in asynchronous mode in vhost
add notifier block for mp device
rename page_ctor to mp_port in netdevice.h to make it look generic
add mp_dev_change_flags() for mp device to change NIC state
add CONFIG_VHOST_MPASSTHRU to limit the usage when the module is not loaded
a small fix for missing dev_put when fail
using dynamic minor instead of static minor number
a __KERNEL__ protect to mp_get_sock()

performance:
using netperf with GSO/TSO disabled, 10G NIC, 
disabled packet split mode, with the raw socket case compared to vhost.

bandwidth goes from 1.1Gbps to 1.7Gbps
CPU % from 120%-140% to 140%-160%


[PATCH v1 1/3] A device for zero-copy based on KVM virtio-net.

2010-03-06 Thread xiaohui . xin
From: Xin Xiaohui 

Add a device to utilize the vhost-net backend driver for
copy-less data transfer between guest FE and host NIC.
It pins the guest user space to the host memory and
provides proto_ops as sendmsg/recvmsg to vhost-net.

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Signed-off-by: Jeff Dike 
---
 drivers/vhost/Kconfig |5 +
 drivers/vhost/Makefile|2 +
 drivers/vhost/mpassthru.c | 1202 +
 include/linux/mpassthru.h |   29 ++
 4 files changed, 1238 insertions(+), 0 deletions(-)
 create mode 100644 drivers/vhost/mpassthru.c
 create mode 100644 include/linux/mpassthru.h

diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index 9f409f4..ee32a3b 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -9,3 +9,8 @@ config VHOST_NET
  To compile this driver as a module, choose M here: the module will
  be called vhost_net.
 
+config VHOST_PASSTHRU
+   tristate "Zerocopy network driver (EXPERIMENTAL)"
+   depends on VHOST_NET
+   ---help---
+ zerocopy network I/O support
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index 72dd020..3f79c79 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -1,2 +1,4 @@
 obj-$(CONFIG_VHOST_NET) += vhost_net.o
 vhost_net-y := vhost.o net.o
+
+obj-$(CONFIG_VHOST_PASSTHRU) += mpassthru.o
diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
new file mode 100644
index 000..744d6cd
--- /dev/null
+++ b/drivers/vhost/mpassthru.c
@@ -0,0 +1,1202 @@
+/*
+ *  MPASSTHRU - Mediate passthrough device.
+ *  Copyright (C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ */
+
+#define DRV_NAME"mpassthru"
+#define DRV_DESCRIPTION "Mediate passthru device driver"
+#define DRV_COPYRIGHT   "(C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G"
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#include "vhost.h"
+
+/* Uncomment to enable debugging */
+/* #define MPASSTHRU_DEBUG 1 */
+
+#ifdef MPASSTHRU_DEBUG
+static int debug;
+
+#define DBG  if (mp->debug) printk
+#define DBG1 if (debug == 2) printk
+#else
+#define DBG(a...)
+#define DBG1(a...)
+#endif
+
+#define COPY_THRESHOLD (L1_CACHE_BYTES * 4)
+#define COPY_HDR_LEN   (L1_CACHE_BYTES < 64 ? 64 : L1_CACHE_BYTES)
+
+struct frag {
+   u16 offset;
+   u16 size;
+};
+
+struct page_ctor {
+   struct list_headreadq;
+   int w_len;
+   int r_len;
+   spinlock_t  read_lock;
+   atomic_trefcnt;
+   struct kmem_cache   *cache;
+   struct net_device   *dev;
+   struct mpassthru_port   port;
+   void*sendctrl;
+   void*recvctrl;
+};
+
+struct page_info {
+   struct list_headlist;
+   int header;
+   /* indicate the actual length of bytes
+* send/recv in the user space buffers
+*/
+   int total;
+   int offset;
+   struct page *pages[MAX_SKB_FRAGS+1];
+   struct skb_frag_struct  frag[MAX_SKB_FRAGS+1];
+   struct sk_buff  *skb;
+   struct page_ctor*ctor;
+
+   /* The pointer relayed to skb, to indicate
+* it's a user space allocated skb or kernel
+*/
+   struct skb_user_pageuser;
+   struct skb_shared_info  ushinfo;
+
+#define INFO_READ  0
+#define INFO_WRITE 1
+   unsignedflags;
+   unsignedpnum;
+
+   /* It's meaningful for receive, means
+* the max length allowed
+*/
+   size_t  len;
+
+   /* The fields after that is for backend
+* driver, now for vhost-net.
+*/
+   struct vhost_notifier   notifier;
+   unsigned intdesc_pos;
+   unsigned intlog;
+   struct iovechdr[VHOST_NET_MAX_SG];
+   struct ioveciov[VHOST_NET_MAX_SG];
+   void*ctl;
+};
+
+struct mp_struct {
+   struct mp_file  *mfile;
+   struct net_device   *

[PATCH v1 2/3] Provides multiple submits and asynchronous notifications.

2010-03-06 Thread xiaohui . xin
From: Xin Xiaohui 

The vhost-net backend now only supports synchronous send/recv
operations. The patch provides multiple submits and asynchronous
notifications. This is needed for the zero-copy case.

Signed-off-by: Xin Xiaohui 
---
 drivers/vhost/net.c   |  156 +++--
 drivers/vhost/vhost.h |   23 +++
 2 files changed, 174 insertions(+), 5 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 22d5fef..24a6c3d 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -91,6 +92,12 @@ static void tx_poll_start(struct vhost_net *net, struct 
socket *sock)
net->tx_poll_state = VHOST_NET_POLL_STARTED;
 }
 
+static void handle_async_rx_events_notify(struct vhost_net *net,
+   struct vhost_virtqueue *vq);
+
+static void handle_async_tx_events_notify(struct vhost_net *net,
+   struct vhost_virtqueue *vq);
+
 /* Expects to be always run from workqueue - which acts as
  * read-size critical section for our kind of RCU. */
 static void handle_tx(struct vhost_net *net)
@@ -124,6 +131,8 @@ static void handle_tx(struct vhost_net *net)
tx_poll_stop(net);
hdr_size = vq->hdr_size;
 
+   handle_async_tx_events_notify(net, vq);
+
for (;;) {
head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
 ARRAY_SIZE(vq->iov),
@@ -151,6 +160,12 @@ static void handle_tx(struct vhost_net *net)
/* Skip header. TODO: support TSO. */
s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
msg.msg_iovlen = out;
+
+   if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
+   vq->head = head;
+   msg.msg_control = (void *)vq;
+   }
+
len = iov_length(vq->iov, out);
/* Sanity check */
if (!len) {
@@ -166,6 +181,10 @@ static void handle_tx(struct vhost_net *net)
tx_poll_start(net, sock);
break;
}
+
+   if (vq->link_state == VHOST_VQ_LINK_ASYNC)
+   continue;
+
if (err != len)
pr_err("Truncated TX packet: "
   " len %d != %zd\n", err, len);
@@ -177,6 +196,8 @@ static void handle_tx(struct vhost_net *net)
}
}
 
+   handle_async_tx_events_notify(net, vq);
+
mutex_unlock(&vq->mutex);
unuse_mm(net->dev.mm);
 }
@@ -206,7 +227,8 @@ static void handle_rx(struct vhost_net *net)
int err;
size_t hdr_size;
struct socket *sock = rcu_dereference(vq->private_data);
-   if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
+   if (!sock || (skb_queue_empty(&sock->sk->sk_receive_queue) &&
+   vq->link_state == VHOST_VQ_LINK_SYNC))
return;
 
use_mm(net->dev.mm);
@@ -214,9 +236,18 @@ static void handle_rx(struct vhost_net *net)
vhost_disable_notify(vq);
hdr_size = vq->hdr_size;
 
-   vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
+   /* In async cases, for write logging, the simple way is to always
+* fetch the log info, and decide whether to really log later.
+* Thus, when logging is enabled we get the log, and when it is
+* disabled the log is simply not used.
+*/
+
+   vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) |
+   (vq->link_state == VHOST_VQ_LINK_ASYNC) ?
vq->log : NULL;
 
+   handle_async_rx_events_notify(net, vq);
+
for (;;) {
head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
 ARRAY_SIZE(vq->iov),
@@ -245,6 +276,11 @@ static void handle_rx(struct vhost_net *net)
s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
msg.msg_iovlen = in;
len = iov_length(vq->iov, in);
+   if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
+   vq->head = head;
+   vq->_log = log;
+   msg.msg_control = (void *)vq;
+   }
/* Sanity check */
if (!len) {
vq_err(vq, "Unexpected header len for RX: "
@@ -259,6 +295,10 @@ static void handle_rx(struct vhost_net *net)
vhost_discard_vq_desc(vq);
break;
}
+
+   if (vq->link_state == VHOST_VQ_LINK_ASYNC)
+   continue;
+
/* TODO: Should check and handle checksum. */
if (err > len) {
pr_err("Discarded truncated rx packet: "
@@ -284,10 +324,83 @@ static void handle_rx(struct vhost_net

[PATCH v1 3/3] Let host NIC driver to DMA to guest user space.

2010-03-06 Thread xiaohui . xin
From: Xin Xiaohui 

The patch lets the host NIC driver receive user space skbs,
so the driver has the chance to directly DMA to guest user
space buffers thru a single ethX interface.

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Signed-off-by: Jeff Dike 
---
 include/linux/netdevice.h |   76 ++-
 include/linux/skbuff.h|   30 +++--
 net/core/dev.c|   32 ++
 net/core/skbuff.c |   79 +
 4 files changed, 205 insertions(+), 12 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 94958c1..97bf12c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -485,6 +485,17 @@ struct netdev_queue {
unsigned long   tx_dropped;
 } cacheline_aligned_in_smp;
 
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+struct mpassthru_port  {
+   int hdr_len;
+   int data_len;
+   int npages;
+   unsignedflags;
+   struct socket   *sock;
+   struct skb_user_page*(*ctor)(struct mpassthru_port *,
+   struct sk_buff *, int);
+};
+#endif
 
 /*
  * This structure defines the management hooks for network devices.
@@ -636,6 +647,10 @@ struct net_device_ops {
int (*ndo_fcoe_ddp_done)(struct net_device *dev,
 u16 xid);
 #endif
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+   int (*ndo_mp_port_prep)(struct net_device *dev,
+   struct mpassthru_port *port);
+#endif
 };
 
 /*
@@ -891,7 +906,8 @@ struct net_device
struct macvlan_port *macvlan_port;
/* GARP */
struct garp_port*garp_port;
-
+   /* mpassthru */
+   struct mpassthru_port   *mp_port;
/* class/net/name entry */
struct device   dev;
/* space for optional statistics and wireless sysfs groups */
@@ -2013,6 +2029,62 @@ static inline u32 dev_ethtool_get_flags(struct 
net_device *dev)
return 0;
return dev->ethtool_ops->get_flags(dev);
 }
-#endif /* __KERNEL__ */
 
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+static inline int netdev_mp_port_prep(struct net_device *dev,
+   struct mpassthru_port *port)
+{
+   int rc;
+   int npages, data_len;
+   const struct net_device_ops *ops = dev->netdev_ops;
+
+   /* needed by packet split */
+   if (ops->ndo_mp_port_prep) {
+   rc = ops->ndo_mp_port_prep(dev, port);
+   if (rc)
+   return rc;
+   } else {  /* should be temp */
+   port->hdr_len = 128;
+   port->data_len = 2048;
+   port->npages = 1;
+   }
+
+   if (port->hdr_len <= 0)
+   goto err;
+
+   npages = port->npages;
+   data_len = port->data_len;
+   if (npages <= 0 || npages > MAX_SKB_FRAGS ||
+   (data_len < PAGE_SIZE * (npages - 1) ||
+data_len > PAGE_SIZE * npages))
+   goto err;
+
+   return 0;
+err:
+   dev_warn(&dev->dev, "invalid page constructor parameters\n");
+
+   return -EINVAL;
+}
+
+static inline int netdev_mp_port_attach(struct net_device *dev,
+   struct mpassthru_port *port)
+{
+   if (rcu_dereference(dev->mp_port))
+   return -EBUSY;
+
+   rcu_assign_pointer(dev->mp_port, port);
+
+   return 0;
+}
+
+static inline void netdev_mp_port_detach(struct net_device *dev)
+{
+   if (!rcu_dereference(dev->mp_port))
+   return;
+
+   rcu_assign_pointer(dev->mp_port, NULL);
+   synchronize_rcu();
+}
+#endif /* CONFIG_VHOST_PASSTHRU */
+#endif /* __KERNEL__ */
 #endif /* _LINUX_NETDEVICE_H */
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index df7b23a..e59fa57 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -209,6 +209,13 @@ struct skb_shared_info {
void *  destructor_arg;
 };
 
+struct skb_user_page {
+   u8  *start;
+   int size;
+   struct skb_frag_struct *frags;
+   struct skb_shared_info *ushinfo;
+   void(*dtor)(struct skb_user_page *);
+};
 /* We divide dataref into two halves.  The higher 16 bits hold references
  * to the payload part of skb->data.  The lower 16 bits hold references to
  * the entire skb->data.  A clone of a headerless skb holds the length of
@@ -441,17 +448,18 @@ extern void kfree_skb(struct sk_buff *skb);
 extern void consume_skb(struct sk_buff *skb);
 extern void   __kfree_skb(struct sk_buff *skb);
 extern struct sk_buff *__alloc_skb(unsigned int size,
-  gfp_t priority, int fclone, int node);
+ 

[RFC][PATCH v2 0/3] Provide a zero-copy method on KVM virtio-net.

2010-04-02 Thread xiaohui . xin
The idea is simple: just pin the guest VM user space and then
let the host NIC driver have the chance to directly DMA to it.
The patches are based on the vhost-net backend driver. We add a device
which provides proto_ops as sendmsg/recvmsg to vhost-net to
send/recv directly to/from the NIC driver. A KVM guest that uses the
vhost-net backend may bind any ethX interface on the host side to
get copyless data transfer thru the guest virtio-net frontend.

The scenario is like this:

The guest virtio-net driver submits multiple requests thru the vhost-net
backend driver to the kernel. The requests are queued and then
completed after the corresponding actions in h/w are done.

For read, user space buffers are dispensed to the NIC driver for rx when
a page constructor API is invoked, which means the NIC can allocate user
buffers from a page constructor. We add a hook in the netif_receive_skb()
function to intercept the incoming packets and notify the zero-copy device.

For write, the zero-copy device allocates a new host skb, puts the
payload on skb_shinfo(skb)->frags, and copies the header to skb->data.
The request remains pending until the skb is transmitted by h/w.

Here, we have considered 2 ways to utilize the page constructor
API to dispense the user buffers.

One:    Modify the __alloc_skb() function a bit: it only allocates the
sk_buff structure, and the data pointer points to a user buffer
which comes from a page constructor API.
Then the shinfo of the skb is also from the guest.
When a packet is received from hardware, skb->data is filled
directly by h/w. This is the way we have implemented it.

Pros:   We can avoid any copy here.
Cons:   The guest virtio-net driver needs to allocate the skb in almost
the same way as the host NIC drivers do, i.e. the size passed to
netdev_alloc_skb() and the same reserved space in the head of the
skb. Many NIC drivers match the guest and are ok with this. But some
of the latest NIC drivers reserve special room in the skb head. To
deal with it, we suggest providing a method in the guest virtio-net
driver to ask the NIC driver for the parameters we are interested in,
once we know which device we have bound for zero-copy. Then we ask
the guest to do so.
Is that reasonable?

Two:    Modify the driver to get user buffers allocated from a page
constructor API (substituting alloc_page()); the user buffers are used
as payload buffers and filled by h/w directly when a packet is received.
The driver should associate the pages with the skb (skb_shinfo(skb)->frags).
For the head buffer side, let the host allocate the skb, and h/w fills it.
After that, the data filled in the host skb header will be copied into
the guest header buffer, which is submitted together with the payload buffer.

Pros:   We care less about how the guest or the host allocates their
buffers.
Cons:   We still need a bit copy here for the skb header.

We are not sure which way is better here. This is the first thing we want
to get comments on from the community. We wish the modification to the network
part to be generic, not used by the vhost-net backend only; a user
application may use it as well once the zero-copy device provides async
read/write operations later.

Please give comments especially for the network part modifications.


We provide multiple submits and asynchronous notification to
vhost-net too.

Our goal is to improve the bandwidth and reduce the CPU usage.
Exact performance data will be provided later. But for a simple
test with netperf, we found bandwidth up and CPU % up too,
but the bandwidth up ratio is much more than the CPU % up ratio.

What we have not done yet:
packet split support
To support GRO
Performance tuning

what we have done in v1:
polish the RCU usage
deal with write logging in asynchronous mode in vhost
add notifier block for mp device
rename page_ctor to mp_port in netdevice.h to make it look generic
add mp_dev_change_flags() for mp device to change NIC state
add CONFIG_VHOST_MPASSTHRU to limit the usage when the module is not loaded
a small fix for missing dev_put when fail
using dynamic minor instead of static minor number
a __KERNEL__ protect to mp_get_sock()

what we have done in v2:

remove most of the RCU usage, since the ctor pointer is only
changed by the BIND/UNBIND ioctl, and during that time the NIC is
stopped to get a good cleanup (all outstanding requests are finished),
so the ctor pointer cannot be raced into a wrong situation.

Replace the struct vhost_notifier with struct kiocb.
Let the vhost-net backend alloc/free the kiocb and transfer them
via sendmsg/recvmsg.

use get_user_pages_fast() and set_page_dirty_lock() 
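
Since the changelog names them, here is a minimal sketch of how those two
calls pair up around a guest buffer; the real code sits in the mp device's
send/receive paths, which are not quoted in this cover letter, and the
helper names here are made up:

#include <linux/mm.h>
#include <linux/kernel.h>

/* Pin one contiguous guest buffer for DMA ... */
static int pin_guest_buffer(unsigned long uaddr, size_t len,
                            struct page **pages, int max_pages)
{
        int npages = (int)(((uaddr & ~PAGE_MASK) + len + PAGE_SIZE - 1)
                           >> PAGE_SHIFT);

        if (npages > max_pages)
                npages = max_pages;

        /* write = 1: the NIC will DMA into these pages on rx */
        return get_user_pages_fast(uaddr, npages, 1, pages);
}

/* ... and release it once the h/w is done with the request. */
static void unpin_guest_buffer(struct page **pages, int npages, int dirty)
{
        int i;

        for (i = 0; i < npages; i++) {
                if (dirty)
                        set_page_dirty_lock(pages[i]);
                put_page(pages[i]);
        }
}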

[RFC] [PATCH v2 1/3] A device for zero-copy based on KVM virtio-net.

2010-04-02 Thread xiaohui . xin
From: Xin Xiaohui 

Add a device to utilize the vhost-net backend driver for
copy-less data transfer between guest FE and host NIC.
It pins the guest user space to the host memory and
provides proto_ops as sendmsg/recvmsg to vhost-net.

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Signed-off-by: Jeff Dike 
---

 drivers/vhost/Kconfig |5 +
 drivers/vhost/Makefile|2 +
 drivers/vhost/mpassthru.c | 1162 +
 include/linux/mpassthru.h |   29 ++
 4 files changed, 1198 insertions(+), 0 deletions(-)
 create mode 100644 drivers/vhost/mpassthru.c
 create mode 100644 include/linux/mpassthru.h

diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index 9f409f4..ee32a3b 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -9,3 +9,8 @@ config VHOST_NET
  To compile this driver as a module, choose M here: the module will
  be called vhost_net.
 
+config VHOST_PASSTHRU
+   tristate "Zerocopy network driver (EXPERIMENTAL)"
+   depends on VHOST_NET
+   ---help---
+ zerocopy network I/O support
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index 72dd020..3f79c79 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -1,2 +1,4 @@
 obj-$(CONFIG_VHOST_NET) += vhost_net.o
 vhost_net-y := vhost.o net.o
+
+obj-$(CONFIG_VHOST_PASSTHRU) += mpassthru.o
diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
new file mode 100644
index 000..6e8fc4d
--- /dev/null
+++ b/drivers/vhost/mpassthru.c
@@ -0,0 +1,1162 @@
+/*
+ *  MPASSTHRU - Mediate passthrough device.
+ *  Copyright (C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ */
+
+#define DRV_NAME"mpassthru"
+#define DRV_DESCRIPTION "Mediate passthru device driver"
+#define DRV_COPYRIGHT   "(C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G"
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#include "vhost.h"
+
+/* Uncomment to enable debugging */
+/* #define MPASSTHRU_DEBUG 1 */
+
+#ifdef MPASSTHRU_DEBUG
+static int debug;
+
+#define DBG  if (mp->debug) printk
+#define DBG1 if (debug == 2) printk
+#else
+#define DBG(a...)
+#define DBG1(a...)
+#endif
+
+#define COPY_THRESHOLD (L1_CACHE_BYTES * 4)
+#define COPY_HDR_LEN   (L1_CACHE_BYTES < 64 ? 64 : L1_CACHE_BYTES)
+
+struct frag {
+   u16 offset;
+   u16 size;
+};
+
+struct page_ctor {
+   struct list_headreadq;
+   int w_len;
+   int r_len;
+   spinlock_t  read_lock;
+   struct kmem_cache   *cache;
+   struct net_device   *dev;
+   struct mpassthru_port   port;
+};
+
+struct page_info {
+   void*ctrl;
+   struct list_headlist;
+   int header;
+   /* indicate the actual length of bytes
+* send/recv in the user space buffers
+*/
+   int total;
+   int offset;
+   struct page *pages[MAX_SKB_FRAGS+1];
+   struct skb_frag_struct  frag[MAX_SKB_FRAGS+1];
+   struct sk_buff  *skb;
+   struct page_ctor*ctor;
+
+   /* The pointer relayed to skb, to indicate
+* it's a user space allocated skb or kernel
+*/
+   struct skb_user_pageuser;
+   struct skb_shared_info  ushinfo;
+
+#define INFO_READ  0
+#define INFO_WRITE 1
+   unsignedflags;
+   unsignedpnum;
+
+   /* It's meaningful for receive, means
+* the max length allowed
+*/
+   size_t  len;
+
+   /* The fields after that is for backend
+* driver, now for vhost-net.
+*/
+
+   struct kiocb*iocb;
+   unsigned intdesc_pos;
+   unsigned intlog;
+   struct iovechdr[VHOST_NET_MAX_SG];
+   struct ioveciov[VHOST_NET_MAX_SG];
+};
+
+struct mp_struct {
+   struct mp_file  *mfile;
+   struct net_device   *dev;
+   struct page_ctor*ctor;
+   struct socket   socket;
+
+#ifdef MPASSTHRU_DEBUG
+ 

[RFC] [PATCH v2 2/3] Provides multiple submits and asynchronous notifications.

2010-04-02 Thread xiaohui . xin
From: Xin Xiaohui 

The vhost-net backend now only supports synchronous send/recv
operations. The patch provides multiple submits and asynchronous
notifications. This is needed for the zero-copy case.
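
The completion side lives in the mp device (reached through the kiocb that
vhost-net allocates below) and is not part of this diff; the following is
only a sketch of what it presumably does so that
handle_async_tx/rx_events_notify() find something to reap. The
notify_lock/notifier fields are the ones this patch adds to struct
vhost_virtqueue; the exact call site and the helper name are assumptions:

#include <linux/aio.h>
#include <linux/list.h>
#include "vhost.h"      /* vhost_virtqueue, vhost_poll_queue (this series) */

/* Sketch: mp device side, once DMA for one descriptor has finished.
 * The vq would be recovered from iocb->private, set by handle_tx/rx.
 */
static void mp_complete_one(struct vhost_virtqueue *vq, struct kiocb *iocb,
                            int bytes)
{
        unsigned long flags;

        iocb->ki_nbytes = bytes;        /* consumed by add_used later */

        spin_lock_irqsave(&vq->notify_lock, flags);
        list_add_tail(&iocb->ki_list, &vq->notifier);
        spin_unlock_irqrestore(&vq->notify_lock, flags);

        vhost_poll_queue(&vq->poll);    /* kick handle_tx()/handle_rx() */
}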

Signed-off-by: Xin Xiaohui 
---

 drivers/vhost/net.c   |  189 +++--
 drivers/vhost/vhost.h |   10 +++
 2 files changed, 192 insertions(+), 7 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 22d5fef..2aafd90 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -17,11 +17,13 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -47,6 +49,7 @@ struct vhost_net {
struct vhost_dev dev;
struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
struct vhost_poll poll[VHOST_NET_VQ_MAX];
+   struct kmem_cache   *cache;
/* Tells us whether we are polling a socket for TX.
 * We only do this when socket buffer fills up.
 * Protected by tx vq lock. */
@@ -91,11 +94,88 @@ static void tx_poll_start(struct vhost_net *net, struct 
socket *sock)
net->tx_poll_state = VHOST_NET_POLL_STARTED;
 }
 
+struct kiocb *notify_dequeue(struct vhost_virtqueue *vq)
+{
+   struct kiocb *iocb = NULL;
+   unsigned long flags;
+
+   spin_lock_irqsave(&vq->notify_lock, flags);
+   if (!list_empty(&vq->notifier)) {
+   iocb = list_first_entry(&vq->notifier,
+   struct kiocb, ki_list);
+   list_del(&iocb->ki_list);
+   }
+   spin_unlock_irqrestore(&vq->notify_lock, flags);
+   return iocb;
+}
+
+static void handle_async_rx_events_notify(struct vhost_net *net,
+   struct vhost_virtqueue *vq)
+{
+   struct kiocb *iocb = NULL;
+   struct vhost_log *vq_log = NULL;
+   int rx_total_len = 0;
+   int log, size;
+
+   if (vq->link_state != VHOST_VQ_LINK_ASYNC)
+   return;
+
+   if (vq->receiver)
+   vq->receiver(vq);
+
+   vq_log = unlikely(vhost_has_feature(
+   &net->dev, VHOST_F_LOG_ALL)) ? vq->log : NULL;
+   while ((iocb = notify_dequeue(vq)) != NULL) {
+   vhost_add_used_and_signal(&net->dev, vq,
+   iocb->ki_pos, iocb->ki_nbytes);
+   log = (int)iocb->ki_user_data;
+   size = iocb->ki_nbytes;
+   rx_total_len += iocb->ki_nbytes;
+
+   if (iocb->ki_dtor)
+   iocb->ki_dtor(iocb);
+   kmem_cache_free(net->cache, iocb);
+
+   if (unlikely(vq_log))
+   vhost_log_write(vq, vq_log, log, size);
+   if (unlikely(rx_total_len >= VHOST_NET_WEIGHT)) {
+   vhost_poll_queue(&vq->poll);
+   break;
+   }
+   }
+}
+
+static void handle_async_tx_events_notify(struct vhost_net *net,
+   struct vhost_virtqueue *vq)
+{
+   struct kiocb *iocb = NULL;
+   int tx_total_len = 0;
+
+   if (vq->link_state != VHOST_VQ_LINK_ASYNC)
+   return;
+
+   while ((iocb = notify_dequeue(vq)) != NULL) {
+   vhost_add_used_and_signal(&net->dev, vq,
+   iocb->ki_pos, 0);
+   tx_total_len += iocb->ki_nbytes;
+
+   if (iocb->ki_dtor)
+   iocb->ki_dtor(iocb);
+
+   kmem_cache_free(net->cache, iocb);
+   if (unlikely(tx_total_len >= VHOST_NET_WEIGHT)) {
+   vhost_poll_queue(&vq->poll);
+   break;
+   }
+   }
+}
+
 /* Expects to be always run from workqueue - which acts as
  * read-size critical section for our kind of RCU. */
 static void handle_tx(struct vhost_net *net)
 {
struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
+   struct kiocb *iocb = NULL;
unsigned head, out, in, s;
struct msghdr msg = {
.msg_name = NULL,
@@ -124,6 +204,8 @@ static void handle_tx(struct vhost_net *net)
tx_poll_stop(net);
hdr_size = vq->hdr_size;
 
+   handle_async_tx_events_notify(net, vq);
+
for (;;) {
head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
 ARRAY_SIZE(vq->iov),
@@ -151,6 +233,15 @@ static void handle_tx(struct vhost_net *net)
/* Skip header. TODO: support TSO. */
s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
msg.msg_iovlen = out;
+
+   if (vq->link_state == VHOST_VQ_LINK_ASYNC) {
+   iocb = kmem_cache_zalloc(net->cache, GFP_KERNEL);
+   if (!iocb)
+   break;
+   iocb->ki_pos = head;
+   iocb->private = (void *)vq;
+   }
+
len = 

[RFC] [PATCH v2 3/3] Let host NIC driver to DMA to guest user space.

2010-04-02 Thread xiaohui . xin
From: Xin Xiaohui 

The patch lets the host NIC driver receive user space skbs,
so the driver has the chance to directly DMA to guest user
space buffers thru a single ethX interface.
We want it to be more generic, as a zero-copy framework.

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Signed-off-by: Jeff Dike 
---

We consider 2 ways to utilize the user buffers, but are not sure which one
is better. Please give any comments.

One:    Modify the __alloc_skb() function a bit: it only allocates the
sk_buff structure, and the data pointer points to a user buffer
which comes from a page constructor API.
Then the shinfo of the skb is also from the guest.
When a packet is received from hardware, skb->data is filled
directly by h/w. This is the way we have implemented it.

Pros:   We can avoid any copy here.
Cons:   The guest virtio-net driver needs to allocate the skb in almost
the same way as the host NIC drivers do, i.e. the size passed to
netdev_alloc_skb() and the same reserved space in the head of the
skb. Many NIC drivers match the guest and are ok with this. But some
of the latest NIC drivers reserve special room in the skb head. To
deal with it, we suggest providing a method in the guest virtio-net
driver to ask the NIC driver for the parameters we are interested in,
once we know which device we have bound for zero-copy. Then we ask
the guest to do so.
Is that reasonable?

Two:    Modify the driver to get user buffers allocated from a page
constructor API (substituting alloc_page()); the user buffers are used
as payload buffers and filled by h/w directly when a packet is received.
The driver should associate the pages with the skb (skb_shinfo(skb)->frags).
For the head buffer side, let the host allocate the skb, and h/w fills it.
After that, the data filled in the host skb header will be copied into
the guest header buffer, which is submitted together with the payload buffer.

Pros:   We care less about how the guest or the host allocates their
buffers.
Cons:   We still need a bit copy here for the skb header.

We are not sure which way is better here. This is the first thing we want
to get comments on from the community. We wish the modification to the network
part to be generic, not used by the vhost-net backend only; a user
application may use it as well once the zero-copy device provides async
read/write operations later.


Thanks
Xiaohui

 include/linux/netdevice.h |   69 -
 include/linux/skbuff.h|   30 --
 net/core/dev.c|   63 ++
 net/core/skbuff.c |   74 
 4 files changed, 224 insertions(+), 12 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 94958c1..ba48eb0 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -485,6 +485,17 @@ struct netdev_queue {
unsigned long   tx_dropped;
 } cacheline_aligned_in_smp;
 
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+struct mpassthru_port  {
+   int hdr_len;
+   int data_len;
+   int npages;
+   unsignedflags;
+   struct socket   *sock;
+   struct skb_user_page*(*ctor)(struct mpassthru_port *,
+   struct sk_buff *, int);
+};
+#endif
 
 /*
  * This structure defines the management hooks for network devices.
@@ -636,6 +647,10 @@ struct net_device_ops {
int (*ndo_fcoe_ddp_done)(struct net_device *dev,
 u16 xid);
 #endif
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+   int (*ndo_mp_port_prep)(struct net_device *dev,
+   struct mpassthru_port *port);
+#endif
 };
 
 /*
@@ -891,7 +906,8 @@ struct net_device
struct macvlan_port *macvlan_port;
/* GARP */
struct garp_port*garp_port;
-
+   /* mpassthru */
+   struct mpassthru_port   *mp_port;
/* class/net/name entry */
struct device   dev;
/* space for optional statistics and wireless sysfs groups */
@@ -2013,6 +2029,55 @@ static inline u32 dev_ethtool_get_flags(struct net_device *dev)
return 0;
return dev->ethtool_ops->get_flags(dev);
 }
-#endif /* __KERNEL__ */
 
+/* To support zero-copy between a user space application and the NIC driver,
+ * we'd better ask the NIC driver for the capability it can provide, especially
+ * for packet split mode. For now we only ask for the header size and the
+ * payload one descriptor may carry.
+ */
+
+#if defined(CONFIG_VHOST_PASSTHRU) || defin

Re:[PATCH 1/3] A device for zero-copy based on KVM virtio-net.

2010-04-07 Thread xiaohui . xin
From: Xin Xiaohui 

---

Michael,
Thanks a lot for the explanation. I have drafted a patch for the qemu write
after I looked into the tun driver. Does it do it in the right way?

Thanks
Xiaohui

 drivers/vhost/mpassthru.c |   45 +
 1 files changed, 45 insertions(+), 0 deletions(-)

diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
index e9449ac..1cde097 100644
--- a/drivers/vhost/mpassthru.c
+++ b/drivers/vhost/mpassthru.c
@@ -1065,6 +1065,49 @@ static unsigned int mp_chr_poll(struct file *file, poll_table * wait)
return mask;
 }
 
+static ssize_t mp_chr_aio_write(struct kiocb *iocb, const struct iovec *iov,
+   unsigned long count, loff_t pos)
+{
+   struct file *file = iocb->ki_filp;
+   struct mp_struct *mp;
+   struct sock *sk;
+   struct sk_buff *skb;
+   int len, err;
+   ssize_t result;
+
+   /* currently, async is not supported */
+   if (!is_sync_kiocb(iocb))
+   return -EFAULT;
+
+   mp = mp_get(file->private_data);
+   if (!mp)
+   return -EBADFD;
+
+   sk = mp->socket.sk;
+
+   len = iov_length(iov, count);
+   skb = sock_alloc_send_skb(sk, len + NET_IP_ALIGN,
+ file->f_flags & O_NONBLOCK, &err);
+
+   if (!skb) {
+   mp_put(mp);
+   return -EFAULT;
+   }
+
+   skb_reserve(skb, NET_IP_ALIGN);
+   skb_put(skb, len);
+
+   if (skb_copy_datagram_from_iovec(skb, 0, iov, 0, len)) {
+   kfree_skb(skb);
+   mp_put(mp);
+   return -EFAULT;
+   }
+   skb_set_network_header(skb, ETH_HLEN);
+   skb->protocol = *((__be16 *)(skb->data) + ETH_ALEN);
+   skb->dev = mp->dev;
+
+   dev_queue_xmit(skb);
+   mp->dev->stats.tx_packets++;
+   mp->dev->stats.tx_bytes += len;
+   result = len;
+
+   mp_put(mp);
+   return result;
+}
+
 static int mp_chr_close(struct inode *inode, struct file *file)
 {
struct mp_file *mfile = file->private_data;
@@ -1084,6 +1127,8 @@ static int mp_chr_close(struct inode *inode, struct file *file)
 static const struct file_operations mp_fops = {
.owner  = THIS_MODULE,
.llseek = no_llseek,
+   .write  = do_sync_write,
+   .aio_write = mp_chr_aio_write,
.poll   = mp_chr_poll,
.unlocked_ioctl = mp_chr_ioctl,
.open   = mp_chr_open,
-- 
1.5.4.4



Re:[PATCH v1 2/3] Provides multiple submits and asynchronous notifications.

2010-04-08 Thread xiaohui . xin
From: Xin Xiaohui 

---
Michael,
This is a small patch for the write logging issue with the async queue.
I have made a __vhost_get_vq_desc() function which can compute the log
info for any valid buffer index. The __vhost_get_vq_desc() code comes
from vhost_get_vq_desc().
I use it to recompute the log info when logging is enabled.

Thanks
Xiaohui

 drivers/vhost/net.c   |   27 ---
 drivers/vhost/vhost.c |  115 -
 drivers/vhost/vhost.h |5 ++
 3 files changed, 90 insertions(+), 57 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 2aafd90..00a45ef 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -115,7 +115,8 @@ static void handle_async_rx_events_notify(struct vhost_net *net,
struct kiocb *iocb = NULL;
struct vhost_log *vq_log = NULL;
int rx_total_len = 0;
-   int log, size;
+   unsigned int head, log, in, out;
+   int size;
 
if (vq->link_state != VHOST_VQ_LINK_ASYNC)
return;
@@ -130,14 +131,25 @@ static void handle_async_rx_events_notify(struct vhost_net *net,
iocb->ki_pos, iocb->ki_nbytes);
log = (int)iocb->ki_user_data;
size = iocb->ki_nbytes;
+   head = iocb->ki_pos;
rx_total_len += iocb->ki_nbytes;
 
if (iocb->ki_dtor)
iocb->ki_dtor(iocb);
kmem_cache_free(net->cache, iocb);
 
-   if (unlikely(vq_log))
+   /* when log is enabled, recomputing the log info is needed,
+* since these buffers are in async queue, and may not get
+* the log info before.
+*/
+   if (unlikely(vq_log)) {
+   if (!log)
+   __vhost_get_vq_desc(&net->dev, vq, vq->iov,
+   ARRAY_SIZE(vq->iov),
+   &out, &in, vq_log,
+   &log, head);
vhost_log_write(vq, vq_log, log, size);
+   }
if (unlikely(rx_total_len >= VHOST_NET_WEIGHT)) {
vhost_poll_queue(&vq->poll);
break;
@@ -313,14 +325,13 @@ static void handle_rx(struct vhost_net *net)
vhost_disable_notify(vq);
hdr_size = vq->hdr_size;
 
-   /* In async cases, for write logging, the simple way is to get
-* the log info always, and really logging is decided later.
-* Thus, when logging enabled, we can get log, and when logging
-* disabled, we can get log disabled accordingly.
+   /* In async cases, when write log is enabled, in case the submitted
+* buffers did not get log info before the log enabling, so we'd
+* better recompute the log info when needed. We do this in
+* handle_async_rx_events_notify().
 */
 
-   vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) |
-   (vq->link_state == VHOST_VQ_LINK_ASYNC) ?
+   vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
vq->log : NULL;
 
handle_async_rx_events_notify(net, vq);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 97233d5..53dab80 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -715,66 +715,21 @@ static unsigned get_indirect(struct vhost_dev *dev, struct vhost_virtqueue *vq,
return 0;
 }
 
-/* This looks in the virtqueue and for the first available buffer, and converts
- * it to an iovec for convenient access.  Since descriptors consist of some
- * number of output then some number of input descriptors, it's actually two
- * iovecs, but we pack them into one and note how many of each there were.
- *
- * This function returns the descriptor number found, or vq->num (which
- * is never a valid descriptor number) if none was found. */
-unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
+unsigned __vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
   struct iovec iov[], unsigned int iov_size,
   unsigned int *out_num, unsigned int *in_num,
-  struct vhost_log *log, unsigned int *log_num)
+  struct vhost_log *log, unsigned int *log_num,
+  unsigned int head)
 {
struct vring_desc desc;
-   unsigned int i, head, found = 0;
-   u16 last_avail_idx;
+   unsigned int i = head, found = 0;
int ret;
 
-   /* Check it isn't doing very strange things with descriptor numbers. */
-   last_avail_idx = vq->last_avail_idx;
-   if (get_user(vq->avail_idx, &vq->avail->idx)) {
-   vq_err(vq, "Failed to access avail idx at %p\n",
-  &vq->avail->idx);
-

[RFC][PATCH v3 0/3] Provide a zero-copy method on KVM virtio-net.

2010-04-09 Thread xiaohui . xin
The idea is simple: just pin the guest VM user space and then
let the host NIC driver have the chance to DMA directly to it.
The patches are based on the vhost-net backend driver. We add a device
which provides proto_ops as sendmsg/recvmsg to vhost-net to
send/recv directly to/from the NIC driver. A KVM guest which uses the
vhost-net backend may bind any ethX interface on the host side to
get copyless data transfer thru the guest virtio-net frontend.

The scenario is like this:

The guest virtio-net driver submits multiple requests thru the vhost-net
backend driver to the kernel. The requests are queued and then
completed after the corresponding actions in h/w are done.

For read, user space buffers are dispensed to the NIC driver for rx when
a page constructor API is invoked; that means NICs can allocate user buffers
from a page constructor. We add a hook in the netif_receive_skb() function
to intercept the incoming packets and notify the zero-copy device.

For write, the zero-copy device may allocate a new host skb, put the
payload on skb_shinfo(skb)->frags, and copy the header to skb->data.
The request remains pending until the skb is transmitted by h/w.
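
A minimal sketch of the kind of hook meant above, assuming the series is
applied so that net_device carries an mp_port when bound for zero-copy;
mp_port_rx_notify() is only an illustrative name, not the actual function:

#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* hypothetical helper exported by the zero-copy (mp) device */
extern void mp_port_rx_notify(struct mpassthru_port *port, struct sk_buff *skb);

/* called early in netif_receive_skb() */
static inline bool mp_intercept_rx(struct sk_buff *skb)
{
        struct net_device *dev = skb->dev;

        if (!dev || !dev->mp_port)
                return false;   /* not bound for zero-copy, normal path */

        /* The payload already sits in pinned guest pages, so hand the
         * skb to the zero-copy device to complete the guest rx request
         * instead of passing it up the host stack.
         */
        mp_port_rx_notify(dev->mp_port, skb);
        return true;            /* consumed */
}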

Here, we have considered 2 ways to utilize the page constructor
API to dispense the user buffers.

One:    Modify the __alloc_skb() function a bit: it only allocates the
sk_buff structure, and the data pointer points to a
user buffer which comes from a page constructor API.
The shinfo of the skb is then also from the guest.
When a packet is received from hardware, skb->data is filled
directly by h/w. This is what we have implemented.

Pros:   We can avoid any copy here.
Cons:   The guest virtio-net driver needs to allocate skbs in almost
the same way as the host NIC drivers, e.g. the size used by
netdev_alloc_skb() and the same reserved space in the
head of the skb. Many NIC drivers match the guest and are
ok for this. But some of the latest NIC drivers reserve special
room in the skb head. To deal with that, we suggest providing
a method in the guest virtio-net driver to ask the NIC driver
for the parameters we are interested in once we know which device
we have bound for zero-copy. Then we ask the guest to do so.
Is that reasonable?

Two:    Modify the driver to get user buffers allocated from a page constructor
API (substituting alloc_page()); the user buffers are used as payload
buffers and filled by h/w directly when a packet is received. The driver
should associate the pages with the skb (skb_shinfo(skb)->frags). For
the head buffer side, let the host allocate the skb, and h/w fills it.
After that, the data filled into the host skb header is copied into the
guest header buffer, which is submitted together with the payload buffer.

Pros:   We care less about how the guest or host allocates its buffers.
Cons:   We still need a small copy here for the skb header.

We are not sure which way is better here. This is the first thing we want
to get comments on from the community. We wish the modification to the network
part to be generic, not used by the vhost-net backend only: a user
application may use it as well once the zero-copy device provides async
read/write operations later.

Please give comments especially for the network part modifications.


We provide multiple submits and asynchronous notification to
vhost-net too.

Our goal is to improve the bandwidth and reduce the CPU usage.
Exact performance data will be provided later. But in a simple
test with netperf, we found bandwidth up and CPU % up too,
but the bandwidth increase ratio is much larger than the CPU % increase ratio.

What we have not done yet:
packet split support
GRO support
performance tuning

what we have done in v1:
polish the RCU usage
deal with write logging in asynchronous mode in vhost
add a notifier block for the mp device
rename page_ctor to mp_port in netdevice.h to make it look generic
add mp_dev_change_flags() for the mp device to change the NIC state
add CONFIG_VHOST_MPASSTHRU to limit the usage when the module is not loaded
a small fix for a missing dev_put on failure
use a dynamic minor instead of a static minor number
a __KERNEL__ guard for mp_get_sock()

what we have done in v2:

remove most of the RCU usage, since the ctor pointer is only
changed by the BIND/UNBIND ioctl, and during that time the NIC will be
stopped to get a good cleanup (all outstanding requests are finished),
so the ctor pointer cannot be raced into a wrong situation.

Replace the struct vhost_notifier with struct kiocb.
Let the vhost-net backend alloc/free the kiocbs and transfer them
via sendmsg/recvmsg.

use get_user_pages_fast() and set_page_dirty_lock() (a pinning sketch follows below)
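
The pinning itself follows the usual pattern; a minimal sketch, assuming the
buffers are written by the NIC (illustrative helper names, the real code in
mpassthru.c carries more per-request state):

#include <linux/mm.h>

static int mp_pin_user_buffer(unsigned long uaddr, int npages,
                              struct page **pages)
{
        /* write=1: the NIC will DMA into these pages */
        int got = get_user_pages_fast(uaddr, npages, 1, pages);

        if (got == npages)
                return 0;

        /* partial pin: release whatever we did get */
        while (--got >= 0)
                put_page(pages[got]);
        return -EFAULT;
}

static void mp_unpin_user_buffer(struct page **pages, int npages)
{
        int i;

        for (i = 0; i < npages; i++) {
                set_page_dirty_lock(pages[i]);  /* DMA wrote the data */
                put_page(pages[i]);
        }
}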

[RFC][PATCH v3 1/3] A device for zero-copy based on KVM virtio-net.

2010-04-09 Thread xiaohui . xin
From: Xin Xiaohui 

Add a device to utilize the vhost-net backend driver for
copy-less data transfer between guest FE and host NIC.
It pins the guest user space to the host memory and
provides proto_ops as sendmsg/recvmsg to vhost-net.

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---

memory leak fixed,
kconfig made, 
do_unbind() made,
mp_chr_ioctl() cleaned up and
some other cleanups made
 
by Jeff Dike 

 drivers/vhost/Kconfig |5 +
 drivers/vhost/Makefile|2 +
 drivers/vhost/mpassthru.c | 1264 +
 include/linux/mpassthru.h |   29 +
 4 files changed, 1300 insertions(+), 0 deletions(-)
 create mode 100644 drivers/vhost/mpassthru.c
 create mode 100644 include/linux/mpassthru.h

diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index 9f409f4..ee32a3b 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -9,3 +9,8 @@ config VHOST_NET
  To compile this driver as a module, choose M here: the module will
  be called vhost_net.
 
+config VHOST_PASSTHRU
+   tristate "Zerocopy network driver (EXPERIMENTAL)"
+   depends on VHOST_NET
+   ---help---
+ zerocopy network I/O support
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index 72dd020..3f79c79 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -1,2 +1,4 @@
 obj-$(CONFIG_VHOST_NET) += vhost_net.o
 vhost_net-y := vhost.o net.o
+
+obj-$(CONFIG_VHOST_PASSTHRU) += mpassthru.o
diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
new file mode 100644
index 000..86d2525
--- /dev/null
+++ b/drivers/vhost/mpassthru.c
@@ -0,0 +1,1264 @@
+/*
+ *  MPASSTHRU - Mediate passthrough device.
+ *  Copyright (C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ */
+
+#define DRV_NAME"mpassthru"
+#define DRV_DESCRIPTION "Mediate passthru device driver"
+#define DRV_COPYRIGHT   "(C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G"
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#include "vhost.h"
+
+/* Uncomment to enable debugging */
+/* #define MPASSTHRU_DEBUG 1 */
+
+#ifdef MPASSTHRU_DEBUG
+static int debug;
+
+#define DBG  if (mp->debug) printk
+#define DBG1 if (debug == 2) printk
+#else
+#define DBG(a...)
+#define DBG1(a...)
+#endif
+
+#define COPY_THRESHOLD (L1_CACHE_BYTES * 4)
+#define COPY_HDR_LEN   (L1_CACHE_BYTES < 64 ? 64 : L1_CACHE_BYTES)
+
+struct frag {
+   u16 offset;
+   u16 size;
+};
+
+struct page_ctor {
+   struct list_headreadq;
+   int w_len;
+   int r_len;
+   spinlock_t  read_lock;
+   struct kmem_cache   *cache;
+   /* record the locked pages */
+   int lock_pages;
+   struct rlimit   o_rlim;
+   struct net_device   *dev;
+   struct mpassthru_port   port;
+};
+
+struct page_info {
+   void*ctrl;
+   struct list_headlist;
+   int header;
+   /* indicate the actual length of bytes
+* send/recv in the user space buffers
+*/
+   int total;
+   int offset;
+   struct page *pages[MAX_SKB_FRAGS+1];
+   struct skb_frag_struct  frag[MAX_SKB_FRAGS+1];
+   struct sk_buff  *skb;
+   struct page_ctor*ctor;
+
+   /* The pointer relayed to skb, to indicate
+* it's a user space allocated skb or kernel
+*/
+   struct skb_user_pageuser;
+   struct skb_shared_info  ushinfo;
+
+#define INFO_READ  0
+#define INFO_WRITE 1
+   unsignedflags;
+   unsignedpnum;
+
+   /* It's meaningful for receive, means
+* the max length allowed
+*/
+   size_t  len;
+
+   /* The fields after that is for backend
+* driver, now for vhost-net.
+*/
+
+   struct kiocb*iocb;
+   unsigned intdesc_pos;
+   unsigned intlog;
+   struct iovec  

[RFC][PATCH v3 3/3] Let host NIC driver to DMA to guest user space.

2010-04-09 Thread xiaohui . xin
From: Xin Xiaohui 

The patch lets the host NIC driver receive user space skbs,
so the driver has the chance to DMA directly to guest user
space buffers thru a single ethX interface.

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---

alloc_skb() cleanup by Jeff Dike 

 include/linux/netdevice.h |   69 -
 include/linux/skbuff.h|   30 --
 net/core/dev.c|   63 ++
 net/core/skbuff.c |   74 
 4 files changed, 224 insertions(+), 12 deletions(-)
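
As an illustration of the new ndo_mp_port_prep hook added below: a driver
that does packet split might report its parameters roughly like this
(hypothetical driver name; the values shown are just the igb-like defaults
used by netdev_mp_port_prep() when the hook is absent):

static int foo_ndo_mp_port_prep(struct net_device *dev,
                                struct mpassthru_port *port)
{
        port->hdr_len  = 128;   /* h/w writes packet headers here          */
        port->data_len = 2048;  /* payload bytes one rx descriptor carries */
        port->npages   = 1;     /* pages backing one payload buffer        */
        return 0;
}

/* hooked up in the driver's net_device_ops:
 *      .ndo_mp_port_prep = foo_ndo_mp_port_prep,
 */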

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 94958c1..ba48eb0 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -485,6 +485,17 @@ struct netdev_queue {
unsigned long   tx_dropped;
 } cacheline_aligned_in_smp;
 
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+struct mpassthru_port  {
+   int hdr_len;
+   int data_len;
+   int npages;
+   unsignedflags;
+   struct socket   *sock;
+   struct skb_user_page*(*ctor)(struct mpassthru_port *,
+   struct sk_buff *, int);
+};
+#endif
 
 /*
  * This structure defines the management hooks for network devices.
@@ -636,6 +647,10 @@ struct net_device_ops {
int (*ndo_fcoe_ddp_done)(struct net_device *dev,
 u16 xid);
 #endif
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+   int (*ndo_mp_port_prep)(struct net_device *dev,
+   struct mpassthru_port *port);
+#endif
 };
 
 /*
@@ -891,7 +906,8 @@ struct net_device
struct macvlan_port *macvlan_port;
/* GARP */
struct garp_port*garp_port;
-
+   /* mpassthru */
+   struct mpassthru_port   *mp_port;
/* class/net/name entry */
struct device   dev;
/* space for optional statistics and wireless sysfs groups */
@@ -2013,6 +2029,55 @@ static inline u32 dev_ethtool_get_flags(struct net_device *dev)
return 0;
return dev->ethtool_ops->get_flags(dev);
 }
-#endif /* __KERNEL__ */
 
+/* To support zero-copy between a user space application and the NIC driver,
+ * we'd better ask the NIC driver for the capability it can provide, especially
+ * for packet split mode. For now we only ask for the header size and the
+ * payload one descriptor may carry.
+ */
+
+#if defined(CONFIG_VHOST_PASSTHRU) || defined(CONFIG_VHOST_PASSTHRU_MODULE)
+static inline int netdev_mp_port_prep(struct net_device *dev,
+   struct mpassthru_port *port)
+{
+   int rc;
+   int npages, data_len;
+   const struct net_device_ops *ops = dev->netdev_ops;
+
+   /* needed by packet split */
+   if (ops->ndo_mp_port_prep) {
+   rc = ops->ndo_mp_port_prep(dev, port);
+   if (rc)
+   return rc;
+   } else {
+   /* If the NIC driver did not report this,
+* then fall back to igb-like defaults.
+*/
+   port->hdr_len = 128;
+   port->data_len = 2048;
+   port->npages = 1;
+   }
+
+   if (port->hdr_len <= 0)
+   goto err;
+
+   npages = port->npages;
+   data_len = port->data_len;
+   if (npages <= 0 || npages > MAX_SKB_FRAGS ||
+   (data_len < PAGE_SIZE * (npages - 1) ||
+data_len > PAGE_SIZE * npages))
+   goto err;
+
+   return 0;
+err:
+   dev_warn(&dev->dev, "invalid page constructor parameters\n");
+
+   return -EINVAL;
+}
+
+extern int netdev_mp_port_attach(struct net_device *dev,
+   struct mpassthru_port *port);
+extern void netdev_mp_port_detach(struct net_device *dev);
+#endif /* CONFIG_VHOST_PASSTHRU */
+#endif /* __KERNEL__ */
 #endif /* _LINUX_NETDEVICE_H */
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index df7b23a..e59fa57 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -209,6 +209,13 @@ struct skb_shared_info {
void *  destructor_arg;
 };
 
+struct skb_user_page {
+   u8  *start;
+   int size;
+   struct skb_frag_struct *frags;
+   struct skb_shared_info *ushinfo;
+   void(*dtor)(struct skb_user_page *);
+};
 /* We divide dataref into two halves.  The higher 16 bits hold references
  * to the payload part of skb->data.  The lower 16 bits hold references to
  * the entire skb->data.  A clone of a headerless skb holds the length of
@@ -441,17 +448,18 @@ extern void kfree_skb(struct sk_buff *skb);
 extern void consume_skb(struct sk_buff *skb);
 extern void   __kfree_skb(struct sk_buff *

[RFC][PATCH v3 2/3] Provides multiple submits and asynchronous notifications.

2010-04-09 Thread xiaohui . xin
From: Xin Xiaohui 

The vhost-net backend now only supports synchronous send/recv
operations. The patch provides multiple submits and asynchronous
notifications. This is needed for the zero-copy case.

Signed-off-by: Xin Xiaohui 
---
 drivers/vhost/net.c   |  203 +++--
 drivers/vhost/vhost.c |  115 
 drivers/vhost/vhost.h |   15 
 3 files changed, 278 insertions(+), 55 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 22d5fef..d3fb3fc 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -17,11 +17,13 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -47,6 +49,7 @@ struct vhost_net {
struct vhost_dev dev;
struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
struct vhost_poll poll[VHOST_NET_VQ_MAX];
+   struct kmem_cache   *cache;
/* Tells us whether we are polling a socket for TX.
 * We only do this when socket buffer fills up.
 * Protected by tx vq lock. */
@@ -91,11 +94,100 @@ static void tx_poll_start(struct vhost_net *net, struct socket *sock)
net->tx_poll_state = VHOST_NET_POLL_STARTED;
 }
 
+struct kiocb *notify_dequeue(struct vhost_virtqueue *vq)
+{
+   struct kiocb *iocb = NULL;
+   unsigned long flags;
+
+   spin_lock_irqsave(&vq->notify_lock, flags);
+   if (!list_empty(&vq->notifier)) {
+   iocb = list_first_entry(&vq->notifier,
+   struct kiocb, ki_list);
+   list_del(&iocb->ki_list);
+   }
+   spin_unlock_irqrestore(&vq->notify_lock, flags);
+   return iocb;
+}
+
+static void handle_async_rx_events_notify(struct vhost_net *net,
+   struct vhost_virtqueue *vq)
+{
+   struct kiocb *iocb = NULL;
+   struct vhost_log *vq_log = NULL;
+   int rx_total_len = 0;
+   unsigned int head, log, in, out;
+   int size;
+
+   if (vq->link_state != VHOST_VQ_LINK_ASYNC)
+   return;
+
+   if (vq->receiver)
+   vq->receiver(vq);
+
+   vq_log = unlikely(vhost_has_feature(
+   &net->dev, VHOST_F_LOG_ALL)) ? vq->log : NULL;
+   while ((iocb = notify_dequeue(vq)) != NULL) {
+   vhost_add_used_and_signal(&net->dev, vq,
+   iocb->ki_pos, iocb->ki_nbytes);
+   log = (int)iocb->ki_user_data;
+   size = iocb->ki_nbytes;
+   head = iocb->ki_pos;
+   rx_total_len += iocb->ki_nbytes;
+
+   if (iocb->ki_dtor)
+   iocb->ki_dtor(iocb);
+   kmem_cache_free(net->cache, iocb);
+
+   /* when log is enabled, recomputing the log info is needed,
+* since these buffers are in async queue, and may not get
+* the log info before.
+*/
+   if (unlikely(vq_log)) {
+   if (!log)
+   __vhost_get_vq_desc(&net->dev, vq, vq->iov,
+   ARRAY_SIZE(vq->iov),
+   &out, &in, vq_log,
+   &log, head);
+   vhost_log_write(vq, vq_log, log, size);
+   }
+   if (unlikely(rx_total_len >= VHOST_NET_WEIGHT)) {
+   vhost_poll_queue(&vq->poll);
+   break;
+   }
+   }
+}
+
+static void handle_async_tx_events_notify(struct vhost_net *net,
+   struct vhost_virtqueue *vq)
+{
+   struct kiocb *iocb = NULL;
+   int tx_total_len = 0;
+
+   if (vq->link_state != VHOST_VQ_LINK_ASYNC)
+   return;
+
+   while ((iocb = notify_dequeue(vq)) != NULL) {
+   vhost_add_used_and_signal(&net->dev, vq,
+   iocb->ki_pos, 0);
+   tx_total_len += iocb->ki_nbytes;
+
+   if (iocb->ki_dtor)
+   iocb->ki_dtor(iocb);
+
+   kmem_cache_free(net->cache, iocb);
+   if (unlikely(tx_total_len >= VHOST_NET_WEIGHT)) {
+   vhost_poll_queue(&vq->poll);
+   break;
+   }
+   }
+}
+
 /* Expects to be always run from workqueue - which acts as
  * read-size critical section for our kind of RCU. */
 static void handle_tx(struct vhost_net *net)
 {
struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
+   struct kiocb *iocb = NULL;
unsigned head, out, in, s;
struct msghdr msg = {
.msg_name = NULL,
@@ -124,6 +216,8 @@ static void handle_tx(struct vhost_net *net)
tx_poll_stop(net);
hdr_size = vq->hdr_size;
 
+   handle_async_tx_events_notify(net, vq);
+
for (;;) {
head

Re: [RFC PATCH v9 12/16] Add mp(mediate passthru) device.

2010-09-20 Thread xiaohui . xin
From: Xin Xiaohui 

---
Michael,
I have moved the ioctl that configures the locked memory to vhost, and I
check the limit against mm->locked_vm. Please have a look.

Thanks
Xiaohui
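
For reference, the usual RLIMIT_MEMLOCK accounting pattern looks roughly like
the sketch below (illustrative helper name; the alloc_page_info() change in
this patch only performs the limit check, and the charge/uncharge of
mm->locked_vm would live with the pin/unpin paths):

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/capability.h>

static int mp_account_pinned(struct mm_struct *mm, unsigned long npages)
{
        unsigned long locked, lock_limit;
        int ret = 0;

        down_write(&mm->mmap_sem);
        locked = npages + mm->locked_vm;
        lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

        if (locked > lock_limit && !capable(CAP_IPC_LOCK))
                ret = -ENOMEM;          /* over the memlock limit */
        else
                mm->locked_vm = locked; /* charge the pinned pages */
        up_write(&mm->mmap_sem);

        return ret;
}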

 drivers/vhost/mpassthru.c |   74 +--
 drivers/vhost/net.c   |   78 ++--
 include/linux/vhost.h |3 ++
 3 files changed, 85 insertions(+), 70 deletions(-)

diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
index d86d94c..fd3827b 100644
--- a/drivers/vhost/mpassthru.c
+++ b/drivers/vhost/mpassthru.c
@@ -109,9 +109,6 @@ struct page_ctor {
int wq_len;
int rq_len;
spinlock_t  read_lock;
-   /* record the locked pages */
-   int lock_pages;
-   struct rlimit   o_rlim;
struct net_device   *dev;
struct mpassthru_port   port;
struct page_info**hash_table;
@@ -231,7 +228,6 @@ static int page_ctor_attach(struct mp_struct *mp)
ctor->port.ctor = page_ctor;
ctor->port.sock = &mp->socket;
ctor->port.hash = mp_lookup;
-   ctor->lock_pages = 0;
 
/* locked by mp_mutex */
dev->mp_port = &ctor->port;
@@ -264,37 +260,6 @@ struct page_info *info_dequeue(struct page_ctor *ctor)
return info;
 }
 
-static int set_memlock_rlimit(struct page_ctor *ctor, int resource,
- unsigned long cur, unsigned long max)
-{
-   struct rlimit new_rlim, *old_rlim;
-   int retval;
-
-   if (resource != RLIMIT_MEMLOCK)
-   return -EINVAL;
-   new_rlim.rlim_cur = cur;
-   new_rlim.rlim_max = max;
-
-   old_rlim = current->signal->rlim + resource;
-
-   /* remember the old rlimit value when backend enabled */
-   ctor->o_rlim.rlim_cur = old_rlim->rlim_cur;
-   ctor->o_rlim.rlim_max = old_rlim->rlim_max;
-
-   if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
-   !capable(CAP_SYS_RESOURCE))
-   return -EPERM;
-
-   retval = security_task_setrlimit(resource, &new_rlim);
-   if (retval)
-   return retval;
-
-   task_lock(current->group_leader);
-   *old_rlim = new_rlim;
-   task_unlock(current->group_leader);
-   return 0;
-}
-
 static void relinquish_resource(struct page_ctor *ctor)
 {
if (!(ctor->dev->flags & IFF_UP) &&
@@ -322,8 +287,6 @@ static void mp_ki_dtor(struct kiocb *iocb)
info->ctor->rq_len--;
} else
info->ctor->wq_len--;
-   /* Decrement the number of locked pages */
-   info->ctor->lock_pages -= info->pnum;
kmem_cache_free(ext_page_info_cache, info);
relinquish_resource(info->ctor);
 
@@ -349,7 +312,7 @@ static struct kiocb *create_iocb(struct page_info *info, int size)
iocb->ki_dtor(iocb);
iocb->private = (void *)info;
iocb->ki_dtor = mp_ki_dtor;
-
+   iocb->ki_user_data = info->pnum;
return iocb;
 }
 
@@ -375,10 +338,6 @@ static int page_ctor_detach(struct mp_struct *mp)
 
relinquish_resource(ctor);
 
-   set_memlock_rlimit(ctor, RLIMIT_MEMLOCK,
-  ctor->o_rlim.rlim_cur,
-  ctor->o_rlim.rlim_max);
-
/* locked by mp_mutex */
ctor->dev->mp_port = NULL;
dev_put(ctor->dev);
@@ -565,21 +524,23 @@ static struct page_info *alloc_page_info(struct page_ctor *ctor,
int rc;
int i, j, n = 0;
int len;
-   unsigned long base, lock_limit;
+   unsigned long base, lock_limit, locked;
struct page_info *info = NULL;
 
-   lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
-   lock_limit >>= PAGE_SHIFT;
+   down_write(&current->mm->mmap_sem);
+   locked = count + current->mm->locked_vm;
+   lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 
-   if (ctor->lock_pages + count > lock_limit && npages) {
-   printk(KERN_INFO "exceed the locked memory rlimit.");
-   return NULL;
-   }
+   if ((locked > lock_limit) && !capable(CAP_IPC_LOCK))
+   goto out;
 
info = kmem_cache_alloc(ext_page_info_cache, GFP_KERNEL);

if (!info)
-   return NULL;
+   goto out;
+
+   up_write(&current->mm->mmap_sem);
+
info->skb = NULL;
info->next = info->prev = NULL;
 
@@ -633,8 +594,7 @@ static struct page_info *alloc_page_info(struct page_ctor *ctor,
for (i = 0; i < j; i++)
mp_hash_insert(ctor, info->pages[i], info);
}
-   /* increment the number of locked pages */
-   ctor->lock_pages += j;
+
return info;
 
 failed:
@@ -642,7 +602,9 @@ failed:
put_page(info->pages[i]);
 
kmem_cache_free(ext_page_info_cache, info);
-
+   return NULL;
+out:
+   up_write(&current->mm->mmap_sem);
return NULL;
 }
 
@@ -1006,12 +

[PATCH v11 05/17] Add a function to indicate if device use external buffer.

2010-09-24 Thread xiaohui . xin
From: Xin Xiaohui 

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 include/linux/netdevice.h |5 +
 1 files changed, 5 insertions(+), 0 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5f192de..23d6ec0 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1602,6 +1602,11 @@ extern gro_result_t  napi_gro_frags(struct napi_struct *napi);
 extern int netdev_mp_port_prep(struct net_device *dev,
struct mpassthru_port *port);
 
+static inline bool dev_is_mpassthru(struct net_device *dev)
+{
+   return dev && dev->mp_port;
+}
+
 static inline void napi_free_frags(struct napi_struct *napi)
 {
kfree_skb(napi->skb);
-- 
1.7.3



[PATCH v11 13/17] Add mp(mediate passthru) device.

2010-09-24 Thread xiaohui . xin
From: Xin Xiaohui 

The patch adds the mp (mediate passthru) device, which is now
based on the vhost-net backend driver and provides proto_ops
to send/receive guest buffer data from/to the guest virtio-net
driver.

Signed-off-by: Xin Xiaohui 
Signed-off-by: Zhao Yu 
Reviewed-by: Jeff Dike 
---
 drivers/vhost/mpassthru.c | 1407 +
 1 files changed, 1407 insertions(+), 0 deletions(-)
 create mode 100644 drivers/vhost/mpassthru.c

diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
new file mode 100644
index 000..d86d94c
--- /dev/null
+++ b/drivers/vhost/mpassthru.c
@@ -0,0 +1,1407 @@
+/*
+ *  MPASSTHRU - Mediate passthrough device.
+ *  Copyright (C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ */
+
+#define DRV_NAME"mpassthru"
+#define DRV_DESCRIPTION "Mediate passthru device driver"
+#define DRV_COPYRIGHT   "(C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G"
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+/* Uncomment to enable debugging */
+/* #define MPASSTHRU_DEBUG 1 */
+
+#ifdef MPASSTHRU_DEBUG
+static int debug;
+
+#define DBG  if (mp->debug) printk
+#define DBG1 if (debug == 2) printk
+#else
+#define DBG(a...)
+#define DBG1(a...)
+#endif
+
+#define COPY_THRESHOLD (L1_CACHE_BYTES * 4)
+#define COPY_HDR_LEN   (L1_CACHE_BYTES < 64 ? 64 : L1_CACHE_BYTES)
+
+struct frag {
+   u16 offset;
+   u16 size;
+};
+
+#defineHASH_BUCKETS(8192*2)
+
+struct page_info {
+   struct list_headlist;
+   struct page_info*next;
+   struct page_info*prev;
+   struct page *pages[MAX_SKB_FRAGS];
+   struct sk_buff  *skb;
+   struct page_ctor*ctor;
+
+   /* The pointer relayed to skb, to indicate
+* it's a external allocated skb or kernel
+*/
+   struct skb_ext_pageext_page;
+
+#define INFO_READ  0
+#define INFO_WRITE 1
+   unsignedflags;
+   unsignedpnum;
+
+   /* The fields after that is for backend
+* driver, now for vhost-net.
+*/
+
+   struct kiocb*iocb;
+   unsigned intdesc_pos;
+   struct iovechdr[2];
+   struct ioveciov[MAX_SKB_FRAGS];
+};
+
+static struct kmem_cache *ext_page_info_cache;
+
+struct page_ctor {
+   struct list_headreadq;
+   int wq_len;
+   int rq_len;
+   spinlock_t  read_lock;
+   /* record the locked pages */
+   int lock_pages;
+   struct rlimit   o_rlim;
+   struct net_device   *dev;
+   struct mpassthru_port   port;
+   struct page_info**hash_table;
+};
+
+struct mp_struct {
+   struct mp_file  *mfile;
+   struct net_device   *dev;
+   struct page_ctor*ctor;
+   struct socket   socket;
+
+#ifdef MPASSTHRU_DEBUG
+   int debug;
+#endif
+};
+
+struct mp_file {
+   atomic_t count;
+   struct mp_struct *mp;
+   struct net *net;
+};
+
+struct mp_sock {
+   struct sock sk;
+   struct mp_struct*mp;
+};
+
+static int mp_dev_change_flags(struct net_device *dev, unsigned flags)
+{
+   int ret = 0;
+
+   rtnl_lock();
+   ret = dev_change_flags(dev, flags);
+   rtnl_unlock();
+
+   if (ret < 0)
+   printk(KERN_ERR "failed to change dev state of %s\n", dev->name);
+
+   return ret;
+}
+
+/* The main function to allocate external buffers */
+static struct skb_ext_page *page_ctor(struct mpassthru_port *port,
+   struct sk_buff *skb, int npages)
+{
+   int i;
+   unsigned long flags;
+   struct page_ctor *ctor;
+   struct page_info *info = NULL;
+
+   ctor = container_of(port, struct page_ctor, port);
+
+   spin_lock_irqsave(&ctor->read_lock, flags);
+   if (!list_empty(&ctor->readq)) {
+   info = list_first_entry(&ctor->readq, struct page_info, list);
+   list_del(&info->list);
+   ctor->rq_len--;
+   }
+   spin_unlock_irqrestore(&ctor->re

[PATCH v11 12/17] Add a kconfig entry and make entry for mp device.

2010-09-24 Thread xiaohui . xin
From: Xin Xiaohui 

Signed-off-by: Xin Xiaohui 
Reviewed-by: Jeff Dike 
---
 drivers/vhost/Kconfig  |   10 ++
 drivers/vhost/Makefile |2 ++
 2 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index e4e2fd1..a6b8cbf 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -9,3 +9,13 @@ config VHOST_NET
  To compile this driver as a module, choose M here: the module will
  be called vhost_net.
 
+config MEDIATE_PASSTHRU
+   tristate "mediate passthru network driver (EXPERIMENTAL)"
+   depends on VHOST_NET
+   ---help---
+ zerocopy network I/O support; we call it mediate passthru to
+ distinguish it from hardware passthru.
+
+ To compile this driver as a module, choose M here: the module will
+ be called mpassthru.
+
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index 72dd020..c18b9fc 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -1,2 +1,4 @@
 obj-$(CONFIG_VHOST_NET) += vhost_net.o
 vhost_net-y := vhost.o net.o
+
+obj-$(CONFIG_MEDIATE_PASSTHRU) += mpassthru.o
-- 
1.7.3


