Re:[RFC][PATCH v3 1/3] A device for zero-copy based on KVM virtio-net.

2010-04-22 Thread xiaohui . xin
From: Xin Xiaohui xiaohui@intel.com

Add a device to utilize the vhost-net backend driver for
copy-less data transfer between guest FE and host NIC.
It pins the guest user space to the host memory and
provides proto_ops as sendmsg/recvmsg to vhost-net.

Signed-off-by: Xin Xiaohui xiaohui@intel.com
Signed-off-by: Zhao Yu yzha...@gmail.com
Reviewed-by: Jeff Dike jd...@linux.intel.com
---

Michael,
Thanks. I have updated the patch with your suggestion.
It looks much clean now. Please have a review.

Thanks
Xiaohui

 drivers/vhost/Kconfig |   10 +
 drivers/vhost/Makefile|2 +
 drivers/vhost/mpassthru.c | 1239 +
 include/linux/mpassthru.h |   29 +
 4 files changed, 1280 insertions(+), 0 deletions(-)
 create mode 100644 drivers/vhost/mpassthru.c
 create mode 100644 include/linux/mpassthru.h

diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index 9f409f4..91806b1 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -9,3 +9,13 @@ config VHOST_NET
  To compile this driver as a module, choose M here: the module will
  be called vhost_net.
 
+config MEDIATE_PASSTHRU
+   tristate mediate passthru network driver (EXPERIMENTAL)
+   depends on VHOST_NET
+   ---help---
+ zerocopy network I/O support, we call it as mediate passthru to
+ be distiguish with hardare passthru.
+
+ To compile this driver as a module, choose M here: the module will
+ be called mpassthru.
+
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index 72dd020..c18b9fc 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -1,2 +1,4 @@
 obj-$(CONFIG_VHOST_NET) += vhost_net.o
 vhost_net-y := vhost.o net.o
+
+obj-$(CONFIG_MEDIATE_PASSTHRU) += mpassthru.o
diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
new file mode 100644
index 000..cc99b14
--- /dev/null
+++ b/drivers/vhost/mpassthru.c
@@ -0,0 +1,1239 @@
+/*
+ *  MPASSTHRU - Mediate passthrough device.
+ *  Copyright (C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ */
+
+#define DRV_NAMEmpassthru
+#define DRV_DESCRIPTION Mediate passthru device driver
+#define DRV_COPYRIGHT   (C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G
+
+#include linux/module.h
+#include linux/errno.h
+#include linux/kernel.h
+#include linux/major.h
+#include linux/slab.h
+#include linux/smp_lock.h
+#include linux/poll.h
+#include linux/fcntl.h
+#include linux/init.h
+#include linux/aio.h
+
+#include linux/skbuff.h
+#include linux/netdevice.h
+#include linux/etherdevice.h
+#include linux/miscdevice.h
+#include linux/ethtool.h
+#include linux/rtnetlink.h
+#include linux/if.h
+#include linux/if_arp.h
+#include linux/if_ether.h
+#include linux/crc32.h
+#include linux/nsproxy.h
+#include linux/uaccess.h
+#include linux/virtio_net.h
+#include linux/mpassthru.h
+#include net/net_namespace.h
+#include net/netns/generic.h
+#include net/rtnetlink.h
+#include net/sock.h
+
+#include asm/system.h
+
+/* Uncomment to enable debugging */
+/* #define MPASSTHRU_DEBUG 1 */
+
+#ifdef MPASSTHRU_DEBUG
+static int debug;
+
+#define DBG  if (mp-debug) printk
+#define DBG1 if (debug == 2) printk
+#else
+#define DBG(a...)
+#define DBG1(a...)
+#endif
+
+#define COPY_THRESHOLD (L1_CACHE_BYTES * 4)
+#define COPY_HDR_LEN   (L1_CACHE_BYTES  64 ? 64 : L1_CACHE_BYTES)
+
+struct frag {
+   u16 offset;
+   u16 size;
+};
+
+struct page_ctor {
+   struct list_headreadq;
+   int w_len;
+   int r_len;
+   spinlock_t  read_lock;
+   struct kmem_cache   *cache;
+   /* record the locked pages */
+   int lock_pages;
+   struct rlimit   o_rlim;
+   struct net_device   *dev;
+   struct mpassthru_port   port;
+};
+
+struct page_info {
+   struct list_headlist;
+   int header;
+   /* indicate the actual length of bytes
+* send/recv in the user space buffers
+*/
+   int total;
+   int offset;
+   struct page *pages[MAX_SKB_FRAGS+1];
+   struct skb_frag_struct  frag[MAX_SKB_FRAGS+1];
+   struct sk_buff  *skb;
+   struct page_ctor*ctor;
+
+   /* The pointer relayed to skb, to indicate
+* it's a user space allocated skb or kernel
+*/
+   struct 

RE: Re:[RFC][PATCH v3 1/3] A device for zero-copy based on KVM virtio-net.

2010-04-22 Thread Xin, Xiaohui
Michael,
Sorry, it's based on the suggestion to hook an iocb completion callback
to handle the iocb list in vhost-net.

Thanks
Xiaohui

-Original Message-
From: Xin, Xiaohui
Sent: Thursday, April 22, 2010 4:24 PM
To: m...@redhat.com
Cc: a...@arndb.de; net...@vger.kernel.org; kvm@vger.kernel.org; 
linux-ker...@vger.kernel.org; mi...@elte.hu; da...@davemloft.net; 
jd...@linux.intel.com; Xin, Xiaohui
Subject: Re:[RFC][PATCH v3 1/3] A device for zero-copy based on KVM virtio-net.

From: Xin Xiaohui xiaohui@intel.com

Add a device to utilize the vhost-net backend driver for
copy-less data transfer between guest FE and host NIC.
It pins the guest user space to the host memory and
provides proto_ops as sendmsg/recvmsg to vhost-net.

Signed-off-by: Xin Xiaohui xiaohui@intel.com
Signed-off-by: Zhao Yu yzha...@gmail.com
Reviewed-by: Jeff Dike jd...@linux.intel.com
---

Michael,
Thanks. I have updated the patch with your suggestion.
It looks much clean now. Please have a review.

Thanks
Xiaohui

 drivers/vhost/Kconfig |   10 +
 drivers/vhost/Makefile|2 +
 drivers/vhost/mpassthru.c | 1239 +
 include/linux/mpassthru.h |   29 +
 4 files changed, 1280 insertions(+), 0 deletions(-)
 create mode 100644 drivers/vhost/mpassthru.c
 create mode 100644 include/linux/mpassthru.h

diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index 9f409f4..91806b1 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -9,3 +9,13 @@ config VHOST_NET
  To compile this driver as a module, choose M here: the module will
  be called vhost_net.

+config MEDIATE_PASSTHRU
+   tristate mediate passthru network driver (EXPERIMENTAL)
+   depends on VHOST_NET
+   ---help---
+ zerocopy network I/O support, we call it as mediate passthru to
+ be distiguish with hardare passthru.
+
+ To compile this driver as a module, choose M here: the module will
+ be called mpassthru.
+
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index 72dd020..c18b9fc 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -1,2 +1,4 @@
 obj-$(CONFIG_VHOST_NET) += vhost_net.o
 vhost_net-y := vhost.o net.o
+
+obj-$(CONFIG_MEDIATE_PASSTHRU) += mpassthru.o
diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
new file mode 100644
index 000..cc99b14
--- /dev/null
+++ b/drivers/vhost/mpassthru.c
@@ -0,0 +1,1239 @@
+/*
+ *  MPASSTHRU - Mediate passthrough device.
+ *  Copyright (C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ */
+
+#define DRV_NAMEmpassthru
+#define DRV_DESCRIPTION Mediate passthru device driver
+#define DRV_COPYRIGHT   (C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G
+
+#include linux/module.h
+#include linux/errno.h
+#include linux/kernel.h
+#include linux/major.h
+#include linux/slab.h
+#include linux/smp_lock.h
+#include linux/poll.h
+#include linux/fcntl.h
+#include linux/init.h
+#include linux/aio.h
+
+#include linux/skbuff.h
+#include linux/netdevice.h
+#include linux/etherdevice.h
+#include linux/miscdevice.h
+#include linux/ethtool.h
+#include linux/rtnetlink.h
+#include linux/if.h
+#include linux/if_arp.h
+#include linux/if_ether.h
+#include linux/crc32.h
+#include linux/nsproxy.h
+#include linux/uaccess.h
+#include linux/virtio_net.h
+#include linux/mpassthru.h
+#include net/net_namespace.h
+#include net/netns/generic.h
+#include net/rtnetlink.h
+#include net/sock.h
+
+#include asm/system.h
+
+/* Uncomment to enable debugging */
+/* #define MPASSTHRU_DEBUG 1 */
+
+#ifdef MPASSTHRU_DEBUG
+static int debug;
+
+#define DBG  if (mp-debug) printk
+#define DBG1 if (debug == 2) printk
+#else
+#define DBG(a...)
+#define DBG1(a...)
+#endif
+
+#define COPY_THRESHOLD (L1_CACHE_BYTES * 4)
+#define COPY_HDR_LEN   (L1_CACHE_BYTES  64 ? 64 : L1_CACHE_BYTES)
+
+struct frag {
+   u16 offset;
+   u16 size;
+};
+
+struct page_ctor {
+   struct list_headreadq;
+   int w_len;
+   int r_len;
+   spinlock_t  read_lock;
+   struct kmem_cache   *cache;
+   /* record the locked pages */
+   int lock_pages;
+   struct rlimit   o_rlim;
+   struct net_device   *dev;
+   struct mpassthru_port   port;
+};
+
+struct page_info {
+   struct list_headlist;
+   int header;
+   /* indicate the actual 

RE: [RFC][PATCH v3 1/3] A device for zero-copy based on KVM virtio-net.

2010-04-15 Thread Xin, Xiaohui
Arnd,
 From: Xin Xiaohui xiaohui@intel.com
 
 Add a device to utilize the vhost-net backend driver for
 copy-less data transfer between guest FE and host NIC.
 It pins the guest user space to the host memory and
 provides proto_ops as sendmsg/recvmsg to vhost-net.

Sorry for taking so long before finding the time to look
at your code in more detail.

It seems that you are duplicating a lot of functionality that
is already in macvtap. I've asked about this before but then
didn't look at your newer versions. Can you explain the value
of introducing another interface to user land?

I'm still planning to add zero-copy support to macvtap,
hopefully reusing parts of your code, but do you think there
is value in having both?

I have not looked into your macvtap code in detail before.
Does the two interface exactly the same? We just want to create a simple
way to do zero-copy. Now it can only support vhost, but in future
we also want it to support directly read/write operations from user space too.

Basically, compared to the interface, I'm more worried about the modification
to net core we have made to implement zero-copy now. If this hardest part
can be done, then any user space interface modifications or integrations are 
more easily to be done after that.

 diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
 new file mode 100644
 index 000..86d2525
 --- /dev/null
 +++ b/drivers/vhost/mpassthru.c
 @@ -0,0 +1,1264 @@
 +
 +#ifdef MPASSTHRU_DEBUG
 +static int debug;
 +
 +#define DBG  if (mp-debug) printk
 +#define DBG1 if (debug == 2) printk
 +#else
 +#define DBG(a...)
 +#define DBG1(a...)
 +#endif

This should probably just use the existing dev_dbg/pr_debug infrastructure.

Thanks. Will try that.
 [... skipping buffer management code for now]

 +static int mp_sendmsg(struct kiocb *iocb, struct socket *sock,
 +struct msghdr *m, size_t total_len)
 +{
 [...]

This function looks like we should be able to easily include it into
macvtap and get zero-copy transmits without introducing the new
user-level interface.

 +static int mp_recvmsg(struct kiocb *iocb, struct socket *sock,
 +struct msghdr *m, size_t total_len,
 +int flags)
 +{
 +struct mp_struct *mp = container_of(sock-sk, struct mp_sock, sk)-mp;
 +struct page_ctor *ctor;
 +struct vhost_virtqueue *vq = (struct vhost_virtqueue *)(iocb-private);

It smells like a layering violation to look at the iocb-private field
from a lower-level driver. I would have hoped that it's possible to implement
this without having this driver know about the higher-level vhost driver
internals. Can you explain why this is needed?

I don't like this too, but since the kiocb is maintained by vhost with a 
list_head.
And mp device is responsible to collect the kiocb into the list_head,
We need something known by vhost/mp both.
 
 +spin_lock_irqsave(ctor-read_lock, flag);
 +list_add_tail(info-list, ctor-readq);
 +spin_unlock_irqrestore(ctor-read_lock, flag);
 +
 +if (!vq-receiver) {
 +vq-receiver = mp_recvmsg_notify;
 +set_memlock_rlimit(ctor, RLIMIT_MEMLOCK,
 +   vq-num * 4096,
 +   vq-num * 4096);
 +}
 +
 +return 0;
 +}

Not sure what I'm missing, but who calls the vq-receiver? This seems
to be neither in the upstream version of vhost nor introduced by your
patch.

See Patch v3 2/3 I have sent out, it is called by handle_rx() in vhost.

 +static void __mp_detach(struct mp_struct *mp)
 +{
 +mp-mfile = NULL;
 +
 +mp_dev_change_flags(mp-dev, mp-dev-flags  ~IFF_UP);
 +page_ctor_detach(mp);
 +mp_dev_change_flags(mp-dev, mp-dev-flags | IFF_UP);
 +
 +/* Drop the extra count on the net device */
 +dev_put(mp-dev);
 +}
 +
 +static DEFINE_MUTEX(mp_mutex);
 +
 +static void mp_detach(struct mp_struct *mp)
 +{
 +mutex_lock(mp_mutex);
 +__mp_detach(mp);
 +mutex_unlock(mp_mutex);
 +}
 +
 +static void mp_put(struct mp_file *mfile)
 +{
 +if (atomic_dec_and_test(mfile-count))
 +mp_detach(mfile-mp);
 +}
 +
 +static int mp_release(struct socket *sock)
 +{
 +struct mp_struct *mp = container_of(sock-sk, struct mp_sock, sk)-mp;
 +struct mp_file *mfile = mp-mfile;
 +
 +mp_put(mfile);
 +sock_put(mp-socket.sk);
 +put_net(mfile-net);
 +
 +return 0;
 +}

Doesn't this prevent the underlying interface from going away while the chardev
is open? You also have logic to handle that case, so why do you keep the extra
reference on the netdev?

Let me think.

 +/* Ops structure to mimic raw sockets with mp device */
 +static const struct proto_ops mp_socket_ops = {
 +.sendmsg = mp_sendmsg,
 +.recvmsg = mp_recvmsg,
 +.release = mp_release,
 +};

 +static int mp_chr_open(struct inode *inode, struct file * file)
 +{
 +struct mp_file *mfile;
+ cycle_kernel_lock();

I don't think you really want to use the BKL here, just kill 

Re: [RFC][PATCH v3 1/3] A device for zero-copy based on KVM virtio-net.

2010-04-15 Thread Michael S. Tsirkin
On Thu, Apr 15, 2010 at 05:01:10PM +0800, Xin, Xiaohui wrote:
 It smells like a layering violation to look at the iocb-private field
 from a lower-level driver. I would have hoped that it's possible to implement
 this without having this driver know about the higher-level vhost driver
 internals. Can you explain why this is needed?
 
 I don't like this too, but since the kiocb is maintained by vhost with a 
 list_head.
 And mp device is responsible to collect the kiocb into the list_head,
 We need something known by vhost/mp both.

Can't vhost supply a kiocb completion callback that will handle the list?

-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][PATCH v3 1/3] A device for zero-copy based on KVM virtio-net.

2010-04-15 Thread Arnd Bergmann
On Thursday 15 April 2010, Xin, Xiaohui wrote:
 
 It seems that you are duplicating a lot of functionality that
 is already in macvtap. I've asked about this before but then
 didn't look at your newer versions. Can you explain the value
 of introducing another interface to user land?
 
 I'm still planning to add zero-copy support to macvtap,
 hopefully reusing parts of your code, but do you think there
 is value in having both?
 
 I have not looked into your macvtap code in detail before.
 Does the two interface exactly the same? We just want to create a simple
 way to do zero-copy. Now it can only support vhost, but in future
 we also want it to support directly read/write operations from user space too.

Right now, the features are mostly distinct. Macvtap first of all provides
a tap style interface for users, and can also be used by vhost-net.
It also provides a way to share a NIC among a number of guests by software,
though I indent to add support for VMDq and SR-IOV as well. Zero-copy
is also not yet done in macvtap but should be added.

mpassthru right now does not allow sharing a NIC between guests, and
does not have a tap interface for non-vhost operation, but does the
zero-copy that is missing in macvtap.

 Basically, compared to the interface, I'm more worried about the modification
 to net core we have made to implement zero-copy now. If this hardest part
 can be done, then any user space interface modifications or integrations are 
 more easily to be done after that.

I agree that the network stack modifications are the hard part for zero-copy,
and your work on that looks very promising and is complementary to what I've
done with macvtap. Your current user interface looks good for testing this out,
but I think we should not merge it (the interface) upstream if we can get the
same or better result by integrating your buffer management code into macvtap.

I can try to merge your code into macvtap myself if you agree, so you
can focus on getting the internals right.

 Not sure what I'm missing, but who calls the vq-receiver? This seems
 to be neither in the upstream version of vhost nor introduced by your
 patch.
 
 See Patch v3 2/3 I have sent out, it is called by handle_rx() in vhost.

Ok, I see. As a general rule, it's preferred to split a patch series
in a way that makes it possible to apply each patch separately and still
get a working kernel, ideally with more features than the version before
the patch. I believe you could get there by reordering your patches to
make the actual driver the last one in the series.

Not a big problem though, I was mostly looking in the wrong place.

  +  ifr.ifr_name[IFNAMSIZ-1] = '\0';
  +
  +  ret = -EBUSY;
  +
  +  if (ifr.ifr_flags  IFF_MPASSTHRU_EXCL)
  +  break;
 
 Your current use of the IFF_MPASSTHRU* flags does not seem to make
 any sense whatsoever. You check that this flag is never set, but set
 it later yourself and then ignore all flags.
 
 Using that flag is tried to prevent if another one wants to bind the same 
 device
 Again. But I will see if it really ignore all other flags.

The ifr variable is on the stack of the mp_chr_ioctl function, and you never
look at the value after setting it. In order to prevent multiple opens
of that device, you probably need to lock out any other users as well,
and make it a property of the underlying device. E.g. you also want to
prevent users on the host from setting an IP address on the NIC and
using it to send and receive data there.

Arnd
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][PATCH v3 1/3] A device for zero-copy based on KVM virtio-net.

2010-04-14 Thread Arnd Bergmann
On Friday 09 April 2010, xiaohui@intel.com wrote:
 From: Xin Xiaohui xiaohui@intel.com
 
 Add a device to utilize the vhost-net backend driver for
 copy-less data transfer between guest FE and host NIC.
 It pins the guest user space to the host memory and
 provides proto_ops as sendmsg/recvmsg to vhost-net.

Sorry for taking so long before finding the time to look
at your code in more detail.

It seems that you are duplicating a lot of functionality that
is already in macvtap. I've asked about this before but then
didn't look at your newer versions. Can you explain the value
of introducing another interface to user land?

I'm still planning to add zero-copy support to macvtap,
hopefully reusing parts of your code, but do you think there
is value in having both?

 diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
 new file mode 100644
 index 000..86d2525
 --- /dev/null
 +++ b/drivers/vhost/mpassthru.c
 @@ -0,0 +1,1264 @@
 +
 +#ifdef MPASSTHRU_DEBUG
 +static int debug;
 +
 +#define DBG  if (mp-debug) printk
 +#define DBG1 if (debug == 2) printk
 +#else
 +#define DBG(a...)
 +#define DBG1(a...)
 +#endif

This should probably just use the existing dev_dbg/pr_debug infrastructure.

 [... skipping buffer management code for now]

 +static int mp_sendmsg(struct kiocb *iocb, struct socket *sock,
 + struct msghdr *m, size_t total_len)
 +{
 [...]

This function looks like we should be able to easily include it into
macvtap and get zero-copy transmits without introducing the new
user-level interface.

 +static int mp_recvmsg(struct kiocb *iocb, struct socket *sock,
 + struct msghdr *m, size_t total_len,
 + int flags)
 +{
 + struct mp_struct *mp = container_of(sock-sk, struct mp_sock, sk)-mp;
 + struct page_ctor *ctor;
 + struct vhost_virtqueue *vq = (struct vhost_virtqueue *)(iocb-private);

It smells like a layering violation to look at the iocb-private field
from a lower-level driver. I would have hoped that it's possible to implement
this without having this driver know about the higher-level vhost driver
internals. Can you explain why this is needed?

 + spin_lock_irqsave(ctor-read_lock, flag);
 + list_add_tail(info-list, ctor-readq);
 + spin_unlock_irqrestore(ctor-read_lock, flag);
 +
 + if (!vq-receiver) {
 + vq-receiver = mp_recvmsg_notify;
 + set_memlock_rlimit(ctor, RLIMIT_MEMLOCK,
 +vq-num * 4096,
 +vq-num * 4096);
 + }
 +
 + return 0;
 +}

Not sure what I'm missing, but who calls the vq-receiver? This seems
to be neither in the upstream version of vhost nor introduced by your
patch.

 +static void __mp_detach(struct mp_struct *mp)
 +{
 + mp-mfile = NULL;
 +
 + mp_dev_change_flags(mp-dev, mp-dev-flags  ~IFF_UP);
 + page_ctor_detach(mp);
 + mp_dev_change_flags(mp-dev, mp-dev-flags | IFF_UP);
 +
 + /* Drop the extra count on the net device */
 + dev_put(mp-dev);
 +}
 +
 +static DEFINE_MUTEX(mp_mutex);
 +
 +static void mp_detach(struct mp_struct *mp)
 +{
 + mutex_lock(mp_mutex);
 + __mp_detach(mp);
 + mutex_unlock(mp_mutex);
 +}
 +
 +static void mp_put(struct mp_file *mfile)
 +{
 + if (atomic_dec_and_test(mfile-count))
 + mp_detach(mfile-mp);
 +}
 +
 +static int mp_release(struct socket *sock)
 +{
 + struct mp_struct *mp = container_of(sock-sk, struct mp_sock, sk)-mp;
 + struct mp_file *mfile = mp-mfile;
 +
 + mp_put(mfile);
 + sock_put(mp-socket.sk);
 + put_net(mfile-net);
 +
 + return 0;
 +}

Doesn't this prevent the underlying interface from going away while the chardev
is open? You also have logic to handle that case, so why do you keep the extra
reference on the netdev?

 +/* Ops structure to mimic raw sockets with mp device */
 +static const struct proto_ops mp_socket_ops = {
 + .sendmsg = mp_sendmsg,
 + .recvmsg = mp_recvmsg,
 + .release = mp_release,
 +};

 +static int mp_chr_open(struct inode *inode, struct file * file)
 +{
 + struct mp_file *mfile;
 + cycle_kernel_lock();

I don't think you really want to use the BKL here, just kill that line.

 +static long mp_chr_ioctl(struct file *file, unsigned int cmd,
 + unsigned long arg)
 +{
 + struct mp_file *mfile = file-private_data;
 + struct mp_struct *mp;
 + struct net_device *dev;
 + void __user* argp = (void __user *)arg;
 + struct ifreq ifr;
 + struct sock *sk;
 + int ret;
 +
 + ret = -EINVAL;
 +
 + switch (cmd) {
 + case MPASSTHRU_BINDDEV:
 + ret = -EFAULT;
 + if (copy_from_user(ifr, argp, sizeof ifr))
 + break;

This is broken for 32 bit compat mode ioctls, because struct ifreq
is different between 32 and 64 bit systems. Since you are only
using the device name anyway, a fixed length string or just the
interface index would be simpler and work better.

Re: [RFC][PATCH v3 1/3] A device for zero-copy based on KVM virtio-net.

2010-04-14 Thread Michael S. Tsirkin
On Wed, Apr 14, 2010 at 04:55:21PM +0200, Arnd Bergmann wrote:
 On Friday 09 April 2010, xiaohui@intel.com wrote:
  From: Xin Xiaohui xiaohui@intel.com
  
  Add a device to utilize the vhost-net backend driver for
  copy-less data transfer between guest FE and host NIC.
  It pins the guest user space to the host memory and
  provides proto_ops as sendmsg/recvmsg to vhost-net.
 
 Sorry for taking so long before finding the time to look
 at your code in more detail.
 
 It seems that you are duplicating a lot of functionality that
 is already in macvtap. I've asked about this before but then
 didn't look at your newer versions. Can you explain the value
 of introducing another interface to user land?

Hmm, I have not noticed a lot of duplication.
BTW macvtap also duplicates tun code, it might be
a good idea for tun to export some functionality.

 I'm still planning to add zero-copy support to macvtap,
 hopefully reusing parts of your code, but do you think there
 is value in having both?

If macvtap would get zero copy tx and rx, maybe not. But
it's not immediately obvious whether zero-copy support
for macvtap might work, though, especially for zero copy rx.
The approach with mpassthru is much simpler in that
it takes complete control of the device.

  diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
  new file mode 100644
  index 000..86d2525
  --- /dev/null
  +++ b/drivers/vhost/mpassthru.c
  @@ -0,0 +1,1264 @@
  +
  +#ifdef MPASSTHRU_DEBUG
  +static int debug;
  +
  +#define DBG  if (mp-debug) printk
  +#define DBG1 if (debug == 2) printk
  +#else
  +#define DBG(a...)
  +#define DBG1(a...)
  +#endif
 
 This should probably just use the existing dev_dbg/pr_debug infrastructure.
 
  [... skipping buffer management code for now]
 
  +static int mp_sendmsg(struct kiocb *iocb, struct socket *sock,
  +   struct msghdr *m, size_t total_len)
  +{
  [...]
 
 This function looks like we should be able to easily include it into
 macvtap and get zero-copy transmits without introducing the new
 user-level interface.
 
  +static int mp_recvmsg(struct kiocb *iocb, struct socket *sock,
  +   struct msghdr *m, size_t total_len,
  +   int flags)
  +{
  +   struct mp_struct *mp = container_of(sock-sk, struct mp_sock, sk)-mp;
  +   struct page_ctor *ctor;
  +   struct vhost_virtqueue *vq = (struct vhost_virtqueue *)(iocb-private);
 
 It smells like a layering violation to look at the iocb-private field
 from a lower-level driver. I would have hoped that it's possible to implement
 this without having this driver know about the higher-level vhost driver
 internals.

I agree.

 Can you explain why this is needed?
 
  +   spin_lock_irqsave(ctor-read_lock, flag);
  +   list_add_tail(info-list, ctor-readq);
  +   spin_unlock_irqrestore(ctor-read_lock, flag);
  +
  +   if (!vq-receiver) {
  +   vq-receiver = mp_recvmsg_notify;
  +   set_memlock_rlimit(ctor, RLIMIT_MEMLOCK,
  +  vq-num * 4096,
  +  vq-num * 4096);
  +   }
  +
  +   return 0;
  +}
 
 Not sure what I'm missing, but who calls the vq-receiver? This seems
 to be neither in the upstream version of vhost nor introduced by your
 patch.
 
  +static void __mp_detach(struct mp_struct *mp)
  +{
  +   mp-mfile = NULL;
  +
  +   mp_dev_change_flags(mp-dev, mp-dev-flags  ~IFF_UP);
  +   page_ctor_detach(mp);
  +   mp_dev_change_flags(mp-dev, mp-dev-flags | IFF_UP);
  +
  +   /* Drop the extra count on the net device */
  +   dev_put(mp-dev);
  +}
  +
  +static DEFINE_MUTEX(mp_mutex);
  +
  +static void mp_detach(struct mp_struct *mp)
  +{
  +   mutex_lock(mp_mutex);
  +   __mp_detach(mp);
  +   mutex_unlock(mp_mutex);
  +}
  +
  +static void mp_put(struct mp_file *mfile)
  +{
  +   if (atomic_dec_and_test(mfile-count))
  +   mp_detach(mfile-mp);
  +}
  +
  +static int mp_release(struct socket *sock)
  +{
  +   struct mp_struct *mp = container_of(sock-sk, struct mp_sock, sk)-mp;
  +   struct mp_file *mfile = mp-mfile;
  +
  +   mp_put(mfile);
  +   sock_put(mp-socket.sk);
  +   put_net(mfile-net);
  +
  +   return 0;
  +}
 
 Doesn't this prevent the underlying interface from going away while the 
 chardev
 is open? You also have logic to handle that case, so why do you keep the extra
 reference on the netdev?
 
  +/* Ops structure to mimic raw sockets with mp device */
  +static const struct proto_ops mp_socket_ops = {
  +   .sendmsg = mp_sendmsg,
  +   .recvmsg = mp_recvmsg,
  +   .release = mp_release,
  +};
 
  +static int mp_chr_open(struct inode *inode, struct file * file)
  +{
  +   struct mp_file *mfile;
  +   cycle_kernel_lock();
 
 I don't think you really want to use the BKL here, just kill that line.
 
  +static long mp_chr_ioctl(struct file *file, unsigned int cmd,
  +   unsigned long arg)
  +{
  +   struct mp_file *mfile = file-private_data;
  +   struct mp_struct *mp;
  +   struct net_device *dev;
  +   

Re: [RFC][PATCH v3 1/3] A device for zero-copy based on KVM virtio-net.

2010-04-14 Thread Arnd Bergmann
On Wednesday 14 April 2010, Michael S. Tsirkin wrote:
 On Wed, Apr 14, 2010 at 04:55:21PM +0200, Arnd Bergmann wrote:
  On Friday 09 April 2010, xiaohui@intel.com wrote:
   From: Xin Xiaohui xiaohui@intel.com

  It seems that you are duplicating a lot of functionality that
  is already in macvtap. I've asked about this before but then
  didn't look at your newer versions. Can you explain the value
  of introducing another interface to user land?
 
 Hmm, I have not noticed a lot of duplication.

The code is indeed quite distinct, but the idea of adding another
character device to pass into vhost for direct device access is.

 BTW macvtap also duplicates tun code, it might be
 a good idea for tun to export some functionality.

Yes, that's something I plan to look into.

  I'm still planning to add zero-copy support to macvtap,
  hopefully reusing parts of your code, but do you think there
  is value in having both?
 
 If macvtap would get zero copy tx and rx, maybe not. But
 it's not immediately obvious whether zero-copy support
 for macvtap might work, though, especially for zero copy rx.
 The approach with mpassthru is much simpler in that
 it takes complete control of the device.

As far as I can tell, the most significant limitation of mpassthru
is that there can only ever be a single guest on a physical NIC.

Given that limitation, I believe we can do the same on macvtap,
and simply disable zero-copy RX when you want to use more than one
guest, or both guest and host on the same NIC.

The logical next step here would be to allow VMDq and similar
technologies to separate out the RX traffic in the hardware.
We don't have a configuration interface for that yet, but
since this is logically the same as macvlan, I think we should
use the same interfaces for both, essentially treating VMDq
as a hardware acceleration for macvlan. We can probably handle
it in similar ways to how we handle hardware support for vlan.

At that stage, macvtap would be the logical interface for
connecting a VMDq (hardware macvlan) device to a guest!

   +static ssize_t mp_chr_aio_write(struct kiocb *iocb, const struct iovec 
   *iov,
   + unsigned long count, loff_t pos)
   +{
   + struct file *file = iocb-ki_filp;
   + struct mp_struct *mp = mp_get(file-private_data);
   + struct sock *sk = mp-socket.sk;
   + struct sk_buff *skb;
   + int len, err;
   + ssize_t result;
  
  Can you explain what this function is even there for? AFAICT, vhost-net
  doesn't call it, the interface is incompatible with the existing
  tap interface, and you don't provide a read function.
 
 qemu needs the ability to inject raw packets into device
 from userspace, bypassing vhost/virtio (for live migration).

Ok, but since there is only a write callback and no read, it won't
actually be able to do this with the current code, right?

Moreover, it seems weird to have a new type of interface here that
duplicates tap/macvtap with less functionality. Coming back
to your original comment, this means that while mpassthru is currently
not duplicating the actual code from macvtap, it would need to do
exactly that to get the qemu interface right!

Arnd
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][PATCH v3 1/3] A device for zero-copy based on KVM virtio-net.

2010-04-14 Thread Michael S. Tsirkin
On Wed, Apr 14, 2010 at 05:57:54PM +0200, Arnd Bergmann wrote:
 On Wednesday 14 April 2010, Michael S. Tsirkin wrote:
  On Wed, Apr 14, 2010 at 04:55:21PM +0200, Arnd Bergmann wrote:
   On Friday 09 April 2010, xiaohui@intel.com wrote:
From: Xin Xiaohui xiaohui@intel.com
 
   It seems that you are duplicating a lot of functionality that
   is already in macvtap. I've asked about this before but then
   didn't look at your newer versions. Can you explain the value
   of introducing another interface to user land?
  
  Hmm, I have not noticed a lot of duplication.
 
 The code is indeed quite distinct, but the idea of adding another
 character device to pass into vhost for direct device access is.

All backends besides tap seem to do this, btw :)

  BTW macvtap also duplicates tun code, it might be
  a good idea for tun to export some functionality.
 
 Yes, that's something I plan to look into.
 
   I'm still planning to add zero-copy support to macvtap,
   hopefully reusing parts of your code, but do you think there
   is value in having both?
  
  If macvtap would get zero copy tx and rx, maybe not. But
  it's not immediately obvious whether zero-copy support
  for macvtap might work, though, especially for zero copy rx.
  The approach with mpassthru is much simpler in that
  it takes complete control of the device.
 
 As far as I can tell, the most significant limitation of mpassthru
 is that there can only ever be a single guest on a physical NIC.
 
 Given that limitation, I believe we can do the same on macvtap,
 and simply disable zero-copy RX when you want to use more than one
 guest, or both guest and host on the same NIC.
 
 The logical next step here would be to allow VMDq and similar
 technologies to separate out the RX traffic in the hardware.
 We don't have a configuration interface for that yet, but
 since this is logically the same as macvlan, I think we should
 use the same interfaces for both, essentially treating VMDq
 as a hardware acceleration for macvlan. We can probably handle
 it in similar ways to how we handle hardware support for vlan.
 
 At that stage, macvtap would be the logical interface for
 connecting a VMDq (hardware macvlan) device to a guest!

I won't object to that but ... code walks.

+static ssize_t mp_chr_aio_write(struct kiocb *iocb, const struct iovec 
*iov,
+   unsigned long count, loff_t pos)
+{
+   struct file *file = iocb-ki_filp;
+   struct mp_struct *mp = mp_get(file-private_data);
+   struct sock *sk = mp-socket.sk;
+   struct sk_buff *skb;
+   int len, err;
+   ssize_t result;
   
   Can you explain what this function is even there for? AFAICT, vhost-net
   doesn't call it, the interface is incompatible with the existing
   tap interface, and you don't provide a read function.
  
  qemu needs the ability to inject raw packets into device
  from userspace, bypassing vhost/virtio (for live migration).
 
 Ok, but since there is only a write callback and no read, it won't
 actually be able to do this with the current code, right?

I think it'll work as is, with vhost qemu only ever writes,
never reads from device. We'll also never need GSO etc
which is a large part of what tap does (and macvtap will
have to do).

 Moreover, it seems weird to have a new type of interface here that
 duplicates tap/macvtap with less functionality. Coming back
 to your original comment, this means that while mpassthru is currently
 not duplicating the actual code from macvtap, it would need to do
 exactly that to get the qemu interface right!
 
   Arnd

I don't think so, see above. anyway, both can reuse tun.c :)

-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][PATCH v3 1/3] A device for zero-copy based on KVM virtio-net.

2010-04-14 Thread Arnd Bergmann
On Wednesday 14 April 2010, Michael S. Tsirkin wrote:
   
   qemu needs the ability to inject raw packets into device
   from userspace, bypassing vhost/virtio (for live migration).
  
  Ok, but since there is only a write callback and no read, it won't
  actually be able to do this with the current code, right?
 
 I think it'll work as is, with vhost qemu only ever writes,
 never reads from device. We'll also never need GSO etc
 which is a large part of what tap does (and macvtap will
 have to do).

Ah, I see. I didn't realize that qemu needs to write to the
device even if vhost is used. But for the case of migration to
another machine without vhost, wouldn't qemu also need to read?

  Moreover, it seems weird to have a new type of interface here that
  duplicates tap/macvtap with less functionality. Coming back
  to your original comment, this means that while mpassthru is currently
  not duplicating the actual code from macvtap, it would need to do
  exactly that to get the qemu interface right!
  
 I don't think so, see above. anyway, both can reuse tun.c :)

There is one significant difference between macvtap/mpassthru and
tun/tap in that the directions are reversed. While macvtap and
mpassthru forward data from write into dev_queue_xmit and from
skb_receive into read, tun/tap forwards data from write into
skb_receive and from start_xmit into read.

Also, I'm not really objecting to duplicating code between
macvtap and mpassthru, as the implementation can always be merged.
My main objection is instead to having two different _user_interfaces_
for doing the same thing.

Arnd
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][PATCH v3 1/3] A device for zero-copy based on KVM virtio-net.

2010-04-14 Thread Michael S. Tsirkin
On Wed, Apr 14, 2010 at 06:35:57PM +0200, Arnd Bergmann wrote:
 On Wednesday 14 April 2010, Michael S. Tsirkin wrote:

qemu needs the ability to inject raw packets into device
from userspace, bypassing vhost/virtio (for live migration).
   
   Ok, but since there is only a write callback and no read, it won't
   actually be able to do this with the current code, right?
  
  I think it'll work as is, with vhost qemu only ever writes,
  never reads from device. We'll also never need GSO etc
  which is a large part of what tap does (and macvtap will
  have to do).
 
 Ah, I see. I didn't realize that qemu needs to write to the
 device even if vhost is used. But for the case of migration to
 another machine without vhost, wouldn't qemu also need to read?

Not that I know. Why?

   Moreover, it seems weird to have a new type of interface here that
   duplicates tap/macvtap with less functionality. Coming back
   to your original comment, this means that while mpassthru is currently
   not duplicating the actual code from macvtap, it would need to do
   exactly that to get the qemu interface right!
   
  I don't think so, see above. anyway, both can reuse tun.c :)
 
 There is one significant difference between macvtap/mpassthru and
 tun/tap in that the directions are reversed. While macvtap and
 mpassthru forward data from write into dev_queue_xmit and from
 skb_receive into read, tun/tap forwards data from write into
 skb_receive and from start_xmit into read.
 
 Also, I'm not really objecting to duplicating code between
 macvtap and mpassthru, as the implementation can always be merged.
 My main objection is instead to having two different _user_interfaces_
 for doing the same thing.
 
   Arnd

They *could* do the same thing :)

-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][PATCH v3 1/3] A device for zero-copy based on KVM virtio-net.

2010-04-14 Thread Arnd Bergmann
On Wednesday 14 April 2010 22:31:42 Michael S. Tsirkin wrote:
 On Wed, Apr 14, 2010 at 06:35:57PM +0200, Arnd Bergmann wrote:
  On Wednesday 14 April 2010, Michael S. Tsirkin wrote:
 
 qemu needs the ability to inject raw packets into device
 from userspace, bypassing vhost/virtio (for live migration).

Ok, but since there is only a write callback and no read, it won't
actually be able to do this with the current code, right?
   
   I think it'll work as is, with vhost qemu only ever writes,
   never reads from device. We'll also never need GSO etc
   which is a large part of what tap does (and macvtap will
   have to do).
  
  Ah, I see. I didn't realize that qemu needs to write to the
  device even if vhost is used. But for the case of migration to
  another machine without vhost, wouldn't qemu also need to read?
 
 Not that I know. Why?

Well, if the guest not only wants to send data but also receive
frames coming from other machines, they need to get from the kernel
into qemu, and the only way I can see for doing that is to read
from this device if there is no vhost support around on the new
machine.

Maybe we're talking about different things here.

Arnd
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][PATCH v3 1/3] A device for zero-copy based on KVM virtio-net.

2010-04-14 Thread Michael S. Tsirkin
On Wed, Apr 14, 2010 at 10:39:49PM +0200, Arnd Bergmann wrote:
 On Wednesday 14 April 2010 22:31:42 Michael S. Tsirkin wrote:
  On Wed, Apr 14, 2010 at 06:35:57PM +0200, Arnd Bergmann wrote:
   On Wednesday 14 April 2010, Michael S. Tsirkin wrote:
  
  qemu needs the ability to inject raw packets into device
  from userspace, bypassing vhost/virtio (for live migration).
 
 Ok, but since there is only a write callback and no read, it won't
 actually be able to do this with the current code, right?

I think it'll work as is, with vhost qemu only ever writes,
never reads from device. We'll also never need GSO etc
which is a large part of what tap does (and macvtap will
have to do).
   
   Ah, I see. I didn't realize that qemu needs to write to the
   device even if vhost is used. But for the case of migration to
   another machine without vhost, wouldn't qemu also need to read?
  
  Not that I know. Why?
 
 Well, if the guest not only wants to send data but also receive
 frames coming from other machines, they need to get from the kernel
 into qemu, and the only way I can see for doing that is to read
 from this device if there is no vhost support around on the new
 machine.
 
 Maybe we're talking about different things here.
 
   Arnd

mpassthrough is currently useless without vhost.
If the new machine has no vhost, it can't use mpassthrough :)

-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC][PATCH v3 1/3] A device for zero-copy based on KVM virtio-net.

2010-04-14 Thread Arnd Bergmann
On Wednesday 14 April 2010 22:40:03 Michael S. Tsirkin wrote:
 On Wed, Apr 14, 2010 at 10:39:49PM +0200, Arnd Bergmann wrote:
  Well, if the guest not only wants to send data but also receive
  frames coming from other machines, they need to get from the kernel
  into qemu, and the only way I can see for doing that is to read
  from this device if there is no vhost support around on the new
  machine.
  
  Maybe we're talking about different things here.

 mpassthrough is currently useless without vhost.
 If the new machine has no vhost, it can't use mpassthrough :)

Ok. Is that a planned feature though? vhost is currently limited
to guests with a virtio-net driver and even if you extend it to other
guest emulations, it will probably always be a subset of the qemu
supported drivers, but it may be useful to support zero-copy on other
drivers as well.

Arnd
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC][PATCH v3 1/3] A device for zero-copy based on KVM virtio-net.

2010-04-09 Thread xiaohui . xin
From: Xin Xiaohui xiaohui@intel.com

Add a device to utilize the vhost-net backend driver for
copy-less data transfer between guest FE and host NIC.
It pins the guest user space to the host memory and
provides proto_ops as sendmsg/recvmsg to vhost-net.

Signed-off-by: Xin Xiaohui xiaohui@intel.com
Signed-off-by: Zhao Yu yzha...@gmail.com
Reviewed-by: Jeff Dike jd...@linux.intel.com
---

memory leak fixed,
kconfig made, 
do_unbind() made,
mp_chr_ioctl() cleaned up and
some other cleanups made
 
by Jeff Dike jd...@linux.intel.com

 drivers/vhost/Kconfig |5 +
 drivers/vhost/Makefile|2 +
 drivers/vhost/mpassthru.c | 1264 +
 include/linux/mpassthru.h |   29 +
 4 files changed, 1300 insertions(+), 0 deletions(-)
 create mode 100644 drivers/vhost/mpassthru.c
 create mode 100644 include/linux/mpassthru.h

diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index 9f409f4..ee32a3b 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -9,3 +9,8 @@ config VHOST_NET
  To compile this driver as a module, choose M here: the module will
  be called vhost_net.
 
+config VHOST_PASSTHRU
+   tristate Zerocopy network driver (EXPERIMENTAL)
+   depends on VHOST_NET
+   ---help---
+ zerocopy network I/O support
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index 72dd020..3f79c79 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -1,2 +1,4 @@
 obj-$(CONFIG_VHOST_NET) += vhost_net.o
 vhost_net-y := vhost.o net.o
+
+obj-$(CONFIG_VHOST_PASSTHRU) += mpassthru.o
diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
new file mode 100644
index 000..86d2525
--- /dev/null
+++ b/drivers/vhost/mpassthru.c
@@ -0,0 +1,1264 @@
+/*
+ *  MPASSTHRU - Mediate passthrough device.
+ *  Copyright (C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ */
+
+#define DRV_NAMEmpassthru
+#define DRV_DESCRIPTION Mediate passthru device driver
+#define DRV_COPYRIGHT   (C) 2009 ZhaoYu, XinXiaohui, Dike, Jeffery G
+
+#include linux/module.h
+#include linux/errno.h
+#include linux/kernel.h
+#include linux/major.h
+#include linux/slab.h
+#include linux/smp_lock.h
+#include linux/poll.h
+#include linux/fcntl.h
+#include linux/init.h
+#include linux/aio.h
+
+#include linux/skbuff.h
+#include linux/netdevice.h
+#include linux/etherdevice.h
+#include linux/miscdevice.h
+#include linux/ethtool.h
+#include linux/rtnetlink.h
+#include linux/if.h
+#include linux/if_arp.h
+#include linux/if_ether.h
+#include linux/crc32.h
+#include linux/nsproxy.h
+#include linux/uaccess.h
+#include linux/virtio_net.h
+#include linux/mpassthru.h
+#include net/net_namespace.h
+#include net/netns/generic.h
+#include net/rtnetlink.h
+#include net/sock.h
+
+#include asm/system.h
+
+#include vhost.h
+
+/* Uncomment to enable debugging */
+/* #define MPASSTHRU_DEBUG 1 */
+
+#ifdef MPASSTHRU_DEBUG
+static int debug;
+
+#define DBG  if (mp-debug) printk
+#define DBG1 if (debug == 2) printk
+#else
+#define DBG(a...)
+#define DBG1(a...)
+#endif
+
+#define COPY_THRESHOLD (L1_CACHE_BYTES * 4)
+#define COPY_HDR_LEN   (L1_CACHE_BYTES  64 ? 64 : L1_CACHE_BYTES)
+
+struct frag {
+   u16 offset;
+   u16 size;
+};
+
+struct page_ctor {
+   struct list_headreadq;
+   int w_len;
+   int r_len;
+   spinlock_t  read_lock;
+   struct kmem_cache   *cache;
+   /* record the locked pages */
+   int lock_pages;
+   struct rlimit   o_rlim;
+   struct net_device   *dev;
+   struct mpassthru_port   port;
+};
+
+struct page_info {
+   void*ctrl;
+   struct list_headlist;
+   int header;
+   /* indicate the actual length of bytes
+* send/recv in the user space buffers
+*/
+   int total;
+   int offset;
+   struct page *pages[MAX_SKB_FRAGS+1];
+   struct skb_frag_struct  frag[MAX_SKB_FRAGS+1];
+   struct sk_buff  *skb;
+   struct page_ctor*ctor;
+
+   /* The pointer relayed to skb, to indicate
+* it's a user space allocated skb or kernel
+*/
+   struct skb_user_pageuser;
+   struct skb_shared_info  ushinfo;
+
+#define INFO_READ