Re: [PATCH 1/1] vhost-blk: Add vhost-blk support v4

2012-11-07 Thread Michael S. Tsirkin
On Mon, Oct 15, 2012 at 05:57:03PM +0800, Asias He wrote:
 vhost-blk is an in-kernel virito-blk device accelerator.
 
 Due to lack of proper in-kernel AIO interface, this version converts
 guest's I/O request to bio and use submit_bio() to submit I/O directly.
 So this version any supports raw block device as guest's disk image,
 e.g. /dev/sda, /dev/ram0. We can add file based image support to
 vhost-blk once we have in-kernel AIO interface. There are some work in
 progress for in-kernel AIO interface from Dave Kleikamp and Zach Brown:
 
http://marc.info/?l=linux-fsdevelm=133312234313122
 
 Performance evaluation:
 -
 1) LKVM
 Fio with libaio ioengine on Fusion IO device using kvm tool
 IOPS   Before   After   Improvement
 seq-read   107  121 +13.0%
 seq-write  130  179 +37.6%
 rnd-read   102  122 +19.6%
 rnd-write  125  159 +27.0%
 
 2) QEMU
 Fio with libaio ioengine on Fusion IO device using QEMU
 IOPS   Before   After   Improvement
 seq-read   76   123 +61.8%
 seq-write  139  173 +24.4%
 rnd-read   73   120 +64.3%
 rnd-write  75   156 +108.0%
 
 Userspace bits:
 -
 1) LKVM
 The latest vhost-blk userspace bits for kvm tool can be found here:
 g...@github.com:asias/linux-kvm.git blk.vhost-blk
 
 2) QEMU
 The latest vhost-blk userspace prototype for QEMU can be found here:
 g...@github.com:asias/qemu.git blk.vhost-blk
 
 Changes in v4:
 - Mark req-status as userspace pointer
 - Use __copy_to_user() instead of copy_to_user() in vhost_blk_set_status()
 - Add if (need_resched()) schedule() in blk thread
 - Kill vhost_blk_stop_vq() and move it into vhost_blk_stop()
 - Use vq_err() instead of pr_warn()
 - Fail un Unsupported request
 - Add flush in vhost_blk_set_features()
 
 Changes in v3:
 - Sending REQ_FLUSH bio instead of vfs_fsync, thanks Christoph!
 - Check file passed by user is a raw block device file
 
 Signed-off-by: Asias He as...@redhat.com

Cc linux-ker...@vger.kernel.org, Jens, hch, davem.
I wonder what do others think. Christoph - any comments?

Also, what's the best way to merge this?

To avoid conflicts when vhost APIs change
(as that will be for 3.9) I can take this on the vhost tree
which is normally merged by Dave Miller.
Is this OK with everyone? Any acks?

 ---
  drivers/vhost/Kconfig |   1 +
  drivers/vhost/Kconfig.blk |  10 +
  drivers/vhost/Makefile|   2 +
  drivers/vhost/blk.c   | 677 
 ++
  drivers/vhost/blk.h   |   8 +
  5 files changed, 698 insertions(+)
  create mode 100644 drivers/vhost/Kconfig.blk
  create mode 100644 drivers/vhost/blk.c
  create mode 100644 drivers/vhost/blk.h
 
 diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
 index 202bba6..acd8038 100644
 --- a/drivers/vhost/Kconfig
 +++ b/drivers/vhost/Kconfig
 @@ -11,4 +11,5 @@ config VHOST_NET
  
  if STAGING
  source drivers/vhost/Kconfig.tcm
 +source drivers/vhost/Kconfig.blk
  endif
 diff --git a/drivers/vhost/Kconfig.blk b/drivers/vhost/Kconfig.blk
 new file mode 100644
 index 000..ff8ab76
 --- /dev/null
 +++ b/drivers/vhost/Kconfig.blk
 @@ -0,0 +1,10 @@
 +config VHOST_BLK
 + tristate Host kernel accelerator for virtio blk (EXPERIMENTAL)
 + depends on BLOCK   EXPERIMENTAL  m
 + ---help---
 +   This kernel module can be loaded in host kernel to accelerate
 +   guest block with virtio_blk. Not to be confused with virtio_blk
 +   module itself which needs to be loaded in guest kernel.
 +
 +   To compile this driver as a module, choose M here: the module will
 +   be called vhost_blk.
 diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
 index a27b053..1a8a4a5 100644
 --- a/drivers/vhost/Makefile
 +++ b/drivers/vhost/Makefile
 @@ -2,3 +2,5 @@ obj-$(CONFIG_VHOST_NET) += vhost_net.o
  vhost_net-y := vhost.o net.o
  
  obj-$(CONFIG_TCM_VHOST) += tcm_vhost.o
 +obj-$(CONFIG_VHOST_BLK) += vhost_blk.o
 +vhost_blk-y := blk.o
 diff --git a/drivers/vhost/blk.c b/drivers/vhost/blk.c
 new file mode 100644
 index 000..5c2b790
 --- /dev/null
 +++ b/drivers/vhost/blk.c
 @@ -0,0 +1,677 @@
 +/*
 + * Copyright (C) 2011 Taobao, Inc.
 + * Author: Liu Yuan tailai...@taobao.com
 + *
 + * Copyright (C) 2012 Red Hat, Inc.
 + * Author: Asias He as...@redhat.com
 + *
 + * This work is licensed under the terms of the GNU GPL, version 2.
 + *
 + * virtio-blk server in host kernel.
 + */
 +
 +#include linux/miscdevice.h
 +#include linux/module.h
 +#include linux/vhost.h
 +#include linux/virtio_blk.h
 +#include linux/mutex.h
 +#include linux/file.h
 +#include linux/kthread.h
 +#include linux/blkdev.h
 +
 +#include vhost.c
 +#include vhost.h
 +#include blk.h
 +
 +/* The block header is in the first and separate buffer. */
 +#define BLK_HDR  0
 +
 +static DEFINE_IDA(vhost_blk_index_ida);
 +
 +enum {
 + VHOST_BLK_VQ_REQ = 0,
 + VHOST_BLK_VQ_MAX = 1,
 +};
 

Re: [PATCH 1/1] vhost-blk: Add vhost-blk support v4

2012-11-07 Thread Rusty Russell
Asias He as...@redhat.com writes:
 vhost-blk is an in-kernel virito-blk device accelerator.

 Due to lack of proper in-kernel AIO interface, this version converts
 guest's I/O request to bio and use submit_bio() to submit I/O directly.
 So this version any supports raw block device as guest's disk image,
 e.g. /dev/sda, /dev/ram0. We can add file based image support to
 vhost-blk once we have in-kernel AIO interface. There are some work in
 progress for in-kernel AIO interface from Dave Kleikamp and Zach Brown:

http://marc.info/?l=linux-fsdevelm=133312234313122

OK, this generally looks quite neat.  There is one significant bug,
however:

 +/* The block header is in the first and separate buffer. */
 +#define BLK_HDR  0

You need to do a proper pull off the iovec; you can't simply assume
this.  I'm working on fixing qemu, too.

linux/drivers/vhost/net.c simply skips the header, you want something
which actually copies it from userspace:

/* Returns 0, -EFAULT or -EINVAL (too short) */
int copy_from_iovec_user(void *dst, size_t len, struct iovec *iov, int iov_nr);
int copy_to_iovec_user(struct iovec *iov, int iov_nr, const void *src, size_t 
len);

These consume the iov in place.  You could pass struct iovec **iov and
int * if you wanted to be really efficient (otherwise you have
zero-length iov entries at the front after you've pulled things off).

This goes away:
 + if (hdr-type == VIRTIO_BLK_T_IN || hdr-type == VIRTIO_BLK_T_GET_ID)
 + iov_nr = in - 1;
 + else
 + iov_nr = out - 1;

This becomes a simple assignment:

 + /* The block data buffer follows block header buffer */
 + req-iov= vq-iov[BLK_HDR + 1];

This one actually requires iteration, since you should handle the case
where the last iov is zero length:

 + /* The block status buffer follows block data buffer */
 + req-status = vq-iov[iov_nr + 1].iov_base;

This becomes copy_to_iovec_user:

 + case VIRTIO_BLK_T_GET_ID: {
 + char id[VIRTIO_BLK_ID_BYTES];
 + int len;
 +
 + ret = snprintf(id, VIRTIO_BLK_ID_BYTES,
 +vhost-blk%d, blk-index);
 + if (ret  0)
 + break;
 + len = ret;
 + ret = __copy_to_user(req-iov[0].iov_base, id, len);

This becomes copy_from_iovec_user:

 + if (unlikely(copy_from_user(hdr, vq-iov[BLK_HDR].iov_base,
 + sizeof(hdr {
 + vq_err(vq, Failed to get block header!\n);
 + vhost_discard_vq_desc(vq, 1);
 + break;
 + }

The rest looks OK, at a glance.

Thanks,
Rusty.
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH 1/1] vhost-blk: Add vhost-blk support v4

2012-11-07 Thread Asias He
Hello Rusty,

On Thu, Nov 8, 2012 at 7:47 AM, Rusty Russell ru...@rustcorp.com.au wrote:
 Asias He as...@redhat.com writes:
 vhost-blk is an in-kernel virito-blk device accelerator.

 Due to lack of proper in-kernel AIO interface, this version converts
 guest's I/O request to bio and use submit_bio() to submit I/O directly.
 So this version any supports raw block device as guest's disk image,
 e.g. /dev/sda, /dev/ram0. We can add file based image support to
 vhost-blk once we have in-kernel AIO interface. There are some work in
 progress for in-kernel AIO interface from Dave Kleikamp and Zach Brown:

http://marc.info/?l=linux-fsdevelm=133312234313122

 OK, this generally looks quite neat.  There is one significant bug,
 however:

 +/* The block header is in the first and separate buffer. */
 +#define BLK_HDR  0

 You need to do a proper pull off the iovec; you can't simply assume
 this.  I'm working on fixing qemu, too.

Yes.  I have changed the code to handle the buffer without assumption
about the layout already.
Just haven't sent out the new version. I will send it out after the kvm forum.

Cheers.

 linux/drivers/vhost/net.c simply skips the header, you want something
 which actually copies it from userspace:

 /* Returns 0, -EFAULT or -EINVAL (too short) */
 int copy_from_iovec_user(void *dst, size_t len, struct iovec *iov, int 
 iov_nr);
 int copy_to_iovec_user(struct iovec *iov, int iov_nr, const void *src, size_t 
 len);

 These consume the iov in place.  You could pass struct iovec **iov and
 int * if you wanted to be really efficient (otherwise you have
 zero-length iov entries at the front after you've pulled things off).

 This goes away:
 + if (hdr-type == VIRTIO_BLK_T_IN || hdr-type == VIRTIO_BLK_T_GET_ID)
 + iov_nr = in - 1;
 + else
 + iov_nr = out - 1;

 This becomes a simple assignment:

 + /* The block data buffer follows block header buffer */
 + req-iov= vq-iov[BLK_HDR + 1];

 This one actually requires iteration, since you should handle the case
 where the last iov is zero length:

 + /* The block status buffer follows block data buffer */
 + req-status = vq-iov[iov_nr + 1].iov_base;

 This becomes copy_to_iovec_user:

 + case VIRTIO_BLK_T_GET_ID: {
 + char id[VIRTIO_BLK_ID_BYTES];
 + int len;
 +
 + ret = snprintf(id, VIRTIO_BLK_ID_BYTES,
 +vhost-blk%d, blk-index);
 + if (ret  0)
 + break;
 + len = ret;
 + ret = __copy_to_user(req-iov[0].iov_base, id, len);

 This becomes copy_from_iovec_user:

 + if (unlikely(copy_from_user(hdr, vq-iov[BLK_HDR].iov_base,
 + sizeof(hdr {
 + vq_err(vq, Failed to get block header!\n);
 + vhost_discard_vq_desc(vq, 1);
 + break;
 + }

 The rest looks OK, at a glance.

 Thanks,
 Rusty.
 --
 To unsubscribe from this list: send the line unsubscribe kvm in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html

-- 
Asias He
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[PATCH 1/1] vhost-blk: Add vhost-blk support v4

2012-10-15 Thread Asias He
vhost-blk is an in-kernel virito-blk device accelerator.

Due to lack of proper in-kernel AIO interface, this version converts
guest's I/O request to bio and use submit_bio() to submit I/O directly.
So this version any supports raw block device as guest's disk image,
e.g. /dev/sda, /dev/ram0. We can add file based image support to
vhost-blk once we have in-kernel AIO interface. There are some work in
progress for in-kernel AIO interface from Dave Kleikamp and Zach Brown:

   http://marc.info/?l=linux-fsdevelm=133312234313122

Performance evaluation:
-
1) LKVM
Fio with libaio ioengine on Fusion IO device using kvm tool
IOPS   Before   After   Improvement
seq-read   107  121 +13.0%
seq-write  130  179 +37.6%
rnd-read   102  122 +19.6%
rnd-write  125  159 +27.0%

2) QEMU
Fio with libaio ioengine on Fusion IO device using QEMU
IOPS   Before   After   Improvement
seq-read   76   123 +61.8%
seq-write  139  173 +24.4%
rnd-read   73   120 +64.3%
rnd-write  75   156 +108.0%

Userspace bits:
-
1) LKVM
The latest vhost-blk userspace bits for kvm tool can be found here:
g...@github.com:asias/linux-kvm.git blk.vhost-blk

2) QEMU
The latest vhost-blk userspace prototype for QEMU can be found here:
g...@github.com:asias/qemu.git blk.vhost-blk

Changes in v4:
- Mark req-status as userspace pointer
- Use __copy_to_user() instead of copy_to_user() in vhost_blk_set_status()
- Add if (need_resched()) schedule() in blk thread
- Kill vhost_blk_stop_vq() and move it into vhost_blk_stop()
- Use vq_err() instead of pr_warn()
- Fail un Unsupported request
- Add flush in vhost_blk_set_features()

Changes in v3:
- Sending REQ_FLUSH bio instead of vfs_fsync, thanks Christoph!
- Check file passed by user is a raw block device file

Signed-off-by: Asias He as...@redhat.com
---
 drivers/vhost/Kconfig |   1 +
 drivers/vhost/Kconfig.blk |  10 +
 drivers/vhost/Makefile|   2 +
 drivers/vhost/blk.c   | 677 ++
 drivers/vhost/blk.h   |   8 +
 5 files changed, 698 insertions(+)
 create mode 100644 drivers/vhost/Kconfig.blk
 create mode 100644 drivers/vhost/blk.c
 create mode 100644 drivers/vhost/blk.h

diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index 202bba6..acd8038 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -11,4 +11,5 @@ config VHOST_NET
 
 if STAGING
 source drivers/vhost/Kconfig.tcm
+source drivers/vhost/Kconfig.blk
 endif
diff --git a/drivers/vhost/Kconfig.blk b/drivers/vhost/Kconfig.blk
new file mode 100644
index 000..ff8ab76
--- /dev/null
+++ b/drivers/vhost/Kconfig.blk
@@ -0,0 +1,10 @@
+config VHOST_BLK
+   tristate Host kernel accelerator for virtio blk (EXPERIMENTAL)
+   depends on BLOCK   EXPERIMENTAL  m
+   ---help---
+ This kernel module can be loaded in host kernel to accelerate
+ guest block with virtio_blk. Not to be confused with virtio_blk
+ module itself which needs to be loaded in guest kernel.
+
+ To compile this driver as a module, choose M here: the module will
+ be called vhost_blk.
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index a27b053..1a8a4a5 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -2,3 +2,5 @@ obj-$(CONFIG_VHOST_NET) += vhost_net.o
 vhost_net-y := vhost.o net.o
 
 obj-$(CONFIG_TCM_VHOST) += tcm_vhost.o
+obj-$(CONFIG_VHOST_BLK) += vhost_blk.o
+vhost_blk-y := blk.o
diff --git a/drivers/vhost/blk.c b/drivers/vhost/blk.c
new file mode 100644
index 000..5c2b790
--- /dev/null
+++ b/drivers/vhost/blk.c
@@ -0,0 +1,677 @@
+/*
+ * Copyright (C) 2011 Taobao, Inc.
+ * Author: Liu Yuan tailai...@taobao.com
+ *
+ * Copyright (C) 2012 Red Hat, Inc.
+ * Author: Asias He as...@redhat.com
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ * virtio-blk server in host kernel.
+ */
+
+#include linux/miscdevice.h
+#include linux/module.h
+#include linux/vhost.h
+#include linux/virtio_blk.h
+#include linux/mutex.h
+#include linux/file.h
+#include linux/kthread.h
+#include linux/blkdev.h
+
+#include vhost.c
+#include vhost.h
+#include blk.h
+
+/* The block header is in the first and separate buffer. */
+#define BLK_HDR0
+
+static DEFINE_IDA(vhost_blk_index_ida);
+
+enum {
+   VHOST_BLK_VQ_REQ = 0,
+   VHOST_BLK_VQ_MAX = 1,
+};
+
+struct req_page_list {
+   struct page **pages;
+   int pages_nr;
+};
+
+struct vhost_blk_req {
+   struct llist_node llnode;
+   struct req_page_list *pl;
+   struct vhost_blk *blk;
+
+   struct iovec *iov;
+   int iov_nr;
+
+   struct bio **bio;
+   atomic_t bio_nr;
+
+   sector_t sector;
+   int write;
+   u16 head;
+   long len;
+
+   u8 __user *status;
+};
+
+struct vhost_blk {
+   struct task_struct *host_kick;
+   struct