[Qemu-devel] [RFC] vhost-blk implementation (v2)

2010-04-06 Thread Badari Pulavarty
Hi All,

Here is the latest version of the vhost-blk implementation.
The major difference from my previous version is that I now
merge all contiguous requests (both reads and writes) before
submitting them, which significantly improves IO performance.
I am still collecting performance numbers and will post them
in the next few days.
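
For illustration, the contiguity test boils down to something like
the sketch below (the helper name requests_contiguous is mine, not
from the patch):

	/* Two requests can be merged when they target the same backing
	 * file, are the same kind of operation, and the second starts
	 * exactly where the first ends (len is in bytes, sector in
	 * 512-byte units, i.e. SECTOR_SHIFT). */
	static int requests_contiguous(struct vhost_blk_io *prev,
				       struct vhost_blk_io *next)
	{
		return prev->file == next->file &&
		       prev->type == next->type &&
		       prev->sector + (prev->len >> SECTOR_SHIFT) == next->sector;
	}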

Comments?

Todo:
- Address hch's comments on annotations
- Implement per device read/write queues
- Finish up error handling

Thanks,
Badari

---
 drivers/vhost/blk.c |  445 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 445 insertions(+)

Index: net-next/drivers/vhost/blk.c
===================================================================
--- /dev/null   1970-01-01 00:00:00.0 +
+++ net-next/drivers/vhost/blk.c	2010-04-06 16:38:03.563847905 -0400
@@ -0,0 +1,445 @@
+ /*
+  * virtio-block server in host kernel.
+  * Inspired by vhost-net and shamelessly ripped code from it :)
+  */
+
+#include <linux/compat.h>
+#include <linux/eventfd.h>
+#include <linux/vhost.h>
+#include <linux/virtio_blk.h>
+#include <linux/mmu_context.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/workqueue.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+
+#include "vhost.h"
+
+#define VHOST_BLK_VQ_MAX 1
+#define SECTOR_SHIFT 9
+
+struct vhost_blk {
+   struct vhost_dev dev;
+   struct vhost_virtqueue vqs[VHOST_BLK_VQ_MAX];
+   struct vhost_poll poll[VHOST_BLK_VQ_MAX];
+};
+
+struct vhost_blk_io {
+   struct list_head list;
+   struct work_struct work;
+   struct vhost_blk *blk;
+   struct file *file;
+   int head;   /* descriptor head, or -1 for a merged request */
+   uint32_t type;
+   uint32_t nvecs;
+   uint64_t sector;
+   uint64_t len;
+   struct iovec iov[0];    /* data iovecs; for unmerged requests iov[nvecs] is the status vec */
+};
+
+static struct workqueue_struct *vblk_workqueue;
+static LIST_HEAD(write_queue);  /* pending writes, merged before submit */
+static LIST_HEAD(read_queue);   /* pending reads, merged before submit */
+
+static void handle_io_work(struct work_struct *work)
+{
+   struct vhost_blk_io *vbio, *entry;
+   struct vhost_virtqueue *vq;
+   struct vhost_blk *blk;
+   struct list_head single, *head, *node, *tmp;
+
+   int i, need_free, ret = 0;
+   loff_t pos;
+   uint8_t status = 0;
+
+   vbio = container_of(work, struct vhost_blk_io, work);
+   blk = vbio->blk;
+   vq = &blk->dev.vqs[0];
+   pos = vbio->sector << SECTOR_SHIFT;
+
+   use_mm(blk->dev.mm);
+   if (vbio->type & VIRTIO_BLK_T_FLUSH)  {
+   ret = vfs_fsync(vbio->file, vbio->file->f_path.dentry, 1);
+   } else if (vbio->type & VIRTIO_BLK_T_OUT) {
+   ret = vfs_writev(vbio->file, vbio->iov, vbio->nvecs, &pos);
+   } else {
+   ret = vfs_readv(vbio->file, vbio->iov, vbio->nvecs, &pos);
+   }
+   status = (ret < 0) ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
+   if (vbio->head != -1) {
+   /* single request: complete only this one */
+   INIT_LIST_HEAD(&single);
+   list_add(&vbio->list, &single);
+   head = &single;
+   need_free = 0;
+   } else {
+   /* merged request: its list holds the original requests */
+   head = &vbio->list;
+   need_free = 1;
+   }
+   list_for_each_entry(entry, head, list) {
copy_to_user(entry->iov[entry->nvecs].iov_base, &status, sizeof status);
+   }
+   mutex_lock(&vq->mutex);
+   list_for_each_safe(node, tmp, head) {
+   entry = list_entry(node, struct vhost_blk_io, list);
+   vhost_add_used_and_signal(&blk->dev, vq, entry->head, ret);
+   list_del(node);
+   kfree(entry);
+   }
+   mutex_unlock(&vq->mutex);
+   unuse_mm(blk->dev.mm);
+   if (need_free)
+   kfree(vbio);
+}
+
+static struct vhost_blk_io *allocate_vbio(int nvecs)
+{
+   struct vhost_blk_io *vbio;
+   int size = sizeof(struct vhost_blk_io) + nvecs * sizeof(struct iovec);
+   vbio = kmalloc(size, GFP_KERNEL);
+   if (vbio) {
+   INIT_WORK(&vbio->work, handle_io_work);
+   INIT_LIST_HEAD(&vbio->list);
+   }
+   return vbio;
+}
+
+static void merge_and_handoff_work(struct list_head *queue)
+{
+   struct vhost_blk_io *vbio, *entry;
+   int nvecs = 0;
+   int entries = 0;
+
+   list_for_each_entry(entry, queue, list) {
+   nvecs += entry->nvecs;
+   entries++;
+   }
+
+   if (entries == 1) {
+   vbio = list_first_entry(queue, struct vhost_blk_io, list);
+   list_del(&vbio->list);
+   queue_work(vblk_workqueue, &vbio->work);
+   return;
+   }
+
+   vbio = allocate_vbio(nvecs);
+   if (!vbio) {
+   /* Unable to allocate memory - submit IOs individually */
+   /* detach each request before queueing it, since the
+    * work handler may free it as soon as it runs */
+   list_for_each_entry_safe(vbio, entry, queue, list) {
+   list_del_init(&vbio->list);
+   queue_work(vblk_workqueue, &vbio->work);
+   }
+   return;
+   }
+
+   entry = list_first_entry(queue, struct vhost_blk_io, list);
+   vbio->nvecs = nvecs;
+   vbio->blk = entry->blk;
+   vbio->file = entry->file;
+   vbio->type = entry->type;
+   vbio->sector = entry->sector;
+   vbio->head = -1;
+   v
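
(The archived message is cut off above. A rough sketch of how the
merge step presumably continues, gathering each entry's iovecs into
the combined request and splicing the originals onto its list so
completion can signal every head; assumed, not from the patch:)

	/* Sketch, not original code: build the merged iovec array. */
	nvecs = 0;
	list_for_each_entry(entry, queue, list) {
		memcpy(&vbio->iov[nvecs], entry->iov,
		       entry->nvecs * sizeof(struct iovec));
		nvecs += entry->nvecs;
	}
	/* Move the original requests onto the merged vbio's list and
	 * hand the combined IO to the workqueue. */
	list_splice_init(queue, &vbio->list);
	queue_work(vblk_workqueue, &vbio->work);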

[Qemu-devel] [RFC] vhost-blk implementation

2010-03-23 Thread Badari Pulavarty
Hi,

Inspired by the vhost-net implementation, I did an initial prototype
of vhost-blk to see if it provides any benefit over QEMU virtio-blk.
I haven't handled all the error cases, fixed naming conventions, etc.,
but the implementation is stable enough to play with. I tried not to
deviate from the vhost-net implementation where possible.

NOTE: The only change I had to make to the vhost core code is to
increase VHOST_NET_MAX_SG to 130 (128+2) in vhost.h.
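
That is, roughly the following (the exact form of the original
definition in vhost.h may differ):

	/* 128 data segments plus 2 extra descriptors for the request
	 * header and the status byte. */
	#define VHOST_NET_MAX_SG 130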

Performance:
============

I have done simple tests to see how it performs, and I got very
encouraging results on sequential read tests. But on sequential
write tests I see a degradation compared to virtio-blk that I
can't explain. Can someone shed light on what's happening here?

Read Results:
=============
The test does a read of an 84GB file from the host (through virtio).
I unmount and mount the filesystem on the host to make sure there
is nothing in the page cache.


with vhost-blk:
---------------

# time dd if=/dev/vda of=/dev/null bs=128k iflag=direct
640000+0 records in
640000+0 records out
83886080000 bytes (84 GB) copied, 126.135 seconds, 665 MB/s

real	2m6.137s
user	0m0.281s
sys	0m14.725s

without vhost-blk: (virtio)
---------------------------

# time dd if=/dev/vda of=/dev/null bs=128k iflag=direct
640000+0 records in
640000+0 records out
83886080000 bytes (84 GB) copied, 275.466 seconds, 305 MB/s

real	4m35.468s
user	0m0.373s
sys	0m48.074s



Write Results:
==============

I see degraded IO performance on sequential write tests with
vhost-blk compared to virtio-blk.

# time dd of=/dev/vda if=/dev/zero bs=2M oflag=direct

I get ~110MB/sec with virtio-blk, but only ~60MB/sec with
vhost-blk. Wondering why?

Comments/flames?

Thanks,
Badari


vhost-blk is an in-kernel accelerator for virtio-blk.
At this time, this is a prototype based on vhost-net.
Lots of error handling and cleanup still need to be done.
Read performance is pretty good compared to QEMU virtio-blk, but
write performance is nowhere close to QEMU virtio-blk.
Why?

Signed-off-by: Badari Pulavarty 
---
 drivers/vhost/blk.c |  242 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 242 insertions(+)

Index: net-next/drivers/vhost/blk.c
===================================================================
--- /dev/null   1970-01-01 00:00:00.0 +
+++ net-next/drivers/vhost/blk.c	2010-03-22 18:07:18.156584400 -0400
@@ -0,0 +1,242 @@
+ /*
+  * virtio-block server in host kernel.
+  * Inspired by vhost-net and shamelessly ripped code from it :)
+  */
+
+#include <linux/compat.h>
+#include <linux/eventfd.h>
+#include <linux/vhost.h>
+#include <linux/virtio_blk.h>
+#include <linux/mmu_context.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+
+#include "vhost.h"
+
+#define VHOST_BLK_VQ_MAX 1
+
+struct vhost_blk {
+   struct vhost_dev dev;
+   struct vhost_virtqueue vqs[VHOST_BLK_VQ_MAX];
+   struct vhost_poll poll[VHOST_BLK_VQ_MAX];
+};
+
+static int do_handle_io(struct file *file, uint32_t type, uint64_t sector,
+   struct iovec *iov, int in)
+{
+   loff_t pos = sector << 9;   /* virtio sectors are 512 bytes */
+   int ret = 0;
+
+   if (type & VIRTIO_BLK_T_FLUSH)  {
+   ret = vfs_fsync(file, file->f_path.dentry, 1);
+   } else if (type & VIRTIO_BLK_T_OUT) {
+   ret = vfs_writev(file, iov, in, &pos);
+   } else {
+   ret = vfs_readv(file, iov, in, &pos);
+   }
+   return ret;
+}
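+
For context, each virtio-blk request on the ring is a 16-byte header,
followed by the data iovecs, followed by a one-byte status that the
host writes back. The header is struct virtio_blk_outhdr from
linux/virtio_blk.h:

	/* Request header; the BUG_ON in handle_blk() below checks that
	 * the guest's first descriptor is exactly this 16-byte struct. */
	struct virtio_blk_outhdr {
		__u32 type;     /* VIRTIO_BLK_T_IN / _OUT / _FLUSH ... */
		__u32 ioprio;   /* io priority hint */
		__u64 sector;   /* offset in 512-byte sectors */
	};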
+
+static void handle_blk(struct vhost_blk *blk)
+{
+   struct vhost_virtqueue *vq = &blk->dev.vqs[0];
+   unsigned head, out, in;
+   struct virtio_blk_outhdr hdr;
+   int r, nvecs;
+   uint8_t status = 0;
+
+   use_mm(blk->dev.mm);
+   mutex_lock(&vq->mutex);
+
+   vhost_disable_notify(vq);
+
+   for (;;) {
+   head = vhost_get_vq_desc(&blk->dev, vq, vq->iov,
+ARRAY_SIZE(vq->iov),
+&out, &in, NULL, NULL);
+   if (head == vq->num) {
+   if (unlikely(vhost_enable_notify(vq))) {
+   vhost_disable_notify(vq);
+   continue;
+   }
+   break;
+   }
+
+   BUG_ON(vq->iov[0].iov_len != 16);
+
+   /* copy_from_user returns the number of bytes not copied */
+   r = copy_from_user(&hdr, vq->iov[0].iov_base, sizeof hdr);
+   if (r) {
+   printk("copy from user failed\n");
+   vhost_discard_vq_desc(vq);
+   break;
+   }
+
+   nvecs = out - 1;
+   if (hdr.type == VIRTIO_BLK_T_IN)
+   nvecs = in - 1;
+
+   r = do_handle_io(vq->private_data, hdr.type, hdr.sector, &vq->iov[1], nvecs);
+   status = (r < 0) ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
+
+   nvecs++;
+   BUG_ON(vq->iov[nvecs].iov_len != 1);
+
+   if (copy_to_user(vq->iov[nvecs].iov_base, &status, sizeof status)) {
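
(The archive truncates the patch here. Following the vhost-net
pattern, the loop presumably finishes roughly like this; a sketch,
not the original text:)

			vhost_discard_vq_desc(vq);
			break;
		}
		/* report the completed descriptor back to the guest */
		vhost_add_used_and_signal(&blk->dev, vq, head, 0);
	}
	mutex_unlock(&vq->mutex);
	unuse_mm(blk->dev.mm);
}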