[Qemu-devel] [RFC] vhost-blk implementation (v2)
Hi All, Here is the latest version of vhost-blk implementation. Major difference from my previous implementation is that, I now merge all contiguous requests (both read and write), before submitting them. This significantly improved IO performance. I am still collecting performance numbers, I will be posting in next few days. Comments ? Todo: - Address hch's comments on annotations - Implement per device read/write queues - Finish up error handling Thanks, Badari --- drivers/vhost/blk.c | 445 1 file changed, 445 insertions(+) Index: net-next/drivers/vhost/blk.c === --- /dev/null 1970-01-01 00:00:00.0 + +++ net-next/drivers/vhost/blk.c2010-04-06 16:38:03.563847905 -0400 @@ -0,0 +1,445 @@ + /* + * virtio-block server in host kernel. + * Inspired by vhost-net and shamelessly ripped code from it :) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vhost.h" + +#define VHOST_BLK_VQ_MAX 1 +#define SECTOR_SHIFT 9 + +struct vhost_blk { + struct vhost_dev dev; + struct vhost_virtqueue vqs[VHOST_BLK_VQ_MAX]; + struct vhost_poll poll[VHOST_BLK_VQ_MAX]; +}; + +struct vhost_blk_io { + struct list_head list; + struct work_struct work; + struct vhost_blk *blk; + struct file *file; + int head; + uint32_t type; + uint32_t nvecs; + uint64_t sector; + uint64_t len; + struct iovec iov[0]; +}; + +static struct workqueue_struct *vblk_workqueue; +static LIST_HEAD(write_queue); +static LIST_HEAD(read_queue); + +static void handle_io_work(struct work_struct *work) +{ + struct vhost_blk_io *vbio, *entry; + struct vhost_virtqueue *vq; + struct vhost_blk *blk; + struct list_head single, *head, *node, *tmp; + + int i, need_free, ret = 0; + loff_t pos; + uint8_t status = 0; + + vbio = container_of(work, struct vhost_blk_io, work); + blk = vbio->blk; + vq = &blk->dev.vqs[0]; + pos = vbio->sector << 8; + + use_mm(blk->dev.mm); + if (vbio->type & VIRTIO_BLK_T_FLUSH) { + ret = vfs_fsync(vbio->file, 
vbio->file->f_path.dentry, 1); + } else if (vbio->type & VIRTIO_BLK_T_OUT) { + ret = vfs_writev(vbio->file, vbio->iov, vbio->nvecs, &pos); + } else { + ret = vfs_readv(vbio->file, vbio->iov, vbio->nvecs, &pos); + } + status = (ret < 0) ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK; + if (vbio->head != -1) { + INIT_LIST_HEAD(&single); + list_add(&vbio->list, &single); + head = &single; + need_free = 0; + } else { + head = &vbio->list; + need_free = 1; + } + list_for_each_entry(entry, head, list) { + copy_to_user(entry->iov[entry->nvecs].iov_base, &status, sizeof status); + } + mutex_lock(&vq->mutex); + list_for_each_safe(node, tmp, head) { + entry = list_entry(node, struct vhost_blk_io, list); + vhost_add_used_and_signal(&blk->dev, vq, entry->head, ret); + list_del(node); + kfree(entry); + } + mutex_unlock(&vq->mutex); + unuse_mm(blk->dev.mm); + if (need_free) + kfree(vbio); +} + +static struct vhost_blk_io *allocate_vbio(int nvecs) +{ + struct vhost_blk_io *vbio; + int size = sizeof(struct vhost_blk_io) + nvecs * sizeof(struct iovec); + vbio = kmalloc(size, GFP_KERNEL); + if (vbio) { + INIT_WORK(&vbio->work, handle_io_work); + INIT_LIST_HEAD(&vbio->list); + } + return vbio; +} + +static void merge_and_handoff_work(struct list_head *queue) +{ + struct vhost_blk_io *vbio, *entry; + int nvecs = 0; + int entries = 0; + + list_for_each_entry(entry, queue, list) { + nvecs += entry->nvecs; + entries++; + } + + if (entries == 1) { + vbio = list_first_entry(queue, struct vhost_blk_io, list); + list_del(&vbio->list); + queue_work(vblk_workqueue, &vbio->work); + return; + } + + vbio = allocate_vbio(nvecs); + if (!vbio) { + /* Unable to allocate memory - submit IOs individually */ + list_for_each_entry(vbio, queue, list) { + queue_work(vblk_workqueue, &vbio->work); + } + INIT_LIST_HEAD(queue); + return; + } + + entry = list_first_entry(queue, struct vhost_blk_io, list); + vbio->nvecs = nvecs; + vbio->blk = entry->blk; + vbio->file = entry->file; + vbio->type = entry->type; + 
vbio->sector = entry->sector; + vbio->head = -1; + v
[Qemu-devel] [RFC] vhost-blk implementation
Hi, Inspired by vhost-net implementation, I did initial prototype of vhost-blk to see if it provides any benefits over QEMU virtio-blk. I haven't handled all the error cases, fixed naming conventions etc., but the implementation is stable to play with. I tried not to deviate from vhost-net implementation where possible. NOTE: Only change I had to make to vhost core code is to increase VHOST_NET_MAX_SG to 130 (128+2) in vhost.h Performance: = I have done simple tests to see how it performs. I got very encouraging results on sequential read tests. But on sequential write tests, I see degrade over virtio-blk. I can't figure out and explain why. Can some one shed light on whats happening here ? Read Results: = Test does read of 84GB file from the host (through virtio). I unmount and mount the filesystem on the host to make sure there is nothing in the page cache.. with vhost-blk: # time dd if=/dev/vda of=/dev/null bs=128k iflag=direct 64+0 records in 64+0 records out 8388608 bytes (84 GB) copied, 126.135 seconds, 665 MB/s real2m6.137s user0m0.281s sys 0m14.725s without vhost-blk: (virtio) --- # time dd if=/dev/vda of=/dev/null bs=128k iflag=direct 64+0 records in 64+0 records out 8388608 bytes (84 GB) copied, 275.466 seconds, 305 MB/s real4m35.468s user0m0.373s sys 0m48.074s Write Results: == I see degraded IO performance when doing sequential IO write tests with vhost-blk compared to virtio-blk. # time dd of=/dev/vda if=/dev/zero bs=2M oflag=direct I get ~110MB/sec with virtio-blk, but I get only ~60MB/sec with vhost-blk. Wondering why ? Comments/flames ? Thanks, Badari vhost-blk is in-kernel accelerator for virtio-blk. At this time, this is a prototype based on virtio-net. Lots of error handling and clean up needs to be done. Read performance is pretty good over QEMU virtio-blk, but write performance is not anywhere close to QEMU virtio-blk. Why ? 
Signed-off-by: Badari Pulavarty --- drivers/vhost/blk.c | 242 1 file changed, 242 insertions(+) Index: net-next/drivers/vhost/blk.c === --- /dev/null 1970-01-01 00:00:00.0 + +++ net-next/drivers/vhost/blk.c2010-03-22 18:07:18.156584400 -0400 @@ -0,0 +1,242 @@ + /* + * virtio-block server in host kernel. + * Inspired by vhost-net and shamelessly ripped code from it :) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vhost.h" + +#define VHOST_BLK_VQ_MAX 1 + +struct vhost_blk { + struct vhost_dev dev; + struct vhost_virtqueue vqs[VHOST_BLK_VQ_MAX]; + struct vhost_poll poll[VHOST_BLK_VQ_MAX]; +}; + +static int do_handle_io(struct file *file, uint32_t type, uint64_t sector, + struct iovec *iov, int in) +{ + loff_t pos = sector << 8; + int ret = 0; + + if (type & VIRTIO_BLK_T_FLUSH) { + ret = vfs_fsync(file, file->f_path.dentry, 1); + } else if (type & VIRTIO_BLK_T_OUT) { + ret = vfs_writev(file, iov, in, &pos); + } else { + ret = vfs_readv(file, iov, in, &pos); + } + return ret; +} + +static void handle_blk(struct vhost_blk *blk) +{ + struct vhost_virtqueue *vq = &blk->dev.vqs[0]; + unsigned head, out, in; + struct virtio_blk_outhdr hdr; + int r, nvecs; + uint8_t status = 0; + + use_mm(blk->dev.mm); + mutex_lock(&vq->mutex); + + vhost_disable_notify(vq); + + for (;;) { + head = vhost_get_vq_desc(&blk->dev, vq, vq->iov, +ARRAY_SIZE(vq->iov), +&out, &in, NULL, NULL); + if (head == vq->num) { + if (unlikely(vhost_enable_notify(vq))) { + vhost_disable_notify(vq); + continue; + } + break; + } + + BUG_ON(vq->iov[0].iov_len != 16); + + r = copy_from_user(&hdr, vq->iov[0].iov_base, sizeof hdr); + if (r < 0) { + printk("copy from user failed\n"); + vhost_discard_vq_desc(vq); + break; + } + + nvecs = out - 1; + if (hdr.type == VIRTIO_BLK_T_IN) + nvecs = in - 1; + + r = do_handle_io(vq->private_data, hdr.type, hdr.sector, &vq->iov[1], nvecs); + status = (r < 0) ? 
VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK; + + nvecs++; + BUG_ON(vq->iov[nvecs].iov_len != 1); + + if (copy_to_user(vq->iov[nvecs].iov_base, &status, sizeof status) < 0) {