On Thu, Jul 19, 2012 at 08:05:42AM -0500, Anthony Liguori wrote:
> Asias He <as...@redhat.com> writes:
> 
> > vhost-blk is an in-kernel virtio-blk device accelerator.
> >
> > This patch is based on Liu Yuan's implementation with various
> > improvements and bug fixes. Notably, this patch makes guest notification
> > and host completion processing run in parallel, which gives about a 60%
> > performance improvement over Liu Yuan's implementation.
> >
> > Performance evaluation:
> > -----------------------------
> > The comparison is between kvm tool with the userspace implementation
> > and kvm tool with vhost-blk.
> >
> > 1) Fio with libaio ioengine on Fusion IO device
> > With bio-based IO path, sequential read/write, random read/write
> > IOPS boost         : 8.4%, 15.3%, 10.4%, 14.6%
> > Latency improvement: 8.5%, 15.4%, 10.4%, 15.1%
> >
> > 2) Fio with vsync ioengine on Fusion IO device
> > With bio-based IO path, sequential read/write, random read/write
> > IOPS boost         : 10.5%, 4.8%, 5.2%, 5.6%
> > Latency improvement: 11.4%, 5.0%, 5.2%, 5.8%
> >
> > Cc: Michael S. Tsirkin <m...@redhat.com>
> > Cc: linux-ker...@vger.kernel.org
> > Cc: kvm@vger.kernel.org
> > Cc: virtualizat...@lists.linux-foundation.org
> > Signed-off-by: Asias He <as...@redhat.com>
> > ---
> >  drivers/vhost/Kconfig  |   10 +
> >  drivers/vhost/Makefile |    2 +
> >  drivers/vhost/blk.c    |  600 ++++++++++++++++++++++++++++++++++++++++++++++++
> >  drivers/vhost/vhost.h  |    5 +
> >  include/linux/vhost.h  |    3 +
> >  5 files changed, 620 insertions(+)
> >  create mode 100644 drivers/vhost/blk.c
> >
> > diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
> > index c387067..fa071a8 100644
> > --- a/drivers/vhost/Kconfig
> > +++ b/drivers/vhost/Kconfig
> > @@ -16,4 +16,14 @@ config VHOST_NET
> >  
> >       To compile this driver as a module, choose M here: the module will
> >       be called vhost_net.
> > +config VHOST_BLK
> > +   tristate "Host kernel accelerator for virtio blk (EXPERIMENTAL)"
> > +   depends on VHOST && BLOCK && AIO && EVENTFD && EXPERIMENTAL
> > +   ---help---
> > +     This kernel module can be loaded in the host kernel to accelerate
> > +     guest block I/O with virtio_blk. Not to be confused with the
> > +     virtio_blk module itself, which must be loaded in the guest kernel.
> > +
> > +     To compile this driver as a module, choose M here: the module will
> > +     be called vhost_blk.
> >  
> > diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
> > index cd36885..aa461d5 100644
> > --- a/drivers/vhost/Makefile
> > +++ b/drivers/vhost/Makefile
> > @@ -1,4 +1,6 @@
> >  obj-$(CONFIG_VHOST)        += vhost.o
> >  obj-$(CONFIG_VHOST_NET) += vhost_net.o
> > +obj-$(CONFIG_VHOST_BLK) += vhost_blk.o
> >  
> >  vhost_net-y                := net.o
> > +vhost_blk-y                := blk.o
> > diff --git a/drivers/vhost/blk.c b/drivers/vhost/blk.c
> > new file mode 100644
> > index 0000000..6a94894
> > --- /dev/null
> > +++ b/drivers/vhost/blk.c
> > @@ -0,0 +1,600 @@
> > +/*
> > + * Copyright (C) 2011 Taobao, Inc.
> > + * Author: Liu Yuan <tailai...@taobao.com>
> > + *
> > + * Copyright (C) 2012 Red Hat, Inc.
> > + * Author: Asias He <as...@redhat.com>
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2.
> > + *
> > + * virtio-blk server in host kernel.
> > + */
> > +
> > +#include <linux/miscdevice.h>
> > +#include <linux/module.h>
> > +#include <linux/vhost.h>
> > +#include <linux/virtio_blk.h>
> > +#include <linux/eventfd.h>
> > +#include <linux/mutex.h>
> > +#include <linux/file.h>
> > +#include <linux/mmu_context.h>
> > +#include <linux/anon_inodes.h>
> > +#include <linux/kthread.h>
> > +#include <linux/blkdev.h>
> > +
> > +#include "vhost.h"
> > +
> > +#define BLK_HDR    0
> > +
> > +enum {
> > +   VHOST_BLK_VQ_REQ = 0,
> > +   VHOST_BLK_VQ_MAX = 1,
> > +};
> > +
> > +struct vhost_blk_req {
> > +   u16 head;
> > +   u8 *status;
> > +};
> > +
> > +struct vhost_blk {
> > +   struct task_struct *worker_host_kick;
> > +   struct task_struct *worker;
> > +   struct vhost_blk_req *reqs;
> > +   struct vhost_virtqueue vq;
> > +   struct eventfd_ctx *ectx;
> > +   struct io_event *ioevent;
> > +   struct kioctx *ioctx;
> > +   struct vhost_dev dev;
> > +   struct file *efile;
> > +   u64 ioevent_nr;
> > +   bool stop;
> > +};
> > +
> > +static inline int vhost_blk_read_events(struct vhost_blk *blk, long nr)
> > +{
> > +   mm_segment_t old_fs = get_fs();
> > +   int ret;
> > +
> > +   set_fs(KERNEL_DS);
> > +   ret = read_events(blk->ioctx, nr, nr, blk->ioevent, NULL);
> > +   set_fs(old_fs);
> > +
> > +   return ret;
> > +}
> > +
> > +static int vhost_blk_setup(struct vhost_blk *blk)
> > +{
> > +   struct kioctx *ctx;
> > +
> > +   if (blk->ioctx)
> > +           return 0;
> > +
> > +   blk->ioevent_nr = blk->vq.num;
> > +   ctx = ioctx_alloc(blk->ioevent_nr);
> > +   if (IS_ERR(ctx)) {
> > +           pr_err("Failed to ioctx_alloc");
> > +           return PTR_ERR(ctx);
> > +   }
> 
> Not that ioctx_alloc is very likely to fail in practice.  There's a
> fixed number of events that can be allocated, currently 0x10000.  If
> you have a ring queue size of 1024 (which is normal), that limits you
> to 64 vhost-blk devices.
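> (0x10000 events = 65536; at 1024 events per device, 65536 / 1024 = 64.)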
> 
> Realistically, I don't think you can do only AIO with vhost-blk because
> of this limitation (and many others).  It's necessary to be able to fall
> back to a thread pool because AIO cannot be relied upon.
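> 
> Roughly the shape I have in mind, as an untested sketch on top of this
> patch (the vhost_blk_sync_io() worker, req->work, and blk->wq are
> hypothetical, not in the patch):
> 
> 	ret = vhost_blk_io_submit(blk, file, req, iov, nr_vecs,
> 				  offset, opcode);
> 	if (ret == -EAGAIN) {
> 		/* No aio slots available: fall back to a worker
> 		 * thread that performs the I/O synchronously with
> 		 * vfs_readv()/vfs_writev() and then completes the
> 		 * request on the used ring. */
> 		INIT_WORK(&req->work, vhost_blk_sync_io);
> 		queue_work(blk->wq, &req->work);
> 		ret = 0;
> 	}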
> 
> > +   put_ioctx(ctx);
> > +   blk->ioctx = ctx;
> > +
> > +   blk->ioevent = kmalloc(sizeof(struct io_event) * blk->ioevent_nr,
> > +                          GFP_KERNEL);
> > +   if (!blk->ioevent) {
> > +           pr_err("Failed to allocate memory for io_events");
> > +           return -ENOMEM;
> > +   }
> > +
> > +   blk->reqs = kmalloc(sizeof(struct vhost_blk_req) * blk->ioevent_nr,
> > +                       GFP_KERNEL);
> > +   if (!blk->reqs) {
> > +           pr_err("Failed to allocate memory for vhost_blk_req");
> > +           return -ENOMEM;
> > +   }
> > +
> > +   return 0;
> > +}
> > +
> > +static inline int vhost_blk_set_status(struct vhost_blk *blk, u8 *statusp,
> > +                                  u8 status)
> > +{
> > +   if (copy_to_user(statusp, &status, sizeof(status))) {
> > +           vq_err(&blk->vq, "Failed to write status\n");
> > +           vhost_discard_vq_desc(&blk->vq, 1);
> > +           return -EFAULT;
> > +   }
> > +
> > +   return 0;
> > +}
> > +
> > +static void vhost_blk_enable_vq(struct vhost_blk *blk,
> > +                           struct vhost_virtqueue *vq)
> > +{
> > +   wake_up_process(blk->worker_host_kick);
> > +}
> > +
> > +static int vhost_blk_io_submit(struct vhost_blk *blk, struct file *file,
> > +                          struct vhost_blk_req *req,
> > +                          struct iovec *iov, u64 nr_vecs, loff_t offset,
> > +                          int opcode)
> > +{
> > +   struct kioctx *ioctx = blk->ioctx;
> > +   mm_segment_t oldfs = get_fs();
> > +   struct kiocb_batch batch;
> > +   struct blk_plug plug;
> > +   struct kiocb *iocb;
> > +   int ret;
> > +
> > +   if (!try_get_ioctx(ioctx)) {
> > +           pr_info("Failed to get ioctx");
> > +           return -EAGAIN;
> > +   }
> > +
> > +   atomic_long_inc_not_zero(&file->f_count);
> > +   eventfd_ctx_get(blk->ectx);
> > +
> > +   /* TODO: batch to 1 is not good! */
> > +   kiocb_batch_init(&batch, 1);
> > +   blk_start_plug(&plug);
> > +
> > +   iocb = aio_get_req(ioctx, &batch);
> > +   if (unlikely(!iocb)) {
> > +           ret = -EAGAIN;
> > +           goto out;
> > +   }
> > +
> > +   iocb->ki_filp   = file;
> > +   iocb->ki_pos    = offset;
> > +   iocb->ki_buf    = (void *)iov;
> > +   iocb->ki_left   = nr_vecs;
> > +   iocb->ki_nbytes = nr_vecs;
> > +   iocb->ki_opcode = opcode;
> > +   iocb->ki_obj.user = req;
> > +   iocb->ki_eventfd  = blk->ectx;
> > +
> > +   set_fs(KERNEL_DS);
> > +   ret = aio_setup_iocb(iocb, false);
> > +   set_fs(oldfs);
> > +   if (unlikely(ret))
> > +           goto out_put_iocb;
> > +
> > +   spin_lock_irq(&ioctx->ctx_lock);
> > +   if (unlikely(ioctx->dead)) {
> > +           spin_unlock_irq(&ioctx->ctx_lock);
> > +           ret = -EINVAL;
> > +           goto out_put_iocb;
> > +   }
> > +   aio_run_iocb(iocb);
> > +   spin_unlock_irq(&ioctx->ctx_lock);
> > +
> > +   aio_put_req(iocb);
> > +
> > +   blk_finish_plug(&plug);
> > +   kiocb_batch_free(ioctx, &batch);
> > +   put_ioctx(ioctx);
> > +
> > +   return ret;
> > +out_put_iocb:
> > +   aio_put_req(iocb); /* Drop extra ref to req */
> > +   aio_put_req(iocb); /* Drop I/O ref to req */
> > +out:
> > +   put_ioctx(ioctx);
> > +   return ret;
> > +}
> > +
> > +static void vhost_blk_flush(struct vhost_blk *blk)
> > +{
> > +   vhost_poll_flush(&blk->vq.poll);
> > +}
> > +
> > +static struct file *vhost_blk_stop_vq(struct vhost_blk *blk,
> > +                                 struct vhost_virtqueue *vq)
> > +{
> > +   struct file *file;
> > +
> > +   mutex_lock(&vq->mutex);
> > +   file = rcu_dereference_protected(vq->private_data,
> > +                   lockdep_is_held(&vq->mutex));
> > +   rcu_assign_pointer(vq->private_data, NULL);
> > +   mutex_unlock(&vq->mutex);
> > +
> > +   return file;
> > +
> > +}
> > +
> > +static inline void vhost_blk_stop(struct vhost_blk *blk, struct file **file)
> > +{
> > +
> > +   *file = vhost_blk_stop_vq(blk, &blk->vq);
> > +}
> > +
> > +/* Handle guest request */
> > +static int vhost_blk_do_req(struct vhost_virtqueue *vq,
> > +                       struct virtio_blk_outhdr *hdr,
> > +                       u16 head, u16 out, u16 in,
> > +                       struct file *file)
> > +{
> > +   struct vhost_blk *blk = container_of(vq->dev, struct vhost_blk, dev);
> > +   struct iovec *iov = &vq->iov[BLK_HDR + 1];
> > +   loff_t offset = hdr->sector << 9;
> > +   struct vhost_blk_req *req;
> > +   u64 nr_vecs;
> > +   int ret = 0;
> > +   u8 status;
> > +
> > +   if (hdr->type == VIRTIO_BLK_T_IN || hdr->type == VIRTIO_BLK_T_GET_ID)
> > +           nr_vecs = in - 1;
> > +   else
> > +           nr_vecs = out - 1;
> > +
> > +   req             = &blk->reqs[head];
> > +   req->head       = head;
> > +   req->status     = blk->vq.iov[nr_vecs + 1].iov_base;
> > +
> > +   switch (hdr->type) {
> > +   case VIRTIO_BLK_T_OUT:
> > +           ret = vhost_blk_io_submit(blk, file, req, iov, nr_vecs, offset,
> > +                                     IOCB_CMD_PWRITEV);
> > +           break;
> > +   case VIRTIO_BLK_T_IN:
> > +           ret = vhost_blk_io_submit(blk, file, req, iov, nr_vecs, offset,
> > +                                     IOCB_CMD_PREADV);
> > +           break;
> > +   case VIRTIO_BLK_T_FLUSH:
> > +           ret = vfs_fsync(file, 1);
> > +           status = ret < 0 ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
> > +           ret = vhost_blk_set_status(blk, req->status, status);
> > +           if (!ret)
> > +                   vhost_add_used_and_signal(&blk->dev, vq, head, ret);
> > +           break;
> > +   case VIRTIO_BLK_T_GET_ID:
> > +           /* TODO: need a real ID string */
> > +           ret = snprintf(vq->iov[BLK_HDR + 1].iov_base,
> > +                          VIRTIO_BLK_ID_BYTES, "VHOST-BLK-DISK");
> > +           status = ret < 0 ? VIRTIO_BLK_S_IOERR : VIRTIO_BLK_S_OK;
> > +           ret = vhost_blk_set_status(blk, req->status, status);
> > +           if (!ret)
> > +                   vhost_add_used_and_signal(&blk->dev, vq, head,
> > +                                             VIRTIO_BLK_ID_BYTES);
> > +           break;
> > +   default:
> > +           pr_warn("Unsupported request type %d\n", hdr->type);
> > +           vhost_discard_vq_desc(vq, 1);
> > +           ret = -EFAULT;
> > +           break;
> > +   }
> 
> There doesn't appear to be any error handling in the event that
> vhost_blk_io_submit fails.  It would appear that you leak the ring queue
> entry since you never push anything onto the used queue.
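> 
> For the PREADV/PWRITEV cases, something like this after the switch
> would at least complete the request with an error instead of leaking
> it (untested sketch, reusing the helpers already in the patch):
> 
> 	if (ret < 0) {
> 		/* Report the failure and return the descriptor to
> 		 * the guest rather than leaking it. */
> 		if (!vhost_blk_set_status(blk, req->status,
> 					  VIRTIO_BLK_S_IOERR))
> 			vhost_add_used_and_signal(&blk->dev, vq, head, 0);
> 	}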
> 
> I think you need to handle io_submit() failing with EAGAIN too.  Relying
> on min nr_events == queue_size seems like a dangerous assumption to me,
> particularly since queue_size tends to be very large and max_nr_events
> is a fixed-size global pool.
> 
> To properly handle EAGAIN, you effectively need to implement flow
> control and back off reading the virtqueue until you can submit requests 
> again.
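> 
> That is, when vhost_blk_io_submit() returns -EAGAIN, something along
> these lines (sketch only; the stalled flag and the re-kick from the
> completion side are hypothetical):
> 
> 	if (ret == -EAGAIN) {
> 		/* Out of aio slots: put the descriptor back and stop
> 		 * reading the virtqueue until completions drain; the
> 		 * completion path clears blk->stalled and kicks the
> 		 * handler again. */
> 		vhost_discard_vq_desc(vq, 1);
> 		blk->stalled = true;
> 		return -EAGAIN;
> 	}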
> 
> Of course, the million dollar question is why would using AIO in the
> kernel be faster than using AIO in userspace?
> 
> When using eventfd, there is no heavyweight exit on the notify path.
> It should just be the difference between scheduling a kernel thread vs.
> scheduling a userspace thread.

Actually, AIO from userspace involves a kernel thread too, doesn't it?

> There's simply no way that that's a 60% difference in performance.
> 
> Regards,
> 
> Anthony Liguori