Re: [RFC v9 11/27] virtio-blk: Indirect vring and flush support

2012-07-18 Thread Michael S. Tsirkin
On Wed, Jul 18, 2012 at 04:07:38PM +0100, Stefan Hajnoczi wrote:
> RHEL6 and other new guest kernels use indirect vring descriptors to
> increase the number of requests that can be batched.  This fundamentally
> changes vring from a scheme that requires fixed resources to something
> more dynamic (although there is still an absolute maximum number of
> descriptors).  Cope with indirect vrings by taking on as many requests
> as we can in one go and then postponing the remaining requests until the
> first batch completes.
> 
> It would be possible to switch to dynamic resource management so iovec
> and iocb structs are malloced.  This would allow the entire ring to be
> processed even with indirect descriptors, but would probably hit a
> bottleneck when io_submit refuses to queue more requests.  Therefore,
> stick with the simpler scheme for now.
> 
> Unfortunately Linux AIO does not support asynchronous fsync/fdatasync on
> all files.  In particular, an O_DIRECT opened file on ext4 does not
> support Linux AIO fdsync.  Work around this by performing fdatasync()
> synchronously for now.
> 
> Signed-off-by: Stefan Hajnoczi 
> ---
>  hw/dataplane/ioq.h   |   18 -
>  hw/dataplane/vring.h |  103 +++---
>  hw/virtio-blk.c  |   75 ++--
>  3 files changed, 144 insertions(+), 52 deletions(-)
> 
> diff --git a/hw/dataplane/ioq.h b/hw/dataplane/ioq.h
> index 7200e87..d1545d6 100644
> --- a/hw/dataplane/ioq.h
> +++ b/hw/dataplane/ioq.h
> @@ -3,7 +3,7 @@
>  
>  typedef struct {
>  int fd; /* file descriptor */
> -unsigned int max_reqs;   /* max length of freelist and queue */
> +unsigned int max_reqs;  /* max length of freelist and queue */
>  
>  io_context_t io_ctx;/* Linux AIO context */
>  EventNotifier io_notifier;  /* Linux AIO eventfd */
> @@ -91,18 +91,16 @@ static struct iocb *ioq_rdwr(IOQueue *ioq, bool read, 
> struct iovec *iov, unsigne
>  return iocb;
>  }
>  
> -static struct iocb *ioq_fdsync(IOQueue *ioq)
> -{
> -struct iocb *iocb = ioq_get_iocb(ioq);
> -
> -io_prep_fdsync(iocb, ioq->fd);
> -io_set_eventfd(iocb, event_notifier_get_fd(&ioq->io_notifier));
> -return iocb;
> -}
> -
>  static int ioq_submit(IOQueue *ioq)
>  {
>  int rc = io_submit(ioq->io_ctx, ioq->queue_idx, ioq->queue);
> +if (unlikely(rc < 0)) {
> +unsigned int i;
> +fprintf(stderr, "io_submit io_ctx=%#lx nr=%d iovecs=%p\n", 
> (uint64_t)ioq->io_ctx, ioq->queue_idx, ioq->queue);
> +for (i = 0; i < ioq->queue_idx; i++) {
> +fprintf(stderr, "[%u] type=%#x fd=%d\n", i, 
> ioq->queue[i]->aio_lio_opcode, ioq->queue[i]->aio_fildes);
> +}
> +}
>  ioq->queue_idx = 0; /* reset */
>  return rc;
>  }
> diff --git a/hw/dataplane/vring.h b/hw/dataplane/vring.h
> index 70675e5..3eab4b4 100644
> --- a/hw/dataplane/vring.h
> +++ b/hw/dataplane/vring.h
> @@ -64,6 +64,86 @@ static void vring_setup(Vring *vring, VirtIODevice *vdev, 
> int n)
>  vring->vr.desc, vring->vr.avail, vring->vr.used);
>  }
>  
> +static bool vring_more_avail(Vring *vring)
> +{
> + return vring->vr.avail->idx != vring->last_avail_idx;
> +}
> +
> +/* This is stolen from linux-2.6/drivers/vhost/vhost.c. */

Since this is taken from vhost.c, please add the Red Hat copyright notice.

> +static bool get_indirect(Vring *vring,
> + struct iovec iov[], struct iovec *iov_end,
> + unsigned int *out_num, unsigned int *in_num,
> + struct vring_desc *indirect)
> +{
> + struct vring_desc desc;
> + unsigned int i = 0, count, found = 0;
> +
> + /* Sanity check */
> + if (unlikely(indirect->len % sizeof desc)) {
> + fprintf(stderr, "Invalid length in indirect descriptor: "
> +"len 0x%llx not multiple of 0x%zx\n",
> +(unsigned long long)indirect->len,
> +sizeof desc);
> + exit(1);
> + }
> +
> + count = indirect->len / sizeof desc;
> + /* Buffers are chained via a 16 bit next field, so
> +  * we can have at most 2^16 of these. */
> + if (unlikely(count > USHRT_MAX + 1)) {
> + fprintf(stderr, "Indirect buffer length too big: %d\n",
> +indirect->len);
> +exit(1);
> + }
> +
> +/* Point to translate indirect desc chain */
> +indirect = phys_to_host(vring, indirect->addr);
> +
> + /* We will use the result as an address to read from, so most
> +  * architectures only need a compiler barrier here. */
> + __sync_synchronize(); /* read_barrier_depends(); */
> +
> + do {
> + if (unlikely(++found > count)) {
> + fprintf(stderr, "Loop detected: last one at %u "
> +"indirect size %u\n",
> +i, count);
> + exit(1);
> + }
> +
> +   

Re: [RFC v9 11/27] virtio-blk: Indirect vring and flush support

2012-07-18 Thread Michael S. Tsirkin
On Wed, Jul 18, 2012 at 04:07:38PM +0100, Stefan Hajnoczi wrote:
> RHEL6 and other new guest kernels use indirect vring descriptors to
> increase the number of requests that can be batched.  This fundamentally
> changes vring from a scheme that requires fixed resources to something
> more dynamic (although there is still an absolute maximum number of
> descriptors).  Cope with indirect vrings by taking on as many requests
> as we can in one go and then postponing the remaining requests until the
> first batch completes.
> 
> It would be possible to switch to dynamic resource management so iovec
> and iocb structs are malloced.  This would allow the entire ring to be
> processed even with indirect descriptors, but would probably hit a
> bottleneck when io_submit refuses to queue more requests.  Therefore,
> stick with the simpler scheme for now.
> 
> Unfortunately Linux AIO does not support asynchronous fsync/fdatasync on
> all files.  In particular, an O_DIRECT opened file on ext4 does not
> support Linux AIO fdsync.  Work around this by performing fdatasync()
> synchronously for now.
> 
> Signed-off-by: Stefan Hajnoczi 
> ---
>  hw/dataplane/ioq.h   |   18 -
>  hw/dataplane/vring.h |  103 +++---
>  hw/virtio-blk.c  |   75 ++--
>  3 files changed, 144 insertions(+), 52 deletions(-)
> 
> diff --git a/hw/dataplane/ioq.h b/hw/dataplane/ioq.h
> index 7200e87..d1545d6 100644
> --- a/hw/dataplane/ioq.h
> +++ b/hw/dataplane/ioq.h
> @@ -3,7 +3,7 @@
>  
>  typedef struct {
>  int fd; /* file descriptor */
> -unsigned int max_reqs;   /* max length of freelist and queue */
> +unsigned int max_reqs;  /* max length of freelist and queue */
>  
>  io_context_t io_ctx;/* Linux AIO context */
>  EventNotifier io_notifier;  /* Linux AIO eventfd */
> @@ -91,18 +91,16 @@ static struct iocb *ioq_rdwr(IOQueue *ioq, bool read, 
> struct iovec *iov, unsigne
>  return iocb;
>  }
>  
> -static struct iocb *ioq_fdsync(IOQueue *ioq)
> -{
> -struct iocb *iocb = ioq_get_iocb(ioq);
> -
> -io_prep_fdsync(iocb, ioq->fd);
> -io_set_eventfd(iocb, event_notifier_get_fd(&ioq->io_notifier));
> -return iocb;
> -}
> -
>  static int ioq_submit(IOQueue *ioq)
>  {
>  int rc = io_submit(ioq->io_ctx, ioq->queue_idx, ioq->queue);
> +if (unlikely(rc < 0)) {
> +unsigned int i;
> +fprintf(stderr, "io_submit io_ctx=%#lx nr=%d iovecs=%p\n", 
> (uint64_t)ioq->io_ctx, ioq->queue_idx, ioq->queue);
> +for (i = 0; i < ioq->queue_idx; i++) {
> +fprintf(stderr, "[%u] type=%#x fd=%d\n", i, 
> ioq->queue[i]->aio_lio_opcode, ioq->queue[i]->aio_fildes);
> +}
> +}
>  ioq->queue_idx = 0; /* reset */
>  return rc;
>  }
> diff --git a/hw/dataplane/vring.h b/hw/dataplane/vring.h
> index 70675e5..3eab4b4 100644
> --- a/hw/dataplane/vring.h
> +++ b/hw/dataplane/vring.h
> @@ -64,6 +64,86 @@ static void vring_setup(Vring *vring, VirtIODevice *vdev, 
> int n)
>  vring->vr.desc, vring->vr.avail, vring->vr.used);
>  }
>  
> +static bool vring_more_avail(Vring *vring)
> +{
> + return vring->vr.avail->idx != vring->last_avail_idx;
> +}
> +
> +/* This is stolen from linux-2.6/drivers/vhost/vhost.c. */
> +static bool get_indirect(Vring *vring,
> + struct iovec iov[], struct iovec *iov_end,
> + unsigned int *out_num, unsigned int *in_num,
> + struct vring_desc *indirect)
> +{
> + struct vring_desc desc;
> + unsigned int i = 0, count, found = 0;
> +
> + /* Sanity check */
> + if (unlikely(indirect->len % sizeof desc)) {
> + fprintf(stderr, "Invalid length in indirect descriptor: "
> +"len 0x%llx not multiple of 0x%zx\n",
> +(unsigned long long)indirect->len,
> +sizeof desc);
> + exit(1);
> + }
> +
> + count = indirect->len / sizeof desc;
> + /* Buffers are chained via a 16 bit next field, so
> +  * we can have at most 2^16 of these. */
> + if (unlikely(count > USHRT_MAX + 1)) {
> + fprintf(stderr, "Indirect buffer length too big: %d\n",
> +indirect->len);
> +exit(1);
> + }
> +
> +/* Point to translate indirect desc chain */
> +indirect = phys_to_host(vring, indirect->addr);
> +
> + /* We will use the result as an address to read from, so most
> +  * architectures only need a compiler barrier here. */
> + __sync_synchronize(); /* read_barrier_depends(); */


QEMU has its own memory barrier primitives now; please use them instead of calling __sync_synchronize() directly.

> +
> + do {
> + if (unlikely(++found > count)) {
> + fprintf(stderr, "Loop detected: last one at %u "
> +"indirect size %u\n",
> +i, count);
> + exit(1);
> + }

[RFC v9 11/27] virtio-blk: Indirect vring and flush support

2012-07-18 Thread Stefan Hajnoczi
RHEL6 and other new guest kernels use indirect vring descriptors to
increase the number of requests that can be batched.  This fundamentally
changes vring from a scheme that requires fixed resources to something
more dynamic (although there is still an absolute maximum number of
descriptors).  Cope with indirect vrings by taking on as many requests
as we can in one go and then postponing the remaining requests until the
first batch completes.

It would be possible to switch to dynamic resource management so iovec
and iocb structs are malloced.  This would allow the entire ring to be
processed even with indirect descriptors, but would probably hit a
bottleneck when io_submit refuses to queue more requests.  Therefore,
stick with the simpler scheme for now.

Unfortunately Linux AIO does not support asynchronous fsync/fdatasync on
all files.  In particular, an O_DIRECT opened file on ext4 does not
support Linux AIO fdsync.  Work around this by performing fdatasync()
synchronously for now.

Signed-off-by: Stefan Hajnoczi 
---
 hw/dataplane/ioq.h   |   18 -
 hw/dataplane/vring.h |  103 +++---
 hw/virtio-blk.c  |   75 ++--
 3 files changed, 144 insertions(+), 52 deletions(-)

diff --git a/hw/dataplane/ioq.h b/hw/dataplane/ioq.h
index 7200e87..d1545d6 100644
--- a/hw/dataplane/ioq.h
+++ b/hw/dataplane/ioq.h
@@ -3,7 +3,7 @@
 
 typedef struct {
 int fd; /* file descriptor */
-unsigned int max_reqs;   /* max length of freelist and queue */
+unsigned int max_reqs;  /* max length of freelist and queue */
 
 io_context_t io_ctx;/* Linux AIO context */
 EventNotifier io_notifier;  /* Linux AIO eventfd */
@@ -91,18 +91,16 @@ static struct iocb *ioq_rdwr(IOQueue *ioq, bool read, 
struct iovec *iov, unsigne
 return iocb;
 }
 
-static struct iocb *ioq_fdsync(IOQueue *ioq)
-{
-struct iocb *iocb = ioq_get_iocb(ioq);
-
-io_prep_fdsync(iocb, ioq->fd);
-io_set_eventfd(iocb, event_notifier_get_fd(&ioq->io_notifier));
-return iocb;
-}
-
 static int ioq_submit(IOQueue *ioq)
 {
 int rc = io_submit(ioq->io_ctx, ioq->queue_idx, ioq->queue);
+if (unlikely(rc < 0)) {
+unsigned int i;
+fprintf(stderr, "io_submit io_ctx=%#lx nr=%d iovecs=%p\n", 
(uint64_t)ioq->io_ctx, ioq->queue_idx, ioq->queue);
+for (i = 0; i < ioq->queue_idx; i++) {
+fprintf(stderr, "[%u] type=%#x fd=%d\n", i, 
ioq->queue[i]->aio_lio_opcode, ioq->queue[i]->aio_fildes);
+}
+}
 ioq->queue_idx = 0; /* reset */
 return rc;
 }
diff --git a/hw/dataplane/vring.h b/hw/dataplane/vring.h
index 70675e5..3eab4b4 100644
--- a/hw/dataplane/vring.h
+++ b/hw/dataplane/vring.h
@@ -64,6 +64,86 @@ static void vring_setup(Vring *vring, VirtIODevice *vdev, 
int n)
 vring->vr.desc, vring->vr.avail, vring->vr.used);
 }
 
+static bool vring_more_avail(Vring *vring)
+{
+   return vring->vr.avail->idx != vring->last_avail_idx;
+}
+
+/* This is stolen from linux-2.6/drivers/vhost/vhost.c. */
+static bool get_indirect(Vring *vring,
+   struct iovec iov[], struct iovec *iov_end,
+   unsigned int *out_num, unsigned int *in_num,
+   struct vring_desc *indirect)
+{
+   struct vring_desc desc;
+   unsigned int i = 0, count, found = 0;
+
+   /* Sanity check */
+   if (unlikely(indirect->len % sizeof desc)) {
+   fprintf(stderr, "Invalid length in indirect descriptor: "
+  "len 0x%llx not multiple of 0x%zx\n",
+  (unsigned long long)indirect->len,
+  sizeof desc);
+   exit(1);
+   }
+
+   count = indirect->len / sizeof desc;
+   /* Buffers are chained via a 16 bit next field, so
+* we can have at most 2^16 of these. */
+   if (unlikely(count > USHRT_MAX + 1)) {
+   fprintf(stderr, "Indirect buffer length too big: %d\n",
+  indirect->len);
+exit(1);
+   }
+
+/* Point to translate indirect desc chain */
+indirect = phys_to_host(vring, indirect->addr);
+
+   /* We will use the result as an address to read from, so most
+* architectures only need a compiler barrier here. */
+   __sync_synchronize(); /* read_barrier_depends(); */
+
+   do {
+   if (unlikely(++found > count)) {
+   fprintf(stderr, "Loop detected: last one at %u "
+  "indirect size %u\n",
+  i, count);
+   exit(1);
+   }
+
+desc = *indirect++;
+   if (unlikely(desc.flags & VRING_DESC_F_INDIRECT)) {
+   fprintf(stderr, "Nested indirect descriptor\n");
+exit(1);
+   }
+
+/* Stop for now if there are not enough iovecs available. */
+if (iov