On Thu, Nov 15, 2012 at 3:19 PM, Stefan Hajnoczi <stefa...@redhat.com> wrote:
> The virtio-blk-data-plane cannot access memory using the usual QEMU
> functions since it executes outside the global mutex and the memory APIs
> are this time are not thread-safe.
>
> This patch introduces a virtqueue module based on the kernel's vhost
> vring code.  The trick is that we map guest memory ahead of time and
> access it cheaply outside the global mutex.
>
> Once the hardware emulation code can execute outside the global mutex it
> will be possible to drop this code.
>
> Signed-off-by: Stefan Hajnoczi <stefa...@redhat.com>
> ---
>  hw/Makefile.objs           |   2 +-
>  hw/dataplane/Makefile.objs |   3 +
>  hw/dataplane/vring.c       | 321 
> +++++++++++++++++++++++++++++++++++++++++++++
>  hw/dataplane/vring.h       |  54 ++++++++
>  trace-events               |   3 +
>  5 files changed, 382 insertions(+), 1 deletion(-)
>  create mode 100644 hw/dataplane/Makefile.objs
>  create mode 100644 hw/dataplane/vring.c
>  create mode 100644 hw/dataplane/vring.h
>
> diff --git a/hw/Makefile.objs b/hw/Makefile.objs
> index af4ab0c..da8ef0c 100644
> --- a/hw/Makefile.objs
> +++ b/hw/Makefile.objs
> @@ -1,4 +1,4 @@
> -common-obj-y = usb/ ide/
> +common-obj-y = usb/ ide/ dataplane/
>  common-obj-y += loader.o
>  common-obj-$(CONFIG_VIRTIO) += virtio-console.o
>  common-obj-$(CONFIG_VIRTIO_PCI) += virtio-pci.o
> diff --git a/hw/dataplane/Makefile.objs b/hw/dataplane/Makefile.objs
> new file mode 100644
> index 0000000..b58544f
> --- /dev/null
> +++ b/hw/dataplane/Makefile.objs
> @@ -0,0 +1,3 @@
> +ifeq ($(CONFIG_VIRTIO), y)
> +common-obj-$(CONFIG_VIRTIO_BLK_DATA_PLANE) += vring.o
> +endif
> diff --git a/hw/dataplane/vring.c b/hw/dataplane/vring.c
> new file mode 100644
> index 0000000..6aacce8
> --- /dev/null
> +++ b/hw/dataplane/vring.c
> @@ -0,0 +1,321 @@
> +/* Copyright 2012 Red Hat, Inc.
> + * Copyright IBM, Corp. 2012
> + *
> + * Based on Linux vhost code:
> + * Copyright (C) 2009 Red Hat, Inc.
> + * Copyright (C) 2006 Rusty Russell IBM Corporation
> + *
> + * Author: Michael S. Tsirkin <m...@redhat.com>
> + *         Stefan Hajnoczi <stefa...@redhat.com>
> + *
> + * Inspiration, some code, and most witty comments come from
> + * Documentation/virtual/lguest/lguest.c, by Rusty Russell
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.
> + */
> +
> +#include "trace.h"
> +#include "hw/dataplane/vring.h"
> +
> +/* Map target physical address to host address
> + */
> +static inline void *phys_to_host(Vring *vring, hwaddr phys)
> +{
> +    /* Adjust for 3.6-4 GB PCI memory range */
> +    if (phys >= 0x100000000) {
> +        phys -= 0x100000000 - 0xe0000000;
> +    } else if (phys >= 0xe0000000) {
> +        fprintf(stderr, "phys_to_host bad physical address in "
> +                "PCI range %#lx\n", phys);
> +        exit(1);

Exiting is rather drastic. Is this guest's error or QEMU's?

> +    }
> +    return vring->phys_mem_zero_host_ptr + phys;
> +}
> +
> +/* Setup for cheap target physical to host address conversion
> + *
> + * This is a hack for direct access to guest memory, we're not really allowed
> + * to do this.
> + */
> +static void setup_phys_to_host(Vring *vring)
> +{
> +    hwaddr len = 4096; /* RAM is really much larger but we cheat */
> +    vring->phys_mem_zero_host_ptr = cpu_physical_memory_map(0, &len, 0);
> +    if (!vring->phys_mem_zero_host_ptr) {
> +        fprintf(stderr, "setup_phys_to_host failed\n");
> +        exit(1);
> +    }
> +}
> +
> +/* Map the guest's vring to host memory
> + *
> + * This is not allowed but we know the ring won't move.
> + */
> +void vring_setup(Vring *vring, VirtIODevice *vdev, int n)
> +{
> +    setup_phys_to_host(vring);
> +
> +    vring_init(&vring->vr, virtio_queue_get_num(vdev, n),
> +               phys_to_host(vring, virtio_queue_get_ring_addr(vdev, n)), 
> 4096);
> +
> +    vring->last_avail_idx = 0;
> +    vring->last_used_idx = 0;
> +    vring->signalled_used = 0;
> +    vring->signalled_used_valid = false;
> +
> +    trace_vring_setup(virtio_queue_get_ring_addr(vdev, n),
> +                      vring->vr.desc, vring->vr.avail, vring->vr.used);
> +}
> +
> +/* Toggle guest->host notifies */
> +void vring_set_notification(VirtIODevice *vdev, Vring *vring, bool enable)
> +{
> +    if (vdev->guest_features & (1 << VIRTIO_RING_F_EVENT_IDX)) {
> +        if (enable) {
> +            vring_avail_event(&vring->vr) = vring->vr.avail->idx;
> +        }
> +    } else if (enable) {
> +        vring->vr.used->flags &= ~VRING_USED_F_NO_NOTIFY;
> +    } else {
> +        vring->vr.used->flags |= VRING_USED_F_NO_NOTIFY;
> +    }
> +}
> +
> +/* This is stolen from linux/drivers/vhost/vhost.c:vhost_notify() */
> +bool vring_should_notify(VirtIODevice *vdev, Vring *vring)
> +{
> +    uint16_t old, new;
> +    bool v;
> +    /* Flush out used index updates. This is paired
> +     * with the barrier that the Guest executes when enabling
> +     * interrupts. */
> +    smp_mb();
> +
> +    if ((vdev->guest_features & VIRTIO_F_NOTIFY_ON_EMPTY) &&
> +        unlikely(vring->vr.avail->idx == vring->last_avail_idx)) {
> +        return true;
> +    }
> +
> +    if (!(vdev->guest_features & VIRTIO_RING_F_EVENT_IDX)) {
> +        return !(vring->vr.avail->flags & VRING_AVAIL_F_NO_INTERRUPT);
> +    }
> +    old = vring->signalled_used;
> +    v = vring->signalled_used_valid;
> +    new = vring->signalled_used = vring->last_used_idx;
> +    vring->signalled_used_valid = true;
> +
> +    if (unlikely(!v)) {
> +        return true;
> +    }
> +
> +    return vring_need_event(vring_used_event(&vring->vr), new, old);
> +}
> +
> +/* This is stolen from linux-2.6/drivers/vhost/vhost.c. */
> +static bool get_indirect(Vring *vring,
> +                         struct iovec iov[], struct iovec *iov_end,
> +                         unsigned int *out_num, unsigned int *in_num,
> +                         struct vring_desc *indirect)
> +{
> +    struct vring_desc desc;
> +    unsigned int i = 0, count, found = 0;
> +
> +    /* Sanity check */
> +    if (unlikely(indirect->len % sizeof desc)) {
> +        fprintf(stderr, "Invalid length in indirect descriptor: "
> +               "len 0x%llx not multiple of 0x%zx\n",
> +               (unsigned long long)indirect->len,
> +               sizeof desc);
> +        exit(1);
> +    }
> +
> +    count = indirect->len / sizeof desc;
> +    /* Buffers are chained via a 16 bit next field, so
> +     * we can have at most 2^16 of these. */
> +    if (unlikely(count > USHRT_MAX + 1)) {
> +        fprintf(stderr, "Indirect buffer length too big: %d\n",
> +               indirect->len);
> +        exit(1);
> +    }
> +
> +    /* Point to translate indirect desc chain */
> +    indirect = phys_to_host(vring, indirect->addr);
> +
> +    /* We will use the result as an address to read from, so most
> +     * architectures only need a compiler barrier here. */
> +    barrier(); /* read_barrier_depends(); */
> +
> +    do {
> +        if (unlikely(++found > count)) {
> +            fprintf(stderr, "Loop detected: last one at %u "
> +                   "indirect size %u\n",
> +                   i, count);
> +            exit(1);
> +        }
> +
> +        desc = *indirect++;
> +        if (unlikely(desc.flags & VRING_DESC_F_INDIRECT)) {
> +            fprintf(stderr, "Nested indirect descriptor\n");
> +            exit(1);
> +        }
> +
> +        /* Stop for now if there are not enough iovecs available. */
> +        if (iov >= iov_end) {
> +            return false;
> +        }
> +
> +        iov->iov_base = phys_to_host(vring, desc.addr);
> +        iov->iov_len  = desc.len;
> +        iov++;
> +
> +        /* If this is an input descriptor, increment that count. */
> +        if (desc.flags & VRING_DESC_F_WRITE) {
> +            *in_num += 1;
> +        } else {
> +            /* If it's an output descriptor, they're all supposed
> +             * to come before any input descriptors. */
> +            if (unlikely(*in_num)) {
> +                fprintf(stderr, "Indirect descriptor "
> +                       "has out after in: idx %d\n", i);
> +                exit(1);
> +            }
> +            *out_num += 1;
> +        }
> +        i = desc.next;
> +    } while (desc.flags & VRING_DESC_F_NEXT);
> +    return true;
> +}
> +
> +/* This looks in the virtqueue and for the first available buffer, and 
> converts
> + * it to an iovec for convenient access.  Since descriptors consist of some
> + * number of output then some number of input descriptors, it's actually two
> + * iovecs, but we pack them into one and note how many of each there were.
> + *
> + * This function returns the descriptor number found, or vq->num (which is
> + * never a valid descriptor number) if none was found.  A negative code is
> + * returned on error.
> + *
> + * Stolen from linux-2.6/drivers/vhost/vhost.c.
> + */
> +int vring_pop(VirtIODevice *vdev, Vring *vring,
> +              struct iovec iov[], struct iovec *iov_end,
> +              unsigned int *out_num, unsigned int *in_num)
> +{
> +    struct vring_desc desc;
> +    unsigned int i, head, found = 0, num = vring->vr.num;
> +    __u16 avail_idx, last_avail_idx;

Please use uint16_t in QEMU code.

> +
> +    /* Check it isn't doing very strange things with descriptor numbers. */
> +    last_avail_idx = vring->last_avail_idx;
> +    avail_idx = vring->vr.avail->idx;
> +
> +    if (unlikely((__u16)(avail_idx - last_avail_idx) > num)) {
> +        fprintf(stderr, "Guest moved used index from %u to %u\n",
> +                last_avail_idx, avail_idx);
> +        exit(1);
> +    }
> +
> +    /* If there's nothing new since last we looked. */
> +    if (avail_idx == last_avail_idx) {
> +        return -EAGAIN;
> +    }
> +
> +    /* Only get avail ring entries after they have been exposed by guest. */
> +    smp_rmb();
> +
> +    /* Grab the next descriptor number they're advertising, and increment
> +     * the index we've seen. */
> +    head = vring->vr.avail->ring[last_avail_idx % num];
> +
> +    /* If their number is silly, that's an error. */
> +    if (unlikely(head >= num)) {
> +        fprintf(stderr, "Guest says index %u > %u is available\n",
> +                head, num);
> +        exit(1);
> +    }
> +
> +    if (vdev->guest_features & (1 << VIRTIO_RING_F_EVENT_IDX)) {
> +        vring_avail_event(&vring->vr) = vring->vr.avail->idx;
> +    }
> +
> +    /* When we start there are none of either input nor output. */
> +    *out_num = *in_num = 0;
> +
> +    i = head;
> +    do {
> +        if (unlikely(i >= num)) {
> +            fprintf(stderr, "Desc index is %u > %u, head = %u\n",
> +                    i, num, head);
> +            exit(1);
> +        }
> +        if (unlikely(++found > num)) {
> +            fprintf(stderr, "Loop detected: last one at %u "
> +                    "vq size %u head %u\n",
> +                    i, num, head);
> +            exit(1);
> +        }
> +        desc = vring->vr.desc[i];
> +        if (desc.flags & VRING_DESC_F_INDIRECT) {
> +            if (!get_indirect(vring, iov, iov_end, out_num, in_num, &desc)) {
> +                return -ENOBUFS; /* not enough iovecs, stop for now */
> +            }
> +            continue;
> +        }
> +
> +        /* If there are not enough iovecs left, stop for now.  The caller
> +         * should check if there are more descs available once they have 
> dealt
> +         * with the current set.
> +         */
> +        if (iov >= iov_end) {
> +            return -ENOBUFS;
> +        }
> +
> +        iov->iov_base = phys_to_host(vring, desc.addr);
> +        iov->iov_len  = desc.len;
> +        iov++;
> +
> +        if (desc.flags & VRING_DESC_F_WRITE) {
> +            /* If this is an input descriptor,
> +             * increment that count. */
> +            *in_num += 1;
> +        } else {
> +            /* If it's an output descriptor, they're all supposed
> +             * to come before any input descriptors. */
> +            if (unlikely(*in_num)) {
> +                fprintf(stderr, "Descriptor has out after in: "
> +                        "idx %d\n", i);
> +                exit(1);
> +            }
> +            *out_num += 1;
> +        }
> +        i = desc.next;
> +    } while (desc.flags & VRING_DESC_F_NEXT);
> +
> +    /* On success, increment avail index. */
> +    vring->last_avail_idx++;
> +    return head;
> +}
> +
> +/* After we've used one of their buffers, we tell them about it.
> + *
> + * Stolen from linux-2.6/drivers/vhost/vhost.c.
> + */
> +void vring_push(Vring *vring, unsigned int head, int len)
> +{
> +    struct vring_used_elem *used;
> +    uint16_t new;
> +
> +    /* The virtqueue contains a ring of used buffers.  Get a pointer to the
> +     * next entry in that used ring. */
> +    used = &vring->vr.used->ring[vring->last_used_idx % vring->vr.num];
> +    used->id = head;
> +    used->len = len;
> +
> +    /* Make sure buffer is written before we update index. */
> +    smp_wmb();
> +
> +    new = vring->vr.used->idx = ++vring->last_used_idx;
> +    if (unlikely((int16_t)(new - vring->signalled_used) < (uint16_t)1)) {
> +        vring->signalled_used_valid = false;
> +    }
> +}
> diff --git a/hw/dataplane/vring.h b/hw/dataplane/vring.h
> new file mode 100644
> index 0000000..42c2f0a
> --- /dev/null
> +++ b/hw/dataplane/vring.h
> @@ -0,0 +1,54 @@
> +/* Copyright 2012 Red Hat, Inc. and/or its affiliates
> + * Copyright IBM, Corp. 2012
> + *
> + * Based on Linux vhost code:
> + * Copyright (C) 2009 Red Hat, Inc.
> + * Copyright (C) 2006 Rusty Russell IBM Corporation
> + *
> + * Author: Michael S. Tsirkin <m...@redhat.com>
> + *         Stefan Hajnoczi <stefa...@redhat.com>
> + *
> + * Inspiration, some code, and most witty comments come from
> + * Documentation/virtual/lguest/lguest.c, by Rusty Russell
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.
> + */
> +
> +#ifndef VRING_H
> +#define VRING_H
> +
> +#include <linux/virtio_ring.h>
> +#include "qemu-common.h"
> +#include "qemu-barrier.h"
> +#include "memory.h"
> +#include "hw/virtio.h"
> +
> +typedef struct {
> +    void *phys_mem_zero_host_ptr;   /* host pointer to guest RAM */
> +    struct vring vr;                /* virtqueue vring mapped to host memory 
> */
> +    __u16 last_avail_idx;           /* last processed avail ring index */
> +    __u16 last_used_idx;            /* last processed used ring index */
> +    uint16_t signalled_used;        /* EVENT_IDX state */
> +    bool signalled_used_valid;
> +} Vring;
> +
> +static inline unsigned int vring_get_num(Vring *vring)
> +{
> +    return vring->vr.num;
> +}
> +
> +/* Are there more descriptors available? */
> +static inline bool vring_more_avail(Vring *vring)
> +{
> +    return vring->vr.avail->idx != vring->last_avail_idx;
> +}
> +
> +void vring_setup(Vring *vring, VirtIODevice *vdev, int n);
> +void vring_set_notification(VirtIODevice *vdev, Vring *vring, bool enable);
> +bool vring_should_notify(VirtIODevice *vdev, Vring *vring);
> +int vring_pop(VirtIODevice *vdev, Vring *vring,
> +              struct iovec iov[], struct iovec *iov_end,
> +              unsigned int *out_num, unsigned int *in_num);
> +void vring_push(Vring *vring, unsigned int head, int len);
> +
> +#endif /* VRING_H */
> diff --git a/trace-events b/trace-events
> index e1a37cc..8eeab34 100644
> --- a/trace-events
> +++ b/trace-events
> @@ -98,6 +98,9 @@ virtio_blk_rw_complete(void *req, int ret) "req %p ret %d"
>  virtio_blk_handle_write(void *req, uint64_t sector, size_t nsectors) "req %p 
> sector %"PRIu64" nsectors %zu"
>  virtio_blk_handle_read(void *req, uint64_t sector, size_t nsectors) "req %p 
> sector %"PRIu64" nsectors %zu"
>
> +# hw/dataplane/vring.c
> +vring_setup(uint64_t physical, void *desc, void *avail, void *used) "vring 
> physical %#"PRIx64" desc %p avail %p used %p"
> +
>  # thread-pool.c
>  thread_pool_submit(void *req, void *opaque) "req %p opaque %p"
>  thread_pool_complete(void *req, void *opaque, int ret) "req %p opaque %p ret 
> %d"
> --
> 1.8.0
>
>

Reply via email to