On Thu, Nov 15, 2012 at 3:19 PM, Stefan Hajnoczi <stefa...@redhat.com> wrote: > The virtio-blk-data-plane cannot access memory using the usual QEMU > functions since it executes outside the global mutex and the memory APIs > are this time are not thread-safe. > > This patch introduces a virtqueue module based on the kernel's vhost > vring code. The trick is that we map guest memory ahead of time and > access it cheaply outside the global mutex. > > Once the hardware emulation code can execute outside the global mutex it > will be possible to drop this code. > > Signed-off-by: Stefan Hajnoczi <stefa...@redhat.com> > --- > hw/Makefile.objs | 2 +- > hw/dataplane/Makefile.objs | 3 + > hw/dataplane/vring.c | 321 > +++++++++++++++++++++++++++++++++++++++++++++ > hw/dataplane/vring.h | 54 ++++++++ > trace-events | 3 + > 5 files changed, 382 insertions(+), 1 deletion(-) > create mode 100644 hw/dataplane/Makefile.objs > create mode 100644 hw/dataplane/vring.c > create mode 100644 hw/dataplane/vring.h > > diff --git a/hw/Makefile.objs b/hw/Makefile.objs > index af4ab0c..da8ef0c 100644 > --- a/hw/Makefile.objs > +++ b/hw/Makefile.objs > @@ -1,4 +1,4 @@ > -common-obj-y = usb/ ide/ > +common-obj-y = usb/ ide/ dataplane/ > common-obj-y += loader.o > common-obj-$(CONFIG_VIRTIO) += virtio-console.o > common-obj-$(CONFIG_VIRTIO_PCI) += virtio-pci.o > diff --git a/hw/dataplane/Makefile.objs b/hw/dataplane/Makefile.objs > new file mode 100644 > index 0000000..b58544f > --- /dev/null > +++ b/hw/dataplane/Makefile.objs > @@ -0,0 +1,3 @@ > +ifeq ($(CONFIG_VIRTIO), y) > +common-obj-$(CONFIG_VIRTIO_BLK_DATA_PLANE) += vring.o > +endif > diff --git a/hw/dataplane/vring.c b/hw/dataplane/vring.c > new file mode 100644 > index 0000000..6aacce8 > --- /dev/null > +++ b/hw/dataplane/vring.c > @@ -0,0 +1,321 @@ > +/* Copyright 2012 Red Hat, Inc. > + * Copyright IBM, Corp. 2012 > + * > + * Based on Linux vhost code: > + * Copyright (C) 2009 Red Hat, Inc. > + * Copyright (C) 2006 Rusty Russell IBM Corporation > + * > + * Author: Michael S. Tsirkin <m...@redhat.com> > + * Stefan Hajnoczi <stefa...@redhat.com> > + * > + * Inspiration, some code, and most witty comments come from > + * Documentation/virtual/lguest/lguest.c, by Rusty Russell > + * > + * This work is licensed under the terms of the GNU GPL, version 2. > + */ > + > +#include "trace.h" > +#include "hw/dataplane/vring.h" > + > +/* Map target physical address to host address > + */ > +static inline void *phys_to_host(Vring *vring, hwaddr phys) > +{ > + /* Adjust for 3.6-4 GB PCI memory range */ > + if (phys >= 0x100000000) { > + phys -= 0x100000000 - 0xe0000000; > + } else if (phys >= 0xe0000000) { > + fprintf(stderr, "phys_to_host bad physical address in " > + "PCI range %#lx\n", phys); > + exit(1);
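(Just checking I read the remapping right: guest addresses at or above 4 GB are shifted down by 0x20000000, i.e. the size of the 512 MB hole below 4 GB, so for example guest-physical 0x100000000 ends up at host offset 0xe0000000, while anything that falls inside the hole itself takes this error path.)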
Exiting is rather drastic. Is this guest's error or QEMU's? > + } > + return vring->phys_mem_zero_host_ptr + phys; > +} > + > +/* Setup for cheap target physical to host address conversion > + * > + * This is a hack for direct access to guest memory, we're not really allowed > + * to do this. > + */ > +static void setup_phys_to_host(Vring *vring) > +{ > + hwaddr len = 4096; /* RAM is really much larger but we cheat */ > + vring->phys_mem_zero_host_ptr = cpu_physical_memory_map(0, &len, 0); > + if (!vring->phys_mem_zero_host_ptr) { > + fprintf(stderr, "setup_phys_to_host failed\n"); > + exit(1); > + } > +} > + > +/* Map the guest's vring to host memory > + * > + * This is not allowed but we know the ring won't move. > + */ > +void vring_setup(Vring *vring, VirtIODevice *vdev, int n) > +{ > + setup_phys_to_host(vring); > + > + vring_init(&vring->vr, virtio_queue_get_num(vdev, n), > + phys_to_host(vring, virtio_queue_get_ring_addr(vdev, n)), > 4096); > + > + vring->last_avail_idx = 0; > + vring->last_used_idx = 0; > + vring->signalled_used = 0; > + vring->signalled_used_valid = false; > + > + trace_vring_setup(virtio_queue_get_ring_addr(vdev, n), > + vring->vr.desc, vring->vr.avail, vring->vr.used); > +} > + > +/* Toggle guest->host notifies */ > +void vring_set_notification(VirtIODevice *vdev, Vring *vring, bool enable) > +{ > + if (vdev->guest_features & (1 << VIRTIO_RING_F_EVENT_IDX)) { > + if (enable) { > + vring_avail_event(&vring->vr) = vring->vr.avail->idx; > + } > + } else if (enable) { > + vring->vr.used->flags &= ~VRING_USED_F_NO_NOTIFY; > + } else { > + vring->vr.used->flags |= VRING_USED_F_NO_NOTIFY; > + } > +} > + > +/* This is stolen from linux/drivers/vhost/vhost.c:vhost_notify() */ > +bool vring_should_notify(VirtIODevice *vdev, Vring *vring) > +{ > + uint16_t old, new; > + bool v; > + /* Flush out used index updates. This is paired > + * with the barrier that the Guest executes when enabling > + * interrupts. */ > + smp_mb(); > + > + if ((vdev->guest_features & VIRTIO_F_NOTIFY_ON_EMPTY) && > + unlikely(vring->vr.avail->idx == vring->last_avail_idx)) { > + return true; > + } > + > + if (!(vdev->guest_features & VIRTIO_RING_F_EVENT_IDX)) { > + return !(vring->vr.avail->flags & VRING_AVAIL_F_NO_INTERRUPT); > + } > + old = vring->signalled_used; > + v = vring->signalled_used_valid; > + new = vring->signalled_used = vring->last_used_idx; > + vring->signalled_used_valid = true; > + > + if (unlikely(!v)) { > + return true; > + } > + > + return vring_need_event(vring_used_event(&vring->vr), new, old); > +} > + > +/* This is stolen from linux-2.6/drivers/vhost/vhost.c. */ > +static bool get_indirect(Vring *vring, > + struct iovec iov[], struct iovec *iov_end, > + unsigned int *out_num, unsigned int *in_num, > + struct vring_desc *indirect) > +{ > + struct vring_desc desc; > + unsigned int i = 0, count, found = 0; > + > + /* Sanity check */ > + if (unlikely(indirect->len % sizeof desc)) { > + fprintf(stderr, "Invalid length in indirect descriptor: " > + "len 0x%llx not multiple of 0x%zx\n", > + (unsigned long long)indirect->len, > + sizeof desc); > + exit(1); > + } > + > + count = indirect->len / sizeof desc; > + /* Buffers are chained via a 16 bit next field, so > + * we can have at most 2^16 of these. 
*/ > + if (unlikely(count > USHRT_MAX + 1)) { > + fprintf(stderr, "Indirect buffer length too big: %d\n", > + indirect->len); > + exit(1); > + } > + > + /* Point to translate indirect desc chain */ > + indirect = phys_to_host(vring, indirect->addr); > + > + /* We will use the result as an address to read from, so most > + * architectures only need a compiler barrier here. */ > + barrier(); /* read_barrier_depends(); */ > + > + do { > + if (unlikely(++found > count)) { > + fprintf(stderr, "Loop detected: last one at %u " > + "indirect size %u\n", > + i, count); > + exit(1); > + } > + > + desc = *indirect++; > + if (unlikely(desc.flags & VRING_DESC_F_INDIRECT)) { > + fprintf(stderr, "Nested indirect descriptor\n"); > + exit(1); > + } > + > + /* Stop for now if there are not enough iovecs available. */ > + if (iov >= iov_end) { > + return false; > + } > + > + iov->iov_base = phys_to_host(vring, desc.addr); > + iov->iov_len = desc.len; > + iov++; > + > + /* If this is an input descriptor, increment that count. */ > + if (desc.flags & VRING_DESC_F_WRITE) { > + *in_num += 1; > + } else { > + /* If it's an output descriptor, they're all supposed > + * to come before any input descriptors. */ > + if (unlikely(*in_num)) { > + fprintf(stderr, "Indirect descriptor " > + "has out after in: idx %d\n", i); > + exit(1); > + } > + *out_num += 1; > + } > + i = desc.next; > + } while (desc.flags & VRING_DESC_F_NEXT); > + return true; > +} > + > +/* This looks in the virtqueue and for the first available buffer, and > converts > + * it to an iovec for convenient access. Since descriptors consist of some > + * number of output then some number of input descriptors, it's actually two > + * iovecs, but we pack them into one and note how many of each there were. > + * > + * This function returns the descriptor number found, or vq->num (which is > + * never a valid descriptor number) if none was found. A negative code is > + * returned on error. > + * > + * Stolen from linux-2.6/drivers/vhost/vhost.c. > + */ > +int vring_pop(VirtIODevice *vdev, Vring *vring, > + struct iovec iov[], struct iovec *iov_end, > + unsigned int *out_num, unsigned int *in_num) > +{ > + struct vring_desc desc; > + unsigned int i, head, found = 0, num = vring->vr.num; > + __u16 avail_idx, last_avail_idx; Please use uint16_t in QEMU code. > + > + /* Check it isn't doing very strange things with descriptor numbers. */ > + last_avail_idx = vring->last_avail_idx; > + avail_idx = vring->vr.avail->idx; > + > + if (unlikely((__u16)(avail_idx - last_avail_idx) > num)) { > + fprintf(stderr, "Guest moved used index from %u to %u\n", > + last_avail_idx, avail_idx); > + exit(1); > + } > + > + /* If there's nothing new since last we looked. */ > + if (avail_idx == last_avail_idx) { > + return -EAGAIN; > + } > + > + /* Only get avail ring entries after they have been exposed by guest. */ > + smp_rmb(); > + > + /* Grab the next descriptor number they're advertising, and increment > + * the index we've seen. */ > + head = vring->vr.avail->ring[last_avail_idx % num]; > + > + /* If their number is silly, that's an error. */ > + if (unlikely(head >= num)) { > + fprintf(stderr, "Guest says index %u > %u is available\n", > + head, num); > + exit(1); > + } > + > + if (vdev->guest_features & (1 << VIRTIO_RING_F_EVENT_IDX)) { > + vring_avail_event(&vring->vr) = vring->vr.avail->idx; > + } > + > + /* When we start there are none of either input nor output. 
*/ > + *out_num = *in_num = 0; > + > + i = head; > + do { > + if (unlikely(i >= num)) { > + fprintf(stderr, "Desc index is %u > %u, head = %u\n", > + i, num, head); > + exit(1); > + } > + if (unlikely(++found > num)) { > + fprintf(stderr, "Loop detected: last one at %u " > + "vq size %u head %u\n", > + i, num, head); > + exit(1); > + } > + desc = vring->vr.desc[i]; > + if (desc.flags & VRING_DESC_F_INDIRECT) { > + if (!get_indirect(vring, iov, iov_end, out_num, in_num, &desc)) { > + return -ENOBUFS; /* not enough iovecs, stop for now */ > + } > + continue; > + } > + > + /* If there are not enough iovecs left, stop for now. The caller > + * should check if there are more descs available once they have > dealt > + * with the current set. > + */ > + if (iov >= iov_end) { > + return -ENOBUFS; > + } > + > + iov->iov_base = phys_to_host(vring, desc.addr); > + iov->iov_len = desc.len; > + iov++; > + > + if (desc.flags & VRING_DESC_F_WRITE) { > + /* If this is an input descriptor, > + * increment that count. */ > + *in_num += 1; > + } else { > + /* If it's an output descriptor, they're all supposed > + * to come before any input descriptors. */ > + if (unlikely(*in_num)) { > + fprintf(stderr, "Descriptor has out after in: " > + "idx %d\n", i); > + exit(1); > + } > + *out_num += 1; > + } > + i = desc.next; > + } while (desc.flags & VRING_DESC_F_NEXT); > + > + /* On success, increment avail index. */ > + vring->last_avail_idx++; > + return head; > +} > + > +/* After we've used one of their buffers, we tell them about it. > + * > + * Stolen from linux-2.6/drivers/vhost/vhost.c. > + */ > +void vring_push(Vring *vring, unsigned int head, int len) > +{ > + struct vring_used_elem *used; > + uint16_t new; > + > + /* The virtqueue contains a ring of used buffers. Get a pointer to the > + * next entry in that used ring. */ > + used = &vring->vr.used->ring[vring->last_used_idx % vring->vr.num]; > + used->id = head; > + used->len = len; > + > + /* Make sure buffer is written before we update index. */ > + smp_wmb(); > + > + new = vring->vr.used->idx = ++vring->last_used_idx; > + if (unlikely((int16_t)(new - vring->signalled_used) < (uint16_t)1)) { > + vring->signalled_used_valid = false; > + } > +} > diff --git a/hw/dataplane/vring.h b/hw/dataplane/vring.h > new file mode 100644 > index 0000000..42c2f0a > --- /dev/null > +++ b/hw/dataplane/vring.h > @@ -0,0 +1,54 @@ > +/* Copyright 2012 Red Hat, Inc. and/or its affiliates > + * Copyright IBM, Corp. 2012 > + * > + * Based on Linux vhost code: > + * Copyright (C) 2009 Red Hat, Inc. > + * Copyright (C) 2006 Rusty Russell IBM Corporation > + * > + * Author: Michael S. Tsirkin <m...@redhat.com> > + * Stefan Hajnoczi <stefa...@redhat.com> > + * > + * Inspiration, some code, and most witty comments come from > + * Documentation/virtual/lguest/lguest.c, by Rusty Russell > + * > + * This work is licensed under the terms of the GNU GPL, version 2. 
> + */ > + > +#ifndef VRING_H > +#define VRING_H > + > +#include <linux/virtio_ring.h> > +#include "qemu-common.h" > +#include "qemu-barrier.h" > +#include "memory.h" > +#include "hw/virtio.h" > + > +typedef struct { > + void *phys_mem_zero_host_ptr; /* host pointer to guest RAM */ > + struct vring vr; /* virtqueue vring mapped to host memory > */ > + __u16 last_avail_idx; /* last processed avail ring index */ > + __u16 last_used_idx; /* last processed used ring index */ > + uint16_t signalled_used; /* EVENT_IDX state */ > + bool signalled_used_valid; > +} Vring; > + > +static inline unsigned int vring_get_num(Vring *vring) > +{ > + return vring->vr.num; > +} > + > +/* Are there more descriptors available? */ > +static inline bool vring_more_avail(Vring *vring) > +{ > + return vring->vr.avail->idx != vring->last_avail_idx; > +} > + > +void vring_setup(Vring *vring, VirtIODevice *vdev, int n); > +void vring_set_notification(VirtIODevice *vdev, Vring *vring, bool enable); > +bool vring_should_notify(VirtIODevice *vdev, Vring *vring); > +int vring_pop(VirtIODevice *vdev, Vring *vring, > + struct iovec iov[], struct iovec *iov_end, > + unsigned int *out_num, unsigned int *in_num); > +void vring_push(Vring *vring, unsigned int head, int len); > + > +#endif /* VRING_H */ > diff --git a/trace-events b/trace-events > index e1a37cc..8eeab34 100644 > --- a/trace-events > +++ b/trace-events > @@ -98,6 +98,9 @@ virtio_blk_rw_complete(void *req, int ret) "req %p ret %d" > virtio_blk_handle_write(void *req, uint64_t sector, size_t nsectors) "req %p > sector %"PRIu64" nsectors %zu" > virtio_blk_handle_read(void *req, uint64_t sector, size_t nsectors) "req %p > sector %"PRIu64" nsectors %zu" > > +# hw/dataplane/vring.c > +vring_setup(uint64_t physical, void *desc, void *avail, void *used) "vring > physical %#"PRIx64" desc %p avail %p used %p" > + > # thread-pool.c > thread_pool_submit(void *req, void *opaque) "req %p opaque %p" > thread_pool_complete(void *req, void *opaque, int ret) "req %p opaque %p ret > %d" > -- > 1.8.0 > >
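Back to the exit(1) comment above: desc.addr comes straight from the guest, so a buggy or malicious driver can hit these paths at will, and failing the request seems friendlier than taking down the whole VM. A rough, untested sketch of what I mean (the NULL return and the -EFAULT value are only for illustration, not a tested change):

    static inline void *phys_to_host(Vring *vring, hwaddr phys)
    {
        /* Adjust for 3.6-4 GB PCI memory range */
        if (phys >= 0x100000000) {
            phys -= 0x100000000 - 0xe0000000;
        } else if (phys >= 0xe0000000) {
            fprintf(stderr, "phys_to_host: bad physical address in "
                    "PCI range %#llx\n", (unsigned long long)phys);
            return NULL;    /* report failure instead of exiting */
        }
        return vring->phys_mem_zero_host_ptr + phys;
    }

and then in vring_pop():

        iov->iov_base = phys_to_host(vring, desc.addr);
        if (!iov->iov_base) {
            return -EFAULT;    /* bad guest address, fail this request */
        }
        iov->iov_len = desc.len;
        iov++;

get_indirect() would need the same treatment, and since vring_pop() is already documented as returning a negative code on error, the callers should be able to cope without further changes.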