Virtqueue notify is currently handled synchronously in userspace virtio. This prevents the vcpu from executing guest code while hardware emulation code handles the notify.
On systems that support KVM, the ioeventfd mechanism can be used to make virtqueue notify a lightweight exit by deferring hardware emulation to the iothread and allowing the VM to continue execution. This model is similar to how vhost receives virtqueue notifies. The result of this change is improved performance for userspace virtio devices. Virtio-blk throughput increases especially for multithreaded scenarios and virtio-net transmit throughput increases substantially. Full numbers are below. This patch employs ioeventfd virtqueue notify for all virtio devices. Linux kernels pre-2.6.34 only allow for 6 ioeventfds per VM and care must be taken so that vhost-net, the other ioeventfd user in QEMU, is able to function. On such kernels ioeventfd virtqueue notify will not be used. Khoa Huynh <k...@us.ibm.com> collected the following data for virtio-blk with cache=none,aio=native: FFSB Test Threads Unmodified Patched (MB/s) (MB/s) Large file create 1 21.7 21.8 8 101.0 118.0 16 119.0 157.0 Sequential reads 1 21.9 23.2 8 114.0 139.0 16 143.0 178.0 Random reads 1 3.3 3.6 8 23.0 25.4 16 43.3 47.8 Random writes 1 22.2 23.0 8 93.1 111.6 16 110.5 132.0 Sridhar Samudrala <s...@us.ibm.com> collected the following data for virtio-net with 2.6.36-rc1 on the host and 2.6.34 on the guest. 
Guest to Host TCP_STREAM throughput(Mb/sec) ------------------------------------------- Msg Size vhost-net virtio-net virtio-net/ioeventfd 65536 12755 6430 7590 16384 8499 3084 5764 4096 4723 1578 3659 1024 1827 981 2060 Host to Guest TCP_STREAM throughput(Mb/sec) ------------------------------------------- Msg Size vhost-net virtio-net virtio-net/ioeventfd 65536 11156 5790 5853 16384 10787 5575 5691 4096 10452 5556 4277 1024 4437 3671 5277 Guest to Host TCP_RR latency(transactions/sec) ---------------------------------------------- Msg Size vhost-net virtio-net virtio-net/ioeventfd 1 9903 3459 3425 4096 7185 1931 1899 16384 6108 2102 1923 65536 3161 1610 1744 Signed-off-by: Stefan Hajnoczi <stefa...@linux.vnet.ibm.com> --- Small changes are required for qemu-kvm.git. I will send them once qemu.git has virtio-ioeventfd support. hw/vhost.c | 6 ++-- hw/virtio.c | 106 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ hw/virtio.h | 9 +---- kvm-all.c | 39 +++++++++++++++++++++ kvm-stub.c | 5 +++ kvm.h | 1 + 6 files changed, 156 insertions(+), 10 deletions(-) diff --git a/hw/vhost.c b/hw/vhost.c index 1b8624d..f127a07 100644 --- a/hw/vhost.c +++ b/hw/vhost.c @@ -517,7 +517,7 @@ static int vhost_virtqueue_init(struct vhost_dev *dev, goto fail_guest_notifier; } - r = vdev->binding->set_host_notifier(vdev->binding_opaque, idx, true); + r = virtio_set_host_notifier(vdev, idx, true); if (r < 0) { fprintf(stderr, "Error binding host notifier: %d\n", -r); goto fail_host_notifier; @@ -539,7 +539,7 @@ static int vhost_virtqueue_init(struct vhost_dev *dev, fail_call: fail_kick: - vdev->binding->set_host_notifier(vdev->binding_opaque, idx, false); + virtio_set_host_notifier(vdev, idx, false); fail_host_notifier: vdev->binding->set_guest_notifier(vdev->binding_opaque, idx, false); fail_guest_notifier: @@ -575,7 +575,7 @@ static void vhost_virtqueue_cleanup(struct vhost_dev *dev, } assert (r >= 0); - r = vdev->binding->set_host_notifier(vdev->binding_opaque, idx, false); + 
r = virtio_set_host_notifier(vdev, idx, false); if (r < 0) { fprintf(stderr, "vhost VQ %d host cleanup failed: %d\n", idx, r); fflush(stderr); diff --git a/hw/virtio.c b/hw/virtio.c index fbef788..f075b3a 100644 --- a/hw/virtio.c +++ b/hw/virtio.c @@ -16,6 +16,7 @@ #include "trace.h" #include "virtio.h" #include "sysemu.h" +#include "kvm.h" /* The alignment to use between consumer and producer parts of vring. * x86 pagesize again. */ @@ -77,6 +78,11 @@ struct VirtQueue VirtIODevice *vdev; EventNotifier guest_notifier; EventNotifier host_notifier; + enum { + HOST_NOTIFIER_DEASSIGNED, /* inactive */ + HOST_NOTIFIER_ASSIGNED, /* active */ + HOST_NOTIFIER_OFFLIMITS, /* active but outside our control */ + } host_notifier_state; }; /* virt queue functions */ @@ -453,6 +459,93 @@ void virtio_update_irq(VirtIODevice *vdev) virtio_notify_vector(vdev, VIRTIO_NO_VECTOR); } +/* Service virtqueue notify from a host notifier */ +static void virtio_read_host_notifier(void *opaque) +{ + VirtQueue *vq = opaque; + EventNotifier *notifier = virtio_queue_get_host_notifier(vq); + if (event_notifier_test_and_clear(notifier)) { + if (vq->vring.desc) { + vq->handle_output(vq->vdev, vq); + } + } +} + +/* Transition between host notifier states */ +static int virtio_set_host_notifier_state(VirtIODevice *vdev, int n, int state) +{ + VirtQueue *vq = &vdev->vq[n]; + EventNotifier *notifier = virtio_queue_get_host_notifier(vq); + int rc; + + if (!kvm_enabled()) { + return -ENOSYS; + } + + /* If the number of ioeventfds is limited, use them for vhost only */ + if (state == HOST_NOTIFIER_ASSIGNED && !kvm_has_many_iobus_devs()) { + state = HOST_NOTIFIER_DEASSIGNED; + } + + /* Ignore if no state change */ + if (vq->host_notifier_state == state) { + return 0; + } + + /* Disable read handler if transitioning away from assigned */ + if (vq->host_notifier_state == HOST_NOTIFIER_ASSIGNED) { + qemu_set_fd_handler(event_notifier_get_fd(notifier), NULL, NULL, NULL); + } + + /* Toggle host notifier if 
transitioning to or from deassigned */ + if (state == HOST_NOTIFIER_DEASSIGNED || + vq->host_notifier_state == HOST_NOTIFIER_DEASSIGNED) { + rc = vdev->binding->set_host_notifier(vdev->binding_opaque, n, + state != HOST_NOTIFIER_DEASSIGNED); + if (rc < 0) { + return rc; + } + } + + /* Enable read handler if transitioning to assigned */ + if (state == HOST_NOTIFIER_ASSIGNED) { + qemu_set_fd_handler(event_notifier_get_fd(notifier), + virtio_read_host_notifier, NULL, vq); + } + + vq->host_notifier_state = state; + return 0; +} + +/* Try to assign/deassign host notifiers for all virtqueues */ +static void virtio_set_host_notifiers(VirtIODevice *vdev, bool assigned) +{ + int state = assigned ? HOST_NOTIFIER_ASSIGNED : HOST_NOTIFIER_DEASSIGNED; + int i; + + if (!vdev->binding->set_host_notifier) { + return; + } + + for (i = 0; i < VIRTIO_PCI_QUEUE_MAX; i++) { + if (vdev->vq[i].host_notifier_state == HOST_NOTIFIER_OFFLIMITS) { + continue; + } + + if (!vdev->vq[i].vring.desc) { + continue; + } + + virtio_set_host_notifier_state(vdev, i, state); + } +} + +int virtio_set_host_notifier(VirtIODevice *vdev, int n, bool assigned) +{ + int state = assigned ? 
HOST_NOTIFIER_OFFLIMITS : HOST_NOTIFIER_ASSIGNED; + return virtio_set_host_notifier_state(vdev, n, state); +} + void virtio_reset(void *opaque) { VirtIODevice *vdev = opaque; @@ -467,6 +560,7 @@ void virtio_reset(void *opaque) vdev->isr = 0; vdev->config_vector = VIRTIO_NO_VECTOR; virtio_notify_vector(vdev, vdev->config_vector); + virtio_set_host_notifiers(vdev, false); for(i = 0; i < VIRTIO_PCI_QUEUE_MAX; i++) { vdev->vq[i].vring.desc = 0; @@ -592,6 +686,16 @@ void virtio_queue_set_vector(VirtIODevice *vdev, int n, uint16_t vector) vdev->vq[n].vector = vector; } +void virtio_set_status(VirtIODevice *vdev, uint8_t val) +{ + virtio_set_host_notifiers(vdev, val & VIRTIO_CONFIG_S_DRIVER_OK); + + if (vdev->set_status) { + vdev->set_status(vdev, val); + } + vdev->status = val; +} + VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size, void (*handle_output)(VirtIODevice *, VirtQueue *)) { @@ -719,6 +823,7 @@ int virtio_load(VirtIODevice *vdev, QEMUFile *f) } virtio_notify_vector(vdev, VIRTIO_NO_VECTOR); + virtio_set_host_notifiers(vdev, vdev->status & VIRTIO_CONFIG_S_DRIVER_OK); return 0; } @@ -746,6 +851,7 @@ VirtIODevice *virtio_common_init(const char *name, uint16_t device_id, for(i = 0; i < VIRTIO_PCI_QUEUE_MAX; i++) { vdev->vq[i].vector = VIRTIO_NO_VECTOR; vdev->vq[i].vdev = vdev; + vdev->vq[i].host_notifier_state = HOST_NOTIFIER_DEASSIGNED; } vdev->name = name; diff --git a/hw/virtio.h b/hw/virtio.h index 96514e6..d76157e 100644 --- a/hw/virtio.h +++ b/hw/virtio.h @@ -125,13 +125,7 @@ struct VirtIODevice uint16_t device_id; }; -static inline void virtio_set_status(VirtIODevice *vdev, uint8_t val) -{ - if (vdev->set_status) { - vdev->set_status(vdev, val); - } - vdev->status = val; -} +void virtio_set_status(VirtIODevice *vdev, uint8_t val); VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size, void (*handle_output)(VirtIODevice *, @@ -217,6 +211,7 @@ target_phys_addr_t virtio_queue_get_ring_size(VirtIODevice *vdev, int n); uint16_t 
virtio_queue_get_last_avail_idx(VirtIODevice *vdev, int n); void virtio_queue_set_last_avail_idx(VirtIODevice *vdev, int n, uint16_t idx); VirtQueue *virtio_get_queue(VirtIODevice *vdev, int n); +int virtio_set_host_notifier(VirtIODevice *vdev, int n, bool assigned); EventNotifier *virtio_queue_get_guest_notifier(VirtQueue *vq); EventNotifier *virtio_queue_get_host_notifier(VirtQueue *vq); void virtio_irq(VirtQueue *vq); diff --git a/kvm-all.c b/kvm-all.c index 1cc696f..2f09e34 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -24,6 +24,7 @@ #include "qemu-barrier.h" #include "sysemu.h" #include "hw/hw.h" +#include "hw/event_notifier.h" #include "gdbstub.h" #include "kvm.h" #include "bswap.h" @@ -72,6 +73,7 @@ struct KVMState int irqchip_in_kernel; int pit_in_kernel; int xsave, xcrs; + int many_iobus_devs; }; static KVMState *kvm_state; @@ -423,6 +425,36 @@ int kvm_check_extension(KVMState *s, unsigned int extension) return ret; } +static int kvm_check_many_iobus_devs(void) +{ + /* Older kernels have a 6 device limit on the KVM io bus. In that case + * creating many ioeventfds must be avoided. This test checks for the + * limitation. 
+ */ + EventNotifier notifiers[7]; + int i, ret = 0; + for (i = 0; i < ARRAY_SIZE(notifiers); i++) { + ret = event_notifier_init(&notifiers[i], 0); + if (ret < 0) { + break; + } + ret = kvm_set_ioeventfd_pio_word(event_notifier_get_fd(&notifiers[i]), 0, i, true); + if (ret < 0) { + event_notifier_cleanup(&notifiers[i]); + break; + } + } + + /* Decide whether many devices are supported or not */ + ret = i == ARRAY_SIZE(notifiers); + + while (i-- > 0) { + kvm_set_ioeventfd_pio_word(event_notifier_get_fd(&notifiers[i]), 0, i, false); + event_notifier_cleanup(&notifiers[i]); + } + return ret; +} + static void kvm_set_phys_mem(target_phys_addr_t start_addr, ram_addr_t size, ram_addr_t phys_offset) @@ -699,6 +731,8 @@ int kvm_init(int smp_cpus) kvm_state = s; cpu_register_phys_memory_client(&kvm_cpu_phys_memory_client); + s->many_iobus_devs = kvm_check_many_iobus_devs(); + return 0; err: @@ -1028,6 +1062,11 @@ int kvm_has_xcrs(void) return kvm_state->xcrs; } +int kvm_has_many_iobus_devs(void) +{ + return kvm_state->many_iobus_devs; +} + void kvm_setup_guest_memory(void *start, size_t size) { if (!kvm_has_sync_mmu()) { diff --git a/kvm-stub.c b/kvm-stub.c index d45f9fa..b0887fb 100644 --- a/kvm-stub.c +++ b/kvm-stub.c @@ -99,6 +99,11 @@ int kvm_has_robust_singlestep(void) return 0; } +int kvm_has_many_iobus_devs(void) +{ + return 0; +} + void kvm_setup_guest_memory(void *start, size_t size) { } diff --git a/kvm.h b/kvm.h index 50b6c01..f405906 100644 --- a/kvm.h +++ b/kvm.h @@ -42,6 +42,7 @@ int kvm_has_robust_singlestep(void); int kvm_has_debugregs(void); int kvm_has_xsave(void); int kvm_has_xcrs(void); +int kvm_has_many_iobus_devs(void); #ifdef NEED_CPU_H int kvm_init_vcpu(CPUState *env); -- 1.7.1