Not worth respinning for, but if you do generate v11, two minor comments inline below.

On Tue, Jul 07, 2009 at 05:08:49PM -0400, Gregory Haskins wrote:
> ioeventfd is a mechanism to register PIO/MMIO regions to trigger an eventfd
> signal when written to by a guest.  Host userspace can register any
> arbitrary IO address with a corresponding eventfd and then pass the eventfd
> to a specific end-point of interest for handling.
> 
> Normal IO requires a blocking round-trip since the operation may cause
> side-effects in the emulated model or may return data to the caller.
> Therefore, an IO in KVM traps from the guest to the host, causes a VMX/SVM
> "heavy-weight" exit back to userspace, and is ultimately serviced by qemu's
> device model synchronously before returning control back to the vcpu.
> 
> However, there is a subclass of IO which acts purely as a trigger for
> other IO (such as to kick off an out-of-band DMA request, etc).  For these
> patterns, the synchronous call is particularly expensive since we really
> only want to simply get our notification transmitted asynchronously and
> return as quickly as possible.  All the synchronous infrastructure to ensure
> proper data-dependencies are met in the normal IO case is just unnecessary
> overhead for signalling.  This adds additional computational load on the
> system, as well as latency to the signalling path.
> 
> Therefore, we provide a mechanism for registration of an in-kernel trigger
> point that allows the VCPU to only require a very brief, lightweight
> exit just long enough to signal an eventfd.  This also means that any
> clients compatible with the eventfd interface (which includes userspace
> and kernelspace equally well) can now register to be notified. The end
> result should be a more flexible and higher performance notification API
> for the backend KVM hypervisor and peripheral components.
> 
> To test this theory, we built a test-harness called "doorbell".  This
> module has a function called "doorbell_ring()" which simply increments a
> counter for each time the doorbell is signaled.  It supports signalling
> from either an eventfd, or an ioctl().
> 
> We then wired up two paths to the doorbell: One via QEMU via a registered
> io region and through the doorbell ioctl().  The other is direct via
> ioeventfd.
> 
> You can download this test harness here:
> 
> ftp://ftp.novell.com/dev/ghaskins/doorbell.tar.bz2
> 
> The measured results are as follows:
> 
> qemu-mmio:       110000 iops, 9.09us rtt
> ioeventfd-mmio: 200100 iops, 5.00us rtt
> ioeventfd-pio:  367300 iops, 2.72us rtt
> 
> I didn't measure qemu-pio, because I have to figure out how to register a
> PIO region with qemu's device model, and I got lazy.  However, for now we
> can extrapolate based on the data from the NULLIO runs of +2.56us for MMIO,
> and -350ns for HC, we get:
> 
> qemu-pio:      153139 iops, 6.53us rtt
> ioeventfd-hc: 412585 iops, 2.37us rtt
> 
> these are just for fun, for now, until I can gather more data.
> 
> Here is a graph for your convenience:
> 
> http://developer.novell.com/wiki/images/7/76/Iofd-chart.png
> 
> The conclusion to draw is that we save about 4us by skipping the userspace
> hop.
> 
> --------------------
> 
> Signed-off-by: Gregory Haskins <ghask...@novell.com>
> ---
> 
>  arch/x86/kvm/x86.c       |    1 
>  include/linux/kvm.h      |   24 ++++
>  include/linux/kvm_host.h |   10 +-
>  virt/kvm/eventfd.c       |  252 ++++++++++++++++++++++++++++++++++++++++++++++
>  virt/kvm/kvm_main.c      |   11 ++
>  5 files changed, 294 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 95fa45c..59c2d93 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -1212,6 +1212,7 @@ int kvm_dev_ioctl_check_extension(long ext)
>       case KVM_CAP_IRQ_INJECT_STATUS:
>       case KVM_CAP_ASSIGN_DEV_IRQ:
>       case KVM_CAP_IRQFD:
> +     case KVM_CAP_IOEVENTFD:
>       case KVM_CAP_PIT2:
>               r = 1;
>               break;
> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> index 76c6408..22d0eb7 100644
> --- a/include/linux/kvm.h
> +++ b/include/linux/kvm.h
> @@ -307,6 +307,28 @@ struct kvm_guest_debug {
>       struct kvm_guest_debug_arch arch;
>  };
>  
> +enum {
> +     kvm_ioeventfd_flag_nr_datamatch,
> +     kvm_ioeventfd_flag_nr_pio,
> +     kvm_ioeventfd_flag_nr_deassign,
> +     kvm_ioeventfd_flag_nr_max,
> +};
> +
> +#define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch)
> +#define KVM_IOEVENTFD_FLAG_PIO       (1 << kvm_ioeventfd_flag_nr_pio)
> +#define KVM_IOEVENTFD_FLAG_DEASSIGN  (1 << kvm_ioeventfd_flag_nr_deassign)
> +
> +#define KVM_IOEVENTFD_VALID_FLAG_MASK  ((1 << kvm_ioeventfd_flag_nr_max) - 1)
> +
> +struct kvm_ioeventfd {
> +     __u64 datamatch;
> +     __u64 addr;        /* legal pio/mmio address */
> +     __u32 len;         /* 1, 2, 4, or 8 bytes    */
> +     __s32 fd;
> +     __u32 flags;
> +     __u8  pad[36];
> +};
> +
>  #define KVM_TRC_SHIFT           16
>  /*
>   * kvm trace categories
> @@ -409,6 +431,7 @@ struct kvm_guest_debug {
>  #define KVM_CAP_PIT2 33
>  #endif
>  #define KVM_CAP_SET_BOOT_CPU_ID 34
> +#define KVM_CAP_IOEVENTFD 35
>  
>  #ifdef KVM_CAP_IRQ_ROUTING
>  
> @@ -517,6 +540,7 @@ struct kvm_irqfd {
>  #define KVM_IRQFD                  _IOW(KVMIO, 0x76, struct kvm_irqfd)
>  #define KVM_CREATE_PIT2                 _IOW(KVMIO, 0x77, struct kvm_pit_config)
>  #define KVM_SET_BOOT_CPU_ID        _IO(KVMIO, 0x78)
> +#define KVM_IOEVENTFD             _IOW(KVMIO, 0x79, struct kvm_ioeventfd)
>  
>  /*
>   * ioctls for vcpu fds
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 306bc67..0347d59 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -154,6 +154,7 @@ struct kvm {
>               spinlock_t        lock;
>               struct list_head  items;
>       } irqfds;
> +     struct list_head ioeventfds;
>  #endif
>       struct kvm_vm_stat stat;
>       struct kvm_arch arch;
> @@ -532,19 +533,24 @@ static inline void kvm_free_irq_routing(struct kvm *kvm) {}
>  
>  #ifdef CONFIG_HAVE_KVM_EVENTFD
>  
> -void kvm_irqfd_init(struct kvm *kvm);
> +void kvm_eventfd_init(struct kvm *kvm);
>  int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags);
>  void kvm_irqfd_release(struct kvm *kvm);
> +int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args);
>  
>  #else
>  
> -static inline void kvm_irqfd_init(struct kvm *kvm) {}
> +static inline void kvm_eventfd_init(struct kvm *kvm) {}
>  static inline int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
>  {
>       return -EINVAL;
>  }
>  
>  static inline void kvm_irqfd_release(struct kvm *kvm) {}
> +static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
> +{
> +     return -ENOSYS;
> +}
>  
>  #endif /* CONFIG_HAVE_KVM_EVENTFD */
>  
> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> index 4092b8d..eee8edb 100644
> --- a/virt/kvm/eventfd.c
> +++ b/virt/kvm/eventfd.c
> @@ -21,6 +21,7 @@
>   */
>  
>  #include <linux/kvm_host.h>
> +#include <linux/kvm.h>
>  #include <linux/workqueue.h>
>  #include <linux/syscalls.h>
>  #include <linux/wait.h>
> @@ -28,6 +29,9 @@
>  #include <linux/file.h>
>  #include <linux/list.h>
>  #include <linux/eventfd.h>
> +#include <linux/kernel.h>
> +
> +#include "iodev.h"
>  
>  /*
>   * --------------------------------------------------------------------
> @@ -234,10 +238,11 @@ fail:
>  }
>  
>  void
> -kvm_irqfd_init(struct kvm *kvm)
> +kvm_eventfd_init(struct kvm *kvm)
>  {
>       spin_lock_init(&kvm->irqfds.lock);
>       INIT_LIST_HEAD(&kvm->irqfds.items);
> +     INIT_LIST_HEAD(&kvm->ioeventfds);
>  }
>  
>  /*
> @@ -327,3 +332,248 @@ static void __exit irqfd_module_exit(void)
>  
>  module_init(irqfd_module_init);
>  module_exit(irqfd_module_exit);
> +
> +/*
> + * --------------------------------------------------------------------
> + * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal.
> + *
> + * userspace can register a PIO/MMIO address with an eventfd for receiving
> + * notification when the memory has been touched.
> + * --------------------------------------------------------------------
> + */
> +
> +struct _ioeventfd {
> +     struct list_head     list;
> +     u64                  addr;
> +     int                  length;
> +     struct eventfd_ctx  *eventfd;
> +     u64                  datamatch;
> +     struct kvm_io_device dev;
> +     bool                 wildcard;
> +};
> +
> +static inline struct _ioeventfd *
> +to_ioeventfd(struct kvm_io_device *dev)
> +{
> +     return container_of(dev, struct _ioeventfd, dev);
> +}
> +
> +static void
> +ioeventfd_release(struct _ioeventfd *p)
> +{
> +     eventfd_ctx_put(p->eventfd);
> +     list_del(&p->list);
> +     kfree(p);
> +}
> +
> +static bool
> +ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
> +{
> +     u64 _val;
> +
> +     if (!(addr == p->addr && len == p->length))
> +             /* address-range must be precise for a hit */
> +             return false;
> +
> +     if (p->wildcard)
> +             /* all else equal, wildcard is always a hit */
> +             return true;
> +
> +     /* otherwise, we have to actually compare the data */
> +
> +     BUG_ON(!IS_ALIGNED((unsigned long)val, len));
> +
> +     switch (len) {
> +     case 1:
> +             _val = *(u8 *)val;
> +             break;
> +     case 2:
> +             _val = *(u16 *)val;
> +             break;
> +     case 4:
> +             _val = *(u32 *)val;
> +             break;
> +     case 8:
> +             _val = *(u64 *)val;
> +             break;
> +     default:
> +             return false;
> +     }
> +
> +     return _val == p->datamatch ? true : false;

Just "return _val == p->datamatch;" is clearer.

> +}
> +
> +/* MMIO/PIO writes trigger an event if the addr/val match */
> +static int
> +ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
> +             const void *val)
> +{
> +     struct _ioeventfd *p = to_ioeventfd(this);
> +
> +     if (!ioeventfd_in_range(p, addr, len, val))
> +             return -EOPNOTSUPP;
> +
> +     eventfd_signal(p->eventfd, 1);
> +     return 0;
> +}
> +
> +/*
> + * This function is called as KVM is completely shutting down.  We do not
> + * need to worry about locking just nuke anything we have as quickly as possible
> + */
> +static void
> +ioeventfd_destructor(struct kvm_io_device *this)
> +{
> +     struct _ioeventfd *p = to_ioeventfd(this);
> +
> +     ioeventfd_release(p);
> +}
> +
> +static const struct kvm_io_device_ops ioeventfd_ops = {
> +     .write      = ioeventfd_write,
> +     .destructor = ioeventfd_destructor,
> +};
> +
> +/* assumes kvm->slots_lock held */
> +static bool
> +ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
> +{
> +     struct _ioeventfd *_p;
> +
> +     list_for_each_entry(_p, &kvm->ioeventfds, list)
> +             if (_p->addr == p->addr && _p->length == p->length &&
> +                 (_p->wildcard || p->wildcard ||
> +                  _p->datamatch == p->datamatch))
> +                     return true;
> +
> +     return false;
> +}
> +
> +static int
> +kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
> +{
> +     int                       pio = args->flags & KVM_IOEVENTFD_FLAG_PIO;
> +     struct kvm_io_bus        *bus = pio ? &kvm->pio_bus : &kvm->mmio_bus;
> +     struct _ioeventfd        *p;
> +     struct eventfd_ctx       *eventfd;
> +     int                       ret;
> +
> +     /* must be natural-word sized */
> +     switch (args->len) {
> +     case 1:
> +     case 2:
> +     case 4:
> +     case 8:
> +             break;
> +     default:
> +             return -EINVAL;
> +     }
> +
> +     /* check for range overflow */
> +     if (args->addr + args->len < args->addr)
> +             return -EINVAL;
> +
> +     /* check for extra flags that we don't understand */
> +     if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK)
> +             return -EINVAL;
> +
> +     eventfd = eventfd_ctx_fdget(args->fd);
> +     if (IS_ERR(eventfd))
> +             return PTR_ERR(eventfd);
> +
> +     p = kzalloc(sizeof(*p), GFP_KERNEL);
> +     if (!p) {
> +             ret = -ENOMEM;
> +             goto fail;
> +     }
> +
> +     INIT_LIST_HEAD(&p->list);
> +     p->addr    = args->addr;
> +     p->length  = args->len;
> +     p->eventfd = eventfd;
> +
> +     /* The datamatch feature is optional, otherwise this is a wildcard */
> +     if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH)
> +             p->datamatch = args->datamatch;
> +     else
> +             p->wildcard = true;
> +
> +     down_write(&kvm->slots_lock);
> +
> +     /* Verify that there isnt a match already */
> +     if (ioeventfd_check_collision(kvm, p)) {
> +             ret = -EEXIST;
> +             goto unlock_fail;
> +     }
> +
> +     kvm_iodevice_init(&p->dev, &ioeventfd_ops);
> +
> +     ret = __kvm_io_bus_register_dev(bus, &p->dev);
> +     if (ret < 0)
> +             goto unlock_fail;
> +
> +     list_add_tail(&p->list, &kvm->ioeventfds);
> +
> +     up_write(&kvm->slots_lock);
> +
> +     return 0;
> +
> +unlock_fail:
> +     up_write(&kvm->slots_lock);
> +
> +fail:
> +     kfree(p);
> +     eventfd_ctx_put(eventfd);
> +
> +     return ret;
> +}
> +
> +static int
> +kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
> +{
> +     int                       pio = args->flags & KVM_IOEVENTFD_FLAG_PIO;
> +     struct kvm_io_bus        *bus = pio ? &kvm->pio_bus : &kvm->mmio_bus;
> +     struct _ioeventfd        *p, *tmp;
> +     struct eventfd_ctx       *eventfd;
> +     int                       ret = -ENOENT;
> +
> +     eventfd = eventfd_ctx_fdget(args->fd);
> +     if (IS_ERR(eventfd))
> +             return PTR_ERR(eventfd);
> +
> +     down_write(&kvm->slots_lock);
> +
> +     list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) {
> +             bool wildcard = args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH ?
> +                     true : false;

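Just !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH) is clearer.  Note that this
also flips the sense of the test above, which sets wildcard when DATAMATCH is
set; the assign path sets wildcard only when DATAMATCH is *not* set.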

> +
> +             if (p->eventfd != eventfd  ||
> +                 p->addr != args->addr  ||
> +                 p->length != args->len ||
> +                 p->wildcard != wildcard)
> +                     continue;
> +
> +             if (!p->wildcard && p->datamatch != args->datamatch)
> +                     continue;
> +
> +             __kvm_io_bus_unregister_dev(bus, &p->dev);
> +             ioeventfd_release(p);
> +             ret = 0;
> +             break;
> +     }
> +
> +     up_write(&kvm->slots_lock);
> +
> +     eventfd_ctx_put(eventfd);
> +
> +     return ret;
> +}
> +
> +int
> +kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
> +{
> +     if (args->flags & KVM_IOEVENTFD_FLAG_DEASSIGN)
> +             return kvm_deassign_ioeventfd(kvm, args);
> +
> +     return kvm_assign_ioeventfd(kvm, args);
> +}
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index dd92b44..14e1f32 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -979,7 +979,7 @@ static struct kvm *kvm_create_vm(void)
>       spin_lock_init(&kvm->mmu_lock);
>       spin_lock_init(&kvm->requests_lock);
>       kvm_io_bus_init(&kvm->pio_bus);
> -     kvm_irqfd_init(kvm);
> +     kvm_eventfd_init(kvm);
>       mutex_init(&kvm->lock);
>       mutex_init(&kvm->irq_lock);
>       kvm_io_bus_init(&kvm->mmio_bus);
> @@ -2271,6 +2271,15 @@ static long kvm_vm_ioctl(struct file *filp,
>               r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags);
>               break;
>       }
> +     case KVM_IOEVENTFD: {
> +             struct kvm_ioeventfd data;
> +
> +             r = -EFAULT;
> +             if (copy_from_user(&data, argp, sizeof data))
> +                     goto out;
> +             r = kvm_ioeventfd(kvm, &data);
> +             break;
> +     }
>  #ifdef CONFIG_KVM_APIC_ARCHITECTURE
>       case KVM_SET_BOOT_CPU_ID:
>               r = 0;
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to