Re: [PATCH v3.1 4/5] KVM: Kick resamplefd for split kernel irqchip

2020-03-17 Thread Alex Williamson
On Tue, 17 Mar 2020 18:49:23 -0400
Peter Xu  wrote:

> This is majorly only for X86 because that's the only one that supports
> split irqchip for now.
> 
> When the irqchip is split, we face a dilemma that KVM irqfd will be
> enabled, however the slow irqchip is still running in the userspace.
> It means that the resamplefd in the kernel irqfds won't take any
> effect and it will miss to ack INTx interrupts on EOIs.
> 
> One example is split irqchip with VFIO INTx, which will break if we
> use the VFIO INTx fast path.
> 
> This patch can potentially supports the VFIO fast path again for INTx,
> that the IRQ delivery will still use the fast path, while we don't
> need to trap MMIOs in QEMU for the device to emulate the EIOs (see the
> callers of vfio_eoi() hook).  However the EOI of the INTx will still
> need to be done from the userspace by caching all the resamplefds in
> QEMU and kick properly for IOAPIC EOI broadcast.
> 
> This is tricky because in this case the userspace ioapic irr &
> remote-irr will be bypassed.  However such a change will greatly boost
> performance for assigned devices using INTx irqs (TCP_RR boosts 46%
> after this patch applied).
> 
> When the userspace is responsible for the resamplefd kickup, don't
> register it on the kvm_irqfd anymore, because on newer kernels (after
> commit 654f1f13ea56, 5.2+) the KVM_IRQFD will fail if with both split
> irqchip and resamplefd.  This will make sure that the fast path will
> work for all supported kernels.
> 
> https://patchwork.kernel.org/patch/10738541/#22609933
> 
> Suggested-by: Paolo Bonzini 
> Signed-off-by: Peter Xu 
> ---
> v3.1 changelog
> - only kick resamplefd for level triggered irqs [Alex]
>  accel/kvm/kvm-all.c| 79 --
>  accel/kvm/trace-events |  1 +
>  hw/intc/ioapic.c   | 17 +
>  include/sysemu/kvm.h   |  4 +++
>  4 files changed, 99 insertions(+), 2 deletions(-)

Reviewed-by: Alex Williamson 

> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index d49b74512a..9a85fd1b8f 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -159,9 +159,59 @@ static const KVMCapabilityInfo 
> kvm_required_capabilites[] = {
>  static NotifierList kvm_irqchip_change_notifiers =
>  NOTIFIER_LIST_INITIALIZER(kvm_irqchip_change_notifiers);
>  
> +struct KVMResampleFd {
> +int gsi;
> +EventNotifier *resample_event;
> +QLIST_ENTRY(KVMResampleFd) node;
> +};
> +typedef struct KVMResampleFd KVMResampleFd;
> +
> +/*
> + * Only used with split irqchip where we need to do the resample fd
> + * kick for the kernel from userspace.
> + */
> +static QLIST_HEAD(, KVMResampleFd) kvm_resample_fd_list =
> +QLIST_HEAD_INITIALIZER(kvm_resample_fd_list);
> +
>  #define kvm_slots_lock(kml)  qemu_mutex_lock(&(kml)->slots_lock)
>  #define kvm_slots_unlock(kml)qemu_mutex_unlock(&(kml)->slots_lock)
>  
> +static inline void kvm_resample_fd_remove(int gsi)
> +{
> +KVMResampleFd *rfd;
> +
> +QLIST_FOREACH(rfd, _resample_fd_list, node) {
> +if (rfd->gsi == gsi) {
> +QLIST_REMOVE(rfd, node);
> +g_free(rfd);
> +break;
> +}
> +}
> +}
> +
> +static inline void kvm_resample_fd_insert(int gsi, EventNotifier *event)
> +{
> +KVMResampleFd *rfd = g_new0(KVMResampleFd, 1);
> +
> +rfd->gsi = gsi;
> +rfd->resample_event = event;
> +
> +QLIST_INSERT_HEAD(_resample_fd_list, rfd, node);
> +}
> +
> +void kvm_resample_fd_notify(int gsi)
> +{
> +KVMResampleFd *rfd;
> +
> +QLIST_FOREACH(rfd, _resample_fd_list, node) {
> +if (rfd->gsi == gsi) {
> +event_notifier_set(rfd->resample_event);
> +trace_kvm_resample_fd_notify(gsi);
> +return;
> +}
> +}
> +}
> +
>  int kvm_get_max_memslots(void)
>  {
>  KVMState *s = KVM_STATE(current_accel());
> @@ -1642,8 +1692,33 @@ static int kvm_irqchip_assign_irqfd(KVMState *s, 
> EventNotifier *event,
>  };
>  
>  if (rfd != -1) {
> -irqfd.flags |= KVM_IRQFD_FLAG_RESAMPLE;
> -irqfd.resamplefd = rfd;
> +assert(assign);
> +if (kvm_irqchip_is_split()) {
> +/*
> + * When the slow irqchip (e.g. IOAPIC) is in the
> + * userspace, KVM kernel resamplefd will not work because
> + * the EOI of the interrupt will be delivered to userspace
> + * instead, so the KVM kernel resamplefd kick will be
> + * skipped.  The userspace here mimics what the kernel
> + * provides with resamplefd, remember the resamplefd and
> + * kick it when we receive EOI of this IRQ.
> + *
> + * This is hackery because IOAPIC is mostly bypassed
> + * (except EOI broadcasts) when irqfd is used.  However
> + * this can bring much performance back for split irqchip
> + * with INTx IRQs (for VFIO, this gives 93% perf of the
> + * full fast path, 

[PATCH v3.1 4/5] KVM: Kick resamplefd for split kernel irqchip

2020-03-17 Thread Peter Xu
This is majorly only for X86 because that's the only one that supports
split irqchip for now.

When the irqchip is split, we face a dilemma that KVM irqfd will be
enabled, however the slow irqchip is still running in the userspace.
It means that the resamplefd in the kernel irqfds won't take any
effect and it will miss to ack INTx interrupts on EOIs.

One example is split irqchip with VFIO INTx, which will break if we
use the VFIO INTx fast path.

This patch can potentially supports the VFIO fast path again for INTx,
that the IRQ delivery will still use the fast path, while we don't
need to trap MMIOs in QEMU for the device to emulate the EIOs (see the
callers of vfio_eoi() hook).  However the EOI of the INTx will still
need to be done from the userspace by caching all the resamplefds in
QEMU and kick properly for IOAPIC EOI broadcast.

This is tricky because in this case the userspace ioapic irr &
remote-irr will be bypassed.  However such a change will greatly boost
performance for assigned devices using INTx irqs (TCP_RR boosts 46%
after this patch applied).

When the userspace is responsible for the resamplefd kickup, don't
register it on the kvm_irqfd anymore, because on newer kernels (after
commit 654f1f13ea56, 5.2+) the KVM_IRQFD will fail if with both split
irqchip and resamplefd.  This will make sure that the fast path will
work for all supported kernels.

https://patchwork.kernel.org/patch/10738541/#22609933

Suggested-by: Paolo Bonzini 
Signed-off-by: Peter Xu 
---
v3.1 changelog
- only kick resamplefd for level triggered irqs [Alex]
 accel/kvm/kvm-all.c| 79 --
 accel/kvm/trace-events |  1 +
 hw/intc/ioapic.c   | 17 +
 include/sysemu/kvm.h   |  4 +++
 4 files changed, 99 insertions(+), 2 deletions(-)

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index d49b74512a..9a85fd1b8f 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -159,9 +159,59 @@ static const KVMCapabilityInfo kvm_required_capabilites[] 
= {
 static NotifierList kvm_irqchip_change_notifiers =
 NOTIFIER_LIST_INITIALIZER(kvm_irqchip_change_notifiers);
 
+struct KVMResampleFd {
+int gsi;
+EventNotifier *resample_event;
+QLIST_ENTRY(KVMResampleFd) node;
+};
+typedef struct KVMResampleFd KVMResampleFd;
+
+/*
+ * Only used with split irqchip where we need to do the resample fd
+ * kick for the kernel from userspace.
+ */
+static QLIST_HEAD(, KVMResampleFd) kvm_resample_fd_list =
+QLIST_HEAD_INITIALIZER(kvm_resample_fd_list);
+
 #define kvm_slots_lock(kml)  qemu_mutex_lock(&(kml)->slots_lock)
 #define kvm_slots_unlock(kml)qemu_mutex_unlock(&(kml)->slots_lock)
 
+static inline void kvm_resample_fd_remove(int gsi)
+{
+KVMResampleFd *rfd;
+
+QLIST_FOREACH(rfd, _resample_fd_list, node) {
+if (rfd->gsi == gsi) {
+QLIST_REMOVE(rfd, node);
+g_free(rfd);
+break;
+}
+}
+}
+
+static inline void kvm_resample_fd_insert(int gsi, EventNotifier *event)
+{
+KVMResampleFd *rfd = g_new0(KVMResampleFd, 1);
+
+rfd->gsi = gsi;
+rfd->resample_event = event;
+
+QLIST_INSERT_HEAD(_resample_fd_list, rfd, node);
+}
+
+void kvm_resample_fd_notify(int gsi)
+{
+KVMResampleFd *rfd;
+
+QLIST_FOREACH(rfd, _resample_fd_list, node) {
+if (rfd->gsi == gsi) {
+event_notifier_set(rfd->resample_event);
+trace_kvm_resample_fd_notify(gsi);
+return;
+}
+}
+}
+
 int kvm_get_max_memslots(void)
 {
 KVMState *s = KVM_STATE(current_accel());
@@ -1642,8 +1692,33 @@ static int kvm_irqchip_assign_irqfd(KVMState *s, 
EventNotifier *event,
 };
 
 if (rfd != -1) {
-irqfd.flags |= KVM_IRQFD_FLAG_RESAMPLE;
-irqfd.resamplefd = rfd;
+assert(assign);
+if (kvm_irqchip_is_split()) {
+/*
+ * When the slow irqchip (e.g. IOAPIC) is in the
+ * userspace, KVM kernel resamplefd will not work because
+ * the EOI of the interrupt will be delivered to userspace
+ * instead, so the KVM kernel resamplefd kick will be
+ * skipped.  The userspace here mimics what the kernel
+ * provides with resamplefd, remember the resamplefd and
+ * kick it when we receive EOI of this IRQ.
+ *
+ * This is hackery because IOAPIC is mostly bypassed
+ * (except EOI broadcasts) when irqfd is used.  However
+ * this can bring much performance back for split irqchip
+ * with INTx IRQs (for VFIO, this gives 93% perf of the
+ * full fast path, which is 46% perf boost comparing to
+ * the INTx slow path).
+ */
+kvm_resample_fd_insert(virq, resample);
+} else {
+irqfd.flags |= KVM_IRQFD_FLAG_RESAMPLE;
+irqfd.resamplefd = rfd;
+}
+} else if (!assign) {
+if (kvm_irqchip_is_split()) {
+