This allows drivers to register a callback on a qemu_irq, which is
invoked when a level-triggered IRQ is acked on the irqchip.

This allows us to simulate level-triggered interrupts more efficiently,
by resampling the state of the interrupt only when it actually matters.

This can be used in two ways. 

The example in the patch below shows the event source literally being
resampled from the callback — in this case the line level is tied to
the Xen evtchn_upcall_pending flag in vCPU0's vcpu_info, and the
callback from the irqchip allows us to avoid having to constantly poll
for that being clearer).

As we hook it up to INTx interrupts on VFIO PCI devices, it would
unconditionally return 'true' to clear the level in the irqchip, and
also send an event on the 'resample' eventfd to the kernel, so that the
kernel will reraise the interrupt if it's still actually physically set
on the device.

There's theoretically a race condition there, if the kernel reraises
the interrupt before the callback even returns and the irqchip clears
its internal s->irr. But I think we get away with it by being single-
threaded for the I/O processing so we won't actually consume the event
until later?

It was the Xen part firt that offended me, having to poll on vmexit:
https://git.infradead.org/users/dwmw2/qemu.git/commitdiff/7bada5e4f#patch5

But then I looked at how VFIO handles this, and it offends me even
more; sending the resample eventfd down to the kernel on *ever* MMIO
read/write.... having unmapped the device's BARs from the guest in
order to *trap* those MMIO accesses... with a timer to map it back
again...

It'll take a little more work to hook up the reverse path for the ack
back through PCI INTx handling, a bit like I've had to do it with
gsi_ack_handler to convey the ack events back from the {i8259,ioapic}
qemu_irq to the GSI qemu_irqs. And I'll need to do it for more than
just i8259 and ioapic. But I suspect it'll be worth it.

Opinions?

Tested by booting a (KVM) Xen guest with xen_no_vector_callback on its
command line, and sometimes also 'apic=off'. In PIC mode we still seem
to get two interrupts per event, but I think that's actually genuine
because printfs in the evtchn code confirm that ->evtchn_upcall_pending
for vCPU0 really *is* still set the first time the interrupt is acked
in the i8259 and genuinely doesn't get cleared.

diff --git a/hw/core/irq.c b/hw/core/irq.c
index 3623f711fe..552e732835 100644
--- a/hw/core/irq.c
+++ b/hw/core/irq.c
@@ -32,6 +32,9 @@ DECLARE_INSTANCE_CHECKER(struct IRQState, IRQ,
 struct IRQState {
     Object parent_obj;
 
+    qemu_irq_ack_fn ack_cb;
+    void *ack_opaque;
+
     qemu_irq_handler handler;
     void *opaque;
     int n;
@@ -45,6 +48,22 @@ void qemu_set_irq(qemu_irq irq, int level)
     irq->handler(irq->opaque, irq->n, level);
 }
 
+void qemu_set_irq_ack_callback(qemu_irq irq, qemu_irq_ack_fn cb, void *opaque)
+{
+    if (irq) {
+        irq->ack_cb = cb;
+        irq->ack_opaque = opaque;
+    }
+}
+
+bool qemu_notify_irq_ack(qemu_irq irq)
+{
+    if (irq && irq->ack_cb) {
+        return irq->ack_cb(irq, irq->ack_opaque);
+    }
+    return false;
+}
+
 qemu_irq *qemu_extend_irqs(qemu_irq *old, int n_old, qemu_irq_handler handler,
                            void *opaque, int n)
 {
diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index 084249c56d..5867868549 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -288,6 +288,16 @@ void xen_evtchn_set_callback_level(int level)
     }
 }
 
+static bool resample_evtchn_irq(qemu_irq irq, void *opaques)
+{
+    struct vcpu_info *vi = kvm_xen_get_vcpu_info_hva(0);
+
+    if (vi && !vi->evtchn_upcall_pending) {
+        return true;
+    }
+    return false;
+}
+
 int xen_evtchn_set_callback_param(uint64_t param)
 {
     XenEvtchnState *s = xen_evtchn_singleton;
@@ -339,8 +349,14 @@ int xen_evtchn_set_callback_param(uint64_t param)
         if (gsi != s->callback_gsi) {
             struct vcpu_info *vi = kvm_xen_get_vcpu_info_hva(0);
 
-            xen_evtchn_set_callback_level(0);
+            if (s->callback_gsi) {
+                xen_evtchn_set_callback_level(0);
+                qemu_set_irq_ack_callback(s->gsis[s->callback_gsi], NULL, 
NULL);
+            }
             s->callback_gsi = gsi;
+            if (s->callback_gsi) {
+                qemu_set_irq_ack_callback(s->gsis[s->callback_gsi], 
resample_evtchn_irq, s);
+            }
 
             if (gsi && vi && vi->evtchn_upcall_pending) {
                 /*
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index c5cf6581da..8cfd6ff641 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -410,7 +410,7 @@ GSIState *pc_gsi_create(qemu_irq **irqs, bool pci_enabled)
     if (kvm_ioapic_in_kernel()) {
         kvm_pc_setup_irq_routing(pci_enabled);
     }
-    *irqs = qemu_allocate_irqs(gsi_handler, s, GSI_NUM_PINS);
+    s->gsi_irq = *irqs = qemu_allocate_irqs(gsi_handler, s, GSI_NUM_PINS);
 
     return s;
 }
@@ -1355,7 +1355,7 @@ void pc_nic_init(PCMachineClass *pcmc, ISABus *isa_bus, 
PCIBus *pci_bus)
     rom_reset_order_override();
 }
 
-void pc_i8259_create(ISABus *isa_bus, qemu_irq *i8259_irqs)
+void pc_i8259_create(ISABus *isa_bus, GSIState *gsi_state)
 {
     qemu_irq *i8259;
 
@@ -1368,7 +1368,10 @@ void pc_i8259_create(ISABus *isa_bus, qemu_irq 
*i8259_irqs)
     }
 
     for (size_t i = 0; i < ISA_NUM_IRQS; i++) {
-        i8259_irqs[i] = i8259[i];
+        gsi_state->i8259_irq[i] = i8259[i];
+        qemu_set_irq_ack_callback(gsi_state->i8259_irq[i], gsi_ack_handler,
+                                  gsi_state->gsi_irq[i]);
+
     }
 
     g_free(i8259);
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index 5678112dc2..964a8b458d 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -250,7 +250,7 @@ static void pc_init1(MachineState *machine,
     isa_bus_irqs(isa_bus, x86ms->gsi);
 
     if (x86ms->pic == ON_OFF_AUTO_ON || x86ms->pic == ON_OFF_AUTO_AUTO) {
-        pc_i8259_create(isa_bus, gsi_state->i8259_irq);
+        pc_i8259_create(isa_bus, gsi_state);
     }
 
     if (pcmc->pci_enabled) {
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index 67ceb04bcc..c2c9933170 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -274,7 +274,7 @@ static void pc_q35_init(MachineState *machine)
     isa_bus = ich9_lpc->isa_bus;
 
     if (x86ms->pic == ON_OFF_AUTO_ON || x86ms->pic == ON_OFF_AUTO_AUTO) {
-        pc_i8259_create(isa_bus, gsi_state->i8259_irq);
+        pc_i8259_create(isa_bus, gsi_state);
     }
 
     if (pcmc->pci_enabled) {
diff --git a/hw/i386/x86.c b/hw/i386/x86.c
index 78cc131926..677e4ec3eb 100644
--- a/hw/i386/x86.c
+++ b/hw/i386/x86.c
@@ -617,6 +617,16 @@ void gsi_handler(void *opaque, int n, int level)
     }
 }
 
+bool gsi_ack_handler(qemu_irq irq, void *opaque)
+{
+    /*
+     * This is a callback on the underlying PIC/IOAPIC irq but the
+     * opaque pointer that was registered is the GSI irq. Propagate
+     * the notifiation.
+     */
+    return qemu_notify_irq_ack(opaque);
+}
+
 void ioapic_init_gsi(GSIState *gsi_state, const char *parent_name)
 {
     DeviceState *dev;
@@ -637,6 +647,8 @@ void ioapic_init_gsi(GSIState *gsi_state, const char 
*parent_name)
 
     for (i = 0; i < IOAPIC_NUM_PINS; i++) {
         gsi_state->ioapic_irq[i] = qdev_get_gpio_in(dev, i);
+        qemu_set_irq_ack_callback(gsi_state->ioapic_irq[i], gsi_ack_handler,
+                                  gsi_state->gsi_irq[i]);
     }
 }
 
@@ -653,6 +665,8 @@ DeviceState *ioapic_init_secondary(GSIState *gsi_state)
 
     for (i = 0; i < IOAPIC_NUM_PINS; i++) {
         gsi_state->ioapic2_irq[i] = qdev_get_gpio_in(dev, i);
+        qemu_set_irq_ack_callback(gsi_state->ioapic2_irq[i], gsi_ack_handler,
+                                  gsi_state->gsi_irq[IO_APIC_SECONDARY_IRQBASE 
+ i]);
     }
     return dev;
 }
diff --git a/hw/intc/i8259.c b/hw/intc/i8259.c
index cc4e21ffec..0bc43f0fc3 100644
--- a/hw/intc/i8259.c
+++ b/hw/intc/i8259.c
@@ -166,9 +166,15 @@ static void pic_intack(PICCommonState *s, int irq)
     } else {
         s->isr |= (1 << irq);
     }
-    /* We don't clear a level sensitive interrupt here */
+    /*
+     * We don't clear a level sensitive interrupt here, unless the
+     * ack notifier asks us to.
+     */
     if (!(s->elcr & (1 << irq))) {
         s->irr &= ~(1 << irq);
+    } else if (qemu_notify_irq_ack(qdev_get_gpio_in(DEVICE(s), irq))) {
+        s->irr &= ~(1 << irq);
+        s->last_irr &= ~(1 << irq);
     }
     pic_update_irq(s);
 }
diff --git a/hw/intc/ioapic.c b/hw/intc/ioapic.c
index 264262959d..4d56bbedac 100644
--- a/hw/intc/ioapic.c
+++ b/hw/intc/ioapic.c
@@ -34,6 +34,7 @@
 #include "sysemu/sysemu.h"
 #include "hw/i386/apic-msidef.h"
 #include "hw/i386/x86-iommu.h"
+#include "hw/irq.h"
 #include "trace.h"
 
 #define APIC_DELIVERY_MODE_SHIFT 8
@@ -259,7 +260,9 @@ void ioapic_eoi_broadcast(int vector)
              */
             kvm_resample_fd_notify(n);
 #endif
-
+            if (qemu_notify_irq_ack(qdev_get_gpio_in(DEVICE(s), n))) {
+                s->irr &= ~(1 << n);
+            }
             if (!(entry & IOAPIC_LVT_REMOTE_IRR)) {
                 continue;
             }
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index b866567b7b..77a8d0adfa 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -177,7 +177,7 @@ void pc_cmos_init(PCMachineState *pcms,
                   ISADevice *s);
 void pc_nic_init(PCMachineClass *pcmc, ISABus *isa_bus, PCIBus *pci_bus);
 
-void pc_i8259_create(ISABus *isa_bus, qemu_irq *i8259_irqs);
+void pc_i8259_create(ISABus *isa_bus, GSIState *gsi_state);
 
 /* port92.c */
 #define PORT92_A20_LINE "a20"
diff --git a/include/hw/i386/x86.h b/include/hw/i386/x86.h
index 62fa5774f8..a315d7719f 100644
--- a/include/hw/i386/x86.h
+++ b/include/hw/i386/x86.h
@@ -138,6 +138,7 @@ bool x86_machine_is_acpi_enabled(const X86MachineState 
*x86ms);
 #define ACPI_BUILD_PCI_IRQS ((1<<5) | (1<<9) | (1<<10) | (1<<11))
 
 typedef struct GSIState {
+    qemu_irq *gsi_irq;
     qemu_irq i8259_irq[ISA_NUM_IRQS];
     qemu_irq ioapic_irq[IOAPIC_NUM_PINS];
     qemu_irq ioapic2_irq[IOAPIC_NUM_PINS];
@@ -145,6 +146,7 @@ typedef struct GSIState {
 
 qemu_irq x86_allocate_cpu_irq(void);
 void gsi_handler(void *opaque, int n, int level);
+bool gsi_ack_handler(qemu_irq, void *opaque);
 void ioapic_init_gsi(GSIState *gsi_state, const char *parent_name);
 DeviceState *ioapic_init_secondary(GSIState *gsi_state);
 
diff --git a/include/hw/irq.h b/include/hw/irq.h
index 645b73d251..f21110d5f0 100644
--- a/include/hw/irq.h
+++ b/include/hw/irq.h
@@ -23,6 +23,16 @@ static inline void qemu_irq_pulse(qemu_irq irq)
     qemu_set_irq(irq, 0);
 }
 
+/*
+ * Allows a callback to be invoked when an IRQ is acked at the irqchip,
+ * allowing it to be resampled and reasserted as appropriate. If the
+ * callback function returns true, the interrupt is deasserted at the
+ * irqchip.
+ */
+typedef bool (*qemu_irq_ack_fn)(qemu_irq irq, void *opaque);
+void qemu_set_irq_ack_callback(qemu_irq irq, qemu_irq_ack_fn cb, void *opaque);
+bool qemu_notify_irq_ack(qemu_irq irq);
+
 /* Returns an array of N IRQs. Each IRQ is assigned the argument handler and
  * opaque data.
  */
diff --git a/target/i386/kvm/xen-emu.c b/target/i386/kvm/xen-emu.c
index 273200bc70..a7d3fc33b3 100644
--- a/target/i386/kvm/xen-emu.c
+++ b/target/i386/kvm/xen-emu.c
@@ -391,7 +391,7 @@ void kvm_xen_maybe_deassert_callback(CPUState *cs)
     /* If the evtchn_upcall_pending flag is cleared, turn the GSI off. */
     if (!vi->evtchn_upcall_pending) {
         env->xen_callback_asserted = false;
-        xen_evtchn_set_callback_level(0);
+        //xen_evtchn_set_callback_level(0);
     }
 }
 
@@ -432,7 +432,7 @@ void kvm_xen_inject_vcpu_callback_vector(uint32_t vcpu_id, 
int type)
     case HVM_PARAM_CALLBACK_TYPE_PCI_INTX:
         if (vcpu_id == 0) {
             xen_evtchn_set_callback_level(1);
-            X86_CPU(cs)->env.xen_callback_asserted = true;
+            //X86_CPU(cs)->env.xen_callback_asserted = true;
         }
         break;
     }

Attachment: smime.p7s
Description: S/MIME cryptographic signature

Reply via email to