I noticed that the various NIC models in qemu/kvm do not implement interrupt mitigation, which would be very beneficial as it dramatically reduces the number of VM exits taken by the guest.
As a proof of concept I tried to implement it for the e1000 device (patch below): it brings tx performance from about 9 to 56 Kpps with qemu-softmmu, and from ~20 to 140 Kpps with qemu-kvm. I am going to measure the effect of rx interrupt mitigation in the next couple of days.

Is there any interest in having this code in?

cheers
luigi

diff -ubwrp --exclude '*.[do]' /tmp/qemu-61dc008/hw/e1000.c ./hw/e1000.c
--- /tmp/qemu-61dc008/hw/e1000.c	2012-07-20 01:25:52.000000000 +0200
+++ ./hw/e1000.c	2012-07-24 18:21:39.000000000 +0200
@@ -33,6 +33,8 @@
 #include "sysemu.h"
 #include "dma.h"
 
+#define MITIGATION
+
 #include "e1000_hw.h"
 
 #define E1000_DEBUG
@@ -127,6 +129,13 @@ typedef struct E1000State_st {
     } eecd_state;
     QEMUTimer *autoneg_timer;
+
+#ifdef MITIGATION
+    QEMUBH *int_bh;        /* interrupt mitigation handler */
+    int tx_ics_count;      /* pending tx interrupt requests */
+    int rx_ics_count;      /* pending rx interrupt requests */
+    int int_cause;         /* accumulated interrupt cause */
+#endif /* MITIGATION */
 } E1000State;
 
 #define defreg(x)	x = (E1000_##x>>2)
@@ -638,6 +648,26 @@ start_xmit(E1000State *s)
         return;
     }
 
+#ifdef MITIGATION
+    /* Transmit the first few packets right away; after that, only
+     * transmit when the ring is approaching full, in which case we
+     * also raise an interrupt (ICS).
+     */
+{
+    int len, pending;
+
+    len = s->mac_reg[TDLEN] / sizeof(desc);
+    pending = s->mac_reg[TDT] - s->mac_reg[TDH];
+    if (pending < 0)
+        pending += len;
+    /* Ignore requests after the first few ones, as long as we are
+     * not approaching a full ring; otherwise fall through and
+     * deliver the packets to the backend.
+     */
+    if (s->tx_ics_count > 4 && s->tx_ics_count + pending < len - 5)
+        return;
+#endif /* MITIGATION */
+
     while (s->mac_reg[TDH] != s->mac_reg[TDT]) {
         base = tx_desc_base(s) +
                sizeof(struct e1000_tx_desc) * s->mac_reg[TDH];
@@ -663,7 +693,21 @@ start_xmit(E1000State *s)
             break;
         }
     }
+#ifdef MITIGATION
+    s->int_cause |= cause;      /* remember the interrupt cause */
+    s->tx_ics_count += pending;
+    if (s->tx_ics_count >= len - 5) {
+        /* the ring is about to become full: generate an interrupt now */
+        set_ics(s, 0, s->int_cause);
+        s->tx_ics_count = 0;
+        s->int_cause = 0;
+    } else {
+        /* otherwise just schedule it for later */
+        qemu_bh_schedule_idle(s->int_bh);
+    }
+}
+#else /* !MITIGATION */
     set_ics(s, 0, cause);
+#endif /* !MITIGATION */
 }
 
 static int
@@ -875,7 +919,27 @@ e1000_receive(VLANClientState *nc, const
         s->rxbuf_min_shift)
         n |= E1000_ICS_RXDMT0;
 
+#ifdef MITIGATION
+#define MIT_RXDMT0_SENT 100000  /* large sentinel value */
+    s->int_cause |= n;
+    if (s->rx_ics_count == 0) {
+        /* deliver the first interrupt immediately */
+        set_ics(s, 0, s->int_cause);
+        s->int_cause = 0;
+        s->rx_ics_count++;
+    } else if ((n & E1000_ICS_RXDMT0) && s->rx_ics_count < MIT_RXDMT0_SENT) {
+        /* also deliver if we are approaching ring full */
+        set_ics(s, 0, s->int_cause);
+        s->int_cause = 0;
+        s->rx_ics_count = MIT_RXDMT0_SENT;
+    } else {
+        /* otherwise schedule for later */
+        s->rx_ics_count++;
+        qemu_bh_schedule_idle(s->int_bh);
+    }
+#else /* !MITIGATION */
     set_ics(s, 0, n);
+#endif /* !MITIGATION */
 
     return size;
 }
@@ -1214,6 +1281,20 @@ static NetClientInfo net_e1000_info = {
     .link_status_changed = e1000_set_link_status,
 };
 
+#ifdef MITIGATION
+static void e1000_int_bh(void *opaque)
+{
+    E1000State *s = opaque;
+
+    if (s->tx_ics_count < 1 && s->rx_ics_count < 1)
+        return;
+    s->tx_ics_count = 0;
+    s->rx_ics_count = 0;
+    start_xmit(s);
+    set_ics(s, 0, s->int_cause);
+    s->int_cause = 0;
+}
+#endif /* MITIGATION */
+
 static int pci_e1000_init(PCIDevice *pci_dev)
 {
     E1000State *d = DO_UPCAST(E1000State, dev, pci_dev);
@@ -1231,6 +1312,9 @@ static int pci_e1000_init(PCIDevice *pci
 
     e1000_mmio_setup(d);
 
+#ifdef MITIGATION
+    d->int_bh = qemu_bh_new(e1000_int_bh, d);
+#endif /* MITIGATION */
     pci_register_bar(&d->dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY, &d->mmio);
 
     pci_register_bar(&d->dev, 1, PCI_BASE_ADDRESS_SPACE_IO, &d->io);
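
To make the policy easier to follow outside the diff context, here is a small standalone sketch of the rx-side coalescing logic. This is only an illustration, not QEMU code: rx_notify(), raise_irq(), bh_flush() and the bh_scheduled flag are made-up stand-ins for e1000_receive(), set_ics() and the QEMU bottom half, and it drops the MIT_RXDMT0_SENT latching detail from the patch.

/*
 * Standalone sketch of the rx-side mitigation policy: the first
 * notification raises an interrupt immediately, later ones only
 * accumulate cause bits and are flushed by a deferred callback
 * (a QEMU bottom half in the real patch), unless the guest is
 * running low on receive descriptors (RXDMT0).
 */
#include <stdio.h>
#include <stdbool.h>

#define ICS_RXT0    0x80       /* rx timer cause bit, as in e1000_hw.h   */
#define ICS_RXDMT0  0x10       /* rx descriptors below threshold          */

static int rx_ics_count;       /* notifications since the last interrupt */
static int int_cause;          /* coalesced cause bits                   */
static bool bh_scheduled;      /* stands in for qemu_bh_schedule_idle()  */

static void raise_irq(int cause)
{
    printf("interrupt raised, cause 0x%x\n", cause);
}

/* called once per received packet, 'cause' as set_ics() would see it */
static void rx_notify(int cause)
{
    int_cause |= cause;

    if (rx_ics_count == 0 || (cause & ICS_RXDMT0)) {
        raise_irq(int_cause);  /* first packet or ring getting low: no delay */
        int_cause = 0;
        rx_ics_count = 1;
    } else {
        rx_ics_count++;        /* coalesce; the "BH" delivers it later */
        bh_scheduled = true;
    }
}

/* stands in for the e1000_int_bh() callback in the patch */
static void bh_flush(void)
{
    if (!bh_scheduled || rx_ics_count == 0)
        return;
    bh_scheduled = false;
    rx_ics_count = 0;
    raise_irq(int_cause);
    int_cause = 0;
}

int main(void)
{
    for (int i = 0; i < 10; i++)
        rx_notify(ICS_RXT0);   /* ten packets -> one immediate interrupt */
    bh_flush();                /* plus one deferred interrupt for the rest */
    return 0;
}

Built with gcc -std=c99, the ten simulated packets produce two interrupts instead of ten: one immediate for the first packet and one deferred covering the other nine, which is the exit-rate reduction the patch is after.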