I noticed that the various NIC models in qemu/kvm do not implement interrupt mitigation, which would be very beneficial as it dramatically reduces the number of VM exits taken by the guest.
As a proof of concept I tried to implement it for the e1000 device (patch below): it brings tx performance from about 9 to 56 Kpps with qemu-softmmu, and from ~20 to 140 Kpps with qemu-kvm. I am going to measure the effect of rx interrupt mitigation in the next couple of days.

Is there any interest in having this code in?

cheers
luigi

diff -ubwrp --exclude '*.[do]' /tmp/qemu-61dc008/hw/e1000.c ./hw/e1000.c
--- /tmp/qemu-61dc008/hw/e1000.c	2012-07-20 01:25:52.000000000 +0200
+++ ./hw/e1000.c	2012-07-24 18:21:39.000000000 +0200
@@ -33,6 +33,8 @@
 #include "sysemu.h"
 #include "dma.h"
 
+#define MITIGATION
+
 #include "e1000_hw.h"
 
 #define E1000_DEBUG
@@ -127,6 +129,13 @@ typedef struct E1000State_st {
     } eecd_state;
     QEMUTimer *autoneg_timer;
+
+#ifdef MITIGATION
+    QEMUBH *int_bh;        /* interrupt mitigation handler */
+    int tx_ics_count;      /* pending tx interrupt requests */
+    int rx_ics_count;      /* pending rx interrupt requests */
+    int int_cause;         /* accumulated interrupt cause */
+#endif /* MITIGATION */
 } E1000State;
 
 #define defreg(x)	x = (E1000_##x>>2)
@@ -638,6 +648,26 @@ start_xmit(E1000State *s)
         return;
     }
 
+#ifdef MITIGATION
+    /* Transmit the first few packets right away; after that, only
+     * transmit when the ring is approaching full, in which case we
+     * also raise an interrupt (ICS).
+     */
+{
+    int len, pending;
+
+    len = s->mac_reg[TDLEN] / sizeof(desc);
+    pending = s->mac_reg[TDT] - s->mac_reg[TDH];
+    if (pending < 0)
+        pending += len;
+    /* Ignore requests after the first few ones, as long as we are
+     * not approaching a full ring; otherwise fall through and
+     * deliver the packets to the backend.
+     */
+    if (s->tx_ics_count > 4 && s->tx_ics_count + pending < len - 5)
+        return;
+#endif /* MITIGATION */
+
     while (s->mac_reg[TDH] != s->mac_reg[TDT]) {
         base = tx_desc_base(s) +
                sizeof(struct e1000_tx_desc) * s->mac_reg[TDH];
@@ -663,7 +693,21 @@ start_xmit(E1000State *s)
             break;
         }
     }
+#ifdef MITIGATION
+    s->int_cause |= cause;      /* remember the interrupt cause */
+    s->tx_ics_count += pending;
+    if (s->tx_ics_count >= len - 5) {
+        /* the ring is about to become full: generate an interrupt now */
+        set_ics(s, 0, s->int_cause);
+        s->tx_ics_count = 0;
+        s->int_cause = 0;
+    } else {
+        /* otherwise just schedule it for later */
+        qemu_bh_schedule_idle(s->int_bh);
+    }
+}
+#else /* !MITIGATION */
     set_ics(s, 0, cause);
+#endif /* !MITIGATION */
 }
 
 static int
@@ -875,7 +919,27 @@ e1000_receive(VLANClientState *nc, const
         s->rxbuf_min_shift)
         n |= E1000_ICS_RXDMT0;
 
+#ifdef MITIGATION
+#define MIT_RXDMT0_SENT 100000  /* large sentinel value */
+    s->int_cause |= n;
+    if (s->rx_ics_count == 0) {
+        /* deliver the first interrupt immediately */
+        set_ics(s, 0, s->int_cause);
+        s->int_cause = 0;
+        s->rx_ics_count++;
+    } else if ((n & E1000_ICS_RXDMT0) && s->rx_ics_count < MIT_RXDMT0_SENT) {
+        /* also deliver if we are approaching ring full */
+        set_ics(s, 0, s->int_cause);
+        s->int_cause = 0;
+        s->rx_ics_count = MIT_RXDMT0_SENT;
+    } else {
+        /* otherwise schedule for later */
+        s->rx_ics_count++;
+        qemu_bh_schedule_idle(s->int_bh);
+    }
+#else /* !MITIGATION */
     set_ics(s, 0, n);
+#endif /* !MITIGATION */
 
     return size;
 }
@@ -1214,6 +1281,20 @@ static NetClientInfo net_e1000_info = {
     .link_status_changed = e1000_set_link_status,
 };
 
+#ifdef MITIGATION
+static void e1000_int_bh(void *opaque)
+{
+    E1000State *s = opaque;
+
+    if (s->tx_ics_count < 1 && s->rx_ics_count < 1)
+        return;
+    s->tx_ics_count = 0;
+    s->rx_ics_count = 0;
+    start_xmit(s);
+    set_ics(s, 0, s->int_cause);
+    s->int_cause = 0;
+}
+#endif /* MITIGATION */
+
 static int pci_e1000_init(PCIDevice *pci_dev)
 {
     E1000State *d = DO_UPCAST(E1000State, dev, pci_dev);
@@ -1231,6 +1312,9 @@ static int pci_e1000_init(PCIDevice *pci
 
     e1000_mmio_setup(d);
 
+#ifdef MITIGATION
+    d->int_bh = qemu_bh_new(e1000_int_bh, d);
+#endif /* MITIGATION */
     pci_register_bar(&d->dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY, &d->mmio);
 
     pci_register_bar(&d->dev, 1, PCI_BASE_ADDRESS_SPACE_IO, &d->io);
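
To make the policy easier to follow outside the diff context, here is a small standalone sketch of the rx-side coalescing logic. This is only an illustration, not QEMU code: rx_notify(), raise_irq(), bh_flush() and the bh_scheduled flag are made-up stand-ins for e1000_receive(), set_ics() and the QEMU bottom half, and it drops the MIT_RXDMT0_SENT latching detail from the patch.

/*
 * Standalone sketch of the rx-side mitigation policy: the first
 * notification raises an interrupt immediately, later ones only
 * accumulate cause bits and are flushed by a deferred callback
 * (a QEMU bottom half in the real patch), unless the guest is
 * running low on receive descriptors (RXDMT0).
 */
#include <stdio.h>
#include <stdbool.h>

#define ICS_RXT0    0x80       /* rx timer cause bit, as in e1000_hw.h   */
#define ICS_RXDMT0  0x10       /* rx descriptors below threshold          */

static int rx_ics_count;       /* notifications since the last interrupt */
static int int_cause;          /* coalesced cause bits                   */
static bool bh_scheduled;      /* stands in for qemu_bh_schedule_idle()  */

static void raise_irq(int cause)
{
    printf("interrupt raised, cause 0x%x\n", cause);
}

/* called once per received packet, 'cause' as set_ics() would see it */
static void rx_notify(int cause)
{
    int_cause |= cause;

    if (rx_ics_count == 0 || (cause & ICS_RXDMT0)) {
        raise_irq(int_cause);  /* first packet or ring getting low: no delay */
        int_cause = 0;
        rx_ics_count = 1;
    } else {
        rx_ics_count++;        /* coalesce; the "BH" delivers it later */
        bh_scheduled = true;
    }
}

/* stands in for the e1000_int_bh() callback in the patch */
static void bh_flush(void)
{
    if (!bh_scheduled || rx_ics_count == 0)
        return;
    bh_scheduled = false;
    rx_ics_count = 0;
    raise_irq(int_cause);
    int_cause = 0;
}

int main(void)
{
    for (int i = 0; i < 10; i++)
        rx_notify(ICS_RXT0);   /* ten packets -> one immediate interrupt */
    bh_flush();                /* plus one deferred interrupt for the rest */
    return 0;
}

Built with gcc -std=c99, the ten simulated packets produce two interrupts instead of ten: one immediate for the first packet and one deferred covering the other nine, which is the exit-rate reduction the patch is after.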