This brings bge(4) up to about where em(4) is. It involves a few different changes, notably:

- per-ring refill timeouts, to ensure we don't try to refill a ring from the timeout and from an interrupt at the same time
- removing the list of tx dma maps and just assigning a map to use based on the current ring slot number (this is why the diff removes more lines than it adds)
- using atomics to adjust bge_txcnt, saving those adjustments until the end of the tx/txeof loops, and only acting on the adjusted value
- not adding any mutexes

I've tested it on amd64 with these: bge0 at pci1 dev 0 function 0 "Broadcom BCM5721" rev 0x21, BCM5750 C1 (0x4201): msi, address 00:18:f3:d1:80:64 bge0 at pci2 dev 2 function 0 "Broadcom BCM5703X" rev 0x02, BCM5702/5703 A2 (0x1002): apic 3 int 1, address 00:09:3d:00:84:d1 and on sparc64: bge0 at pci7 dev 4 function 0 "Broadcom BCM5714" rev 0xa3, BCM5715 A3 (0x9003): ivec 0x795, address 00:14:4f:00:5a:5a On the sparc64 box (a v245), with if_input_process unlocked, this gets me 10-20% more pps or about 100mbps more in tcpbench (550, up from 450). ok? Index: if_bgereg.h =================================================================== RCS file: /cvs/src/sys/dev/pci/if_bgereg.h,v retrieving revision 1.127 diff -u -p -u -p -r1.127 if_bgereg.h --- if_bgereg.h 11 Sep 2015 13:02:28 -0000 1.127 +++ if_bgereg.h 30 Sep 2015 11:14:09 -0000 @@ -2830,11 +2830,6 @@ struct bge_type { #define BGE_TIMEOUT 100000 #define BGE_TXCONS_UNSET 0xFFFF /* impossible value */ -struct txdmamap_pool_entry { - bus_dmamap_t dmamap; - SLIST_ENTRY(txdmamap_pool_entry) link; -}; - #define ASF_ENABLE 1 #define ASF_NEW_HANDSHAKE 2 #define ASF_STACKUP 4 @@ -2934,11 +2929,11 @@ struct bge_softc { int bge_txcnt; struct timeout bge_timeout; struct timeout bge_rxtimeout; + struct timeout bge_rxtimeout_jumbo; u_int32_t bge_rx_discards; u_int32_t bge_tx_discards; u_int32_t bge_rx_inerrors; u_int32_t bge_rx_overruns; u_int32_t bge_tx_collisions; - SLIST_HEAD(, txdmamap_pool_entry) txdma_list; - struct txdmamap_pool_entry *txdma[BGE_TX_RING_CNT]; + bus_dmamap_t bge_txdma[BGE_TX_RING_CNT]; }; Index: if_bge.c =================================================================== RCS file: /cvs/src/sys/dev/pci/if_bge.c,v retrieving revision 1.369 diff -u -p -u -p -r1.369 if_bge.c --- if_bge.c 19 Jul 2015 06:28:12 -0000 1.369 +++ if_bge.c 5 Oct 2015 01:06:21 -0000 @@ -141,7 +141,7 @@ void bge_tick(void *); void bge_stats_update(struct bge_softc *); void bge_stats_update_regs(struct bge_softc *); int 
bge_cksum_pad(struct mbuf *); -int bge_encap(struct bge_softc *, struct mbuf *, u_int32_t *); +int bge_encap(struct bge_softc *, struct mbuf *, int *); int bge_compact_dma_runt(struct mbuf *); int bge_intr(void *); @@ -1262,20 +1262,31 @@ uncreate: return (1); } +/* + * When the refill timeout for a ring is active, that ring is so empty + * that no more packets can be received on it, so the interrupt handler + * will not attempt to refill it, meaning we don't need to protect against + * interrupts here. + */ + void bge_rxtick(void *arg) { struct bge_softc *sc = arg; - int s; - s = splnet(); if (ISSET(sc->bge_flags, BGE_RXRING_VALID) && if_rxr_inuse(&sc->bge_std_ring) <= 8) bge_fill_rx_ring_std(sc); +} + +void +bge_rxtick_jumbo(void *arg) +{ + struct bge_softc *sc = arg; + if (ISSET(sc->bge_flags, BGE_JUMBO_RXRING_VALID) && if_rxr_inuse(&sc->bge_jumbo_ring) <= 8) bge_fill_rx_ring_jumbo(sc); - splx(s); } void @@ -1410,7 +1421,7 @@ bge_fill_rx_ring_jumbo(struct bge_softc * that now, then try again later. 
*/ if (if_rxr_inuse(&sc->bge_jumbo_ring) <= 8) - timeout_add(&sc->bge_rxtimeout, 1); + timeout_add(&sc->bge_rxtimeout_jumbo, 1); } void @@ -1446,7 +1457,6 @@ void bge_free_tx_ring(struct bge_softc *sc) { int i; - struct txdmamap_pool_entry *dma; if (!(sc->bge_flags & BGE_TXRING_VALID)) return; @@ -1455,18 +1465,12 @@ bge_free_tx_ring(struct bge_softc *sc) if (sc->bge_cdata.bge_tx_chain[i] != NULL) { m_freem(sc->bge_cdata.bge_tx_chain[i]); sc->bge_cdata.bge_tx_chain[i] = NULL; - SLIST_INSERT_HEAD(&sc->txdma_list, sc->txdma[i], - link); - sc->txdma[i] = 0; + sc->bge_cdata.bge_tx_map[i] = NULL; } bzero(&sc->bge_rdata->bge_tx_ring[i], sizeof(struct bge_tx_bd)); - } - while ((dma = SLIST_FIRST(&sc->txdma_list))) { - SLIST_REMOVE_HEAD(&sc->txdma_list, link); - bus_dmamap_destroy(sc->bge_dmatag, dma->dmamap); - free(dma, M_DEVBUF, 0); + bus_dmamap_destroy(sc->bge_dmatag, sc->bge_txdma[i]); } sc->bge_flags &= ~BGE_TXRING_VALID; @@ -1476,9 +1480,7 @@ int bge_init_tx_ring(struct bge_softc *sc) { int i; - bus_dmamap_t dmamap; bus_size_t txsegsz, txmaxsegsz; - struct txdmamap_pool_entry *dma; if (sc->bge_flags & BGE_TXRING_VALID) return (0); @@ -1505,22 +1507,10 @@ bge_init_tx_ring(struct bge_softc *sc) txmaxsegsz = MCLBYTES; } - SLIST_INIT(&sc->txdma_list); for (i = 0; i < BGE_TX_RING_CNT; i++) { if (bus_dmamap_create(sc->bge_dmatag, txmaxsegsz, - BGE_NTXSEG, txsegsz, 0, BUS_DMA_NOWAIT, &dmamap)) + BGE_NTXSEG, txsegsz, 0, BUS_DMA_NOWAIT, &sc->bge_txdma[i])) return (ENOBUFS); - if (dmamap == NULL) - panic("dmamap NULL in bge_init_tx_ring"); - dma = malloc(sizeof(*dma), M_DEVBUF, M_NOWAIT); - if (dma == NULL) { - printf("%s: can't alloc txdmamap_pool_entry\n", - sc->bge_dev.dv_xname); - bus_dmamap_destroy(sc->bge_dmatag, dmamap); - return (ENOMEM); - } - dma->dmamap = dmamap; - SLIST_INSERT_HEAD(&sc->txdma_list, dma, link); } sc->bge_flags |= BGE_TXRING_VALID; @@ -3081,8 +3071,8 @@ bge_attach(struct device *parent, struct /* Hookup IRQ last. 
*/ DPRINTFN(5, ("pci_intr_establish\n")); - sc->bge_intrhand = pci_intr_establish(pc, ih, IPL_NET, bge_intr, sc, - sc->bge_dev.dv_xname); + sc->bge_intrhand = pci_intr_establish(pc, ih, IPL_NET | IPL_MPSAFE, + bge_intr, sc, sc->bge_dev.dv_xname); if (sc->bge_intrhand == NULL) { printf(": couldn't establish interrupt"); if (intrstr != NULL) @@ -3139,6 +3129,7 @@ bge_attach(struct device *parent, struct timeout_set(&sc->bge_timeout, bge_tick, sc); timeout_set(&sc->bge_rxtimeout, bge_rxtick, sc); + timeout_set(&sc->bge_rxtimeout_jumbo, bge_rxtick_jumbo, sc); return; fail_6: @@ -3578,15 +3569,17 @@ bge_txeof(struct bge_softc *sc) { struct bge_tx_bd *cur_tx = NULL; struct ifnet *ifp; - struct txdmamap_pool_entry *dma; + bus_dmamap_t dmamap; bus_addr_t offset, toff; bus_size_t tlen; - int tosync; + int tosync, freed, txcnt; + u_int32_t cons, newcons; struct mbuf *m; /* Nothing to do */ - if (sc->bge_tx_saved_considx == - sc->bge_rdata->bge_status_block.bge_idx[0].bge_tx_cons_idx) + cons = sc->bge_tx_saved_considx; + newcons = sc->bge_rdata->bge_status_block.bge_idx[0].bge_tx_cons_idx; + if (cons == newcons) return; ifp = &sc->arpcom.ac_if; @@ -3597,14 +3590,12 @@ bge_txeof(struct bge_softc *sc) BUS_DMASYNC_POSTREAD); offset = offsetof(struct bge_ring_data, bge_tx_ring); - tosync = sc->bge_rdata->bge_status_block.bge_idx[0].bge_tx_cons_idx - - sc->bge_tx_saved_considx; + tosync = newcons - cons; - toff = offset + (sc->bge_tx_saved_considx * sizeof (struct bge_tx_bd)); + toff = offset + (cons * sizeof (struct bge_tx_bd)); if (tosync < 0) { - tlen = (BGE_TX_RING_CNT - sc->bge_tx_saved_considx) * - sizeof (struct bge_tx_bd); + tlen = (BGE_TX_RING_CNT - cons) * sizeof (struct bge_tx_bd); bus_dmamap_sync(sc->bge_dmatag, sc->bge_ring_map, toff, tlen, BUS_DMASYNC_POSTREAD|BUS_DMASYNC_POSTWRITE); tosync = -tosync; @@ -3618,34 +3609,35 @@ bge_txeof(struct bge_softc *sc) * Go through our tx ring and free mbufs for those * frames that have been sent. 
*/ - while (sc->bge_tx_saved_considx != - sc->bge_rdata->bge_status_block.bge_idx[0].bge_tx_cons_idx) { - u_int32_t idx = 0; - - idx = sc->bge_tx_saved_considx; - cur_tx = &sc->bge_rdata->bge_tx_ring[idx]; + freed = 0; + while (cons != newcons) { + cur_tx = &sc->bge_rdata->bge_tx_ring[cons]; if (cur_tx->bge_flags & BGE_TXBDFLAG_END) ifp->if_opackets++; - m = sc->bge_cdata.bge_tx_chain[idx]; + m = sc->bge_cdata.bge_tx_chain[cons]; if (m != NULL) { - sc->bge_cdata.bge_tx_chain[idx] = NULL; - dma = sc->txdma[idx]; - bus_dmamap_sync(sc->bge_dmatag, dma->dmamap, 0, - dma->dmamap->dm_mapsize, BUS_DMASYNC_POSTWRITE); - bus_dmamap_unload(sc->bge_dmatag, dma->dmamap); - SLIST_INSERT_HEAD(&sc->txdma_list, dma, link); - sc->txdma[idx] = NULL; + dmamap = sc->bge_cdata.bge_tx_map[cons]; + + sc->bge_cdata.bge_tx_chain[cons] = NULL; + sc->bge_cdata.bge_tx_map[cons] = NULL; + bus_dmamap_sync(sc->bge_dmatag, dmamap, 0, + dmamap->dm_mapsize, BUS_DMASYNC_POSTWRITE); + bus_dmamap_unload(sc->bge_dmatag, dmamap); m_freem(m); } - sc->bge_txcnt--; - BGE_INC(sc->bge_tx_saved_considx, BGE_TX_RING_CNT); + freed++; + BGE_INC(cons, BGE_TX_RING_CNT); } - if (sc->bge_txcnt < BGE_TX_RING_CNT - 16) + txcnt = atomic_sub_int_nv(&sc->bge_txcnt, freed); + + if (txcnt < BGE_TX_RING_CNT - 16) ifp->if_flags &= ~IFF_OACTIVE; - if (sc->bge_txcnt == 0) + if (txcnt == 0) ifp->if_timer = 0; + + sc->bge_tx_saved_considx = cons; } int @@ -3693,8 +3685,11 @@ bge_intr(void *xsc) if (BGE_ASICREV(sc->bge_chipid) == BGE_ASICREV_BCM5700 || statusword & BGE_STATFLAG_LINKSTATE_CHANGED || - BGE_STS_BIT(sc, BGE_STS_LINK_EVT)) + BGE_STS_BIT(sc, BGE_STS_LINK_EVT)) { + KERNEL_LOCK(); bge_link_upd(sc); + KERNEL_UNLOCK(); + } /* Re-enable interrupts. 
*/ bge_writembx(sc, BGE_MBX_IRQ0_LO, statustag); @@ -3706,8 +3701,11 @@ bge_intr(void *xsc) /* Check TX ring producer/consumer */ bge_txeof(sc); - if (!IFQ_IS_EMPTY(&ifp->if_snd)) + if (!IFQ_IS_EMPTY(&ifp->if_snd)) { + KERNEL_LOCK(); bge_start(ifp); + KERNEL_UNLOCK(); + } } return (1); @@ -3987,16 +3985,15 @@ bge_cksum_pad(struct mbuf *m) * pointers to descriptors. */ int -bge_encap(struct bge_softc *sc, struct mbuf *m_head, u_int32_t *txidx) +bge_encap(struct bge_softc *sc, struct mbuf *m_head, int *txinc) { struct bge_tx_bd *f = NULL; u_int32_t frag, cur; u_int16_t csum_flags = 0; - struct txdmamap_pool_entry *dma; - bus_dmamap_t dmamap; + bus_dmamap_t dmamap; int i = 0; - cur = frag = *txidx; + cur = frag = (sc->bge_tx_prodidx + *txinc) % BGE_TX_RING_CNT; if (m_head->m_pkthdr.csum_flags) { if (m_head->m_pkthdr.csum_flags & M_IPV4_CSUM_OUT) @@ -4026,10 +4023,7 @@ bge_encap(struct bge_softc *sc, struct m return (ENOBUFS); doit: - dma = SLIST_FIRST(&sc->txdma_list); - if (dma == NULL) - return (ENOBUFS); - dmamap = dma->dmamap; + dmamap = sc->bge_txdma[cur]; /* * Start packing the mbufs in this chain into @@ -4052,7 +4046,7 @@ doit: } /* Check if we have enough free send BDs. 
*/ - if (sc->bge_txcnt + dmamap->dm_nsegs >= BGE_TX_RING_CNT) + if (sc->bge_txcnt + *txinc + dmamap->dm_nsegs >= BGE_TX_RING_CNT) goto fail_unload; for (i = 0; i < dmamap->dm_nsegs; i++) { @@ -4084,11 +4078,9 @@ doit: sc->bge_rdata->bge_tx_ring[cur].bge_flags |= BGE_TXBDFLAG_END; sc->bge_cdata.bge_tx_chain[cur] = m_head; - SLIST_REMOVE_HEAD(&sc->txdma_list, link); - sc->txdma[cur] = dma; - sc->bge_txcnt += dmamap->dm_nsegs; - - *txidx = frag; + sc->bge_cdata.bge_tx_map[cur] = dmamap; + + *txinc += dmamap->dm_nsegs; return (0); @@ -4107,8 +4099,7 @@ bge_start(struct ifnet *ifp) { struct bge_softc *sc; struct mbuf *m_head; - u_int32_t prodidx; - int pkts; + int txinc; sc = ifp->if_softc; @@ -4117,55 +4108,44 @@ bge_start(struct ifnet *ifp) if (!BGE_STS_BIT(sc, BGE_STS_LINK)) return; - prodidx = sc->bge_tx_prodidx; - - for (pkts = 0; !IFQ_IS_EMPTY(&ifp->if_snd);) { - if (sc->bge_txcnt > BGE_TX_RING_CNT - 16) { - ifp->if_flags |= IFF_OACTIVE; - break; - } - + txinc = 0; + while (1) { IFQ_POLL(&ifp->if_snd, m_head); if (m_head == NULL) break; - /* - * Pack the data into the transmit ring. If we - * don't have room, set the OACTIVE flag and wait - * for the NIC to drain the ring. - */ - if (bge_encap(sc, m_head, &prodidx)) { - ifp->if_flags |= IFF_OACTIVE; + if (bge_encap(sc, m_head, &txinc)) break; - } /* now we are committed to transmit the packet */ IFQ_DEQUEUE(&ifp->if_snd, m_head); - pkts++; #if NBPFILTER > 0 - /* - * If there's a BPF listener, bounce a copy of this frame - * to him. 
- */ if (ifp->if_bpf) bpf_mtap_ether(ifp->if_bpf, m_head, BPF_DIRECTION_OUT); #endif } - if (pkts == 0) - return; - /* Transmit */ - bge_writembx(sc, BGE_MBX_TX_HOST_PROD0_LO, prodidx); - if (BGE_CHIPREV(sc->bge_chipid) == BGE_CHIPREV_5700_BX) - bge_writembx(sc, BGE_MBX_TX_HOST_PROD0_LO, prodidx); + if (txinc != 0) { + int txcnt; - sc->bge_tx_prodidx = prodidx; + /* Transmit */ + sc->bge_tx_prodidx = (sc->bge_tx_prodidx + txinc) % + BGE_TX_RING_CNT; + bge_writembx(sc, BGE_MBX_TX_HOST_PROD0_LO, sc->bge_tx_prodidx); + if (BGE_CHIPREV(sc->bge_chipid) == BGE_CHIPREV_5700_BX) + bge_writembx(sc, BGE_MBX_TX_HOST_PROD0_LO, + sc->bge_tx_prodidx); - /* - * Set a timeout in case the chip goes out to lunch. - */ - ifp->if_timer = 5; + txcnt = atomic_add_int_nv(&sc->bge_txcnt, txinc); + if (txcnt > BGE_TX_RING_CNT - 16) + ifp->if_flags |= IFF_OACTIVE; + + /* + * Set a timeout in case the chip goes out to lunch. + */ + ifp->if_timer = 5; + } } void @@ -4580,6 +4560,7 @@ bge_stop(struct bge_softc *sc) timeout_del(&sc->bge_timeout); timeout_del(&sc->bge_rxtimeout); + timeout_del(&sc->bge_rxtimeout_jumbo); ifp->if_flags &= ~(IFF_RUNNING | IFF_OACTIVE); @@ -4639,6 +4620,8 @@ bge_stop(struct bge_softc *sc) * Tell firmware we're shutting down. */ BGE_CLRBIT(sc, BGE_MODE_CTL, BGE_MODECTL_STACKUP); + + intr_barrier(sc->bge_intrhand); /* Free the RX lists. */ bge_free_rx_ring_std(sc);