This brings bge(4) up to about where em(4) is.  It involves a few different
changes, notably:
- per-ring refill timeouts, to ensure we don't try to refill a ring from the
timeout and an interrupt at the same time
- removing the list of tx dma maps and just assigning a map to use based
on the current ring slot number (this is why the diff removes more lines
than it adds)
- using atomics to adjust bge_txcnt, saving those adjustments until the
end of the tx/txeof loops, and only acting on the adjusted value
- not adding any mutexes

I've tested it on amd64 with these:

bge0 at pci1 dev 0 function 0 "Broadcom BCM5721" rev 0x21, BCM5750 C1 (0x4201): 
msi, address 00:18:f3:d1:80:64

bge0 at pci2 dev 2 function 0 "Broadcom BCM5703X" rev 0x02, BCM5702/5703 A2 
(0x1002): apic 3 int 1, address 00:09:3d:00:84:d1

and on sparc64:

bge0 at pci7 dev 4 function 0 "Broadcom BCM5714" rev 0xa3, BCM5715 A3 (0x9003): 
ivec 0x795, address 00:14:4f:00:5a:5a

On the sparc64 box (a v245), with if_input_process unlocked, this gets me 10-20%
more pps, or about 100Mbps more in tcpbench (550Mbps, up from 450Mbps).

ok?

Index: if_bgereg.h
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_bgereg.h,v
retrieving revision 1.127
diff -u -p -u -p -r1.127 if_bgereg.h
--- if_bgereg.h 11 Sep 2015 13:02:28 -0000      1.127
+++ if_bgereg.h 30 Sep 2015 11:14:09 -0000
@@ -2830,11 +2830,6 @@ struct bge_type {
 #define        BGE_TIMEOUT             100000
 #define        BGE_TXCONS_UNSET                0xFFFF  /* impossible value */
 
-struct txdmamap_pool_entry {
-       bus_dmamap_t dmamap;
-       SLIST_ENTRY(txdmamap_pool_entry) link;
-};
-
 #define        ASF_ENABLE              1
 #define        ASF_NEW_HANDSHAKE       2
 #define        ASF_STACKUP             4
@@ -2934,11 +2929,11 @@ struct bge_softc {
        int                     bge_txcnt;
        struct timeout          bge_timeout;
        struct timeout          bge_rxtimeout;
+       struct timeout          bge_rxtimeout_jumbo;
        u_int32_t               bge_rx_discards;
        u_int32_t               bge_tx_discards;
        u_int32_t               bge_rx_inerrors;
        u_int32_t               bge_rx_overruns;
        u_int32_t               bge_tx_collisions;
-       SLIST_HEAD(, txdmamap_pool_entry) txdma_list;
-       struct txdmamap_pool_entry *txdma[BGE_TX_RING_CNT];
+       bus_dmamap_t            bge_txdma[BGE_TX_RING_CNT];
 };
Index: if_bge.c
===================================================================
RCS file: /cvs/src/sys/dev/pci/if_bge.c,v
retrieving revision 1.369
diff -u -p -u -p -r1.369 if_bge.c
--- if_bge.c    19 Jul 2015 06:28:12 -0000      1.369
+++ if_bge.c    5 Oct 2015 01:06:21 -0000
@@ -141,7 +141,7 @@ void bge_tick(void *);
 void bge_stats_update(struct bge_softc *);
 void bge_stats_update_regs(struct bge_softc *);
 int bge_cksum_pad(struct mbuf *);
-int bge_encap(struct bge_softc *, struct mbuf *, u_int32_t *);
+int bge_encap(struct bge_softc *, struct mbuf *, int *);
 int bge_compact_dma_runt(struct mbuf *);
 
 int bge_intr(void *);
@@ -1262,20 +1262,31 @@ uncreate:
        return (1);
 }
 
+/*
+ * When the refill timeout for a ring is active, that ring is so empty
+ * that no more packets can be received on it, so the interrupt handler
+ * will not attempt to refill it, meaning we don't need to protect against
+ * interrupts here.
+ */
+
 void
 bge_rxtick(void *arg)
 {
        struct bge_softc *sc = arg;
-       int s;
 
-       s = splnet();
        if (ISSET(sc->bge_flags, BGE_RXRING_VALID) &&
            if_rxr_inuse(&sc->bge_std_ring) <= 8)
                bge_fill_rx_ring_std(sc);
+}
+
+void
+bge_rxtick_jumbo(void *arg)
+{
+       struct bge_softc *sc = arg;
+
        if (ISSET(sc->bge_flags, BGE_JUMBO_RXRING_VALID) &&
            if_rxr_inuse(&sc->bge_jumbo_ring) <= 8)
                bge_fill_rx_ring_jumbo(sc);
-       splx(s);
 }
 
 void
@@ -1410,7 +1421,7 @@ bge_fill_rx_ring_jumbo(struct bge_softc 
         * that now, then try again later.
         */
        if (if_rxr_inuse(&sc->bge_jumbo_ring) <= 8)
-               timeout_add(&sc->bge_rxtimeout, 1);
+               timeout_add(&sc->bge_rxtimeout_jumbo, 1);
 }
 
 void
@@ -1446,7 +1457,6 @@ void
 bge_free_tx_ring(struct bge_softc *sc)
 {
        int i;
-       struct txdmamap_pool_entry *dma;
 
        if (!(sc->bge_flags & BGE_TXRING_VALID))
                return;
@@ -1455,18 +1465,12 @@ bge_free_tx_ring(struct bge_softc *sc)
                if (sc->bge_cdata.bge_tx_chain[i] != NULL) {
                        m_freem(sc->bge_cdata.bge_tx_chain[i]);
                        sc->bge_cdata.bge_tx_chain[i] = NULL;
-                       SLIST_INSERT_HEAD(&sc->txdma_list, sc->txdma[i],
-                                           link);
-                       sc->txdma[i] = 0;
+                       sc->bge_cdata.bge_tx_map[i] = NULL;
                }
                bzero(&sc->bge_rdata->bge_tx_ring[i],
                    sizeof(struct bge_tx_bd));
-       }
 
-       while ((dma = SLIST_FIRST(&sc->txdma_list))) {
-               SLIST_REMOVE_HEAD(&sc->txdma_list, link);
-               bus_dmamap_destroy(sc->bge_dmatag, dma->dmamap);
-               free(dma, M_DEVBUF, 0);
+               bus_dmamap_destroy(sc->bge_dmatag, sc->bge_txdma[i]);
        }
 
        sc->bge_flags &= ~BGE_TXRING_VALID;
@@ -1476,9 +1480,7 @@ int
 bge_init_tx_ring(struct bge_softc *sc)
 {
        int i;
-       bus_dmamap_t dmamap;
        bus_size_t txsegsz, txmaxsegsz;
-       struct txdmamap_pool_entry *dma;
 
        if (sc->bge_flags & BGE_TXRING_VALID)
                return (0);
@@ -1505,22 +1507,10 @@ bge_init_tx_ring(struct bge_softc *sc)
                txmaxsegsz = MCLBYTES;
        }
 
-       SLIST_INIT(&sc->txdma_list);
        for (i = 0; i < BGE_TX_RING_CNT; i++) {
                if (bus_dmamap_create(sc->bge_dmatag, txmaxsegsz,
-                   BGE_NTXSEG, txsegsz, 0, BUS_DMA_NOWAIT, &dmamap))
+                   BGE_NTXSEG, txsegsz, 0, BUS_DMA_NOWAIT, &sc->bge_txdma[i]))
                        return (ENOBUFS);
-               if (dmamap == NULL)
-                       panic("dmamap NULL in bge_init_tx_ring");
-               dma = malloc(sizeof(*dma), M_DEVBUF, M_NOWAIT);
-               if (dma == NULL) {
-                       printf("%s: can't alloc txdmamap_pool_entry\n",
-                           sc->bge_dev.dv_xname);
-                       bus_dmamap_destroy(sc->bge_dmatag, dmamap);
-                       return (ENOMEM);
-               }
-               dma->dmamap = dmamap;
-               SLIST_INSERT_HEAD(&sc->txdma_list, dma, link);
        }
 
        sc->bge_flags |= BGE_TXRING_VALID;
@@ -3081,8 +3071,8 @@ bge_attach(struct device *parent, struct
 
        /* Hookup IRQ last. */
        DPRINTFN(5, ("pci_intr_establish\n"));
-       sc->bge_intrhand = pci_intr_establish(pc, ih, IPL_NET, bge_intr, sc,
-           sc->bge_dev.dv_xname);
+       sc->bge_intrhand = pci_intr_establish(pc, ih, IPL_NET | IPL_MPSAFE,
+           bge_intr, sc, sc->bge_dev.dv_xname);
        if (sc->bge_intrhand == NULL) {
                printf(": couldn't establish interrupt");
                if (intrstr != NULL)
@@ -3139,6 +3129,7 @@ bge_attach(struct device *parent, struct
 
        timeout_set(&sc->bge_timeout, bge_tick, sc);
        timeout_set(&sc->bge_rxtimeout, bge_rxtick, sc);
+       timeout_set(&sc->bge_rxtimeout_jumbo, bge_rxtick_jumbo, sc);
        return;
 
 fail_6:
@@ -3578,15 +3569,17 @@ bge_txeof(struct bge_softc *sc)
 {
        struct bge_tx_bd *cur_tx = NULL;
        struct ifnet *ifp;
-       struct txdmamap_pool_entry *dma;
+       bus_dmamap_t dmamap;
        bus_addr_t offset, toff;
        bus_size_t tlen;
-       int tosync;
+       int tosync, freed, txcnt;
+       u_int32_t cons, newcons;
        struct mbuf *m;
 
        /* Nothing to do */
-       if (sc->bge_tx_saved_considx ==
-           sc->bge_rdata->bge_status_block.bge_idx[0].bge_tx_cons_idx)
+       cons = sc->bge_tx_saved_considx; 
+       newcons = sc->bge_rdata->bge_status_block.bge_idx[0].bge_tx_cons_idx;
+       if (cons == newcons)
                return;
 
        ifp = &sc->arpcom.ac_if;
@@ -3597,14 +3590,12 @@ bge_txeof(struct bge_softc *sc)
            BUS_DMASYNC_POSTREAD);
 
        offset = offsetof(struct bge_ring_data, bge_tx_ring);
-       tosync = sc->bge_rdata->bge_status_block.bge_idx[0].bge_tx_cons_idx -
-           sc->bge_tx_saved_considx;
+       tosync = newcons - cons;
 
-       toff = offset + (sc->bge_tx_saved_considx * sizeof (struct bge_tx_bd));
+       toff = offset + (cons * sizeof (struct bge_tx_bd));
 
        if (tosync < 0) {
-               tlen = (BGE_TX_RING_CNT - sc->bge_tx_saved_considx) *
-                   sizeof (struct bge_tx_bd);
+               tlen = (BGE_TX_RING_CNT - cons) * sizeof (struct bge_tx_bd);
                bus_dmamap_sync(sc->bge_dmatag, sc->bge_ring_map,
                    toff, tlen, BUS_DMASYNC_POSTREAD|BUS_DMASYNC_POSTWRITE);
                tosync = -tosync;
@@ -3618,34 +3609,35 @@ bge_txeof(struct bge_softc *sc)
         * Go through our tx ring and free mbufs for those
         * frames that have been sent.
         */
-       while (sc->bge_tx_saved_considx !=
-           sc->bge_rdata->bge_status_block.bge_idx[0].bge_tx_cons_idx) {
-               u_int32_t               idx = 0;
-
-               idx = sc->bge_tx_saved_considx;
-               cur_tx = &sc->bge_rdata->bge_tx_ring[idx];
+       freed = 0;
+       while (cons != newcons) {
+               cur_tx = &sc->bge_rdata->bge_tx_ring[cons];
                if (cur_tx->bge_flags & BGE_TXBDFLAG_END)
                        ifp->if_opackets++;
-               m = sc->bge_cdata.bge_tx_chain[idx];
+               m = sc->bge_cdata.bge_tx_chain[cons];
                if (m != NULL) {
-                       sc->bge_cdata.bge_tx_chain[idx] = NULL;
-                       dma = sc->txdma[idx];
-                       bus_dmamap_sync(sc->bge_dmatag, dma->dmamap, 0,
-                           dma->dmamap->dm_mapsize, BUS_DMASYNC_POSTWRITE);
-                       bus_dmamap_unload(sc->bge_dmatag, dma->dmamap);
-                       SLIST_INSERT_HEAD(&sc->txdma_list, dma, link);
-                       sc->txdma[idx] = NULL;
+                       dmamap = sc->bge_cdata.bge_tx_map[cons];
+
+                       sc->bge_cdata.bge_tx_chain[cons] = NULL;
+                       sc->bge_cdata.bge_tx_map[cons] = NULL;
+                       bus_dmamap_sync(sc->bge_dmatag, dmamap, 0,
+                           dmamap->dm_mapsize, BUS_DMASYNC_POSTWRITE);
+                       bus_dmamap_unload(sc->bge_dmatag, dmamap);
 
                        m_freem(m);
                }
-               sc->bge_txcnt--;
-               BGE_INC(sc->bge_tx_saved_considx, BGE_TX_RING_CNT);
+               freed++;
+               BGE_INC(cons, BGE_TX_RING_CNT);
        }
 
-       if (sc->bge_txcnt < BGE_TX_RING_CNT - 16)
+       txcnt = atomic_sub_int_nv(&sc->bge_txcnt, freed);
+
+       if (txcnt < BGE_TX_RING_CNT - 16)
                ifp->if_flags &= ~IFF_OACTIVE;
-       if (sc->bge_txcnt == 0)
+       if (txcnt == 0)
                ifp->if_timer = 0;
+
+       sc->bge_tx_saved_considx = cons;
 }
 
 int
@@ -3693,8 +3685,11 @@ bge_intr(void *xsc)
 
        if (BGE_ASICREV(sc->bge_chipid) == BGE_ASICREV_BCM5700 ||
            statusword & BGE_STATFLAG_LINKSTATE_CHANGED ||
-           BGE_STS_BIT(sc, BGE_STS_LINK_EVT))
+           BGE_STS_BIT(sc, BGE_STS_LINK_EVT)) {
+               KERNEL_LOCK();
                bge_link_upd(sc);
+               KERNEL_UNLOCK();
+       }
 
        /* Re-enable interrupts. */
        bge_writembx(sc, BGE_MBX_IRQ0_LO, statustag);
@@ -3706,8 +3701,11 @@ bge_intr(void *xsc)
                /* Check TX ring producer/consumer */
                bge_txeof(sc);
 
-               if (!IFQ_IS_EMPTY(&ifp->if_snd))
+               if (!IFQ_IS_EMPTY(&ifp->if_snd)) {
+                       KERNEL_LOCK();
                        bge_start(ifp);
+                       KERNEL_UNLOCK();
+               }
        }
 
        return (1);
@@ -3987,16 +3985,15 @@ bge_cksum_pad(struct mbuf *m)
  * pointers to descriptors.
  */
 int
-bge_encap(struct bge_softc *sc, struct mbuf *m_head, u_int32_t *txidx)
+bge_encap(struct bge_softc *sc, struct mbuf *m_head, int *txinc)
 {
        struct bge_tx_bd        *f = NULL;
        u_int32_t               frag, cur;
        u_int16_t               csum_flags = 0;
-       struct txdmamap_pool_entry *dma;
-       bus_dmamap_t dmamap;
+       bus_dmamap_t            dmamap;
        int                     i = 0;
 
-       cur = frag = *txidx;
+       cur = frag = (sc->bge_tx_prodidx + *txinc) % BGE_TX_RING_CNT;
 
        if (m_head->m_pkthdr.csum_flags) {
                if (m_head->m_pkthdr.csum_flags & M_IPV4_CSUM_OUT)
@@ -4026,10 +4023,7 @@ bge_encap(struct bge_softc *sc, struct m
                return (ENOBUFS);
 
 doit:
-       dma = SLIST_FIRST(&sc->txdma_list);
-       if (dma == NULL)
-               return (ENOBUFS);
-       dmamap = dma->dmamap;
+       dmamap = sc->bge_txdma[cur];
 
        /*
         * Start packing the mbufs in this chain into
@@ -4052,7 +4046,7 @@ doit:
        }
 
        /* Check if we have enough free send BDs. */
-       if (sc->bge_txcnt + dmamap->dm_nsegs >= BGE_TX_RING_CNT)
+       if (sc->bge_txcnt + *txinc + dmamap->dm_nsegs >= BGE_TX_RING_CNT)
                goto fail_unload;
 
        for (i = 0; i < dmamap->dm_nsegs; i++) {
@@ -4084,11 +4078,9 @@ doit:
 
        sc->bge_rdata->bge_tx_ring[cur].bge_flags |= BGE_TXBDFLAG_END;
        sc->bge_cdata.bge_tx_chain[cur] = m_head;
-       SLIST_REMOVE_HEAD(&sc->txdma_list, link);
-       sc->txdma[cur] = dma;
-       sc->bge_txcnt += dmamap->dm_nsegs;
-
-       *txidx = frag;
+       sc->bge_cdata.bge_tx_map[cur] = dmamap;
+       
+       *txinc += dmamap->dm_nsegs;
 
        return (0);
 
@@ -4107,8 +4099,7 @@ bge_start(struct ifnet *ifp)
 {
        struct bge_softc *sc;
        struct mbuf *m_head;
-       u_int32_t prodidx;
-       int pkts;
+       int txinc;
 
        sc = ifp->if_softc;
 
@@ -4117,55 +4108,44 @@ bge_start(struct ifnet *ifp)
        if (!BGE_STS_BIT(sc, BGE_STS_LINK))
                return;
 
-       prodidx = sc->bge_tx_prodidx;
-
-       for (pkts = 0; !IFQ_IS_EMPTY(&ifp->if_snd);) {
-               if (sc->bge_txcnt > BGE_TX_RING_CNT - 16) {
-                       ifp->if_flags |= IFF_OACTIVE;
-                       break;
-               }
-
+       txinc = 0;
+       while (1) {
                IFQ_POLL(&ifp->if_snd, m_head);
                if (m_head == NULL)
                        break;
 
-               /*
-                * Pack the data into the transmit ring. If we
-                * don't have room, set the OACTIVE flag and wait
-                * for the NIC to drain the ring.
-                */
-               if (bge_encap(sc, m_head, &prodidx)) {
-                       ifp->if_flags |= IFF_OACTIVE;
+               if (bge_encap(sc, m_head, &txinc))
                        break;
-               }
 
                /* now we are committed to transmit the packet */
                IFQ_DEQUEUE(&ifp->if_snd, m_head);
-               pkts++;
 
 #if NBPFILTER > 0
-               /*
-                * If there's a BPF listener, bounce a copy of this frame
-                * to him.
-                */
                if (ifp->if_bpf)
                        bpf_mtap_ether(ifp->if_bpf, m_head, BPF_DIRECTION_OUT);
 #endif
        }
-       if (pkts == 0)
-               return;
 
-       /* Transmit */
-       bge_writembx(sc, BGE_MBX_TX_HOST_PROD0_LO, prodidx);
-       if (BGE_CHIPREV(sc->bge_chipid) == BGE_CHIPREV_5700_BX)
-               bge_writembx(sc, BGE_MBX_TX_HOST_PROD0_LO, prodidx);
+       if (txinc != 0) {
+               int txcnt;
 
-       sc->bge_tx_prodidx = prodidx;
+               /* Transmit */
+               sc->bge_tx_prodidx = (sc->bge_tx_prodidx + txinc) %
+                   BGE_TX_RING_CNT;
+               bge_writembx(sc, BGE_MBX_TX_HOST_PROD0_LO, sc->bge_tx_prodidx);
+               if (BGE_CHIPREV(sc->bge_chipid) == BGE_CHIPREV_5700_BX)
+                       bge_writembx(sc, BGE_MBX_TX_HOST_PROD0_LO,
+                           sc->bge_tx_prodidx);
 
-       /*
-        * Set a timeout in case the chip goes out to lunch.
-        */
-       ifp->if_timer = 5;
+               txcnt = atomic_add_int_nv(&sc->bge_txcnt, txinc);
+               if (txcnt > BGE_TX_RING_CNT - 16)
+                       ifp->if_flags |= IFF_OACTIVE;
+
+               /*
+                * Set a timeout in case the chip goes out to lunch.
+                */
+               ifp->if_timer = 5;
+       }
 }
 
 void
@@ -4580,6 +4560,7 @@ bge_stop(struct bge_softc *sc)
 
        timeout_del(&sc->bge_timeout);
        timeout_del(&sc->bge_rxtimeout);
+       timeout_del(&sc->bge_rxtimeout_jumbo);
 
        ifp->if_flags &= ~(IFF_RUNNING | IFF_OACTIVE);
 
@@ -4639,6 +4620,8 @@ bge_stop(struct bge_softc *sc)
         * Tell firmware we're shutting down.
         */
        BGE_CLRBIT(sc, BGE_MODE_CTL, BGE_MODECTL_STACKUP);
+
+       intr_barrier(sc->bge_intrhand);
 
        /* Free the RX lists. */
        bge_free_rx_ring_std(sc);

Reply via email to