Hi,

This is a mail I've sent to dlg@ and kettenis@ and didn't
get an objection, but I would like a more widespread testing
to happen with this diff.

Please try it on all bge's you can. Your help is much
appreciated!

--------8<--------

I find the transmit path on 5719 here to be of questionable
reliability and it's certainly not something we can use in
production here.  If I run tcpbench in different rdomains
on the same machine with BCM5719 against an Intel 82580, I
can see the following picture for nearly every TCP stream
that originates from the bge:  http://pastie.org/8004262

I have verified that doing so between two 82580's or from
82580 to a bnx on the other machine doesn't produce this
result.  In fact even sending from the em towards bge
works fine, but if bge wants to transmit a lot I loose.

The culprit seems to be that the BGE_STATFLAG_UPDATED bit
in the statusword in the bge status block doesn't get
properly updated every time and we sometimes end up with
a stray interrupt.

Other systems don't use this since they're doing MSI for
newer controllers which implies tagged status and this bit
is no longer relevant there.  Making us do the same does
fix the issue for me.

The bge_intr chunk of the diff looks complicated, but in
fact it's due to the indentation change.  The condition
to return without acknowledging the interrupt is changed
to check the last status AND read the pci interrupt status
(like Linux does) for the MSI case and nothing is changed
for the !MSI case.

--------8<--------

diff --git sys/dev/pci/if_bge.c sys/dev/pci/if_bge.c
index 05bfaa4..f87b1fe 100644
--- sys/dev/pci/if_bge.c
+++ sys/dev/pci/if_bge.c
@@ -1623,16 +1623,19 @@ bge_phy_addr(struct bge_softc *sc)
  */
 void
 bge_chipinit(struct bge_softc *sc)
 {
        struct pci_attach_args  *pa = &(sc->bge_pa);
-       u_int32_t dma_rw_ctl, mode_ctl;
+       u_int32_t dma_rw_ctl, misc_ctl, mode_ctl;
        int i;
 
        /* Set endianness before we access any non-PCI registers. */
+       misc_ctl = BGE_INIT;
+       if (sc->bge_flags & BGE_TAGGED_STATUS)
+               misc_ctl |= BGE_PCIMISCCTL_TAGGED_STATUS;
        pci_conf_write(pa->pa_pc, pa->pa_tag, BGE_PCI_MISC_CTL,
-           BGE_INIT);
+           misc_ctl);
 
        /*
         * Clear the MAC statistics block in the NIC's
         * internal memory.
         */
@@ -2485,19 +2488,10 @@ bge_attach(struct device *parent, struct device *self, 
void *aux)
            &sc->bge_bhandle, NULL, &size, 0)) {
                printf(": can't find mem space\n");
                return;
        }
 
-       DPRINTFN(5, ("pci_intr_map\n"));
-       if (pci_intr_map(pa, &ih)) {
-               printf(": couldn't map interrupt\n");
-               goto fail_1;
-       }
-
-       DPRINTFN(5, ("pci_intr_string\n"));
-       intrstr = pci_intr_string(pc, ih);
-
        /*
         * Kludge for 5700 Bx bug: a hardware bug (PCIX byte enable?)
         * can clobber the chip's PCI config-space power control registers,
         * leaving the card in D3 powersave state.
         * We do not have memory-mapped registers in this state,
@@ -2749,10 +2743,21 @@ bge_attach(struct device *parent, struct device *self, 
void *aux)
            BGE_ASICREV(sc->bge_chipid) == BGE_ASICREV_BCM5761 ||
            BGE_ASICREV(sc->bge_chipid) == BGE_ASICREV_BCM5785 ||
            BGE_ASICREV(sc->bge_chipid) == BGE_ASICREV_BCM57780)
                sc->bge_flags |= BGE_CPMU_PRESENT;
 
+       DPRINTFN(5, ("pci_intr_map\n"));
+       if (BGE_IS_5717_PLUS(sc) && pci_intr_map_msi(pa, &ih) == 0)
+               sc->bge_flags |= BGE_TAGGED_STATUS;
+       else if (pci_intr_map(pa, &ih)) {
+               printf(": couldn't map interrupt\n");
+               goto fail_1;
+       }
+
+       DPRINTFN(5, ("pci_intr_string\n"));
+       intrstr = pci_intr_string(pc, ih);
+
        /* Try to reset the chip. */
        DPRINTFN(5, ("bge_reset\n"));
        bge_sig_pre_reset(sc, BGE_RESET_START);
        bge_reset(sc);
 
@@ -2936,10 +2941,18 @@ bge_attach(struct device *parent, struct device *self, 
void *aux)
                    sc->bge_flags |= BGE_PHY_FIBER_TBI;
                else
                    sc->bge_flags |= BGE_PHY_FIBER_MII;
        }
 
+       /* Take advantage of single-shot MSI. */
+       if ((BGE_IS_5717_PLUS(sc) && sc->bge_flags & BGE_TAGGED_STATUS)) {
+               reg = CSR_READ_4(sc, BGE_MSI_MODE);
+               reg &= ~BGE_MSIMODE_ONE_SHOT_DISABLE;
+               reg |= BGE_MSIMODE_ENABLE;
+               CSR_WRITE_4(sc, BGE_MSI_MODE, reg);
+       }
+
        /* Hookup IRQ last. */
        DPRINTFN(5, ("pci_intr_establish\n"));
        sc->bge_intrhand = pci_intr_establish(pc, ih, IPL_NET, bge_intr, sc,
            sc->bge_dev.dv_xname);
        if (sc->bge_intrhand == NULL) {
@@ -3505,68 +3518,68 @@ bge_txeof(struct bge_softc *sc)
 int
 bge_intr(void *xsc)
 {
        struct bge_softc *sc;
        struct ifnet *ifp;
-       u_int32_t statusword;
-       u_int32_t intrmask = BGE_PCISTATE_INTR_NOT_ACTIVE;
+       u_int32_t statusword, statustag;
 
        sc = xsc;
        ifp = &sc->arpcom.ac_if;
 
-       if (BGE_IS_5717_PLUS(sc))
-               intrmask = 0;
-
-       /* It is possible for the interrupt to arrive before
-        * the status block is updated prior to the interrupt.
-        * Reading the PCI State register will confirm whether the
-        * interrupt is ours and will flush the status block.
-        */
-
        /* read status word from status block */
        bus_dmamap_sync(sc->bge_dmatag, sc->bge_ring_map,
            offsetof(struct bge_ring_data, bge_status_block),
            sizeof (struct bge_status_block),
            BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
 
        statusword = sc->bge_rdata->bge_status_block.bge_status;
+       statustag = sc->bge_rdata->bge_status_block.bge_status_tag << 24;
 
-       if ((statusword & BGE_STATFLAG_UPDATED) ||
-           (~CSR_READ_4(sc, BGE_PCI_PCISTATE) & intrmask)) {
-
+       if (sc->bge_flags & BGE_TAGGED_STATUS) {
+               if (sc->bge_lasttag == statustag &&
+                   (CSR_READ_4(sc, BGE_PCI_PCISTATE) &
+                    BGE_PCISTATE_INTR_NOT_ACTIVE))
+                       return (0);
+               sc->bge_lasttag = statustag;
+       } else {
+               if (!(statusword & BGE_STATFLAG_UPDATED) &&
+                   (CSR_READ_4(sc, BGE_PCI_PCISTATE) &
+                    BGE_PCISTATE_INTR_NOT_ACTIVE))
+                       return (0);
                /* Ack interrupt and stop others from occurring. */
                bge_writembx(sc, BGE_MBX_IRQ0_LO, 1);
-                       
-               /* clear status word */
-               sc->bge_rdata->bge_status_block.bge_status = 0;
+               statustag = 0;
+       }
 
-               bus_dmamap_sync(sc->bge_dmatag, sc->bge_ring_map,
-                   offsetof(struct bge_ring_data, bge_status_block),
-                   sizeof (struct bge_status_block),
-                   BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
+       /* clear status word */
+       sc->bge_rdata->bge_status_block.bge_status = 0;
 
-               if (BGE_ASICREV(sc->bge_chipid) == BGE_ASICREV_BCM5700 ||
-                   statusword & BGE_STATFLAG_LINKSTATE_CHANGED ||
-                   BGE_STS_BIT(sc, BGE_STS_LINK_EVT))
-                       bge_link_upd(sc);
+       bus_dmamap_sync(sc->bge_dmatag, sc->bge_ring_map,
+           offsetof(struct bge_ring_data, bge_status_block),
+           sizeof (struct bge_status_block),
+           BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 
-               if (ifp->if_flags & IFF_RUNNING) {
-                       /* Check RX return ring producer/consumer */
-                       bge_rxeof(sc);
+       if (BGE_ASICREV(sc->bge_chipid) == BGE_ASICREV_BCM5700 ||
+           statusword & BGE_STATFLAG_LINKSTATE_CHANGED ||
+           BGE_STS_BIT(sc, BGE_STS_LINK_EVT))
+               bge_link_upd(sc);
 
-                       /* Check TX ring producer/consumer */
-                       bge_txeof(sc);
-               }
+       /* Re-enable interrupts. */
+       bge_writembx(sc, BGE_MBX_IRQ0_LO, statustag);
 
-               /* Re-enable interrupts. */
-               bge_writembx(sc, BGE_MBX_IRQ0_LO, 0);
+       if (ifp->if_flags & IFF_RUNNING) {
+               /* Check RX return ring producer/consumer */
+               bge_rxeof(sc);
 
-               bge_start(ifp);
+               /* Check TX ring producer/consumer */
+               bge_txeof(sc);
 
-               return (1);
-       } else
-               return (0);
+               if (!IFQ_IS_EMPTY(&ifp->if_snd))
+                       bge_start(ifp);
+       }
+
+       return (1);
 }
 
 void
 bge_tick(void *xsc)
 {
diff --git sys/dev/pci/if_bgereg.h sys/dev/pci/if_bgereg.h
index 3685f14..72eba41 100644
--- sys/dev/pci/if_bgereg.h
+++ sys/dev/pci/if_bgereg.h
@@ -223,10 +223,11 @@
 #define        BGE_PCIMISCCTL_ENDIAN_WORDSWAP  0x00000008
 #define        BGE_PCIMISCCTL_PCISTATE_RW      0x00000010
 #define        BGE_PCIMISCCTL_CLOCKCTL_RW      0x00000020
 #define        BGE_PCIMISCCTL_REG_WORDSWAP     0x00000040
 #define        BGE_PCIMISCCTL_INDIRECT_ACCESS  0x00000080
+#define        BGE_PCIMISCCTL_TAGGED_STATUS    0x00000200
 #define        BGE_PCIMISCCTL_ASICREV          0xFFFF0000
 #define        BGE_PCIMISCCTL_ASICREV_SHIFT    16
 
 #if BYTE_ORDER == LITTLE_ENDIAN
 #define        BGE_DMA_SWAP_OPTIONS \
@@ -1917,12 +1918,11 @@
 #define        BGE_MSIMODE_RESET               0x00000001
 #define        BGE_MSIMODE_ENABLE              0x00000002
 #define        BGE_MSIMODE_PCI_TGT_ABRT_ATTN   0x00000004
 #define        BGE_MSIMODE_PCI_MSTR_ABRT_ATTN  0x00000008
 #define        BGE_MSIMODE_PCI_PERR_ATTN       0x00000010
-#define        BGE_MSIMODE_MSI_FIFOUFLOW_ATTN  0x00000020
-#define        BGE_MSIMODE_MSI_FIFOOFLOW_ATTN  0x00000040
+#define        BGE_MSIMODE_ONE_SHOT_DISABLE    0x00000020
 
 /* MSI status register */
 #define        BGE_MSISTAT_PCI_TGT_ABRT_ATTN   0x00000004
 #define        BGE_MSISTAT_PCI_MSTR_ABRT_ATTN  0x00000008
 #define        BGE_MSISTAT_PCI_PERR_ATTN       0x00000010
@@ -2399,11 +2399,11 @@ struct bge_sts_idx {
 #endif
 };
 
 struct bge_status_block {
        u_int32_t               bge_status;
-       u_int32_t               bge_rsvd0;
+       u_int32_t               bge_status_tag;
 #if BYTE_ORDER == LITTLE_ENDIAN
        u_int16_t               bge_rx_jumbo_cons_idx;
        u_int16_t               bge_rx_std_cons_idx;
        u_int16_t               bge_rx_mini_cons_idx;
        u_int16_t               bge_rsvd1;
@@ -2808,10 +2808,11 @@ struct bge_softc {
        struct mii_data         bge_mii;
        struct ifmedia          bge_ifmedia;    /* media info */
        u_int32_t               bge_expcap;
        u_int32_t               bge_mps;
        u_int32_t               bge_expmrq;
+       u_int32_t               bge_lasttag;
        u_int32_t               bge_flags;
 #define        BGE_TXRING_VALID        0x00000001
 #define        BGE_RXRING_VALID        0x00000002
 #define        BGE_JUMBO_RXRING_VALID  0x00000004
 #define        BGE_RX_ALIGNBUG         0x00000008
@@ -2839,10 +2840,11 @@ struct bge_softc {
 #define        BGE_5700_FAMILY         0x02000000
 #define        BGE_5717_PLUS           0x04000000
 #define        BGE_57765_PLUS          0x08000000
 #define        BGE_APE                 0x10000000
 #define        BGE_CPMU_PRESENT        0x20000000
+#define        BGE_TAGGED_STATUS       0x40000000
 
        bus_dma_tag_t           bge_dmatag;
        u_int32_t               bge_mfw_flags;  /* Management F/W flags */
 #define        BGE_MFW_ON_RXCPU        0x00000001
 #define        BGE_MFW_ON_APE          0x00000002

Reply via email to