>Synopsis:      ral(4) leaks mbufs instead of setting oactive 
>Category:      kernel
>Environment:
        System      : OpenBSD 5.9
        Details     : OpenBSD 5.9-beta (GENERIC) #1538: Sun Jan 31 20:01:47 MST 
2016
                         
dera...@i386.openbsd.org:/usr/src/sys/arch/i386/compile/GENERIC

>Description:
An error in ral(4)'s mbuf defrag logic drops those mbufs that would 
otherwise fully fill the tx-ring: the oactive flag is never raised and 
many unnecessary oerrors occur. See patch comments below for details.

Demonstrated with an RT2860, but dev/ic/rt2860.c also drives the newer 
chipsets and this problem should affect them, too. I haven't checked the 
drivers for the older chipsets.

>How-To-Repeat: 
netcat /dev/zero over a ral(4) interface, or otherwise saturate the link. 
Observe 'netstat -i' oerror counters and/or SACK episodes.

>Fix:
Patch below tested for two days on two routers that provide an AP, one an 
i386 and the other socppc. Patch also stress-tested with netcat, etc: this 
saw oerrors drop from 1 per 224 frames to 0 over the test run, SACK 
episodes much reduced, etc. No bandwidth regressions observed.

Note that the fix exposed three pre-existing problems with ring-full 
handling; two of these are fixed by the first two patches below. The third 
problem is probably only theoretical, was unobserved in testing and looks 
tricky to remove, see XXX comment.

Lastly, the patch adds some KASSERTs so it may pay to test it over the 
next release cycle. I could replace these with workaround code if desired.

best, 
Richard. 

PS. A style question: is there a preference between 'if (foo() != 0)' and  
'if (foo())' when checking error returns?

ral0 at pci0 dev 12 function 0 "Ralink RT2860" rev 0x00: irq 9, address 
00:0e:8e:xx:xx:xx 
ral0: MAC/BBP RT2860 (rev 0x0103), RF RT2850 (MIMO 2T3R) 

ral0 at pci0 dev 15 function 0 "Ralink RT2860" rev 0x00: ivec 19, address 
00:0e:8e:xx:xx:xx
ral0: MAC/BBP RT2860 (rev 0x0103), RF RT2850 (MIMO 2T3R)

------------------------------
 * fix dropped frames, watchdog timeouts

- ring->queued is shared with the interrupt handler; update it atomically. 

Once exposed, this problem manifested in testing. 

Index: sys/dev/ic/rt2860.c
===================================================================
--- sys.orig/dev/ic/rt2860.c
+++ sys/dev/ic/rt2860.c
@@ -35,6 +35,7 @@
 #include <sys/conf.h>
 #include <sys/device.h>
 #include <sys/endian.h>
+#include <sys/atomic.h>
 
 #include <machine/bus.h>
 #include <machine/intr.h>
@@ -1166,7 +1167,7 @@ rt2860_tx_intr(struct rt2860_softc *sc,
 
                        ifp->if_opackets++;
                }
-               ring->queued--;
+               atomic_dec_int(&ring->queued);
                ring->next = (ring->next + 1) % RT2860_TX_RING_COUNT;
        }
 
@@ -1713,7 +1714,7 @@ rt2860_tx(struct rt2860_softc *sc, struc
            qid, txwi->wcid, data->map->dm_nsegs, ridx));
 
        ring->cur = (ring->cur + 1) % RT2860_TX_RING_COUNT;
-       ring->queued += ntxds;
+       atomic_add_int(&ring->queued, ntxds);
        if (ring->queued >= RT2860_TX_RING_COUNT)
                sc->qfullmsk |= 1 << qid;
 
Index: sys/dev/ic/rt2860var.h
===================================================================
--- sys.orig/dev/ic/rt2860var.h
+++ sys/dev/ic/rt2860var.h
@@ -78,7 +78,7 @@ struct rt2860_tx_ring {
        struct rt2860_tx_data   *data[RT2860_TX_RING_COUNT];
        int                     cur;
        int                     next;
-       int                     queued;
+       unsigned int            queued;
 };
 
 struct rt2860_rx_data {
------------------------------

* fix dropped frames, watchdog timeouts. 

- on full tx ring, ring->cur wraps to an active tx descriptor. Passing 
that wrapped value to the card was observed to cause general flakiness.

Fix prevents the wrap at the cost of reducing usable tx descriptors by 
one.

Note: this issue is independent of the ring->queued race; once exposed, 
neither this nor that patch alone suffices to prevent dropped frames.

Index: sys/dev/ic/rt2860.c
===================================================================
--- sys.orig/dev/ic/rt2860.c
+++ sys/dev/ic/rt2860.c
@@ -1172,7 +1172,7 @@ rt2860_tx_intr(struct rt2860_softc *sc,
        }
 
        sc->sc_tx_timer = 0;
-       if (ring->queued < RT2860_TX_RING_COUNT)
+       if (ring->queued < RT2860_TX_RING_MAX)
                sc->qfullmsk &= ~(1 << qid);
        ifq_clr_oactive(&ifp->if_snd);
        rt2860_start(ifp);
@@ -1619,7 +1619,7 @@ rt2860_tx(struct rt2860_softc *sc, struc
                /* determine how many TXDs are required */
                ntxds = 1 + (data->map->dm_nsegs / 2);
 
-               if (ring->queued + ntxds >= RT2860_TX_RING_COUNT) {
+               if (ring->queued + ntxds >= RT2860_TX_RING_MAX) {
                        /* not enough free TXDs, force mbuf defrag */
                        bus_dmamap_unload(sc->sc_dmat, data->map);
                        error = EFBIG;
@@ -1657,7 +1657,7 @@ rt2860_tx(struct rt2860_softc *sc, struc
                /* determine how many TXDs are now required */
                ntxds = 1 + (data->map->dm_nsegs / 2);
 
-               if (ring->queued + ntxds >= RT2860_TX_RING_COUNT) {
+               if (ring->queued + ntxds >= RT2860_TX_RING_MAX) {
                        /* this is a hopeless case, drop the mbuf! */
                        bus_dmamap_unload(sc->sc_dmat, data->map);
                        m_freem(m);
@@ -1715,7 +1715,7 @@ rt2860_tx(struct rt2860_softc *sc, struc
 
        ring->cur = (ring->cur + 1) % RT2860_TX_RING_COUNT;
        atomic_add_int(&ring->queued, ntxds);
-       if (ring->queued >= RT2860_TX_RING_COUNT)
+       if (ring->queued >= RT2860_TX_RING_MAX)
                sc->qfullmsk |= 1 << qid;
 
        /* kick Tx */
Index: sys/dev/ic/rt2860var.h
===================================================================
--- sys.orig/dev/ic/rt2860var.h
+++ sys/dev/ic/rt2860var.h
@@ -17,8 +17,9 @@
  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  */
 
-#define RT2860_TX_RING_COUNT   64
 #define RT2860_RX_RING_COUNT   128
+#define RT2860_TX_RING_COUNT   64
+#define RT2860_TX_RING_MAX     (RT2860_TX_RING_COUNT - 1)
 #define RT2860_TX_POOL_COUNT   (RT2860_TX_RING_COUNT * 2)
 
 #define RT2860_MAX_SCATTER     ((RT2860_TX_RING_COUNT * 2) - 1)
------------------------------

* replace custom defrag with m_defrag() 

- This fixes an error in the existing code: the "hopeless case" guard
equivales 'ring now full', so oactive is never set: the code drops any mbuf
that would fill the ring. This occurs often in practice.

- Allowing the ring to fill exposes the problems fixed by the preceding 
patches. See XXX comment for a theoretical issue also exposed.

- The new code avoids some hoop-jumping. Currently, one tx dma-map can 
fill the tx ring. Therefore an mbuf that fits a dma-map may yet not fit 
into the tx ring's remaining space.  To ensure it can, we must in general 
count the mbuf's fragments and, if necessary, defrag it and reload the 
dmamap.

The new code limits the dmamap to cover at most 8 tx descriptors (= 15 
mbuf fragments): if an mbuf fits a dma-map it will fit any ring with at 
least 8 free descriptors. So we're no longer obliged to count fragments 
and jump hoops. The cost is under-utilised tx ring descriptors, at most 7 
of 63, when a one-fragment mbuf occupies one descriptor in the last block 
of 8.

- For simplicity on error return, shift responsibilty for calling
m_freem() to rt2860_tx()'s sole caller (which already calls
ieee80211_release_node()).

Index: sys/dev/ic/rt2860.c
===================================================================
--- sys.orig/dev/ic/rt2860.c
+++ sys/dev/ic/rt2860.c
@@ -1172,7 +1172,7 @@ rt2860_tx_intr(struct rt2860_softc *sc,
        }
 
        sc->sc_tx_timer = 0;
-       if (ring->queued < RT2860_TX_RING_MAX)
+       if (ring->queued < RT2860_TX_RING_FULL)
                sc->qfullmsk &= ~(1 << qid);
        ifq_clr_oactive(&ifp->if_snd);
        rt2860_start(ifp);
@@ -1482,12 +1482,11 @@ rt2860_tx(struct rt2860_softc *sc, struc
        struct rt2860_txd *txd;
        struct rt2860_txwi *txwi;
        struct ieee80211_frame *wh;
-       struct mbuf *m1;
        bus_dma_segment_t *seg;
        u_int hdrlen;
        uint16_t qos, dur;
        uint8_t type, qsel, mcs, pid, tid, qid;
-       int nsegs, ntxds, hasqos, ridx, ctl_ridx, error;
+       int nsegs, hasqos, ridx, ctl_ridx;
 
        /* the data pool contains at least one element, pick the first */
        data = SLIST_FIRST(&sc->data_pool);
@@ -1607,62 +1606,14 @@ rt2860_tx(struct rt2860_softc *sc, struc
        memcpy(txwi + 1, wh, hdrlen);
        m_adj(m, hdrlen);
 
-       error = bus_dmamap_load_mbuf(sc->sc_dmat, data->map, m,
-           BUS_DMA_NOWAIT);
-       if (__predict_false(error != 0 && error != EFBIG)) {
-               printf("%s: can't map mbuf (error %d)\n",
-                   sc->sc_dev.dv_xname, error);
-               m_freem(m);
-               return error;
-       }
-       if (__predict_true(error == 0)) {
-               /* determine how many TXDs are required */
-               ntxds = 1 + (data->map->dm_nsegs / 2);
-
-               if (ring->queued + ntxds >= RT2860_TX_RING_MAX) {
-                       /* not enough free TXDs, force mbuf defrag */
-                       bus_dmamap_unload(sc->sc_dmat, data->map);
-                       error = EFBIG;
-               }
-       }
-       if (__predict_false(error != 0)) {
-               /* too many fragments, linearize */
-               MGETHDR(m1, M_DONTWAIT, MT_DATA);
-               if (m1 == NULL) {
-                       m_freem(m);
-                       return ENOBUFS;
-               }
-               if (m->m_pkthdr.len > MHLEN) {
-                       MCLGET(m1, M_DONTWAIT);
-                       if (!(m1->m_flags & M_EXT)) {
-                               m_freem(m);
-                               m_freem(m1);
-                               return ENOBUFS;
-                       }
-               }
-               m_copydata(m, 0, m->m_pkthdr.len, mtod(m1, caddr_t));
-               m1->m_pkthdr.len = m1->m_len = m->m_pkthdr.len;
-               m_freem(m);
-               m = m1;
-
-               error = bus_dmamap_load_mbuf(sc->sc_dmat, data->map, m,
-                   BUS_DMA_NOWAIT);
-               if (__predict_false(error != 0)) {
-                       printf("%s: can't map mbuf (error %d)\n",
-                           sc->sc_dev.dv_xname, error);
-                       m_freem(m);
-                       return error;
-               }
+       KASSERT (ring->queued < RT2860_TX_RING_FULL);
 
-               /* determine how many TXDs are now required */
-               ntxds = 1 + (data->map->dm_nsegs / 2);
-
-               if (ring->queued + ntxds >= RT2860_TX_RING_MAX) {
-                       /* this is a hopeless case, drop the mbuf! */
-                       bus_dmamap_unload(sc->sc_dmat, data->map);
-                       m_freem(m);
-                       return ENOBUFS;
-               }
+       if (bus_dmamap_load_mbuf(sc->sc_dmat, data->map, m, BUS_DMA_NOWAIT)) {
+               if (m_defrag(m, M_DONTWAIT))
+                       return (ENOBUFS);
+               if (bus_dmamap_load_mbuf(sc->sc_dmat,
+                   data->map, m, BUS_DMA_NOWAIT))
+                       return (EFBIG);
        }
 
        qsel = (qid < EDCA_NUM_AC) ? RT2860_TX_QSEL_EDCA : RT2860_TX_QSEL_MGMT;
@@ -1714,9 +1665,18 @@ rt2860_tx(struct rt2860_softc *sc, struc
            qid, txwi->wcid, data->map->dm_nsegs, ridx));
 
        ring->cur = (ring->cur + 1) % RT2860_TX_RING_COUNT;
-       atomic_add_int(&ring->queued, ntxds);
-       if (ring->queued >= RT2860_TX_RING_MAX)
+       atomic_add_int(&ring->queued, 1 + (data->map->dm_nsegs / 2));
+       if (ring->queued >= RT2860_TX_RING_FULL) {
+               /* XXX card may interrupt here and invalidate this guard; the
+                * next line would then cause a spurious oactive and tx would
+                * go out to lunch. Interrupt handler mitigates this by
+                * clearing oactive on every frame; the card would have to
+                * transmit a full tx ring between these two lines.
+                */
                sc->qfullmsk |= 1 << qid;
+       }
+
+       KASSERT (ring->queued <= RT2860_TX_RING_MAX);
 
        /* kick Tx */
        RAL_WRITE(sc, RT2860_TX_CTX_IDX(qid), ring->cur);
@@ -1772,6 +1732,7 @@ sendit:
                        bpf_mtap(ic->ic_rawbpf, m, BPF_DIRECTION_OUT);
 #endif
                if (rt2860_tx(sc, m, ni) != 0) {
+                       m_freem(m);
                        ieee80211_release_node(ic, ni);
                        ifp->if_oerrors++;
                        continue;
Index: sys/dev/ic/rt2860var.h
===================================================================
--- sys.orig/dev/ic/rt2860var.h
+++ sys/dev/ic/rt2860var.h
@@ -17,13 +17,15 @@
  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  */
 
+#define RT2860_MAX_SCATTER     15
+#define RT2860_MAX_SCATTER_TXD (1 + (RT2860_MAX_SCATTER / 2))
+
 #define RT2860_RX_RING_COUNT   128
 #define RT2860_TX_RING_COUNT   64
 #define RT2860_TX_RING_MAX     (RT2860_TX_RING_COUNT - 1)
+#define RT2860_TX_RING_FULL    (RT2860_TX_RING_MAX - RT2860_MAX_SCATTER_TXD)
 #define RT2860_TX_POOL_COUNT   (RT2860_TX_RING_COUNT * 2)
 
-#define RT2860_MAX_SCATTER     ((RT2860_TX_RING_COUNT * 2) - 1)
-
 /* HW supports up to 255 STAs */
 #define RT2860_WCID_MAX                254
 #define RT2860_AID2WCID(aid)   ((aid) & 0xff)

Reply via email to