Re: mpsafe re(4)
> On 26 Dec 2015, at 5:49 PM, Jonathan Matthew wrote: > > On Sat, Dec 05, 2015 at 06:11:51PM +0100, Jonathan Matthew wrote: >> The main interesting bit here is the txeof and start loops, which previously >> operated based on the prod/cons indices and the contents of the tx queue, >> but now just uses the indices as that's the only way to get a consistent view >> of the tx queue state. >> >> At the moment I don't think the tx ring is big enough to use IFQ_DEQUEUE >> instead of ifq_deq_begin/commit, but maybe I'm wrong about that. > > dlg@ pointed out that the way the driver modifies rl_flags would be unsafe > with > this diff applied, since it sets and clears bits during interrupts and ioctl > handling with no locking. The only change it makes during interrupts is to > flip RL_FLAG_TIMERINTR, which otherwise is only used during re_init and > re_stop, so this updated diff splits it out into a separate field. > > ok? im ok with this diff. tested on RTL8168E/8111E. dlg > > > Index: dev/ic/re.c > === > RCS file: /cvs/src/sys/dev/ic/re.c,v > retrieving revision 1.187 > diff -u -p -r1.187 re.c > --- dev/ic/re.c 25 Nov 2015 03:09:58 - 1.187 > +++ dev/ic/re.c 26 Dec 2015 03:43:35 - > @@ -120,6 +120,7 @@ > #include > #include > #include > +#include > > #include > > @@ -151,7 +152,7 @@ int redebug = 0; > > static inline void re_set_bufaddr(struct rl_desc *, bus_addr_t); > > -int re_encap(struct rl_softc *, struct mbuf *, int *); > +int re_encap(struct rl_softc *, struct mbuf *, struct rl_txq *, int *); > > int re_newbuf(struct rl_softc *); > int re_rx_list_init(struct rl_softc *); > @@ -1448,18 +1449,14 @@ re_txeof(struct rl_softc *sc) > struct ifnet*ifp; > struct rl_txq *txq; > uint32_ttxstat; > - int idx, descidx, tx = 0; > + int idx, descidx, tx_free, freed = 0; > > ifp = &sc->sc_arpcom.ac_if; > > - for (idx = sc->rl_ldata.rl_txq_considx;; idx = RL_NEXT_TXQ(sc, idx)) { > + for (idx = sc->rl_ldata.rl_txq_considx; > + idx != sc->rl_ldata.rl_txq_prodidx; idx = RL_NEXT_TXQ(sc, idx)) { > txq = &sc->rl_ldata.rl_txq[idx]; > > - if (txq->txq_mbuf == NULL) { > - KASSERT(idx == sc->rl_ldata.rl_txq_prodidx); > - break; > - } > - > descidx = txq->txq_descidx; > RL_TXDESCSYNC(sc, descidx, > BUS_DMASYNC_POSTREAD|BUS_DMASYNC_POSTWRITE); > @@ -1470,9 +1467,7 @@ re_txeof(struct rl_softc *sc) > if (txstat & RL_TDESC_CMD_OWN) > break; > > - tx = 1; > - sc->rl_ldata.rl_tx_free += txq->txq_nsegs; > - KASSERT(sc->rl_ldata.rl_tx_free <= sc->rl_ldata.rl_tx_desc_cnt); > + freed += txq->txq_nsegs; > bus_dmamap_sync(sc->sc_dmat, txq->txq_dmamap, > 0, txq->txq_dmamap->dm_mapsize, BUS_DMASYNC_POSTWRITE); > bus_dmamap_unload(sc->sc_dmat, txq->txq_dmamap); > @@ -1487,9 +1482,13 @@ re_txeof(struct rl_softc *sc) > ifp->if_opackets++; > } > > - sc->rl_ldata.rl_txq_considx = idx; > + if (freed == 0) > + return (0); > > - ifq_clr_oactive(&ifp->if_snd); > + tx_free = atomic_add_int_nv(&sc->rl_ldata.rl_tx_free, freed); > + KASSERT(tx_free <= sc->rl_ldata.rl_tx_desc_cnt); > + > + sc->rl_ldata.rl_txq_considx = idx; > > /* >* Some chips will ignore a second TX request issued while an > @@ -1498,12 +1497,14 @@ re_txeof(struct rl_softc *sc) >* to restart the channel here to flush them out. This only >* seems to be required with the PCIe devices. >*/ > - if (sc->rl_ldata.rl_tx_free < sc->rl_ldata.rl_tx_desc_cnt) > + if (tx_free < sc->rl_ldata.rl_tx_desc_cnt) > CSR_WRITE_1(sc, sc->rl_txstart, RL_TXSTART_START); > - else > + else { > + ifq_clr_oactive(&ifp->if_snd); > ifp->if_timer = 0; > + } > > - return (tx); > + return (1); > } > > void > @@ -1566,13 +1567,15 @@ re_intr(void *arg) > } > > if (status & RL_ISR_SYSTEM_ERR) { > + KERNEL_LOCK(); > re_init(ifp); > + KERNEL_UNLOCK(); > claimed = 1; > } > } > > if (sc->rl_imtype == RL_IMTYPE_SIM) { > - if ((sc->rl_flags & RL_FLAG_TIMERINTR)) { > + if (sc->rl_timerintr) { > if ((tx | rx) == 0) { > /* >* Nothing needs to be processed, fallback > @@ -1599,7 +1602,11 @@ re_intr(void *arg) > } > } > > - re_start(ifp); > + if (!IFQ_IS_EMPTY(&ifp->if_snd)) { > + KERNEL_LOCK(); > + re_start(ifp); > +
Re: mpsafe re(4)
On Sat, Dec 05, 2015 at 06:11:51PM +0100, Jonathan Matthew wrote: > The main interesting bit here is the txeof and start loops, which previously > operated based on the prod/cons indices and the contents of the tx queue, > but now just uses the indices as that's the only way to get a consistent view > of the tx queue state. > > At the moment I don't think the tx ring is big enough to use IFQ_DEQUEUE > instead of ifq_deq_begin/commit, but maybe I'm wrong about that. dlg@ pointed out that the way the driver modifies rl_flags would be unsafe with this diff applied, since it sets and clears bits during interrupts and ioctl handling with no locking. The only change it makes during interrupts is to flip RL_FLAG_TIMERINTR, which otherwise is only used during re_init and re_stop, so this updated diff splits it out into a separate field. ok? Index: dev/ic/re.c === RCS file: /cvs/src/sys/dev/ic/re.c,v retrieving revision 1.187 diff -u -p -r1.187 re.c --- dev/ic/re.c 25 Nov 2015 03:09:58 - 1.187 +++ dev/ic/re.c 26 Dec 2015 03:43:35 - @@ -120,6 +120,7 @@ #include #include #include +#include #include @@ -151,7 +152,7 @@ int redebug = 0; static inline void re_set_bufaddr(struct rl_desc *, bus_addr_t); -intre_encap(struct rl_softc *, struct mbuf *, int *); +intre_encap(struct rl_softc *, struct mbuf *, struct rl_txq *, int *); intre_newbuf(struct rl_softc *); intre_rx_list_init(struct rl_softc *); @@ -1448,18 +1449,14 @@ re_txeof(struct rl_softc *sc) struct ifnet*ifp; struct rl_txq *txq; uint32_ttxstat; - int idx, descidx, tx = 0; + int idx, descidx, tx_free, freed = 0; ifp = &sc->sc_arpcom.ac_if; - for (idx = sc->rl_ldata.rl_txq_considx;; idx = RL_NEXT_TXQ(sc, idx)) { + for (idx = sc->rl_ldata.rl_txq_considx; + idx != sc->rl_ldata.rl_txq_prodidx; idx = RL_NEXT_TXQ(sc, idx)) { txq = &sc->rl_ldata.rl_txq[idx]; - if (txq->txq_mbuf == NULL) { - KASSERT(idx == sc->rl_ldata.rl_txq_prodidx); - break; - } - descidx = txq->txq_descidx; RL_TXDESCSYNC(sc, descidx, BUS_DMASYNC_POSTREAD|BUS_DMASYNC_POSTWRITE); @@ -1470,9 +1467,7 @@ re_txeof(struct rl_softc *sc) if (txstat & RL_TDESC_CMD_OWN) break; - tx = 1; - sc->rl_ldata.rl_tx_free += txq->txq_nsegs; - KASSERT(sc->rl_ldata.rl_tx_free <= sc->rl_ldata.rl_tx_desc_cnt); + freed += txq->txq_nsegs; bus_dmamap_sync(sc->sc_dmat, txq->txq_dmamap, 0, txq->txq_dmamap->dm_mapsize, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(sc->sc_dmat, txq->txq_dmamap); @@ -1487,9 +1482,13 @@ re_txeof(struct rl_softc *sc) ifp->if_opackets++; } - sc->rl_ldata.rl_txq_considx = idx; + if (freed == 0) + return (0); - ifq_clr_oactive(&ifp->if_snd); + tx_free = atomic_add_int_nv(&sc->rl_ldata.rl_tx_free, freed); + KASSERT(tx_free <= sc->rl_ldata.rl_tx_desc_cnt); + + sc->rl_ldata.rl_txq_considx = idx; /* * Some chips will ignore a second TX request issued while an @@ -1498,12 +1497,14 @@ re_txeof(struct rl_softc *sc) * to restart the channel here to flush them out. This only * seems to be required with the PCIe devices. */ - if (sc->rl_ldata.rl_tx_free < sc->rl_ldata.rl_tx_desc_cnt) + if (tx_free < sc->rl_ldata.rl_tx_desc_cnt) CSR_WRITE_1(sc, sc->rl_txstart, RL_TXSTART_START); - else + else { + ifq_clr_oactive(&ifp->if_snd); ifp->if_timer = 0; + } - return (tx); + return (1); } void @@ -1566,13 +1567,15 @@ re_intr(void *arg) } if (status & RL_ISR_SYSTEM_ERR) { + KERNEL_LOCK(); re_init(ifp); + KERNEL_UNLOCK(); claimed = 1; } } if (sc->rl_imtype == RL_IMTYPE_SIM) { - if ((sc->rl_flags & RL_FLAG_TIMERINTR)) { + if (sc->rl_timerintr) { if ((tx | rx) == 0) { /* * Nothing needs to be processed, fallback @@ -1599,7 +1602,11 @@ re_intr(void *arg) } } - re_start(ifp); + if (!IFQ_IS_EMPTY(&ifp->if_snd)) { + KERNEL_LOCK(); + re_start(ifp); + KERNEL_UNLOCK(); + } CSR_WRITE_2(sc, RL_IMR, sc->rl_intrs); @@ -1607,7 +1614,7 @@ re_intr(void *arg) } int -re_encap(struct rl_softc *sc, struct mbuf *m, int *idx) +re_encap(struct rl_s
Re: mpsafe re(4)
Dimitris Papastamos [s...@2f30.org] wrote: > On Sat, Dec 05, 2015 at 06:11:51PM +0100, Jonathan Matthew wrote: > > The main interesting bit here is the txeof and start loops, which previously > > operated based on the prod/cons indices and the contents of the tx queue, > > but now just uses the indices as that's the only way to get a consistent > > view > > of the tx queue state. > > > > At the moment I don't think the tx ring is big enough to use IFQ_DEQUEUE > > instead of ifq_deq_begin/commit, but maybe I'm wrong about that. > > > > can someone try this on an APU1? > > I've tested this on my router and it seems to work okay. I've also used > tcpbench with various combinations. > > re0 at pci2 dev 0 function 0 "Realtek 8168" rev 0x0c: RTL8168G/8111G > (0x4c00), msi, address 80:ee:73:9f:1d:3e > re1 at pci3 dev 0 function 0 "Realtek 8168" rev 0x0c: RTL8168G/8111G > (0x4c00), msi, address 80:ee:73:9f:1d:3d Testing fine for me on APU1 so far.
Re: mpsafe re(4)
On Sat, Dec 05, 2015 at 06:11:51PM +0100, Jonathan Matthew wrote: > The main interesting bit here is the txeof and start loops, which previously > operated based on the prod/cons indices and the contents of the tx queue, > but now just uses the indices as that's the only way to get a consistent view > of the tx queue state. > > At the moment I don't think the tx ring is big enough to use IFQ_DEQUEUE > instead of ifq_deq_begin/commit, but maybe I'm wrong about that. > > can someone try this on an APU1? I've tested this on my router and it seems to work okay. I've also used tcpbench with various combinations. re0 at pci2 dev 0 function 0 "Realtek 8168" rev 0x0c: RTL8168G/8111G (0x4c00), msi, address 80:ee:73:9f:1d:3e re1 at pci3 dev 0 function 0 "Realtek 8168" rev 0x0c: RTL8168G/8111G (0x4c00), msi, address 80:ee:73:9f:1d:3d
mpsafe re(4)
The main interesting bit here is the txeof and start loops, which previously operated based on the prod/cons indices and the contents of the tx queue, but now just uses the indices as that's the only way to get a consistent view of the tx queue state. At the moment I don't think the tx ring is big enough to use IFQ_DEQUEUE instead of ifq_deq_begin/commit, but maybe I'm wrong about that. can someone try this on an APU1? Index: ic/re.c === RCS file: /cvs/src/sys/dev/ic/re.c,v retrieving revision 1.187 diff -u -p -r1.187 re.c --- ic/re.c 25 Nov 2015 03:09:58 - 1.187 +++ ic/re.c 4 Dec 2015 15:07:02 - @@ -120,6 +120,7 @@ #include #include #include +#include #include @@ -151,7 +152,7 @@ int redebug = 0; static inline void re_set_bufaddr(struct rl_desc *, bus_addr_t); -intre_encap(struct rl_softc *, struct mbuf *, int *); +intre_encap(struct rl_softc *, struct mbuf *, struct rl_txq *, int *); intre_newbuf(struct rl_softc *); intre_rx_list_init(struct rl_softc *); @@ -1448,18 +1449,14 @@ re_txeof(struct rl_softc *sc) struct ifnet*ifp; struct rl_txq *txq; uint32_ttxstat; - int idx, descidx, tx = 0; + int idx, descidx, tx_free, freed = 0; ifp = &sc->sc_arpcom.ac_if; - for (idx = sc->rl_ldata.rl_txq_considx;; idx = RL_NEXT_TXQ(sc, idx)) { + for (idx = sc->rl_ldata.rl_txq_considx; + idx != sc->rl_ldata.rl_txq_prodidx; idx = RL_NEXT_TXQ(sc, idx)) { txq = &sc->rl_ldata.rl_txq[idx]; - if (txq->txq_mbuf == NULL) { - KASSERT(idx == sc->rl_ldata.rl_txq_prodidx); - break; - } - descidx = txq->txq_descidx; RL_TXDESCSYNC(sc, descidx, BUS_DMASYNC_POSTREAD|BUS_DMASYNC_POSTWRITE); @@ -1470,9 +1467,7 @@ re_txeof(struct rl_softc *sc) if (txstat & RL_TDESC_CMD_OWN) break; - tx = 1; - sc->rl_ldata.rl_tx_free += txq->txq_nsegs; - KASSERT(sc->rl_ldata.rl_tx_free <= sc->rl_ldata.rl_tx_desc_cnt); + freed += txq->txq_nsegs; bus_dmamap_sync(sc->sc_dmat, txq->txq_dmamap, 0, txq->txq_dmamap->dm_mapsize, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(sc->sc_dmat, txq->txq_dmamap); @@ -1487,9 +1482,13 @@ re_txeof(struct rl_softc *sc) ifp->if_opackets++; } - sc->rl_ldata.rl_txq_considx = idx; + if (freed == 0) + return (0); - ifq_clr_oactive(&ifp->if_snd); + tx_free = atomic_add_int_nv(&sc->rl_ldata.rl_tx_free, freed); + KASSERT(tx_free <= sc->rl_ldata.rl_tx_desc_cnt); + + sc->rl_ldata.rl_txq_considx = idx; /* * Some chips will ignore a second TX request issued while an @@ -1498,12 +1497,14 @@ re_txeof(struct rl_softc *sc) * to restart the channel here to flush them out. This only * seems to be required with the PCIe devices. */ - if (sc->rl_ldata.rl_tx_free < sc->rl_ldata.rl_tx_desc_cnt) + if (tx_free < sc->rl_ldata.rl_tx_desc_cnt) CSR_WRITE_1(sc, sc->rl_txstart, RL_TXSTART_START); - else + else { + ifq_clr_oactive(&ifp->if_snd); ifp->if_timer = 0; + } - return (tx); + return (1); } void @@ -1566,7 +1567,9 @@ re_intr(void *arg) } if (status & RL_ISR_SYSTEM_ERR) { + KERNEL_LOCK(); re_init(ifp); + KERNEL_UNLOCK(); claimed = 1; } } @@ -1599,7 +1602,11 @@ re_intr(void *arg) } } - re_start(ifp); + if (!IFQ_IS_EMPTY(&ifp->if_snd)) { + KERNEL_LOCK(); + re_start(ifp); + KERNEL_UNLOCK(); + } CSR_WRITE_2(sc, RL_IMR, sc->rl_intrs); @@ -1607,7 +1614,7 @@ re_intr(void *arg) } int -re_encap(struct rl_softc *sc, struct mbuf *m, int *idx) +re_encap(struct rl_softc *sc, struct mbuf *m, struct rl_txq *txq, int *used) { bus_dmamap_tmap; struct mbuf *mp, mh; @@ -1616,7 +1623,6 @@ re_encap(struct rl_softc *sc, struct mbu struct ip *ip; struct rl_desc *d; u_int32_t cmdstat, vlanctl = 0, csum_flags = 0; - struct rl_txq *txq; /* * Set up checksum offload. Note: checksum offload bits must @@ -1669,7 +1675,6 @@ re_encap(struct rl_softc *sc, struct mbu } } - txq = &sc->rl_ldata.rl_txq[*idx]; map = txq->txq_dmamap; error = bus_dmamap_load_mbuf(sc->sc_dmat, map, m, @@ -1710,7 +1715,7 @@ re_encap(struct rl_softc *sc, struct mbu nsegs++;