Re: mpsafe re(4)

2015-12-27 Thread David Gwynne

> On 26 Dec 2015, at 5:49 PM, Jonathan Matthew  wrote:
> 
> On Sat, Dec 05, 2015 at 06:11:51PM +0100, Jonathan Matthew wrote:
>> The main interesting bit here is the txeof and start loops, which previously
>> operated based on the prod/cons indices and the contents of the tx queue,
>> but now just uses the indices as that's the only way to get a consistent view
>> of the tx queue state.
>> 
>> At the moment I don't think the tx ring is big enough to use IFQ_DEQUEUE
>> instead of ifq_deq_begin/commit, but maybe I'm wrong about that.
> 
> dlg@ pointed out that the way the driver modifies rl_flags would be unsafe 
> with
> this diff applied, since it sets and clears bits during interrupts and ioctl
> handling with no locking.  The only change it makes during interrupts is to
> flip RL_FLAG_TIMERINTR, which otherwise is only used during re_init and
> re_stop, so this updated diff splits it out into a separate field.
> 
> ok?

im ok with this diff. tested on RTL8168E/8111E.

dlg

> 
> 
> Index: dev/ic/re.c
> ===
> RCS file: /cvs/src/sys/dev/ic/re.c,v
> retrieving revision 1.187
> diff -u -p -r1.187 re.c
> --- dev/ic/re.c   25 Nov 2015 03:09:58 -  1.187
> +++ dev/ic/re.c   26 Dec 2015 03:43:35 -
> @@ -120,6 +120,7 @@
> #include 
> #include 
> #include 
> +#include 
> 
> #include 
> 
> @@ -151,7 +152,7 @@ int redebug = 0;
> 
> static inline void re_set_bufaddr(struct rl_desc *, bus_addr_t);
> 
> -int  re_encap(struct rl_softc *, struct mbuf *, int *);
> +int  re_encap(struct rl_softc *, struct mbuf *, struct rl_txq *, int *);
> 
> int   re_newbuf(struct rl_softc *);
> int   re_rx_list_init(struct rl_softc *);
> @@ -1448,18 +1449,14 @@ re_txeof(struct rl_softc *sc)
>   struct ifnet*ifp;
>   struct rl_txq   *txq;
>   uint32_ttxstat;
> - int idx, descidx, tx = 0;
> + int idx, descidx, tx_free, freed = 0;
> 
>   ifp = &sc->sc_arpcom.ac_if;
> 
> - for (idx = sc->rl_ldata.rl_txq_considx;; idx = RL_NEXT_TXQ(sc, idx)) {
> + for (idx = sc->rl_ldata.rl_txq_considx;
> + idx != sc->rl_ldata.rl_txq_prodidx; idx = RL_NEXT_TXQ(sc, idx)) {
>   txq = &sc->rl_ldata.rl_txq[idx];
> 
> - if (txq->txq_mbuf == NULL) {
> - KASSERT(idx == sc->rl_ldata.rl_txq_prodidx);
> - break;
> - }
> -
>   descidx = txq->txq_descidx;
>   RL_TXDESCSYNC(sc, descidx,
>   BUS_DMASYNC_POSTREAD|BUS_DMASYNC_POSTWRITE);
> @@ -1470,9 +1467,7 @@ re_txeof(struct rl_softc *sc)
>   if (txstat & RL_TDESC_CMD_OWN)
>   break;
> 
> - tx = 1;
> - sc->rl_ldata.rl_tx_free += txq->txq_nsegs;
> - KASSERT(sc->rl_ldata.rl_tx_free <= sc->rl_ldata.rl_tx_desc_cnt);
> + freed += txq->txq_nsegs;
>   bus_dmamap_sync(sc->sc_dmat, txq->txq_dmamap,
>   0, txq->txq_dmamap->dm_mapsize, BUS_DMASYNC_POSTWRITE);
>   bus_dmamap_unload(sc->sc_dmat, txq->txq_dmamap);
> @@ -1487,9 +1482,13 @@ re_txeof(struct rl_softc *sc)
>   ifp->if_opackets++;
>   }
> 
> - sc->rl_ldata.rl_txq_considx = idx;
> + if (freed == 0)
> + return (0);
> 
> - ifq_clr_oactive(&ifp->if_snd);
> + tx_free = atomic_add_int_nv(&sc->rl_ldata.rl_tx_free, freed);
> + KASSERT(tx_free <= sc->rl_ldata.rl_tx_desc_cnt);
> +
> + sc->rl_ldata.rl_txq_considx = idx;
> 
>   /*
>* Some chips will ignore a second TX request issued while an
> @@ -1498,12 +1497,14 @@ re_txeof(struct rl_softc *sc)
>* to restart the channel here to flush them out. This only
>* seems to be required with the PCIe devices.
>*/
> - if (sc->rl_ldata.rl_tx_free < sc->rl_ldata.rl_tx_desc_cnt)
> + if (tx_free < sc->rl_ldata.rl_tx_desc_cnt)
>   CSR_WRITE_1(sc, sc->rl_txstart, RL_TXSTART_START);
> - else
> + else {
> + ifq_clr_oactive(&ifp->if_snd);
>   ifp->if_timer = 0;
> + }
> 
> - return (tx);
> + return (1);
> }
> 
> void
> @@ -1566,13 +1567,15 @@ re_intr(void *arg)
>   }
> 
>   if (status & RL_ISR_SYSTEM_ERR) {
> + KERNEL_LOCK();
>   re_init(ifp);
> + KERNEL_UNLOCK();
>   claimed = 1;
>   }
>   }
> 
>   if (sc->rl_imtype == RL_IMTYPE_SIM) {
> - if ((sc->rl_flags & RL_FLAG_TIMERINTR)) {
> + if (sc->rl_timerintr) {
>   if ((tx | rx) == 0) {
>   /*
>* Nothing needs to be processed, fallback
> @@ -1599,7 +1602,11 @@ re_intr(void *arg)
>   }
>   }
> 
> - re_start(ifp);
> + if (!IFQ_IS_EMPTY(&ifp->if_snd)) {
> + KERNEL_LOCK();
> + re_start(ifp);
> +  

Re: mpsafe re(4)

2015-12-25 Thread Jonathan Matthew
On Sat, Dec 05, 2015 at 06:11:51PM +0100, Jonathan Matthew wrote:
> The main interesting bit here is the txeof and start loops, which previously
> operated based on the prod/cons indices and the contents of the tx queue,
> but now just uses the indices as that's the only way to get a consistent view
> of the tx queue state.
> 
> At the moment I don't think the tx ring is big enough to use IFQ_DEQUEUE
> instead of ifq_deq_begin/commit, but maybe I'm wrong about that.

dlg@ pointed out that the way the driver modifies rl_flags would be unsafe with
this diff applied, since it sets and clears bits during interrupts and ioctl
handling with no locking.  The only change it makes during interrupts is to
flip RL_FLAG_TIMERINTR, which otherwise is only used during re_init and
re_stop, so this updated diff splits it out into a separate field.

ok?


Index: dev/ic/re.c
===
RCS file: /cvs/src/sys/dev/ic/re.c,v
retrieving revision 1.187
diff -u -p -r1.187 re.c
--- dev/ic/re.c 25 Nov 2015 03:09:58 -  1.187
+++ dev/ic/re.c 26 Dec 2015 03:43:35 -
@@ -120,6 +120,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -151,7 +152,7 @@ int redebug = 0;
 
 static inline void re_set_bufaddr(struct rl_desc *, bus_addr_t);
 
-intre_encap(struct rl_softc *, struct mbuf *, int *);
+intre_encap(struct rl_softc *, struct mbuf *, struct rl_txq *, int *);
 
 intre_newbuf(struct rl_softc *);
 intre_rx_list_init(struct rl_softc *);
@@ -1448,18 +1449,14 @@ re_txeof(struct rl_softc *sc)
struct ifnet*ifp;
struct rl_txq   *txq;
uint32_ttxstat;
-   int idx, descidx, tx = 0;
+   int idx, descidx, tx_free, freed = 0;
 
ifp = &sc->sc_arpcom.ac_if;
 
-   for (idx = sc->rl_ldata.rl_txq_considx;; idx = RL_NEXT_TXQ(sc, idx)) {
+   for (idx = sc->rl_ldata.rl_txq_considx;
+   idx != sc->rl_ldata.rl_txq_prodidx; idx = RL_NEXT_TXQ(sc, idx)) {
txq = &sc->rl_ldata.rl_txq[idx];
 
-   if (txq->txq_mbuf == NULL) {
-   KASSERT(idx == sc->rl_ldata.rl_txq_prodidx);
-   break;
-   }
-
descidx = txq->txq_descidx;
RL_TXDESCSYNC(sc, descidx,
BUS_DMASYNC_POSTREAD|BUS_DMASYNC_POSTWRITE);
@@ -1470,9 +1467,7 @@ re_txeof(struct rl_softc *sc)
if (txstat & RL_TDESC_CMD_OWN)
break;
 
-   tx = 1;
-   sc->rl_ldata.rl_tx_free += txq->txq_nsegs;
-   KASSERT(sc->rl_ldata.rl_tx_free <= sc->rl_ldata.rl_tx_desc_cnt);
+   freed += txq->txq_nsegs;
bus_dmamap_sync(sc->sc_dmat, txq->txq_dmamap,
0, txq->txq_dmamap->dm_mapsize, BUS_DMASYNC_POSTWRITE);
bus_dmamap_unload(sc->sc_dmat, txq->txq_dmamap);
@@ -1487,9 +1482,13 @@ re_txeof(struct rl_softc *sc)
ifp->if_opackets++;
}
 
-   sc->rl_ldata.rl_txq_considx = idx;
+   if (freed == 0)
+   return (0);
 
-   ifq_clr_oactive(&ifp->if_snd);
+   tx_free = atomic_add_int_nv(&sc->rl_ldata.rl_tx_free, freed);
+   KASSERT(tx_free <= sc->rl_ldata.rl_tx_desc_cnt);
+
+   sc->rl_ldata.rl_txq_considx = idx;
 
/*
 * Some chips will ignore a second TX request issued while an
@@ -1498,12 +1497,14 @@ re_txeof(struct rl_softc *sc)
 * to restart the channel here to flush them out. This only
 * seems to be required with the PCIe devices.
 */
-   if (sc->rl_ldata.rl_tx_free < sc->rl_ldata.rl_tx_desc_cnt)
+   if (tx_free < sc->rl_ldata.rl_tx_desc_cnt)
CSR_WRITE_1(sc, sc->rl_txstart, RL_TXSTART_START);
-   else
+   else {
+   ifq_clr_oactive(&ifp->if_snd);
ifp->if_timer = 0;
+   }
 
-   return (tx);
+   return (1);
 }
 
 void
@@ -1566,13 +1567,15 @@ re_intr(void *arg)
}
 
if (status & RL_ISR_SYSTEM_ERR) {
+   KERNEL_LOCK();
re_init(ifp);
+   KERNEL_UNLOCK();
claimed = 1;
}
}
 
if (sc->rl_imtype == RL_IMTYPE_SIM) {
-   if ((sc->rl_flags & RL_FLAG_TIMERINTR)) {
+   if (sc->rl_timerintr) {
if ((tx | rx) == 0) {
/*
 * Nothing needs to be processed, fallback
@@ -1599,7 +1602,11 @@ re_intr(void *arg)
}
}
 
-   re_start(ifp);
+   if (!IFQ_IS_EMPTY(&ifp->if_snd)) {
+   KERNEL_LOCK();
+   re_start(ifp);
+   KERNEL_UNLOCK();
+   }
 
CSR_WRITE_2(sc, RL_IMR, sc->rl_intrs);
 
@@ -1607,7 +1614,7 @@ re_intr(void *arg)
 }
 
 int
-re_encap(struct rl_softc *sc, struct mbuf *m, int *idx)
+re_encap(struct rl_s

Re: mpsafe re(4)

2015-12-10 Thread Chris Cappuccio
Dimitris Papastamos [s...@2f30.org] wrote:
> On Sat, Dec 05, 2015 at 06:11:51PM +0100, Jonathan Matthew wrote:
> > The main interesting bit here is the txeof and start loops, which previously
> > operated based on the prod/cons indices and the contents of the tx queue,
> > but now just uses the indices as that's the only way to get a consistent 
> > view
> > of the tx queue state.
> > 
> > At the moment I don't think the tx ring is big enough to use IFQ_DEQUEUE
> > instead of ifq_deq_begin/commit, but maybe I'm wrong about that.
> > 
> > can someone try this on an APU1?
> 
> I've tested this on my router and it seems to work okay.  I've also used
> tcpbench with various combinations.
> 
> re0 at pci2 dev 0 function 0 "Realtek 8168" rev 0x0c: RTL8168G/8111G
> (0x4c00), msi, address 80:ee:73:9f:1d:3e
> re1 at pci3 dev 0 function 0 "Realtek 8168" rev 0x0c: RTL8168G/8111G
> (0x4c00), msi, address 80:ee:73:9f:1d:3d

Testing fine for me on APU1 so far.



Re: mpsafe re(4)

2015-12-06 Thread Dimitris Papastamos
On Sat, Dec 05, 2015 at 06:11:51PM +0100, Jonathan Matthew wrote:
> The main interesting bit here is the txeof and start loops, which previously
> operated based on the prod/cons indices and the contents of the tx queue,
> but now just uses the indices as that's the only way to get a consistent view
> of the tx queue state.
> 
> At the moment I don't think the tx ring is big enough to use IFQ_DEQUEUE
> instead of ifq_deq_begin/commit, but maybe I'm wrong about that.
> 
> can someone try this on an APU1?

I've tested this on my router and it seems to work okay.  I've also used
tcpbench with various combinations.

re0 at pci2 dev 0 function 0 "Realtek 8168" rev 0x0c: RTL8168G/8111G
(0x4c00), msi, address 80:ee:73:9f:1d:3e
re1 at pci3 dev 0 function 0 "Realtek 8168" rev 0x0c: RTL8168G/8111G
(0x4c00), msi, address 80:ee:73:9f:1d:3d



mpsafe re(4)

2015-12-05 Thread Jonathan Matthew
The main interesting bit here is the txeof and start loops, which previously
operated based on the prod/cons indices and the contents of the tx queue,
but now just uses the indices as that's the only way to get a consistent view
of the tx queue state.

At the moment I don't think the tx ring is big enough to use IFQ_DEQUEUE
instead of ifq_deq_begin/commit, but maybe I'm wrong about that.

can someone try this on an APU1?

Index: ic/re.c
===
RCS file: /cvs/src/sys/dev/ic/re.c,v
retrieving revision 1.187
diff -u -p -r1.187 re.c
--- ic/re.c 25 Nov 2015 03:09:58 -  1.187
+++ ic/re.c 4 Dec 2015 15:07:02 -
@@ -120,6 +120,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -151,7 +152,7 @@ int redebug = 0;
 
 static inline void re_set_bufaddr(struct rl_desc *, bus_addr_t);
 
-intre_encap(struct rl_softc *, struct mbuf *, int *);
+intre_encap(struct rl_softc *, struct mbuf *, struct rl_txq *, int *);
 
 intre_newbuf(struct rl_softc *);
 intre_rx_list_init(struct rl_softc *);
@@ -1448,18 +1449,14 @@ re_txeof(struct rl_softc *sc)
struct ifnet*ifp;
struct rl_txq   *txq;
uint32_ttxstat;
-   int idx, descidx, tx = 0;
+   int idx, descidx, tx_free, freed = 0;
 
ifp = &sc->sc_arpcom.ac_if;
 
-   for (idx = sc->rl_ldata.rl_txq_considx;; idx = RL_NEXT_TXQ(sc, idx)) {
+   for (idx = sc->rl_ldata.rl_txq_considx;
+   idx != sc->rl_ldata.rl_txq_prodidx; idx = RL_NEXT_TXQ(sc, idx)) {
txq = &sc->rl_ldata.rl_txq[idx];
 
-   if (txq->txq_mbuf == NULL) {
-   KASSERT(idx == sc->rl_ldata.rl_txq_prodidx);
-   break;
-   }
-
descidx = txq->txq_descidx;
RL_TXDESCSYNC(sc, descidx,
BUS_DMASYNC_POSTREAD|BUS_DMASYNC_POSTWRITE);
@@ -1470,9 +1467,7 @@ re_txeof(struct rl_softc *sc)
if (txstat & RL_TDESC_CMD_OWN)
break;
 
-   tx = 1;
-   sc->rl_ldata.rl_tx_free += txq->txq_nsegs;
-   KASSERT(sc->rl_ldata.rl_tx_free <= sc->rl_ldata.rl_tx_desc_cnt);
+   freed += txq->txq_nsegs;
bus_dmamap_sync(sc->sc_dmat, txq->txq_dmamap,
0, txq->txq_dmamap->dm_mapsize, BUS_DMASYNC_POSTWRITE);
bus_dmamap_unload(sc->sc_dmat, txq->txq_dmamap);
@@ -1487,9 +1482,13 @@ re_txeof(struct rl_softc *sc)
ifp->if_opackets++;
}
 
-   sc->rl_ldata.rl_txq_considx = idx;
+   if (freed == 0)
+   return (0);
 
-   ifq_clr_oactive(&ifp->if_snd);
+   tx_free = atomic_add_int_nv(&sc->rl_ldata.rl_tx_free, freed);
+   KASSERT(tx_free <= sc->rl_ldata.rl_tx_desc_cnt);
+
+   sc->rl_ldata.rl_txq_considx = idx;
 
/*
 * Some chips will ignore a second TX request issued while an
@@ -1498,12 +1497,14 @@ re_txeof(struct rl_softc *sc)
 * to restart the channel here to flush them out. This only
 * seems to be required with the PCIe devices.
 */
-   if (sc->rl_ldata.rl_tx_free < sc->rl_ldata.rl_tx_desc_cnt)
+   if (tx_free < sc->rl_ldata.rl_tx_desc_cnt)
CSR_WRITE_1(sc, sc->rl_txstart, RL_TXSTART_START);
-   else
+   else {
+   ifq_clr_oactive(&ifp->if_snd);
ifp->if_timer = 0;
+   }
 
-   return (tx);
+   return (1);
 }
 
 void
@@ -1566,7 +1567,9 @@ re_intr(void *arg)
}
 
if (status & RL_ISR_SYSTEM_ERR) {
+   KERNEL_LOCK();
re_init(ifp);
+   KERNEL_UNLOCK();
claimed = 1;
}
}
@@ -1599,7 +1602,11 @@ re_intr(void *arg)
}
}
 
-   re_start(ifp);
+   if (!IFQ_IS_EMPTY(&ifp->if_snd)) {
+   KERNEL_LOCK();
+   re_start(ifp);
+   KERNEL_UNLOCK();
+   }
 
CSR_WRITE_2(sc, RL_IMR, sc->rl_intrs);
 
@@ -1607,7 +1614,7 @@ re_intr(void *arg)
 }
 
 int
-re_encap(struct rl_softc *sc, struct mbuf *m, int *idx)
+re_encap(struct rl_softc *sc, struct mbuf *m, struct rl_txq *txq, int *used)
 {
bus_dmamap_tmap;
struct mbuf *mp, mh;
@@ -1616,7 +1623,6 @@ re_encap(struct rl_softc *sc, struct mbu
struct ip   *ip;
struct rl_desc  *d;
u_int32_t   cmdstat, vlanctl = 0, csum_flags = 0;
-   struct rl_txq   *txq;
 
/*
 * Set up checksum offload. Note: checksum offload bits must
@@ -1669,7 +1675,6 @@ re_encap(struct rl_softc *sc, struct mbu
}
}
 
-   txq = &sc->rl_ldata.rl_txq[*idx];
map = txq->txq_dmamap;
 
error = bus_dmamap_load_mbuf(sc->sc_dmat, map, m,
@@ -1710,7 +1715,7 @@ re_encap(struct rl_softc *sc, struct mbu
nsegs++;