On 17/08/15(Mon) 19:55, Mark Kettenis wrote:
> The diff below is a first step towards running the em(4) interrupt
> handler without grabbing the kernel lock. It runs the rx completion
> path without the lock, which is the important bit to be able to test
> the work that has been going on to make the network stack run on
> multiple CPUs. But the handler still grabs the kernel lock for tx
> completions, rx ring refills and everything else. Which means that it
> will still grab the lock every time it runs, which may affect how it
> interacts with the network stack.
>
> The locking strategy is centered around the if_rxring interface. The
> tricky bit is to prevent the code that brings down the interface from
> frees the data structures accessed by the interrupt handler while it
> is running. The interrupt handler's rx completion path checks if
> there are any filled ring slots before checking them and holds the
> lock while doing so. The code that frees the rx ring now explicitly
> calls
>
> if_rxr_init(&sc->rx_ring, 0, 0);
>
> which clears the number of filled ring slots, so after the
> initialization call completes, we can be sure the rx completion code
> doesn't run again until we refill the ring.
>
> The nice side effect if resetting the if_rxring structure this way is
> that if you bring down the interface, systat mbuf correctly shows the
> number of allocated mbufs on the ring as 0. But it also resets the
> water marks, which may or may not be a good idea.
>
> ok?
ok mpi@
> Index: if_em.c
> ===================================================================
> RCS file: /cvs/src/sys/dev/pci/if_em.c,v
> retrieving revision 1.299
> diff -u -p -r1.299 if_em.c
> --- if_em.c 24 Jun 2015 09:40:54 -0000 1.299
> +++ if_em.c 17 Aug 2015 09:24:58 -0000
> @@ -353,6 +353,8 @@ em_attach(struct device *parent, struct
> sc = (struct em_softc *)self;
> sc->osdep.em_pa = *pa;
>
> + mtx_init(&sc->rx_mtx, IPL_NET);
> +
> timeout_set(&sc->timer_handle, em_local_timer, sc);
> timeout_set(&sc->tx_fifo_timer_handle, em_82547_move_tail, sc);
>
> @@ -922,6 +924,8 @@ em_intr(void *arg)
> refill = 1;
> }
>
> + KERNEL_LOCK();
> +
> /* Link status change */
> if (reg_icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC)) {
> sc->hw.get_link_status = 1;
> @@ -942,6 +946,8 @@ em_intr(void *arg)
> E1000_WRITE_REG(&sc->hw, RDT, sc->last_rx_desc_filled);
> }
>
> + KERNEL_UNLOCK();
> +
> return (1);
> }
>
> @@ -1698,8 +1704,8 @@ em_allocate_pci_resources(struct em_soft
> sc->hw.back = &sc->osdep;
>
> intrstr = pci_intr_string(pc, ih);
> - sc->sc_intrhand = pci_intr_establish(pc, ih, IPL_NET, em_intr, sc,
> - sc->sc_dv.dv_xname);
> + sc->sc_intrhand = pci_intr_establish(pc, ih, IPL_NET | IPL_MPSAFE,
> + em_intr, sc, sc->sc_dv.dv_xname);
> if (sc->sc_intrhand == NULL) {
> printf(": couldn't establish interrupt");
> if (intrstr != NULL)
> @@ -2413,6 +2419,8 @@ em_txeof(struct em_softc *sc)
> if (sc->num_tx_desc_avail == sc->num_tx_desc)
> return;
>
> + KERNEL_LOCK();
> +
> num_avail = sc->num_tx_desc_avail;
> first = sc->next_tx_to_clean;
> tx_desc = &sc->tx_desc_base[first];
> @@ -2494,6 +2502,8 @@ em_txeof(struct em_softc *sc)
> ifp->if_timer = EM_TX_TIMEOUT;
>
> sc->num_tx_desc_avail = num_avail;
> +
> + KERNEL_UNLOCK();
> }
>
> /*********************************************************************
> @@ -2743,6 +2753,10 @@ em_free_receive_structures(struct em_sof
>
> INIT_DEBUGOUT("free_receive_structures: begin");
>
> + mtx_enter(&sc->rx_mtx);
> + if_rxr_init(&sc->rx_ring, 0, 0);
> + mtx_leave(&sc->rx_mtx);
> +
> if (sc->rx_buffer_area != NULL) {
> rx_buffer = sc->rx_buffer_area;
> for (i = 0; i < sc->num_rx_desc; i++, rx_buffer++) {
> @@ -2825,6 +2839,8 @@ em_rxfill(struct em_softc *sc)
> int post = 0;
> int i;
>
> + mtx_enter(&sc->rx_mtx);
> +
> i = sc->last_rx_desc_filled;
>
> for (slots = if_rxr_get(&sc->rx_ring, sc->num_rx_desc);
> @@ -2841,6 +2857,8 @@ em_rxfill(struct em_softc *sc)
>
> if_rxr_put(&sc->rx_ring, slots);
>
> + mtx_leave(&sc->rx_mtx);
> +
> return (post);
> }
>
> @@ -2856,6 +2874,7 @@ em_rxeof(struct em_softc *sc)
> {
> struct ifnet *ifp = &sc->interface_data.ac_if;
> struct mbuf_list ml = MBUF_LIST_INITIALIZER();
> + struct mbuf_list free_ml = MBUF_LIST_INITIALIZER();
> struct mbuf *m;
> u_int8_t accept_frame = 0;
> u_int8_t eop = 0;
> @@ -2867,8 +2886,11 @@ em_rxeof(struct em_softc *sc)
> struct em_buffer *pkt;
> u_int8_t status;
>
> - if (if_rxr_inuse(&sc->rx_ring) == 0)
> + mtx_enter(&sc->rx_mtx);
> + if (if_rxr_inuse(&sc->rx_ring) == 0) {
> + mtx_leave(&sc->rx_mtx);
> return;
> + }
>
> i = sc->next_rx_desc_to_check;
>
> @@ -2988,12 +3010,12 @@ em_rxeof(struct em_softc *sc)
> sc->dropped_pkts++;
>
> if (sc->fmp != NULL) {
> - m_freem(sc->fmp);
> + ml_enqueue(&free_ml, sc->fmp);
> sc->fmp = NULL;
> sc->lmp = NULL;
> }
>
> - m_freem(m);
> + ml_enqueue(&free_ml, m);
> }
>
> /* Advance our pointers to the next descriptor. */
> @@ -3005,9 +3027,14 @@ em_rxeof(struct em_softc *sc)
> 0, sizeof(*desc) * sc->num_rx_desc,
> BUS_DMASYNC_PREREAD);
>
> - if_input(ifp, &ml);
> -
> sc->next_rx_desc_to_check = i;
> +
> + mtx_leave(&sc->rx_mtx);
> +
> + MBUF_LIST_FOREACH(&free_ml, m)
> + m_freem(m);
> +
> + if_input(ifp, &ml);
> }
>
> /*********************************************************************
> Index: if_em.h
> ===================================================================
> RCS file: /cvs/src/sys/dev/pci/if_em.h,v
> retrieving revision 1.54
> diff -u -p -r1.54 if_em.h
> --- if_em.h 26 Dec 2014 05:46:32 -0000 1.54
> +++ if_em.h 17 Aug 2015 09:24:58 -0000
> @@ -373,6 +373,7 @@ struct em_softc {
> struct em_dma_alloc rxdma; /* bus_dma glue for rx desc */
> struct em_rx_desc *rx_desc_base;
> struct if_rxring rx_ring;
> + struct mutex rx_mtx;
> u_int32_t next_rx_desc_to_check;
> u_int32_t last_rx_desc_filled;
> u_int32_t rx_buffer_len;
>