Author: ae
Date: Mon May 13 13:45:28 2019
New Revision: 347526
URL: https://svnweb.freebsd.org/changeset/base/347526

Log:
  Rework locking in BPF code to remove rwlock from fast path.
  
  At high packet rates, contention on the rwlock in the bpf_*tap*()
  functions can lead to packet drops. To avoid this, migrate this code
  to the epoch(9) KPI and ConcurrencyKit's lists.
  
  * all lists changed to use CK_LIST;
  * reference counting added to bpf_if and bpf_d;
  * now bpf_if references ifnet and releases this reference on destroy;
  * each bpf_d descriptor references bpf_if when it is attached;
  * new struct bpf_program_buffer introduced to keep BPF filter programs;
  * bpf_program_buffer, bpf_d and bpf_if structures are freed by
    epoch_call();
  * bpf_freelist and ifnet_departure event are no longer needed, thus
    both are removed;
  
  Reviewed by:  melifaro
  Sponsored by: Yandex LLC
  Differential Revision:        https://reviews.freebsd.org/D20224
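
  For readers new to epoch(9): after this change the fast path follows
  the classic read-mostly pattern -- readers walk a CK_LIST inside an
  epoch section, writers modify the lists with the global lock held,
  and freed objects are reclaimed through epoch_call() once all
  in-flight readers have drained. A minimal sketch of that pattern
  (illustrative only; entry, entry_free and writer_remove are
  hypothetical names, not bpf.c code):

    #include <sys/param.h>
    #include <sys/ck.h>
    #include <sys/epoch.h>
    #include <sys/malloc.h>

    struct entry {
            CK_LIST_ENTRY(entry)    link;
            struct epoch_context    ctx;
    };
    static CK_LIST_HEAD(, entry) head;  /* writers serialized by a lock */

    static void
    entry_free(epoch_context_t ctx)
    {
            /* Runs only after every reader that saw the entry has left. */
            free(__containerof(ctx, struct entry, ctx), M_TEMP);
    }

    static void
    reader(void)
    {
            struct epoch_tracker et;
            struct entry *e;

            NET_EPOCH_ENTER(et);            /* lockless read side */
            CK_LIST_FOREACH(e, &head, link)
                    ;                       /* inspect e, no locks taken */
            NET_EPOCH_EXIT(et);
    }

    static void
    writer_remove(struct entry *e)
    {
            /* Caller holds the writer lock (BPF_LOCK in this commit). */
            CK_LIST_REMOVE(e, link);
            epoch_call(net_epoch_preempt, &e->ctx, entry_free);
    }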

Modified:
  head/sys/net/bpf.c
  head/sys/net/bpf.h
  head/sys/net/bpfdesc.h

Modified: head/sys/net/bpf.c
==============================================================================
--- head/sys/net/bpf.c  Mon May 13 13:30:34 2019        (r347525)
+++ head/sys/net/bpf.c  Mon May 13 13:45:28 2019        (r347526)
@@ -3,6 +3,7 @@
  *
  * Copyright (c) 1990, 1991, 1993
  *     The Regents of the University of California.  All rights reserved.
+ * Copyright (c) 2019 Andrey V. Elsukov <a...@freebsd.org>
  *
  * This code is derived from the Stanford/CMU enet packet filter,
  * (net/enet.c) distributed as part of 4.3BSD, and code contributed
@@ -46,7 +47,6 @@ __FBSDID("$FreeBSD$");
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/lock.h>
-#include <sys/rwlock.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/fcntl.h>
@@ -99,7 +99,7 @@ __FBSDID("$FreeBSD$");
 MALLOC_DEFINE(M_BPF, "BPF", "BPF data");
 
 static struct bpf_if_ext dead_bpf_if = {
-       .bif_dlist = LIST_HEAD_INITIALIZER()
+       .bif_dlist = CK_LIST_HEAD_INITIALIZER()
 };
 
 struct bpf_if {
@@ -108,19 +108,22 @@ struct bpf_if {
        struct bpf_if_ext bif_ext;      /* public members */
        u_int           bif_dlt;        /* link layer type */
        u_int           bif_hdrlen;     /* length of link header */
+       struct bpfd_list bif_wlist;     /* writer-only list */
        struct ifnet    *bif_ifp;       /* corresponding interface */
-       struct rwlock   bif_lock;       /* interface lock */
-       LIST_HEAD(, bpf_d) bif_wlist;   /* writer-only list */
-       int             bif_flags;      /* Interface flags */
        struct bpf_if   **bif_bpf;      /* Pointer to pointer to us */
+       volatile u_int  bif_refcnt;
+       struct epoch_context epoch_ctx;
 };
 
 CTASSERT(offsetof(struct bpf_if, bif_ext) == 0);
 
-#define BPFIF_RLOCK(bif)       rw_rlock(&(bif)->bif_lock)
-#define BPFIF_RUNLOCK(bif)     rw_runlock(&(bif)->bif_lock)
-#define BPFIF_WLOCK(bif)       rw_wlock(&(bif)->bif_lock)
-#define BPFIF_WUNLOCK(bif)     rw_wunlock(&(bif)->bif_lock)
+struct bpf_program_buffer {
+       struct epoch_context    epoch_ctx;
+#ifdef BPF_JITTER
+       bpf_jit_filter          *func;
+#endif
+       void                    *buffer[0];
+};
 
 #if defined(DEV_BPF) || defined(NETGRAPH_BPF)
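
The bpf_program_buffer introduced above leans on two C idioms: a
zero-length trailing array, so a single allocation carries both the
bookkeeping header and the variably-sized filter program, and
__containerof(), to step back from the bare program pointer (the only
pointer bpf_d stores) to the enclosing header when it is time to free
it. A hedged sketch of the round trip (prog_buf, prog_alloc and
prog_recover are hypothetical names):

    #include <sys/param.h>
    #include <sys/epoch.h>
    #include <sys/malloc.h>

    struct prog_buf {
            struct epoch_context    ctx;
            void                    *buffer[0]; /* program follows header */
    };

    static void *
    prog_alloc(size_t size)
    {
            struct prog_buf *p;

            /* One allocation: header plus 'size' bytes of program. */
            p = malloc(sizeof(*p) + size, M_TEMP, M_WAITOK);
            return (p->buffer);     /* callers keep only this pointer */
    }

    static struct prog_buf *
    prog_recover(void *program)
    {
            /* Step back from the program to the enclosing allocation. */
            return (__containerof(program, struct prog_buf, buffer));
    }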
 
@@ -173,18 +176,24 @@ struct bpf_dltlist32 {
 #define BPF_LOCK_ASSERT()      sx_assert(&bpf_sx, SA_XLOCKED)
 /*
  * bpf_iflist is a list of BPF interface structures, each corresponding to a
- * specific DLT.  The same network interface might have several BPF interface
+ * specific DLT. The same network interface might have several BPF interface
  * structures registered by different layers in the stack (i.e., 802.11
  * frames, ethernet frames, etc).
  */
-static LIST_HEAD(, bpf_if)     bpf_iflist, bpf_freelist;
+CK_LIST_HEAD(bpf_iflist, bpf_if);
+static struct bpf_iflist bpf_iflist;
 static struct sx       bpf_sx;         /* bpf global lock */
 static int             bpf_bpfd_cnt;
 
+static void    bpfif_ref(struct bpf_if *);
+static void    bpfif_rele(struct bpf_if *);
+
+static void    bpfd_ref(struct bpf_d *);
+static void    bpfd_rele(struct bpf_d *);
 static void    bpf_attachd(struct bpf_d *, struct bpf_if *);
 static void    bpf_detachd(struct bpf_d *);
-static void    bpf_detachd_locked(struct bpf_d *);
-static void    bpf_freed(struct bpf_d *);
+static void    bpf_detachd_locked(struct bpf_d *, bool);
+static void    bpfd_free(epoch_context_t);
 static int     bpf_movein(struct uio *, int, struct ifnet *, struct mbuf **,
                    struct sockaddr *, int *, struct bpf_d *);
 static int     bpf_setif(struct bpf_d *, struct ifreq *);
@@ -243,37 +252,106 @@ static struct filterops bpfread_filtops = {
        .f_event = filt_bpfread,
 };
 
-eventhandler_tag       bpf_ifdetach_cookie = NULL;
-
 /*
- * LOCKING MODEL USED BY BPF:
+ * LOCKING MODEL USED BY BPF
+ *
  * Locks:
- * 1) global lock (BPF_LOCK). Mutex, used to protect interface addition/removal,
- * some global counters and every bpf_if reference.
- * 2) Interface lock. Rwlock, used to protect list of BPF descriptors and their filters.
- * 3) Descriptor lock. Mutex, used to protect BPF buffers and various structure fields
- *   used by bpf_mtap code.
+ * 1) global lock (BPF_LOCK). Sx, used to protect some global counters,
+ * all bpf_iflist changes, and to serialize ioctl access to bpf
+ * descriptors.
+ * 2) Descriptor lock. Mutex, used to protect BPF buffers and various
+ * structure fields used by bpf_*tap* code.
  *
- * Lock order:
+ * Lock order: global lock, then descriptor lock.
  *
- * Global lock, interface lock, descriptor lock
+ * There are several possible consumers:
  *
- * We have to acquire interface lock before descriptor main lock due to BPF_MTAP[2]
- * working model. In many places (like bpf_detachd) we start with BPF descriptor
- * (and we need to at least rlock it to get reliable interface pointer). This
- * gives us potential LOR. As a result, we use global lock to protect from bpf_if
- * change in every such place.
+ * 1. The kernel registers an interface pointer with bpfattach().
+ * Each call allocates a new bpf_if structure, references the ifnet
+ * pointer and links the bpf_if into the bpf_iflist chain. This is
+ * protected by the global lock.
  *
- * Changing d->bd_bif is protected by 1) global lock, 2) interface lock and
- * 3) descriptor main wlock.
- * Reading bd_bif can be protected by any of these locks, typically global lock.
+ * 2. A userland application uses ioctl() calls on a bpf_d descriptor.
+ * All such calls are serialized with the global lock. BPF filters can
+ * be changed, but the pointer to the old filter is freed using
+ * epoch_call(). Thus it is safe for the bpf_tap/bpf_mtap* code to
+ * access filter pointers, even if a change happens during bpf_tap
+ * execution. Destroying a bpf_d descriptor is also done using
+ * epoch_call().
  *
- * Changing read/write BPF filter is protected by the same three locks,
- * the same applies for reading.
+ * 3. A userland application can write packets into a bpf_d descriptor.
+ * There we need to be sure that the ifnet won't disappear during
+ * bpfwrite().
  *
- * Sleeping in global lock is not allowed due to bpfdetach() using it.
+ * 4. The kernel invokes the bpf_tap/bpf_mtap* functions. Access to
+ * bif_dlist is protected by the net_epoch_preempt section, so it is
+ * safe to access bpf_d descriptors inside the section.
+ *
+ * 5. The kernel invokes bpfdetach() on interface destruction. All
+ * lists are modified with the global lock held and the actual free()
+ * is done using epoch_call().
  */
 
+static void
+bpfif_free(epoch_context_t ctx)
+{
+       struct bpf_if *bp;
+
+       bp = __containerof(ctx, struct bpf_if, epoch_ctx);
+       if_rele(bp->bif_ifp);
+       free(bp, M_BPF);
+}
+
+static void
+bpfif_ref(struct bpf_if *bp)
+{
+
+       refcount_acquire(&bp->bif_refcnt);
+}
+
+static void
+bpfif_rele(struct bpf_if *bp)
+{
+
+       if (!refcount_release(&bp->bif_refcnt))
+               return;
+       epoch_call(net_epoch_preempt, &bp->epoch_ctx, bpfif_free);
+}
+
+static void
+bpfd_ref(struct bpf_d *d)
+{
+
+       refcount_acquire(&d->bd_refcnt);
+}
+
+static void
+bpfd_rele(struct bpf_d *d)
+{
+
+       if (!refcount_release(&d->bd_refcnt))
+               return;
+       epoch_call(net_epoch_preempt, &d->epoch_ctx, bpfd_free);
+}
+
+static struct bpf_program_buffer*
+bpf_program_buffer_alloc(size_t size, int flags)
+{
+
+       return (malloc(sizeof(struct bpf_program_buffer) + size,
+           M_BPF, flags));
+}
+
+static void
+bpf_program_buffer_free(epoch_context_t ctx)
+{
+       struct bpf_program_buffer *ptr;
+
+       ptr = __containerof(ctx, struct bpf_program_buffer, epoch_ctx);
+#ifdef BPF_JITTER
+       if (ptr->func != NULL)
+               bpf_destroy_jit_filter(ptr->func);
+#endif
+       free(ptr, M_BPF);
+}
+
 /*
  * Wrapper functions for various buffering methods.  If the set of buffer
  * modes expands, we will probably want to introduce a switch data structure
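
Taken together, the helpers above give each object two lifetime
mechanisms: a plain reference count for holders that may sleep or drop
locks, and epoch-deferred reclamation so the final release never frees
memory a lockless reader may still be dereferencing. How a holder uses
the pair, sketched with hypothetical names (obj, obj_hold_and_use):

    #include <sys/param.h>
    #include <sys/epoch.h>
    #include <sys/malloc.h>
    #include <sys/refcount.h>

    struct obj {
            volatile u_int          refcnt;
            struct epoch_context    ctx;
    };

    static void
    obj_free(epoch_context_t ctx)
    {
            free(__containerof(ctx, struct obj, ctx), M_TEMP);
    }

    static void
    obj_rele(struct obj *o)
    {
            if (!refcount_release(&o->refcnt))
                    return;
            /* Last reference: defer the free past current readers. */
            epoch_call(net_epoch_preempt, &o->ctx, obj_free);
    }

    static void
    obj_hold_and_use(struct obj *o)
    {
            refcount_acquire(&o->refcnt);   /* pin before sleeping */
            /* ... drop locks, possibly sleep, keep using o ... */
            obj_rele(o);                    /* may schedule obj_free() */
    }
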
@@ -627,7 +705,8 @@ bad:
 }
 
 /*
- * Attach file to the bpf interface, i.e. make d listen on bp.
+ * Attach the descriptor to the bpf interface, i.e. make d listen on bp,
+ * then reset its buffers and counters with reset_d().
  */
 static void
 bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
@@ -643,7 +722,7 @@ bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
        op_w = V_bpf_optimize_writers || d->bd_writer;
 
        if (d->bd_bif != NULL)
-               bpf_detachd_locked(d);
+               bpf_detachd_locked(d, false);
        /*
         * Point d at bp, and add d to the interface's list.
         * Since there are many applications using BPF for
@@ -652,26 +731,27 @@ bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
         * some filter is configured.
         */
 
-       BPFIF_WLOCK(bp);
        BPFD_LOCK(d);
-
+       /*
+        * Hold a reference to the bpf_if while the descriptor uses
+        * this interface.
+        */
+       bpfif_ref(bp);
        d->bd_bif = bp;
-
        if (op_w != 0) {
                /* Add to writers-only list */
-               LIST_INSERT_HEAD(&bp->bif_wlist, d, bd_next);
+               CK_LIST_INSERT_HEAD(&bp->bif_wlist, d, bd_next);
                /*
                 * We decrement bd_writer on every filter set operation.
                 * First BIOCSETF is done by pcap_open_live() to set up
-                * snap length. After that appliation usually sets its own filter
+                * snap length. After that the application usually sets
+                * its own filter.
                 */
                d->bd_writer = 2;
        } else
-               LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next);
+               CK_LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next);
 
+       reset_d(d);
        BPFD_UNLOCK(d);
-       BPFIF_WUNLOCK(bp);
-
        bpf_bpfd_cnt++;
 
        CTR3(KTR_NET, "%s: bpf_attach called by pid %d, adding to %s list",
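
Note the discipline bpf_attachd() relies on here: CK_LIST modifications
are still serialized (BPF_LOCK plus the descriptor mutex), but
concurrent readers need no lock at all, because ck_queue publishes an
insertion with a single ordered pointer store. Reusing the types from
the first sketch above, the writer side looks roughly like this
(attach_one and writer_lock are hypothetical):

    #include <sys/lock.h>
    #include <sys/mutex.h>

    static struct mtx writer_lock;  /* serializes all list writers */

    static void
    attach_one(struct entry *e)
    {
            mtx_lock(&writer_lock);
            /*
             * Publication is one ordered store: an epoch reader walking
             * the list sees either the old or the new head, never a
             * torn state.
             */
            CK_LIST_INSERT_HEAD(&head, e, link);
            mtx_unlock(&writer_lock);
    }
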
@@ -685,7 +765,8 @@ bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
  * Check if we need to upgrade our descriptor @d from write-only mode.
  */
 static int
-bpf_check_upgrade(u_long cmd, struct bpf_d *d, struct bpf_insn *fcode, int flen)
+bpf_check_upgrade(u_long cmd, struct bpf_d *d, struct bpf_insn *fcode,
+    int flen)
 {
        int is_snap, need_upgrade;
 
@@ -705,7 +786,8 @@ bpf_check_upgrade(u_long cmd, struct bpf_d *d, struct 
         * we'd prefer to treat k=0 (deny ALL) case the same way: e.g.
         * do not consider upgrading immediately
         */
-       if (cmd == BIOCSETF && flen == 1 && fcode[0].code == (BPF_RET | BPF_K))
+       if (cmd == BIOCSETF && flen == 1 &&
+           fcode[0].code == (BPF_RET | BPF_K))
                is_snap = 1;
        else
                is_snap = 0;
@@ -743,88 +825,45 @@ bpf_check_upgrade(u_long cmd, struct bpf_d *d, struct 
 }
 
 /*
- * Add d to the list of active bp filters.
- * Requires bpf_attachd() to be called before.
- */
-static void
-bpf_upgraded(struct bpf_d *d)
-{
-       struct bpf_if *bp;
-
-       BPF_LOCK_ASSERT();
-
-       bp = d->bd_bif;
-
-       /*
-        * Filter can be set several times without specifying interface.
-        * Mark d as reader and exit.
-        */
-       if (bp == NULL) {
-               BPFD_LOCK(d);
-               d->bd_writer = 0;
-               BPFD_UNLOCK(d);
-               return;
-       }
-
-       BPFIF_WLOCK(bp);
-       BPFD_LOCK(d);
-
-       /* Remove from writers-only list */
-       LIST_REMOVE(d, bd_next);
-       LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next);
-       /* Mark d as reader */
-       d->bd_writer = 0;
-
-       BPFD_UNLOCK(d);
-       BPFIF_WUNLOCK(bp);
-
-       CTR2(KTR_NET, "%s: upgrade required by pid %d", __func__, d->bd_pid);
-
-       EVENTHANDLER_INVOKE(bpf_track, bp->bif_ifp, bp->bif_dlt, 1);
-}
-
-/*
  * Detach a file from its interface.
  */
 static void
 bpf_detachd(struct bpf_d *d)
 {
        BPF_LOCK();
-       bpf_detachd_locked(d);
+       bpf_detachd_locked(d, false);
        BPF_UNLOCK();
 }
 
 static void
-bpf_detachd_locked(struct bpf_d *d)
+bpf_detachd_locked(struct bpf_d *d, bool detached_ifp)
 {
-       int error;
        struct bpf_if *bp;
        struct ifnet *ifp;
+       int error;
 
+       BPF_LOCK_ASSERT();
        CTR2(KTR_NET, "%s: detach required by pid %d", __func__, d->bd_pid);
 
-       BPF_LOCK_ASSERT();
-
        /* Check if descriptor is attached */
        if ((bp = d->bd_bif) == NULL)
                return;
 
-       BPFIF_WLOCK(bp);
        BPFD_LOCK(d);
-
+       /* Remove d from the interface's descriptor list. */
+       CK_LIST_REMOVE(d, bd_next);
        /* Save bd_writer value */
        error = d->bd_writer;
-
-       /*
-        * Remove d from the interface's descriptor list.
-        */
-       LIST_REMOVE(d, bd_next);
-
        ifp = bp->bif_ifp;
        d->bd_bif = NULL;
+       if (detached_ifp) {
+               /*
+                * Notify the descriptor that it has been detached, so
+                * that any sleepers wake up and get ENXIO.
+                */
+               bpf_wakeup(d);
+       }
        BPFD_UNLOCK(d);
-       BPFIF_WUNLOCK(bp);
-
        bpf_bpfd_cnt--;
 
        /* Call event handler iff d is attached */
@@ -833,9 +872,9 @@ bpf_detachd_locked(struct bpf_d *d)
 
        /*
         * Check if this descriptor had requested promiscuous mode.
-        * If so, turn it off.
+        * If so, and the ifnet is not detached, turn it off.
         */
-       if (d->bd_promisc) {
+       if (d->bd_promisc && !detached_ifp) {
                d->bd_promisc = 0;
                CURVNET_SET(ifp->if_vnet);
                error = ifpromisc(ifp, 0);
@@ -851,6 +890,7 @@ bpf_detachd_locked(struct bpf_d *d)
                                "bpf_detach: ifpromisc failed (%d)\n", error);
                }
        }
+       bpfif_rele(bp);
 }
 
 /*
@@ -875,8 +915,7 @@ bpf_dtor(void *data)
        seldrain(&d->bd_sel);
        knlist_destroy(&d->bd_sel.si_note);
        callout_drain(&d->bd_callout);
-       bpf_freed(d);
-       free(d, M_BPF);
+       bpfd_rele(d);
 }
 
 /*
@@ -918,6 +957,7 @@ bpfopen(struct cdev *dev, int flags, int fmt, struct t
        d->bd_bufmode = BPF_BUFMODE_BUFFER;
        d->bd_sig = SIGIO;
        d->bd_direction = BPF_D_INOUT;
+       d->bd_refcnt = 1;
        BPF_PID_REFRESH(d, td);
 #ifdef MAC
        mac_bpfdesc_init(d);
@@ -1093,7 +1133,8 @@ bpf_timed_out(void *arg)
 
        BPFD_LOCK_ASSERT(d);
 
-       if (callout_pending(&d->bd_callout) || !callout_active(&d->bd_callout))
+       if (callout_pending(&d->bd_callout) ||
+           !callout_active(&d->bd_callout))
                return;
        if (d->bd_state == BPF_WAITING) {
                d->bd_state = BPF_TIMED_OUT;
@@ -1119,47 +1160,71 @@ bpf_ready(struct bpf_d *d)
 static int
 bpfwrite(struct cdev *dev, struct uio *uio, int ioflag)
 {
+       struct route ro;
+       struct sockaddr dst;
+       struct epoch_tracker et;
+       struct bpf_if *bp;
        struct bpf_d *d;
        struct ifnet *ifp;
        struct mbuf *m, *mc;
-       struct sockaddr dst;
-       struct route ro;
        int error, hlen;
 
        error = devfs_get_cdevpriv((void **)&d);
        if (error != 0)
                return (error);
 
+       NET_EPOCH_ENTER(et);
+       BPFD_LOCK(d);
        BPF_PID_REFRESH_CUR(d);
        counter_u64_add(d->bd_wcount, 1);
-       /* XXX: locking required */
-       if (d->bd_bif == NULL) {
-               counter_u64_add(d->bd_wdcount, 1);
-               return (ENXIO);
+       if ((bp = d->bd_bif) == NULL) {
+               error = ENXIO;
+               goto out_locked;
        }
 
-       ifp = d->bd_bif->bif_ifp;
-
+       ifp = bp->bif_ifp;
        if ((ifp->if_flags & IFF_UP) == 0) {
-               counter_u64_add(d->bd_wdcount, 1);
-               return (ENETDOWN);
+               error = ENETDOWN;
+               goto out_locked;
        }
 
-       if (uio->uio_resid == 0) {
-               counter_u64_add(d->bd_wdcount, 1);
-               return (0);
-       }
+       if (uio->uio_resid == 0)
+               goto out_locked;
 
        bzero(&dst, sizeof(dst));
        m = NULL;
        hlen = 0;
-       /* XXX: bpf_movein() can sleep */
-       error = bpf_movein(uio, (int)d->bd_bif->bif_dlt, ifp,
+
+       /*
+        * Take an extra reference, exit the epoch section and unlock
+        * d, since bpf_movein() can sleep.
+        */
+       bpfd_ref(d);
+       NET_EPOCH_EXIT(et);
+       BPFD_UNLOCK(d);
+
+       error = bpf_movein(uio, (int)bp->bif_dlt, ifp,
            &m, &dst, &hlen, d);
-       if (error) {
+
+       if (error != 0) {
                counter_u64_add(d->bd_wdcount, 1);
+               bpfd_rele(d);
                return (error);
        }
+
+       BPFD_LOCK(d);
+       /*
+        * Check that the descriptor is still attached to the interface;
+        * it may have been detached by bpfdetach(). To avoid access to
+        * a detached ifnet, free the mbuf and return ENXIO.
+        */
+       if (d->bd_bif == NULL) {
+               counter_u64_add(d->bd_wdcount, 1);
+               BPFD_UNLOCK(d);
+               bpfd_rele(d);
+               m_freem(m);
+               return (ENXIO);
+       }
        counter_u64_add(d->bd_wfcount, 1);
        if (d->bd_hdrcmplt)
                dst.sa_family = pseudo_AF_HDRCMPLT;
@@ -1180,11 +1245,9 @@ bpfwrite(struct cdev *dev, struct uio *uio, int ioflag
 
        CURVNET_SET(ifp->if_vnet);
 #ifdef MAC
-       BPFD_LOCK(d);
        mac_bpfdesc_create_mbuf(d, m);
        if (mc != NULL)
                mac_bpfdesc_create_mbuf(d, mc);
-       BPFD_UNLOCK(d);
 #endif
 
        bzero(&ro, sizeof(ro));
@@ -1205,7 +1268,14 @@ bpfwrite(struct cdev *dev, struct uio *uio, int ioflag
                        m_freem(mc);
        }
        CURVNET_RESTORE();
+       BPFD_UNLOCK(d);
+       bpfd_rele(d);
+       return (error);
 
+out_locked:
+       counter_u64_add(d->bd_wdcount, 1);
+       NET_EPOCH_EXIT(et);
+       BPFD_UNLOCK(d);
        return (error);
 }
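
The reworked bpfwrite() above is a textbook sleep-then-revalidate
sequence: pin the descriptor, leave the epoch section and drop the
mutex before the sleeping bpf_movein(), then relock and re-check
bd_bif, because bpfdetach() may have run in the meantime. Distilled to
its skeleton (a sketch; counters and the actual transmit are trimmed,
and the copy step is a stand-in):

    static int
    write_sketch(struct bpf_d *d)
    {
            struct epoch_tracker et;
            struct bpf_if *bp;
            int error;

            NET_EPOCH_ENTER(et);
            BPFD_LOCK(d);
            if ((bp = d->bd_bif) == NULL) {         /* not attached */
                    BPFD_UNLOCK(d);
                    NET_EPOCH_EXIT(et);
                    return (ENXIO);
            }
            bpfd_ref(d);                            /* pin d across sleep */
            NET_EPOCH_EXIT(et);
            BPFD_UNLOCK(d);

            error = 0;      /* stand-in for the sleeping bpf_movein() */

            BPFD_LOCK(d);
            if (d->bd_bif == NULL) {        /* raced with bpfdetach() */
                    BPFD_UNLOCK(d);
                    bpfd_rele(d);
                    return (ENXIO);
            }
            /* ... hand the packet to bp->bif_ifp's output path ... */
            BPFD_UNLOCK(d);
            bpfd_rele(d);
            return (error);
    }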
 
@@ -1830,16 +1900,11 @@ bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, i
 }
 
 /*
- * Set d's packet filter program to fp.  If this file already has a filter,
- * free it and replace it.  Returns EINVAL for bogus requests.
+ * Set d's packet filter program to fp. If this file already has a filter,
+ * free it and replace it. Returns EINVAL for bogus requests.
  *
- * Note we need global lock here to serialize bpf_setf() and bpf_setif() calls
- * since reading d->bd_bif can't be protected by d or interface lock due to
- * lock order.
- *
- * Additionally, we have to acquire interface write lock due to bpf_mtap() uses
- * interface read lock to read all filers.
- *
+ * Note we use global lock here to serialize bpf_setf() and bpf_setif()
+ * calls.
  */
 static int
 bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd)
@@ -1848,13 +1913,14 @@ bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_lo
        struct bpf_program fp_swab;
        struct bpf_program32 *fp32;
 #endif
-       struct bpf_insn *fcode, *old;
+       struct bpf_program_buffer *fcode;
+       struct bpf_insn *filter;
 #ifdef BPF_JITTER
-       bpf_jit_filter *jfunc, *ofunc;
+       bpf_jit_filter *jfunc;
 #endif
        size_t size;
        u_int flen;
-       int need_upgrade;
+       bool track_event;
 
 #ifdef COMPAT_FREEBSD32
        switch (cmd) {
@@ -1863,7 +1929,8 @@ bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_lo
        case BIOCSETFNR32:
                fp32 = (struct bpf_program32 *)fp;
                fp_swab.bf_len = fp32->bf_len;
-               fp_swab.bf_insns = (struct bpf_insn *)(uintptr_t)fp32->bf_insns;
+               fp_swab.bf_insns =
+                   (struct bpf_insn *)(uintptr_t)fp32->bf_insns;
                fp = &fp_swab;
                switch (cmd) {
                case BIOCSETF32:
@@ -1877,12 +1944,10 @@ bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_lo
        }
 #endif
 
-       fcode = NULL;
+       filter = NULL;
 #ifdef BPF_JITTER
-       jfunc = ofunc = NULL;
+       jfunc = NULL;
 #endif
-       need_upgrade = 0;
-
        /*
         * Check new filter validness before acquiring any locks.
         * Allocate memory for new filter, if needed.
@@ -1892,10 +1957,11 @@ bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_lo
                return (EINVAL);
        size = flen * sizeof(*fp->bf_insns);
        if (size > 0) {
-               /* We're setting up new filter.  Copy and check actual data. */
-               fcode = malloc(size, M_BPF, M_WAITOK);
-               if (copyin(fp->bf_insns, fcode, size) != 0 ||
-                   !bpf_validate(fcode, flen)) {
+               /* We're setting up a new filter. Copy and check actual data. */
+               fcode = bpf_program_buffer_alloc(size, M_WAITOK);
+               filter = (struct bpf_insn *)fcode->buffer;
+               if (copyin(fp->bf_insns, filter, size) != 0 ||
+                   !bpf_validate(filter, flen)) {
                        free(fcode, M_BPF);
                        return (EINVAL);
                }
@@ -1905,50 +1971,73 @@ bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_lo
                         * Filter is copied inside fcode and is
                         * perfectly valid.
                         */
-                       jfunc = bpf_jitter(fcode, flen);
+                       jfunc = bpf_jitter(filter, flen);
                }
 #endif
        }
 
-       BPF_LOCK();
+       track_event = false;
+       fcode = NULL;
 
-       /*
-        * Set up new filter.
-        * Protect filter change by interface lock.
-        * Additionally, we are protected by global lock here.
-        */
-       if (d->bd_bif != NULL)
-               BPFIF_WLOCK(d->bd_bif);
+       BPF_LOCK();
        BPFD_LOCK(d);
+       /* Set up new filter. */
        if (cmd == BIOCSETWF) {
-               old = d->bd_wfilter;
-               d->bd_wfilter = fcode;
+               if (d->bd_wfilter != NULL) {
+                       fcode = __containerof((void *)d->bd_wfilter,
+                           struct bpf_program_buffer, buffer);
+#ifdef BPF_JITTER
+                       fcode->func = NULL;
+#endif
+               }
+               d->bd_wfilter = filter;
        } else {
-               old = d->bd_rfilter;
-               d->bd_rfilter = fcode;
+               if (d->bd_rfilter != NULL) {
+                       fcode = __containerof((void *)d->bd_rfilter,
+                           struct bpf_program_buffer, buffer);
 #ifdef BPF_JITTER
-               ofunc = d->bd_bfilter;
+                       fcode->func = d->bd_bfilter;
+#endif
+               }
+               d->bd_rfilter = filter;
+#ifdef BPF_JITTER
                d->bd_bfilter = jfunc;
 #endif
                if (cmd == BIOCSETF)
                        reset_d(d);
 
-               need_upgrade = bpf_check_upgrade(cmd, d, fcode, flen);
+               if (bpf_check_upgrade(cmd, d, filter, flen) != 0) {
+                       /*
+                        * A filter can be set several times without
+                        * specifying an interface. In this case just
+                        * mark d as a reader.
+                        */
+                       d->bd_writer = 0;
+                       if (d->bd_bif != NULL) {
+                               /*
+                                * Remove descriptor from writers-only list
+                                * and add it to active readers list.
+                                */
+                               CK_LIST_REMOVE(d, bd_next);
+                               CK_LIST_INSERT_HEAD(&d->bd_bif->bif_dlist,
+                                   d, bd_next);
+                               CTR2(KTR_NET,
+                                   "%s: upgrade required by pid %d",
+                                   __func__, d->bd_pid);
+                               track_event = true;
+                       }
+               }
        }
        BPFD_UNLOCK(d);
-       if (d->bd_bif != NULL)
-               BPFIF_WUNLOCK(d->bd_bif);
-       if (old != NULL)
-               free(old, M_BPF);
-#ifdef BPF_JITTER
-       if (ofunc != NULL)
-               bpf_destroy_jit_filter(ofunc);
-#endif
 
-       /* Move d to active readers list. */
-       if (need_upgrade != 0)
-               bpf_upgraded(d);
+       if (fcode != NULL)
+               epoch_call(net_epoch_preempt, &fcode->epoch_ctx,
+                   bpf_program_buffer_free);
 
+       if (track_event)
+               EVENTHANDLER_INVOKE(bpf_track,
+                   d->bd_bif->bif_ifp, d->bd_bif->bif_dlt, 1);
+
        BPF_UNLOCK();
        return (0);
 }
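
The replacement logic above is RCU-style pointer swapping: the new
program is published under BPF_LOCK and the descriptor mutex, while the
old buffer, recovered with __containerof(), is handed to epoch_call(),
so a bpf_tap() racing with BIOCSETF simply finishes the packet with the
old filter. Reduced to its skeleton (a sketch of the read-filter case
only; swap_rfilter is a hypothetical name):

    static void
    swap_rfilter(struct bpf_d *d, struct bpf_insn *filter)
    {
            struct bpf_program_buffer *old = NULL;

            BPFD_LOCK(d);
            if (d->bd_rfilter != NULL)
                    old = __containerof((void *)d->bd_rfilter,
                        struct bpf_program_buffer, buffer);
            d->bd_rfilter = filter; /* readers see old or new, never junk */
            BPFD_UNLOCK(d);

            if (old != NULL)
                    epoch_call(net_epoch_preempt, &old->epoch_ctx,
                        bpf_program_buffer_free);
    }
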
@@ -1971,15 +2060,6 @@ bpf_setif(struct bpf_d *d, struct ifreq *ifr)
                return (ENXIO);
 
        bp = theywant->if_bpf;
-
-       /* Check if interface is not being detached from BPF */
-       BPFIF_RLOCK(bp);
-       if (bp->bif_flags & BPFIF_FLAG_DYING) {
-               BPFIF_RUNLOCK(bp);
-               return (ENXIO);
-       }
-       BPFIF_RUNLOCK(bp);
-
        /*
         * At this point, we expect the buffer is already allocated.  If not,
         * return an error.
@@ -1996,9 +2076,11 @@ bpf_setif(struct bpf_d *d, struct ifreq *ifr)
        }
        if (bp != d->bd_bif)
                bpf_attachd(d, bp);
-       BPFD_LOCK(d);
-       reset_d(d);
-       BPFD_UNLOCK(d);
+       else {
+               BPFD_LOCK(d);
+               reset_d(d);
+               BPFD_UNLOCK(d);
+       }
        return (0);
 }
 
@@ -2150,6 +2232,7 @@ bpf_gettime(struct bintime *bt, int tstype, struct mbu
 void
 bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen)
 {
+       struct epoch_tracker et;
        struct bintime bt;
        struct bpf_d *d;
 #ifdef BPF_JITTER
@@ -2159,24 +2242,14 @@ bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen)
        int gottime;
 
        gottime = BPF_TSTAMP_NONE;
-
-       BPFIF_RLOCK(bp);
-
-       LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
-               /*
-                * We are not using any locks for d here because:
-                * 1) any filter change is protected by interface
-                * write lock
-                * 2) destroying/detaching d is protected by interface
-                * write lock, too
-                */
-
+       NET_EPOCH_ENTER(et);
+       CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
                counter_u64_add(d->bd_rcount, 1);
                /*
-                * NB: We dont call BPF_CHECK_DIRECTION() here since there is no
-                * way for the caller to indiciate to us whether this packet
-                * is inbound or outbound.  In the bpf_mtap() routines, we use
-                * the interface pointers on the mbuf to figure it out.
+                * NB: We don't call BPF_CHECK_DIRECTION() here since there
+                * is no way for the caller to indicate to us whether this
+                * packet is inbound or outbound. In the bpf_mtap() routines,
+                * we use the interface pointers on the mbuf to figure it out.
                 */
 #ifdef BPF_JITTER
                bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL;
@@ -2190,10 +2263,10 @@ bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen)
                         * Filter matches. Let's acquire the write lock.
                         */
                        BPFD_LOCK(d);
-
                        counter_u64_add(d->bd_fcount, 1);
                        if (gottime < bpf_ts_quality(d->bd_tstamp))
-                               gottime = bpf_gettime(&bt, d->bd_tstamp, NULL);
+                               gottime = bpf_gettime(&bt, d->bd_tstamp,
+                                   NULL);
 #ifdef MAC
                        if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
 #endif
@@ -2202,7 +2275,7 @@ bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen)
                        BPFD_UNLOCK(d);
                }
        }
-       BPFIF_RUNLOCK(bp);
+       NET_EPOCH_EXIT(et);
 }
 
 #define        BPF_CHECK_DIRECTION(d, r, i)                            \
@@ -2216,6 +2289,7 @@ bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen)
 void
 bpf_mtap(struct bpf_if *bp, struct mbuf *m)
 {
+       struct epoch_tracker et;
        struct bintime bt;
        struct bpf_d *d;
 #ifdef BPF_JITTER
@@ -2233,9 +2307,8 @@ bpf_mtap(struct bpf_if *bp, struct mbuf *m)
        pktlen = m_length(m, NULL);
        gottime = BPF_TSTAMP_NONE;
 
-       BPFIF_RLOCK(bp);
-
-       LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
+       NET_EPOCH_ENTER(et);
+       CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
                if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif, bp->bif_ifp))
                        continue;
                counter_u64_add(d->bd_rcount, 1);
@@ -2243,7 +2316,8 @@ bpf_mtap(struct bpf_if *bp, struct mbuf *m)
                bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL;
                /* XXX We cannot handle multiple mbufs. */
                if (bf != NULL && m->m_next == NULL)
-                       slen = (*(bf->func))(mtod(m, u_char *), pktlen, pktlen);
+                       slen = (*(bf->func))(mtod(m, u_char *), pktlen,
+                           pktlen);
                else
 #endif
                slen = bpf_filter(d->bd_rfilter, (u_char *)m, pktlen, 0);
@@ -2261,7 +2335,7 @@ bpf_mtap(struct bpf_if *bp, struct mbuf *m)
                        BPFD_UNLOCK(d);
                }
        }
-       BPFIF_RUNLOCK(bp);
+       NET_EPOCH_EXIT(et);
 }
 
 /*
@@ -2271,6 +2345,7 @@ bpf_mtap(struct bpf_if *bp, struct mbuf *m)
 void
 bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m)
 {
+       struct epoch_tracker et;
        struct bintime bt;
        struct mbuf mb;
        struct bpf_d *d;
@@ -2296,9 +2371,8 @@ bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, s
 
        gottime = BPF_TSTAMP_NONE;
 
-       BPFIF_RLOCK(bp);
-
-       LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
+       NET_EPOCH_ENTER(et);
+       CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
                if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif, bp->bif_ifp))
                        continue;
                counter_u64_add(d->bd_rcount, 1);
@@ -2317,11 +2391,10 @@ bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, s
                        BPFD_UNLOCK(d);
                }
        }
-       BPFIF_RUNLOCK(bp);
+       NET_EPOCH_EXIT(et);
 }
 
 #undef BPF_CHECK_DIRECTION
-
 #undef BPF_TSTAMP_NONE
 #undef BPF_TSTAMP_FAST
 #undef BPF_TSTAMP_NORMAL
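
All three tap routines now share the same shape: the list walk and the
filter evaluation are lock-free under the epoch, and the per-descriptor
mutex is taken only when a filter actually matches, keeping the common
no-match case off any lock entirely. Schematically (a sketch; direction
checks, timestamps and the catchpacket() call are elided):

    static void
    tap_sketch(struct bpf_if *bp, u_char *pkt, u_int pktlen)
    {
            struct epoch_tracker et;
            struct bpf_d *d;
            u_int slen;

            NET_EPOCH_ENTER(et);
            CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
                    slen = bpf_filter(d->bd_rfilter, pkt, pktlen, pktlen);
                    if (slen == 0)
                            continue;       /* no match: no lock taken */
                    BPFD_LOCK(d);           /* buffers still need the mutex */
                    /* ... catchpacket(d, pkt, pktlen, slen, ...) ... */
                    BPFD_UNLOCK(d);
            }
            NET_EPOCH_EXIT(et);
    }
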
@@ -2540,26 +2613,30 @@ copy:
  * Called on close.
  */
 static void
-bpf_freed(struct bpf_d *d)
+bpfd_free(epoch_context_t ctx)
 {
+       struct bpf_d *d;
+       struct bpf_program_buffer *p;
 
        /*
         * We don't need to lock out interrupts since this descriptor has
         * been detached from its interface and has not yet been marked
         * free.
         */
+       d = __containerof(ctx, struct bpf_d, epoch_ctx);
        bpf_free(d);
        if (d->bd_rfilter != NULL) {
-               free((caddr_t)d->bd_rfilter, M_BPF);
-#ifdef BPF_JITTER
-               if (d->bd_bfilter != NULL)
-                       bpf_destroy_jit_filter(d->bd_bfilter);
-#endif
+               p = __containerof((void *)d->bd_rfilter,
+                   struct bpf_program_buffer, buffer);
+               bpf_program_buffer_free(&p->epoch_ctx);
        }
-       if (d->bd_wfilter != NULL)
-               free((caddr_t)d->bd_wfilter, M_BPF);
-       mtx_destroy(&d->bd_lock);
+       if (d->bd_wfilter != NULL) {
+               p = __containerof((void *)d->bd_wfilter,
+                   struct bpf_program_buffer, buffer);
+               bpf_program_buffer_free(&p->epoch_ctx);
+       }
 
+       mtx_destroy(&d->bd_lock);
        counter_u64_free(d->bd_rcount);
        counter_u64_free(d->bd_dcount);
        counter_u64_free(d->bd_fcount);
@@ -2567,7 +2644,7 @@ bpf_freed(struct bpf_d *d)
        counter_u64_free(d->bd_wfcount);
        counter_u64_free(d->bd_wdcount);
        counter_u64_free(d->bd_zcopy);
-
+       free(d, M_BPF);
 }
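
One subtlety in bpfd_free() above: it already runs as an epoch
callback, so the filter buffers must not be pushed through epoch_call()
a second time; bpf_program_buffer_free() is called directly, which is
safe because no reader can still find a descriptor whose last reference
is gone. The general rule, sketched with hypothetical types (inner,
outer):

    struct inner {
            struct epoch_context    ctx;
    };
    struct outer {
            struct epoch_context    ctx;
            struct inner            *inner;
    };

    static void
    inner_free(epoch_context_t ctx)
    {
            free(__containerof(ctx, struct inner, ctx), M_TEMP);
    }

    static void
    outer_free(epoch_context_t ctx)
    {
            struct outer *o;

            o = __containerof(ctx, struct outer, ctx);
            /*
             * Already inside an epoch callback: every reader that could
             * have seen o has drained, so free the inner part directly
             * rather than with a second epoch_call().
             */
            if (o->inner != NULL)
                    inner_free(&o->inner->ctx);
            free(o, M_TEMP);
    }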
 
 /*
@@ -2588,25 +2665,31 @@ bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen)
  * headers are not yet supported).
  */
 void
-bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp)
+bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen,
+    struct bpf_if **driverp)
 {
        struct bpf_if *bp;
 
-       KASSERT(*driverp == NULL, ("bpfattach2: driverp already initialized"));
+       KASSERT(*driverp == NULL,
+           ("bpfattach2: driverp already initialized"));
 
        bp = malloc(sizeof(*bp), M_BPF, M_WAITOK | M_ZERO);
 
-       rw_init(&bp->bif_lock, "bpf interface lock");
-       LIST_INIT(&bp->bif_dlist);
-       LIST_INIT(&bp->bif_wlist);
+       CK_LIST_INIT(&bp->bif_dlist);
+       CK_LIST_INIT(&bp->bif_wlist);
        bp->bif_ifp = ifp;
        bp->bif_dlt = dlt;
        bp->bif_hdrlen = hdrlen;
        bp->bif_bpf = driverp;
+       bp->bif_refcnt = 1;
        *driverp = bp;
-
+       /*
+        * Reference the ifnet pointer, so it won't be freed until
+        * we release it.
+        */
+       if_ref(ifp);
        BPF_LOCK();
-       LIST_INSERT_HEAD(&bpf_iflist, bp, bif_next);
+       CK_LIST_INSERT_HEAD(&bpf_iflist, bp, bif_next);
        BPF_UNLOCK();
 
        if (bootverbose && IS_DEFAULT_VNET(curvnet))
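
The if_ref() taken here is what makes bp->bif_ifp safe to dereference
from the lockless tap path and from bpfwrite(): the ifnet cannot go
away before the matching if_rele() in bpfif_free(), which itself runs
only after the last bpf_if reference is dropped and an epoch grace
period has passed. The pairing, reduced to a sketch (attach_sketch is a
hypothetical name):

    static struct bpf_if *
    attach_sketch(struct ifnet *ifp)
    {
            struct bpf_if *bp;

            bp = malloc(sizeof(*bp), M_BPF, M_WAITOK | M_ZERO);
            bp->bif_refcnt = 1;     /* dropped when the bpf_if is destroyed */
            if_ref(ifp);            /* paired with if_rele() in bpfif_free() */
            bp->bif_ifp = ifp;
            return (bp);
    }
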
@@ -2647,103 +2730,37 @@ bpf_get_bp_params(struct bpf_if *bp, u_int *bif_dlt, u
 void
 bpfdetach(struct ifnet *ifp)
 {
-       struct bpf_if   *bp, *bp_temp;
-       struct bpf_d    *d;
-       int ndetached;
+       struct bpf_if *bp, *bp_temp;
+       struct bpf_d *d;
 
-       ndetached = 0;
-
        BPF_LOCK();

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***