Last diff to trade the KERNEL_LOCK for a mutex in order to protect data
accessed inside bpf_catchpacket().

Note about the multiples data structures:

  - selwakeup() is called in a thread context (task) so we rely on the
    KERNEL_LOCK() to serialize access to kqueue(9) data.

  - the global list of descriptor ``bpf_d_list``, accessed via 
    bpfilter_lookup() is also protected by the KERNEL_LOCK().

  - bpf_get() increments the refcount of a descriptor.  It is needed
    to increment it around ifppromisc() which can sleep.

  - A mutex now protects the rotating buffers and their associated
    fields.
 
 - The dance around uiomove(9) is here to check that buffers aren't
  rotated while data is copied to userland.  Setting ``b->bd_fbuf''
  to NULL should be enough to let bpf_catchpacket() to drop the
  patcket.  But I added ``__in_uiomove'' to be able to have usable
  panic if something weird happen.

Comments, oks?

Index: net/bpf.c
===================================================================
RCS file: /cvs/src/sys/net/bpf.c,v
retrieving revision 1.155
diff -u -p -r1.155 bpf.c
--- net/bpf.c   28 Nov 2016 10:16:08 -0000      1.155
+++ net/bpf.c   28 Nov 2016 14:51:06 -0000
@@ -116,6 +116,9 @@ int bpf_sysctl_locked(int *, u_int, void
 
 struct bpf_d *bpfilter_lookup(int);
 
+/*
+ * Called holding ``bd_mtx''.
+ */
 void   bpf_attachd(struct bpf_d *, struct bpf_if *);
 void   bpf_detachd(struct bpf_d *);
 void   bpf_resetd(struct bpf_d *);
@@ -260,11 +263,12 @@ bpf_movein(struct uio *uio, u_int linkty
 
 /*
  * Attach file to the bpf interface, i.e. make d listen on bp.
- * Must be called at splnet.
  */
 void
 bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
 {
+       MUTEX_ASSERT_LOCKED(&d->bd_mtx);
+
        /*
         * Point d at bp, and add d to the interface's list of listeners.
         * Finally, point the driver's bpf cookie at the interface so
@@ -287,6 +291,8 @@ bpf_detachd(struct bpf_d *d)
 {
        struct bpf_if *bp;
 
+       MUTEX_ASSERT_LOCKED(&d->bd_mtx);
+
        bp = d->bd_bif;
        /* Not attached. */
        if (bp == NULL)
@@ -313,7 +319,13 @@ bpf_detachd(struct bpf_d *d)
                int error;
 
                d->bd_promisc = 0;
+
+               bpf_get(d);
+               mtx_leave(&d->bd_mtx);
                error = ifpromisc(bp->bif_ifp, 0);
+               mtx_enter(&d->bd_mtx);
+               bpf_put(d);
+
                if (error && !(error == EINVAL || error == ENODEV))
                        /*
                         * Something is really wrong if we were able to put
@@ -353,6 +365,7 @@ bpfopen(dev_t dev, int flag, int mode, s
        bd->bd_unit = unit;
        bd->bd_bufsize = bpf_bufsize;
        bd->bd_sig = SIGIO;
+       mtx_init(&bd->bd_mtx, IPL_NET);
        task_set(&bd->bd_wake_task, bpf_wakeup_cb, bd);
 
        if (flag & FNONBLOCK)
@@ -372,15 +385,14 @@ int
 bpfclose(dev_t dev, int flag, int mode, struct proc *p)
 {
        struct bpf_d *d;
-       int s;
 
        d = bpfilter_lookup(minor(dev));
-       s = splnet();
+       mtx_enter(&d->bd_mtx);
        bpf_detachd(d);
        bpf_wakeup(d);
        LIST_REMOVE(d, bd_list);
+       mtx_leave(&d->bd_mtx);
        bpf_put(d);
-       splx(s);
 
        return (0);
 }
@@ -391,11 +403,13 @@ bpfclose(dev_t dev, int flag, int mode, 
  * Zero the length of the new store buffer.
  */
 #define ROTATE_BUFFERS(d) \
+       KASSERT(d->__in_uiomove == 0); \
+       MUTEX_ASSERT_LOCKED(&d->bd_mtx); \
        (d)->bd_hbuf = (d)->bd_sbuf; \
        (d)->bd_hlen = (d)->bd_slen; \
        (d)->bd_sbuf = (d)->bd_fbuf; \
        (d)->bd_slen = 0; \
-       (d)->bd_fbuf = 0;
+       (d)->bd_fbuf = NULL;
 /*
  *  bpfread - read next chunk of packets from buffers
  */
@@ -403,15 +417,17 @@ int
 bpfread(dev_t dev, struct uio *uio, int ioflag)
 {
        struct bpf_d *d;
-       int error;
-       int s;
+       caddr_t hbuf;
+       int hlen, error;
+
+       KERNEL_ASSERT_LOCKED();
 
        d = bpfilter_lookup(minor(dev));
        if (d->bd_bif == NULL)
                return (ENXIO);
 
-       s = splnet();
        bpf_get(d);
+       mtx_enter(&d->bd_mtx);
 
        /*
         * Restrict application to use a buffer the same size as
@@ -460,8 +476,8 @@ bpfread(dev_t dev, struct uio *uio, int 
                        error = EWOULDBLOCK;
                } else {
                        if ((d->bd_rdStart + d->bd_rtout) < ticks) {
-                               error = tsleep((caddr_t)d, PRINET|PCATCH, "bpf",
-                                   d->bd_rtout);
+                               error = msleep(d, &d->bd_mtx, PRINET|PCATCH,
+                                   "bpf", d->bd_rtout);
                        } else
                                error = EWOULDBLOCK;
                }
@@ -492,22 +508,30 @@ bpfread(dev_t dev, struct uio *uio, int 
        /*
         * At this point, we know we have something in the hold slot.
         */
-       splx(s);
+       hbuf = d->bd_hbuf;
+       hlen = d->bd_hlen;
+       d->bd_hbuf = NULL;
+       d->bd_hlen = 0;
+       d->bd_fbuf = NULL;
+       d->__in_uiomove = 1;
 
        /*
         * Move data from hold buffer into user space.
         * We know the entire buffer is transferred since
         * we checked above that the read buffer is bpf_bufsize bytes.
         */
-       error = uiomove(d->bd_hbuf, d->bd_hlen, uio);
-
-       s = splnet();
-       d->bd_fbuf = d->bd_hbuf;
-       d->bd_hbuf = NULL;
-       d->bd_hlen = 0;
+       mtx_leave(&d->bd_mtx);
+       error = uiomove(hbuf, hlen, uio);
+       mtx_enter(&d->bd_mtx);
+
+       /* Ensure that bpf_resetd() or ROTATE_BUFFERS() haven't been called. */
+       KASSERT(d->bd_fbuf == NULL);
+       KASSERT(d->bd_hbuf == NULL);
+       d->bd_fbuf = hbuf;
+       d->__in_uiomove = 0;
 out:
+       mtx_leave(&d->bd_mtx);
        bpf_put(d);
-       splx(s);
 
        return (error);
 }
@@ -519,6 +543,8 @@ out:
 void
 bpf_wakeup(struct bpf_d *d)
 {
+       MUTEX_ASSERT_LOCKED(&d->bd_mtx);
+
        /*
         * As long as csignal() and selwakeup() need to be protected
         * by the KERNEL_LOCK() we have to delay the wakeup to
@@ -542,6 +568,7 @@ bpf_wakeup_cb(void *xd)
 
        selwakeup(&d->bd_sel);
        bpf_put(d);
+
 }
 
 int
@@ -552,17 +579,21 @@ bpfwrite(dev_t dev, struct uio *uio, int
        struct mbuf *m;
        struct bpf_program *bf;
        struct bpf_insn *fcode = NULL;
-       int error, s;
        struct sockaddr_storage dst;
        u_int dlt;
+       int error;
 
-       d = bpfilter_lookup(minor(dev));
-       if (d->bd_bif == NULL)
-               return (ENXIO);
+       KERNEL_ASSERT_LOCKED();
 
+       d = bpfilter_lookup(minor(dev));
        bpf_get(d);
-       ifp = d->bd_bif->bif_ifp;
+       mtx_enter(&d->bd_mtx);
+       if (d->bd_bif == NULL) {
+               error = ENXIO;
+               goto out;
+       }
 
+       ifp = d->bd_bif->bif_ifp;
        if ((ifp->if_flags & IFF_UP) == 0) {
                error = ENETDOWN;
                goto out;
@@ -580,7 +611,9 @@ bpfwrite(dev_t dev, struct uio *uio, int
 
        dlt = d->bd_bif->bif_dlt;
 
+       mtx_leave(&d->bd_mtx);
        error = bpf_movein(uio, dlt, &m, (struct sockaddr *)&dst, fcode);
+       mtx_enter(&d->bd_mtx);
        if (error)
                goto out;
 
@@ -596,23 +629,25 @@ bpfwrite(dev_t dev, struct uio *uio, int
        if (d->bd_hdrcmplt && dst.ss_family == AF_UNSPEC)
                dst.ss_family = pseudo_AF_HDRCMPLT;
 
-       s = splsoftnet();
        error = ifp->if_output(ifp, m, (struct sockaddr *)&dst, NULL);
-       splx(s);
-
 out:
+       mtx_leave(&d->bd_mtx);
        bpf_put(d);
+
        return (error);
 }
 
 /*
  * Reset a descriptor by flushing its packet buffer and clearing the
- * receive and drop counts.  Should be called at splnet.
+ * receive and drop counts.
  */
 void
 bpf_resetd(struct bpf_d *d)
 {
-       if (d->bd_hbuf) {
+       MUTEX_ASSERT_LOCKED(&d->bd_mtx);
+       KASSERT(d->__in_uiomove == 0);
+
+       if (d->bd_hbuf != NULL) {
                /* Free the hold buffer. */
                d->bd_fbuf = d->bd_hbuf;
                d->bd_hbuf = NULL;
@@ -646,7 +681,7 @@ int
 bpfioctl(dev_t dev, u_long cmd, caddr_t addr, int flag, struct proc *p)
 {
        struct bpf_d *d;
-       int s, error = 0;
+       int error = 0;
 
        d = bpfilter_lookup(minor(dev));
        if (d->bd_locked && suser(p, 0) != 0) {
@@ -674,8 +709,9 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t 
                }
        }
 
-       switch (cmd) {
+       bpf_get(d);
 
+       switch (cmd) {
        default:
                error = EINVAL;
                break;
@@ -687,11 +723,11 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t 
                {
                        int n;
 
-                       s = splnet();
+                       mtx_enter(&d->bd_mtx);
                        n = d->bd_slen;
-                       if (d->bd_hbuf)
+                       if (d->bd_hbuf != NULL)
                                n += d->bd_hlen;
-                       splx(s);
+                       mtx_leave(&d->bd_mtx);
 
                        *(int *)addr = n;
                        break;
@@ -717,7 +753,9 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t 
                                *(u_int *)addr = size = bpf_maxbufsize;
                        else if (size < BPF_MINBUFSIZE)
                                *(u_int *)addr = size = BPF_MINBUFSIZE;
+                       mtx_enter(&d->bd_mtx);
                        d->bd_bufsize = size;
+                       mtx_leave(&d->bd_mtx);
                }
                break;
 
@@ -739,9 +777,9 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t 
         * Flush read packet buffer.
         */
        case BIOCFLUSH:
-               s = splnet();
+               mtx_enter(&d->bd_mtx);
                bpf_resetd(d);
-               splx(s);
+               mtx_leave(&d->bd_mtx);
                break;
 
        /*
@@ -753,15 +791,14 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t 
                         * No interface attached yet.
                         */
                        error = EINVAL;
-                       break;
-               }
-               s = splnet();
-               if (d->bd_promisc == 0) {
-                       error = ifpromisc(d->bd_bif->bif_ifp, 1);
-                       if (error == 0)
-                               d->bd_promisc = 1;
+               } else {
+                       if (d->bd_promisc == 0) {
+                               MUTEX_ASSERT_UNLOCKED(&d->bd_mtx);
+                               error = ifpromisc(d->bd_bif->bif_ifp, 1);
+                               if (error == 0)
+                                       d->bd_promisc = 1;
+                       }
                }
-               splx(s);
                break;
 
        /*
@@ -778,30 +815,36 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t 
         * Get device parameters.
         */
        case BIOCGDLT:
+               mtx_enter(&d->bd_mtx);
                if (d->bd_bif == NULL)
                        error = EINVAL;
                else
                        *(u_int *)addr = d->bd_bif->bif_dlt;
+               mtx_leave(&d->bd_mtx);
                break;
 
        /*
         * Set device parameters.
         */
        case BIOCSDLT:
+               mtx_enter(&d->bd_mtx);
                if (d->bd_bif == NULL)
                        error = EINVAL;
                else
                        error = bpf_setdlt(d, *(u_int *)addr);
+               mtx_leave(&d->bd_mtx);
                break;
 
        /*
         * Set interface name.
         */
        case BIOCGETIF:
+               mtx_enter(&d->bd_mtx);
                if (d->bd_bif == NULL)
                        error = EINVAL;
                else
                        bpf_ifname(d->bd_bif->bif_ifp, (struct ifreq *)addr);
+               mtx_leave(&d->bd_mtx);
                break;
 
        /*
@@ -939,6 +982,8 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t 
                *(u_int *)addr = d->bd_sig;
                break;
        }
+
+       bpf_put(d);
        return (error);
 }
 
@@ -953,7 +998,6 @@ bpf_setf(struct bpf_d *d, struct bpf_pro
        struct srp *filter;
        struct bpf_insn *fcode;
        u_int flen, size;
-       int s;
 
        KERNEL_ASSERT_LOCKED();
        filter = wf ? &d->bd_wfilter : &d->bd_rfilter;
@@ -962,9 +1006,9 @@ bpf_setf(struct bpf_d *d, struct bpf_pro
                if (fp->bf_len != 0)
                        return (EINVAL);
                srp_update_locked(&bpf_insn_gc, filter, NULL);
-               s = splnet();
+               mtx_enter(&d->bd_mtx);
                bpf_resetd(d);
-               splx(s);
+               mtx_leave(&d->bd_mtx);
                return (0);
        }
        flen = fp->bf_len;
@@ -989,9 +1033,9 @@ bpf_setf(struct bpf_d *d, struct bpf_pro
 
        srp_update_locked(&bpf_insn_gc, filter, bf);
 
-       s = splnet();
+       mtx_enter(&d->bd_mtx);
        bpf_resetd(d);
-       splx(s);
+       mtx_leave(&d->bd_mtx);
        return (0);
 }
 
@@ -1005,7 +1049,6 @@ bpf_setif(struct bpf_d *d, struct ifreq 
 {
        struct bpf_if *bp, *candidate = NULL;
        int error = 0;
-       int s;
 
        /*
         * Look through attached interfaces for the named one.
@@ -1030,7 +1073,7 @@ bpf_setif(struct bpf_d *d, struct ifreq 
         * If we're already attached to requested interface,
         * just flush the buffer.
         */
-       s = splnet();
+       mtx_enter(&d->bd_mtx);
        if (d->bd_sbuf == NULL) {
                if ((error = bpf_allocbufs(d)))
                        goto out;
@@ -1044,7 +1087,7 @@ bpf_setif(struct bpf_d *d, struct ifreq 
        }
        bpf_resetd(d);
 out:
-       splx(s);
+       mtx_leave(&d->bd_mtx);
        return (error);
 }
 
@@ -1064,7 +1107,9 @@ int
 bpfpoll(dev_t dev, int events, struct proc *p)
 {
        struct bpf_d *d;
-       int s, revents;
+       int revents;
+
+       KERNEL_ASSERT_LOCKED();
 
        /*
         * An imitation of the FIONREAD ioctl code.
@@ -1085,7 +1130,7 @@ bpfpoll(dev_t dev, int events, struct pr
        revents = events & (POLLOUT | POLLWRNORM);
 
        if (events & (POLLIN | POLLRDNORM)) {
-               s = splnet();
+               mtx_enter(&d->bd_mtx);
                if (d->bd_hlen != 0 || (d->bd_immediate && d->bd_slen != 0))
                        revents |= events & (POLLIN | POLLRDNORM);
                else {
@@ -1097,7 +1142,7 @@ bpfpoll(dev_t dev, int events, struct pr
                                d->bd_rdStart = ticks;
                        selrecord(p, &d->bd_sel);
                }
-               splx(s);
+               mtx_leave(&d->bd_mtx);
        }
        return (revents);
 }
@@ -1110,9 +1155,11 @@ bpfkqfilter(dev_t dev, struct knote *kn)
 {
        struct bpf_d *d;
        struct klist *klist;
-       int s;
+
+       KERNEL_ASSERT_LOCKED();
 
        d = bpfilter_lookup(minor(dev));
+
        switch (kn->kn_filter) {
        case EVFILT_READ:
                klist = &d->bd_sel.si_note;
@@ -1122,14 +1169,14 @@ bpfkqfilter(dev_t dev, struct knote *kn)
                return (EINVAL);
        }
 
-       kn->kn_hook = d;
-
-       s = splnet();
        bpf_get(d);
+       kn->kn_hook = d;
        SLIST_INSERT_HEAD(klist, kn, kn_selnext);
+
+       mtx_enter(&d->bd_mtx);
        if (d->bd_rtout != -1 && d->bd_rdStart == 0)
                d->bd_rdStart = ticks;
-       splx(s);
+       mtx_leave(&d->bd_mtx);
 
        return (0);
 }
@@ -1138,12 +1185,11 @@ void
 filt_bpfrdetach(struct knote *kn)
 {
        struct bpf_d *d = kn->kn_hook;
-       int s;
 
-       s = splnet();
+       KERNEL_ASSERT_LOCKED();
+
        SLIST_REMOVE(&d->bd_sel.si_note, kn, knote, kn_selnext);
        bpf_put(d);
-       splx(s);
 }
 
 int
@@ -1151,6 +1197,8 @@ filt_bpfread(struct knote *kn, long hint
 {
        struct bpf_d *d = kn->kn_hook;
 
+       KERNEL_ASSERT_LOCKED();
+
        kn->kn_data = d->bd_hlen;
        if (d->bd_immediate)
                kn->kn_data += d->bd_slen;
@@ -1196,7 +1244,6 @@ _bpf_mtap(caddr_t arg, const struct mbuf
        struct timeval tv;
        int gottime = 0;
        int drop = 0;
-       int s;
 
        if (m == NULL)
                return (0);
@@ -1232,12 +1279,10 @@ _bpf_mtap(caddr_t arg, const struct mbuf
                        if (!gottime++)
                                microtime(&tv);
 
-                       KERNEL_LOCK();
-                       s = splnet();
+                       mtx_enter(&d->bd_mtx);
                        bpf_catchpacket(d, (u_char *)m, pktlen, slen, cpfn,
                            &tv);
-                       splx(s);
-                       KERNEL_UNLOCK();
+                       mtx_leave(&d->bd_mtx);
 
                        if (d->bd_fildrop)
                                drop = 1;
@@ -1367,6 +1412,7 @@ bpf_catchpacket(struct bpf_d *d, u_char 
        int totlen, curlen;
        int hdrlen, do_wakeup = 0;
 
+       MUTEX_ASSERT_LOCKED(&d->bd_mtx);
        if (d->bd_bif == NULL)
                return;
 
@@ -1450,6 +1496,8 @@ bpf_catchpacket(struct bpf_d *d, u_char 
 int
 bpf_allocbufs(struct bpf_d *d)
 {
+       MUTEX_ASSERT_LOCKED(&d->bd_mtx);
+
        d->bd_fbuf = malloc(d->bd_bufsize, M_DEVBUF, M_NOWAIT);
        if (d->bd_fbuf == NULL)
                return (ENOMEM);
@@ -1614,6 +1662,8 @@ bpfilter_lookup(int unit)
 {
        struct bpf_d *bd;
 
+       KERNEL_ASSERT_LOCKED();
+
        LIST_FOREACH(bd, &bpf_d_list, bd_list)
                if (bd->bd_unit == unit)
                        return (bd);
@@ -1657,10 +1707,10 @@ bpf_getdltlist(struct bpf_d *d, struct b
 int
 bpf_setdlt(struct bpf_d *d, u_int dlt)
 {
-       int s;
        struct ifnet *ifp;
        struct bpf_if *bp;
 
+       MUTEX_ASSERT_LOCKED(&d->bd_mtx);
        if (d->bd_bif->bif_dlt == dlt)
                return (0);
        ifp = d->bd_bif->bif_ifp;
@@ -1670,11 +1720,9 @@ bpf_setdlt(struct bpf_d *d, u_int dlt)
        }
        if (bp == NULL)
                return (EINVAL);
-       s = splnet();
        bpf_detachd(d);
        bpf_attachd(d, bp);
        bpf_resetd(d);
-       splx(s);
        return (0);
 }
 
Index: net/bpfdesc.h
===================================================================
RCS file: /cvs/src/sys/net/bpfdesc.h,v
retrieving revision 1.31
diff -u -p -r1.31 bpfdesc.h
--- net/bpfdesc.h       22 Aug 2016 10:40:36 -0000      1.31
+++ net/bpfdesc.h       28 Nov 2016 14:51:06 -0000
@@ -56,15 +56,17 @@ struct bpf_d {
         *   fbuf (free) - When read is done, put cluster here.
         * On receiving, if sbuf is full and fbuf is 0, packet is dropped.
         */
+       struct mutex    bd_mtx;         /* protect buffer slots below */
        caddr_t         bd_sbuf;        /* store slot */
        caddr_t         bd_hbuf;        /* hold slot */
        caddr_t         bd_fbuf;        /* free slot */
        int             bd_slen;        /* current length of store buffer */
        int             bd_hlen;        /* current length of hold buffer */
-
        int             bd_bufsize;     /* absolute length of buffers */
 
-       struct bpf_if * bd_bif;         /* interface descriptor */
+       int             __in_uiomove;
+
+       struct bpf_if  *bd_bif;         /* interface descriptor */
        u_long          bd_rtout;       /* Read timeout in 'ticks' */
        u_long          bd_rdStart;     /* when the read started */
        struct srp      bd_rfilter;     /* read filter code */

Reply via email to