Module Name:    src
Committed By:   ozaki-r
Date:           Thu Feb  9 09:30:26 UTC 2017

Modified Files:
        src/sys/net: bpf.c bpfdesc.h if.c

Log Message:
Make bpf MP-safe

With this change, bpf_mtap can run without taking any locks as long as its
bpf filter doesn't match a target packet. Pushing data to a bpf buffer still
needs a lock; removing that lock requires big changes and is left as future
work.

Another known issue is that we need to retain some obsolete member
variables to avoid breaking kvm(3) users such as netstat and fstat. One
problem for MP-ification is that, in order to keep the statistics counters
of bpf_d, we need to update them with atomic operations. Once we retire
the kvm(3) users, we should make the counters per-CPU and remove the
atomic operations.
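
To make that counter trade-off concrete, here is a hedged sketch of both
schemes; the atomic form is what this commit does to bd_rcount and
friends, while the percpu(9) form is the direction described above (the
pktstat names are illustrative, not part of the commit):

#include <sys/atomic.h>
#include <sys/percpu.h>

static u_long	 pkt_rcount;	/* shared; every CPU writes this line */
static percpu_t	*pktstat_pc;	/* percpu_alloc(sizeof(u_long)) at init */

static void
count_atomic(void)
{

	atomic_inc_ulong(&pkt_rcount);	/* MP-safe, but cache-line bouncing */
}

static void
count_percpu(void)
{
	u_long *p;

	p = percpu_getref(pktstat_pc);	/* prevents migration off this CPU */
	(*p)++;				/* plain increment, no atomics */
	percpu_putref(pktstat_pc);
}

/* Aggregation over all CPUs, e.g. for a sysctl handler. */
static void
pktstat_sum_cb(void *p, void *arg, struct cpu_info *ci)
{

	*(u_long *)arg += *(u_long *)p;
}

static u_long
pktstat_total(void)
{
	u_long total = 0;

	percpu_foreach(pktstat_pc, pktstat_sum_cb, &total);
	return total;
}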


To generate a diff of this commit:
cvs rdiff -u -r1.212 -r1.213 src/sys/net/bpf.c
cvs rdiff -u -r1.43 -r1.44 src/sys/net/bpfdesc.h
cvs rdiff -u -r1.375 -r1.376 src/sys/net/if.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/net/bpf.c
diff -u src/sys/net/bpf.c:1.212 src/sys/net/bpf.c:1.213
--- src/sys/net/bpf.c:1.212	Wed Feb  1 08:18:33 2017
+++ src/sys/net/bpf.c	Thu Feb  9 09:30:26 2017
@@ -1,4 +1,4 @@
-/*	$NetBSD: bpf.c,v 1.212 2017/02/01 08:18:33 ozaki-r Exp $	*/
+/*	$NetBSD: bpf.c,v 1.213 2017/02/09 09:30:26 ozaki-r Exp $	*/
 
 /*
  * Copyright (c) 1990, 1991, 1993
@@ -39,7 +39,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: bpf.c,v 1.212 2017/02/01 08:18:33 ozaki-r Exp $");
+__KERNEL_RCSID(0, "$NetBSD: bpf.c,v 1.213 2017/02/09 09:30:26 ozaki-r Exp $");
 
 #if defined(_KERNEL_OPT)
 #include "opt_bpf.h"
@@ -77,6 +77,8 @@ __KERNEL_RCSID(0, "$NetBSD: bpf.c,v 1.21
 #include <sys/kauth.h>
 #include <sys/syslog.h>
 #include <sys/percpu.h>
+#include <sys/pserialize.h>
+#include <sys/lwp.h>
 
 #include <net/if.h>
 #include <net/slip.h>
@@ -132,11 +134,48 @@ static struct percpu	*bpf_gstats_percpu;
 	}
 
 /*
+ * Locking notes:
+ * - bpf_mtx (adaptive mutex) protects:
+ *   - Global lists: bpf_iflist and bpf_dlist
+ *   - struct bpf_if
+ *   - bpf_close
+ *   - bpf_psz (pserialize)
+ * - struct bpf_d has two mutexes:
+ *   - bd_buf_mtx (spin mutex) protects the buffers that can be accessed
+ *     on packet tapping
+ *   - bd_mtx (adaptive mutex) protects member variables other than the buffers
+ * - Locking order: bpf_mtx => bpf_d#bd_mtx => bpf_d#bd_buf_mtx
+ * - struct bpf_d obtained via fp->f_bpf in bpf_read and bpf_write is
+ *   never freed because struct bpf_d is only freed in bpf_close and
+ *   bpf_close is never called while executing bpf_read and bpf_write
+ * - A filter that is assigned to bpf_d can be replaced with another filter
+ *   while tapping packets, so it needs to be done atomically
+ * - struct bpf_d is iterated on bpf_dlist with psz
+ * - struct bpf_if is iterated on bpf_iflist with psz or psref
+ */
+/*
  * Use a mutex to avoid a race condition between gathering the stats/peers
  * and opening/closing the device.
  */
 static kmutex_t bpf_mtx;
 
+static struct psref_class	*bpf_psref_class __read_mostly;
+static pserialize_t		bpf_psz;
+
+static inline void
+bpf_if_acquire(struct bpf_if *bp, struct psref *psref)
+{
+
+	psref_acquire(psref, &bp->bif_psref, bpf_psref_class);
+}
+
+static inline void
+bpf_if_release(struct bpf_if *bp, struct psref *psref)
+{
+
+	psref_release(psref, &bp->bif_psref, bpf_psref_class);
+}
+
 /*
  *  bpf_iflist is the list of interfaces; each corresponds to an ifnet
  *  bpf_dtab holds the descriptors, indexed by minor device #
@@ -201,6 +240,7 @@ static void	bpf_deliver(struct bpf_if *,
 		            void *(*cpfn)(void *, const void *, size_t),
 		            void *, u_int, u_int, const bool);
 static void	bpf_freed(struct bpf_d *);
+static void	bpf_free_filter(struct bpf_filter *);
 static void	bpf_ifname(struct ifnet *, struct ifreq *);
 static void	*bpf_mcpy(void *, const void *, size_t);
 static int	bpf_movein(struct uio *, int, uint64_t,
@@ -404,7 +444,9 @@ bad:
 static void
 bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
 {
+
 	KASSERT(mutex_owned(&bpf_mtx));
+	KASSERT(mutex_owned(d->bd_mtx));
 	/*
 	 * Point d at bp, and add d to the interface's list of listeners.
 	 * Finally, point the driver's bpf cookie at the interface so
@@ -425,6 +467,7 @@ bpf_detachd(struct bpf_d *d)
 	struct bpf_if *bp;
 
 	KASSERT(mutex_owned(&bpf_mtx));
+	KASSERT(mutex_owned(d->bd_mtx));
 
 	bp = d->bd_bif;
 	/*
@@ -442,7 +485,13 @@ bpf_detachd(struct bpf_d *d)
 		 * the interface was configured down, so only panic
 		 * if we don't get an unexpected error.
 		 */
+#ifndef NET_MPSAFE
+		KERNEL_LOCK(1, NULL);
+#endif
   		error = ifpromisc(bp->bif_ifp, 0);
+#ifndef NET_MPSAFE
+		KERNEL_UNLOCK_ONE(NULL);
+#endif
 #ifdef DIAGNOSTIC
 		if (error)
 			printf("%s: ifpromisc failed: %d", __func__, error);
@@ -452,11 +501,8 @@ bpf_detachd(struct bpf_d *d)
 	/* Remove d from the interface's descriptor list. */
 	BPFIF_DLIST_WRITER_REMOVE(d);
 
-	/* TODO pserialize_perform(); */
-	/* TODO psref_target_destroy(); */
-	BPFIF_DLIST_ENTRY_DESTROY(d);
+	pserialize_perform(bpf_psz);
 
-	/* XXX NOMPSAFE? */
 	if (BPFIF_DLIST_WRITER_EMPTY(bp)) {
 		/*
 		 * Let the driver know that there are no more listeners.
@@ -471,6 +517,8 @@ bpf_init(void)
 {
 
 	mutex_init(&bpf_mtx, MUTEX_DEFAULT, IPL_NONE);
+	bpf_psz = pserialize_create();
+	bpf_psref_class = psref_class_create("bpf", IPL_SOFTNET);
 
 	PSLIST_INIT(&bpf_iflist);
 	PSLIST_INIT(&bpf_dlist);
@@ -521,9 +569,11 @@ bpfopen(dev_t dev, int flag, int mode, s
 	selinit(&d->bd_sel);
 	d->bd_sih = softint_establish(SOFTINT_CLOCK, bpf_softintr, d);
 	d->bd_jitcode = NULL;
+	d->bd_filter = NULL;
 	BPF_DLIST_ENTRY_INIT(d);
 	BPFIF_DLIST_ENTRY_INIT(d);
-	d->bd_mtx = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NET);
+	d->bd_mtx = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SOFTNET);
+	d->bd_buf_mtx = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NET);
 	cv_init(&d->bd_cv, "bpf");
 
 	mutex_enter(&bpf_mtx);
@@ -542,14 +592,11 @@ static int
 bpf_close(struct file *fp)
 {
 	struct bpf_d *d;
-	int s;
 
-	KERNEL_LOCK(1, NULL);
 	mutex_enter(&bpf_mtx);
 
 	if ((d = fp->f_bpf) == NULL) {
 		mutex_exit(&bpf_mtx);
-		KERNEL_UNLOCK_ONE(NULL);
 		return 0;
 	}
 
@@ -558,28 +605,28 @@ bpf_close(struct file *fp)
 	 */
 	d->bd_pid = curproc->p_pid;
 
-	s = splnet();
+	mutex_enter(d->bd_mtx);
 	if (d->bd_state == BPF_WAITING)
-		callout_stop(&d->bd_callout);
+		callout_halt(&d->bd_callout, d->bd_mtx);
 	d->bd_state = BPF_IDLE;
 	if (d->bd_bif)
 		bpf_detachd(d);
-	splx(s);
-	bpf_freed(d);
+	mutex_exit(d->bd_mtx);
+
 	BPF_DLIST_WRITER_REMOVE(d);
-	fp->f_bpf = NULL;
 
+	pserialize_perform(bpf_psz);
 	mutex_exit(&bpf_mtx);
-	KERNEL_UNLOCK_ONE(NULL);
 
-	/* TODO pserialize_perform(); */
-	/* TODO psref_target_destroy(); */
+	BPFIF_DLIST_ENTRY_DESTROY(d);
 	BPF_DLIST_ENTRY_DESTROY(d);
-
+	fp->f_bpf = NULL;
+	bpf_freed(d);
 	callout_destroy(&d->bd_callout);
 	seldestroy(&d->bd_sel);
 	softint_disestablish(d->bd_sih);
 	mutex_obj_free(d->bd_mtx);
+	mutex_obj_free(d->bd_buf_mtx);
 	cv_destroy(&d->bd_cv);
 
 	kmem_free(d, sizeof(*d));
@@ -608,7 +655,6 @@ bpf_read(struct file *fp, off_t *offp, s
 	struct bpf_d *d = fp->f_bpf;
 	int timed_out;
 	int error;
-	int s;
 
 	getnanotime(&d->bd_atime);
 	/*
@@ -618,17 +664,18 @@ bpf_read(struct file *fp, off_t *offp, s
 	if (uio->uio_resid != d->bd_bufsize)
 		return (EINVAL);
 
-	KERNEL_LOCK(1, NULL);
-	s = splnet();
+	mutex_enter(d->bd_mtx);
 	if (d->bd_state == BPF_WAITING)
-		callout_stop(&d->bd_callout);
+		callout_halt(&d->bd_callout, d->bd_buf_mtx);
 	timed_out = (d->bd_state == BPF_TIMED_OUT);
 	d->bd_state = BPF_IDLE;
+	mutex_exit(d->bd_mtx);
 	/*
 	 * If the hold buffer is empty, then do a timed sleep, which
 	 * ends when the timeout expires or when enough packets
 	 * have arrived to fill the store buffer.
 	 */
+	mutex_enter(d->bd_buf_mtx);
 	while (d->bd_hbuf == NULL) {
 		if (fp->f_flag & FNONBLOCK) {
 			if (d->bd_slen == 0) {
@@ -649,9 +696,7 @@ bpf_read(struct file *fp, off_t *offp, s
 			break;
 		}
 
-		mutex_enter(d->bd_mtx);
-		error = cv_timedwait_sig(&d->bd_cv, d->bd_mtx, d->bd_rtout);
-		mutex_exit(d->bd_mtx);
+		error = cv_timedwait_sig(&d->bd_cv, d->bd_buf_mtx, d->bd_rtout);
 
 		if (error == EINTR || error == ERESTART)
 			goto out;
@@ -683,7 +728,7 @@ bpf_read(struct file *fp, off_t *offp, s
 	/*
 	 * At this point, we know we have something in the hold slot.
 	 */
-	splx(s);
+	mutex_exit(d->bd_buf_mtx);
 
 	/*
 	 * Move data from hold buffer into user space.
@@ -692,13 +737,12 @@ bpf_read(struct file *fp, off_t *offp, s
 	 */
 	error = uiomove(d->bd_hbuf, d->bd_hlen, uio);
 
-	s = splnet();
+	mutex_enter(d->bd_buf_mtx);
 	d->bd_fbuf = d->bd_hbuf;
 	d->bd_hbuf = NULL;
 	d->bd_hlen = 0;
 out:
-	splx(s);
-	KERNEL_UNLOCK_ONE(NULL);
+	mutex_exit(d->bd_buf_mtx);
 	return (error);
 }
 
@@ -710,9 +754,9 @@ static inline void
 bpf_wakeup(struct bpf_d *d)
 {
 
-	mutex_enter(d->bd_mtx);
+	mutex_enter(d->bd_buf_mtx);
 	cv_broadcast(&d->bd_cv);
-	mutex_exit(d->bd_mtx);
+	mutex_exit(d->bd_buf_mtx);
 
 	if (d->bd_async)
 		softint_schedule(d->bd_sih);
@@ -733,15 +777,14 @@ static void
 bpf_timed_out(void *arg)
 {
 	struct bpf_d *d = arg;
-	int s;
 
-	s = splnet();
+	mutex_enter(d->bd_mtx);
 	if (d->bd_state == BPF_WAITING) {
 		d->bd_state = BPF_TIMED_OUT;
 		if (d->bd_slen != 0)
 			bpf_wakeup(d);
 	}
-	splx(s);
+	mutex_exit(d->bd_mtx);
 }
 
 
@@ -750,39 +793,49 @@ bpf_write(struct file *fp, off_t *offp, 
     kauth_cred_t cred, int flags)
 {
 	struct bpf_d *d = fp->f_bpf;
+	struct bpf_if *bp;
 	struct ifnet *ifp;
 	struct mbuf *m, *mc;
-	int error, s;
+	int error;
 	static struct sockaddr_storage dst;
+	struct psref psref;
+	int bound;
 
 	m = NULL;	/* XXX gcc */
 
-	KERNEL_LOCK(1, NULL);
-
-	if (d->bd_bif == NULL) {
-		KERNEL_UNLOCK_ONE(NULL);
-		return (ENXIO);
+	bound = curlwp_bind();
+	mutex_enter(d->bd_mtx);
+	bp = d->bd_bif;
+	if (bp == NULL) {
+		mutex_exit(d->bd_mtx);
+		error = ENXIO;
+		goto out_bindx;
 	}
+	bpf_if_acquire(bp, &psref);
+	mutex_exit(d->bd_mtx);
+
 	getnanotime(&d->bd_mtime);
 
-	ifp = d->bd_bif->bif_ifp;
+	ifp = bp->bif_ifp;
+	if (if_is_deactivated(ifp)) {
+		error = ENXIO;
+		goto out;
+	}
 
 	if (uio->uio_resid == 0) {
-		KERNEL_UNLOCK_ONE(NULL);
-		return (0);
+		error = 0;
+		goto out;
 	}
 
-	error = bpf_movein(uio, (int)d->bd_bif->bif_dlt, ifp->if_mtu, &m,
+	error = bpf_movein(uio, (int)bp->bif_dlt, ifp->if_mtu, &m,
 		(struct sockaddr *) &dst);
-	if (error) {
-		KERNEL_UNLOCK_ONE(NULL);
-		return (error);
-	}
+	if (error)
+		goto out;
 
 	if (m->m_pkthdr.len > ifp->if_mtu) {
-		KERNEL_UNLOCK_ONE(NULL);
 		m_freem(m);
-		return (EMSGSIZE);
+		error = EMSGSIZE;
+		goto out;
 	}
 
 	if (d->bd_hdrcmplt)
@@ -798,7 +851,6 @@ bpf_write(struct file *fp, off_t *offp, 
 	} else  
 		mc = NULL;
 
-	s = splsoftnet();
 	error = if_output_lock(ifp, ifp, m, (struct sockaddr *) &dst, NULL);
 
 	if (mc != NULL) {
@@ -807,12 +859,14 @@ bpf_write(struct file *fp, off_t *offp, 
 		else
 			m_freem(mc);
 	}
-	splx(s);
-	KERNEL_UNLOCK_ONE(NULL);
 	/*
 	 * The driver frees the mbuf.
 	 */
-	return (error);
+out:
+	bpf_if_release(bp, &psref);
+out_bindx:
+	curlwp_bindx(bound);
+	return error;
 }
 
 /*
@@ -822,6 +876,10 @@ bpf_write(struct file *fp, off_t *offp, 
 static void
 reset_d(struct bpf_d *d)
 {
+
+	KASSERT(mutex_owned(d->bd_mtx));
+
+	mutex_enter(d->bd_buf_mtx);
 	if (d->bd_hbuf) {
 		/* Free the hold buffer. */
 		d->bd_fbuf = d->bd_hbuf;
@@ -832,6 +890,7 @@ reset_d(struct bpf_d *d)
 	d->bd_rcount = 0;
 	d->bd_dcount = 0;
 	d->bd_ccount = 0;
+	mutex_exit(d->bd_buf_mtx);
 }
 
 /*
@@ -860,12 +919,11 @@ static int
 bpf_ioctl(struct file *fp, u_long cmd, void *addr)
 {
 	struct bpf_d *d = fp->f_bpf;
-	int s, error = 0;
+	int error = 0;
 
 	/*
 	 * Refresh the PID associated with this bpf file.
 	 */
-	KERNEL_LOCK(1, NULL);
 	d->bd_pid = curproc->p_pid;
 #ifdef _LP64
 	if (curproc->p_flag & PK_32)
@@ -874,11 +932,11 @@ bpf_ioctl(struct file *fp, u_long cmd, v
 		d->bd_compat32 = 0;
 #endif
 
-	s = splnet();
+	mutex_enter(d->bd_mtx);
 	if (d->bd_state == BPF_WAITING)
-		callout_stop(&d->bd_callout);
+		callout_halt(&d->bd_callout, d->bd_mtx);
 	d->bd_state = BPF_IDLE;
-	splx(s);
+	mutex_exit(d->bd_mtx);
 
 	switch (cmd) {
 
@@ -893,11 +951,11 @@ bpf_ioctl(struct file *fp, u_long cmd, v
 		{
 			int n;
 
-			s = splnet();
+			mutex_enter(d->bd_buf_mtx);
 			n = d->bd_slen;
 			if (d->bd_hbuf)
 				n += d->bd_hlen;
-			splx(s);
+			mutex_exit(d->bd_buf_mtx);
 
 			*(int *)addr = n;
 			break;
@@ -918,6 +976,8 @@ bpf_ioctl(struct file *fp, u_long cmd, v
 		 * Forbid to change the buffer length if buffers are already
 		 * allocated.
 		 */
+		mutex_enter(d->bd_mtx);
+		mutex_enter(d->bd_buf_mtx);
 		if (d->bd_bif != NULL || d->bd_sbuf != NULL)
 			error = EINVAL;
 		else {
@@ -929,6 +989,8 @@ bpf_ioctl(struct file *fp, u_long cmd, v
 				*(u_int *)addr = size = BPF_MINBUFSIZE;
 			d->bd_bufsize = size;
 		}
+		mutex_exit(d->bd_buf_mtx);
+		mutex_exit(d->bd_mtx);
 		break;
 
 	/*
@@ -942,49 +1004,60 @@ bpf_ioctl(struct file *fp, u_long cmd, v
 	 * Flush read packet buffer.
 	 */
 	case BIOCFLUSH:
-		s = splnet();
+		mutex_enter(d->bd_mtx);
 		reset_d(d);
-		splx(s);
+		mutex_exit(d->bd_mtx);
 		break;
 
 	/*
 	 * Put interface into promiscuous mode.
 	 */
 	case BIOCPROMISC:
+		mutex_enter(d->bd_mtx);
 		if (d->bd_bif == NULL) {
+			mutex_exit(d->bd_mtx);
 			/*
 			 * No interface attached yet.
 			 */
 			error = EINVAL;
 			break;
 		}
-		s = splnet();
 		if (d->bd_promisc == 0) {
+#ifndef NET_MPSAFE
+			KERNEL_LOCK(1, NULL);
+#endif
 			error = ifpromisc(d->bd_bif->bif_ifp, 1);
+#ifndef NET_MPSAFE
+			KERNEL_UNLOCK_ONE(NULL);
+#endif
 			if (error == 0)
 				d->bd_promisc = 1;
 		}
-		splx(s);
+		mutex_exit(d->bd_mtx);
 		break;
 
 	/*
 	 * Get device parameters.
 	 */
 	case BIOCGDLT:
+		mutex_enter(d->bd_mtx);
 		if (d->bd_bif == NULL)
 			error = EINVAL;
 		else
 			*(u_int *)addr = d->bd_bif->bif_dlt;
+		mutex_exit(d->bd_mtx);
 		break;
 
 	/*
 	 * Get a list of supported device parameters.
 	 */
 	case BIOCGDLTLIST:
+		mutex_enter(d->bd_mtx);
 		if (d->bd_bif == NULL)
 			error = EINVAL;
 		else
 			error = bpf_getdltlist(d, addr);
+		mutex_exit(d->bd_mtx);
 		break;
 
 	/*
@@ -992,10 +1065,12 @@ bpf_ioctl(struct file *fp, u_long cmd, v
 	 */
 	case BIOCSDLT:
 		mutex_enter(&bpf_mtx);
+		mutex_enter(d->bd_mtx);
 		if (d->bd_bif == NULL)
 			error = EINVAL;
 		else
 			error = bpf_setdlt(d, *(u_int *)addr);
+		mutex_exit(d->bd_mtx);
 		mutex_exit(&bpf_mtx);
 		break;
 
@@ -1006,10 +1081,12 @@ bpf_ioctl(struct file *fp, u_long cmd, v
 	case OBIOCGETIF:
 #endif
 	case BIOCGETIF:
+		mutex_enter(d->bd_mtx);
 		if (d->bd_bif == NULL)
 			error = EINVAL;
 		else
 			bpf_ifname(d->bd_bif->bif_ifp, addr);
+		mutex_exit(d->bd_mtx);
 		break;
 
 	/*
@@ -1175,7 +1252,6 @@ bpf_ioctl(struct file *fp, u_long cmd, v
 		error = fgetown(d->bd_pgid, cmd, addr);
 		break;
 	}
-	KERNEL_UNLOCK_ONE(NULL);
 	return (error);
 }
 
@@ -1186,10 +1262,10 @@ bpf_ioctl(struct file *fp, u_long cmd, v
 static int
 bpf_setf(struct bpf_d *d, struct bpf_program *fp)
 {
-	struct bpf_insn *fcode, *old;
-	bpfjit_func_t jcode, oldj;
-	size_t flen, size = 0, old_size;
-	int s;
+	struct bpf_insn *fcode;
+	bpfjit_func_t jcode;
+	size_t flen, size = 0;
+	struct bpf_filter *oldf, *newf;
 
 	jcode = NULL;
 	flen = fp->bf_len;
@@ -1217,23 +1293,25 @@ bpf_setf(struct bpf_d *d, struct bpf_pro
 		fcode = NULL;
 	}
 
-	old_size = d->bd_filter_size;
+	newf = kmem_alloc(sizeof(*newf), KM_SLEEP);
+	newf->bf_insn = fcode;
+	newf->bf_size = size;
+	newf->bf_jitcode = jcode;
+	d->bd_jitcode = jcode; /* XXX just for kvm(3) users */
 
-	s = splnet();
-	old = d->bd_filter;
-	d->bd_filter = fcode;
-	d->bd_filter_size = size;
-	oldj = d->bd_jitcode;
-	d->bd_jitcode = jcode;
+	/* Need to hold bpf_mtx for pserialize_perform */
+	mutex_enter(&bpf_mtx);
+	mutex_enter(d->bd_mtx);
+	oldf = d->bd_filter;
+	d->bd_filter = newf;
+	membar_producer();
 	reset_d(d);
-	splx(s);
+	pserialize_perform(bpf_psz);
+	mutex_exit(d->bd_mtx);
+	mutex_exit(&bpf_mtx);
 
-	if (old) {
-		kmem_free(old, old_size);
-	}
-	if (oldj) {
-		bpf_jit_freecode(oldj);
-	}
+	if (oldf != NULL)
+		bpf_free_filter(oldf);
 
 	return 0;
 }
@@ -1248,7 +1326,7 @@ bpf_setif(struct bpf_d *d, struct ifreq 
 {
 	struct bpf_if *bp;
 	char *cp;
-	int unit_seen, i, s, error;
+	int unit_seen, i, error;
 
 	KASSERT(mutex_owned(&bpf_mtx));
 	/*
@@ -1292,12 +1370,16 @@ bpf_setif(struct bpf_d *d, struct ifreq 
 		 * If we're already attached to requested interface,
 		 * just flush the buffer.
 		 */
+		/*
+		 * bpf_allocbufs is called only here. bpf_mtx ensures that
+		 * no race condition happens on d->bd_sbuf.
+		 */
 		if (d->bd_sbuf == NULL) {
 			error = bpf_allocbufs(d);
 			if (error != 0)
 				return (error);
 		}
-		s = splnet();
+		mutex_enter(d->bd_mtx);
 		if (bp != d->bd_bif) {
 			if (d->bd_bif)
 				/*
@@ -1308,7 +1390,7 @@ bpf_setif(struct bpf_d *d, struct ifreq 
 			bpf_attachd(d, bp);
 		}
 		reset_d(d);
-		splx(s);
+		mutex_exit(d->bd_mtx);
 		return (0);
 	}
 	/* Not found. */
@@ -1330,7 +1412,7 @@ bpf_stat(struct file *fp, struct stat *s
 	struct bpf_d *d = fp->f_bpf;
 
 	(void)memset(st, 0, sizeof(*st));
-	KERNEL_LOCK(1, NULL);
+	mutex_enter(d->bd_mtx);
 	st->st_dev = makedev(cdevsw_lookup_major(&bpf_cdevsw), d->bd_pid);
 	st->st_atimespec = d->bd_atime;
 	st->st_mtimespec = d->bd_mtime;
@@ -1338,7 +1420,7 @@ bpf_stat(struct file *fp, struct stat *s
 	st->st_uid = kauth_cred_geteuid(fp->f_cred);
 	st->st_gid = kauth_cred_getegid(fp->f_cred);
 	st->st_mode = S_IFCHR;
-	KERNEL_UNLOCK_ONE(NULL);
+	mutex_exit(d->bd_mtx);
 	return 0;
 }
 
@@ -1354,13 +1436,12 @@ static int
 bpf_poll(struct file *fp, int events)
 {
 	struct bpf_d *d = fp->f_bpf;
-	int s = splnet();
 	int revents;
 
 	/*
 	 * Refresh the PID associated with this bpf file.
 	 */
-	KERNEL_LOCK(1, NULL);
+	mutex_enter(&bpf_mtx);
 	d->bd_pid = curproc->p_pid;
 
 	revents = events & (POLLOUT | POLLWRNORM);
@@ -1368,6 +1449,7 @@ bpf_poll(struct file *fp, int events)
 		/*
 		 * An imitation of the FIONREAD ioctl code.
 		 */
+		mutex_enter(d->bd_mtx);
 		if (d->bd_hlen != 0 ||
 		    ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) &&
 		     d->bd_slen != 0)) {
@@ -1381,10 +1463,10 @@ bpf_poll(struct file *fp, int events)
 				d->bd_state = BPF_WAITING;
 			}
 		}
+		mutex_exit(d->bd_mtx);
 	}
 
-	KERNEL_UNLOCK_ONE(NULL);
-	splx(s);
+	mutex_exit(&bpf_mtx);
 	return (revents);
 }
 
@@ -1392,13 +1474,10 @@ static void
 filt_bpfrdetach(struct knote *kn)
 {
 	struct bpf_d *d = kn->kn_hook;
-	int s;
 
-	KERNEL_LOCK(1, NULL);
-	s = splnet();
+	mutex_enter(d->bd_buf_mtx);
 	SLIST_REMOVE(&d->bd_sel.sel_klist, kn, knote, kn_selnext);
-	splx(s);
-	KERNEL_UNLOCK_ONE(NULL);
+	mutex_exit(d->bd_buf_mtx);
 }
 
 static int
@@ -1407,12 +1486,12 @@ filt_bpfread(struct knote *kn, long hint
 	struct bpf_d *d = kn->kn_hook;
 	int rv;
 
-	KERNEL_LOCK(1, NULL);
+	mutex_enter(d->bd_buf_mtx);
 	kn->kn_data = d->bd_hlen;
 	if (d->bd_immediate)
 		kn->kn_data += d->bd_slen;
 	rv = (kn->kn_data > 0);
-	KERNEL_UNLOCK_ONE(NULL);
+	mutex_exit(d->bd_buf_mtx);
 	return rv;
 }
 
@@ -1424,10 +1503,8 @@ bpf_kqfilter(struct file *fp, struct kno
 {
 	struct bpf_d *d = fp->f_bpf;
 	struct klist *klist;
-	int s;
-
-	KERNEL_LOCK(1, NULL);
 
+	mutex_enter(d->bd_buf_mtx);
 	switch (kn->kn_filter) {
 	case EVFILT_READ:
 		klist = &d->bd_sel.sel_klist;
@@ -1435,16 +1512,14 @@ bpf_kqfilter(struct file *fp, struct kno
 		break;
 
 	default:
-		KERNEL_UNLOCK_ONE(NULL);
+		mutex_exit(d->bd_buf_mtx);
 		return (EINVAL);
 	}
 
 	kn->kn_hook = d;
 
-	s = splnet();
 	SLIST_INSERT_HEAD(klist, kn, kn_selnext);
-	splx(s);
-	KERNEL_UNLOCK_ONE(NULL);
+	mutex_exit(d->bd_buf_mtx);
 
 	return (0);
 }
@@ -1498,25 +1573,35 @@ bpf_deliver(struct bpf_if *bp, void *(*c
 	bool gottime = false;
 	struct timespec ts;
 	struct bpf_d *d;
+	int s;
+
+	KASSERT(!cpu_intr_p());
 
 	/*
 	 * Note that the IPL does not have to be raised at this point.
 	 * The only problem that could arise here is that if two different
 	 * interfaces shared any data.  This is not the case.
 	 */
+	s = pserialize_read_enter();
 	BPFIF_DLIST_READER_FOREACH(d, bp) {
-		u_int slen;
+		u_int slen = 0;
+		struct bpf_filter *filter;
 
 		if (!d->bd_seesent && !rcv) {
 			continue;
 		}
-		d->bd_rcount++;
+		atomic_inc_ulong(&d->bd_rcount);
 		BPF_STATINC(recv);
 
-		if (d->bd_jitcode)
-			slen = d->bd_jitcode(NULL, &args);
-		else
-			slen = bpf_filter_ext(NULL, d->bd_filter, &args);
+		filter = d->bd_filter;
+		membar_datadep_consumer();
+		if (filter != NULL) {
+			if (filter->bf_jitcode != NULL)
+				slen = filter->bf_jitcode(NULL, &args);
+			else
+				slen = bpf_filter_ext(NULL, filter->bf_insn,
+				    &args);
+		}
 
 		if (!slen) {
 			continue;
@@ -1525,8 +1610,10 @@ bpf_deliver(struct bpf_if *bp, void *(*c
 			gottime = true;
 			nanotime(&ts);
 		}
+		/* Assume catchpacket doesn't sleep */
 		catchpacket(d, pkt, pktlen, slen, cpfn, &ts);
 	}
+	pserialize_read_exit(s);
 }
 
 /*
@@ -1633,7 +1720,6 @@ _bpf_mtap_af(struct bpf_if *bp, uint32_t
 static void
 _bpf_mtap_sl_in(struct bpf_if *bp, u_char *chdr, struct mbuf **m)
 {
-	int s;
 	u_char *hp;
 
 	M_PREPEND(*m, SLIP_HDRLEN, M_DONTWAIT);
@@ -1644,9 +1730,7 @@ _bpf_mtap_sl_in(struct bpf_if *bp, u_cha
 	hp[SLX_DIR] = SLIPDIR_IN;
 	(void)memcpy(&hp[SLX_CHDR], chdr, CHDR_LEN);
 
-	s = splnet();
 	_bpf_mtap(bp, *m);
-	splx(s);
 
 	m_adj(*m, SLIP_HDRLEN);
 }
@@ -1661,7 +1745,6 @@ _bpf_mtap_sl_out(struct bpf_if *bp, u_ch
 {
 	struct mbuf m0;
 	u_char *hp;
-	int s;
 
 	m0.m_flags = 0;
 	m0.m_next = m;
@@ -1673,9 +1756,7 @@ _bpf_mtap_sl_out(struct bpf_if *bp, u_ch
 	hp[SLX_DIR] = SLIPDIR_OUT;
 	(void)memcpy(&hp[SLX_CHDR], chdr, CHDR_LEN);
 
-	s = splnet();
 	_bpf_mtap(bp, &m0);
-	splx(s);
 	m_freem(m);
 }
 
@@ -1708,6 +1789,7 @@ bpf_mbuf_dequeue(struct bpf_if *bp)
 	struct mbuf *m;
 	int s;
 
+	/* XXX NOMPSAFE: assumed running on one CPU */
 	s = splnet();
 	m = bp->bif_mbuf_head;
 	if (m != NULL) {
@@ -1737,13 +1819,7 @@ bpf_mtap_si(void *arg)
 		log(LOG_DEBUG, "%s: tapping mbuf=%p on %s\n",
 		    __func__, m, bp->bif_ifp->if_xname);
 #endif
-#ifndef NET_MPSAFE
-		KERNEL_LOCK(1, NULL);
-#endif
 		bpf_ops->bpf_mtap(bp, m);
-#ifndef NET_MPSAFE
-		KERNEL_UNLOCK_ONE(NULL);
-#endif
 		m_freem(m);
 	}
 }
@@ -1801,7 +1877,7 @@ catchpacket(struct bpf_d *d, u_char *pkt
 	int hdrlen = bpf_hdrlen(d);
 	int do_wakeup = 0;
 
-	++d->bd_ccount;
+	atomic_inc_ulong(&d->bd_ccount);
 	BPF_STATINC(capt);
 	/*
 	 * Figure out how many bytes to move.  If the packet is
@@ -1820,6 +1896,7 @@ catchpacket(struct bpf_d *d, u_char *pkt
 	if (caplen < 0)
 		caplen = 0;
 
+	mutex_enter(d->bd_buf_mtx);
 	/*
 	 * Round up the end of the previous packet to the next longword.
 	 */
@@ -1836,11 +1913,12 @@ catchpacket(struct bpf_d *d, u_char *pkt
 		 * pending reads.
 		 */
 		if (d->bd_fbuf == NULL) {
+			mutex_exit(d->bd_buf_mtx);
 			/*
 			 * We haven't completed the previous read yet,
 			 * so drop the packet.
 			 */
-			++d->bd_dcount;
+			atomic_inc_ulong(&d->bd_dcount);
 			BPF_STATINC(drop);
 			return;
 		}
@@ -1888,6 +1966,7 @@ catchpacket(struct bpf_d *d, u_char *pkt
 	 */
 	(*cpfn)(h + hdrlen, pkt, caplen);
 	d->bd_slen = curlen + totlen;
+	mutex_exit(d->bd_buf_mtx);
 
 	/*
 	 * Call bpf_wakeup after bd_slen has been updated so that kevent(2)
@@ -1917,6 +1996,19 @@ bpf_allocbufs(struct bpf_d *d)
 	return (0);
 }
 
+static void
+bpf_free_filter(struct bpf_filter *filter)
+{
+
+	KASSERT(filter != NULL);
+	KASSERT(filter->bf_insn != NULL);
+
+	kmem_free(filter->bf_insn, filter->bf_size);
+	if (filter->bf_jitcode != NULL)
+		bpf_jit_freecode(filter->bf_jitcode);
+	kmem_free(filter, sizeof(*filter));
+}
+
 /*
  * Free buffers currently in use by a descriptor.
  * Called on close.
@@ -1936,12 +2028,11 @@ bpf_freed(struct bpf_d *d)
 		if (d->bd_fbuf != NULL)
 			kmem_free(d->bd_fbuf, d->bd_bufsize);
 	}
-	if (d->bd_filter)
-		kmem_free(d->bd_filter, d->bd_filter_size);
-
-	if (d->bd_jitcode != NULL) {
-		bpf_jit_freecode(d->bd_jitcode);
+	if (d->bd_filter != NULL) {
+		bpf_free_filter(d->bd_filter);
+		d->bd_filter = NULL;
 	}
+	d->bd_jitcode = NULL;
 }
 
 /*
@@ -1964,6 +2055,7 @@ _bpfattach(struct ifnet *ifp, u_int dlt,
 	bp->bif_si = NULL;
 	BPF_IFLIST_ENTRY_INIT(bp);
 	PSLIST_INIT(&bp->bif_dlist_head);
+	psref_target_init(&bp->bif_psref, bpf_psref_class);
 
 	BPF_IFLIST_WRITER_INSERT_HEAD(bp);
 
@@ -2013,27 +2105,31 @@ _bpfdetach(struct ifnet *ifp)
 	/* Nuke the vnodes for any open instances */
   again_d:
 	BPF_DLIST_WRITER_FOREACH(d) {
+		mutex_enter(d->bd_mtx);
 		if (d->bd_bif != NULL && d->bd_bif->bif_ifp == ifp) {
 			/*
 			 * Detach the descriptor from an interface now.
 			 * It will be free'ed later by close routine.
 			 */
-			s = splnet();
 			d->bd_promisc = 0;	/* we can't touch device. */
 			bpf_detachd(d);
-			splx(s);
+			mutex_exit(d->bd_mtx);
 			goto again_d;
 		}
+		mutex_exit(d->bd_mtx);
 	}
 
   again:
 	BPF_IFLIST_WRITER_FOREACH(bp) {
 		if (bp->bif_ifp == ifp) {
 			BPF_IFLIST_WRITER_REMOVE(bp);
-			/* TODO pserialize_perform(); */
-			/* TODO psref_target_destroy(); */
+
+			pserialize_perform(bpf_psz);
+			psref_target_destroy(&bp->bif_psref, bpf_psref_class);
+
 			BPF_IFLIST_ENTRY_DESTROY(bp);
 			if (bp->bif_si != NULL) {
+				/* XXX NOMPSAFE: assumed running on one CPU */
 				s = splnet();
 				while (bp->bif_mbuf_head != NULL) {
 					struct mbuf *m = bp->bif_mbuf_head;
@@ -2058,7 +2154,8 @@ _bpf_change_type(struct ifnet *ifp, u_in
 {
 	struct bpf_if *bp;
 
-	BPF_IFLIST_READER_FOREACH(bp) {
+	mutex_enter(&bpf_mtx);
+	BPF_IFLIST_WRITER_FOREACH(bp) {
 		if (bp->bif_driverp == &ifp->if_bpf)
 			break;
 	}
@@ -2068,6 +2165,7 @@ _bpf_change_type(struct ifnet *ifp, u_in
 	bp->bif_dlt = dlt;
 
 	bp->bif_hdrlen = hdrlen;
+	mutex_exit(&bpf_mtx);
 }
 
 /*
@@ -2079,21 +2177,41 @@ bpf_getdltlist(struct bpf_d *d, struct b
 	int n, error;
 	struct ifnet *ifp;
 	struct bpf_if *bp;
+	int s, bound;
+
+	KASSERT(mutex_owned(d->bd_mtx));
 
 	ifp = d->bd_bif->bif_ifp;
 	n = 0;
 	error = 0;
+
+	bound = curlwp_bind();
+	s = pserialize_read_enter();
 	BPF_IFLIST_READER_FOREACH(bp) {
 		if (bp->bif_ifp != ifp)
 			continue;
 		if (bfl->bfl_list != NULL) {
-			if (n >= bfl->bfl_len)
+			struct psref psref;
+
+			if (n >= bfl->bfl_len) {
+				pserialize_read_exit(s);
 				return ENOMEM;
+			}
+
+			bpf_if_acquire(bp, &psref);
+			pserialize_read_exit(s);
+
 			error = copyout(&bp->bif_dlt,
 			    bfl->bfl_list + n, sizeof(u_int));
+
+			s = pserialize_read_enter();
+			bpf_if_release(bp, &psref);
 		}
 		n++;
 	}
+	pserialize_read_exit(s);
+	curlwp_bindx(bound);
+
 	bfl->bfl_len = n;
 	return error;
 }
@@ -2104,11 +2222,12 @@ bpf_getdltlist(struct bpf_d *d, struct b
 static int
 bpf_setdlt(struct bpf_d *d, u_int dlt)
 {
-	int s, error, opromisc;
+	int error, opromisc;
 	struct ifnet *ifp;
 	struct bpf_if *bp;
 
 	KASSERT(mutex_owned(&bpf_mtx));
+	KASSERT(mutex_owned(d->bd_mtx));
 
 	if (d->bd_bif->bif_dlt == dlt)
 		return 0;
@@ -2119,20 +2238,24 @@ bpf_setdlt(struct bpf_d *d, u_int dlt)
 	}
 	if (bp == NULL)
 		return EINVAL;
-	s = splnet();
 	opromisc = d->bd_promisc;
 	bpf_detachd(d);
 	bpf_attachd(d, bp);
 	reset_d(d);
 	if (opromisc) {
+#ifndef NET_MPSAFE
+		KERNEL_LOCK(1, NULL);
+#endif
 		error = ifpromisc(bp->bif_ifp, 1);
+#ifndef NET_MPSAFE
+		KERNEL_UNLOCK_ONE(NULL);
+#endif
 		if (error)
 			printf("%s: bpf_setdlt: ifpromisc failed (%d)\n",
 			    bp->bif_ifp->if_xname, error);
 		else
 			d->bd_promisc = 1;
 	}
-	splx(s);
 	return 0;
 }
 
@@ -2235,12 +2358,14 @@ sysctl_net_bpf_peers(SYSCTLFN_ARGS)
 			BPF_EXT(dcount);
 			BPF_EXT(ccount);
 #undef BPF_EXT
+			mutex_enter(dp->bd_mtx);
 			if (dp->bd_bif)
 				(void)strlcpy(dpe.bde_ifname,
 				    dp->bd_bif->bif_ifp->if_xname,
 				    IFNAMSIZ - 1);
 			else
 				dpe.bde_ifname[0] = '\0';
+			mutex_exit(dp->bd_mtx);
 
 			error = copyout(&dpe, sp, out_size);
 			if (error)

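A detail of the bpf.c diff above that is easy to miss: replacing a
filter pairs membar_producer() in bpf_setf with membar_datadep_consumer()
in bpf_deliver, so a reader that observes the new bpf_filter pointer
also observes its fields. A condensed sketch of the idiom (not
line-for-line the committed code; cur_filter stands in for d->bd_filter,
and freeing the old filter after pserialize_perform() is omitted):

#include <sys/atomic.h>

#include <net/bpf.h>
#include <net/bpfdesc.h>	/* struct bpf_filter, per the diff below */

static struct bpf_filter *volatile cur_filter;

/* Writer: initialize the new filter fully, then publish the pointer. */
static struct bpf_filter *
filter_publish(struct bpf_filter *newf)
{
	struct bpf_filter *oldf;

	oldf = cur_filter;
	membar_producer();	/* field stores become visible first */
	cur_filter = newf;
	return oldf;		/* caller frees after readers drain */
}

/* Reader: load the pointer, then order the dependent loads. */
static u_int
filter_run(bpf_args_t *args)
{
	struct bpf_filter *filter;

	filter = cur_filter;
	membar_datadep_consumer();	/* pointer load before field loads */
	if (filter == NULL)
		return 0;
	if (filter->bf_jitcode != NULL)
		return filter->bf_jitcode(NULL, args);
	return bpf_filter_ext(NULL, filter->bf_insn, args);
}
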
Index: src/sys/net/bpfdesc.h
diff -u src/sys/net/bpfdesc.h:1.43 src/sys/net/bpfdesc.h:1.44
--- src/sys/net/bpfdesc.h:1.43	Wed Feb  1 08:16:42 2017
+++ src/sys/net/bpfdesc.h	Thu Feb  9 09:30:26 2017
@@ -1,4 +1,4 @@
-/*	$NetBSD: bpfdesc.h,v 1.43 2017/02/01 08:16:42 ozaki-r Exp $	*/
+/*	$NetBSD: bpfdesc.h,v 1.44 2017/02/09 09:30:26 ozaki-r Exp $	*/
 
 /*
  * Copyright (c) 1990, 1991, 1993
@@ -49,8 +49,15 @@
 #include <sys/pslist.h>
 #include <sys/mutex.h>
 #include <sys/condvar.h>
+#include <sys/psref.h>
 #endif
 
+struct bpf_filter {
+	struct bpf_insn *bf_insn; 	/* filter code */
+	size_t		bf_size;
+	bpfjit_func_t	bf_jitcode;	/* compiled filter program */
+};
+
 /*
  * Descriptor associated with each open bpf file.
  */
@@ -76,7 +83,12 @@ struct bpf_d {
 
 	struct bpf_if *	bd_bif;		/* interface descriptor */
 	u_long		bd_rtout;	/* Read timeout in 'ticks' */
-	struct bpf_insn *bd_filter; 	/* filter code */
+	/* DEPRECATED. Keep it to avoid breaking kvm(3) users */
+	struct bpf_insn *_bd_filter; 	/* filter code */
+	/*
+	 * XXX we should make the counters per-CPU once we retire kvm(3) users
+	 * that directly access them.
+	 */
 	u_long		bd_rcount;	/* number of packets received */
 	u_long		bd_dcount;	/* number of packets dropped */
 	u_long		bd_ccount;	/* number of packets captured */
@@ -108,12 +120,14 @@ struct bpf_d {
 #ifdef _LP64
 	int		bd_compat32;	/* 32-bit stream on LP64 system */
 #endif
+	/* DEPRECATED. Keep it to avoid breaking kvm(3) users */
 	bpfjit_func_t	bd_jitcode;	/* compiled filter program */
-	size_t		bd_filter_size;
+	struct bpf_filter *bd_filter;
 #ifdef _KERNEL
 	struct pslist_entry	bd_bif_dlist_entry; /* For bpf_if */
 	struct pslist_entry	bd_bpf_dlist_entry; /* For the global list */
 	kmutex_t	*bd_mtx;
+	kmutex_t	*bd_buf_mtx;
 	kcondvar_t	bd_cv;
 #endif
 };
@@ -160,6 +174,7 @@ struct bpf_if {
 #ifdef _KERNEL
 	struct pslist_entry bif_iflist_entry;
 	struct pslist_head bif_dlist_head;
+	struct psref_target bif_psref;
 #endif
 };
 

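The bif_psref target added above is what lets bpf_write in the bpf.c
diff hold a bpf_if across operations that may sleep, without keeping
d->bd_mtx locked. A condensed sketch of that acquire/release shape,
assuming the bpf_if_acquire/bpf_if_release helpers from bpf.c:

#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/psref.h>

#include <net/bpfdesc.h>

static int
use_bpf_if(struct bpf_d *d)
{
	struct bpf_if *bp;
	struct psref psref;
	int bound, error = 0;

	bound = curlwp_bind();		/* pin the LWP to its current CPU */
	mutex_enter(d->bd_mtx);
	bp = d->bd_bif;
	if (bp == NULL) {
		mutex_exit(d->bd_mtx);
		curlwp_bindx(bound);
		return ENXIO;
	}
	bpf_if_acquire(bp, &psref);	/* psref_acquire on bp->bif_psref */
	mutex_exit(d->bd_mtx);

	/* ... code that may sleep; bp stays alive until released ... */

	bpf_if_release(bp, &psref);
	curlwp_bindx(bound);
	return error;
}
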
Index: src/sys/net/if.c
diff -u src/sys/net/if.c:1.375 src/sys/net/if.c:1.376
--- src/sys/net/if.c:1.375	Wed Jan 25 03:04:21 2017
+++ src/sys/net/if.c	Thu Feb  9 09:30:26 2017
@@ -1,4 +1,4 @@
-/*	$NetBSD: if.c,v 1.375 2017/01/25 03:04:21 christos Exp $	*/
+/*	$NetBSD: if.c,v 1.376 2017/02/09 09:30:26 ozaki-r Exp $	*/
 
 /*-
  * Copyright (c) 1999, 2000, 2001, 2008 The NetBSD Foundation, Inc.
@@ -90,7 +90,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: if.c,v 1.375 2017/01/25 03:04:21 christos Exp $");
+__KERNEL_RCSID(0, "$NetBSD: if.c,v 1.376 2017/02/09 09:30:26 ozaki-r Exp $");
 
 #if defined(_KERNEL_OPT)
 #include "opt_inet.h"
@@ -774,13 +774,7 @@ if_percpuq_softint(void *arg)
 
 	while ((m = if_percpuq_dequeue(ipq)) != NULL) {
 		ifp->if_ipackets++;
-#ifndef NET_MPSAFE
-		KERNEL_LOCK(1, NULL);
-#endif
 		bpf_mtap(ifp, m);
-#ifndef NET_MPSAFE
-		KERNEL_UNLOCK_ONE(NULL);
-#endif
 
 		ifp->_if_input(ifp, m);
 	}
@@ -1072,13 +1066,7 @@ if_input(struct ifnet *ifp, struct mbuf 
 	KASSERT(!cpu_intr_p());
 
 	ifp->if_ipackets++;
-#ifndef NET_MPSAFE
-	KERNEL_LOCK(1, NULL);
-#endif
 	bpf_mtap(ifp, m);
-#ifndef NET_MPSAFE
-	KERNEL_UNLOCK_ONE(NULL);
-#endif
 
 	ifp->_if_input(ifp, m);
 }
