Module Name: src
Committed By: ozaki-r
Date: Thu Feb 9 09:30:26 UTC 2017
Modified Files:
src/sys/net: bpf.c bpfdesc.h if.c
Log Message:
Make bpf MP-safe
By the change, bpf_mtap can run without any locks as long as its bpf filter
doesn't match a target packet. Pushing data to a bpf buffer still needs
a lock. Removing the lock requires big changes and it's a future work.
Another known issue is that we need to remain some obsolete variables to
avoid breaking kvm(3) users such as netstat and fstat. One problem for
MP-ification is that in order to keep statistic counters of bpf_d we need
to use atomic operations for them. Once we retire the kvm(3) users, we
should make the counters per-CPU and remove the atomic operations.
To generate a diff of this commit:
cvs rdiff -u -r1.212 -r1.213 src/sys/net/bpf.c
cvs rdiff -u -r1.43 -r1.44 src/sys/net/bpfdesc.h
cvs rdiff -u -r1.375 -r1.376 src/sys/net/if.c
Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
Modified files:
Index: src/sys/net/bpf.c
diff -u src/sys/net/bpf.c:1.212 src/sys/net/bpf.c:1.213
--- src/sys/net/bpf.c:1.212 Wed Feb 1 08:18:33 2017
+++ src/sys/net/bpf.c Thu Feb 9 09:30:26 2017
@@ -1,4 +1,4 @@
-/* $NetBSD: bpf.c,v 1.212 2017/02/01 08:18:33 ozaki-r Exp $ */
+/* $NetBSD: bpf.c,v 1.213 2017/02/09 09:30:26 ozaki-r Exp $ */
/*
* Copyright (c) 1990, 1991, 1993
@@ -39,7 +39,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: bpf.c,v 1.212 2017/02/01 08:18:33 ozaki-r Exp $");
+__KERNEL_RCSID(0, "$NetBSD: bpf.c,v 1.213 2017/02/09 09:30:26 ozaki-r Exp $");
#if defined(_KERNEL_OPT)
#include "opt_bpf.h"
@@ -77,6 +77,8 @@ __KERNEL_RCSID(0, "$NetBSD: bpf.c,v 1.21
#include <sys/kauth.h>
#include <sys/syslog.h>
#include <sys/percpu.h>
+#include <sys/pserialize.h>
+#include <sys/lwp.h>
#include <net/if.h>
#include <net/slip.h>
@@ -132,11 +134,48 @@ static struct percpu *bpf_gstats_percpu;
}
/*
+ * Locking notes:
+ * - bpf_mtx (adaptive mutex) protects:
+ * - Gobal lists: bpf_iflist and bpf_dlist
+ * - struct bpf_if
+ * - bpf_close
+ * - bpf_psz (pserialize)
+ * - struct bpf_d has two mutexes:
+ * - bd_buf_mtx (spin mutex) protects the buffers that can be accessed
+ * on packet tapping
+ * - bd_mtx (adaptive mutex) protects member variables other than the buffers
+ * - Locking order: bpf_mtx => bpf_d#bd_mtx => bpf_d#bd_buf_mtx
+ * - struct bpf_d obtained via fp->f_bpf in bpf_read and bpf_write is
+ * never freed because struct bpf_d is only freed in bpf_close and
+ * bpf_close never be called while executing bpf_read and bpf_write
+ * - A filter that is assigned to bpf_d can be replaced with another filter
+ * while tapping packets, so it needs to be done atomically
+ * - struct bpf_d is iterated on bpf_dlist with psz
+ * - struct bpf_if is iterated on bpf_iflist with psz or psref
+ */
+/*
* Use a mutex to avoid a race condition between gathering the stats/peers
* and opening/closing the device.
*/
static kmutex_t bpf_mtx;
+static struct psref_class *bpf_psref_class __read_mostly;
+static pserialize_t bpf_psz;
+
+static inline void
+bpf_if_acquire(struct bpf_if *bp, struct psref *psref)
+{
+
+ psref_acquire(psref, &bp->bif_psref, bpf_psref_class);
+}
+
+static inline void
+bpf_if_release(struct bpf_if *bp, struct psref *psref)
+{
+
+ psref_release(psref, &bp->bif_psref, bpf_psref_class);
+}
+
/*
* bpf_iflist is the list of interfaces; each corresponds to an ifnet
* bpf_dtab holds the descriptors, indexed by minor device #
@@ -201,6 +240,7 @@ static void bpf_deliver(struct bpf_if *,
void *(*cpfn)(void *, const void *, size_t),
void *, u_int, u_int, const bool);
static void bpf_freed(struct bpf_d *);
+static void bpf_free_filter(struct bpf_filter *);
static void bpf_ifname(struct ifnet *, struct ifreq *);
static void *bpf_mcpy(void *, const void *, size_t);
static int bpf_movein(struct uio *, int, uint64_t,
@@ -404,7 +444,9 @@ bad:
static void
bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
{
+
KASSERT(mutex_owned(&bpf_mtx));
+ KASSERT(mutex_owned(d->bd_mtx));
/*
* Point d at bp, and add d to the interface's list of listeners.
* Finally, point the driver's bpf cookie at the interface so
@@ -425,6 +467,7 @@ bpf_detachd(struct bpf_d *d)
struct bpf_if *bp;
KASSERT(mutex_owned(&bpf_mtx));
+ KASSERT(mutex_owned(d->bd_mtx));
bp = d->bd_bif;
/*
@@ -442,7 +485,13 @@ bpf_detachd(struct bpf_d *d)
* the interface was configured down, so only panic
* if we don't get an unexpected error.
*/
+#ifndef NET_MPSAFE
+ KERNEL_LOCK(1, NULL);
+#endif
error = ifpromisc(bp->bif_ifp, 0);
+#ifndef NET_MPSAFE
+ KERNEL_UNLOCK_ONE(NULL);
+#endif
#ifdef DIAGNOSTIC
if (error)
printf("%s: ifpromisc failed: %d", __func__, error);
@@ -452,11 +501,8 @@ bpf_detachd(struct bpf_d *d)
/* Remove d from the interface's descriptor list. */
BPFIF_DLIST_WRITER_REMOVE(d);
- /* TODO pserialize_perform(); */
- /* TODO psref_target_destroy(); */
- BPFIF_DLIST_ENTRY_DESTROY(d);
+ pserialize_perform(bpf_psz);
- /* XXX NOMPSAFE? */
if (BPFIF_DLIST_WRITER_EMPTY(bp)) {
/*
* Let the driver know that there are no more listeners.
@@ -471,6 +517,8 @@ bpf_init(void)
{
mutex_init(&bpf_mtx, MUTEX_DEFAULT, IPL_NONE);
+ bpf_psz = pserialize_create();
+ bpf_psref_class = psref_class_create("bpf", IPL_SOFTNET);
PSLIST_INIT(&bpf_iflist);
PSLIST_INIT(&bpf_dlist);
@@ -521,9 +569,11 @@ bpfopen(dev_t dev, int flag, int mode, s
selinit(&d->bd_sel);
d->bd_sih = softint_establish(SOFTINT_CLOCK, bpf_softintr, d);
d->bd_jitcode = NULL;
+ d->bd_filter = NULL;
BPF_DLIST_ENTRY_INIT(d);
BPFIF_DLIST_ENTRY_INIT(d);
- d->bd_mtx = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NET);
+ d->bd_mtx = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SOFTNET);
+ d->bd_buf_mtx = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NET);
cv_init(&d->bd_cv, "bpf");
mutex_enter(&bpf_mtx);
@@ -542,14 +592,11 @@ static int
bpf_close(struct file *fp)
{
struct bpf_d *d;
- int s;
- KERNEL_LOCK(1, NULL);
mutex_enter(&bpf_mtx);
if ((d = fp->f_bpf) == NULL) {
mutex_exit(&bpf_mtx);
- KERNEL_UNLOCK_ONE(NULL);
return 0;
}
@@ -558,28 +605,28 @@ bpf_close(struct file *fp)
*/
d->bd_pid = curproc->p_pid;
- s = splnet();
+ mutex_enter(d->bd_mtx);
if (d->bd_state == BPF_WAITING)
- callout_stop(&d->bd_callout);
+ callout_halt(&d->bd_callout, d->bd_mtx);
d->bd_state = BPF_IDLE;
if (d->bd_bif)
bpf_detachd(d);
- splx(s);
- bpf_freed(d);
+ mutex_exit(d->bd_mtx);
+
BPF_DLIST_WRITER_REMOVE(d);
- fp->f_bpf = NULL;
+ pserialize_perform(bpf_psz);
mutex_exit(&bpf_mtx);
- KERNEL_UNLOCK_ONE(NULL);
- /* TODO pserialize_perform(); */
- /* TODO psref_target_destroy(); */
+ BPFIF_DLIST_ENTRY_DESTROY(d);
BPF_DLIST_ENTRY_DESTROY(d);
-
+ fp->f_bpf = NULL;
+ bpf_freed(d);
callout_destroy(&d->bd_callout);
seldestroy(&d->bd_sel);
softint_disestablish(d->bd_sih);
mutex_obj_free(d->bd_mtx);
+ mutex_obj_free(d->bd_buf_mtx);
cv_destroy(&d->bd_cv);
kmem_free(d, sizeof(*d));
@@ -608,7 +655,6 @@ bpf_read(struct file *fp, off_t *offp, s
struct bpf_d *d = fp->f_bpf;
int timed_out;
int error;
- int s;
getnanotime(&d->bd_atime);
/*
@@ -618,17 +664,18 @@ bpf_read(struct file *fp, off_t *offp, s
if (uio->uio_resid != d->bd_bufsize)
return (EINVAL);
- KERNEL_LOCK(1, NULL);
- s = splnet();
+ mutex_enter(d->bd_mtx);
if (d->bd_state == BPF_WAITING)
- callout_stop(&d->bd_callout);
+ callout_halt(&d->bd_callout, d->bd_buf_mtx);
timed_out = (d->bd_state == BPF_TIMED_OUT);
d->bd_state = BPF_IDLE;
+ mutex_exit(d->bd_mtx);
/*
* If the hold buffer is empty, then do a timed sleep, which
* ends when the timeout expires or when enough packets
* have arrived to fill the store buffer.
*/
+ mutex_enter(d->bd_buf_mtx);
while (d->bd_hbuf == NULL) {
if (fp->f_flag & FNONBLOCK) {
if (d->bd_slen == 0) {
@@ -649,9 +696,7 @@ bpf_read(struct file *fp, off_t *offp, s
break;
}
- mutex_enter(d->bd_mtx);
- error = cv_timedwait_sig(&d->bd_cv, d->bd_mtx, d->bd_rtout);
- mutex_exit(d->bd_mtx);
+ error = cv_timedwait_sig(&d->bd_cv, d->bd_buf_mtx, d->bd_rtout);
if (error == EINTR || error == ERESTART)
goto out;
@@ -683,7 +728,7 @@ bpf_read(struct file *fp, off_t *offp, s
/*
* At this point, we know we have something in the hold slot.
*/
- splx(s);
+ mutex_exit(d->bd_buf_mtx);
/*
* Move data from hold buffer into user space.
@@ -692,13 +737,12 @@ bpf_read(struct file *fp, off_t *offp, s
*/
error = uiomove(d->bd_hbuf, d->bd_hlen, uio);
- s = splnet();
+ mutex_enter(d->bd_buf_mtx);
d->bd_fbuf = d->bd_hbuf;
d->bd_hbuf = NULL;
d->bd_hlen = 0;
out:
- splx(s);
- KERNEL_UNLOCK_ONE(NULL);
+ mutex_exit(d->bd_buf_mtx);
return (error);
}
@@ -710,9 +754,9 @@ static inline void
bpf_wakeup(struct bpf_d *d)
{
- mutex_enter(d->bd_mtx);
+ mutex_enter(d->bd_buf_mtx);
cv_broadcast(&d->bd_cv);
- mutex_exit(d->bd_mtx);
+ mutex_exit(d->bd_buf_mtx);
if (d->bd_async)
softint_schedule(d->bd_sih);
@@ -733,15 +777,14 @@ static void
bpf_timed_out(void *arg)
{
struct bpf_d *d = arg;
- int s;
- s = splnet();
+ mutex_enter(d->bd_mtx);
if (d->bd_state == BPF_WAITING) {
d->bd_state = BPF_TIMED_OUT;
if (d->bd_slen != 0)
bpf_wakeup(d);
}
- splx(s);
+ mutex_exit(d->bd_mtx);
}
@@ -750,39 +793,49 @@ bpf_write(struct file *fp, off_t *offp,
kauth_cred_t cred, int flags)
{
struct bpf_d *d = fp->f_bpf;
+ struct bpf_if *bp;
struct ifnet *ifp;
struct mbuf *m, *mc;
- int error, s;
+ int error;
static struct sockaddr_storage dst;
+ struct psref psref;
+ int bound;
m = NULL; /* XXX gcc */
- KERNEL_LOCK(1, NULL);
-
- if (d->bd_bif == NULL) {
- KERNEL_UNLOCK_ONE(NULL);
- return (ENXIO);
+ bound = curlwp_bind();
+ mutex_enter(d->bd_mtx);
+ bp = d->bd_bif;
+ if (bp == NULL) {
+ mutex_exit(d->bd_mtx);
+ error = ENXIO;
+ goto out_bindx;
}
+ bpf_if_acquire(bp, &psref);
+ mutex_exit(d->bd_mtx);
+
getnanotime(&d->bd_mtime);
- ifp = d->bd_bif->bif_ifp;
+ ifp = bp->bif_ifp;
+ if (if_is_deactivated(ifp)) {
+ error = ENXIO;
+ goto out;
+ }
if (uio->uio_resid == 0) {
- KERNEL_UNLOCK_ONE(NULL);
- return (0);
+ error = 0;
+ goto out;
}
- error = bpf_movein(uio, (int)d->bd_bif->bif_dlt, ifp->if_mtu, &m,
+ error = bpf_movein(uio, (int)bp->bif_dlt, ifp->if_mtu, &m,
(struct sockaddr *) &dst);
- if (error) {
- KERNEL_UNLOCK_ONE(NULL);
- return (error);
- }
+ if (error)
+ goto out;
if (m->m_pkthdr.len > ifp->if_mtu) {
- KERNEL_UNLOCK_ONE(NULL);
m_freem(m);
- return (EMSGSIZE);
+ error = EMSGSIZE;
+ goto out;
}
if (d->bd_hdrcmplt)
@@ -798,7 +851,6 @@ bpf_write(struct file *fp, off_t *offp,
} else
mc = NULL;
- s = splsoftnet();
error = if_output_lock(ifp, ifp, m, (struct sockaddr *) &dst, NULL);
if (mc != NULL) {
@@ -807,12 +859,14 @@ bpf_write(struct file *fp, off_t *offp,
else
m_freem(mc);
}
- splx(s);
- KERNEL_UNLOCK_ONE(NULL);
/*
* The driver frees the mbuf.
*/
- return (error);
+out:
+ bpf_if_release(bp, &psref);
+out_bindx:
+ curlwp_bindx(bound);
+ return error;
}
/*
@@ -822,6 +876,10 @@ bpf_write(struct file *fp, off_t *offp,
static void
reset_d(struct bpf_d *d)
{
+
+ KASSERT(mutex_owned(d->bd_mtx));
+
+ mutex_enter(d->bd_buf_mtx);
if (d->bd_hbuf) {
/* Free the hold buffer. */
d->bd_fbuf = d->bd_hbuf;
@@ -832,6 +890,7 @@ reset_d(struct bpf_d *d)
d->bd_rcount = 0;
d->bd_dcount = 0;
d->bd_ccount = 0;
+ mutex_exit(d->bd_buf_mtx);
}
/*
@@ -860,12 +919,11 @@ static int
bpf_ioctl(struct file *fp, u_long cmd, void *addr)
{
struct bpf_d *d = fp->f_bpf;
- int s, error = 0;
+ int error = 0;
/*
* Refresh the PID associated with this bpf file.
*/
- KERNEL_LOCK(1, NULL);
d->bd_pid = curproc->p_pid;
#ifdef _LP64
if (curproc->p_flag & PK_32)
@@ -874,11 +932,11 @@ bpf_ioctl(struct file *fp, u_long cmd, v
d->bd_compat32 = 0;
#endif
- s = splnet();
+ mutex_enter(d->bd_mtx);
if (d->bd_state == BPF_WAITING)
- callout_stop(&d->bd_callout);
+ callout_halt(&d->bd_callout, d->bd_mtx);
d->bd_state = BPF_IDLE;
- splx(s);
+ mutex_exit(d->bd_mtx);
switch (cmd) {
@@ -893,11 +951,11 @@ bpf_ioctl(struct file *fp, u_long cmd, v
{
int n;
- s = splnet();
+ mutex_enter(d->bd_buf_mtx);
n = d->bd_slen;
if (d->bd_hbuf)
n += d->bd_hlen;
- splx(s);
+ mutex_exit(d->bd_buf_mtx);
*(int *)addr = n;
break;
@@ -918,6 +976,8 @@ bpf_ioctl(struct file *fp, u_long cmd, v
* Forbid to change the buffer length if buffers are already
* allocated.
*/
+ mutex_enter(d->bd_mtx);
+ mutex_enter(d->bd_buf_mtx);
if (d->bd_bif != NULL || d->bd_sbuf != NULL)
error = EINVAL;
else {
@@ -929,6 +989,8 @@ bpf_ioctl(struct file *fp, u_long cmd, v
*(u_int *)addr = size = BPF_MINBUFSIZE;
d->bd_bufsize = size;
}
+ mutex_exit(d->bd_buf_mtx);
+ mutex_exit(d->bd_mtx);
break;
/*
@@ -942,49 +1004,60 @@ bpf_ioctl(struct file *fp, u_long cmd, v
* Flush read packet buffer.
*/
case BIOCFLUSH:
- s = splnet();
+ mutex_enter(d->bd_mtx);
reset_d(d);
- splx(s);
+ mutex_exit(d->bd_mtx);
break;
/*
* Put interface into promiscuous mode.
*/
case BIOCPROMISC:
+ mutex_enter(d->bd_mtx);
if (d->bd_bif == NULL) {
+ mutex_exit(d->bd_mtx);
/*
* No interface attached yet.
*/
error = EINVAL;
break;
}
- s = splnet();
if (d->bd_promisc == 0) {
+#ifndef NET_MPSAFE
+ KERNEL_LOCK(1, NULL);
+#endif
error = ifpromisc(d->bd_bif->bif_ifp, 1);
+#ifndef NET_MPSAFE
+ KERNEL_UNLOCK_ONE(NULL);
+#endif
if (error == 0)
d->bd_promisc = 1;
}
- splx(s);
+ mutex_exit(d->bd_mtx);
break;
/*
* Get device parameters.
*/
case BIOCGDLT:
+ mutex_enter(d->bd_mtx);
if (d->bd_bif == NULL)
error = EINVAL;
else
*(u_int *)addr = d->bd_bif->bif_dlt;
+ mutex_exit(d->bd_mtx);
break;
/*
* Get a list of supported device parameters.
*/
case BIOCGDLTLIST:
+ mutex_enter(d->bd_mtx);
if (d->bd_bif == NULL)
error = EINVAL;
else
error = bpf_getdltlist(d, addr);
+ mutex_exit(d->bd_mtx);
break;
/*
@@ -992,10 +1065,12 @@ bpf_ioctl(struct file *fp, u_long cmd, v
*/
case BIOCSDLT:
mutex_enter(&bpf_mtx);
+ mutex_enter(d->bd_mtx);
if (d->bd_bif == NULL)
error = EINVAL;
else
error = bpf_setdlt(d, *(u_int *)addr);
+ mutex_exit(d->bd_mtx);
mutex_exit(&bpf_mtx);
break;
@@ -1006,10 +1081,12 @@ bpf_ioctl(struct file *fp, u_long cmd, v
case OBIOCGETIF:
#endif
case BIOCGETIF:
+ mutex_enter(d->bd_mtx);
if (d->bd_bif == NULL)
error = EINVAL;
else
bpf_ifname(d->bd_bif->bif_ifp, addr);
+ mutex_exit(d->bd_mtx);
break;
/*
@@ -1175,7 +1252,6 @@ bpf_ioctl(struct file *fp, u_long cmd, v
error = fgetown(d->bd_pgid, cmd, addr);
break;
}
- KERNEL_UNLOCK_ONE(NULL);
return (error);
}
@@ -1186,10 +1262,10 @@ bpf_ioctl(struct file *fp, u_long cmd, v
static int
bpf_setf(struct bpf_d *d, struct bpf_program *fp)
{
- struct bpf_insn *fcode, *old;
- bpfjit_func_t jcode, oldj;
- size_t flen, size = 0, old_size;
- int s;
+ struct bpf_insn *fcode;
+ bpfjit_func_t jcode;
+ size_t flen, size = 0;
+ struct bpf_filter *oldf, *newf;
jcode = NULL;
flen = fp->bf_len;
@@ -1217,23 +1293,25 @@ bpf_setf(struct bpf_d *d, struct bpf_pro
fcode = NULL;
}
- old_size = d->bd_filter_size;
+ newf = kmem_alloc(sizeof(*newf), KM_SLEEP);
+ newf->bf_insn = fcode;
+ newf->bf_size = size;
+ newf->bf_jitcode = jcode;
+ d->bd_jitcode = jcode; /* XXX just for kvm(3) users */
- s = splnet();
- old = d->bd_filter;
- d->bd_filter = fcode;
- d->bd_filter_size = size;
- oldj = d->bd_jitcode;
- d->bd_jitcode = jcode;
+ /* Need to hold bpf_mtx for pserialize_perform */
+ mutex_enter(&bpf_mtx);
+ mutex_enter(d->bd_mtx);
+ oldf = d->bd_filter;
+ d->bd_filter = newf;
+ membar_producer();
reset_d(d);
- splx(s);
+ pserialize_perform(bpf_psz);
+ mutex_exit(d->bd_mtx);
+ mutex_exit(&bpf_mtx);
- if (old) {
- kmem_free(old, old_size);
- }
- if (oldj) {
- bpf_jit_freecode(oldj);
- }
+ if (oldf != NULL)
+ bpf_free_filter(oldf);
return 0;
}
@@ -1248,7 +1326,7 @@ bpf_setif(struct bpf_d *d, struct ifreq
{
struct bpf_if *bp;
char *cp;
- int unit_seen, i, s, error;
+ int unit_seen, i, error;
KASSERT(mutex_owned(&bpf_mtx));
/*
@@ -1292,12 +1370,16 @@ bpf_setif(struct bpf_d *d, struct ifreq
* If we're already attached to requested interface,
* just flush the buffer.
*/
+ /*
+ * bpf_allocbufs is called only here. bpf_mtx ensures that
+ * no race condition happen on d->bd_sbuf.
+ */
if (d->bd_sbuf == NULL) {
error = bpf_allocbufs(d);
if (error != 0)
return (error);
}
- s = splnet();
+ mutex_enter(d->bd_mtx);
if (bp != d->bd_bif) {
if (d->bd_bif)
/*
@@ -1308,7 +1390,7 @@ bpf_setif(struct bpf_d *d, struct ifreq
bpf_attachd(d, bp);
}
reset_d(d);
- splx(s);
+ mutex_exit(d->bd_mtx);
return (0);
}
/* Not found. */
@@ -1330,7 +1412,7 @@ bpf_stat(struct file *fp, struct stat *s
struct bpf_d *d = fp->f_bpf;
(void)memset(st, 0, sizeof(*st));
- KERNEL_LOCK(1, NULL);
+ mutex_enter(d->bd_mtx);
st->st_dev = makedev(cdevsw_lookup_major(&bpf_cdevsw), d->bd_pid);
st->st_atimespec = d->bd_atime;
st->st_mtimespec = d->bd_mtime;
@@ -1338,7 +1420,7 @@ bpf_stat(struct file *fp, struct stat *s
st->st_uid = kauth_cred_geteuid(fp->f_cred);
st->st_gid = kauth_cred_getegid(fp->f_cred);
st->st_mode = S_IFCHR;
- KERNEL_UNLOCK_ONE(NULL);
+ mutex_exit(d->bd_mtx);
return 0;
}
@@ -1354,13 +1436,12 @@ static int
bpf_poll(struct file *fp, int events)
{
struct bpf_d *d = fp->f_bpf;
- int s = splnet();
int revents;
/*
* Refresh the PID associated with this bpf file.
*/
- KERNEL_LOCK(1, NULL);
+ mutex_enter(&bpf_mtx);
d->bd_pid = curproc->p_pid;
revents = events & (POLLOUT | POLLWRNORM);
@@ -1368,6 +1449,7 @@ bpf_poll(struct file *fp, int events)
/*
* An imitation of the FIONREAD ioctl code.
*/
+ mutex_enter(d->bd_mtx);
if (d->bd_hlen != 0 ||
((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) &&
d->bd_slen != 0)) {
@@ -1381,10 +1463,10 @@ bpf_poll(struct file *fp, int events)
d->bd_state = BPF_WAITING;
}
}
+ mutex_exit(d->bd_mtx);
}
- KERNEL_UNLOCK_ONE(NULL);
- splx(s);
+ mutex_exit(&bpf_mtx);
return (revents);
}
@@ -1392,13 +1474,10 @@ static void
filt_bpfrdetach(struct knote *kn)
{
struct bpf_d *d = kn->kn_hook;
- int s;
- KERNEL_LOCK(1, NULL);
- s = splnet();
+ mutex_enter(d->bd_buf_mtx);
SLIST_REMOVE(&d->bd_sel.sel_klist, kn, knote, kn_selnext);
- splx(s);
- KERNEL_UNLOCK_ONE(NULL);
+ mutex_exit(d->bd_buf_mtx);
}
static int
@@ -1407,12 +1486,12 @@ filt_bpfread(struct knote *kn, long hint
struct bpf_d *d = kn->kn_hook;
int rv;
- KERNEL_LOCK(1, NULL);
+ mutex_enter(d->bd_buf_mtx);
kn->kn_data = d->bd_hlen;
if (d->bd_immediate)
kn->kn_data += d->bd_slen;
rv = (kn->kn_data > 0);
- KERNEL_UNLOCK_ONE(NULL);
+ mutex_exit(d->bd_buf_mtx);
return rv;
}
@@ -1424,10 +1503,8 @@ bpf_kqfilter(struct file *fp, struct kno
{
struct bpf_d *d = fp->f_bpf;
struct klist *klist;
- int s;
-
- KERNEL_LOCK(1, NULL);
+ mutex_enter(d->bd_buf_mtx);
switch (kn->kn_filter) {
case EVFILT_READ:
klist = &d->bd_sel.sel_klist;
@@ -1435,16 +1512,14 @@ bpf_kqfilter(struct file *fp, struct kno
break;
default:
- KERNEL_UNLOCK_ONE(NULL);
+ mutex_exit(d->bd_buf_mtx);
return (EINVAL);
}
kn->kn_hook = d;
- s = splnet();
SLIST_INSERT_HEAD(klist, kn, kn_selnext);
- splx(s);
- KERNEL_UNLOCK_ONE(NULL);
+ mutex_exit(d->bd_buf_mtx);
return (0);
}
@@ -1498,25 +1573,35 @@ bpf_deliver(struct bpf_if *bp, void *(*c
bool gottime = false;
struct timespec ts;
struct bpf_d *d;
+ int s;
+
+ KASSERT(!cpu_intr_p());
/*
* Note that the IPL does not have to be raised at this point.
* The only problem that could arise here is that if two different
* interfaces shared any data. This is not the case.
*/
+ s = pserialize_read_enter();
BPFIF_DLIST_READER_FOREACH(d, bp) {
- u_int slen;
+ u_int slen = 0;
+ struct bpf_filter *filter;
if (!d->bd_seesent && !rcv) {
continue;
}
- d->bd_rcount++;
+ atomic_inc_ulong(&d->bd_rcount);
BPF_STATINC(recv);
- if (d->bd_jitcode)
- slen = d->bd_jitcode(NULL, &args);
- else
- slen = bpf_filter_ext(NULL, d->bd_filter, &args);
+ filter = d->bd_filter;
+ membar_datadep_consumer();
+ if (filter != NULL) {
+ if (filter->bf_jitcode != NULL)
+ slen = filter->bf_jitcode(NULL, &args);
+ else
+ slen = bpf_filter_ext(NULL, filter->bf_insn,
+ &args);
+ }
if (!slen) {
continue;
@@ -1525,8 +1610,10 @@ bpf_deliver(struct bpf_if *bp, void *(*c
gottime = true;
nanotime(&ts);
}
+ /* Assume catchpacket doesn't sleep */
catchpacket(d, pkt, pktlen, slen, cpfn, &ts);
}
+ pserialize_read_exit(s);
}
/*
@@ -1633,7 +1720,6 @@ _bpf_mtap_af(struct bpf_if *bp, uint32_t
static void
_bpf_mtap_sl_in(struct bpf_if *bp, u_char *chdr, struct mbuf **m)
{
- int s;
u_char *hp;
M_PREPEND(*m, SLIP_HDRLEN, M_DONTWAIT);
@@ -1644,9 +1730,7 @@ _bpf_mtap_sl_in(struct bpf_if *bp, u_cha
hp[SLX_DIR] = SLIPDIR_IN;
(void)memcpy(&hp[SLX_CHDR], chdr, CHDR_LEN);
- s = splnet();
_bpf_mtap(bp, *m);
- splx(s);
m_adj(*m, SLIP_HDRLEN);
}
@@ -1661,7 +1745,6 @@ _bpf_mtap_sl_out(struct bpf_if *bp, u_ch
{
struct mbuf m0;
u_char *hp;
- int s;
m0.m_flags = 0;
m0.m_next = m;
@@ -1673,9 +1756,7 @@ _bpf_mtap_sl_out(struct bpf_if *bp, u_ch
hp[SLX_DIR] = SLIPDIR_OUT;
(void)memcpy(&hp[SLX_CHDR], chdr, CHDR_LEN);
- s = splnet();
_bpf_mtap(bp, &m0);
- splx(s);
m_freem(m);
}
@@ -1708,6 +1789,7 @@ bpf_mbuf_dequeue(struct bpf_if *bp)
struct mbuf *m;
int s;
+ /* XXX NOMPSAFE: assumed running on one CPU */
s = splnet();
m = bp->bif_mbuf_head;
if (m != NULL) {
@@ -1737,13 +1819,7 @@ bpf_mtap_si(void *arg)
log(LOG_DEBUG, "%s: tapping mbuf=%p on %s\n",
__func__, m, bp->bif_ifp->if_xname);
#endif
-#ifndef NET_MPSAFE
- KERNEL_LOCK(1, NULL);
-#endif
bpf_ops->bpf_mtap(bp, m);
-#ifndef NET_MPSAFE
- KERNEL_UNLOCK_ONE(NULL);
-#endif
m_freem(m);
}
}
@@ -1801,7 +1877,7 @@ catchpacket(struct bpf_d *d, u_char *pkt
int hdrlen = bpf_hdrlen(d);
int do_wakeup = 0;
- ++d->bd_ccount;
+ atomic_inc_ulong(&d->bd_ccount);
BPF_STATINC(capt);
/*
* Figure out how many bytes to move. If the packet is
@@ -1820,6 +1896,7 @@ catchpacket(struct bpf_d *d, u_char *pkt
if (caplen < 0)
caplen = 0;
+ mutex_enter(d->bd_buf_mtx);
/*
* Round up the end of the previous packet to the next longword.
*/
@@ -1836,11 +1913,12 @@ catchpacket(struct bpf_d *d, u_char *pkt
* pending reads.
*/
if (d->bd_fbuf == NULL) {
+ mutex_exit(d->bd_buf_mtx);
/*
* We haven't completed the previous read yet,
* so drop the packet.
*/
- ++d->bd_dcount;
+ atomic_inc_ulong(&d->bd_dcount);
BPF_STATINC(drop);
return;
}
@@ -1888,6 +1966,7 @@ catchpacket(struct bpf_d *d, u_char *pkt
*/
(*cpfn)(h + hdrlen, pkt, caplen);
d->bd_slen = curlen + totlen;
+ mutex_exit(d->bd_buf_mtx);
/*
* Call bpf_wakeup after bd_slen has been updated so that kevent(2)
@@ -1917,6 +1996,19 @@ bpf_allocbufs(struct bpf_d *d)
return (0);
}
+static void
+bpf_free_filter(struct bpf_filter *filter)
+{
+
+ KASSERT(filter != NULL);
+ KASSERT(filter->bf_insn != NULL);
+
+ kmem_free(filter->bf_insn, filter->bf_size);
+ if (filter->bf_jitcode != NULL)
+ bpf_jit_freecode(filter->bf_jitcode);
+ kmem_free(filter, sizeof(*filter));
+}
+
/*
* Free buffers currently in use by a descriptor.
* Called on close.
@@ -1936,12 +2028,11 @@ bpf_freed(struct bpf_d *d)
if (d->bd_fbuf != NULL)
kmem_free(d->bd_fbuf, d->bd_bufsize);
}
- if (d->bd_filter)
- kmem_free(d->bd_filter, d->bd_filter_size);
-
- if (d->bd_jitcode != NULL) {
- bpf_jit_freecode(d->bd_jitcode);
+ if (d->bd_filter != NULL) {
+ bpf_free_filter(d->bd_filter);
+ d->bd_filter = NULL;
}
+ d->bd_jitcode = NULL;
}
/*
@@ -1964,6 +2055,7 @@ _bpfattach(struct ifnet *ifp, u_int dlt,
bp->bif_si = NULL;
BPF_IFLIST_ENTRY_INIT(bp);
PSLIST_INIT(&bp->bif_dlist_head);
+ psref_target_init(&bp->bif_psref, bpf_psref_class);
BPF_IFLIST_WRITER_INSERT_HEAD(bp);
@@ -2013,27 +2105,31 @@ _bpfdetach(struct ifnet *ifp)
/* Nuke the vnodes for any open instances */
again_d:
BPF_DLIST_WRITER_FOREACH(d) {
+ mutex_enter(d->bd_mtx);
if (d->bd_bif != NULL && d->bd_bif->bif_ifp == ifp) {
/*
* Detach the descriptor from an interface now.
* It will be free'ed later by close routine.
*/
- s = splnet();
d->bd_promisc = 0; /* we can't touch device. */
bpf_detachd(d);
- splx(s);
+ mutex_exit(d->bd_mtx);
goto again_d;
}
+ mutex_exit(d->bd_mtx);
}
again:
BPF_IFLIST_WRITER_FOREACH(bp) {
if (bp->bif_ifp == ifp) {
BPF_IFLIST_WRITER_REMOVE(bp);
- /* TODO pserialize_perform(); */
- /* TODO psref_target_destroy(); */
+
+ pserialize_perform(bpf_psz);
+ psref_target_destroy(&bp->bif_psref, bpf_psref_class);
+
BPF_IFLIST_ENTRY_DESTROY(bp);
if (bp->bif_si != NULL) {
+ /* XXX NOMPSAFE: assumed running on one CPU */
s = splnet();
while (bp->bif_mbuf_head != NULL) {
struct mbuf *m = bp->bif_mbuf_head;
@@ -2058,7 +2154,8 @@ _bpf_change_type(struct ifnet *ifp, u_in
{
struct bpf_if *bp;
- BPF_IFLIST_READER_FOREACH(bp) {
+ mutex_enter(&bpf_mtx);
+ BPF_IFLIST_WRITER_FOREACH(bp) {
if (bp->bif_driverp == &ifp->if_bpf)
break;
}
@@ -2068,6 +2165,7 @@ _bpf_change_type(struct ifnet *ifp, u_in
bp->bif_dlt = dlt;
bp->bif_hdrlen = hdrlen;
+ mutex_exit(&bpf_mtx);
}
/*
@@ -2079,21 +2177,41 @@ bpf_getdltlist(struct bpf_d *d, struct b
int n, error;
struct ifnet *ifp;
struct bpf_if *bp;
+ int s, bound;
+
+ KASSERT(mutex_owned(d->bd_mtx));
ifp = d->bd_bif->bif_ifp;
n = 0;
error = 0;
+
+ bound = curlwp_bind();
+ s = pserialize_read_enter();
BPF_IFLIST_READER_FOREACH(bp) {
if (bp->bif_ifp != ifp)
continue;
if (bfl->bfl_list != NULL) {
- if (n >= bfl->bfl_len)
+ struct psref psref;
+
+ if (n >= bfl->bfl_len) {
+ pserialize_read_exit(s);
return ENOMEM;
+ }
+
+ bpf_if_acquire(bp, &psref);
+ pserialize_read_exit(s);
+
error = copyout(&bp->bif_dlt,
bfl->bfl_list + n, sizeof(u_int));
+
+ s = pserialize_read_enter();
+ bpf_if_release(bp, &psref);
}
n++;
}
+ pserialize_read_exit(s);
+ curlwp_bindx(bound);
+
bfl->bfl_len = n;
return error;
}
@@ -2104,11 +2222,12 @@ bpf_getdltlist(struct bpf_d *d, struct b
static int
bpf_setdlt(struct bpf_d *d, u_int dlt)
{
- int s, error, opromisc;
+ int error, opromisc;
struct ifnet *ifp;
struct bpf_if *bp;
KASSERT(mutex_owned(&bpf_mtx));
+ KASSERT(mutex_owned(d->bd_mtx));
if (d->bd_bif->bif_dlt == dlt)
return 0;
@@ -2119,20 +2238,24 @@ bpf_setdlt(struct bpf_d *d, u_int dlt)
}
if (bp == NULL)
return EINVAL;
- s = splnet();
opromisc = d->bd_promisc;
bpf_detachd(d);
bpf_attachd(d, bp);
reset_d(d);
if (opromisc) {
+#ifndef NET_MPSAFE
+ KERNEL_LOCK(1, NULL);
+#endif
error = ifpromisc(bp->bif_ifp, 1);
+#ifndef NET_MPSAFE
+ KERNEL_UNLOCK_ONE(NULL);
+#endif
if (error)
printf("%s: bpf_setdlt: ifpromisc failed (%d)\n",
bp->bif_ifp->if_xname, error);
else
d->bd_promisc = 1;
}
- splx(s);
return 0;
}
@@ -2235,12 +2358,14 @@ sysctl_net_bpf_peers(SYSCTLFN_ARGS)
BPF_EXT(dcount);
BPF_EXT(ccount);
#undef BPF_EXT
+ mutex_enter(dp->bd_mtx);
if (dp->bd_bif)
(void)strlcpy(dpe.bde_ifname,
dp->bd_bif->bif_ifp->if_xname,
IFNAMSIZ - 1);
else
dpe.bde_ifname[0] = '\0';
+ mutex_exit(dp->bd_mtx);
error = copyout(&dpe, sp, out_size);
if (error)
Index: src/sys/net/bpfdesc.h
diff -u src/sys/net/bpfdesc.h:1.43 src/sys/net/bpfdesc.h:1.44
--- src/sys/net/bpfdesc.h:1.43 Wed Feb 1 08:16:42 2017
+++ src/sys/net/bpfdesc.h Thu Feb 9 09:30:26 2017
@@ -1,4 +1,4 @@
-/* $NetBSD: bpfdesc.h,v 1.43 2017/02/01 08:16:42 ozaki-r Exp $ */
+/* $NetBSD: bpfdesc.h,v 1.44 2017/02/09 09:30:26 ozaki-r Exp $ */
/*
* Copyright (c) 1990, 1991, 1993
@@ -49,8 +49,15 @@
#include <sys/pslist.h>
#include <sys/mutex.h>
#include <sys/condvar.h>
+#include <sys/psref.h>
#endif
+struct bpf_filter {
+ struct bpf_insn *bf_insn; /* filter code */
+ size_t bf_size;
+ bpfjit_func_t bf_jitcode; /* compiled filter program */
+};
+
/*
* Descriptor associated with each open bpf file.
*/
@@ -76,7 +83,12 @@ struct bpf_d {
struct bpf_if * bd_bif; /* interface descriptor */
u_long bd_rtout; /* Read timeout in 'ticks' */
- struct bpf_insn *bd_filter; /* filter code */
+ /* DEPRECATED. Keep it to avoid breaking kvm(3) users */
+ struct bpf_insn *_bd_filter; /* filter code */
+ /*
+ * XXX we should make the counters per-CPU once we retire kvm(3) users
+ * that directly access them.
+ */
u_long bd_rcount; /* number of packets received */
u_long bd_dcount; /* number of packets dropped */
u_long bd_ccount; /* number of packets captured */
@@ -108,12 +120,14 @@ struct bpf_d {
#ifdef _LP64
int bd_compat32; /* 32-bit stream on LP64 system */
#endif
+ /* DEPRECATED. Keep it to avoid breaking kvm(3) users */
bpfjit_func_t bd_jitcode; /* compiled filter program */
- size_t bd_filter_size;
+ struct bpf_filter *bd_filter;
#ifdef _KERNEL
struct pslist_entry bd_bif_dlist_entry; /* For bpf_if */
struct pslist_entry bd_bpf_dlist_entry; /* For the global list */
kmutex_t *bd_mtx;
+ kmutex_t *bd_buf_mtx;
kcondvar_t bd_cv;
#endif
};
@@ -160,6 +174,7 @@ struct bpf_if {
#ifdef _KERNEL
struct pslist_entry bif_iflist_entry;
struct pslist_head bif_dlist_head;
+ struct psref_target bif_psref;
#endif
};
Index: src/sys/net/if.c
diff -u src/sys/net/if.c:1.375 src/sys/net/if.c:1.376
--- src/sys/net/if.c:1.375 Wed Jan 25 03:04:21 2017
+++ src/sys/net/if.c Thu Feb 9 09:30:26 2017
@@ -1,4 +1,4 @@
-/* $NetBSD: if.c,v 1.375 2017/01/25 03:04:21 christos Exp $ */
+/* $NetBSD: if.c,v 1.376 2017/02/09 09:30:26 ozaki-r Exp $ */
/*-
* Copyright (c) 1999, 2000, 2001, 2008 The NetBSD Foundation, Inc.
@@ -90,7 +90,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: if.c,v 1.375 2017/01/25 03:04:21 christos Exp $");
+__KERNEL_RCSID(0, "$NetBSD: if.c,v 1.376 2017/02/09 09:30:26 ozaki-r Exp $");
#if defined(_KERNEL_OPT)
#include "opt_inet.h"
@@ -774,13 +774,7 @@ if_percpuq_softint(void *arg)
while ((m = if_percpuq_dequeue(ipq)) != NULL) {
ifp->if_ipackets++;
-#ifndef NET_MPSAFE
- KERNEL_LOCK(1, NULL);
-#endif
bpf_mtap(ifp, m);
-#ifndef NET_MPSAFE
- KERNEL_UNLOCK_ONE(NULL);
-#endif
ifp->_if_input(ifp, m);
}
@@ -1072,13 +1066,7 @@ if_input(struct ifnet *ifp, struct mbuf
KASSERT(!cpu_intr_p());
ifp->if_ipackets++;
-#ifndef NET_MPSAFE
- KERNEL_LOCK(1, NULL);
-#endif
bpf_mtap(ifp, m);
-#ifndef NET_MPSAFE
- KERNEL_UNLOCK_ONE(NULL);
-#endif
ifp->_if_input(ifp, m);
}