Re: bpf_mtap(9) w/o KERNEL_LOCK

2016-12-13 Thread Alexander Bluhm
On Mon, Dec 12, 2016 at 03:22:36PM +0100, Martin Pieuchot wrote:
> @@ -56,15 +56,17 @@ struct bpf_d {
...
> - struct bpf_if * bd_bif; /* interface descriptor */
> + int __in_uiomove;
> +
> + struct bpf_if  *bd_bif; /* interface descriptor */

If __in_uiomove is a debug trick, write that in a comment.  As far
as I know, the __ prefix means it is a compiler extension.  So
prefix the field with bd_ instead, and let the comment tell us we
can remove it in a year.

otherwise OK bluhm@



Re: bpf_mtap(9) w/o KERNEL_LOCK

2016-12-12 Thread Martin Pieuchot
On 10/12/16(Sat) 23:01, Jonathan Matthew wrote:
> On Mon, Nov 28, 2016 at 04:01:14PM +0100, Martin Pieuchot wrote:
> > Last diff to trade the KERNEL_LOCK for a mutex in order to protect data
> > accessed inside bpf_catchpacket().
> > 
> > Note about the multiple data structures:
> > 
> >   - selwakeup() is called in a thread context (task) so we rely on the
> > KERNEL_LOCK() to serialize access to kqueue(9) data.
> > 
> >   - the global list of descriptors ``bpf_d_list``, accessed via 
> > bpfilter_lookup() is also protected by the KERNEL_LOCK().
> > 
> >   - bpf_get() increments the refcount of a descriptor.  It is needed
> > to increment it around ifpromisc() which can sleep.
> > 
> >   - A mutex now protects the rotating buffers and their associated
> > fields.
> >  
> >  - The dance around uiomove(9) is here to check that buffers aren't
> >   rotated while data is copied to userland.  Setting ``d->bd_fbuf''
> >   to NULL should be enough to let bpf_catchpacket() drop the
> >   packet.  But I added ``__in_uiomove'' to be able to have a usable
> >   panic if something weird happens.
> > 
> > Comments, oks?
> 
> The thing I'm concerned about here is the call to bpf_wakeup() from
> bpf_catchpacket() - this modifies the bpf_d refcount outside the kernel lock.
> Elsewhere the refcount is modified while holding the kernel lock, with or
> without the mutex.  It'll never free the bpf_d there, since the bpf_d list
> still holds a reference, but the refcount operations aren't atomic, so the
> counter could get messed up.  Is there a reason not to use atomics there?

Good point.

Here's an updated diff that uses the atomics and addresses Alexander's
comments.
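
For reference, a minimal sketch of what the atomic reference counting
can look like, using atomic_inc_int(9)/atomic_dec_int_nv(9).  This is
an illustration only; the field name ``bd_ref'' and the free path are
assumptions, not necessarily the exact diff below:

	void
	bpf_get(struct bpf_d *bd)
	{
		atomic_inc_int(&bd->bd_ref);
	}

	void
	bpf_put(struct bpf_d *bd)
	{
		/* Free the descriptor once the last reference is gone. */
		if (atomic_dec_int_nv(&bd->bd_ref) > 0)
			return;
		free(bd, M_DEVBUF, sizeof(*bd));
	}

With this, bpf_wakeup() can take and drop a reference without holding
the kernel lock.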


Index: net/bpf.c
===
RCS file: /cvs/src/sys/net/bpf.c,v
retrieving revision 1.155
diff -u -p -r1.155 bpf.c
--- net/bpf.c   28 Nov 2016 10:16:08 -  1.155
+++ net/bpf.c   12 Dec 2016 14:18:38 -
@@ -116,6 +116,9 @@ int bpf_sysctl_locked(int *, u_int, void
 
 struct bpf_d *bpfilter_lookup(int);
 
+/*
+ * Called holding ``bd_mtx''.
+ */
 void   bpf_attachd(struct bpf_d *, struct bpf_if *);
 void   bpf_detachd(struct bpf_d *);
 void   bpf_resetd(struct bpf_d *);
@@ -260,11 +263,12 @@ bpf_movein(struct uio *uio, u_int linkty
 
 /*
  * Attach file to the bpf interface, i.e. make d listen on bp.
- * Must be called at splnet.
  */
 void
 bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
 {
+   MUTEX_ASSERT_LOCKED(&d->bd_mtx);
+
/*
 * Point d at bp, and add d to the interface's list of listeners.
 * Finally, point the driver's bpf cookie at the interface so
@@ -287,6 +291,8 @@ bpf_detachd(struct bpf_d *d)
 {
struct bpf_if *bp;
 
+   MUTEX_ASSERT_LOCKED(&d->bd_mtx);
+
bp = d->bd_bif;
/* Not attached. */
if (bp == NULL)
@@ -313,7 +319,13 @@ bpf_detachd(struct bpf_d *d)
int error;
 
d->bd_promisc = 0;
+
+   bpf_get(d);
+   mtx_leave(&d->bd_mtx);
error = ifpromisc(bp->bif_ifp, 0);
+   mtx_enter(&d->bd_mtx);
+   bpf_put(d);
+
if (error && !(error == EINVAL || error == ENODEV))
/*
 * Something is really wrong if we were able to put
@@ -353,6 +365,7 @@ bpfopen(dev_t dev, int flag, int mode, s
bd->bd_unit = unit;
bd->bd_bufsize = bpf_bufsize;
bd->bd_sig = SIGIO;
+   mtx_init(&bd->bd_mtx, IPL_NET);
task_set(&bd->bd_wake_task, bpf_wakeup_cb, bd);
 
if (flag & FNONBLOCK)
@@ -372,15 +385,14 @@ int
 bpfclose(dev_t dev, int flag, int mode, struct proc *p)
 {
struct bpf_d *d;
-   int s;
 
d = bpfilter_lookup(minor(dev));
-   s = splnet();
+   mtx_enter(&d->bd_mtx);
bpf_detachd(d);
bpf_wakeup(d);
LIST_REMOVE(d, bd_list);
+   mtx_leave(&d->bd_mtx);
bpf_put(d);
-   splx(s);
 
return (0);
 }
@@ -391,11 +403,13 @@ bpfclose(dev_t dev, int flag, int mode, 
  * Zero the length of the new store buffer.
  */
 #define ROTATE_BUFFERS(d) \
+   KASSERT(d->__in_uiomove == 0); \
+   MUTEX_ASSERT_LOCKED(&d->bd_mtx); \
(d)->bd_hbuf = (d)->bd_sbuf; \
(d)->bd_hlen = (d)->bd_slen; \
(d)->bd_sbuf = (d)->bd_fbuf; \
(d)->bd_slen = 0; \
-   (d)->bd_fbuf = 0;
+   (d)->bd_fbuf = NULL;
 /*
  *  bpfread - read next chunk of packets from buffers
  */
@@ -403,15 +417,17 @@ int
 bpfread(dev_t dev, struct uio *uio, int ioflag)
 {
struct bpf_d *d;
-   int error;
-   int s;
+   caddr_t hbuf;
+   int hlen, error;
+
+   KERNEL_ASSERT_LOCKED();
 
d = bpfilter_lookup(minor(dev));
if (d->bd_bif == NULL)
return (ENXIO);
 
-   s = splnet();
bpf_get(d);
+   mtx_enter(&d->bd_mtx);
 
/*
 * Restrict application to use a buffer the same size as
@@ -460,8 +476,8 @@ bpfread(dev_t dev, struct uio 

Re: bpf_mtap(9) w/o KERNEL_LOCK

2016-12-12 Thread Martin Pieuchot
On 07/12/16(Wed) 00:44, Alexander Bluhm wrote:
> On Mon, Nov 28, 2016 at 04:01:14PM +0100, Martin Pieuchot wrote:
> > @@ -717,7 +753,9 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t 
> > *(u_int *)addr = size = bpf_maxbufsize;
> > else if (size < BPF_MINBUFSIZE)
> > *(u_int *)addr = size = BPF_MINBUFSIZE;
> > +   mtx_enter(&d->bd_mtx);
> > d->bd_bufsize = size;
> > +   mtx_leave(&d->bd_mtx);
> > }
> > break;
> >  
> 
> bd_bufsize needs protection, but you only add it in BIOCSBLEN;
> you should also put a mutex in BIOCGBLEN.

I'm not sure I understand why.  The value is only written in ioctl(2)
context.  The mutex is here to make sure the value won't change while
bpf_catchpacket() is executed.
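
In other words, the pair looks roughly like this.  A sketch only: the
size check mirrors the one in bpf_catchpacket(), where ``totlen'' is
the amount about to be stored:

	/* ioctl(2) side, BIOCSBLEN: */
	mtx_enter(&d->bd_mtx);
	d->bd_bufsize = size;
	mtx_leave(&d->bd_mtx);

	/* bpf_catchpacket() side, called with ``bd_mtx'' held: */
	MUTEX_ASSERT_LOCKED(&d->bd_mtx);
	if (totlen > d->bd_bufsize)
		totlen = d->bd_bufsize;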

> > @@ -778,30 +815,36 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t 
> >  * Get device parameters.
> >  */
> > case BIOCGDLT:
> > +   mtx_enter(&d->bd_mtx);
> > if (d->bd_bif == NULL)
> > error = EINVAL;
> > else
> > *(u_int *)addr = d->bd_bif->bif_dlt;
> > +   mtx_leave(&d->bd_mtx);
> > break;
> 
> bd_bif is not protected elsewhere.  bif_dlt is only set during attach.
> So I think you need no mutex here.

Indeed.

> >  
> > /*
> >  * Set device parameters.
> >  */
> > case BIOCSDLT:
> > +   mtx_enter(&d->bd_mtx);
> > if (d->bd_bif == NULL)
> > error = EINVAL;
> > else
> > error = bpf_setdlt(d, *(u_int *)addr);
> > +   mtx_leave(&d->bd_mtx);
> > break;
> 
> bd_bif needs no protection, so put the mutex only in the else path.
> Then it is obvious why it is there.

Ack.
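
That is, something along these lines (a sketch of the suggested shape,
not the final hunk):

	case BIOCSDLT:
		if (d->bd_bif == NULL)
			error = EINVAL;
		else {
			mtx_enter(&d->bd_mtx);
			error = bpf_setdlt(d, *(u_int *)addr);
			mtx_leave(&d->bd_mtx);
		}
		break;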

> >  
> > /*
> >  * Set interface name.
> >  */
> > case BIOCGETIF:
> > +   mtx_enter(&d->bd_mtx);
> > if (d->bd_bif == NULL)
> > error = EINVAL;
> > else
> > bpf_ifname(d->bd_bif->bif_ifp, (struct ifreq *)addr);
> > +   mtx_leave(&d->bd_mtx);
> > break;
> 
> The interface name is not used by bpf without kernel lock.  You do
> not need a mutex here.

Ack.

> > @@ -1151,6 +1197,8 @@ filt_bpfread(struct knote *kn, long hint
> >  {
> > struct bpf_d *d = kn->kn_hook;
> >  
> > +   KERNEL_ASSERT_LOCKED();
> > +
> > kn->kn_data = d->bd_hlen;
> > if (d->bd_immediate)
> > kn->kn_data += d->bd_slen;
> 
> You need the mutex here to access bd_slen.

Ack.

> > @@ -1232,12 +1279,10 @@ _bpf_mtap(caddr_t arg, const struct mbuf
> > if (!gottime++)
> > microtime(&tv);
> >  
> > -   KERNEL_LOCK();
> > -   s = splnet();
> > +   mtx_enter(&d->bd_mtx);
> > bpf_catchpacket(d, (u_char *)m, pktlen, slen, cpfn,
> > &tv);
> > -   splx(s);
> > -   KERNEL_UNLOCK();
> > +   mtx_leave(&d->bd_mtx);
> >  
> > if (d->bd_fildrop)
> > drop = 1;
> 
> bd_fildrop can be changed by ioctl.  Can we expect to read the
> correct value without a memory barrier?
> 
> Same question applies to bd_dirfilt.

We probably need some.  I guess the current code works because these
events only happen occasionally.

> bd_rcount is a long value incremented atomically in _bpf_mtap().
> It is read from another CPU with kernel lock during an ioctl.  Is
> this sufficient to get a consistent value?  Do we also need an atomic
> operation when reading it?

Do we need a consistent value?

> > --- net/bpfdesc.h   22 Aug 2016 10:40:36 -  1.31
> > +++ net/bpfdesc.h   28 Nov 2016 14:51:06 -
> > @@ -56,15 +56,17 @@ struct bpf_d {
> >  *   fbuf (free) - When read is done, put cluster here.
> >  * On receiving, if sbuf is full and fbuf is 0, packet is dropped.
> >  */
> > +   struct mutex    bd_mtx; /* protect buffer slots below */
> > caddr_t bd_sbuf;/* store slot */
> > caddr_t bd_hbuf;/* hold slot */
> > caddr_t bd_fbuf;/* free slot */
> > int bd_slen;/* current length of store buffer */
> > int bd_hlen;/* current length of hold buffer */
> > -
> > int bd_bufsize; /* absolute length of buffers */
> 
> So bd_mtx is protecting the fields above.  Everything below must
> either not be accessed without the kernel lock, or only be set during
> attach.  Is that correct or do I have a wrong understanding of the
> SMP model?

I'm not sure I understood your sentence, but I think you got the model
right :)

> > -   struct bpf_if * bd_bif; /* interface descriptor */
> > +   int __in_uiomove;
> 
> Why do you prefix it with __ ?  bd_uiomove would be more consistent.

Because it's a 

Re: bpf_mtap(9) w/o KERNEL_LOCK

2016-12-10 Thread Jonathan Matthew
On Mon, Nov 28, 2016 at 04:01:14PM +0100, Martin Pieuchot wrote:
> Last diff to trade the KERNEL_LOCK for a mutex in order to protect data
> accessed inside bpf_catchpacket().
> 
> Note about the multiple data structures:
> 
>   - selwakeup() is called in a thread context (task) so we rely on the
> KERNEL_LOCK() to serialize access to kqueue(9) data.
> 
>   - the global list of descriptors ``bpf_d_list``, accessed via 
> bpfilter_lookup() is also protected by the KERNEL_LOCK().
> 
>   - bpf_get() increments the refcount of a descriptor.  It is needed
> to increment it around ifpromisc() which can sleep.
> 
>   - A mutex now protects the rotating buffers and their associated
> fields.
>  
>  - The dance around uiomove(9) is here to check that buffers aren't
>   rotated while data is copied to userland.  Setting ``d->bd_fbuf''
>   to NULL should be enough to let bpf_catchpacket() drop the
>   packet.  But I added ``__in_uiomove'' to be able to have a usable
>   panic if something weird happens.
> 
> Comments, oks?

The thing I'm concerned about here is the call to bpf_wakeup() from
bpf_catchpacket() - this modifies the bpf_d refcount outside the kernel lock.
Elsewhere the refcount is modified while holding the kernel lock, with or
without the mutex.  It'll never free the bpf_d there, since the bpf_d list
still holds a reference, but the refcount operations aren't atomic, so the
counter could get messed up.  Is there a reason not to use atomics there?



Re: bpf_mtap(9) w/o KERNEL_LOCK

2016-12-06 Thread Alexander Bluhm
On Mon, Nov 28, 2016 at 04:01:14PM +0100, Martin Pieuchot wrote:

> @@ -313,7 +319,13 @@ bpf_detachd(struct bpf_d *d)
>   int error;
>  
>   d->bd_promisc = 0;
> +
> + bpf_get(d);
> + mtx_leave(&d->bd_mtx);
>   error = ifpromisc(bp->bif_ifp, 0);
> + mtx_enter(&d->bd_mtx);
> + bpf_put(d);
> +

If I see something like this, I ask myself whether bp has to be
protected by the mutex like this:

bpf_get(d);
ifp = bp->bif_ifp;
mtx_leave(&d->bd_mtx);
error = ifpromisc(ifp, 0);
mtx_enter(&d->bd_mtx);
bpf_put(d);

But I think your code is safe as bd_bif is only modified with kernel
lock.

> @@ -542,6 +568,7 @@ bpf_wakeup_cb(void *xd)
>  
>   selwakeup(&d->bd_sel);
>   bpf_put(d);
> +
>  }
>  
>  int

This new line is wrong.

> @@ -717,7 +753,9 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t 
>   *(u_int *)addr = size = bpf_maxbufsize;
>   else if (size < BPF_MINBUFSIZE)
>   *(u_int *)addr = size = BPF_MINBUFSIZE;
> + mtx_enter(&d->bd_mtx);
>   d->bd_bufsize = size;
> + mtx_leave(&d->bd_mtx);
>   }
>   break;
>  

bd_bufsize needs protection, but you only add it in BIOCSBLEN;
you should also put a mutex in BIOCGBLEN.

> @@ -778,30 +815,36 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t 
>* Get device parameters.
>*/
>   case BIOCGDLT:
> + mtx_enter(&d->bd_mtx);
>   if (d->bd_bif == NULL)
>   error = EINVAL;
>   else
>   *(u_int *)addr = d->bd_bif->bif_dlt;
> + mtx_leave(&d->bd_mtx);
>   break;

bd_bif is not protected elsewhere.  bif_dlt is only set during attach.
So I think you need no mutex here.

>  
>   /*
>* Set device parameters.
>*/
>   case BIOCSDLT:
> + mtx_enter(&d->bd_mtx);
>   if (d->bd_bif == NULL)
>   error = EINVAL;
>   else
>   error = bpf_setdlt(d, *(u_int *)addr);
> + mtx_leave(&d->bd_mtx);
>   break;

bd_bif needs no protection, so put the mutex only in the else path.
Then it is obvious why it is there.

>  
>   /*
>* Set interface name.
>*/
>   case BIOCGETIF:
> + mtx_enter(&d->bd_mtx);
>   if (d->bd_bif == NULL)
>   error = EINVAL;
>   else
>   bpf_ifname(d->bd_bif->bif_ifp, (struct ifreq *)addr);
> + mtx_leave(&d->bd_mtx);
>   break;

The interface name is not used by bpf without kernel lock.  You do
not need a mutex here.

> @@ -1151,6 +1197,8 @@ filt_bpfread(struct knote *kn, long hint
>  {
>   struct bpf_d *d = kn->kn_hook;
>  
> + KERNEL_ASSERT_LOCKED();
> +
>   kn->kn_data = d->bd_hlen;
>   if (d->bd_immediate)
>   kn->kn_data += d->bd_slen;

You need the mutex here to access bd_slen.

> @@ -1232,12 +1279,10 @@ _bpf_mtap(caddr_t arg, const struct mbuf
>   if (!gottime++)
>   microtime(&tv);
>  
> - KERNEL_LOCK();
> - s = splnet();
> + mtx_enter(&d->bd_mtx);
>   bpf_catchpacket(d, (u_char *)m, pktlen, slen, cpfn,
>   &tv);
> - splx(s);
> - KERNEL_UNLOCK();
> + mtx_leave(&d->bd_mtx);
>  
>   if (d->bd_fildrop)
>   drop = 1;

bd_fildrop can be changed by ioctl.  Can we expect to read the
correct value without a memory barrier?

Same question applies to bd_dirfilt.

bd_rcount is a long value incremented atomically in _bpf_mtap().
It is read from another CPU with kernel lock during an ioctl.  Is
this sufficient to get a consistent value?  Do we also need an atomic
operation when reading it?

> --- net/bpfdesc.h 22 Aug 2016 10:40:36 -  1.31
> +++ net/bpfdesc.h 28 Nov 2016 14:51:06 -
> @@ -56,15 +56,17 @@ struct bpf_d {
>*   fbuf (free) - When read is done, put cluster here.
>* On receiving, if sbuf is full and fbuf is 0, packet is dropped.
>*/
> + struct mutex bd_mtx; /* protect buffer slots below */
>   caddr_t bd_sbuf;/* store slot */
>   caddr_t bd_hbuf;/* hold slot */
>   caddr_t bd_fbuf;/* free slot */
>   int bd_slen;/* current length of store buffer */
>   int bd_hlen;/* current length of hold buffer */
> -
>   int bd_bufsize; /* absolute length of buffers */

So bd_mtx is protecting the fields above.  Everything below must
either not be accessed without the kernel lock, or only be set during
attach.  Is that correct or do I have a wrong understanding of the
SMP model?

bpf_mtap(9) w/o KERNEL_LOCK

2016-11-28 Thread Martin Pieuchot
Last diff to trade the KERNEL_LOCK for a mutex in order to protect data
accessed inside bpf_catchpacket().

Note about the multiple data structures:

  - selwakeup() is called in a thread context (task) so we rely on the
KERNEL_LOCK() to serialize access to kqueue(9) data.

  - the global list of descriptors ``bpf_d_list``, accessed via 
bpfilter_lookup() is also protected by the KERNEL_LOCK().

  - bpf_get() increments the refcount of a descriptor.  It is needed
to increment it around ifpromisc() which can sleep.

  - A mutex now protects the rotating buffers and their associated
fields.
 
 - The dance around uiomove(9) is here to check that buffers aren't
  rotated while data is copied to userland.  Setting ``d->bd_fbuf''
  to NULL should be enough to let bpf_catchpacket() drop the
  packet.  But I added ``__in_uiomove'' to be able to have a usable
  panic if something weird happens.

Comments, oks?
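
Roughly, the dance looks like this (a sketch using the names from the
diff below; the exact ordering in the diff may differ):

	mtx_enter(&d->bd_mtx);
	hbuf = d->bd_hbuf;
	hlen = d->bd_hlen;
	d->bd_hbuf = NULL;
	d->bd_hlen = 0;
	d->bd_fbuf = NULL;	/* no free slot: bpf_catchpacket() drops */
	d->__in_uiomove = 1;
	mtx_leave(&d->bd_mtx);

	/* uiomove(9) can sleep, so it runs without the mutex. */
	error = uiomove(hbuf, hlen, uio);

	mtx_enter(&d->bd_mtx);
	KASSERT(d->bd_fbuf == NULL);	/* nobody rotated the buffers */
	d->bd_fbuf = hbuf;		/* hand the buffer back as free slot */
	d->__in_uiomove = 0;
	mtx_leave(&d->bd_mtx);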

Index: net/bpf.c
===
RCS file: /cvs/src/sys/net/bpf.c,v
retrieving revision 1.155
diff -u -p -r1.155 bpf.c
--- net/bpf.c   28 Nov 2016 10:16:08 -  1.155
+++ net/bpf.c   28 Nov 2016 14:51:06 -
@@ -116,6 +116,9 @@ int bpf_sysctl_locked(int *, u_int, void
 
 struct bpf_d *bpfilter_lookup(int);
 
+/*
+ * Called holding ``bd_mtx''.
+ */
 void   bpf_attachd(struct bpf_d *, struct bpf_if *);
 void   bpf_detachd(struct bpf_d *);
 void   bpf_resetd(struct bpf_d *);
@@ -260,11 +263,12 @@ bpf_movein(struct uio *uio, u_int linkty
 
 /*
  * Attach file to the bpf interface, i.e. make d listen on bp.
- * Must be called at splnet.
  */
 void
 bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
 {
+   MUTEX_ASSERT_LOCKED(&d->bd_mtx);
+
/*
 * Point d at bp, and add d to the interface's list of listeners.
 * Finally, point the driver's bpf cookie at the interface so
@@ -287,6 +291,8 @@ bpf_detachd(struct bpf_d *d)
 {
struct bpf_if *bp;
 
+   MUTEX_ASSERT_LOCKED(&d->bd_mtx);
+
bp = d->bd_bif;
/* Not attached. */
if (bp == NULL)
@@ -313,7 +319,13 @@ bpf_detachd(struct bpf_d *d)
int error;
 
d->bd_promisc = 0;
+
+   bpf_get(d);
+   mtx_leave(&d->bd_mtx);
error = ifpromisc(bp->bif_ifp, 0);
+   mtx_enter(&d->bd_mtx);
+   bpf_put(d);
+
if (error && !(error == EINVAL || error == ENODEV))
/*
 * Something is really wrong if we were able to put
@@ -353,6 +365,7 @@ bpfopen(dev_t dev, int flag, int mode, s
bd->bd_unit = unit;
bd->bd_bufsize = bpf_bufsize;
bd->bd_sig = SIGIO;
+   mtx_init(&bd->bd_mtx, IPL_NET);
task_set(&bd->bd_wake_task, bpf_wakeup_cb, bd);
 
if (flag & FNONBLOCK)
@@ -372,15 +385,14 @@ int
 bpfclose(dev_t dev, int flag, int mode, struct proc *p)
 {
struct bpf_d *d;
-   int s;
 
d = bpfilter_lookup(minor(dev));
-   s = splnet();
+   mtx_enter(&d->bd_mtx);
bpf_detachd(d);
bpf_wakeup(d);
LIST_REMOVE(d, bd_list);
+   mtx_leave(&d->bd_mtx);
bpf_put(d);
-   splx(s);
 
return (0);
 }
@@ -391,11 +403,13 @@ bpfclose(dev_t dev, int flag, int mode, 
  * Zero the length of the new store buffer.
  */
 #define ROTATE_BUFFERS(d) \
+   KASSERT(d->__in_uiomove == 0); \
+   MUTEX_ASSERT_LOCKED(&d->bd_mtx); \
(d)->bd_hbuf = (d)->bd_sbuf; \
(d)->bd_hlen = (d)->bd_slen; \
(d)->bd_sbuf = (d)->bd_fbuf; \
(d)->bd_slen = 0; \
-   (d)->bd_fbuf = 0;
+   (d)->bd_fbuf = NULL;
 /*
  *  bpfread - read next chunk of packets from buffers
  */
@@ -403,15 +417,17 @@ int
 bpfread(dev_t dev, struct uio *uio, int ioflag)
 {
struct bpf_d *d;
-   int error;
-   int s;
+   caddr_t hbuf;
+   int hlen, error;
+
+   KERNEL_ASSERT_LOCKED();
 
d = bpfilter_lookup(minor(dev));
if (d->bd_bif == NULL)
return (ENXIO);
 
-   s = splnet();
bpf_get(d);
+   mtx_enter(&d->bd_mtx);
 
/*
 * Restrict application to use a buffer the same size as
@@ -460,8 +476,8 @@ bpfread(dev_t dev, struct uio *uio, int 
error = EWOULDBLOCK;
} else {
if ((d->bd_rdStart + d->bd_rtout) < ticks) {
-   error = tsleep((caddr_t)d, PRINET|PCATCH, "bpf",
-   d->bd_rtout);
+   error = msleep(d, &d->bd_mtx, PRINET|PCATCH,
+   "bpf", d->bd_rtout);
} else
error = EWOULDBLOCK;
}
@@ -492,22 +508,30 @@ bpfread(dev_t dev, struct uio *uio, int 
/*
 * At this point, we know we have something in the hold slot.
 */
-   splx(s);
+   hbuf = d->bd_hbuf;
+   hlen = d->bd_hlen;
+   d->bd_hbuf = NULL;

Re: bpf_mtap(9) w/o KERNEL_LOCK

2016-11-23 Thread Alexander Bluhm
On Tue, Nov 22, 2016 at 11:54:47AM +0100, Martin Pieuchot wrote:
> Next extracted diff that tweaks bpf_detachd().
> 
> The goal here is to remove ``d'' from the list and NULLify ``d->bd_bif''
> before calling ifpromisc().
> 
> The reason is that ifpromisc() can sleep.  Think USB ;)  So we'll have to
> release the mutex before calling it.  So we want to make sure ``d'' is no
> longer in the interface descriptor list at this point.
> 
> ok?

OK bluhm@

> 
> Index: net/bpf.c
> ===
> RCS file: /cvs/src/sys/net/bpf.c,v
> retrieving revision 1.154
> diff -u -p -r1.154 bpf.c
> --- net/bpf.c 21 Nov 2016 09:15:40 -  1.154
> +++ net/bpf.c 22 Nov 2016 10:47:05 -
> @@ -288,6 +288,23 @@ bpf_detachd(struct bpf_d *d)
>   struct bpf_if *bp;
>  
>   bp = d->bd_bif;
> + /* Not attached. */
> + if (bp == NULL)
> + return;
> +
> + /* Remove ``d'' from the interface's descriptor list. */
> + KERNEL_ASSERT_LOCKED();
> + SRPL_REMOVE_LOCKED(&bpf_d_rc, &bp->bif_dlist, d, bpf_d, bd_next);
> +
> + if (SRPL_EMPTY_LOCKED(&bp->bif_dlist)) {
> + /*
> +  * Let the driver know that there are no more listeners.
> +  */
> + *bp->bif_driverp = NULL;
> + }
> +
> + d->bd_bif = NULL;
> +
>   /*
>* Check if this descriptor had requested promiscuous mode.
>* If so, turn it off.
> @@ -305,19 +322,6 @@ bpf_detachd(struct bpf_d *d)
>*/
>   panic("bpf: ifpromisc failed");
>   }
> -
> - /* Remove d from the interface's descriptor list. */
> - KERNEL_ASSERT_LOCKED();
> - SRPL_REMOVE_LOCKED(&bpf_d_rc, &bp->bif_dlist, d, bpf_d, bd_next);
> -
> - if (SRPL_EMPTY_LOCKED(&bp->bif_dlist)) {
> - /*
> -  * Let the driver know that there are no more listeners.
> -  */
> - *d->bd_bif->bif_driverp = 0;
> - }
> -
> - d->bd_bif = NULL;
>  }
>  
>  void
> @@ -372,8 +376,7 @@ bpfclose(dev_t dev, int flag, int mode, 
>  
>   d = bpfilter_lookup(minor(dev));
>   s = splnet();
> - if (d->bd_bif)
> - bpf_detachd(d);
> + bpf_detachd(d);
>   bpf_wakeup(d);
>   LIST_REMOVE(d, bd_list);
>   bpf_put(d);
> @@ -1033,12 +1036,10 @@ bpf_setif(struct bpf_d *d, struct ifreq 
>   goto out;
>   }
>   if (candidate != d->bd_bif) {
> - if (d->bd_bif)
> - /*
> -  * Detach if attached to something else.
> -  */
> - bpf_detachd(d);
> -
> + /*
> +  * Detach if attached to something else.
> +  */
> + bpf_detachd(d);
>   bpf_attachd(d, candidate);
>   }
>   bpf_resetd(d);



Re: bpf_mtap(9) w/o KERNEL_LOCK

2016-11-22 Thread Martin Pieuchot
On 13/09/16(Tue) 12:23, Martin Pieuchot wrote:
> Here's the big scary diff I've been using for some months now to stop
> grabbing the KERNEL_LOCK() in bpf_mtap(9).  This has been originally
> written to prevent lock ordering inside pf_test().  Now that we're
> heading toward using a rwlock, we won't have this problem, but fewer
> usages of KERNEL_LOCK() is still interesting.
> 
> I'm going to split this diff in small chunks to ease the review.  But
> I'd appreciate if people could try to break it, test & report back.
> 
> Some notes:
> 
>   - Now that selwakeup() is called in a thread context (task) we only
> rely on the KERNEL_LOCK() to serialize access to kqueue(9) data.
> 
>   - The reference counting is here to make sure a descriptor is not
> freed during a sleep.  That's why the KERNEL_LOCK() is still
> necessary in the slow path.  On the other hand bpf_catchpacket()
> relies on the reference guaranteed by the SRP list.
> 
>   - A mutex now protects the rotating buffers and their associated
> fields.  It is dropped before calling ifpromisc() because USB
> devices sleep.
> 
>   - The dance around uiomove(9) is here to check that buffers aren't
> rotated while data is copied to userland.  Setting ``d->bd_fbuf''
> to NULL should be enough to let bpf_catchpacket() drop the
> packet.  But I added ``__in_uiomove'' to be able to have a usable
> panic if something weird happens.

Next extracted diff that tweaks bpf_detachd().

The goal here is to remove ``d'' from the list and NULLify ``d->bd_bif''
before calling ifpromisc().

The reason is that ifpromisc() can sleep.  Think USB ;)  So we'll have to
release the mutex before calling it.  So we want to make sure ``d'' is no
longer in the interface descriptor list at this point.

ok?

Index: net/bpf.c
===
RCS file: /cvs/src/sys/net/bpf.c,v
retrieving revision 1.154
diff -u -p -r1.154 bpf.c
--- net/bpf.c   21 Nov 2016 09:15:40 -  1.154
+++ net/bpf.c   22 Nov 2016 10:47:05 -
@@ -288,6 +288,23 @@ bpf_detachd(struct bpf_d *d)
struct bpf_if *bp;
 
bp = d->bd_bif;
+   /* Not attached. */
+   if (bp == NULL)
+   return;
+
+   /* Remove ``d'' from the interface's descriptor list. */
+   KERNEL_ASSERT_LOCKED();
+   SRPL_REMOVE_LOCKED(&bpf_d_rc, &bp->bif_dlist, d, bpf_d, bd_next);
+
+   if (SRPL_EMPTY_LOCKED(&bp->bif_dlist)) {
+   /*
+* Let the driver know that there are no more listeners.
+*/
+   *bp->bif_driverp = NULL;
+   }
+
+   d->bd_bif = NULL;
+
/*
 * Check if this descriptor had requested promiscuous mode.
 * If so, turn it off.
@@ -305,19 +322,6 @@ bpf_detachd(struct bpf_d *d)
 */
panic("bpf: ifpromisc failed");
}
-
-   /* Remove d from the interface's descriptor list. */
-   KERNEL_ASSERT_LOCKED();
-   SRPL_REMOVE_LOCKED(&bpf_d_rc, &bp->bif_dlist, d, bpf_d, bd_next);
-
-   if (SRPL_EMPTY_LOCKED(&bp->bif_dlist)) {
-   /*
-* Let the driver know that there are no more listeners.
-*/
-   *d->bd_bif->bif_driverp = 0;
-   }
-
-   d->bd_bif = NULL;
 }
 
 void
@@ -372,8 +376,7 @@ bpfclose(dev_t dev, int flag, int mode, 
 
d = bpfilter_lookup(minor(dev));
s = splnet();
-   if (d->bd_bif)
-   bpf_detachd(d);
+   bpf_detachd(d);
bpf_wakeup(d);
LIST_REMOVE(d, bd_list);
bpf_put(d);
@@ -1033,12 +1036,10 @@ bpf_setif(struct bpf_d *d, struct ifreq 
goto out;
}
if (candidate != d->bd_bif) {
-   if (d->bd_bif)
-   /*
-* Detach if attached to something else.
-*/
-   bpf_detachd(d);
-
+   /*
+* Detach if attached to something else.
+*/
+   bpf_detachd(d);
bpf_attachd(d, candidate);
}
bpf_resetd(d);



Re: bpf_mtap(9) w/o KERNEL_LOCK

2016-11-16 Thread Alexander Bluhm
On Wed, Nov 16, 2016 at 12:18:48PM +0100, Martin Pieuchot wrote:
> Here's another extracted diff:  Use goto in read & write and always
> increment the reference count in write. 
> 
> ok?

OK bluhm@

> 
> Index: net/bpf.c
> ===
> RCS file: /cvs/src/sys/net/bpf.c,v
> retrieving revision 1.151
> diff -u -p -r1.151 bpf.c
> --- net/bpf.c 16 Nov 2016 09:00:01 -  1.151
> +++ net/bpf.c 16 Nov 2016 11:08:25 -
> @@ -406,16 +406,17 @@ bpfread(dev_t dev, struct uio *uio, int 
>   if (d->bd_bif == NULL)
>   return (ENXIO);
>  
> + s = splnet();
> + bpf_get(d);
> +
>   /*
>* Restrict application to use a buffer the same size as
>* as kernel buffers.
>*/
> - if (uio->uio_resid != d->bd_bufsize)
> - return (EINVAL);
> -
> - s = splnet();
> -
> - bpf_get(d);
> + if (uio->uio_resid != d->bd_bufsize) {
> + error = EINVAL;
> + goto out;
> + }
>  
>   /*
>* If there's a timeout, bd_rdStart is tagged when we start the read.
> @@ -431,13 +432,12 @@ bpfread(dev_t dev, struct uio *uio, int 
>* ends when the timeout expires or when enough packets
>* have arrived to fill the store buffer.
>*/
> - while (d->bd_hbuf == 0) {
> + while (d->bd_hbuf == NULL) {
>   if (d->bd_bif == NULL) {
>   /* interface is gone */
>   if (d->bd_slen == 0) {
> - bpf_put(d);
> - splx(s);
> - return (EIO);
> + error = EIO;
> + goto out;
>   }
>   ROTATE_BUFFERS(d);
>   break;
> @@ -461,18 +461,15 @@ bpfread(dev_t dev, struct uio *uio, int 
>   } else
>   error = EWOULDBLOCK;
>   }
> - if (error == EINTR || error == ERESTART) {
> - bpf_put(d);
> - splx(s);
> - return (error);
> - }
> + if (error == EINTR || error == ERESTART)
> + goto out;
>   if (error == EWOULDBLOCK) {
>   /*
>* On a timeout, return what's in the buffer,
>* which may be nothing.  If there is something
>* in the store buffer, we can rotate the buffers.
>*/
> - if (d->bd_hbuf)
> + if (d->bd_hbuf != NULL)
>   /*
>* We filled up the buffer in between
>* getting the timeout and arriving
> @@ -481,9 +478,8 @@ bpfread(dev_t dev, struct uio *uio, int 
>   break;
>  
>   if (d->bd_slen == 0) {
> - bpf_put(d);
> - splx(s);
> - return (0);
> + error = 0;
> + goto out;
>   }
>   ROTATE_BUFFERS(d);
>   break;
> @@ -505,7 +501,7 @@ bpfread(dev_t dev, struct uio *uio, int 
>   d->bd_fbuf = d->bd_hbuf;
>   d->bd_hbuf = NULL;
>   d->bd_hlen = 0;
> -
> +out:
>   bpf_put(d);
>   splx(s);
>  
> @@ -554,32 +550,40 @@ bpfwrite(dev_t dev, struct uio *uio, int
>   struct bpf_insn *fcode = NULL;
>   int error, s;
>   struct sockaddr_storage dst;
> + u_int dlt;
>  
>   d = bpfilter_lookup(minor(dev));
>   if (d->bd_bif == NULL)
>   return (ENXIO);
>  
> + bpf_get(d);
>   ifp = d->bd_bif->bif_ifp;
>  
> - if ((ifp->if_flags & IFF_UP) == 0)
> - return (ENETDOWN);
> + if ((ifp->if_flags & IFF_UP) == 0) {
> + error = ENETDOWN;
> + goto out;
> + }
>  
> - if (uio->uio_resid == 0)
> - return (0);
> + if (uio->uio_resid == 0) {
> + error = 0;
> + goto out;
> + }
>  
>   KERNEL_ASSERT_LOCKED(); /* for accessing bd_wfilter */
> bf = srp_get_locked(&d->bd_wfilter);
>   if (bf != NULL)
>   fcode = bf->bf_insns;
>  
> - error = bpf_movein(uio, d->bd_bif->bif_dlt, &m,
> - (struct sockaddr *)&dst, fcode);
> + dlt = d->bd_bif->bif_dlt;
> +
> + error = bpf_movein(uio, dlt, &m, (struct sockaddr *)&dst, fcode);
>   if (error)
> - return (error);
> + goto out;
>  
>   if (m->m_pkthdr.len > ifp->if_mtu) {
>   m_freem(m);
> - return (EMSGSIZE);
> + error = EMSGSIZE;
> + goto out;
>   }
>  
>   m->m_pkthdr.ph_rtableid = ifp->if_rdomain;
> @@ -591,9 +595,9 @@ bpfwrite(dev_t dev, struct uio *uio, int
>   s = 

Re: bpf_mtap(9) w/o KERNEL_LOCK

2016-11-16 Thread Martin Pieuchot
On 13/09/16(Tue) 12:23, Martin Pieuchot wrote:
> Here's the big scary diff I've been using for some months now to stop
> grabbing the KERNEL_LOCK() in bpf_mtap(9).  This has been originally
> written to prevent lock ordering inside pf_test().  Now that we're
> heading toward using a rwlock, we won't have this problem, but fewer
> usages of KERNEL_LOCK() is still interesting.
> 
> I'm going to split this diff in small chunks to ease the review.  But
> I'd appreciate if people could try to break it, test & report back.
> 
> Some notes:
> 
>   - Now that selwakeup() is called in a thread context (task) we only
> rely on the KERNEL_LOCK() to serialize access to kqueue(9) data.
> 
>   - The reference counting is here to make sure a descriptor is not
> freed during a sleep.  That's why the KERNEL_LOCK() is still
> necessary in the slow path.  On the other hand bpf_catchpacket()
> relies on the reference guaranteed by the SRP list.
> 
>   - A mutex now protects the rotating buffers and their associated
> fields.  It is dropped before calling ifpromisc() because USB
> devices sleep.
> 
>   - The dance around uiomove(9) is here to check that buffers aren't
> rotated while data is copied to userland.  Setting ``d->bd_fbuf''
> to NULL should be enough to let bpf_catchpacket() drop the
> packet.  But I added ``__in_uiomove'' to be able to have a usable
> panic if something weird happens.

Here's another extracted diff:  Use goto in read & write and always
increment the reference count in write. 

ok?

Index: net/bpf.c
===
RCS file: /cvs/src/sys/net/bpf.c,v
retrieving revision 1.151
diff -u -p -r1.151 bpf.c
--- net/bpf.c   16 Nov 2016 09:00:01 -  1.151
+++ net/bpf.c   16 Nov 2016 11:08:25 -
@@ -406,16 +406,17 @@ bpfread(dev_t dev, struct uio *uio, int 
if (d->bd_bif == NULL)
return (ENXIO);
 
+   s = splnet();
+   bpf_get(d);
+
/*
 * Restrict application to use a buffer the same size as
 * as kernel buffers.
 */
-   if (uio->uio_resid != d->bd_bufsize)
-   return (EINVAL);
-
-   s = splnet();
-
-   bpf_get(d);
+   if (uio->uio_resid != d->bd_bufsize) {
+   error = EINVAL;
+   goto out;
+   }
 
/*
 * If there's a timeout, bd_rdStart is tagged when we start the read.
@@ -431,13 +432,12 @@ bpfread(dev_t dev, struct uio *uio, int 
 * ends when the timeout expires or when enough packets
 * have arrived to fill the store buffer.
 */
-   while (d->bd_hbuf == 0) {
+   while (d->bd_hbuf == NULL) {
if (d->bd_bif == NULL) {
/* interface is gone */
if (d->bd_slen == 0) {
-   bpf_put(d);
-   splx(s);
-   return (EIO);
+   error = EIO;
+   goto out;
}
ROTATE_BUFFERS(d);
break;
@@ -461,18 +461,15 @@ bpfread(dev_t dev, struct uio *uio, int 
} else
error = EWOULDBLOCK;
}
-   if (error == EINTR || error == ERESTART) {
-   bpf_put(d);
-   splx(s);
-   return (error);
-   }
+   if (error == EINTR || error == ERESTART)
+   goto out;
if (error == EWOULDBLOCK) {
/*
 * On a timeout, return what's in the buffer,
 * which may be nothing.  If there is something
 * in the store buffer, we can rotate the buffers.
 */
-   if (d->bd_hbuf)
+   if (d->bd_hbuf != NULL)
/*
 * We filled up the buffer in between
 * getting the timeout and arriving
@@ -481,9 +478,8 @@ bpfread(dev_t dev, struct uio *uio, int 
break;
 
if (d->bd_slen == 0) {
-   bpf_put(d);
-   splx(s);
-   return (0);
+   error = 0;
+   goto out;
}
ROTATE_BUFFERS(d);
break;
@@ -505,7 +501,7 @@ bpfread(dev_t dev, struct uio *uio, int 
d->bd_fbuf = d->bd_hbuf;
d->bd_hbuf = NULL;
d->bd_hlen = 0;
-
+out:
bpf_put(d);
splx(s);
 
@@ -554,32 +550,40 @@ bpfwrite(dev_t dev, struct uio *uio, int
struct bpf_insn *fcode = NULL;
int error, s;
struct sockaddr_storage dst;
+	u_int dlt;

Re: bpf_mtap(9) w/o KERNEL_LOCK

2016-11-15 Thread Alexander Bluhm
On Mon, Nov 14, 2016 at 10:07:30AM +0100, Martin Pieuchot wrote:
> Here's another extracted diff to move forward.  Let bpf_allocbufs()
> fail when it cannot allocate memory; this way we can call it while
> holding a mutex.
> 
> ok?

OK bluhm@

> 
> Index: net/bpf.c
> ===
> RCS file: /cvs/src/sys/net/bpf.c,v
> retrieving revision 1.150
> diff -u -p -r1.150 bpf.c
> --- net/bpf.c 16 Oct 2016 18:05:41 -  1.150
> +++ net/bpf.c 14 Nov 2016 09:02:51 -
> @@ -92,7 +92,7 @@ int bpf_maxbufsize = BPF_MAXBUFSIZE;
>  struct bpf_if  *bpf_iflist;
>  LIST_HEAD(, bpf_d) bpf_d_list;
>  
> -void bpf_allocbufs(struct bpf_d *);
> +int  bpf_allocbufs(struct bpf_d *);
>  void bpf_ifname(struct ifnet *, struct ifreq *);
>  int  _bpf_mtap(caddr_t, const struct mbuf *, u_int,
>   void (*)(const void *, void *, size_t));
> @@ -996,6 +996,7 @@ int
>  bpf_setif(struct bpf_d *d, struct ifreq *ifr)
>  {
>   struct bpf_if *bp, *candidate = NULL;
> + int error = 0;
>   int s;
>  
>   /*
> @@ -1012,30 +1013,33 @@ bpf_setif(struct bpf_d *d, struct ifreq 
>   candidate = bp;
>   }
>  
> - if (candidate != NULL) {
> - /*
> -  * Allocate the packet buffers if we need to.
> -  * If we're already attached to requested interface,
> -  * just flush the buffer.
> -  */
> - if (d->bd_sbuf == NULL)
> - bpf_allocbufs(d);
> - s = splnet();
> - if (candidate != d->bd_bif) {
> - if (d->bd_bif)
> - /*
> -  * Detach if attached to something else.
> -  */
> - bpf_detachd(d);
> + /* Not found. */
> + if (candidate == NULL)
> + return (ENXIO);
>  
> - bpf_attachd(d, candidate);
> - }
> - bpf_reset_d(d);
> - splx(s);
> - return (0);
> + /*
> +  * Allocate the packet buffers if we need to.
> +  * If we're already attached to requested interface,
> +  * just flush the buffer.
> +  */
> + s = splnet();
> + if (d->bd_sbuf == NULL) {
> + if ((error = bpf_allocbufs(d)))
> + goto out;
>   }
> - /* Not found. */
> - return (ENXIO);
> + if (candidate != d->bd_bif) {
> + if (d->bd_bif)
> + /*
> +  * Detach if attached to something else.
> +  */
> + bpf_detachd(d);
> +
> + bpf_attachd(d, candidate);
> + }
> + bpf_reset_d(d);
> +out:
> + splx(s);
> + return (error);
>  }
>  
>  /*
> @@ -1434,13 +1438,23 @@ bpf_catchpacket(struct bpf_d *d, u_char 
>  /*
>   * Initialize all nonzero fields of a descriptor.
>   */
> -void
> +int
>  bpf_allocbufs(struct bpf_d *d)
>  {
> - d->bd_fbuf = malloc(d->bd_bufsize, M_DEVBUF, M_WAITOK);
> - d->bd_sbuf = malloc(d->bd_bufsize, M_DEVBUF, M_WAITOK);
> + d->bd_fbuf = malloc(d->bd_bufsize, M_DEVBUF, M_NOWAIT);
> + if (d->bd_fbuf == NULL)
> + return (ENOMEM);
> +
> + d->bd_sbuf = malloc(d->bd_bufsize, M_DEVBUF, M_NOWAIT);
> + if (d->bd_sbuf == NULL) {
> + free(d->bd_fbuf, M_DEVBUF, d->bd_bufsize);
> + return (ENOMEM);
> + }
> +
>   d->bd_slen = 0;
>   d->bd_hlen = 0;
> +
> + return (0);
>  }
>  
>  void



Re: bpf_mtap(9) w/o KERNEL_LOCK

2016-11-14 Thread Martin Pieuchot
On 13/09/16(Tue) 12:23, Martin Pieuchot wrote:
> Here's the big scary diff I've been using for some months now to stop
> grabbing the KERNEL_LOCK() in bpf_mtap(9).  This has been originally
> written to prevent lock ordering inside pf_test().  Now that we're
> heading toward using a rwlock, we won't have this problem, but fewer
> usages of KERNEL_LOCK() is still interesting.
> 
> I'm going to split this diff in small chunks to ease the review.  But
> I'd appreciate if people could try to break it, test & report back.
> 
> Some notes:
> 
>   - Now that selwakeup() is called in a thread context (task) we only
> rely on the KERNEL_LOCK() to serialize access to kqueue(9) data.
> 
>   - The reference counting is here to make sure a descriptor is not
> freed during a sleep.  That's why the KERNEL_LOCK() is still
> necessary in the slow path.  On the other hand bpf_catchpacket()
> relies on the reference guaranteed by the SRP list.
> 
>   - A mutex now protects the rotating buffers and their associated
> fields.  It is dropped before calling ifpromisc() because USB
> devices sleep.
> 
>   - The dance around uiomove(9) is here to check that buffers aren't
> rotated while data is copied to userland.  Setting ``d->bd_fbuf''
> to NULL should be enough to let bpf_catchpacket() drop the
> packet.  But I added ``__in_uiomove'' to be able to have a usable
> panic if something weird happens.
> 
> Comments?

I've got many test reports but no actual review...  anyone?

Here's another extracted diff to move forward.  Let bpf_allocbufs()
fail when it cannot allocate memory; this way we can call it while
holding a mutex.

ok?

Index: net/bpf.c
===
RCS file: /cvs/src/sys/net/bpf.c,v
retrieving revision 1.150
diff -u -p -r1.150 bpf.c
--- net/bpf.c   16 Oct 2016 18:05:41 -  1.150
+++ net/bpf.c   14 Nov 2016 09:02:51 -
@@ -92,7 +92,7 @@ int bpf_maxbufsize = BPF_MAXBUFSIZE;
 struct bpf_if  *bpf_iflist;
 LIST_HEAD(, bpf_d) bpf_d_list;
 
-void   bpf_allocbufs(struct bpf_d *);
+int    bpf_allocbufs(struct bpf_d *);
 void   bpf_ifname(struct ifnet *, struct ifreq *);
 int    _bpf_mtap(caddr_t, const struct mbuf *, u_int,
void (*)(const void *, void *, size_t));
@@ -996,6 +996,7 @@ int
 bpf_setif(struct bpf_d *d, struct ifreq *ifr)
 {
struct bpf_if *bp, *candidate = NULL;
+   int error = 0;
int s;
 
/*
@@ -1012,30 +1013,33 @@ bpf_setif(struct bpf_d *d, struct ifreq 
candidate = bp;
}
 
-   if (candidate != NULL) {
-   /*
-* Allocate the packet buffers if we need to.
-* If we're already attached to requested interface,
-* just flush the buffer.
-*/
-   if (d->bd_sbuf == NULL)
-   bpf_allocbufs(d);
-   s = splnet();
-   if (candidate != d->bd_bif) {
-   if (d->bd_bif)
-   /*
-* Detach if attached to something else.
-*/
-   bpf_detachd(d);
+   /* Not found. */
+   if (candidate == NULL)
+   return (ENXIO);
 
-   bpf_attachd(d, candidate);
-   }
-   bpf_reset_d(d);
-   splx(s);
-   return (0);
+   /*
+* Allocate the packet buffers if we need to.
+* If we're already attached to requested interface,
+* just flush the buffer.
+*/
+   s = splnet();
+   if (d->bd_sbuf == NULL) {
+   if ((error = bpf_allocbufs(d)))
+   goto out;
}
-   /* Not found. */
-   return (ENXIO);
+   if (candidate != d->bd_bif) {
+   if (d->bd_bif)
+   /*
+* Detach if attached to something else.
+*/
+   bpf_detachd(d);
+
+   bpf_attachd(d, candidate);
+   }
+   bpf_reset_d(d);
+out:
+   splx(s);
+   return (error);
 }
 
 /*
@@ -1434,13 +1438,23 @@ bpf_catchpacket(struct bpf_d *d, u_char 
 /*
  * Initialize all nonzero fields of a descriptor.
  */
-void
+int
 bpf_allocbufs(struct bpf_d *d)
 {
-   d->bd_fbuf = malloc(d->bd_bufsize, M_DEVBUF, M_WAITOK);
-   d->bd_sbuf = malloc(d->bd_bufsize, M_DEVBUF, M_WAITOK);
+   d->bd_fbuf = malloc(d->bd_bufsize, M_DEVBUF, M_NOWAIT);
+   if (d->bd_fbuf == NULL)
+   return (ENOMEM);
+
+   d->bd_sbuf = malloc(d->bd_bufsize, M_DEVBUF, M_NOWAIT);
+   if (d->bd_sbuf == NULL) {
+   free(d->bd_fbuf, M_DEVBUF, d->bd_bufsize);
+   return (ENOMEM);
+   }
+
d->bd_slen = 0;
d->bd_hlen = 0;
+
+   return (0);
 }
 
 void



bpf_mtap(9) w/o KERNEL_LOCK

2016-09-13 Thread Martin Pieuchot
Here's the big scary diff I've been using for some months now to stop
grabbing the KERNEL_LOCK() in bpf_mtap(9).  This has been originally
written to prevent lock ordering inside pf_test().  Now that we're
heading toward using a rwlock, we won't have this problem, but fewer
usages of KERNEL_LOCK() is still interesting.

I'm going to split this diff in small chunks to ease the review.  But
I'd appreciate if people could try to break it, test & report back.

Some notes:

  - Now that selwakeup() is called in a thread context (task) we only
rely on the KERNEL_LOCK() to serialize access to kqueue(9) data.

  - The reference counting is here to make sure a descriptor is not
freed during a sleep.  That's why the KERNEL_LOCK() is still
necessary in the slow path.  On the other hand bpf_catchpacket()
relies on the reference guaranteed by the SRP list.

  - A mutex now protects the rotating buffers and their associated
fields.  It is dropped before calling ifpromisc() because USB
devices sleep.

  - The dance around uiomove(9) is here to check that buffers aren't
rotated while data is copied to userland.  Setting ``d->bd_fbuf''
to NULL should be enough to let bpf_catchpacket() drop the
packet.  But I added ``__in_uiomove'' to be able to have a usable
panic if something weird happens.

Comments?

Index: net/bpf.c
===
RCS file: /cvs/src/sys/net/bpf.c,v
retrieving revision 1.149
diff -u -p -r1.149 bpf.c
--- net/bpf.c   12 Sep 2016 16:24:37 -  1.149
+++ net/bpf.c   13 Sep 2016 09:56:18 -
@@ -92,15 +92,13 @@ int bpf_maxbufsize = BPF_MAXBUFSIZE;
 struct bpf_if  *bpf_iflist;
 LIST_HEAD(, bpf_d) bpf_d_list;
 
-void   bpf_allocbufs(struct bpf_d *);
+int    bpf_allocbufs(struct bpf_d *);
 void   bpf_ifname(struct ifnet *, struct ifreq *);
 int    _bpf_mtap(caddr_t, const struct mbuf *, u_int,
void (*)(const void *, void *, size_t));
 void   bpf_mcopy(const void *, void *, size_t);
 int    bpf_movein(struct uio *, u_int, struct mbuf **,
struct sockaddr *, struct bpf_insn *);
-void   bpf_attachd(struct bpf_d *, struct bpf_if *);
-void   bpf_detachd(struct bpf_d *);
 int    bpf_setif(struct bpf_d *, struct ifreq *);
 int    bpfpoll(dev_t, int, struct proc *);
 int    bpfkqfilter(dev_t, struct knote *);
@@ -108,7 +106,6 @@ void   bpf_wakeup(struct bpf_d *);
 void   bpf_wakeup_cb(void *);
 void   bpf_catchpacket(struct bpf_d *, u_char *, size_t, size_t,
void (*)(const void *, void *, size_t), struct timeval *);
-void   bpf_reset_d(struct bpf_d *);
 int    bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *);
 int    bpf_setdlt(struct bpf_d *, u_int);
 
@@ -120,6 +117,13 @@ int    bpf_sysctl_locked(int *, u_int, void
 struct bpf_d *bpfilter_lookup(int);
 
 /*
+ * Called holding ``bd_mtx''.
+ */
+void   bpf_attachd(struct bpf_d *, struct bpf_if *);
+void   bpf_detachd(struct bpf_d *);
+void   bpf_resetd(struct bpf_d *);
+
+/*
  * Reference count access to descriptor buffers
  */
 void   bpf_get(struct bpf_d *);
@@ -259,11 +263,12 @@ bpf_movein(struct uio *uio, u_int linkty
 
 /*
  * Attach file to the bpf interface, i.e. make d listen on bp.
- * Must be called at splnet.
  */
 void
 bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
 {
+   MUTEX_ASSERT_LOCKED(&d->bd_mtx);
+
/*
 * Point d at bp, and add d to the interface's list of listeners.
 * Finally, point the driver's bpf cookie at the interface so
@@ -286,7 +291,23 @@ bpf_detachd(struct bpf_d *d)
 {
struct bpf_if *bp;
 
+   MUTEX_ASSERT_LOCKED(&d->bd_mtx);
+
bp = d->bd_bif;
+
+   /* Remove d from the interface's descriptor list. */
+   KERNEL_ASSERT_LOCKED();
+   SRPL_REMOVE_LOCKED(&bpf_d_rc, &bp->bif_dlist, d, bpf_d, bd_next);
+
+   if (SRPL_EMPTY_LOCKED(&bp->bif_dlist)) {
+   /*
+* Let the driver know that there are no more listeners.
+*/
+   *bp->bif_driverp = NULL;
+   }
+
+   d->bd_bif = NULL;
+
/*
 * Check if this descriptor had requested promiscuous mode.
 * If so, turn it off.
@@ -295,7 +316,13 @@ bpf_detachd(struct bpf_d *d)
int error;
 
d->bd_promisc = 0;
+
+   bpf_get(d);
+   mtx_leave(&d->bd_mtx);
error = ifpromisc(bp->bif_ifp, 0);
+   mtx_enter(&d->bd_mtx);
+   bpf_put(d);
+
if (error && !(error == EINVAL || error == ENODEV))
/*
 * Something is really wrong if we were able to put
@@ -304,19 +331,6 @@ bpf_detachd(struct bpf_d *d)
 */
panic("bpf: ifpromisc failed");
}
-
-   /* Remove d from the interface's descriptor list. */
-   KERNEL_ASSERT_LOCKED();
-   SRPL_REMOVE_LOCKED(&bpf_d_rc, &bp->bif_dlist, d, bpf_d, bd_next);
-
-   if (SRPL_EMPTY_LOCKED(&bp->bif_dlist)) {