Re: svn commit: r267760 - head/sys/kern

2015-07-07 Thread Konstantin Belousov
On Tue, Jul 07, 2015 at 12:42:01AM +0200, Mateusz Guzik wrote:
> On Mon, Jul 06, 2015 at 12:09:58PM +0300, Konstantin Belousov wrote:
> > On Mon, Jul 06, 2015 at 08:46:02AM +0200, Mateusz Guzik wrote:
> > > diff --git a/sys/fs/devfs/devfs_vnops.c b/sys/fs/devfs/devfs_vnops.c
> > > index fe8e9ef..c7f579a 100644
> > > --- a/sys/fs/devfs/devfs_vnops.c
> > > +++ b/sys/fs/devfs/devfs_vnops.c
> > > @@ -574,6 +574,8 @@ devfs_close(struct vop_close_args *ap)
> > >   if (vp->v_data == NULL)
> > >   return (0);
> > >  
> > > + vp_locked = VOP_ISLOCKED(vp);
> > > +
> > >   /*
> > >* Hack: a tty device that is a controlling terminal
> > >* has a reference from the session structure.
> > > @@ -589,6 +591,7 @@ devfs_close(struct vop_close_args *ap)
> > >   if (vp == p->p_session->s_ttyvp) {
> > >   PROC_UNLOCK(p);
> > >   oldvp = NULL;
> > > + VOP_UNLOCK(vp, 0);
> > This opens a window where vp can be reclaimed. Then vp->v_rdev
> > is invalid and dev_refthread() below accesses random memory.
> > 
> > Might be, the easiest fix is to call dev_refthread() before the td !=
> > NULL block, but leave dsw == NULL check where it is now.
> > 
> 
> Instead I made the bare minimum to ensure that modified fields which are
> accessed here are always protected with proc lock and sess lock, which
> eliminates the need for proctree lock in problematic cases. In effect
> there are no games with relocking vnodes.
This could work.  What I want to know is what methodology was used to
ensure that added locking for other places is enough.

The proctree locking is 'natural', since session/group membership relates
to the process tree structure and owning the sx guarantees that the tree
is stable.  Your locking can work, but needs an argumentation why it is
complete.  Also, you should add some note to proc.h to indicate in the
locking annotations that reading of e.g. pg_session is safe when only
process lock is owned.

> 
> I would commit this patch separately.
> 
> Note this establishes a process lock -> vnode interlock order which
> seems fishy, but apparently does not result in lors.
> 
> diff --git a/sys/fs/devfs/devfs_vnops.c b/sys/fs/devfs/devfs_vnops.c
> index fe8e9ef..0a2a6bb 100644
> --- a/sys/fs/devfs/devfs_vnops.c
> +++ b/sys/fs/devfs/devfs_vnops.c
> @@ -586,12 +586,12 @@ devfs_close(struct vop_close_args *ap)
>   if (td != NULL) {
>   p = td->td_proc;
>   PROC_LOCK(p);
> - if (vp == p->p_session->s_ttyvp) {
> + if (vp != p->p_session->s_ttyvp) {
>   PROC_UNLOCK(p);
> + } else {
>   oldvp = NULL;
> - sx_xlock(&proctree_lock);
> + SESS_LOCK(p->p_session);
>   if (vp == p->p_session->s_ttyvp) {
> - SESS_LOCK(p->p_session);
>   VI_LOCK(vp);
>   if (count_dev(dev) == 2 &&
>   (vp->v_iflag & VI_DOOMED) == 0) {
> @@ -600,13 +600,12 @@ devfs_close(struct vop_close_args *ap)
>   oldvp = vp;
>   }
>   VI_UNLOCK(vp);
> - SESS_UNLOCK(p->p_session);
>   }
> - sx_xunlock(&proctree_lock);
> + SESS_UNLOCK(p->p_session);
> + PROC_UNLOCK(p);
>   if (oldvp != NULL)
>   vrele(oldvp);
> - } else
> - PROC_UNLOCK(p);
> + }
>   }
>   /*
>* We do not want to really close the device if it
> diff --git a/sys/kern/tty.c b/sys/kern/tty.c
> index d9d0cce..5ef5578 100644
> --- a/sys/kern/tty.c
> +++ b/sys/kern/tty.c
> @@ -1637,8 +1637,13 @@ tty_generic_ioctl(struct tty *tp, u_long cmd, void 
> *data, int fflag,
>   return (EPERM);
>   }
>  
> + PROC_LOCK(p);
> + SESS_LOCK(p->p_session);
> +
>   if (tp->t_session != NULL && tp->t_session == p->p_session) {
>   /* This is already our controlling TTY. */
> + SESS_UNLOCK(p->p_session);
> + PROC_UNLOCK(p);
>   sx_xunlock(&proctree_lock);
>   return (0);
>   }
> @@ -1657,6 +1662,8 @@ tty_generic_ioctl(struct tty *tp, u_long cmd, void 
> *data, int fflag,
>* TTYs of which the session leader has been
>* killed or the TTY revoked.
>*/
> + SESS_UNLOCK(p->p_session);
> + PROC_UNLOCK(p);
>   sx_xunlock(&proctree_lock);
>   return (EPERM);
>   }
> @@ -1665,14 +1672,13 @@ tty_generic_ioctl(struct tty *tp, u_long cmd, void 
> *data, int fflag,

Re: svn commit: r267760 - head/sys/kern

2015-07-06 Thread Mateusz Guzik
On Mon, Jul 06, 2015 at 12:09:58PM +0300, Konstantin Belousov wrote:
> On Mon, Jul 06, 2015 at 08:46:02AM +0200, Mateusz Guzik wrote:
> > diff --git a/sys/fs/devfs/devfs_vnops.c b/sys/fs/devfs/devfs_vnops.c
> > index fe8e9ef..c7f579a 100644
> > --- a/sys/fs/devfs/devfs_vnops.c
> > +++ b/sys/fs/devfs/devfs_vnops.c
> > @@ -574,6 +574,8 @@ devfs_close(struct vop_close_args *ap)
> > if (vp->v_data == NULL)
> > return (0);
> >  
> > +   vp_locked = VOP_ISLOCKED(vp);
> > +
> > /*
> >  * Hack: a tty device that is a controlling terminal
> >  * has a reference from the session structure.
> > @@ -589,6 +591,7 @@ devfs_close(struct vop_close_args *ap)
> > if (vp == p->p_session->s_ttyvp) {
> > PROC_UNLOCK(p);
> > oldvp = NULL;
> > +   VOP_UNLOCK(vp, 0);
> This opens a window where vp can be reclaimed. Then vp->v_rdev
> is invalid and dev_refthread() below accesses random memory.
> 
> Might be, the easiest fix is to call dev_refthread() before the td !=
> NULL block, but leave dsw == NULL check where it is now.
> 

Instead I made the bare minimum to ensure that modified fields which are
accessed here are always protected with proc lock and sess lock, which
eliminates the need for proctree lock in problematic cases. In effect
there are no games with relocking vnodes.

I would commit this patch separately.

Note this establishes a process lock -> vnode interlock order which
seems fishy, but apparently does not result in lors.

diff --git a/sys/fs/devfs/devfs_vnops.c b/sys/fs/devfs/devfs_vnops.c
index fe8e9ef..0a2a6bb 100644
--- a/sys/fs/devfs/devfs_vnops.c
+++ b/sys/fs/devfs/devfs_vnops.c
@@ -586,12 +586,12 @@ devfs_close(struct vop_close_args *ap)
if (td != NULL) {
p = td->td_proc;
PROC_LOCK(p);
-   if (vp == p->p_session->s_ttyvp) {
+   if (vp != p->p_session->s_ttyvp) {
PROC_UNLOCK(p);
+   } else {
oldvp = NULL;
-   sx_xlock(&proctree_lock);
+   SESS_LOCK(p->p_session);
if (vp == p->p_session->s_ttyvp) {
-   SESS_LOCK(p->p_session);
VI_LOCK(vp);
if (count_dev(dev) == 2 &&
(vp->v_iflag & VI_DOOMED) == 0) {
@@ -600,13 +600,12 @@ devfs_close(struct vop_close_args *ap)
oldvp = vp;
}
VI_UNLOCK(vp);
-   SESS_UNLOCK(p->p_session);
}
-   sx_xunlock(&proctree_lock);
+   SESS_UNLOCK(p->p_session);
+   PROC_UNLOCK(p);
if (oldvp != NULL)
vrele(oldvp);
-   } else
-   PROC_UNLOCK(p);
+   }
}
/*
 * We do not want to really close the device if it
diff --git a/sys/kern/tty.c b/sys/kern/tty.c
index d9d0cce..5ef5578 100644
--- a/sys/kern/tty.c
+++ b/sys/kern/tty.c
@@ -1637,8 +1637,13 @@ tty_generic_ioctl(struct tty *tp, u_long cmd, void 
*data, int fflag,
return (EPERM);
}
 
+   PROC_LOCK(p);
+   SESS_LOCK(p->p_session);
+
if (tp->t_session != NULL && tp->t_session == p->p_session) {
/* This is already our controlling TTY. */
+   SESS_UNLOCK(p->p_session);
+   PROC_UNLOCK(p);
sx_xunlock(&proctree_lock);
return (0);
}
@@ -1657,6 +1662,8 @@ tty_generic_ioctl(struct tty *tp, u_long cmd, void *data, 
int fflag,
 * TTYs of which the session leader has been
 * killed or the TTY revoked.
 */
+   SESS_UNLOCK(p->p_session);
+   PROC_UNLOCK(p);
sx_xunlock(&proctree_lock);
return (EPERM);
}
@@ -1665,14 +1672,13 @@ tty_generic_ioctl(struct tty *tp, u_long cmd, void 
*data, int fflag,
tp->t_session = p->p_session;
tp->t_session->s_ttyp = tp;
tp->t_sessioncnt++;
-   sx_xunlock(&proctree_lock);
 
/* Assign foreground process group. */
tp->t_pgrp = p->p_pgrp;
-   PROC_LOCK(p);
p->p_flag |= P_CONTROLT;
+   SESS_UNLOCK(p->p_session);
PROC_UNLOCK(p);
-
+   sx_xunlock(&proctree_lock);
return (0);
}
case TIOCSPGRP: {
diff --git a/sys/kern/tty_tty.c b/sys/kern/tty_tty.c
index 582b41a..dca619c 100644
--- a/sys/kern/tty_tty.c
+++ b/sys/kern/tty_tty.c
@@ -6

Re: svn commit: r267760 - head/sys/kern

2015-07-06 Thread Konstantin Belousov
On Mon, Jul 06, 2015 at 08:46:02AM +0200, Mateusz Guzik wrote:
> On Mon, Jul 06, 2015 at 07:51:35AM +0200, Mateusz Guzik wrote:
> > On Thu, Jul 17, 2014 at 03:56:38AM +0300, Konstantin Belousov wrote:
> > > On Sun, Jul 13, 2014 at 11:36:24PM +0200, Mateusz Guzik wrote:
> > > > There is an additional problem with slocked case: witness report a lor
> > > > with devfs vnodes.
> > > > 
> > > > lock order reversal:
> > > >  1st 0xf80006fe6ab0 process imagelock (process imagelock) @ 
> > > > /usr/src/sys/kern/kern_proc.c:287
> > > >  2nd 0xf80018c88240 devfs (devfs) @ 
> > > > /usr/src/sys/kern/vfs_cache.c:1241
> > > > KDB: stack backtrace:
> > > > db_trace_self_wrapper() at db_trace_self_wrapper+0x2b/frame 
> > > > 0xfe012324f120
> > > > kdb_backtrace() at kdb_backtrace+0x39/frame 0xfe012324f1d0
> > > > witness_checkorder() at witness_checkorder+0xdc2/frame 
> > > > 0xfe012324f260
> > > > __lockmgr_args() at __lockmgr_args+0x588/frame 0xfe012324f3a0
> > > > vop_stdlock() at vop_stdlock+0x3c/frame 0xfe012324f3c0
> > > > VOP_LOCK1_APV() at VOP_LOCK1_APV+0xfc/frame 0xfe012324f3f0
> > > > _vn_lock() at _vn_lock+0xaa/frame 0xfe012324f460
> > > > vn_vptocnp_locked() at vn_vptocnp_locked+0xe8/frame 0xfe012324f4d0
> > > > vn_fullpath1() at vn_fullpath1+0xb0/frame 0xfe012324f530
> > > > vn_fullpath() at vn_fullpath+0xc1/frame 0xfe012324f580
> > > > export_fd_to_sb() at export_fd_to_sb+0x489/frame 0xfe012324f7b0
> > > > kern_proc_filedesc_out() at kern_proc_filedesc_out+0x1c6/frame 
> > > > 0xfe012324f840
> > > > sysctl_kern_proc_filedesc() at sysctl_kern_proc_filedesc+0x84/frame 
> > > > 0xfe012324f900
> > > > sysctl_root_handler_locked() at sysctl_root_handler_locked+0x68/frame 
> > > > 0xfe012324f940
> > > > sysctl_root() at sysctl_root+0x18e/frame 0xfe012324f990
> > > > userland_sysctl() at userland_sysctl+0x192/frame 0xfe012324fa30
> > > > 
> > > > witness detected the following lock orders:
> > > > devfs -> proctree
> > > Where is the dependency catched comes from ?  I suspect it might be tty.
> > > 
> > > I consider this case as an advantage of using sx over the hand-rolled 
> > > lock,
> > > since the issue is/must be present with the counter as well, or the LOR
> > > is false positive, possibly due to sx taken in shared mode.  But debugging
> > > features of sx give the warning, while counter is silent.
> > > 
> > > That said, if the issue above is analyzed, I do not have any preference
> > > and will not object strongly against you decision.
> > > 
> > 
> > [snip]
> > 
> > This patch is about providing a way to block execs and exits of
> > processes so that it is safe to inspect their state. For instance it
> > deals with a case where the target process executes a setuid binary,
> > where the process inspecting it could have its permission checked prior
> > to exec but is reading data after setuid completed.
> > 
> > LOR mentioned above indeed comes form tty handling and is handled by
> > relocking the vnode.
> > 
> 
> Oops, the attached patch was missing one place (devfs_lookupx). Updated patch:
> 
> 
> diff --git a/sys/fs/devfs/devfs_vnops.c b/sys/fs/devfs/devfs_vnops.c
> index fe8e9ef..c7f579a 100644
> --- a/sys/fs/devfs/devfs_vnops.c
> +++ b/sys/fs/devfs/devfs_vnops.c
> @@ -574,6 +574,8 @@ devfs_close(struct vop_close_args *ap)
>   if (vp->v_data == NULL)
>   return (0);
>  
> + vp_locked = VOP_ISLOCKED(vp);
> +
>   /*
>* Hack: a tty device that is a controlling terminal
>* has a reference from the session structure.
> @@ -589,6 +591,7 @@ devfs_close(struct vop_close_args *ap)
>   if (vp == p->p_session->s_ttyvp) {
>   PROC_UNLOCK(p);
>   oldvp = NULL;
> + VOP_UNLOCK(vp, 0);
This opens a window where vp can be reclaimed. Then vp->v_rdev
is invalid and dev_refthread() below accesses random memory.

Might be, the easiest fix is to call dev_refthread() before the td !=
NULL block, but leave dsw == NULL check where it is now.

>   sx_xlock(&proctree_lock);
>   if (vp == p->p_session->s_ttyvp) {
>   SESS_LOCK(p->p_session);
> @@ -603,6 +606,7 @@ devfs_close(struct vop_close_args *ap)
>   SESS_UNLOCK(p->p_session);
>   }
>   sx_xunlock(&proctree_lock);
> + vn_lock(vp, vp_locked | LK_RETRY);
>   if (oldvp != NULL)
>   vrele(oldvp);
>   } else
> @@ -632,7 +636,6 @@ devfs_close(struct vop_close_args *ap)
>   }
>   vholdl(vp);
>   VI_UNLOCK(vp);
> - vp_locked = VOP_ISLOCKED(vp);
>   VOP_UNLOCK(vp, 0);
>   KASSERT(dev->si_refcount > 0,
>   ("devfs_close() on un-referenced struct cdev *(%s)", 
> devtoname(dev)));
> @@ -964,10 +967,13 @@ devfs_lookupx(struct vop_lookup_args *ap, int 
> *dm_unlock

Re: svn commit: r267760 - head/sys/kern

2015-07-05 Thread Mateusz Guzik
On Mon, Jul 06, 2015 at 07:51:35AM +0200, Mateusz Guzik wrote:
> On Thu, Jul 17, 2014 at 03:56:38AM +0300, Konstantin Belousov wrote:
> > On Sun, Jul 13, 2014 at 11:36:24PM +0200, Mateusz Guzik wrote:
> > > There is an additional problem with slocked case: witness report a lor
> > > with devfs vnodes.
> > > 
> > > lock order reversal:
> > >  1st 0xf80006fe6ab0 process imagelock (process imagelock) @ 
> > > /usr/src/sys/kern/kern_proc.c:287
> > >  2nd 0xf80018c88240 devfs (devfs) @ /usr/src/sys/kern/vfs_cache.c:1241
> > > KDB: stack backtrace:
> > > db_trace_self_wrapper() at db_trace_self_wrapper+0x2b/frame 
> > > 0xfe012324f120
> > > kdb_backtrace() at kdb_backtrace+0x39/frame 0xfe012324f1d0
> > > witness_checkorder() at witness_checkorder+0xdc2/frame 0xfe012324f260
> > > __lockmgr_args() at __lockmgr_args+0x588/frame 0xfe012324f3a0
> > > vop_stdlock() at vop_stdlock+0x3c/frame 0xfe012324f3c0
> > > VOP_LOCK1_APV() at VOP_LOCK1_APV+0xfc/frame 0xfe012324f3f0
> > > _vn_lock() at _vn_lock+0xaa/frame 0xfe012324f460
> > > vn_vptocnp_locked() at vn_vptocnp_locked+0xe8/frame 0xfe012324f4d0
> > > vn_fullpath1() at vn_fullpath1+0xb0/frame 0xfe012324f530
> > > vn_fullpath() at vn_fullpath+0xc1/frame 0xfe012324f580
> > > export_fd_to_sb() at export_fd_to_sb+0x489/frame 0xfe012324f7b0
> > > kern_proc_filedesc_out() at kern_proc_filedesc_out+0x1c6/frame 
> > > 0xfe012324f840
> > > sysctl_kern_proc_filedesc() at sysctl_kern_proc_filedesc+0x84/frame 
> > > 0xfe012324f900
> > > sysctl_root_handler_locked() at sysctl_root_handler_locked+0x68/frame 
> > > 0xfe012324f940
> > > sysctl_root() at sysctl_root+0x18e/frame 0xfe012324f990
> > > userland_sysctl() at userland_sysctl+0x192/frame 0xfe012324fa30
> > > 
> > > witness detected the following lock orders:
> > > devfs -> proctree
> > Where is the dependency catched comes from ?  I suspect it might be tty.
> > 
> > I consider this case as an advantage of using sx over the hand-rolled lock,
> > since the issue is/must be present with the counter as well, or the LOR
> > is false positive, possibly due to sx taken in shared mode.  But debugging
> > features of sx give the warning, while counter is silent.
> > 
> > That said, if the issue above is analyzed, I do not have any preference
> > and will not object strongly against you decision.
> > 
> 
> [snip]
> 
> This patch is about providing a way to block execs and exits of
> processes so that it is safe to inspect their state. For instance it
> deals with a case where the target process executes a setuid binary,
> where the process inspecting it could have its permission checked prior
> to exec but is reading data after setuid completed.
> 
> LOR mentioned above indeed comes form tty handling and is handled by
> relocking the vnode.
> 

Oops, the attached patch was missing one place (devfs_lookupx). Updated patch:


diff --git a/sys/fs/devfs/devfs_vnops.c b/sys/fs/devfs/devfs_vnops.c
index fe8e9ef..c7f579a 100644
--- a/sys/fs/devfs/devfs_vnops.c
+++ b/sys/fs/devfs/devfs_vnops.c
@@ -574,6 +574,8 @@ devfs_close(struct vop_close_args *ap)
if (vp->v_data == NULL)
return (0);
 
+   vp_locked = VOP_ISLOCKED(vp);
+
/*
 * Hack: a tty device that is a controlling terminal
 * has a reference from the session structure.
@@ -589,6 +591,7 @@ devfs_close(struct vop_close_args *ap)
if (vp == p->p_session->s_ttyvp) {
PROC_UNLOCK(p);
oldvp = NULL;
+   VOP_UNLOCK(vp, 0);
sx_xlock(&proctree_lock);
if (vp == p->p_session->s_ttyvp) {
SESS_LOCK(p->p_session);
@@ -603,6 +606,7 @@ devfs_close(struct vop_close_args *ap)
SESS_UNLOCK(p->p_session);
}
sx_xunlock(&proctree_lock);
+   vn_lock(vp, vp_locked | LK_RETRY);
if (oldvp != NULL)
vrele(oldvp);
} else
@@ -632,7 +636,6 @@ devfs_close(struct vop_close_args *ap)
}
vholdl(vp);
VI_UNLOCK(vp);
-   vp_locked = VOP_ISLOCKED(vp);
VOP_UNLOCK(vp, 0);
KASSERT(dev->si_refcount > 0,
("devfs_close() on un-referenced struct cdev *(%s)", 
devtoname(dev)));
@@ -964,10 +967,13 @@ devfs_lookupx(struct vop_lookup_args *ap, int *dm_unlock)
cdev = NULL;
DEVFS_DMP_HOLD(dmp);
sx_xunlock(&dmp->dm_lock);
+   dvplocked = VOP_ISLOCKED(dvp);
+   VOP_UNLOCK(dvp, 0);
sx_slock(&clone_drain_lock);
EVENTHANDLER_INVOKE(dev_clone,
td->td_ucred, pname, strlen(pname), &cdev);
sx_sunlock(&clone_drain_lock);
+   vn_lock(dvp, dvplocked | LK_RETRY);
 
if (cdev == NULL)
   

Re: svn commit: r267760 - head/sys/kern

2015-07-05 Thread Mateusz Guzik
On Thu, Jul 17, 2014 at 03:56:38AM +0300, Konstantin Belousov wrote:
> On Sun, Jul 13, 2014 at 11:36:24PM +0200, Mateusz Guzik wrote:
> > There is an additional problem with slocked case: witness report a lor
> > with devfs vnodes.
> > 
> > lock order reversal:
> >  1st 0xf80006fe6ab0 process imagelock (process imagelock) @ 
> > /usr/src/sys/kern/kern_proc.c:287
> >  2nd 0xf80018c88240 devfs (devfs) @ /usr/src/sys/kern/vfs_cache.c:1241
> > KDB: stack backtrace:
> > db_trace_self_wrapper() at db_trace_self_wrapper+0x2b/frame 
> > 0xfe012324f120
> > kdb_backtrace() at kdb_backtrace+0x39/frame 0xfe012324f1d0
> > witness_checkorder() at witness_checkorder+0xdc2/frame 0xfe012324f260
> > __lockmgr_args() at __lockmgr_args+0x588/frame 0xfe012324f3a0
> > vop_stdlock() at vop_stdlock+0x3c/frame 0xfe012324f3c0
> > VOP_LOCK1_APV() at VOP_LOCK1_APV+0xfc/frame 0xfe012324f3f0
> > _vn_lock() at _vn_lock+0xaa/frame 0xfe012324f460
> > vn_vptocnp_locked() at vn_vptocnp_locked+0xe8/frame 0xfe012324f4d0
> > vn_fullpath1() at vn_fullpath1+0xb0/frame 0xfe012324f530
> > vn_fullpath() at vn_fullpath+0xc1/frame 0xfe012324f580
> > export_fd_to_sb() at export_fd_to_sb+0x489/frame 0xfe012324f7b0
> > kern_proc_filedesc_out() at kern_proc_filedesc_out+0x1c6/frame 
> > 0xfe012324f840
> > sysctl_kern_proc_filedesc() at sysctl_kern_proc_filedesc+0x84/frame 
> > 0xfe012324f900
> > sysctl_root_handler_locked() at sysctl_root_handler_locked+0x68/frame 
> > 0xfe012324f940
> > sysctl_root() at sysctl_root+0x18e/frame 0xfe012324f990
> > userland_sysctl() at userland_sysctl+0x192/frame 0xfe012324fa30
> > 
> > witness detected the following lock orders:
> > devfs -> proctree
> Where is the dependency catched comes from ?  I suspect it might be tty.
> 
> I consider this case as an advantage of using sx over the hand-rolled lock,
> since the issue is/must be present with the counter as well, or the LOR
> is false positive, possibly due to sx taken in shared mode.  But debugging
> features of sx give the warning, while counter is silent.
> 
> That said, if the issue above is analyzed, I do not have any preference
> and will not object strongly against you decision.
> 

[snip]

This patch is about providing a way to block execs and exits of
processes so that it is safe to inspect their state. For instance it
deals with a case where the target process executes a setuid binary,
where the process inspecting it could have its permission checked prior
to exec but is reading data after setuid completed.

LOR mentioned above indeed comes form tty handling and is handled by
relocking the vnode.

Patch below is the variant with sx lock.

diff --git a/sys/fs/devfs/devfs_vnops.c b/sys/fs/devfs/devfs_vnops.c
index fe8e9ef..31f43a8 100644
--- a/sys/fs/devfs/devfs_vnops.c
+++ b/sys/fs/devfs/devfs_vnops.c
@@ -574,6 +574,8 @@ devfs_close(struct vop_close_args *ap)
if (vp->v_data == NULL)
return (0);
 
+   vp_locked = VOP_ISLOCKED(vp);
+
/*
 * Hack: a tty device that is a controlling terminal
 * has a reference from the session structure.
@@ -589,6 +591,7 @@ devfs_close(struct vop_close_args *ap)
if (vp == p->p_session->s_ttyvp) {
PROC_UNLOCK(p);
oldvp = NULL;
+   VOP_UNLOCK(vp, 0);
sx_xlock(&proctree_lock);
if (vp == p->p_session->s_ttyvp) {
SESS_LOCK(p->p_session);
@@ -603,6 +606,7 @@ devfs_close(struct vop_close_args *ap)
SESS_UNLOCK(p->p_session);
}
sx_xunlock(&proctree_lock);
+   vn_lock(vp, vp_locked | LK_RETRY);
if (oldvp != NULL)
vrele(oldvp);
} else
@@ -632,7 +636,6 @@ devfs_close(struct vop_close_args *ap)
}
vholdl(vp);
VI_UNLOCK(vp);
-   vp_locked = VOP_ISLOCKED(vp);
VOP_UNLOCK(vp, 0);
KASSERT(dev->si_refcount > 0,
("devfs_close() on un-referenced struct cdev *(%s)", 
devtoname(dev)));
diff --git a/sys/fs/nfsclient/nfs_clport.c b/sys/fs/nfsclient/nfs_clport.c
index d3bac30..118a7d9 100644
--- a/sys/fs/nfsclient/nfs_clport.c
+++ b/sys/fs/nfsclient/nfs_clport.c
@@ -1188,7 +1188,7 @@ nfscl_procdoesntexist(u_int8_t *own)
tl.cval[2] = *own++;
tl.cval[3] = *own++;
pid = tl.lval;
-   p = pfind_locked(pid);
+   p = pfind_locked(pid, 0);
if (p == NULL)
return (1);
if (p->p_stats == NULL) {
diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c
index 0675128..2ad09aa 100644
--- a/sys/kern/imgact_elf.c
+++ b/sys/kern/imgact_elf.c
@@ -1887,6 +1887,7 @@ note_procstat_files(void *arg, struct sbuf *sb, size_t 
*sizep)
sb = sbuf_new(NULL, NULL, 128, SBUF_FIXEDLEN);
sb

Re: svn commit: r267760 - head/sys/kern

2014-07-16 Thread Konstantin Belousov
On Sun, Jul 13, 2014 at 11:36:24PM +0200, Mateusz Guzik wrote:
> On Sun, Jul 13, 2014 at 04:26:52PM +0300, Konstantin Belousov wrote:
> > On Fri, Jul 11, 2014 at 01:19:25PM +0200, Mateusz Guzik wrote:
> > > On Fri, Jul 11, 2014 at 12:55:51PM +0300, Konstantin Belousov wrote:
> > > > The nolock version requires two atomics on both entry and leave from the
> > > > protected region, while sx-locked variant requires only one atomic for
> > > > entry and leave.
> > > > 
> > > > I am not sure why you decided to acquire p->p_keeplock in after the
> > > > proc lock in pget(), which indeed causes the complications of dropping
> > > > the proc_lock and rechecking to avoid LOR.  Did you tried to add a flag
> > > > to pfind*() functions to indicate that p_keeplock should be acquired,
> > > > instead ?
> > > 
> > > Lock is taken later to avoid waiting for finished exec/exit of processes
> > > we cannot return, so that e.g. procstat -fa does not trip over that
> > > much.
> > > 
> > > Right now only PROC_LOCK guarantees stability of p->p_ucred across pget
> > > operation. Without that the code would have to crget() and various
> > > functions modified to accept cred instead of proc, or 'imagelock'
> > > mechanism would have to be extended to also protect against cred
> > > changes.
> > No, you could get both locks, imagelock first, proc_lock next.
> > 
> 
> Ignoring allproc_lock:
> 
> sx lock case:
> filedesc out: slock + proc lock + proc unlock + sunlock
> exit/exec: xlock + xunlock
> 
> counter case:
> filedesc out: proc lock + proc unlock + proc lock + proc unlock
> exit/exec: just wait for imagelock to be 0
This should  be proc_lock/mwait/proc_lock, and proc_wait_imagelocked()
does this.

> 
> counter can result in temporary errors due to catching the process
> in exec, on the other hand slock before proc lock forces the caller to
> wait even for processes it cannot read
> 
> I find the counter case better.
> 
> sx:
> http://people.freebsd.org/~mjg/patches/sx-imagelock.patch
> 
> counter: 
> http://people.freebsd.org/~mjg/patches/counter-imagelock.patch
> 
> There is an additional problem with slocked case: witness report a lor
> with devfs vnodes.
> 
> lock order reversal:
>  1st 0xf80006fe6ab0 process imagelock (process imagelock) @ 
> /usr/src/sys/kern/kern_proc.c:287
>  2nd 0xf80018c88240 devfs (devfs) @ /usr/src/sys/kern/vfs_cache.c:1241
> KDB: stack backtrace:
> db_trace_self_wrapper() at db_trace_self_wrapper+0x2b/frame 0xfe012324f120
> kdb_backtrace() at kdb_backtrace+0x39/frame 0xfe012324f1d0
> witness_checkorder() at witness_checkorder+0xdc2/frame 0xfe012324f260
> __lockmgr_args() at __lockmgr_args+0x588/frame 0xfe012324f3a0
> vop_stdlock() at vop_stdlock+0x3c/frame 0xfe012324f3c0
> VOP_LOCK1_APV() at VOP_LOCK1_APV+0xfc/frame 0xfe012324f3f0
> _vn_lock() at _vn_lock+0xaa/frame 0xfe012324f460
> vn_vptocnp_locked() at vn_vptocnp_locked+0xe8/frame 0xfe012324f4d0
> vn_fullpath1() at vn_fullpath1+0xb0/frame 0xfe012324f530
> vn_fullpath() at vn_fullpath+0xc1/frame 0xfe012324f580
> export_fd_to_sb() at export_fd_to_sb+0x489/frame 0xfe012324f7b0
> kern_proc_filedesc_out() at kern_proc_filedesc_out+0x1c6/frame 
> 0xfe012324f840
> sysctl_kern_proc_filedesc() at sysctl_kern_proc_filedesc+0x84/frame 
> 0xfe012324f900
> sysctl_root_handler_locked() at sysctl_root_handler_locked+0x68/frame 
> 0xfe012324f940
> sysctl_root() at sysctl_root+0x18e/frame 0xfe012324f990
> userland_sysctl() at userland_sysctl+0x192/frame 0xfe012324fa30
> 
> witness detected the following lock orders:
> devfs -> proctree
Where is the dependency catched comes from ?  I suspect it might be tty.

I consider this case as an advantage of using sx over the hand-rolled lock,
since the issue is/must be present with the counter as well, or the LOR
is false positive, possibly due to sx taken in shared mode.  But debugging
features of sx give the warning, while counter is silent.

That said, if the issue above is analyzed, I do not have any preference
and will not object strongly against you decision.

> proctree -> allproc
> allproc -> imagelock
> imagelock -> devfs


pgp0EepJcVAP1.pgp
Description: PGP signature


Re: svn commit: r267760 - head/sys/kern

2014-07-13 Thread Mateusz Guzik
On Sun, Jul 13, 2014 at 04:26:52PM +0300, Konstantin Belousov wrote:
> On Fri, Jul 11, 2014 at 01:19:25PM +0200, Mateusz Guzik wrote:
> > On Fri, Jul 11, 2014 at 12:55:51PM +0300, Konstantin Belousov wrote:
> > > The nolock version requires two atomics on both entry and leave from the
> > > protected region, while sx-locked variant requires only one atomic for
> > > entry and leave.
> > > 
> > > I am not sure why you decided to acquire p->p_keeplock in after the
> > > proc lock in pget(), which indeed causes the complications of dropping
> > > the proc_lock and rechecking to avoid LOR.  Did you tried to add a flag
> > > to pfind*() functions to indicate that p_keeplock should be acquired,
> > > instead ?
> > 
> > Lock is taken later to avoid waiting for finished exec/exit of processes
> > we cannot return, so that e.g. procstat -fa does not trip over that
> > much.
> > 
> > Right now only PROC_LOCK guarantees stability of p->p_ucred across pget
> > operation. Without that the code would have to crget() and various
> > functions modified to accept cred instead of proc, or 'imagelock'
> > mechanism would have to be extended to also protect against cred
> > changes.
> No, you could get both locks, imagelock first, proc_lock next.
> 

Ignoring allproc_lock:

sx lock case:
filedesc out: slock + proc lock + proc unlock + sunlock
exit/exec: xlock + xunlock

counter case:
filedesc out: proc lock + proc unlock + proc lock + proc unlock
exit/exec: just wait for imagelock to be 0

counter can result in temporary errors due to catching the process
in exec, on the other hand slock before proc lock forces the caller to
wait even for processes it cannot read

I find the counter case better.

sx:
http://people.freebsd.org/~mjg/patches/sx-imagelock.patch

counter: 
http://people.freebsd.org/~mjg/patches/counter-imagelock.patch

There is an additional problem with slocked case: witness report a lor
with devfs vnodes.

lock order reversal:
 1st 0xf80006fe6ab0 process imagelock (process imagelock) @ 
/usr/src/sys/kern/kern_proc.c:287
 2nd 0xf80018c88240 devfs (devfs) @ /usr/src/sys/kern/vfs_cache.c:1241
KDB: stack backtrace:
db_trace_self_wrapper() at db_trace_self_wrapper+0x2b/frame 0xfe012324f120
kdb_backtrace() at kdb_backtrace+0x39/frame 0xfe012324f1d0
witness_checkorder() at witness_checkorder+0xdc2/frame 0xfe012324f260
__lockmgr_args() at __lockmgr_args+0x588/frame 0xfe012324f3a0
vop_stdlock() at vop_stdlock+0x3c/frame 0xfe012324f3c0
VOP_LOCK1_APV() at VOP_LOCK1_APV+0xfc/frame 0xfe012324f3f0
_vn_lock() at _vn_lock+0xaa/frame 0xfe012324f460
vn_vptocnp_locked() at vn_vptocnp_locked+0xe8/frame 0xfe012324f4d0
vn_fullpath1() at vn_fullpath1+0xb0/frame 0xfe012324f530
vn_fullpath() at vn_fullpath+0xc1/frame 0xfe012324f580
export_fd_to_sb() at export_fd_to_sb+0x489/frame 0xfe012324f7b0
kern_proc_filedesc_out() at kern_proc_filedesc_out+0x1c6/frame 
0xfe012324f840
sysctl_kern_proc_filedesc() at sysctl_kern_proc_filedesc+0x84/frame 
0xfe012324f900
sysctl_root_handler_locked() at sysctl_root_handler_locked+0x68/frame 
0xfe012324f940
sysctl_root() at sysctl_root+0x18e/frame 0xfe012324f990
userland_sysctl() at userland_sysctl+0x192/frame 0xfe012324fa30

witness detected the following lock orders:
devfs -> proctree
proctree -> allproc
allproc -> imagelock
imagelock -> devfs

-- 
Mateusz Guzik 
___
svn-src-all@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"


Re: svn commit: r267760 - head/sys/kern

2014-07-13 Thread Konstantin Belousov
On Fri, Jul 11, 2014 at 01:19:25PM +0200, Mateusz Guzik wrote:
> On Fri, Jul 11, 2014 at 12:55:51PM +0300, Konstantin Belousov wrote:
> > On Fri, Jul 11, 2014 at 04:43:51AM +0200, Mateusz Guzik wrote:
> > > In both cases the same mechanism blocks both exec and exit, this can be
> > > split if needed (p_lock would still cover exit, p_something would cover
> > > exec).
> > > 
> > > Here is a version with sx lock:
> > > 
> > > http://people.freebsd.org/~mjg/patches/exec-exit-hold-wait.patch
> > > 
> > > I'm not really happy with this. Reading foreign fdt is very rare and
> > > this adds lock + unlock for every exec and exit.
> > > 
> > > On the other hand mere counter version is rather simple:
> > > 
> > > http://people.freebsd.org/~mjg/patches/exec-exit-hold-nolock.patch
> > > 
> > > I don't have strong opinion here, but prefer the latter.
> > 
> > I suggest the name 'imagelock' for the beast.
> > 
> 
> Sounds good.
> 
> > The nolock version requires two atomics on both entry and leave from the
> > protected region, while sx-locked variant requires only one atomic for
> > entry and leave.
> > 
> > I am not sure why you decided to acquire p->p_keeplock in after the
> > proc lock in pget(), which indeed causes the complications of dropping
> > the proc_lock and rechecking to avoid LOR.  Did you tried to add a flag
> > to pfind*() functions to indicate that p_keeplock should be acquired,
> > instead ?
> 
> Lock is taken later to avoid waiting for finished exec/exit of processes
> we cannot return, so that e.g. procstat -fa does not trip over that
> much.
> 
> Right now only PROC_LOCK guarantees stability of p->p_ucred across pget
> operation. Without that the code would have to crget() and various
> functions modified to accept cred instead of proc, or 'imagelock'
> mechanism would have to be extended to also protect against cred
> changes.
No, you could get both locks, imagelock first, proc_lock next.

> 
> That said, the code could be indeed changed to sx requiring one atomic
> on entry and leave, but that would still leave us with such atomics in
> exit and exec and the last 2 are way more common than the first one,
> thus I prefer counter case which only adds lock + unlock on leave.
> 
> -- 
> Mateusz Guzik 


pgp8aPtfy5R68.pgp
Description: PGP signature


Re: svn commit: r267760 - head/sys/kern

2014-07-11 Thread Mateusz Guzik
On Fri, Jul 11, 2014 at 12:55:51PM +0300, Konstantin Belousov wrote:
> On Fri, Jul 11, 2014 at 04:43:51AM +0200, Mateusz Guzik wrote:
> > In both cases the same mechanism blocks both exec and exit, this can be
> > split if needed (p_lock would still cover exit, p_something would cover
> > exec).
> > 
> > Here is a version with sx lock:
> > 
> > http://people.freebsd.org/~mjg/patches/exec-exit-hold-wait.patch
> > 
> > I'm not really happy with this. Reading foreign fdt is very rare and
> > this adds lock + unlock for every exec and exit.
> > 
> > On the other hand mere counter version is rather simple:
> > 
> > http://people.freebsd.org/~mjg/patches/exec-exit-hold-nolock.patch
> > 
> > I don't have strong opinion here, but prefer the latter.
> 
> I suggest the name 'imagelock' for the beast.
> 

Sounds good.

> The nolock version requires two atomics on both entry and leave from the
> protected region, while sx-locked variant requires only one atomic for
> entry and leave.
> 
> I am not sure why you decided to acquire p->p_keeplock in after the
> proc lock in pget(), which indeed causes the complications of dropping
> the proc_lock and rechecking to avoid LOR.  Did you tried to add a flag
> to pfind*() functions to indicate that p_keeplock should be acquired,
> instead ?

Lock is taken later to avoid waiting for finished exec/exit of processes
we cannot return, so that e.g. procstat -fa does not trip over that
much.

Right now only PROC_LOCK guarantees stability of p->p_ucred across pget
operation. Without that the code would have to crget() and various
functions modified to accept cred instead of proc, or 'imagelock'
mechanism would have to be extended to also protect against cred
changes.

That said, the code could be indeed changed to sx requiring one atomic
on entry and leave, but that would still leave us with such atomics in
exit and exec and the last 2 are way more common than the first one,
thus I prefer counter case which only adds lock + unlock on leave.

-- 
Mateusz Guzik 
___
svn-src-all@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"


Re: svn commit: r267760 - head/sys/kern

2014-07-11 Thread Konstantin Belousov
On Fri, Jul 11, 2014 at 04:43:51AM +0200, Mateusz Guzik wrote:
> On Mon, Jun 23, 2014 at 07:35:23PM +0300, Konstantin Belousov wrote:
> > On Mon, Jun 23, 2014 at 03:16:53PM +0200, Mateusz Guzik wrote:
> > > If traversal while transition to P_INEXEC is allowed, execve dealing
> > > with a setuid binary is problematic. This is more of hypothetical nature,
> > > but with sufficienly long delay it could finish the syscall and start
> > > opening some files, which paths would now be visible for an unprivileged
> > > reader.
> > > 
> > > That said, I propose adding a counter to struct proc which would which
> > > would block execve. It would be quite similar to p_lock.
> > I thought about this too.  In fact, I considered using PHOLD for this.
> > 
> > > 
> > > iow execve would:
> > > 
> > > PROC_LOCK(p);
> > >   p->p_flag |= P_INEXEC; 
> > > while (p->p_execlock > 0)
> > > msleep(&p->p_execlock, &p->p_mtx, PWAIT, "execlock", 0);
> > >   PROC_UNLOCK(p);
> > > 
> > > And it would be mandatory for external fdp consumers to grab the counter.
> > > 
> > > I'm tempted to add P_GETPIN which would both increase p_lock and 
> > > p_execlock,
> > > that way the process is guaranteed not to exit and not to execve even
> > > after proc lock is dropped.
> > See above about PHOLD.
> > 
> > > 
> > > There is a separate question if p_execlock should be renamed and
> > > extended to also block any kind of credential changes.
> > > 
> > > Then the guarantee is even stronger since we know that credentials we
> > > checked against are not going to change for the duration of our
> > > operations, but it is unclear if we need this.
> > 
> > If doing separate execlock/p_lock, I think that it could be possible
> > to use per-process sx lock instead of hand-rolling the counter.  The
> > accessors would lock sx shared, while kern_execve would take it in
> > exclusive mode.
> 
> Both patches need some cleaning up. The name 'keeplock' is no exactly
> the best either.
> 
> In both cases the same mechanism blocks both exec and exit, this can be
> split if needed (p_lock would still cover exit, p_something would cover
> exec).
> 
> Here is a version with sx lock:
> 
> http://people.freebsd.org/~mjg/patches/exec-exit-hold-wait.patch
> 
> I'm not really happy with this. Reading foreign fdt is very rare and
> this adds lock + unlock for every exec and exit.
> 
> On the other hand mere counter version is rather simple:
> 
> http://people.freebsd.org/~mjg/patches/exec-exit-hold-nolock.patch
> 
> I don't have strong opinion here, but prefer the latter.

I suggest the name 'imagelock' for the beast.

The nolock version requires two atomics on both entry and leave from the
protected region, while sx-locked variant requires only one atomic for
entry and leave.

I am not sure why you decided to acquire p->p_keeplock in after the
proc lock in pget(), which indeed causes the complications of dropping
the proc_lock and rechecking to avoid LOR.  Did you tried to add a flag
to pfind*() functions to indicate that p_keeplock should be acquired,
instead ?


pgpDPPmoqHYU0.pgp
Description: PGP signature


Re: svn commit: r267760 - head/sys/kern

2014-07-10 Thread Mateusz Guzik
On Mon, Jun 23, 2014 at 07:35:23PM +0300, Konstantin Belousov wrote:
> On Mon, Jun 23, 2014 at 03:16:53PM +0200, Mateusz Guzik wrote:
> > If traversal while transition to P_INEXEC is allowed, execve dealing
> > with a setuid binary is problematic. This is more of hypothetical nature,
> > but with sufficienly long delay it could finish the syscall and start
> > opening some files, which paths would now be visible for an unprivileged
> > reader.
> > 
> > That said, I propose adding a counter to struct proc which would which
> > would block execve. It would be quite similar to p_lock.
> I thought about this too.  In fact, I considered using PHOLD for this.
> 
> > 
> > iow execve would:
> > 
> > PROC_LOCK(p);
> > p->p_flag |= P_INEXEC; 
> > while (p->p_execlock > 0)
> > msleep(&p->p_execlock, &p->p_mtx, PWAIT, "execlock", 0);
> > PROC_UNLOCK(p);
> > 
> > And it would be mandatory for external fdp consumers to grab the counter.
> > 
> > I'm tempted to add P_GETPIN which would both increase p_lock and p_execlock,
> > that way the process is guaranteed not to exit and not to execve even
> > after proc lock is dropped.
> See above about PHOLD.
> 
> > 
> > There is a separate question if p_execlock should be renamed and
> > extended to also block any kind of credential changes.
> > 
> > Then the guarantee is even stronger since we know that credentials we
> > checked against are not going to change for the duration of our
> > operations, but it is unclear if we need this.
> 
> If doing separate execlock/p_lock, I think that it could be possible
> to use per-process sx lock instead of hand-rolling the counter.  The
> accessors would lock sx shared, while kern_execve would take it in
> exclusive mode.

Both patches need some cleaning up. The name 'keeplock' is no exactly
the best either.

In both cases the same mechanism blocks both exec and exit, this can be
split if needed (p_lock would still cover exit, p_something would cover
exec).

Here is a version with sx lock:

http://people.freebsd.org/~mjg/patches/exec-exit-hold-wait.patch

I'm not really happy with this. Reading foreign fdt is very rare and
this adds lock + unlock for every exec and exit.

On the other hand mere counter version is rather simple:

http://people.freebsd.org/~mjg/patches/exec-exit-hold-nolock.patch

I don't have strong opinion here, but prefer the latter.

-- 
Mateusz Guzik 
___
svn-src-all@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"


Re: svn commit: r267760 - head/sys/kern

2014-06-23 Thread Konstantin Belousov
On Mon, Jun 23, 2014 at 03:16:53PM +0200, Mateusz Guzik wrote:
> On Mon, Jun 23, 2014 at 11:18:23AM +0300, Konstantin Belousov wrote:
> > On Mon, Jun 23, 2014 at 10:05:01AM +0200, Mateusz Guzik wrote:
> > > On Mon, Jun 23, 2014 at 10:25:19AM +0300, Konstantin Belousov wrote:
> > > > On Mon, Jun 23, 2014 at 09:06:52AM +0200, Mateusz Guzik wrote:
> > > > > The table is modified in these functions and is reachable from the 
> > > > > rest
> > > > > of the kernel (can be found by e.g. sysctl_kern_proc_filedesc), thus
> > > > > XLOCK is needed to ensure consistency for readers. It can also be
> > > > > altered by mountcheckdirs, although not in a way which disrupts any of
> > > > > these functions.
> > > > I would think that such cases should be avoided by testing for P_INEXEC,
> > > > but I do not insist on this.
> > > > 
> > > 
> > > proc lock has to be dropped before filedesc lock is taken and I don't
> > > see any way to prevent the proc from transitioning to P_INEXEC afterwards.
> > > Since sysctl_kern_proc_filedesc et al can take a long time it does not
> > > seem feasible to pursue this.
> > > 
> 
> After a second look this problem has to be dealt with.
> 
> If traversal while transition to P_INEXEC is allowed, execve dealing
> with a setuid binary is problematic. This is more of hypothetical nature,
> but with sufficienly long delay it could finish the syscall and start
> opening some files, which paths would now be visible for an unprivileged
> reader.
> 
> That said, I propose adding a counter to struct proc which would which
> would block execve. It would be quite similar to p_lock.
I thought about this too.  In fact, I considered using PHOLD for this.

> 
> iow execve would:
> 
> PROC_LOCK(p);
>   p->p_flag |= P_INEXEC; 
> while (p->p_execlock > 0)
> msleep(&p->p_execlock, &p->p_mtx, PWAIT, "execlock", 0);
>   PROC_UNLOCK(p);
> 
> And it would be mandatory for external fdp consumers to grab the counter.
> 
> I'm tempted to add P_GETPIN which would both increase p_lock and p_execlock,
> that way the process is guaranteed not to exit and not to execve even
> after proc lock is dropped.
See above about PHOLD.

> 
> There is a separate question if p_execlock should be renamed and
> extended to also block any kind of credential changes.
> 
> Then the guarantee is even stronger since we know that credentials we
> checked against are not going to change for the duration of our
> operations, but it is unclear if we need this.

If doing separate execlock/p_lock, I think that it could be possible
to use per-process sx lock instead of hand-rolling the counter.  The
accessors would lock sx shared, while kern_execve would take it in
exclusive mode.


pgpCuY9nMjzhd.pgp
Description: PGP signature


Re: svn commit: r267760 - head/sys/kern

2014-06-23 Thread Mateusz Guzik
On Mon, Jun 23, 2014 at 11:18:23AM +0300, Konstantin Belousov wrote:
> On Mon, Jun 23, 2014 at 10:05:01AM +0200, Mateusz Guzik wrote:
> > On Mon, Jun 23, 2014 at 10:25:19AM +0300, Konstantin Belousov wrote:
> > > On Mon, Jun 23, 2014 at 09:06:52AM +0200, Mateusz Guzik wrote:
> > > > The table is modified in these functions and is reachable from the rest
> > > > of the kernel (can be found by e.g. sysctl_kern_proc_filedesc), thus
> > > > XLOCK is needed to ensure consistency for readers. It can also be
> > > > altered by mountcheckdirs, although not in a way which disrupts any of
> > > > these functions.
> > > I would think that such cases should be avoided by testing for P_INEXEC,
> > > but I do not insist on this.
> > > 
> > 
> > proc lock has to be dropped before filedesc lock is taken and I don't
> > see any way to prevent the proc from transitioning to P_INEXEC afterwards.
> > Since sysctl_kern_proc_filedesc et al can take a long time it does not
> > seem feasible to pursue this.
> > 

After a second look this problem has to be dealt with.

If traversal while transition to P_INEXEC is allowed, execve dealing
with a setuid binary is problematic. This is more of hypothetical nature,
but with sufficienly long delay it could finish the syscall and start
opening some files, which paths would now be visible for an unprivileged
reader.

That said, I propose adding a counter to struct proc which would which
would block execve. It would be quite similar to p_lock.

iow execve would:

PROC_LOCK(p);
p->p_flag |= P_INEXEC; 
while (p->p_execlock > 0)
msleep(&p->p_execlock, &p->p_mtx, PWAIT, "execlock", 0);
PROC_UNLOCK(p);

And it would be mandatory for external fdp consumers to grab the counter.

I'm tempted to add P_GETPIN which would both increase p_lock and p_execlock,
that way the process is guaranteed not to exit and not to execve even
after proc lock is dropped.

There is a separate question if p_execlock should be renamed and
extended to also block any kind of credential changes.

Then the guarantee is even stronger since we know that credentials we
checked against are not going to change for the duration of our
operations, but it is unclear if we need this.

-- 
Mateusz Guzik 
___
svn-src-all@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"


Re: svn commit: r267760 - head/sys/kern

2014-06-23 Thread Konstantin Belousov
On Mon, Jun 23, 2014 at 10:05:01AM +0200, Mateusz Guzik wrote:
> On Mon, Jun 23, 2014 at 10:25:19AM +0300, Konstantin Belousov wrote:
> > On Mon, Jun 23, 2014 at 09:06:52AM +0200, Mateusz Guzik wrote:
> > > The table is modified in these functions and is reachable from the rest
> > > of the kernel (can be found by e.g. sysctl_kern_proc_filedesc), thus
> > > XLOCK is needed to ensure consistency for readers. It can also be
> > > altered by mountcheckdirs, although not in a way which disrupts any of
> > > these functions.
> > I would think that such cases should be avoided by testing for P_INEXEC,
> > but I do not insist on this.
> > 
> 
> proc lock has to be dropped before filedesc lock is taken and I don't
> see any way to prevent the proc from transitioning to P_INEXEC afterwards.
> Since sysctl_kern_proc_filedesc et al can take a long time it does not
> seem feasible to pursue this.
> 
> This also makes me realize I screwed up r265247 "Request a non-exiting
> process in sysctl_kern_proc_{o,}filedesc". As it is the code has to hold
> the process and release it later to really close the race and not only
> affect the window. I'll poke around this later, maybe it is better to
> nullify some stuff in exit1 with PROC_LOCK held and access it in a
> similar manner.
> 
> > > @@ -2097,7 +2102,6 @@ setugidsafety(struct thread *td)
> > > fdfree(fdp, i);
> > > FILEDESC_XUNLOCK(fdp);
> > > (void) closef(fp, td);
> > > -   FILEDESC_XLOCK(fdp);
> > > }
> > > }
> > > FILEDESC_XUNLOCK(fdp);
> > This unlock should be removed ?
> > 
> 
> Ugh, yes.
> 
> > > +   /* See the comment in setugidsafety */
> > > +   FILEDESC_XLOCK(fdp);
> > > fdfree(fdp, i);
> > > (void) closefp(fdp, i, fp, td, 0);
> > > /* closefp() drops the FILEDESC lock. */
> > > -   FILEDESC_XLOCK(fdp);
> > This should be XUNLOCK ?
> 
> This one is fine as closefp drops the lock.
> 
> > > }
> > > }
> > > FILEDESC_XUNLOCK(fdp);
> > And this unlock removed ?
> 
> Yes.
> 
> I tested this patch and it works fine:
> 
> diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
> index 25c3a1e..a900464 100644
> --- a/sys/kern/kern_descrip.c
> +++ b/sys/kern/kern_descrip.c
> @@ -2082,13 +2082,18 @@ setugidsafety(struct thread *td)
>   int i;
>  
>   fdp = td->td_proc->p_fd;
> + /*
> +  * While no other thread can alter filedescriptors in this table,
> +  * there may be code trying to read it, thus the lock is required
> +  * to provide consistent view if we are going to change it.
> +  */
>   KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
> - FILEDESC_XLOCK(fdp);
>   for (i = 0; i <= fdp->fd_lastfile; i++) {
>   if (i > 2)
>   break;
>   fp = fdp->fd_ofiles[i].fde_file;
>   if (fp != NULL && is_unsafe(fp)) {
> + FILEDESC_XLOCK(fdp);
>   knote_fdclose(td, i);
>   /*
>* NULL-out descriptor prior to close to avoid
> @@ -2097,10 +2102,8 @@ setugidsafety(struct thread *td)
>   fdfree(fdp, i);
>   FILEDESC_XUNLOCK(fdp);
>   (void) closef(fp, td);
> - FILEDESC_XLOCK(fdp);
>   }
>   }
> - FILEDESC_XUNLOCK(fdp);
>  }
>  
>  /*
> @@ -2136,19 +2139,18 @@ fdcloseexec(struct thread *td)
>  
>   fdp = td->td_proc->p_fd;
>   KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
> - FILEDESC_XLOCK(fdp);
>   for (i = 0; i <= fdp->fd_lastfile; i++) {
>   fde = &fdp->fd_ofiles[i];
>   fp = fde->fde_file;
>   if (fp != NULL && (fp->f_type == DTYPE_MQUEUE ||
>   (fde->fde_flags & UF_EXCLOSE))) {
> + /* See the comment in setugidsafety */
> + FILEDESC_XLOCK(fdp);
>   fdfree(fdp, i);
>   (void) closefp(fdp, i, fp, td, 0);
>   /* closefp() drops the FILEDESC lock. */
> - FILEDESC_XLOCK(fdp);
>   }
>   }
> - FILEDESC_XUNLOCK(fdp);
>  }
>  
>  /*

This looks fine, but please e.g. add a reference to the comment in
etugidsafety(), to fdcloseexec().  Or duplicate it.


pgpGV4JEGvzDc.pgp
Description: PGP signature


Re: svn commit: r267760 - head/sys/kern

2014-06-23 Thread Mateusz Guzik
On Mon, Jun 23, 2014 at 10:25:19AM +0300, Konstantin Belousov wrote:
> On Mon, Jun 23, 2014 at 09:06:52AM +0200, Mateusz Guzik wrote:
> > The table is modified in these functions and is reachable from the rest
> > of the kernel (can be found by e.g. sysctl_kern_proc_filedesc), thus
> > XLOCK is needed to ensure consistency for readers. It can also be
> > altered by mountcheckdirs, although not in a way which disrupts any of
> > these functions.
> I would think that such cases should be avoided by testing for P_INEXEC,
> but I do not insist on this.
> 

proc lock has to be dropped before filedesc lock is taken and I don't
see any way to prevent the proc from transitioning to P_INEXEC afterwards.
Since sysctl_kern_proc_filedesc et al can take a long time it does not
seem feasible to pursue this.

This also makes me realize I screwed up r265247 "Request a non-exiting
process in sysctl_kern_proc_{o,}filedesc". As it is the code has to hold
the process and release it later to really close the race and not only
affect the window. I'll poke around this later, maybe it is better to
nullify some stuff in exit1 with PROC_LOCK held and access it in a
similar manner.

> > @@ -2097,7 +2102,6 @@ setugidsafety(struct thread *td)
> > fdfree(fdp, i);
> > FILEDESC_XUNLOCK(fdp);
> > (void) closef(fp, td);
> > -   FILEDESC_XLOCK(fdp);
> > }
> > }
> > FILEDESC_XUNLOCK(fdp);
> This unlock should be removed ?
> 

Ugh, yes.

> > +   /* See the comment in setugidsafety */
> > +   FILEDESC_XLOCK(fdp);
> > fdfree(fdp, i);
> > (void) closefp(fdp, i, fp, td, 0);
> > /* closefp() drops the FILEDESC lock. */
> > -   FILEDESC_XLOCK(fdp);
> This should be XUNLOCK ?

This one is fine as closefp drops the lock.

> > }
> > }
> > FILEDESC_XUNLOCK(fdp);
> And this unlock removed ?

Yes.

I tested this patch and it works fine:

diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
index 25c3a1e..a900464 100644
--- a/sys/kern/kern_descrip.c
+++ b/sys/kern/kern_descrip.c
@@ -2082,13 +2082,18 @@ setugidsafety(struct thread *td)
int i;
 
fdp = td->td_proc->p_fd;
+   /*
+* While no other thread can alter filedescriptors in this table,
+* there may be code trying to read it, thus the lock is required
+* to provide consistent view if we are going to change it.
+*/
KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
-   FILEDESC_XLOCK(fdp);
for (i = 0; i <= fdp->fd_lastfile; i++) {
if (i > 2)
break;
fp = fdp->fd_ofiles[i].fde_file;
if (fp != NULL && is_unsafe(fp)) {
+   FILEDESC_XLOCK(fdp);
knote_fdclose(td, i);
/*
 * NULL-out descriptor prior to close to avoid
@@ -2097,10 +2102,8 @@ setugidsafety(struct thread *td)
fdfree(fdp, i);
FILEDESC_XUNLOCK(fdp);
(void) closef(fp, td);
-   FILEDESC_XLOCK(fdp);
}
}
-   FILEDESC_XUNLOCK(fdp);
 }
 
 /*
@@ -2136,19 +2139,18 @@ fdcloseexec(struct thread *td)
 
fdp = td->td_proc->p_fd;
KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
-   FILEDESC_XLOCK(fdp);
for (i = 0; i <= fdp->fd_lastfile; i++) {
fde = &fdp->fd_ofiles[i];
fp = fde->fde_file;
if (fp != NULL && (fp->f_type == DTYPE_MQUEUE ||
(fde->fde_flags & UF_EXCLOSE))) {
+   /* See the comment in setugidsafety */
+   FILEDESC_XLOCK(fdp);
fdfree(fdp, i);
(void) closefp(fdp, i, fp, td, 0);
/* closefp() drops the FILEDESC lock. */
-   FILEDESC_XLOCK(fdp);
}
}
-   FILEDESC_XUNLOCK(fdp);
 }
 
 /*
-- 
Mateusz Guzik 
___
svn-src-all@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"


Re: svn commit: r267760 - head/sys/kern

2014-06-23 Thread Konstantin Belousov
On Mon, Jun 23, 2014 at 09:06:52AM +0200, Mateusz Guzik wrote:
> On Mon, Jun 23, 2014 at 09:40:44AM +0300, Konstantin Belousov wrote:
> > On Mon, Jun 23, 2014 at 01:28:18AM +, Mateusz Guzik wrote:
> > > + KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
> > >   FILEDESC_XLOCK(fdp);
> > This  is at least weird.  Not incorrect, but the code now looks strange.
> > The fd_refcnt == 1 assert just states the circumstances of the code which
> > currently calls the functions.  Would the functions become incorrect or
> > destructive if there are other references to the filedescriptor table ?
> > 
> > In case you argument is that refcnt == 1 must hold to prevent the parallel
> > modifications of the descriptor table, which would invalidate the checks
> > and actions of the functions, then XLOCK is not needed (similar to your
> > earlier commit).
> > 
> > Note that kern_execve() is executed with the process single-threaded,
> > which, together with statement fd_refcnt == 1 must prevent the parallel
> > modifications.
> 
> The table is modified in these functions and is reachable from the rest
> of the kernel (can be found by e.g. sysctl_kern_proc_filedesc), thus
> XLOCK is needed to ensure consistency for readers. It can also be
> altered by mountcheckdirs, although not in a way which disrupts any of
> these functions.
I would think that such cases should be avoided by testing for P_INEXEC,
but I do not insist on this.

> 
> For now I do agree that both the assertion and XLOCK look weird as it is
> and as such deserve a comment.
> 
> Actually we can take the lock only if we are going to modify something.
> 
> How about the following then (untested):
See below, but I would be completely satisfied even by only
comment addition.

> 
> diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
> index 25c3a1e..31f4881 100644
> --- a/sys/kern/kern_descrip.c
> +++ b/sys/kern/kern_descrip.c
> @@ -2082,13 +2082,18 @@ setugidsafety(struct thread *td)
> int i;
>  
> fdp = td->td_proc->p_fd;
> +   /*
> +* While no other thread can alter filedescriptors in this table,
> +* there may be code trying to read it, thus the lock is required
> +* to provide consistent view if we are going to change it.
> +*/
> KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
> -   FILEDESC_XLOCK(fdp);
> for (i = 0; i <= fdp->fd_lastfile; i++) {
> if (i > 2)
> break;
> fp = fdp->fd_ofiles[i].fde_file;
> if (fp != NULL && is_unsafe(fp)) {
> +   FILEDESC_XLOCK(fdp);
> knote_fdclose(td, i);
> /*
>  * NULL-out descriptor prior to close to avoid
> @@ -2097,7 +2102,6 @@ setugidsafety(struct thread *td)
> fdfree(fdp, i);
> FILEDESC_XUNLOCK(fdp);
> (void) closef(fp, td);
> -   FILEDESC_XLOCK(fdp);
> }
> }
> FILEDESC_XUNLOCK(fdp);
This unlock should be removed ?

> @@ -2136,16 +2140,16 @@ fdcloseexec(struct thread *td)
>  
> fdp = td->td_proc->p_fd;
> KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
> -   FILEDESC_XLOCK(fdp);
> for (i = 0; i <= fdp->fd_lastfile; i++) {
> fde = &fdp->fd_ofiles[i];
> fp = fde->fde_file;
> if (fp != NULL && (fp->f_type == DTYPE_MQUEUE ||
> (fde->fde_flags & UF_EXCLOSE))) {
> +   /* See the comment in setugidsafety */
> +   FILEDESC_XLOCK(fdp);
> fdfree(fdp, i);
> (void) closefp(fdp, i, fp, td, 0);
> /* closefp() drops the FILEDESC lock. */
> -   FILEDESC_XLOCK(fdp);
This should be XUNLOCK ?
> }
> }
> FILEDESC_XUNLOCK(fdp);
And this unlock removed ?
> 
> 
> -- 
> Mateusz Guzik 


pgpMWxnApmOw7.pgp
Description: PGP signature


Re: svn commit: r267760 - head/sys/kern

2014-06-23 Thread Mateusz Guzik
On Mon, Jun 23, 2014 at 09:40:44AM +0300, Konstantin Belousov wrote:
> On Mon, Jun 23, 2014 at 01:28:18AM +, Mateusz Guzik wrote:
> > +   KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
> > FILEDESC_XLOCK(fdp);
> This  is at least weird.  Not incorrect, but the code now looks strange.
> The fd_refcnt == 1 assert just states the circumstances of the code which
> currently calls the functions.  Would the functions become incorrect or
> destructive if there are other references to the filedescriptor table ?
> 
> In case you argument is that refcnt == 1 must hold to prevent the parallel
> modifications of the descriptor table, which would invalidate the checks
> and actions of the functions, then XLOCK is not needed (similar to your
> earlier commit).
> 
> Note that kern_execve() is executed with the process single-threaded,
> which, together with statement fd_refcnt == 1 must prevent the parallel
> modifications.

The table is modified in these functions and is reachable from the rest
of the kernel (can be found by e.g. sysctl_kern_proc_filedesc), thus
XLOCK is needed to ensure consistency for readers. It can also be
altered by mountcheckdirs, although not in a way which disrupts any of
these functions.

For now I do agree that both the assertion and XLOCK look weird as it is
and as such deserve a comment.

Actually we can take the lock only if we are going to modify something.

How about the following then (untested):

diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
index 25c3a1e..31f4881 100644
--- a/sys/kern/kern_descrip.c
+++ b/sys/kern/kern_descrip.c
@@ -2082,13 +2082,18 @@ setugidsafety(struct thread *td)
int i;
 
fdp = td->td_proc->p_fd;
+   /*
+* While no other thread can alter filedescriptors in this table,
+* there may be code trying to read it, thus the lock is required
+* to provide consistent view if we are going to change it.
+*/
KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
-   FILEDESC_XLOCK(fdp);
for (i = 0; i <= fdp->fd_lastfile; i++) {
if (i > 2)
break;
fp = fdp->fd_ofiles[i].fde_file;
if (fp != NULL && is_unsafe(fp)) {
+   FILEDESC_XLOCK(fdp);
knote_fdclose(td, i);
/*
 * NULL-out descriptor prior to close to avoid
@@ -2097,7 +2102,6 @@ setugidsafety(struct thread *td)
fdfree(fdp, i);
FILEDESC_XUNLOCK(fdp);
(void) closef(fp, td);
-   FILEDESC_XLOCK(fdp);
}
}
FILEDESC_XUNLOCK(fdp);
@@ -2136,16 +2140,16 @@ fdcloseexec(struct thread *td)
 
fdp = td->td_proc->p_fd;
KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
-   FILEDESC_XLOCK(fdp);
for (i = 0; i <= fdp->fd_lastfile; i++) {
fde = &fdp->fd_ofiles[i];
fp = fde->fde_file;
if (fp != NULL && (fp->f_type == DTYPE_MQUEUE ||
(fde->fde_flags & UF_EXCLOSE))) {
+   /* See the comment in setugidsafety */
+   FILEDESC_XLOCK(fdp);
fdfree(fdp, i);
(void) closefp(fdp, i, fp, td, 0);
/* closefp() drops the FILEDESC lock. */
-   FILEDESC_XLOCK(fdp);
}
}
FILEDESC_XUNLOCK(fdp);


-- 
Mateusz Guzik 
___
svn-src-all@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"


Re: svn commit: r267760 - head/sys/kern

2014-06-22 Thread Konstantin Belousov
On Mon, Jun 23, 2014 at 01:28:18AM +, Mateusz Guzik wrote:
> + KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
>   FILEDESC_XLOCK(fdp);
This  is at least weird.  Not incorrect, but the code now looks strange.
The fd_refcnt == 1 assert just states the circumstances of the code which
currently calls the functions.  Would the functions become incorrect or
destructive if there are other references to the filedescriptor table ?

In case you argument is that refcnt == 1 must hold to prevent the parallel
modifications of the descriptor table, which would invalidate the checks
and actions of the functions, then XLOCK is not needed (similar to your
earlier commit).

Note that kern_execve() is executed with the process single-threaded,
which, together with statement fd_refcnt == 1 must prevent the parallel
modifications.


pgpEHUZWui4zg.pgp
Description: PGP signature


svn commit: r267760 - head/sys/kern

2014-06-22 Thread Mateusz Guzik
Author: mjg
Date: Mon Jun 23 01:28:18 2014
New Revision: 267760
URL: http://svnweb.freebsd.org/changeset/base/267760

Log:
  Tidy up fd-related functions called by do_execve
  
  o assert in each one that fdp is not shared
  o remove unnecessary NULL checks - all userspace processes have fdtables
  and kernel processes cannot execve
  o remove comments about the danger of fd_ofiles getting reallocated - fdtable
  is not shared and fd_ofiles could be only reallocated if new fd was about to 
be
  added, but if that was possible the code would already be buggy as 
setugidsafety
  work could be undone
  
  MFC after:1 week

Modified:
  head/sys/kern/kern_descrip.c

Modified: head/sys/kern/kern_descrip.c
==
--- head/sys/kern/kern_descrip.cMon Jun 23 01:10:56 2014
(r267759)
+++ head/sys/kern/kern_descrip.cMon Jun 23 01:28:18 2014
(r267760)
@@ -2081,15 +2081,8 @@ setugidsafety(struct thread *td)
struct file *fp;
int i;
 
-   /* Certain daemons might not have file descriptors. */
fdp = td->td_proc->p_fd;
-   if (fdp == NULL)
-   return;
-
-   /*
-* Note: fdp->fd_ofiles may be reallocated out from under us while
-* we are blocked in a close.  Be careful!
-*/
+   KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
FILEDESC_XLOCK(fdp);
for (i = 0; i <= fdp->fd_lastfile; i++) {
if (i > 2)
@@ -2141,15 +2134,8 @@ fdcloseexec(struct thread *td)
struct file *fp;
int i;
 
-   /* Certain daemons might not have file descriptors. */
fdp = td->td_proc->p_fd;
-   if (fdp == NULL)
-   return;
-
-   /*
-* We cannot cache fd_ofiles since operations
-* may block and rip them out from under us.
-*/
+   KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
FILEDESC_XLOCK(fdp);
for (i = 0; i <= fdp->fd_lastfile; i++) {
fde = &fdp->fd_ofiles[i];
@@ -2180,8 +2166,6 @@ fdcheckstd(struct thread *td)
int i, error, devnull;
 
fdp = td->td_proc->p_fd;
-   if (fdp == NULL)
-   return (0);
KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
devnull = -1;
error = 0;
___
svn-src-all@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"