Re: svn commit: r267760 - head/sys/kern
On Tue, Jul 07, 2015 at 12:42:01AM +0200, Mateusz Guzik wrote: > On Mon, Jul 06, 2015 at 12:09:58PM +0300, Konstantin Belousov wrote: > > On Mon, Jul 06, 2015 at 08:46:02AM +0200, Mateusz Guzik wrote: > > > diff --git a/sys/fs/devfs/devfs_vnops.c b/sys/fs/devfs/devfs_vnops.c > > > index fe8e9ef..c7f579a 100644 > > > --- a/sys/fs/devfs/devfs_vnops.c > > > +++ b/sys/fs/devfs/devfs_vnops.c > > > @@ -574,6 +574,8 @@ devfs_close(struct vop_close_args *ap) > > > if (vp->v_data == NULL) > > > return (0); > > > > > > + vp_locked = VOP_ISLOCKED(vp); > > > + > > > /* > > >* Hack: a tty device that is a controlling terminal > > >* has a reference from the session structure. > > > @@ -589,6 +591,7 @@ devfs_close(struct vop_close_args *ap) > > > if (vp == p->p_session->s_ttyvp) { > > > PROC_UNLOCK(p); > > > oldvp = NULL; > > > + VOP_UNLOCK(vp, 0); > > This opens a window where vp can be reclaimed. Then vp->v_rdev > > is invalid and dev_refthread() below accesses random memory. > > > > Might be, the easiest fix is to call dev_refthread() before the td != > > NULL block, but leave dsw == NULL check where it is now. > > > > Instead I made the bare minimum to ensure that modified fields which are > accessed here are always protected with proc lock and sess lock, which > eliminates the need for proctree lock in problematic cases. In effect > there are no games with relocking vnodes. This could work. What I want to know is what methodology was used to ensure that added locking for other places is enough. The proctree locking is 'natural', since session/group membership relates to the process tree structure and owning the sx guarantees that the tree is stable. Your locking can work, but needs an argumentation why it is complete. Also, you should add some note to proc.h to indicate in the locking annotations that reading of e.g. pg_session is safe when only process lock is owned. > > I would commit this patch separately. > > Note this establishes a process lock -> vnode interlock order which > seems fishy, but apparently does not result in lors. > > diff --git a/sys/fs/devfs/devfs_vnops.c b/sys/fs/devfs/devfs_vnops.c > index fe8e9ef..0a2a6bb 100644 > --- a/sys/fs/devfs/devfs_vnops.c > +++ b/sys/fs/devfs/devfs_vnops.c > @@ -586,12 +586,12 @@ devfs_close(struct vop_close_args *ap) > if (td != NULL) { > p = td->td_proc; > PROC_LOCK(p); > - if (vp == p->p_session->s_ttyvp) { > + if (vp != p->p_session->s_ttyvp) { > PROC_UNLOCK(p); > + } else { > oldvp = NULL; > - sx_xlock(&proctree_lock); > + SESS_LOCK(p->p_session); > if (vp == p->p_session->s_ttyvp) { > - SESS_LOCK(p->p_session); > VI_LOCK(vp); > if (count_dev(dev) == 2 && > (vp->v_iflag & VI_DOOMED) == 0) { > @@ -600,13 +600,12 @@ devfs_close(struct vop_close_args *ap) > oldvp = vp; > } > VI_UNLOCK(vp); > - SESS_UNLOCK(p->p_session); > } > - sx_xunlock(&proctree_lock); > + SESS_UNLOCK(p->p_session); > + PROC_UNLOCK(p); > if (oldvp != NULL) > vrele(oldvp); > - } else > - PROC_UNLOCK(p); > + } > } > /* >* We do not want to really close the device if it > diff --git a/sys/kern/tty.c b/sys/kern/tty.c > index d9d0cce..5ef5578 100644 > --- a/sys/kern/tty.c > +++ b/sys/kern/tty.c > @@ -1637,8 +1637,13 @@ tty_generic_ioctl(struct tty *tp, u_long cmd, void > *data, int fflag, > return (EPERM); > } > > + PROC_LOCK(p); > + SESS_LOCK(p->p_session); > + > if (tp->t_session != NULL && tp->t_session == p->p_session) { > /* This is already our controlling TTY. */ > + SESS_UNLOCK(p->p_session); > + PROC_UNLOCK(p); > sx_xunlock(&proctree_lock); > return (0); > } > @@ -1657,6 +1662,8 @@ tty_generic_ioctl(struct tty *tp, u_long cmd, void > *data, int fflag, >* TTYs of which the session leader has been >* killed or the TTY revoked. >*/ > + SESS_UNLOCK(p->p_session); > + PROC_UNLOCK(p); > sx_xunlock(&proctree_lock); > return (EPERM); > } > @@ -1665,14 +1672,13 @@ tty_generic_ioctl(struct tty *tp, u_long cmd, void > *data, int fflag,
Re: svn commit: r267760 - head/sys/kern
On Mon, Jul 06, 2015 at 12:09:58PM +0300, Konstantin Belousov wrote: > On Mon, Jul 06, 2015 at 08:46:02AM +0200, Mateusz Guzik wrote: > > diff --git a/sys/fs/devfs/devfs_vnops.c b/sys/fs/devfs/devfs_vnops.c > > index fe8e9ef..c7f579a 100644 > > --- a/sys/fs/devfs/devfs_vnops.c > > +++ b/sys/fs/devfs/devfs_vnops.c > > @@ -574,6 +574,8 @@ devfs_close(struct vop_close_args *ap) > > if (vp->v_data == NULL) > > return (0); > > > > + vp_locked = VOP_ISLOCKED(vp); > > + > > /* > > * Hack: a tty device that is a controlling terminal > > * has a reference from the session structure. > > @@ -589,6 +591,7 @@ devfs_close(struct vop_close_args *ap) > > if (vp == p->p_session->s_ttyvp) { > > PROC_UNLOCK(p); > > oldvp = NULL; > > + VOP_UNLOCK(vp, 0); > This opens a window where vp can be reclaimed. Then vp->v_rdev > is invalid and dev_refthread() below accesses random memory. > > Might be, the easiest fix is to call dev_refthread() before the td != > NULL block, but leave dsw == NULL check where it is now. > Instead I made the bare minimum to ensure that modified fields which are accessed here are always protected with proc lock and sess lock, which eliminates the need for proctree lock in problematic cases. In effect there are no games with relocking vnodes. I would commit this patch separately. Note this establishes a process lock -> vnode interlock order which seems fishy, but apparently does not result in lors. diff --git a/sys/fs/devfs/devfs_vnops.c b/sys/fs/devfs/devfs_vnops.c index fe8e9ef..0a2a6bb 100644 --- a/sys/fs/devfs/devfs_vnops.c +++ b/sys/fs/devfs/devfs_vnops.c @@ -586,12 +586,12 @@ devfs_close(struct vop_close_args *ap) if (td != NULL) { p = td->td_proc; PROC_LOCK(p); - if (vp == p->p_session->s_ttyvp) { + if (vp != p->p_session->s_ttyvp) { PROC_UNLOCK(p); + } else { oldvp = NULL; - sx_xlock(&proctree_lock); + SESS_LOCK(p->p_session); if (vp == p->p_session->s_ttyvp) { - SESS_LOCK(p->p_session); VI_LOCK(vp); if (count_dev(dev) == 2 && (vp->v_iflag & VI_DOOMED) == 0) { @@ -600,13 +600,12 @@ devfs_close(struct vop_close_args *ap) oldvp = vp; } VI_UNLOCK(vp); - SESS_UNLOCK(p->p_session); } - sx_xunlock(&proctree_lock); + SESS_UNLOCK(p->p_session); + PROC_UNLOCK(p); if (oldvp != NULL) vrele(oldvp); - } else - PROC_UNLOCK(p); + } } /* * We do not want to really close the device if it diff --git a/sys/kern/tty.c b/sys/kern/tty.c index d9d0cce..5ef5578 100644 --- a/sys/kern/tty.c +++ b/sys/kern/tty.c @@ -1637,8 +1637,13 @@ tty_generic_ioctl(struct tty *tp, u_long cmd, void *data, int fflag, return (EPERM); } + PROC_LOCK(p); + SESS_LOCK(p->p_session); + if (tp->t_session != NULL && tp->t_session == p->p_session) { /* This is already our controlling TTY. */ + SESS_UNLOCK(p->p_session); + PROC_UNLOCK(p); sx_xunlock(&proctree_lock); return (0); } @@ -1657,6 +1662,8 @@ tty_generic_ioctl(struct tty *tp, u_long cmd, void *data, int fflag, * TTYs of which the session leader has been * killed or the TTY revoked. */ + SESS_UNLOCK(p->p_session); + PROC_UNLOCK(p); sx_xunlock(&proctree_lock); return (EPERM); } @@ -1665,14 +1672,13 @@ tty_generic_ioctl(struct tty *tp, u_long cmd, void *data, int fflag, tp->t_session = p->p_session; tp->t_session->s_ttyp = tp; tp->t_sessioncnt++; - sx_xunlock(&proctree_lock); /* Assign foreground process group. */ tp->t_pgrp = p->p_pgrp; - PROC_LOCK(p); p->p_flag |= P_CONTROLT; + SESS_UNLOCK(p->p_session); PROC_UNLOCK(p); - + sx_xunlock(&proctree_lock); return (0); } case TIOCSPGRP: { diff --git a/sys/kern/tty_tty.c b/sys/kern/tty_tty.c index 582b41a..dca619c 100644 --- a/sys/kern/tty_tty.c +++ b/sys/kern/tty_tty.c @@ -6
Re: svn commit: r267760 - head/sys/kern
On Mon, Jul 06, 2015 at 08:46:02AM +0200, Mateusz Guzik wrote: > On Mon, Jul 06, 2015 at 07:51:35AM +0200, Mateusz Guzik wrote: > > On Thu, Jul 17, 2014 at 03:56:38AM +0300, Konstantin Belousov wrote: > > > On Sun, Jul 13, 2014 at 11:36:24PM +0200, Mateusz Guzik wrote: > > > > There is an additional problem with slocked case: witness report a lor > > > > with devfs vnodes. > > > > > > > > lock order reversal: > > > > 1st 0xf80006fe6ab0 process imagelock (process imagelock) @ > > > > /usr/src/sys/kern/kern_proc.c:287 > > > > 2nd 0xf80018c88240 devfs (devfs) @ > > > > /usr/src/sys/kern/vfs_cache.c:1241 > > > > KDB: stack backtrace: > > > > db_trace_self_wrapper() at db_trace_self_wrapper+0x2b/frame > > > > 0xfe012324f120 > > > > kdb_backtrace() at kdb_backtrace+0x39/frame 0xfe012324f1d0 > > > > witness_checkorder() at witness_checkorder+0xdc2/frame > > > > 0xfe012324f260 > > > > __lockmgr_args() at __lockmgr_args+0x588/frame 0xfe012324f3a0 > > > > vop_stdlock() at vop_stdlock+0x3c/frame 0xfe012324f3c0 > > > > VOP_LOCK1_APV() at VOP_LOCK1_APV+0xfc/frame 0xfe012324f3f0 > > > > _vn_lock() at _vn_lock+0xaa/frame 0xfe012324f460 > > > > vn_vptocnp_locked() at vn_vptocnp_locked+0xe8/frame 0xfe012324f4d0 > > > > vn_fullpath1() at vn_fullpath1+0xb0/frame 0xfe012324f530 > > > > vn_fullpath() at vn_fullpath+0xc1/frame 0xfe012324f580 > > > > export_fd_to_sb() at export_fd_to_sb+0x489/frame 0xfe012324f7b0 > > > > kern_proc_filedesc_out() at kern_proc_filedesc_out+0x1c6/frame > > > > 0xfe012324f840 > > > > sysctl_kern_proc_filedesc() at sysctl_kern_proc_filedesc+0x84/frame > > > > 0xfe012324f900 > > > > sysctl_root_handler_locked() at sysctl_root_handler_locked+0x68/frame > > > > 0xfe012324f940 > > > > sysctl_root() at sysctl_root+0x18e/frame 0xfe012324f990 > > > > userland_sysctl() at userland_sysctl+0x192/frame 0xfe012324fa30 > > > > > > > > witness detected the following lock orders: > > > > devfs -> proctree > > > Where is the dependency catched comes from ? I suspect it might be tty. > > > > > > I consider this case as an advantage of using sx over the hand-rolled > > > lock, > > > since the issue is/must be present with the counter as well, or the LOR > > > is false positive, possibly due to sx taken in shared mode. But debugging > > > features of sx give the warning, while counter is silent. > > > > > > That said, if the issue above is analyzed, I do not have any preference > > > and will not object strongly against you decision. > > > > > > > [snip] > > > > This patch is about providing a way to block execs and exits of > > processes so that it is safe to inspect their state. For instance it > > deals with a case where the target process executes a setuid binary, > > where the process inspecting it could have its permission checked prior > > to exec but is reading data after setuid completed. > > > > LOR mentioned above indeed comes form tty handling and is handled by > > relocking the vnode. > > > > Oops, the attached patch was missing one place (devfs_lookupx). Updated patch: > > > diff --git a/sys/fs/devfs/devfs_vnops.c b/sys/fs/devfs/devfs_vnops.c > index fe8e9ef..c7f579a 100644 > --- a/sys/fs/devfs/devfs_vnops.c > +++ b/sys/fs/devfs/devfs_vnops.c > @@ -574,6 +574,8 @@ devfs_close(struct vop_close_args *ap) > if (vp->v_data == NULL) > return (0); > > + vp_locked = VOP_ISLOCKED(vp); > + > /* >* Hack: a tty device that is a controlling terminal >* has a reference from the session structure. > @@ -589,6 +591,7 @@ devfs_close(struct vop_close_args *ap) > if (vp == p->p_session->s_ttyvp) { > PROC_UNLOCK(p); > oldvp = NULL; > + VOP_UNLOCK(vp, 0); This opens a window where vp can be reclaimed. Then vp->v_rdev is invalid and dev_refthread() below accesses random memory. Might be, the easiest fix is to call dev_refthread() before the td != NULL block, but leave dsw == NULL check where it is now. > sx_xlock(&proctree_lock); > if (vp == p->p_session->s_ttyvp) { > SESS_LOCK(p->p_session); > @@ -603,6 +606,7 @@ devfs_close(struct vop_close_args *ap) > SESS_UNLOCK(p->p_session); > } > sx_xunlock(&proctree_lock); > + vn_lock(vp, vp_locked | LK_RETRY); > if (oldvp != NULL) > vrele(oldvp); > } else > @@ -632,7 +636,6 @@ devfs_close(struct vop_close_args *ap) > } > vholdl(vp); > VI_UNLOCK(vp); > - vp_locked = VOP_ISLOCKED(vp); > VOP_UNLOCK(vp, 0); > KASSERT(dev->si_refcount > 0, > ("devfs_close() on un-referenced struct cdev *(%s)", > devtoname(dev))); > @@ -964,10 +967,13 @@ devfs_lookupx(struct vop_lookup_args *ap, int > *dm_unlock
Re: svn commit: r267760 - head/sys/kern
On Mon, Jul 06, 2015 at 07:51:35AM +0200, Mateusz Guzik wrote: > On Thu, Jul 17, 2014 at 03:56:38AM +0300, Konstantin Belousov wrote: > > On Sun, Jul 13, 2014 at 11:36:24PM +0200, Mateusz Guzik wrote: > > > There is an additional problem with slocked case: witness report a lor > > > with devfs vnodes. > > > > > > lock order reversal: > > > 1st 0xf80006fe6ab0 process imagelock (process imagelock) @ > > > /usr/src/sys/kern/kern_proc.c:287 > > > 2nd 0xf80018c88240 devfs (devfs) @ /usr/src/sys/kern/vfs_cache.c:1241 > > > KDB: stack backtrace: > > > db_trace_self_wrapper() at db_trace_self_wrapper+0x2b/frame > > > 0xfe012324f120 > > > kdb_backtrace() at kdb_backtrace+0x39/frame 0xfe012324f1d0 > > > witness_checkorder() at witness_checkorder+0xdc2/frame 0xfe012324f260 > > > __lockmgr_args() at __lockmgr_args+0x588/frame 0xfe012324f3a0 > > > vop_stdlock() at vop_stdlock+0x3c/frame 0xfe012324f3c0 > > > VOP_LOCK1_APV() at VOP_LOCK1_APV+0xfc/frame 0xfe012324f3f0 > > > _vn_lock() at _vn_lock+0xaa/frame 0xfe012324f460 > > > vn_vptocnp_locked() at vn_vptocnp_locked+0xe8/frame 0xfe012324f4d0 > > > vn_fullpath1() at vn_fullpath1+0xb0/frame 0xfe012324f530 > > > vn_fullpath() at vn_fullpath+0xc1/frame 0xfe012324f580 > > > export_fd_to_sb() at export_fd_to_sb+0x489/frame 0xfe012324f7b0 > > > kern_proc_filedesc_out() at kern_proc_filedesc_out+0x1c6/frame > > > 0xfe012324f840 > > > sysctl_kern_proc_filedesc() at sysctl_kern_proc_filedesc+0x84/frame > > > 0xfe012324f900 > > > sysctl_root_handler_locked() at sysctl_root_handler_locked+0x68/frame > > > 0xfe012324f940 > > > sysctl_root() at sysctl_root+0x18e/frame 0xfe012324f990 > > > userland_sysctl() at userland_sysctl+0x192/frame 0xfe012324fa30 > > > > > > witness detected the following lock orders: > > > devfs -> proctree > > Where is the dependency catched comes from ? I suspect it might be tty. > > > > I consider this case as an advantage of using sx over the hand-rolled lock, > > since the issue is/must be present with the counter as well, or the LOR > > is false positive, possibly due to sx taken in shared mode. But debugging > > features of sx give the warning, while counter is silent. > > > > That said, if the issue above is analyzed, I do not have any preference > > and will not object strongly against you decision. > > > > [snip] > > This patch is about providing a way to block execs and exits of > processes so that it is safe to inspect their state. For instance it > deals with a case where the target process executes a setuid binary, > where the process inspecting it could have its permission checked prior > to exec but is reading data after setuid completed. > > LOR mentioned above indeed comes form tty handling and is handled by > relocking the vnode. > Oops, the attached patch was missing one place (devfs_lookupx). Updated patch: diff --git a/sys/fs/devfs/devfs_vnops.c b/sys/fs/devfs/devfs_vnops.c index fe8e9ef..c7f579a 100644 --- a/sys/fs/devfs/devfs_vnops.c +++ b/sys/fs/devfs/devfs_vnops.c @@ -574,6 +574,8 @@ devfs_close(struct vop_close_args *ap) if (vp->v_data == NULL) return (0); + vp_locked = VOP_ISLOCKED(vp); + /* * Hack: a tty device that is a controlling terminal * has a reference from the session structure. @@ -589,6 +591,7 @@ devfs_close(struct vop_close_args *ap) if (vp == p->p_session->s_ttyvp) { PROC_UNLOCK(p); oldvp = NULL; + VOP_UNLOCK(vp, 0); sx_xlock(&proctree_lock); if (vp == p->p_session->s_ttyvp) { SESS_LOCK(p->p_session); @@ -603,6 +606,7 @@ devfs_close(struct vop_close_args *ap) SESS_UNLOCK(p->p_session); } sx_xunlock(&proctree_lock); + vn_lock(vp, vp_locked | LK_RETRY); if (oldvp != NULL) vrele(oldvp); } else @@ -632,7 +636,6 @@ devfs_close(struct vop_close_args *ap) } vholdl(vp); VI_UNLOCK(vp); - vp_locked = VOP_ISLOCKED(vp); VOP_UNLOCK(vp, 0); KASSERT(dev->si_refcount > 0, ("devfs_close() on un-referenced struct cdev *(%s)", devtoname(dev))); @@ -964,10 +967,13 @@ devfs_lookupx(struct vop_lookup_args *ap, int *dm_unlock) cdev = NULL; DEVFS_DMP_HOLD(dmp); sx_xunlock(&dmp->dm_lock); + dvplocked = VOP_ISLOCKED(dvp); + VOP_UNLOCK(dvp, 0); sx_slock(&clone_drain_lock); EVENTHANDLER_INVOKE(dev_clone, td->td_ucred, pname, strlen(pname), &cdev); sx_sunlock(&clone_drain_lock); + vn_lock(dvp, dvplocked | LK_RETRY); if (cdev == NULL)
Re: svn commit: r267760 - head/sys/kern
On Thu, Jul 17, 2014 at 03:56:38AM +0300, Konstantin Belousov wrote: > On Sun, Jul 13, 2014 at 11:36:24PM +0200, Mateusz Guzik wrote: > > There is an additional problem with slocked case: witness report a lor > > with devfs vnodes. > > > > lock order reversal: > > 1st 0xf80006fe6ab0 process imagelock (process imagelock) @ > > /usr/src/sys/kern/kern_proc.c:287 > > 2nd 0xf80018c88240 devfs (devfs) @ /usr/src/sys/kern/vfs_cache.c:1241 > > KDB: stack backtrace: > > db_trace_self_wrapper() at db_trace_self_wrapper+0x2b/frame > > 0xfe012324f120 > > kdb_backtrace() at kdb_backtrace+0x39/frame 0xfe012324f1d0 > > witness_checkorder() at witness_checkorder+0xdc2/frame 0xfe012324f260 > > __lockmgr_args() at __lockmgr_args+0x588/frame 0xfe012324f3a0 > > vop_stdlock() at vop_stdlock+0x3c/frame 0xfe012324f3c0 > > VOP_LOCK1_APV() at VOP_LOCK1_APV+0xfc/frame 0xfe012324f3f0 > > _vn_lock() at _vn_lock+0xaa/frame 0xfe012324f460 > > vn_vptocnp_locked() at vn_vptocnp_locked+0xe8/frame 0xfe012324f4d0 > > vn_fullpath1() at vn_fullpath1+0xb0/frame 0xfe012324f530 > > vn_fullpath() at vn_fullpath+0xc1/frame 0xfe012324f580 > > export_fd_to_sb() at export_fd_to_sb+0x489/frame 0xfe012324f7b0 > > kern_proc_filedesc_out() at kern_proc_filedesc_out+0x1c6/frame > > 0xfe012324f840 > > sysctl_kern_proc_filedesc() at sysctl_kern_proc_filedesc+0x84/frame > > 0xfe012324f900 > > sysctl_root_handler_locked() at sysctl_root_handler_locked+0x68/frame > > 0xfe012324f940 > > sysctl_root() at sysctl_root+0x18e/frame 0xfe012324f990 > > userland_sysctl() at userland_sysctl+0x192/frame 0xfe012324fa30 > > > > witness detected the following lock orders: > > devfs -> proctree > Where is the dependency catched comes from ? I suspect it might be tty. > > I consider this case as an advantage of using sx over the hand-rolled lock, > since the issue is/must be present with the counter as well, or the LOR > is false positive, possibly due to sx taken in shared mode. But debugging > features of sx give the warning, while counter is silent. > > That said, if the issue above is analyzed, I do not have any preference > and will not object strongly against you decision. > [snip] This patch is about providing a way to block execs and exits of processes so that it is safe to inspect their state. For instance it deals with a case where the target process executes a setuid binary, where the process inspecting it could have its permission checked prior to exec but is reading data after setuid completed. LOR mentioned above indeed comes form tty handling and is handled by relocking the vnode. Patch below is the variant with sx lock. diff --git a/sys/fs/devfs/devfs_vnops.c b/sys/fs/devfs/devfs_vnops.c index fe8e9ef..31f43a8 100644 --- a/sys/fs/devfs/devfs_vnops.c +++ b/sys/fs/devfs/devfs_vnops.c @@ -574,6 +574,8 @@ devfs_close(struct vop_close_args *ap) if (vp->v_data == NULL) return (0); + vp_locked = VOP_ISLOCKED(vp); + /* * Hack: a tty device that is a controlling terminal * has a reference from the session structure. @@ -589,6 +591,7 @@ devfs_close(struct vop_close_args *ap) if (vp == p->p_session->s_ttyvp) { PROC_UNLOCK(p); oldvp = NULL; + VOP_UNLOCK(vp, 0); sx_xlock(&proctree_lock); if (vp == p->p_session->s_ttyvp) { SESS_LOCK(p->p_session); @@ -603,6 +606,7 @@ devfs_close(struct vop_close_args *ap) SESS_UNLOCK(p->p_session); } sx_xunlock(&proctree_lock); + vn_lock(vp, vp_locked | LK_RETRY); if (oldvp != NULL) vrele(oldvp); } else @@ -632,7 +636,6 @@ devfs_close(struct vop_close_args *ap) } vholdl(vp); VI_UNLOCK(vp); - vp_locked = VOP_ISLOCKED(vp); VOP_UNLOCK(vp, 0); KASSERT(dev->si_refcount > 0, ("devfs_close() on un-referenced struct cdev *(%s)", devtoname(dev))); diff --git a/sys/fs/nfsclient/nfs_clport.c b/sys/fs/nfsclient/nfs_clport.c index d3bac30..118a7d9 100644 --- a/sys/fs/nfsclient/nfs_clport.c +++ b/sys/fs/nfsclient/nfs_clport.c @@ -1188,7 +1188,7 @@ nfscl_procdoesntexist(u_int8_t *own) tl.cval[2] = *own++; tl.cval[3] = *own++; pid = tl.lval; - p = pfind_locked(pid); + p = pfind_locked(pid, 0); if (p == NULL) return (1); if (p->p_stats == NULL) { diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c index 0675128..2ad09aa 100644 --- a/sys/kern/imgact_elf.c +++ b/sys/kern/imgact_elf.c @@ -1887,6 +1887,7 @@ note_procstat_files(void *arg, struct sbuf *sb, size_t *sizep) sb = sbuf_new(NULL, NULL, 128, SBUF_FIXEDLEN); sb
Re: svn commit: r267760 - head/sys/kern
On Sun, Jul 13, 2014 at 11:36:24PM +0200, Mateusz Guzik wrote: > On Sun, Jul 13, 2014 at 04:26:52PM +0300, Konstantin Belousov wrote: > > On Fri, Jul 11, 2014 at 01:19:25PM +0200, Mateusz Guzik wrote: > > > On Fri, Jul 11, 2014 at 12:55:51PM +0300, Konstantin Belousov wrote: > > > > The nolock version requires two atomics on both entry and leave from the > > > > protected region, while sx-locked variant requires only one atomic for > > > > entry and leave. > > > > > > > > I am not sure why you decided to acquire p->p_keeplock in after the > > > > proc lock in pget(), which indeed causes the complications of dropping > > > > the proc_lock and rechecking to avoid LOR. Did you tried to add a flag > > > > to pfind*() functions to indicate that p_keeplock should be acquired, > > > > instead ? > > > > > > Lock is taken later to avoid waiting for finished exec/exit of processes > > > we cannot return, so that e.g. procstat -fa does not trip over that > > > much. > > > > > > Right now only PROC_LOCK guarantees stability of p->p_ucred across pget > > > operation. Without that the code would have to crget() and various > > > functions modified to accept cred instead of proc, or 'imagelock' > > > mechanism would have to be extended to also protect against cred > > > changes. > > No, you could get both locks, imagelock first, proc_lock next. > > > > Ignoring allproc_lock: > > sx lock case: > filedesc out: slock + proc lock + proc unlock + sunlock > exit/exec: xlock + xunlock > > counter case: > filedesc out: proc lock + proc unlock + proc lock + proc unlock > exit/exec: just wait for imagelock to be 0 This should be proc_lock/mwait/proc_lock, and proc_wait_imagelocked() does this. > > counter can result in temporary errors due to catching the process > in exec, on the other hand slock before proc lock forces the caller to > wait even for processes it cannot read > > I find the counter case better. > > sx: > http://people.freebsd.org/~mjg/patches/sx-imagelock.patch > > counter: > http://people.freebsd.org/~mjg/patches/counter-imagelock.patch > > There is an additional problem with slocked case: witness report a lor > with devfs vnodes. > > lock order reversal: > 1st 0xf80006fe6ab0 process imagelock (process imagelock) @ > /usr/src/sys/kern/kern_proc.c:287 > 2nd 0xf80018c88240 devfs (devfs) @ /usr/src/sys/kern/vfs_cache.c:1241 > KDB: stack backtrace: > db_trace_self_wrapper() at db_trace_self_wrapper+0x2b/frame 0xfe012324f120 > kdb_backtrace() at kdb_backtrace+0x39/frame 0xfe012324f1d0 > witness_checkorder() at witness_checkorder+0xdc2/frame 0xfe012324f260 > __lockmgr_args() at __lockmgr_args+0x588/frame 0xfe012324f3a0 > vop_stdlock() at vop_stdlock+0x3c/frame 0xfe012324f3c0 > VOP_LOCK1_APV() at VOP_LOCK1_APV+0xfc/frame 0xfe012324f3f0 > _vn_lock() at _vn_lock+0xaa/frame 0xfe012324f460 > vn_vptocnp_locked() at vn_vptocnp_locked+0xe8/frame 0xfe012324f4d0 > vn_fullpath1() at vn_fullpath1+0xb0/frame 0xfe012324f530 > vn_fullpath() at vn_fullpath+0xc1/frame 0xfe012324f580 > export_fd_to_sb() at export_fd_to_sb+0x489/frame 0xfe012324f7b0 > kern_proc_filedesc_out() at kern_proc_filedesc_out+0x1c6/frame > 0xfe012324f840 > sysctl_kern_proc_filedesc() at sysctl_kern_proc_filedesc+0x84/frame > 0xfe012324f900 > sysctl_root_handler_locked() at sysctl_root_handler_locked+0x68/frame > 0xfe012324f940 > sysctl_root() at sysctl_root+0x18e/frame 0xfe012324f990 > userland_sysctl() at userland_sysctl+0x192/frame 0xfe012324fa30 > > witness detected the following lock orders: > devfs -> proctree Where is the dependency catched comes from ? I suspect it might be tty. I consider this case as an advantage of using sx over the hand-rolled lock, since the issue is/must be present with the counter as well, or the LOR is false positive, possibly due to sx taken in shared mode. But debugging features of sx give the warning, while counter is silent. That said, if the issue above is analyzed, I do not have any preference and will not object strongly against you decision. > proctree -> allproc > allproc -> imagelock > imagelock -> devfs pgp0EepJcVAP1.pgp Description: PGP signature
Re: svn commit: r267760 - head/sys/kern
On Sun, Jul 13, 2014 at 04:26:52PM +0300, Konstantin Belousov wrote: > On Fri, Jul 11, 2014 at 01:19:25PM +0200, Mateusz Guzik wrote: > > On Fri, Jul 11, 2014 at 12:55:51PM +0300, Konstantin Belousov wrote: > > > The nolock version requires two atomics on both entry and leave from the > > > protected region, while sx-locked variant requires only one atomic for > > > entry and leave. > > > > > > I am not sure why you decided to acquire p->p_keeplock in after the > > > proc lock in pget(), which indeed causes the complications of dropping > > > the proc_lock and rechecking to avoid LOR. Did you tried to add a flag > > > to pfind*() functions to indicate that p_keeplock should be acquired, > > > instead ? > > > > Lock is taken later to avoid waiting for finished exec/exit of processes > > we cannot return, so that e.g. procstat -fa does not trip over that > > much. > > > > Right now only PROC_LOCK guarantees stability of p->p_ucred across pget > > operation. Without that the code would have to crget() and various > > functions modified to accept cred instead of proc, or 'imagelock' > > mechanism would have to be extended to also protect against cred > > changes. > No, you could get both locks, imagelock first, proc_lock next. > Ignoring allproc_lock: sx lock case: filedesc out: slock + proc lock + proc unlock + sunlock exit/exec: xlock + xunlock counter case: filedesc out: proc lock + proc unlock + proc lock + proc unlock exit/exec: just wait for imagelock to be 0 counter can result in temporary errors due to catching the process in exec, on the other hand slock before proc lock forces the caller to wait even for processes it cannot read I find the counter case better. sx: http://people.freebsd.org/~mjg/patches/sx-imagelock.patch counter: http://people.freebsd.org/~mjg/patches/counter-imagelock.patch There is an additional problem with slocked case: witness report a lor with devfs vnodes. lock order reversal: 1st 0xf80006fe6ab0 process imagelock (process imagelock) @ /usr/src/sys/kern/kern_proc.c:287 2nd 0xf80018c88240 devfs (devfs) @ /usr/src/sys/kern/vfs_cache.c:1241 KDB: stack backtrace: db_trace_self_wrapper() at db_trace_self_wrapper+0x2b/frame 0xfe012324f120 kdb_backtrace() at kdb_backtrace+0x39/frame 0xfe012324f1d0 witness_checkorder() at witness_checkorder+0xdc2/frame 0xfe012324f260 __lockmgr_args() at __lockmgr_args+0x588/frame 0xfe012324f3a0 vop_stdlock() at vop_stdlock+0x3c/frame 0xfe012324f3c0 VOP_LOCK1_APV() at VOP_LOCK1_APV+0xfc/frame 0xfe012324f3f0 _vn_lock() at _vn_lock+0xaa/frame 0xfe012324f460 vn_vptocnp_locked() at vn_vptocnp_locked+0xe8/frame 0xfe012324f4d0 vn_fullpath1() at vn_fullpath1+0xb0/frame 0xfe012324f530 vn_fullpath() at vn_fullpath+0xc1/frame 0xfe012324f580 export_fd_to_sb() at export_fd_to_sb+0x489/frame 0xfe012324f7b0 kern_proc_filedesc_out() at kern_proc_filedesc_out+0x1c6/frame 0xfe012324f840 sysctl_kern_proc_filedesc() at sysctl_kern_proc_filedesc+0x84/frame 0xfe012324f900 sysctl_root_handler_locked() at sysctl_root_handler_locked+0x68/frame 0xfe012324f940 sysctl_root() at sysctl_root+0x18e/frame 0xfe012324f990 userland_sysctl() at userland_sysctl+0x192/frame 0xfe012324fa30 witness detected the following lock orders: devfs -> proctree proctree -> allproc allproc -> imagelock imagelock -> devfs -- Mateusz Guzik ___ svn-src-all@freebsd.org mailing list http://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
Re: svn commit: r267760 - head/sys/kern
On Fri, Jul 11, 2014 at 01:19:25PM +0200, Mateusz Guzik wrote: > On Fri, Jul 11, 2014 at 12:55:51PM +0300, Konstantin Belousov wrote: > > On Fri, Jul 11, 2014 at 04:43:51AM +0200, Mateusz Guzik wrote: > > > In both cases the same mechanism blocks both exec and exit, this can be > > > split if needed (p_lock would still cover exit, p_something would cover > > > exec). > > > > > > Here is a version with sx lock: > > > > > > http://people.freebsd.org/~mjg/patches/exec-exit-hold-wait.patch > > > > > > I'm not really happy with this. Reading foreign fdt is very rare and > > > this adds lock + unlock for every exec and exit. > > > > > > On the other hand mere counter version is rather simple: > > > > > > http://people.freebsd.org/~mjg/patches/exec-exit-hold-nolock.patch > > > > > > I don't have strong opinion here, but prefer the latter. > > > > I suggest the name 'imagelock' for the beast. > > > > Sounds good. > > > The nolock version requires two atomics on both entry and leave from the > > protected region, while sx-locked variant requires only one atomic for > > entry and leave. > > > > I am not sure why you decided to acquire p->p_keeplock in after the > > proc lock in pget(), which indeed causes the complications of dropping > > the proc_lock and rechecking to avoid LOR. Did you tried to add a flag > > to pfind*() functions to indicate that p_keeplock should be acquired, > > instead ? > > Lock is taken later to avoid waiting for finished exec/exit of processes > we cannot return, so that e.g. procstat -fa does not trip over that > much. > > Right now only PROC_LOCK guarantees stability of p->p_ucred across pget > operation. Without that the code would have to crget() and various > functions modified to accept cred instead of proc, or 'imagelock' > mechanism would have to be extended to also protect against cred > changes. No, you could get both locks, imagelock first, proc_lock next. > > That said, the code could be indeed changed to sx requiring one atomic > on entry and leave, but that would still leave us with such atomics in > exit and exec and the last 2 are way more common than the first one, > thus I prefer counter case which only adds lock + unlock on leave. > > -- > Mateusz Guzik pgp8aPtfy5R68.pgp Description: PGP signature
Re: svn commit: r267760 - head/sys/kern
On Fri, Jul 11, 2014 at 12:55:51PM +0300, Konstantin Belousov wrote: > On Fri, Jul 11, 2014 at 04:43:51AM +0200, Mateusz Guzik wrote: > > In both cases the same mechanism blocks both exec and exit, this can be > > split if needed (p_lock would still cover exit, p_something would cover > > exec). > > > > Here is a version with sx lock: > > > > http://people.freebsd.org/~mjg/patches/exec-exit-hold-wait.patch > > > > I'm not really happy with this. Reading foreign fdt is very rare and > > this adds lock + unlock for every exec and exit. > > > > On the other hand mere counter version is rather simple: > > > > http://people.freebsd.org/~mjg/patches/exec-exit-hold-nolock.patch > > > > I don't have strong opinion here, but prefer the latter. > > I suggest the name 'imagelock' for the beast. > Sounds good. > The nolock version requires two atomics on both entry and leave from the > protected region, while sx-locked variant requires only one atomic for > entry and leave. > > I am not sure why you decided to acquire p->p_keeplock in after the > proc lock in pget(), which indeed causes the complications of dropping > the proc_lock and rechecking to avoid LOR. Did you tried to add a flag > to pfind*() functions to indicate that p_keeplock should be acquired, > instead ? Lock is taken later to avoid waiting for finished exec/exit of processes we cannot return, so that e.g. procstat -fa does not trip over that much. Right now only PROC_LOCK guarantees stability of p->p_ucred across pget operation. Without that the code would have to crget() and various functions modified to accept cred instead of proc, or 'imagelock' mechanism would have to be extended to also protect against cred changes. That said, the code could be indeed changed to sx requiring one atomic on entry and leave, but that would still leave us with such atomics in exit and exec and the last 2 are way more common than the first one, thus I prefer counter case which only adds lock + unlock on leave. -- Mateusz Guzik ___ svn-src-all@freebsd.org mailing list http://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
Re: svn commit: r267760 - head/sys/kern
On Fri, Jul 11, 2014 at 04:43:51AM +0200, Mateusz Guzik wrote: > On Mon, Jun 23, 2014 at 07:35:23PM +0300, Konstantin Belousov wrote: > > On Mon, Jun 23, 2014 at 03:16:53PM +0200, Mateusz Guzik wrote: > > > If traversal while transition to P_INEXEC is allowed, execve dealing > > > with a setuid binary is problematic. This is more of hypothetical nature, > > > but with sufficienly long delay it could finish the syscall and start > > > opening some files, which paths would now be visible for an unprivileged > > > reader. > > > > > > That said, I propose adding a counter to struct proc which would which > > > would block execve. It would be quite similar to p_lock. > > I thought about this too. In fact, I considered using PHOLD for this. > > > > > > > > iow execve would: > > > > > > PROC_LOCK(p); > > > p->p_flag |= P_INEXEC; > > > while (p->p_execlock > 0) > > > msleep(&p->p_execlock, &p->p_mtx, PWAIT, "execlock", 0); > > > PROC_UNLOCK(p); > > > > > > And it would be mandatory for external fdp consumers to grab the counter. > > > > > > I'm tempted to add P_GETPIN which would both increase p_lock and > > > p_execlock, > > > that way the process is guaranteed not to exit and not to execve even > > > after proc lock is dropped. > > See above about PHOLD. > > > > > > > > There is a separate question if p_execlock should be renamed and > > > extended to also block any kind of credential changes. > > > > > > Then the guarantee is even stronger since we know that credentials we > > > checked against are not going to change for the duration of our > > > operations, but it is unclear if we need this. > > > > If doing separate execlock/p_lock, I think that it could be possible > > to use per-process sx lock instead of hand-rolling the counter. The > > accessors would lock sx shared, while kern_execve would take it in > > exclusive mode. > > Both patches need some cleaning up. The name 'keeplock' is no exactly > the best either. > > In both cases the same mechanism blocks both exec and exit, this can be > split if needed (p_lock would still cover exit, p_something would cover > exec). > > Here is a version with sx lock: > > http://people.freebsd.org/~mjg/patches/exec-exit-hold-wait.patch > > I'm not really happy with this. Reading foreign fdt is very rare and > this adds lock + unlock for every exec and exit. > > On the other hand mere counter version is rather simple: > > http://people.freebsd.org/~mjg/patches/exec-exit-hold-nolock.patch > > I don't have strong opinion here, but prefer the latter. I suggest the name 'imagelock' for the beast. The nolock version requires two atomics on both entry and leave from the protected region, while sx-locked variant requires only one atomic for entry and leave. I am not sure why you decided to acquire p->p_keeplock in after the proc lock in pget(), which indeed causes the complications of dropping the proc_lock and rechecking to avoid LOR. Did you tried to add a flag to pfind*() functions to indicate that p_keeplock should be acquired, instead ? pgpDPPmoqHYU0.pgp Description: PGP signature
Re: svn commit: r267760 - head/sys/kern
On Mon, Jun 23, 2014 at 07:35:23PM +0300, Konstantin Belousov wrote: > On Mon, Jun 23, 2014 at 03:16:53PM +0200, Mateusz Guzik wrote: > > If traversal while transition to P_INEXEC is allowed, execve dealing > > with a setuid binary is problematic. This is more of hypothetical nature, > > but with sufficienly long delay it could finish the syscall and start > > opening some files, which paths would now be visible for an unprivileged > > reader. > > > > That said, I propose adding a counter to struct proc which would which > > would block execve. It would be quite similar to p_lock. > I thought about this too. In fact, I considered using PHOLD for this. > > > > > iow execve would: > > > > PROC_LOCK(p); > > p->p_flag |= P_INEXEC; > > while (p->p_execlock > 0) > > msleep(&p->p_execlock, &p->p_mtx, PWAIT, "execlock", 0); > > PROC_UNLOCK(p); > > > > And it would be mandatory for external fdp consumers to grab the counter. > > > > I'm tempted to add P_GETPIN which would both increase p_lock and p_execlock, > > that way the process is guaranteed not to exit and not to execve even > > after proc lock is dropped. > See above about PHOLD. > > > > > There is a separate question if p_execlock should be renamed and > > extended to also block any kind of credential changes. > > > > Then the guarantee is even stronger since we know that credentials we > > checked against are not going to change for the duration of our > > operations, but it is unclear if we need this. > > If doing separate execlock/p_lock, I think that it could be possible > to use per-process sx lock instead of hand-rolling the counter. The > accessors would lock sx shared, while kern_execve would take it in > exclusive mode. Both patches need some cleaning up. The name 'keeplock' is no exactly the best either. In both cases the same mechanism blocks both exec and exit, this can be split if needed (p_lock would still cover exit, p_something would cover exec). Here is a version with sx lock: http://people.freebsd.org/~mjg/patches/exec-exit-hold-wait.patch I'm not really happy with this. Reading foreign fdt is very rare and this adds lock + unlock for every exec and exit. On the other hand mere counter version is rather simple: http://people.freebsd.org/~mjg/patches/exec-exit-hold-nolock.patch I don't have strong opinion here, but prefer the latter. -- Mateusz Guzik ___ svn-src-all@freebsd.org mailing list http://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
Re: svn commit: r267760 - head/sys/kern
On Mon, Jun 23, 2014 at 03:16:53PM +0200, Mateusz Guzik wrote: > On Mon, Jun 23, 2014 at 11:18:23AM +0300, Konstantin Belousov wrote: > > On Mon, Jun 23, 2014 at 10:05:01AM +0200, Mateusz Guzik wrote: > > > On Mon, Jun 23, 2014 at 10:25:19AM +0300, Konstantin Belousov wrote: > > > > On Mon, Jun 23, 2014 at 09:06:52AM +0200, Mateusz Guzik wrote: > > > > > The table is modified in these functions and is reachable from the > > > > > rest > > > > > of the kernel (can be found by e.g. sysctl_kern_proc_filedesc), thus > > > > > XLOCK is needed to ensure consistency for readers. It can also be > > > > > altered by mountcheckdirs, although not in a way which disrupts any of > > > > > these functions. > > > > I would think that such cases should be avoided by testing for P_INEXEC, > > > > but I do not insist on this. > > > > > > > > > > proc lock has to be dropped before filedesc lock is taken and I don't > > > see any way to prevent the proc from transitioning to P_INEXEC afterwards. > > > Since sysctl_kern_proc_filedesc et al can take a long time it does not > > > seem feasible to pursue this. > > > > > After a second look this problem has to be dealt with. > > If traversal while transition to P_INEXEC is allowed, execve dealing > with a setuid binary is problematic. This is more of hypothetical nature, > but with sufficienly long delay it could finish the syscall and start > opening some files, which paths would now be visible for an unprivileged > reader. > > That said, I propose adding a counter to struct proc which would which > would block execve. It would be quite similar to p_lock. I thought about this too. In fact, I considered using PHOLD for this. > > iow execve would: > > PROC_LOCK(p); > p->p_flag |= P_INEXEC; > while (p->p_execlock > 0) > msleep(&p->p_execlock, &p->p_mtx, PWAIT, "execlock", 0); > PROC_UNLOCK(p); > > And it would be mandatory for external fdp consumers to grab the counter. > > I'm tempted to add P_GETPIN which would both increase p_lock and p_execlock, > that way the process is guaranteed not to exit and not to execve even > after proc lock is dropped. See above about PHOLD. > > There is a separate question if p_execlock should be renamed and > extended to also block any kind of credential changes. > > Then the guarantee is even stronger since we know that credentials we > checked against are not going to change for the duration of our > operations, but it is unclear if we need this. If doing separate execlock/p_lock, I think that it could be possible to use per-process sx lock instead of hand-rolling the counter. The accessors would lock sx shared, while kern_execve would take it in exclusive mode. pgpCuY9nMjzhd.pgp Description: PGP signature
Re: svn commit: r267760 - head/sys/kern
On Mon, Jun 23, 2014 at 11:18:23AM +0300, Konstantin Belousov wrote: > On Mon, Jun 23, 2014 at 10:05:01AM +0200, Mateusz Guzik wrote: > > On Mon, Jun 23, 2014 at 10:25:19AM +0300, Konstantin Belousov wrote: > > > On Mon, Jun 23, 2014 at 09:06:52AM +0200, Mateusz Guzik wrote: > > > > The table is modified in these functions and is reachable from the rest > > > > of the kernel (can be found by e.g. sysctl_kern_proc_filedesc), thus > > > > XLOCK is needed to ensure consistency for readers. It can also be > > > > altered by mountcheckdirs, although not in a way which disrupts any of > > > > these functions. > > > I would think that such cases should be avoided by testing for P_INEXEC, > > > but I do not insist on this. > > > > > > > proc lock has to be dropped before filedesc lock is taken and I don't > > see any way to prevent the proc from transitioning to P_INEXEC afterwards. > > Since sysctl_kern_proc_filedesc et al can take a long time it does not > > seem feasible to pursue this. > > After a second look this problem has to be dealt with. If traversal while transition to P_INEXEC is allowed, execve dealing with a setuid binary is problematic. This is more of hypothetical nature, but with sufficienly long delay it could finish the syscall and start opening some files, which paths would now be visible for an unprivileged reader. That said, I propose adding a counter to struct proc which would which would block execve. It would be quite similar to p_lock. iow execve would: PROC_LOCK(p); p->p_flag |= P_INEXEC; while (p->p_execlock > 0) msleep(&p->p_execlock, &p->p_mtx, PWAIT, "execlock", 0); PROC_UNLOCK(p); And it would be mandatory for external fdp consumers to grab the counter. I'm tempted to add P_GETPIN which would both increase p_lock and p_execlock, that way the process is guaranteed not to exit and not to execve even after proc lock is dropped. There is a separate question if p_execlock should be renamed and extended to also block any kind of credential changes. Then the guarantee is even stronger since we know that credentials we checked against are not going to change for the duration of our operations, but it is unclear if we need this. -- Mateusz Guzik ___ svn-src-all@freebsd.org mailing list http://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
Re: svn commit: r267760 - head/sys/kern
On Mon, Jun 23, 2014 at 10:05:01AM +0200, Mateusz Guzik wrote: > On Mon, Jun 23, 2014 at 10:25:19AM +0300, Konstantin Belousov wrote: > > On Mon, Jun 23, 2014 at 09:06:52AM +0200, Mateusz Guzik wrote: > > > The table is modified in these functions and is reachable from the rest > > > of the kernel (can be found by e.g. sysctl_kern_proc_filedesc), thus > > > XLOCK is needed to ensure consistency for readers. It can also be > > > altered by mountcheckdirs, although not in a way which disrupts any of > > > these functions. > > I would think that such cases should be avoided by testing for P_INEXEC, > > but I do not insist on this. > > > > proc lock has to be dropped before filedesc lock is taken and I don't > see any way to prevent the proc from transitioning to P_INEXEC afterwards. > Since sysctl_kern_proc_filedesc et al can take a long time it does not > seem feasible to pursue this. > > This also makes me realize I screwed up r265247 "Request a non-exiting > process in sysctl_kern_proc_{o,}filedesc". As it is the code has to hold > the process and release it later to really close the race and not only > affect the window. I'll poke around this later, maybe it is better to > nullify some stuff in exit1 with PROC_LOCK held and access it in a > similar manner. > > > > @@ -2097,7 +2102,6 @@ setugidsafety(struct thread *td) > > > fdfree(fdp, i); > > > FILEDESC_XUNLOCK(fdp); > > > (void) closef(fp, td); > > > - FILEDESC_XLOCK(fdp); > > > } > > > } > > > FILEDESC_XUNLOCK(fdp); > > This unlock should be removed ? > > > > Ugh, yes. > > > > + /* See the comment in setugidsafety */ > > > + FILEDESC_XLOCK(fdp); > > > fdfree(fdp, i); > > > (void) closefp(fdp, i, fp, td, 0); > > > /* closefp() drops the FILEDESC lock. */ > > > - FILEDESC_XLOCK(fdp); > > This should be XUNLOCK ? > > This one is fine as closefp drops the lock. > > > > } > > > } > > > FILEDESC_XUNLOCK(fdp); > > And this unlock removed ? > > Yes. > > I tested this patch and it works fine: > > diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c > index 25c3a1e..a900464 100644 > --- a/sys/kern/kern_descrip.c > +++ b/sys/kern/kern_descrip.c > @@ -2082,13 +2082,18 @@ setugidsafety(struct thread *td) > int i; > > fdp = td->td_proc->p_fd; > + /* > + * While no other thread can alter filedescriptors in this table, > + * there may be code trying to read it, thus the lock is required > + * to provide consistent view if we are going to change it. > + */ > KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); > - FILEDESC_XLOCK(fdp); > for (i = 0; i <= fdp->fd_lastfile; i++) { > if (i > 2) > break; > fp = fdp->fd_ofiles[i].fde_file; > if (fp != NULL && is_unsafe(fp)) { > + FILEDESC_XLOCK(fdp); > knote_fdclose(td, i); > /* >* NULL-out descriptor prior to close to avoid > @@ -2097,10 +2102,8 @@ setugidsafety(struct thread *td) > fdfree(fdp, i); > FILEDESC_XUNLOCK(fdp); > (void) closef(fp, td); > - FILEDESC_XLOCK(fdp); > } > } > - FILEDESC_XUNLOCK(fdp); > } > > /* > @@ -2136,19 +2139,18 @@ fdcloseexec(struct thread *td) > > fdp = td->td_proc->p_fd; > KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); > - FILEDESC_XLOCK(fdp); > for (i = 0; i <= fdp->fd_lastfile; i++) { > fde = &fdp->fd_ofiles[i]; > fp = fde->fde_file; > if (fp != NULL && (fp->f_type == DTYPE_MQUEUE || > (fde->fde_flags & UF_EXCLOSE))) { > + /* See the comment in setugidsafety */ > + FILEDESC_XLOCK(fdp); > fdfree(fdp, i); > (void) closefp(fdp, i, fp, td, 0); > /* closefp() drops the FILEDESC lock. */ > - FILEDESC_XLOCK(fdp); > } > } > - FILEDESC_XUNLOCK(fdp); > } > > /* This looks fine, but please e.g. add a reference to the comment in etugidsafety(), to fdcloseexec(). Or duplicate it. pgpGV4JEGvzDc.pgp Description: PGP signature
Re: svn commit: r267760 - head/sys/kern
On Mon, Jun 23, 2014 at 10:25:19AM +0300, Konstantin Belousov wrote: > On Mon, Jun 23, 2014 at 09:06:52AM +0200, Mateusz Guzik wrote: > > The table is modified in these functions and is reachable from the rest > > of the kernel (can be found by e.g. sysctl_kern_proc_filedesc), thus > > XLOCK is needed to ensure consistency for readers. It can also be > > altered by mountcheckdirs, although not in a way which disrupts any of > > these functions. > I would think that such cases should be avoided by testing for P_INEXEC, > but I do not insist on this. > proc lock has to be dropped before filedesc lock is taken and I don't see any way to prevent the proc from transitioning to P_INEXEC afterwards. Since sysctl_kern_proc_filedesc et al can take a long time it does not seem feasible to pursue this. This also makes me realize I screwed up r265247 "Request a non-exiting process in sysctl_kern_proc_{o,}filedesc". As it is the code has to hold the process and release it later to really close the race and not only affect the window. I'll poke around this later, maybe it is better to nullify some stuff in exit1 with PROC_LOCK held and access it in a similar manner. > > @@ -2097,7 +2102,6 @@ setugidsafety(struct thread *td) > > fdfree(fdp, i); > > FILEDESC_XUNLOCK(fdp); > > (void) closef(fp, td); > > - FILEDESC_XLOCK(fdp); > > } > > } > > FILEDESC_XUNLOCK(fdp); > This unlock should be removed ? > Ugh, yes. > > + /* See the comment in setugidsafety */ > > + FILEDESC_XLOCK(fdp); > > fdfree(fdp, i); > > (void) closefp(fdp, i, fp, td, 0); > > /* closefp() drops the FILEDESC lock. */ > > - FILEDESC_XLOCK(fdp); > This should be XUNLOCK ? This one is fine as closefp drops the lock. > > } > > } > > FILEDESC_XUNLOCK(fdp); > And this unlock removed ? Yes. I tested this patch and it works fine: diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c index 25c3a1e..a900464 100644 --- a/sys/kern/kern_descrip.c +++ b/sys/kern/kern_descrip.c @@ -2082,13 +2082,18 @@ setugidsafety(struct thread *td) int i; fdp = td->td_proc->p_fd; + /* +* While no other thread can alter filedescriptors in this table, +* there may be code trying to read it, thus the lock is required +* to provide consistent view if we are going to change it. +*/ KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); - FILEDESC_XLOCK(fdp); for (i = 0; i <= fdp->fd_lastfile; i++) { if (i > 2) break; fp = fdp->fd_ofiles[i].fde_file; if (fp != NULL && is_unsafe(fp)) { + FILEDESC_XLOCK(fdp); knote_fdclose(td, i); /* * NULL-out descriptor prior to close to avoid @@ -2097,10 +2102,8 @@ setugidsafety(struct thread *td) fdfree(fdp, i); FILEDESC_XUNLOCK(fdp); (void) closef(fp, td); - FILEDESC_XLOCK(fdp); } } - FILEDESC_XUNLOCK(fdp); } /* @@ -2136,19 +2139,18 @@ fdcloseexec(struct thread *td) fdp = td->td_proc->p_fd; KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); - FILEDESC_XLOCK(fdp); for (i = 0; i <= fdp->fd_lastfile; i++) { fde = &fdp->fd_ofiles[i]; fp = fde->fde_file; if (fp != NULL && (fp->f_type == DTYPE_MQUEUE || (fde->fde_flags & UF_EXCLOSE))) { + /* See the comment in setugidsafety */ + FILEDESC_XLOCK(fdp); fdfree(fdp, i); (void) closefp(fdp, i, fp, td, 0); /* closefp() drops the FILEDESC lock. */ - FILEDESC_XLOCK(fdp); } } - FILEDESC_XUNLOCK(fdp); } /* -- Mateusz Guzik ___ svn-src-all@freebsd.org mailing list http://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
Re: svn commit: r267760 - head/sys/kern
On Mon, Jun 23, 2014 at 09:06:52AM +0200, Mateusz Guzik wrote: > On Mon, Jun 23, 2014 at 09:40:44AM +0300, Konstantin Belousov wrote: > > On Mon, Jun 23, 2014 at 01:28:18AM +, Mateusz Guzik wrote: > > > + KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); > > > FILEDESC_XLOCK(fdp); > > This is at least weird. Not incorrect, but the code now looks strange. > > The fd_refcnt == 1 assert just states the circumstances of the code which > > currently calls the functions. Would the functions become incorrect or > > destructive if there are other references to the filedescriptor table ? > > > > In case you argument is that refcnt == 1 must hold to prevent the parallel > > modifications of the descriptor table, which would invalidate the checks > > and actions of the functions, then XLOCK is not needed (similar to your > > earlier commit). > > > > Note that kern_execve() is executed with the process single-threaded, > > which, together with statement fd_refcnt == 1 must prevent the parallel > > modifications. > > The table is modified in these functions and is reachable from the rest > of the kernel (can be found by e.g. sysctl_kern_proc_filedesc), thus > XLOCK is needed to ensure consistency for readers. It can also be > altered by mountcheckdirs, although not in a way which disrupts any of > these functions. I would think that such cases should be avoided by testing for P_INEXEC, but I do not insist on this. > > For now I do agree that both the assertion and XLOCK look weird as it is > and as such deserve a comment. > > Actually we can take the lock only if we are going to modify something. > > How about the following then (untested): See below, but I would be completely satisfied even by only comment addition. > > diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c > index 25c3a1e..31f4881 100644 > --- a/sys/kern/kern_descrip.c > +++ b/sys/kern/kern_descrip.c > @@ -2082,13 +2082,18 @@ setugidsafety(struct thread *td) > int i; > > fdp = td->td_proc->p_fd; > + /* > +* While no other thread can alter filedescriptors in this table, > +* there may be code trying to read it, thus the lock is required > +* to provide consistent view if we are going to change it. > +*/ > KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); > - FILEDESC_XLOCK(fdp); > for (i = 0; i <= fdp->fd_lastfile; i++) { > if (i > 2) > break; > fp = fdp->fd_ofiles[i].fde_file; > if (fp != NULL && is_unsafe(fp)) { > + FILEDESC_XLOCK(fdp); > knote_fdclose(td, i); > /* > * NULL-out descriptor prior to close to avoid > @@ -2097,7 +2102,6 @@ setugidsafety(struct thread *td) > fdfree(fdp, i); > FILEDESC_XUNLOCK(fdp); > (void) closef(fp, td); > - FILEDESC_XLOCK(fdp); > } > } > FILEDESC_XUNLOCK(fdp); This unlock should be removed ? > @@ -2136,16 +2140,16 @@ fdcloseexec(struct thread *td) > > fdp = td->td_proc->p_fd; > KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); > - FILEDESC_XLOCK(fdp); > for (i = 0; i <= fdp->fd_lastfile; i++) { > fde = &fdp->fd_ofiles[i]; > fp = fde->fde_file; > if (fp != NULL && (fp->f_type == DTYPE_MQUEUE || > (fde->fde_flags & UF_EXCLOSE))) { > + /* See the comment in setugidsafety */ > + FILEDESC_XLOCK(fdp); > fdfree(fdp, i); > (void) closefp(fdp, i, fp, td, 0); > /* closefp() drops the FILEDESC lock. */ > - FILEDESC_XLOCK(fdp); This should be XUNLOCK ? > } > } > FILEDESC_XUNLOCK(fdp); And this unlock removed ? > > > -- > Mateusz Guzik pgpMWxnApmOw7.pgp Description: PGP signature
Re: svn commit: r267760 - head/sys/kern
On Mon, Jun 23, 2014 at 09:40:44AM +0300, Konstantin Belousov wrote: > On Mon, Jun 23, 2014 at 01:28:18AM +, Mateusz Guzik wrote: > > + KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); > > FILEDESC_XLOCK(fdp); > This is at least weird. Not incorrect, but the code now looks strange. > The fd_refcnt == 1 assert just states the circumstances of the code which > currently calls the functions. Would the functions become incorrect or > destructive if there are other references to the filedescriptor table ? > > In case you argument is that refcnt == 1 must hold to prevent the parallel > modifications of the descriptor table, which would invalidate the checks > and actions of the functions, then XLOCK is not needed (similar to your > earlier commit). > > Note that kern_execve() is executed with the process single-threaded, > which, together with statement fd_refcnt == 1 must prevent the parallel > modifications. The table is modified in these functions and is reachable from the rest of the kernel (can be found by e.g. sysctl_kern_proc_filedesc), thus XLOCK is needed to ensure consistency for readers. It can also be altered by mountcheckdirs, although not in a way which disrupts any of these functions. For now I do agree that both the assertion and XLOCK look weird as it is and as such deserve a comment. Actually we can take the lock only if we are going to modify something. How about the following then (untested): diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c index 25c3a1e..31f4881 100644 --- a/sys/kern/kern_descrip.c +++ b/sys/kern/kern_descrip.c @@ -2082,13 +2082,18 @@ setugidsafety(struct thread *td) int i; fdp = td->td_proc->p_fd; + /* +* While no other thread can alter filedescriptors in this table, +* there may be code trying to read it, thus the lock is required +* to provide consistent view if we are going to change it. +*/ KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); - FILEDESC_XLOCK(fdp); for (i = 0; i <= fdp->fd_lastfile; i++) { if (i > 2) break; fp = fdp->fd_ofiles[i].fde_file; if (fp != NULL && is_unsafe(fp)) { + FILEDESC_XLOCK(fdp); knote_fdclose(td, i); /* * NULL-out descriptor prior to close to avoid @@ -2097,7 +2102,6 @@ setugidsafety(struct thread *td) fdfree(fdp, i); FILEDESC_XUNLOCK(fdp); (void) closef(fp, td); - FILEDESC_XLOCK(fdp); } } FILEDESC_XUNLOCK(fdp); @@ -2136,16 +2140,16 @@ fdcloseexec(struct thread *td) fdp = td->td_proc->p_fd; KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); - FILEDESC_XLOCK(fdp); for (i = 0; i <= fdp->fd_lastfile; i++) { fde = &fdp->fd_ofiles[i]; fp = fde->fde_file; if (fp != NULL && (fp->f_type == DTYPE_MQUEUE || (fde->fde_flags & UF_EXCLOSE))) { + /* See the comment in setugidsafety */ + FILEDESC_XLOCK(fdp); fdfree(fdp, i); (void) closefp(fdp, i, fp, td, 0); /* closefp() drops the FILEDESC lock. */ - FILEDESC_XLOCK(fdp); } } FILEDESC_XUNLOCK(fdp); -- Mateusz Guzik ___ svn-src-all@freebsd.org mailing list http://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"
Re: svn commit: r267760 - head/sys/kern
On Mon, Jun 23, 2014 at 01:28:18AM +, Mateusz Guzik wrote: > + KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); > FILEDESC_XLOCK(fdp); This is at least weird. Not incorrect, but the code now looks strange. The fd_refcnt == 1 assert just states the circumstances of the code which currently calls the functions. Would the functions become incorrect or destructive if there are other references to the filedescriptor table ? In case you argument is that refcnt == 1 must hold to prevent the parallel modifications of the descriptor table, which would invalidate the checks and actions of the functions, then XLOCK is not needed (similar to your earlier commit). Note that kern_execve() is executed with the process single-threaded, which, together with statement fd_refcnt == 1 must prevent the parallel modifications. pgpEHUZWui4zg.pgp Description: PGP signature
svn commit: r267760 - head/sys/kern
Author: mjg Date: Mon Jun 23 01:28:18 2014 New Revision: 267760 URL: http://svnweb.freebsd.org/changeset/base/267760 Log: Tidy up fd-related functions called by do_execve o assert in each one that fdp is not shared o remove unnecessary NULL checks - all userspace processes have fdtables and kernel processes cannot execve o remove comments about the danger of fd_ofiles getting reallocated - fdtable is not shared and fd_ofiles could be only reallocated if new fd was about to be added, but if that was possible the code would already be buggy as setugidsafety work could be undone MFC after:1 week Modified: head/sys/kern/kern_descrip.c Modified: head/sys/kern/kern_descrip.c == --- head/sys/kern/kern_descrip.cMon Jun 23 01:10:56 2014 (r267759) +++ head/sys/kern/kern_descrip.cMon Jun 23 01:28:18 2014 (r267760) @@ -2081,15 +2081,8 @@ setugidsafety(struct thread *td) struct file *fp; int i; - /* Certain daemons might not have file descriptors. */ fdp = td->td_proc->p_fd; - if (fdp == NULL) - return; - - /* -* Note: fdp->fd_ofiles may be reallocated out from under us while -* we are blocked in a close. Be careful! -*/ + KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); FILEDESC_XLOCK(fdp); for (i = 0; i <= fdp->fd_lastfile; i++) { if (i > 2) @@ -2141,15 +2134,8 @@ fdcloseexec(struct thread *td) struct file *fp; int i; - /* Certain daemons might not have file descriptors. */ fdp = td->td_proc->p_fd; - if (fdp == NULL) - return; - - /* -* We cannot cache fd_ofiles since operations -* may block and rip them out from under us. -*/ + KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); FILEDESC_XLOCK(fdp); for (i = 0; i <= fdp->fd_lastfile; i++) { fde = &fdp->fd_ofiles[i]; @@ -2180,8 +2166,6 @@ fdcheckstd(struct thread *td) int i, error, devnull; fdp = td->td_proc->p_fd; - if (fdp == NULL) - return (0); KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared")); devnull = -1; error = 0; ___ svn-src-all@freebsd.org mailing list http://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"