Diff below is _not_ for commit. It's just an experiment I wish to share. Socket layer is already protected by solock(). Inet sockets are protected by NET_LOCK() which is grabbed by solock(). All other sockets are still under KERNEL_LOCK().
My suggestion is to implement another lock, identical to NET_LOCK() and grab it within solock() for PF_UNIX sockets. I suggest solock() is already placed everythere it's required :) struct file is already mp-safe. Diff below implements UNIXDOMAIN_LOCK(). It's just rwlock "unixdomainlock" like NET_LOCK()'s "netlock". solock(), sounlock() and soassertlocked() deal with UNIXDOMAIN_LOCK() in PF_UNIX case. sosleep_nsec() uses unixdomainlock directly in PF_UNIX case. Garbage collector unp_gc() placed to systqmp instead of systq. unp_gc() grabs UNIXDOMAIN_LOCK(). Vnode and process credentials are still protected by KERNEL_LOCK(). These are filesystem sockets "connect", "bind" and "disconnect" cases. pledge(2) checks are protected by KERNEL_LOCK() too. These are "sendfd" and "recvfd" cases. All other socket operations protected by solock() and underelaying UNIXDOMAIN_LOCK(). I wrote and run simultaneously a lot of test programs doing various socket related things. System looks stable. May be after 6.7 release PF_UNIX sockets can be direction to go. Index: sys/kern/uipc_socket2.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_socket2.c,v retrieving revision 1.104 diff -u -p -r1.104 uipc_socket2.c --- sys/kern/uipc_socket2.c 11 Apr 2020 14:07:06 -0000 1.104 +++ sys/kern/uipc_socket2.c 19 Apr 2020 11:07:10 -0000 @@ -282,6 +282,8 @@ solock(struct socket *so) NET_LOCK(); break; case PF_UNIX: + UNIXDOMAIN_LOCK(); + break; case PF_ROUTE: case PF_KEY: default: @@ -306,6 +308,8 @@ sounlock(struct socket *so, int s) NET_UNLOCK(); break; case PF_UNIX: + UNIXDOMAIN_UNLOCK(); + break; case PF_ROUTE: case PF_KEY: default: @@ -323,6 +327,8 @@ soassertlocked(struct socket *so) NET_ASSERT_LOCKED(); break; case PF_UNIX: + UNIXDOMAIN_ASSERT_LOCKED(); + break; case PF_ROUTE: case PF_KEY: default: @@ -335,12 +341,24 @@ int sosleep_nsec(struct socket *so, void *ident, int prio, const char *wmesg, uint64_t nsecs) { - if ((so->so_proto->pr_domain->dom_family != PF_UNIX) && - (so->so_proto->pr_domain->dom_family != PF_ROUTE) && - (so->so_proto->pr_domain->dom_family != PF_KEY)) { - return rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs); - } else - return tsleep_nsec(ident, prio, wmesg, nsecs); + int ret; + + switch (so->so_proto->pr_domain->dom_family) { + case PF_INET: + case PF_INET6: + ret = rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs); + break; + case PF_UNIX: + ret = rwsleep_nsec(ident, &unixdomainlock, prio, wmesg, nsecs); + break; + case PF_ROUTE: + case PF_KEY: + default: + ret = tsleep_nsec(ident, prio, wmesg, nsecs); + break; + } + + return ret; } /* Index: sys/kern/uipc_usrreq.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v retrieving revision 1.142 diff -u -p -r1.142 uipc_usrreq.c --- sys/kern/uipc_usrreq.c 16 Jul 2019 21:41:37 -0000 1.142 +++ sys/kern/uipc_usrreq.c 19 Apr 2020 11:07:10 -0000 @@ -51,6 +51,9 @@ #include <sys/task.h> #include <sys/pledge.h> #include <sys/pool.h> +#include <sys/rwlock.h> + +struct rwlock unixdomainlock = RWLOCK_INITIALIZER("unixdomainlock"); void uipc_setaddr(const struct unpcb *, struct mbuf *); @@ -152,13 +155,17 @@ uipc_usrreq(struct socket *so, int req, case PRU_CONNECT2: error = unp_connect2(so, (struct socket *)nam); if (!error) { + KERNEL_LOCK(); unp->unp_connid.uid = p->p_ucred->cr_uid; unp->unp_connid.gid = p->p_ucred->cr_gid; + KERNEL_UNLOCK(); unp->unp_connid.pid = p->p_p->ps_pid; unp->unp_flags |= UNP_FEIDS; unp2 = sotounpcb((struct socket *)nam); + KERNEL_LOCK(); unp2->unp_connid.uid = p->p_ucred->cr_uid; unp2->unp_connid.gid = p->p_ucred->cr_gid; + KERNEL_UNLOCK(); unp2->unp_connid.pid = p->p_p->ps_pid; unp2->unp_flags |= UNP_FEIDS; } @@ -411,10 +418,12 @@ unp_detach(struct unpcb *unp) LIST_REMOVE(unp, unp_link); if (unp->unp_vnode) { + KERNEL_LOCK(); unp->unp_vnode->v_socket = NULL; vp = unp->unp_vnode; unp->unp_vnode = NULL; vrele(vp); + KERNEL_UNLOCK(); } if (unp->unp_conn) unp_disconnect(unp); @@ -425,7 +434,7 @@ unp_detach(struct unpcb *unp) m_freem(unp->unp_addr); pool_put(&unpcb_pool, unp); if (unp_rights) - task_add(systq, &unp_gc_task); + task_add(systqmp, &unp_gc_task); } int @@ -455,11 +464,13 @@ unp_bind(struct unpcb *unp, struct mbuf /* Fixup sun_len to keep it in sync with m_len. */ soun->sun_len = nam2->m_len; + KERNEL_LOCK(); NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT, UIO_SYSSPACE, soun->sun_path, p); nd.ni_pledge = PLEDGE_UNIX; /* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */ if ((error = namei(&nd)) != 0) { + KERNEL_UNLOCK(); m_freem(nam2); return (error); } @@ -471,6 +482,7 @@ unp_bind(struct unpcb *unp, struct mbuf else vput(nd.ni_dvp); vrele(vp); + KERNEL_UNLOCK(); m_freem(nam2); return (EADDRINUSE); } @@ -480,6 +492,7 @@ unp_bind(struct unpcb *unp, struct mbuf error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); vput(nd.ni_dvp); if (error) { + KERNEL_UNLOCK(); m_freem(nam2); return (error); } @@ -492,6 +505,8 @@ unp_bind(struct unpcb *unp, struct mbuf unp->unp_connid.pid = p->p_p->ps_pid; unp->unp_flags |= UNP_FEIDSBIND; VOP_UNLOCK(vp); + KERNEL_UNLOCK(); + return (0); } @@ -508,18 +523,22 @@ unp_connect(struct socket *so, struct mb if ((error = unp_nam2sun(nam, &soun, NULL))) return (error); + KERNEL_LOCK(); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, soun->sun_path, p); nd.ni_pledge = PLEDGE_UNIX; - if ((error = namei(&nd)) != 0) + if ((error = namei(&nd)) != 0) { + KERNEL_UNLOCK(); return (error); + } vp = nd.ni_vp; if (vp->v_type != VSOCK) { error = ENOTSOCK; - goto bad; + goto bad_locked; } if ((error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p)) != 0) - goto bad; + goto bad_locked; so2 = vp->v_socket; + KERNEL_UNLOCK(); if (so2 == NULL) { error = ECONNREFUSED; goto bad; @@ -540,8 +559,10 @@ unp_connect(struct socket *so, struct mb if (unp2->unp_addr) unp3->unp_addr = m_copym(unp2->unp_addr, 0, M_COPYALL, M_NOWAIT); + KERNEL_LOCK(); unp3->unp_connid.uid = p->p_ucred->cr_uid; unp3->unp_connid.gid = p->p_ucred->cr_gid; + KERNEL_UNLOCK(); unp3->unp_connid.pid = p->p_p->ps_pid; unp3->unp_flags |= UNP_FEIDS; so2 = so3; @@ -552,7 +573,11 @@ unp_connect(struct socket *so, struct mb } error = unp_connect2(so, so2); bad: + KERNEL_LOCK(); +bad_locked: vput(vp); + KERNEL_UNLOCK(); + return (error); } @@ -635,17 +660,11 @@ unp_drop(struct unpcb *unp, int errno) { struct socket *so = unp->unp_socket; - KERNEL_ASSERT_LOCKED(); - so->so_error = errno; unp_disconnect(unp); if (so->so_head) { so->so_pcb = NULL; - /* - * As long as the KERNEL_LOCK() is the default lock for Unix - * sockets, do not release it. - */ - sofree(so, SL_NOUNLOCK); + sofree(so, SL_LOCKED); m_freem(unp->unp_addr); pool_put(&unpcb_pool, unp); } @@ -708,7 +727,9 @@ unp_externalize(struct mbuf *rights, soc for (i = 0; i < nfds; i++) { fp = rp->fp; rp++; + KERNEL_LOCK(); error = pledge_recvfd(p, fp); + KERNEL_UNLOCK(); if (error) break; @@ -717,14 +738,19 @@ unp_externalize(struct mbuf *rights, soc * make sure that it is underneath the root. */ if (fdp->fd_rdir != NULL && fp->f_type == DTYPE_VNODE) { - struct vnode *vp = (struct vnode *)fp->f_data; + struct vnode *vp; + KERNEL_LOCK(); + vp = (struct vnode *)fp->f_data; if (vp->v_type == VBLK || (vp->v_type == VDIR && !vn_isunder(vp, fdp->fd_rdir, p))) { + KERNEL_UNLOCK(); error = EPERM; break; } + + KERNEL_UNLOCK(); } } @@ -886,7 +912,9 @@ morespace: error = EDEADLK; goto fail; } + KERNEL_LOCK(); error = pledge_sendfd(p, fp); + KERNEL_UNLOCK(); if (error) goto fail; @@ -934,8 +962,12 @@ unp_gc(void *arg __unused) struct unpcb *unp; int nunref, i; - if (unp_gcing) + UNIXDOMAIN_LOCK(); + + if (unp_gcing) { + UNIXDOMAIN_UNLOCK(); return; + } unp_gcing = 1; /* close any fds on the deferred list */ @@ -950,7 +982,9 @@ unp_gc(void *arg __unused) if ((unp = fptounp(fp)) != NULL) unp->unp_msgcount--; unp_rights--; + UNIXDOMAIN_UNLOCK(); (void) closef(fp, NULL); + UNIXDOMAIN_LOCK(); } free(defer, M_TEMP, sizeof(*defer) + sizeof(struct fdpass) * defer->ud_n); @@ -1021,6 +1055,8 @@ unp_gc(void *arg __unused) } } unp_gcing = 0; + + UNIXDOMAIN_UNLOCK(); } void @@ -1095,7 +1131,7 @@ unp_discard(struct fdpass *rp, int nfds) memset(rp, 0, sizeof(*rp) * nfds); SLIST_INSERT_HEAD(&unp_deferred, defer, ud_link); - task_add(systq, &unp_gc_task); + task_add(systqmp, &unp_gc_task); } int Index: sys/sys/systm.h =================================================================== RCS file: /cvs/src/sys/sys/systm.h,v retrieving revision 1.145 diff -u -p -r1.145 systm.h --- sys/sys/systm.h 20 Mar 2020 03:37:08 -0000 1.145 +++ sys/sys/systm.h 19 Apr 2020 11:07:10 -0000 @@ -347,6 +347,18 @@ do { \ splassert_fail(RW_READ, _s, __func__); \ } while (0) +extern struct rwlock unixdomainlock; + +#define UNIXDOMAIN_LOCK() do { rw_enter_write(&unixdomainlock); } while (0) +#define UNIXDOMAIN_UNLOCK() do { rw_exit_write(&unixdomainlock); } while (0) + +#define UNIXDOMAIN_ASSERT_LOCKED() \ +do { \ + int __s = rw_status(&unixdomainlock); \ + if((splassert_ctl > 0) && (__s != RW_WRITE)) \ + splassert_fail(RW_WRITE, __s, __func__); \ +} while (0) + __returns_twice int setjmp(label_t *); __dead void longjmp(label_t *); #endif