socket(2) layer is already protected by solock(). It grabs NET_LOCK() for inet{,6}(4) sockets, but all other sockets are still under KERNEL_LOCK().
I guess solock is already placed everythere it's required. Also `struct file' is already mp-safe. Diff below introduces rwlock `unp_lock'. It's like NET_LOCK()'s `netlock' but for PF_UNIX sockets. This lock protects the whole PF_UNIX layer. Vnode layer accessed in "connect" "bind" and "desconnect" cases is still under KERNEL_LOCK(). I'am experimenting with his diff since 19.04.2020 and my test systems, include Gnome workstaion are stable, without any issue. Index: sys/kern/uipc_socket2.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_socket2.c,v retrieving revision 1.104 diff -u -p -r1.104 uipc_socket2.c --- sys/kern/uipc_socket2.c 11 Apr 2020 14:07:06 -0000 1.104 +++ sys/kern/uipc_socket2.c 1 May 2020 13:47:11 -0000 @@ -53,6 +53,8 @@ u_long sb_max = SB_MAX; /* patchable */ extern struct pool mclpools[]; extern struct pool mbpool; +extern struct rwlock unp_lock; + /* * Procedures to manipulate state flags of socket * and do appropriate wakeups. Normal sequence from the @@ -282,6 +284,8 @@ solock(struct socket *so) NET_LOCK(); break; case PF_UNIX: + rw_enter_write(&unp_lock); + break; case PF_ROUTE: case PF_KEY: default: @@ -306,6 +310,8 @@ sounlock(struct socket *so, int s) NET_UNLOCK(); break; case PF_UNIX: + rw_exit_write(&unp_lock); + break; case PF_ROUTE: case PF_KEY: default: @@ -323,6 +329,8 @@ soassertlocked(struct socket *so) NET_ASSERT_LOCKED(); break; case PF_UNIX: + rw_assert_wrlock(&unp_lock); + break; case PF_ROUTE: case PF_KEY: default: @@ -335,12 +343,24 @@ int sosleep_nsec(struct socket *so, void *ident, int prio, const char *wmesg, uint64_t nsecs) { - if ((so->so_proto->pr_domain->dom_family != PF_UNIX) && - (so->so_proto->pr_domain->dom_family != PF_ROUTE) && - (so->so_proto->pr_domain->dom_family != PF_KEY)) { - return rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs); - } else - return tsleep_nsec(ident, prio, wmesg, nsecs); + int ret; + + switch (so->so_proto->pr_domain->dom_family) { + case PF_INET: + case PF_INET6: + ret = rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs); + break; + case PF_UNIX: + ret = rwsleep_nsec(ident, &unp_lock, prio, wmesg, nsecs); + break; + case PF_ROUTE: + case PF_KEY: + default: + ret = tsleep_nsec(ident, prio, wmesg, nsecs); + break; + } + + return ret; } /* Index: sys/kern/uipc_usrreq.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v retrieving revision 1.142 diff -u -p -r1.142 uipc_usrreq.c --- sys/kern/uipc_usrreq.c 16 Jul 2019 21:41:37 -0000 1.142 +++ sys/kern/uipc_usrreq.c 1 May 2020 13:47:11 -0000 @@ -51,10 +51,17 @@ #include <sys/task.h> #include <sys/pledge.h> #include <sys/pool.h> +#include <sys/rwlock.h> +/* + * Locks used to protect global data and struct members: + * I immutable after creation + * U unp_lock + */ +struct rwlock unp_lock = RWLOCK_INITIALIZER("unplock"); void uipc_setaddr(const struct unpcb *, struct mbuf *); -/* list of all UNIX domain sockets, for unp_gc() */ +/* [U] list of all UNIX domain sockets, for unp_gc() */ LIST_HEAD(unp_head, unpcb) unp_head = LIST_HEAD_INITIALIZER(unp_head); /* @@ -62,10 +69,10 @@ LIST_HEAD(unp_head, unpcb) unp_head = LI * not received and need to be closed. */ struct unp_deferral { - SLIST_ENTRY(unp_deferral) ud_link; - int ud_n; + SLIST_ENTRY(unp_deferral) ud_link; /* [U] */ + int ud_n; /* [I] */ /* followed by ud_n struct fdpass */ - struct fdpass ud_fp[]; + struct fdpass ud_fp[]; /* [I] */ }; void unp_discard(struct fdpass *, int); @@ -74,7 +81,7 @@ void unp_scan(struct mbuf *, void (*)(st int unp_nam2sun(struct mbuf *, struct sockaddr_un **, size_t *); struct pool unpcb_pool; -/* list of sets of files that were sent over sockets that are now closed */ +/* [U] list of sets of files that were sent over sockets that are now closed */ SLIST_HEAD(,unp_deferral) unp_deferred = SLIST_HEAD_INITIALIZER(unp_deferred); struct task unp_gc_task = TASK_INITIALIZER(unp_gc, NULL); @@ -89,7 +96,7 @@ struct task unp_gc_task = TASK_INITIALIZ * need a proper out-of-band */ struct sockaddr sun_noname = { sizeof(sun_noname), AF_UNIX }; -ino_t unp_ino; /* prototype for fake inode numbers */ +ino_t unp_ino; /* [U] prototype for fake inode numbers */ void unp_init(void) @@ -351,7 +358,7 @@ u_long unpst_recvspace = PIPSIZ; u_long unpdg_sendspace = 2*1024; /* really max datagram size */ u_long unpdg_recvspace = 4*1024; -int unp_rights; /* file descriptors in flight */ +int unp_rights; /* [U] file descriptors in flight */ int uipc_attach(struct socket *so, int proto) @@ -359,6 +366,8 @@ uipc_attach(struct socket *so, int proto struct unpcb *unp; int error; + rw_assert_wrlock(&unp_lock); + if (so->so_pcb) return EISCONN; if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { @@ -409,12 +418,17 @@ unp_detach(struct unpcb *unp) { struct vnode *vp; + rw_assert_wrlock(&unp_lock); + LIST_REMOVE(unp, unp_link); if (unp->unp_vnode) { + /* this is an unlocked cleaning of `v_socket', but it's safe */ unp->unp_vnode->v_socket = NULL; vp = unp->unp_vnode; unp->unp_vnode = NULL; + KERNEL_LOCK(); vrele(vp); + KERNEL_UNLOCK(); } if (unp->unp_conn) unp_disconnect(unp); @@ -458,10 +472,11 @@ unp_bind(struct unpcb *unp, struct mbuf NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT, UIO_SYSSPACE, soun->sun_path, p); nd.ni_pledge = PLEDGE_UNIX; + KERNEL_LOCK(); /* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */ if ((error = namei(&nd)) != 0) { m_freem(nam2); - return (error); + goto unlock; } vp = nd.ni_vp; if (vp != NULL) { @@ -472,7 +487,8 @@ unp_bind(struct unpcb *unp, struct mbuf vput(nd.ni_dvp); vrele(vp); m_freem(nam2); - return (EADDRINUSE); + error = EADDRINUSE; + goto unlock; } VATTR_NULL(&vattr); vattr.va_type = VSOCK; @@ -481,7 +497,7 @@ unp_bind(struct unpcb *unp, struct mbuf vput(nd.ni_dvp); if (error) { m_freem(nam2); - return (error); + goto unlock; } unp->unp_addr = nam2; vp = nd.ni_vp; @@ -492,7 +508,10 @@ unp_bind(struct unpcb *unp, struct mbuf unp->unp_connid.pid = p->p_p->ps_pid; unp->unp_flags |= UNP_FEIDSBIND; VOP_UNLOCK(vp); - return (0); +unlock: + KERNEL_UNLOCK(); + + return error; } int @@ -510,8 +529,9 @@ unp_connect(struct socket *so, struct mb NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, soun->sun_path, p); nd.ni_pledge = PLEDGE_UNIX; + KERNEL_LOCK(); if ((error = namei(&nd)) != 0) - return (error); + goto unlock; vp = nd.ni_vp; if (vp->v_type != VSOCK) { error = ENOTSOCK; @@ -553,6 +573,9 @@ unp_connect(struct socket *so, struct mb error = unp_connect2(so, so2); bad: vput(vp); +unlock: + KERNEL_UNLOCK(); + return (error); } @@ -562,6 +585,8 @@ unp_connect2(struct socket *so, struct s struct unpcb *unp = sotounpcb(so); struct unpcb *unp2; + rw_assert_wrlock(&unp_lock); + if (so2->so_type != so->so_type) return (EPROTOTYPE); unp2 = sotounpcb(so2); @@ -635,15 +660,16 @@ unp_drop(struct unpcb *unp, int errno) { struct socket *so = unp->unp_socket; - KERNEL_ASSERT_LOCKED(); + rw_assert_wrlock(&unp_lock); so->so_error = errno; unp_disconnect(unp); if (so->so_head) { so->so_pcb = NULL; /* - * As long as the KERNEL_LOCK() is the default lock for Unix - * sockets, do not release it. + * As long as `unp_lock' is taken before entering + * uipc_usrreq() releasing it here would lead to a + * double unlock. */ sofree(so, SL_NOUNLOCK); m_freem(unp->unp_addr); @@ -685,6 +711,8 @@ unp_externalize(struct mbuf *rights, soc struct file *fp; int nfds, error = 0; + rw_assert_wrlock(&unp_lock); + /* * This code only works because SCM_RIGHTS is the only supported * control message type on unix sockets. Enforce this here. @@ -705,6 +733,10 @@ unp_externalize(struct mbuf *rights, soc /* Make sure the recipient should be able to see the descriptors.. */ rp = (struct fdpass *)CMSG_DATA(cm); + + /* fdp->fd_rdir requires KERNEL_LOCK() */ + KERNEL_LOCK(); + for (i = 0; i < nfds; i++) { fp = rp->fp; rp++; @@ -728,6 +760,8 @@ unp_externalize(struct mbuf *rights, soc } } + KERNEL_UNLOCK(); + fds = mallocarray(nfds, sizeof(int), M_TEMP, M_WAITOK); restart: @@ -825,6 +859,8 @@ unp_internalize(struct mbuf *control, st int i, error; int nfds, *ip, fd, neededspace; + rw_assert_wrlock(&unp_lock); + /* * Check for two potential msg_controllen values because * IETF stuck their nose in a place it does not belong. @@ -923,7 +959,8 @@ fail: return (error); } -int unp_defer, unp_gcing; +int unp_defer; /* [U] number of deferred fp to close by the GC task */ +int unp_gcing; /* [U] GC task currently running */ void unp_gc(void *arg __unused) @@ -934,8 +971,10 @@ unp_gc(void *arg __unused) struct unpcb *unp; int nunref, i; + rw_enter_write(&unp_lock); + if (unp_gcing) - return; + goto unlock; unp_gcing = 1; /* close any fds on the deferred list */ @@ -950,7 +989,9 @@ unp_gc(void *arg __unused) if ((unp = fptounp(fp)) != NULL) unp->unp_msgcount--; unp_rights--; + rw_exit_write(&unp_lock); (void) closef(fp, NULL); + rw_enter_write(&unp_lock); } free(defer, M_TEMP, sizeof(*defer) + sizeof(struct fdpass) * defer->ud_n); @@ -1021,6 +1062,8 @@ unp_gc(void *arg __unused) } } unp_gcing = 0; +unlock: + rw_exit_write(&unp_lock); } void @@ -1066,6 +1109,8 @@ unp_mark(struct fdpass *rp, int nfds) struct unpcb *unp; int i; + rw_assert_wrlock(&unp_lock); + for (i = 0; i < nfds; i++) { if (rp[i].fp == NULL) continue; @@ -1087,6 +1132,8 @@ void unp_discard(struct fdpass *rp, int nfds) { struct unp_deferral *defer; + + rw_assert_wrlock(&unp_lock); /* copy the file pointers to a deferral structure */ defer = malloc(sizeof(*defer) + sizeof(*rp) * nfds, M_TEMP, M_WAITOK); Index: sys/sys/unpcb.h =================================================================== RCS file: /cvs/src/sys/sys/unpcb.h,v retrieving revision 1.17 diff -u -p -r1.17 unpcb.h --- sys/sys/unpcb.h 15 Jul 2019 12:28:06 -0000 1.17 +++ sys/sys/unpcb.h 1 May 2020 13:47:12 -0000 @@ -56,22 +56,27 @@ * Stream sockets keep copies of receive sockbuf sb_cc and sb_mbcnt * so that changes in the sockbuf may be computed to modify * back pressure on the sender accordingly. + * + * Locks used to protect struct members: + * I immutable after creation + * U unp_lock */ + struct unpcb { - struct socket *unp_socket; /* pointer back to socket */ - struct vnode *unp_vnode; /* if associated with file */ - struct file *unp_file; /* backpointer for unp_gc() */ - struct unpcb *unp_conn; /* control block of connected socket */ - ino_t unp_ino; /* fake inode number */ - SLIST_HEAD(,unpcb) unp_refs; /* referencing socket linked list */ - SLIST_ENTRY(unpcb) unp_nextref; /* link in unp_refs list */ - struct mbuf *unp_addr; /* bound address of socket */ - long unp_msgcount; /* references from socket rcv buf */ - int unp_flags; /* this unpcb contains peer eids */ - struct sockpeercred unp_connid;/* id of peer process */ - struct timespec unp_ctime; /* holds creation time */ - LIST_ENTRY(unpcb) unp_link; /* link in per-AF list of sockets */ + struct socket *unp_socket; /* [I] pointer back to socket */ + struct vnode *unp_vnode; /* [U] if associated with file */ + struct file *unp_file; /* [U] backpointer for unp_gc() */ + struct unpcb *unp_conn; /* [U] control block of connected socket */ + ino_t unp_ino; /* [U] fake inode number */ + SLIST_HEAD(,unpcb) unp_refs; /* [U] referencing socket linked list */ + SLIST_ENTRY(unpcb) unp_nextref; /* [U] link in unp_refs list */ + struct mbuf *unp_addr; /* [U] bound address of socket */ + long unp_msgcount; /* [U] references from socket rcv buf */ + int unp_flags; /* [U] this unpcb contains peer eids */ + struct sockpeercred unp_connid;/* [U] id of peer process */ + struct timespec unp_ctime; /* [I] holds creation time */ + LIST_ENTRY(unpcb) unp_link; /* [U] link in per-AF list of sockets */ }; /*