socket(2) layer is already protected by solock(). It grabs NET_LOCK()
for inet{,6}(4) sockets, but all other sockets are still under
KERNEL_LOCK().
I guess solock is already placed everythere it's required. Also `struct
file' is already mp-safe.
Diff below introduces rwlock `unp_lock'. It's like NET_LOCK()'s
`netlock' but for PF_UNIX sockets. This lock protects the whole PF_UNIX
layer. Vnode layer accessed in "connect" "bind" and "desconnect" cases
is still under KERNEL_LOCK().
I'am experimenting with his diff since 19.04.2020 and my test systems,
include Gnome workstaion are stable, without any issue.
Index: sys/kern/uipc_socket2.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket2.c,v
retrieving revision 1.104
diff -u -p -r1.104 uipc_socket2.c
--- sys/kern/uipc_socket2.c 11 Apr 2020 14:07:06 -0000 1.104
+++ sys/kern/uipc_socket2.c 1 May 2020 13:47:11 -0000
@@ -53,6 +53,8 @@ u_long sb_max = SB_MAX; /* patchable */
extern struct pool mclpools[];
extern struct pool mbpool;
+extern struct rwlock unp_lock;
+
/*
* Procedures to manipulate state flags of socket
* and do appropriate wakeups. Normal sequence from the
@@ -282,6 +284,8 @@ solock(struct socket *so)
NET_LOCK();
break;
case PF_UNIX:
+ rw_enter_write(&unp_lock);
+ break;
case PF_ROUTE:
case PF_KEY:
default:
@@ -306,6 +310,8 @@ sounlock(struct socket *so, int s)
NET_UNLOCK();
break;
case PF_UNIX:
+ rw_exit_write(&unp_lock);
+ break;
case PF_ROUTE:
case PF_KEY:
default:
@@ -323,6 +329,8 @@ soassertlocked(struct socket *so)
NET_ASSERT_LOCKED();
break;
case PF_UNIX:
+ rw_assert_wrlock(&unp_lock);
+ break;
case PF_ROUTE:
case PF_KEY:
default:
@@ -335,12 +343,24 @@ int
sosleep_nsec(struct socket *so, void *ident, int prio, const char *wmesg,
uint64_t nsecs)
{
- if ((so->so_proto->pr_domain->dom_family != PF_UNIX) &&
- (so->so_proto->pr_domain->dom_family != PF_ROUTE) &&
- (so->so_proto->pr_domain->dom_family != PF_KEY)) {
- return rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs);
- } else
- return tsleep_nsec(ident, prio, wmesg, nsecs);
+ int ret;
+
+ switch (so->so_proto->pr_domain->dom_family) {
+ case PF_INET:
+ case PF_INET6:
+ ret = rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs);
+ break;
+ case PF_UNIX:
+ ret = rwsleep_nsec(ident, &unp_lock, prio, wmesg, nsecs);
+ break;
+ case PF_ROUTE:
+ case PF_KEY:
+ default:
+ ret = tsleep_nsec(ident, prio, wmesg, nsecs);
+ break;
+ }
+
+ return ret;
}
/*
Index: sys/kern/uipc_usrreq.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v
retrieving revision 1.142
diff -u -p -r1.142 uipc_usrreq.c
--- sys/kern/uipc_usrreq.c 16 Jul 2019 21:41:37 -0000 1.142
+++ sys/kern/uipc_usrreq.c 1 May 2020 13:47:11 -0000
@@ -51,10 +51,17 @@
#include <sys/task.h>
#include <sys/pledge.h>
#include <sys/pool.h>
+#include <sys/rwlock.h>
+/*
+ * Locks used to protect global data and struct members:
+ * I immutable after creation
+ * U unp_lock
+ */
+struct rwlock unp_lock = RWLOCK_INITIALIZER("unplock");
void uipc_setaddr(const struct unpcb *, struct mbuf *);
-/* list of all UNIX domain sockets, for unp_gc() */
+/* [U] list of all UNIX domain sockets, for unp_gc() */
LIST_HEAD(unp_head, unpcb) unp_head = LIST_HEAD_INITIALIZER(unp_head);
/*
@@ -62,10 +69,10 @@ LIST_HEAD(unp_head, unpcb) unp_head = LI
* not received and need to be closed.
*/
struct unp_deferral {
- SLIST_ENTRY(unp_deferral) ud_link;
- int ud_n;
+ SLIST_ENTRY(unp_deferral) ud_link; /* [U] */
+ int ud_n; /* [I] */
/* followed by ud_n struct fdpass */
- struct fdpass ud_fp[];
+ struct fdpass ud_fp[]; /* [I] */
};
void unp_discard(struct fdpass *, int);
@@ -74,7 +81,7 @@ void unp_scan(struct mbuf *, void (*)(st
int unp_nam2sun(struct mbuf *, struct sockaddr_un **, size_t *);
struct pool unpcb_pool;
-/* list of sets of files that were sent over sockets that are now closed */
+/* [U] list of sets of files that were sent over sockets that are now closed */
SLIST_HEAD(,unp_deferral) unp_deferred = SLIST_HEAD_INITIALIZER(unp_deferred);
struct task unp_gc_task = TASK_INITIALIZER(unp_gc, NULL);
@@ -89,7 +96,7 @@ struct task unp_gc_task = TASK_INITIALIZ
* need a proper out-of-band
*/
struct sockaddr sun_noname = { sizeof(sun_noname), AF_UNIX };
-ino_t unp_ino; /* prototype for fake inode numbers */
+ino_t unp_ino; /* [U] prototype for fake inode numbers */
void
unp_init(void)
@@ -351,7 +358,7 @@ u_long unpst_recvspace = PIPSIZ;
u_long unpdg_sendspace = 2*1024; /* really max datagram size */
u_long unpdg_recvspace = 4*1024;
-int unp_rights; /* file descriptors in flight */
+int unp_rights; /* [U] file descriptors in flight */
int
uipc_attach(struct socket *so, int proto)
@@ -359,6 +366,8 @@ uipc_attach(struct socket *so, int proto
struct unpcb *unp;
int error;
+ rw_assert_wrlock(&unp_lock);
+
if (so->so_pcb)
return EISCONN;
if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
@@ -409,12 +418,17 @@ unp_detach(struct unpcb *unp)
{
struct vnode *vp;
+ rw_assert_wrlock(&unp_lock);
+
LIST_REMOVE(unp, unp_link);
if (unp->unp_vnode) {
+ /* this is an unlocked cleaning of `v_socket', but it's safe */
unp->unp_vnode->v_socket = NULL;
vp = unp->unp_vnode;
unp->unp_vnode = NULL;
+ KERNEL_LOCK();
vrele(vp);
+ KERNEL_UNLOCK();
}
if (unp->unp_conn)
unp_disconnect(unp);
@@ -458,10 +472,11 @@ unp_bind(struct unpcb *unp, struct mbuf
NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT, UIO_SYSSPACE,
soun->sun_path, p);
nd.ni_pledge = PLEDGE_UNIX;
+ KERNEL_LOCK();
/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
if ((error = namei(&nd)) != 0) {
m_freem(nam2);
- return (error);
+ goto unlock;
}
vp = nd.ni_vp;
if (vp != NULL) {
@@ -472,7 +487,8 @@ unp_bind(struct unpcb *unp, struct mbuf
vput(nd.ni_dvp);
vrele(vp);
m_freem(nam2);
- return (EADDRINUSE);
+ error = EADDRINUSE;
+ goto unlock;
}
VATTR_NULL(&vattr);
vattr.va_type = VSOCK;
@@ -481,7 +497,7 @@ unp_bind(struct unpcb *unp, struct mbuf
vput(nd.ni_dvp);
if (error) {
m_freem(nam2);
- return (error);
+ goto unlock;
}
unp->unp_addr = nam2;
vp = nd.ni_vp;
@@ -492,7 +508,10 @@ unp_bind(struct unpcb *unp, struct mbuf
unp->unp_connid.pid = p->p_p->ps_pid;
unp->unp_flags |= UNP_FEIDSBIND;
VOP_UNLOCK(vp);
- return (0);
+unlock:
+ KERNEL_UNLOCK();
+
+ return error;
}
int
@@ -510,8 +529,9 @@ unp_connect(struct socket *so, struct mb
NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, soun->sun_path, p);
nd.ni_pledge = PLEDGE_UNIX;
+ KERNEL_LOCK();
if ((error = namei(&nd)) != 0)
- return (error);
+ goto unlock;
vp = nd.ni_vp;
if (vp->v_type != VSOCK) {
error = ENOTSOCK;
@@ -553,6 +573,9 @@ unp_connect(struct socket *so, struct mb
error = unp_connect2(so, so2);
bad:
vput(vp);
+unlock:
+ KERNEL_UNLOCK();
+
return (error);
}
@@ -562,6 +585,8 @@ unp_connect2(struct socket *so, struct s
struct unpcb *unp = sotounpcb(so);
struct unpcb *unp2;
+ rw_assert_wrlock(&unp_lock);
+
if (so2->so_type != so->so_type)
return (EPROTOTYPE);
unp2 = sotounpcb(so2);
@@ -635,15 +660,16 @@ unp_drop(struct unpcb *unp, int errno)
{
struct socket *so = unp->unp_socket;
- KERNEL_ASSERT_LOCKED();
+ rw_assert_wrlock(&unp_lock);
so->so_error = errno;
unp_disconnect(unp);
if (so->so_head) {
so->so_pcb = NULL;
/*
- * As long as the KERNEL_LOCK() is the default lock for Unix
- * sockets, do not release it.
+ * As long as `unp_lock' is taken before entering
+ * uipc_usrreq() releasing it here would lead to a
+ * double unlock.
*/
sofree(so, SL_NOUNLOCK);
m_freem(unp->unp_addr);
@@ -685,6 +711,8 @@ unp_externalize(struct mbuf *rights, soc
struct file *fp;
int nfds, error = 0;
+ rw_assert_wrlock(&unp_lock);
+
/*
* This code only works because SCM_RIGHTS is the only supported
* control message type on unix sockets. Enforce this here.
@@ -705,6 +733,10 @@ unp_externalize(struct mbuf *rights, soc
/* Make sure the recipient should be able to see the descriptors.. */
rp = (struct fdpass *)CMSG_DATA(cm);
+
+ /* fdp->fd_rdir requires KERNEL_LOCK() */
+ KERNEL_LOCK();
+
for (i = 0; i < nfds; i++) {
fp = rp->fp;
rp++;
@@ -728,6 +760,8 @@ unp_externalize(struct mbuf *rights, soc
}
}
+ KERNEL_UNLOCK();
+
fds = mallocarray(nfds, sizeof(int), M_TEMP, M_WAITOK);
restart:
@@ -825,6 +859,8 @@ unp_internalize(struct mbuf *control, st
int i, error;
int nfds, *ip, fd, neededspace;
+ rw_assert_wrlock(&unp_lock);
+
/*
* Check for two potential msg_controllen values because
* IETF stuck their nose in a place it does not belong.
@@ -923,7 +959,8 @@ fail:
return (error);
}
-int unp_defer, unp_gcing;
+int unp_defer; /* [U] number of deferred fp to close by the GC task */
+int unp_gcing; /* [U] GC task currently running */
void
unp_gc(void *arg __unused)
@@ -934,8 +971,10 @@ unp_gc(void *arg __unused)
struct unpcb *unp;
int nunref, i;
+ rw_enter_write(&unp_lock);
+
if (unp_gcing)
- return;
+ goto unlock;
unp_gcing = 1;
/* close any fds on the deferred list */
@@ -950,7 +989,9 @@ unp_gc(void *arg __unused)
if ((unp = fptounp(fp)) != NULL)
unp->unp_msgcount--;
unp_rights--;
+ rw_exit_write(&unp_lock);
(void) closef(fp, NULL);
+ rw_enter_write(&unp_lock);
}
free(defer, M_TEMP, sizeof(*defer) +
sizeof(struct fdpass) * defer->ud_n);
@@ -1021,6 +1062,8 @@ unp_gc(void *arg __unused)
}
}
unp_gcing = 0;
+unlock:
+ rw_exit_write(&unp_lock);
}
void
@@ -1066,6 +1109,8 @@ unp_mark(struct fdpass *rp, int nfds)
struct unpcb *unp;
int i;
+ rw_assert_wrlock(&unp_lock);
+
for (i = 0; i < nfds; i++) {
if (rp[i].fp == NULL)
continue;
@@ -1087,6 +1132,8 @@ void
unp_discard(struct fdpass *rp, int nfds)
{
struct unp_deferral *defer;
+
+ rw_assert_wrlock(&unp_lock);
/* copy the file pointers to a deferral structure */
defer = malloc(sizeof(*defer) + sizeof(*rp) * nfds, M_TEMP, M_WAITOK);
Index: sys/sys/unpcb.h
===================================================================
RCS file: /cvs/src/sys/sys/unpcb.h,v
retrieving revision 1.17
diff -u -p -r1.17 unpcb.h
--- sys/sys/unpcb.h 15 Jul 2019 12:28:06 -0000 1.17
+++ sys/sys/unpcb.h 1 May 2020 13:47:12 -0000
@@ -56,22 +56,27 @@
* Stream sockets keep copies of receive sockbuf sb_cc and sb_mbcnt
* so that changes in the sockbuf may be computed to modify
* back pressure on the sender accordingly.
+ *
+ * Locks used to protect struct members:
+ * I immutable after creation
+ * U unp_lock
*/
+
struct unpcb {
- struct socket *unp_socket; /* pointer back to socket */
- struct vnode *unp_vnode; /* if associated with file */
- struct file *unp_file; /* backpointer for unp_gc() */
- struct unpcb *unp_conn; /* control block of connected socket */
- ino_t unp_ino; /* fake inode number */
- SLIST_HEAD(,unpcb) unp_refs; /* referencing socket linked list */
- SLIST_ENTRY(unpcb) unp_nextref; /* link in unp_refs list */
- struct mbuf *unp_addr; /* bound address of socket */
- long unp_msgcount; /* references from socket rcv buf */
- int unp_flags; /* this unpcb contains peer eids */
- struct sockpeercred unp_connid;/* id of peer process */
- struct timespec unp_ctime; /* holds creation time */
- LIST_ENTRY(unpcb) unp_link; /* link in per-AF list of sockets */
+ struct socket *unp_socket; /* [I] pointer back to socket */
+ struct vnode *unp_vnode; /* [U] if associated with file */
+ struct file *unp_file; /* [U] backpointer for unp_gc() */
+ struct unpcb *unp_conn; /* [U] control block of connected
socket */
+ ino_t unp_ino; /* [U] fake inode number */
+ SLIST_HEAD(,unpcb) unp_refs; /* [U] referencing socket linked list */
+ SLIST_ENTRY(unpcb) unp_nextref; /* [U] link in unp_refs list */
+ struct mbuf *unp_addr; /* [U] bound address of socket */
+ long unp_msgcount; /* [U] references from socket rcv buf */
+ int unp_flags; /* [U] this unpcb contains peer eids */
+ struct sockpeercred unp_connid;/* [U] id of peer process */
+ struct timespec unp_ctime; /* [I] holds creation time */
+ LIST_ENTRY(unpcb) unp_link; /* [U] link in per-AF list of sockets */
};
/*