socket(2) layer is already protected by solock(). It grabs NET_LOCK()
for inet{,6}(4) sockets, but all other sockets are still under
KERNEL_LOCK().

I guess solock is already placed everythere it's required. Also `struct
file' is already mp-safe. 

Diff below introduces rwlock `unp_lock'. It's like NET_LOCK()'s
`netlock' but for PF_UNIX sockets. This lock protects the whole PF_UNIX
layer. Vnode layer accessed in "connect" "bind" and "desconnect" cases
is still under KERNEL_LOCK().

I'am experimenting with his diff since 19.04.2020 and my test systems,
include Gnome workstaion are stable, without any issue.


Index: sys/kern/uipc_socket2.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket2.c,v
retrieving revision 1.104
diff -u -p -r1.104 uipc_socket2.c
--- sys/kern/uipc_socket2.c     11 Apr 2020 14:07:06 -0000      1.104
+++ sys/kern/uipc_socket2.c     1 May 2020 13:47:11 -0000
@@ -53,6 +53,8 @@ u_long        sb_max = SB_MAX;                /* patchable */
 extern struct pool mclpools[];
 extern struct pool mbpool;
 
+extern struct rwlock unp_lock;
+
 /*
  * Procedures to manipulate state flags of socket
  * and do appropriate wakeups.  Normal sequence from the
@@ -282,6 +284,8 @@ solock(struct socket *so)
                NET_LOCK();
                break;
        case PF_UNIX:
+               rw_enter_write(&unp_lock);
+               break;
        case PF_ROUTE:
        case PF_KEY:
        default:
@@ -306,6 +310,8 @@ sounlock(struct socket *so, int s)
                NET_UNLOCK();
                break;
        case PF_UNIX:
+               rw_exit_write(&unp_lock);
+               break;
        case PF_ROUTE:
        case PF_KEY:
        default:
@@ -323,6 +329,8 @@ soassertlocked(struct socket *so)
                NET_ASSERT_LOCKED();
                break;
        case PF_UNIX:
+               rw_assert_wrlock(&unp_lock);
+               break;
        case PF_ROUTE:
        case PF_KEY:
        default:
@@ -335,12 +343,24 @@ int
 sosleep_nsec(struct socket *so, void *ident, int prio, const char *wmesg,
     uint64_t nsecs)
 {
-       if ((so->so_proto->pr_domain->dom_family != PF_UNIX) &&
-           (so->so_proto->pr_domain->dom_family != PF_ROUTE) &&
-           (so->so_proto->pr_domain->dom_family != PF_KEY)) {
-               return rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs);
-       } else
-               return tsleep_nsec(ident, prio, wmesg, nsecs);
+       int ret;
+
+       switch (so->so_proto->pr_domain->dom_family) {
+       case PF_INET:
+       case PF_INET6:
+               ret = rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs);
+               break;
+       case PF_UNIX:
+               ret = rwsleep_nsec(ident, &unp_lock, prio, wmesg, nsecs);
+               break;
+       case PF_ROUTE:
+       case PF_KEY:
+       default:
+               ret = tsleep_nsec(ident, prio, wmesg, nsecs);
+               break;
+       }
+
+       return ret;
 }
 
 /*
Index: sys/kern/uipc_usrreq.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v
retrieving revision 1.142
diff -u -p -r1.142 uipc_usrreq.c
--- sys/kern/uipc_usrreq.c      16 Jul 2019 21:41:37 -0000      1.142
+++ sys/kern/uipc_usrreq.c      1 May 2020 13:47:11 -0000
@@ -51,10 +51,17 @@
 #include <sys/task.h>
 #include <sys/pledge.h>
 #include <sys/pool.h>
+#include <sys/rwlock.h>                                                        
                     
+/*
+ * Locks used to protect global data and struct members:
+ *      I       immutable after creation
+ *      U       unp_lock
+ */
+struct rwlock unp_lock = RWLOCK_INITIALIZER("unplock");
 
 void   uipc_setaddr(const struct unpcb *, struct mbuf *);
 
-/* list of all UNIX domain sockets, for unp_gc() */
+/* [U] list of all UNIX domain sockets, for unp_gc() */
 LIST_HEAD(unp_head, unpcb) unp_head = LIST_HEAD_INITIALIZER(unp_head);
 
 /*
@@ -62,10 +69,10 @@ LIST_HEAD(unp_head, unpcb) unp_head = LI
  * not received and need to be closed.
  */
 struct unp_deferral {
-       SLIST_ENTRY(unp_deferral)       ud_link;
-       int     ud_n;
+       SLIST_ENTRY(unp_deferral)       ud_link;        /* [U] */
+       int                             ud_n;           /* [I] */
        /* followed by ud_n struct fdpass */
-       struct fdpass ud_fp[];
+       struct fdpass                   ud_fp[];        /* [I] */
 };
 
 void   unp_discard(struct fdpass *, int);
@@ -74,7 +81,7 @@ void  unp_scan(struct mbuf *, void (*)(st
 int    unp_nam2sun(struct mbuf *, struct sockaddr_un **, size_t *);
 
 struct pool unpcb_pool;
-/* list of sets of files that were sent over sockets that are now closed */
+/* [U] list of sets of files that were sent over sockets that are now closed */
 SLIST_HEAD(,unp_deferral) unp_deferred = SLIST_HEAD_INITIALIZER(unp_deferred);
 
 struct task unp_gc_task = TASK_INITIALIZER(unp_gc, NULL);
@@ -89,7 +96,7 @@ struct task unp_gc_task = TASK_INITIALIZ
  *     need a proper out-of-band
  */
 struct sockaddr sun_noname = { sizeof(sun_noname), AF_UNIX };
-ino_t  unp_ino;                        /* prototype for fake inode numbers */
+ino_t  unp_ino;                /* [U] prototype for fake inode numbers */
 
 void
 unp_init(void)
@@ -351,7 +358,7 @@ u_long      unpst_recvspace = PIPSIZ;
 u_long unpdg_sendspace = 2*1024;       /* really max datagram size */
 u_long unpdg_recvspace = 4*1024;
 
-int    unp_rights;                     /* file descriptors in flight */
+int    unp_rights;                     /* [U] file descriptors in flight */
 
 int
 uipc_attach(struct socket *so, int proto)
@@ -359,6 +366,8 @@ uipc_attach(struct socket *so, int proto
        struct unpcb *unp;
        int error;
 
+       rw_assert_wrlock(&unp_lock);
+
        if (so->so_pcb)
                return EISCONN;
        if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
@@ -409,12 +418,17 @@ unp_detach(struct unpcb *unp)
 {
        struct vnode *vp;
 
+       rw_assert_wrlock(&unp_lock);
+
        LIST_REMOVE(unp, unp_link);
        if (unp->unp_vnode) {
+               /* this is an unlocked cleaning of `v_socket', but it's safe */
                unp->unp_vnode->v_socket = NULL;
                vp = unp->unp_vnode;
                unp->unp_vnode = NULL;
+               KERNEL_LOCK();
                vrele(vp);
+               KERNEL_UNLOCK();
        }
        if (unp->unp_conn)
                unp_disconnect(unp);
@@ -458,10 +472,11 @@ unp_bind(struct unpcb *unp, struct mbuf 
        NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT, UIO_SYSSPACE,
            soun->sun_path, p);
        nd.ni_pledge = PLEDGE_UNIX;
+       KERNEL_LOCK();
 /* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
        if ((error = namei(&nd)) != 0) {
                m_freem(nam2);
-               return (error);
+               goto unlock;
        }
        vp = nd.ni_vp;
        if (vp != NULL) {
@@ -472,7 +487,8 @@ unp_bind(struct unpcb *unp, struct mbuf 
                        vput(nd.ni_dvp);
                vrele(vp);
                m_freem(nam2);
-               return (EADDRINUSE);
+               error = EADDRINUSE;
+               goto unlock;
        }
        VATTR_NULL(&vattr);
        vattr.va_type = VSOCK;
@@ -481,7 +497,7 @@ unp_bind(struct unpcb *unp, struct mbuf 
        vput(nd.ni_dvp);
        if (error) {
                m_freem(nam2);
-               return (error);
+               goto unlock;
        }
        unp->unp_addr = nam2;
        vp = nd.ni_vp;
@@ -492,7 +508,10 @@ unp_bind(struct unpcb *unp, struct mbuf 
        unp->unp_connid.pid = p->p_p->ps_pid;
        unp->unp_flags |= UNP_FEIDSBIND;
        VOP_UNLOCK(vp);
-       return (0);
+unlock:
+       KERNEL_UNLOCK();
+
+       return error;
 }
 
 int
@@ -510,8 +529,9 @@ unp_connect(struct socket *so, struct mb
 
        NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, soun->sun_path, p);
        nd.ni_pledge = PLEDGE_UNIX;
+       KERNEL_LOCK();
        if ((error = namei(&nd)) != 0)
-               return (error);
+               goto unlock;
        vp = nd.ni_vp;
        if (vp->v_type != VSOCK) {
                error = ENOTSOCK;
@@ -553,6 +573,9 @@ unp_connect(struct socket *so, struct mb
        error = unp_connect2(so, so2);
 bad:
        vput(vp);
+unlock:
+       KERNEL_UNLOCK();
+
        return (error);
 }
 
@@ -562,6 +585,8 @@ unp_connect2(struct socket *so, struct s
        struct unpcb *unp = sotounpcb(so);
        struct unpcb *unp2;
 
+       rw_assert_wrlock(&unp_lock);
+
        if (so2->so_type != so->so_type)
                return (EPROTOTYPE);
        unp2 = sotounpcb(so2);
@@ -635,15 +660,16 @@ unp_drop(struct unpcb *unp, int errno)
 {
        struct socket *so = unp->unp_socket;
 
-       KERNEL_ASSERT_LOCKED();
+       rw_assert_wrlock(&unp_lock);
 
        so->so_error = errno;
        unp_disconnect(unp);
        if (so->so_head) {
                so->so_pcb = NULL;
                /*
-                * As long as the KERNEL_LOCK() is the default lock for Unix
-                * sockets, do not release it.
+                * As long as `unp_lock' is taken before entering
+                * uipc_usrreq() releasing it here would lead to a
+                * double unlock.
                 */
                sofree(so, SL_NOUNLOCK);
                m_freem(unp->unp_addr);
@@ -685,6 +711,8 @@ unp_externalize(struct mbuf *rights, soc
        struct file *fp;
        int nfds, error = 0;
 
+       rw_assert_wrlock(&unp_lock);
+
        /*
         * This code only works because SCM_RIGHTS is the only supported
         * control message type on unix sockets. Enforce this here.
@@ -705,6 +733,10 @@ unp_externalize(struct mbuf *rights, soc
 
        /* Make sure the recipient should be able to see the descriptors.. */
        rp = (struct fdpass *)CMSG_DATA(cm);
+
+       /* fdp->fd_rdir requires KERNEL_LOCK() */
+       KERNEL_LOCK();
+
        for (i = 0; i < nfds; i++) {
                fp = rp->fp;
                rp++;
@@ -728,6 +760,8 @@ unp_externalize(struct mbuf *rights, soc
                }
        }
 
+       KERNEL_UNLOCK();
+
        fds = mallocarray(nfds, sizeof(int), M_TEMP, M_WAITOK);
 
 restart:
@@ -825,6 +859,8 @@ unp_internalize(struct mbuf *control, st
        int i, error;
        int nfds, *ip, fd, neededspace;
 
+       rw_assert_wrlock(&unp_lock);
+
        /*
         * Check for two potential msg_controllen values because
         * IETF stuck their nose in a place it does not belong.
@@ -923,7 +959,8 @@ fail:
        return (error);
 }
 
-int    unp_defer, unp_gcing;
+int unp_defer; /* [U] number of deferred fp to close by the GC task */
+int unp_gcing; /* [U] GC task currently running */
 
 void
 unp_gc(void *arg __unused)
@@ -934,8 +971,10 @@ unp_gc(void *arg __unused)
        struct unpcb *unp;
        int nunref, i;
 
+       rw_enter_write(&unp_lock);
+
        if (unp_gcing)
-               return;
+               goto unlock;
        unp_gcing = 1;
 
        /* close any fds on the deferred list */
@@ -950,7 +989,9 @@ unp_gc(void *arg __unused)
                        if ((unp = fptounp(fp)) != NULL)
                                unp->unp_msgcount--;
                        unp_rights--;
+                       rw_exit_write(&unp_lock);
                        (void) closef(fp, NULL);
+                       rw_enter_write(&unp_lock);
                }
                free(defer, M_TEMP, sizeof(*defer) +
                    sizeof(struct fdpass) * defer->ud_n);
@@ -1021,6 +1062,8 @@ unp_gc(void *arg __unused)
                }
        }
        unp_gcing = 0;
+unlock:
+       rw_exit_write(&unp_lock);
 }
 
 void
@@ -1066,6 +1109,8 @@ unp_mark(struct fdpass *rp, int nfds)
        struct unpcb *unp;
        int i;
 
+       rw_assert_wrlock(&unp_lock);
+
        for (i = 0; i < nfds; i++) {
                if (rp[i].fp == NULL)
                        continue;
@@ -1087,6 +1132,8 @@ void
 unp_discard(struct fdpass *rp, int nfds)
 {
        struct unp_deferral *defer;
+
+       rw_assert_wrlock(&unp_lock);
 
        /* copy the file pointers to a deferral structure */
        defer = malloc(sizeof(*defer) + sizeof(*rp) * nfds, M_TEMP, M_WAITOK);
Index: sys/sys/unpcb.h
===================================================================
RCS file: /cvs/src/sys/sys/unpcb.h,v
retrieving revision 1.17
diff -u -p -r1.17 unpcb.h
--- sys/sys/unpcb.h     15 Jul 2019 12:28:06 -0000      1.17
+++ sys/sys/unpcb.h     1 May 2020 13:47:12 -0000
@@ -56,22 +56,27 @@
  * Stream sockets keep copies of receive sockbuf sb_cc and sb_mbcnt
  * so that changes in the sockbuf may be computed to modify
  * back pressure on the sender accordingly.
+ *
+ * Locks used to protect struct members:
+ *      I       immutable after creation
+ *      U       unp_lock
  */
 
+
 struct unpcb {
-       struct  socket *unp_socket;     /* pointer back to socket */
-       struct  vnode *unp_vnode;       /* if associated with file */
-       struct  file *unp_file;         /* backpointer for unp_gc() */
-       struct  unpcb *unp_conn;        /* control block of connected socket */
-       ino_t   unp_ino;                /* fake inode number */
-       SLIST_HEAD(,unpcb) unp_refs;    /* referencing socket linked list */
-       SLIST_ENTRY(unpcb) unp_nextref; /* link in unp_refs list */
-       struct  mbuf *unp_addr;         /* bound address of socket */
-       long    unp_msgcount;           /* references from socket rcv buf */
-       int     unp_flags;              /* this unpcb contains peer eids */
-       struct  sockpeercred unp_connid;/* id of peer process */
-       struct  timespec unp_ctime;     /* holds creation time */
-       LIST_ENTRY(unpcb) unp_link;     /* link in per-AF list of sockets */
+       struct  socket *unp_socket;     /* [I] pointer back to socket */
+       struct  vnode *unp_vnode;       /* [U] if associated with file */
+       struct  file *unp_file;         /* [U] backpointer for unp_gc() */
+       struct  unpcb *unp_conn;        /* [U] control block of connected 
socket */
+       ino_t   unp_ino;                /* [U] fake inode number */
+       SLIST_HEAD(,unpcb) unp_refs;    /* [U] referencing socket linked list */
+       SLIST_ENTRY(unpcb) unp_nextref; /* [U] link in unp_refs list */
+       struct  mbuf *unp_addr;         /* [U] bound address of socket */
+       long    unp_msgcount;           /* [U] references from socket rcv buf */
+       int     unp_flags;              /* [U] this unpcb contains peer eids */
+       struct  sockpeercred unp_connid;/* [U] id of peer process */
+       struct  timespec unp_ctime;     /* [I] holds creation time */
+       LIST_ENTRY(unpcb) unp_link;     /* [U] link in per-AF list of sockets */
 };
 
 /*

Reply via email to