The latest version of the PF_UNIX sockets unlocking diff. The new
`unp_lock' rwlock(9) was used as solock()'s backend to protect the whole
layer.

We release solock() while we perform all vnode(9) related operations.
Since fifo subsystem uses unix sockets as backend this is required  to
enforce `i_lock' -> `unp_lock' lock order. The socket can't be closed by
concurrent thread because the file descriptor reference is held. Also
the new two flags `UNP_BINDING' and `UNP_CONNECTING' were introduced to
prevent connection or binding override while solock() is released.

This version was tested by claudio@. The previous version was tested by
bluhm@, but the differences are mostly in commentaries and error paths
simplification.

I hope someone else will try it and gives positive feedback which allow
to push it forward.

Index: sys/kern/uipc_socket2.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket2.c,v
retrieving revision 1.104
diff -u -p -r1.104 uipc_socket2.c
--- sys/kern/uipc_socket2.c     11 Apr 2020 14:07:06 -0000      1.104
+++ sys/kern/uipc_socket2.c     4 Feb 2021 11:19:29 -0000
@@ -53,6 +53,8 @@ u_long        sb_max = SB_MAX;                /* patchable */
 extern struct pool mclpools[];
 extern struct pool mbpool;
 
+extern struct rwlock unp_lock;
+
 /*
  * Procedures to manipulate state flags of socket
  * and do appropriate wakeups.  Normal sequence from the
@@ -282,6 +284,8 @@ solock(struct socket *so)
                NET_LOCK();
                break;
        case PF_UNIX:
+               rw_enter_write(&unp_lock);
+               break;
        case PF_ROUTE:
        case PF_KEY:
        default:
@@ -306,6 +310,8 @@ sounlock(struct socket *so, int s)
                NET_UNLOCK();
                break;
        case PF_UNIX:
+               rw_exit_write(&unp_lock);
+               break;
        case PF_ROUTE:
        case PF_KEY:
        default:
@@ -323,6 +329,8 @@ soassertlocked(struct socket *so)
                NET_ASSERT_LOCKED();
                break;
        case PF_UNIX:
+               rw_assert_wrlock(&unp_lock);
+               break;
        case PF_ROUTE:
        case PF_KEY:
        default:
@@ -335,12 +343,24 @@ int
 sosleep_nsec(struct socket *so, void *ident, int prio, const char *wmesg,
     uint64_t nsecs)
 {
-       if ((so->so_proto->pr_domain->dom_family != PF_UNIX) &&
-           (so->so_proto->pr_domain->dom_family != PF_ROUTE) &&
-           (so->so_proto->pr_domain->dom_family != PF_KEY)) {
-               return rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs);
-       } else
-               return tsleep_nsec(ident, prio, wmesg, nsecs);
+       int ret;
+
+       switch (so->so_proto->pr_domain->dom_family) {
+       case PF_INET:
+       case PF_INET6:
+               ret = rwsleep_nsec(ident, &netlock, prio, wmesg, nsecs);
+               break;
+       case PF_UNIX:
+               ret = rwsleep_nsec(ident, &unp_lock, prio, wmesg, nsecs);
+               break;
+       case PF_ROUTE:
+       case PF_KEY:
+       default:
+               ret = tsleep_nsec(ident, prio, wmesg, nsecs);
+               break;
+       }
+
+       return ret;
 }
 
 /*
Index: sys/kern/uipc_usrreq.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v
retrieving revision 1.142
diff -u -p -r1.142 uipc_usrreq.c
--- sys/kern/uipc_usrreq.c      16 Jul 2019 21:41:37 -0000      1.142
+++ sys/kern/uipc_usrreq.c      4 Feb 2021 11:19:29 -0000
@@ -51,35 +51,35 @@
 #include <sys/task.h>
 #include <sys/pledge.h>
 #include <sys/pool.h>
+#include <sys/rwlock.h>
 
-void   uipc_setaddr(const struct unpcb *, struct mbuf *);
-
-/* list of all UNIX domain sockets, for unp_gc() */
-LIST_HEAD(unp_head, unpcb) unp_head = LIST_HEAD_INITIALIZER(unp_head);
+/*
+ * Locks used to protect global data and struct members:
+ *      I       immutable after creation
+ *      U       unp_lock
+ */
+struct rwlock unp_lock = RWLOCK_INITIALIZER("unplock");
 
 /*
  * Stack of sets of files that were passed over a socket but were
  * not received and need to be closed.
  */
 struct unp_deferral {
-       SLIST_ENTRY(unp_deferral)       ud_link;
-       int     ud_n;
+       SLIST_ENTRY(unp_deferral)       ud_link;        /* [U] */
+       int                             ud_n;           /* [I] */
        /* followed by ud_n struct fdpass */
-       struct fdpass ud_fp[];
+       struct fdpass                   ud_fp[];        /* [I] */
 };
 
+void   uipc_setaddr(const struct unpcb *, struct mbuf *);
 void   unp_discard(struct fdpass *, int);
 void   unp_mark(struct fdpass *, int);
 void   unp_scan(struct mbuf *, void (*)(struct fdpass *, int));
 int    unp_nam2sun(struct mbuf *, struct sockaddr_un **, size_t *);
 
 struct pool unpcb_pool;
-/* list of sets of files that were sent over sockets that are now closed */
-SLIST_HEAD(,unp_deferral) unp_deferred = SLIST_HEAD_INITIALIZER(unp_deferred);
-
 struct task unp_gc_task = TASK_INITIALIZER(unp_gc, NULL);
 
-
 /*
  * Unix communications domain.
  *
@@ -88,14 +88,25 @@ struct task unp_gc_task = TASK_INITIALIZ
  *     rethink name space problems
  *     need a proper out-of-band
  */
-struct sockaddr sun_noname = { sizeof(sun_noname), AF_UNIX };
-ino_t  unp_ino;                        /* prototype for fake inode numbers */
+const struct   sockaddr sun_noname = { sizeof(sun_noname), AF_UNIX };
+
+/* [U] list of all UNIX domain sockets, for unp_gc() */
+LIST_HEAD(unp_head, unpcb)     unp_head =
+       LIST_HEAD_INITIALIZER(unp_head);
+/* [U] list of sets of files that were sent over sockets that are now closed */
+SLIST_HEAD(,unp_deferral)      unp_deferred =
+       SLIST_HEAD_INITIALIZER(unp_deferred);
+
+ino_t  unp_ino;        /* [U] prototype for fake inode numbers */
+int    unp_rights;     /* [U] file descriptors in flight */
+int    unp_defer;      /* [U] number of deferred fp to close by the GC task */
+int    unp_gcing;      /* [U] GC task currently running */
 
 void
 unp_init(void)
 {
        pool_init(&unpcb_pool, sizeof(struct unpcb), 0,
-           IPL_NONE, 0, "unpcb", NULL);
+           IPL_SOFTNET, 0, "unpcb", NULL);
 }
 
 void
@@ -214,7 +225,7 @@ uipc_usrreq(struct socket *so, int req, 
                switch (so->so_type) {
 
                case SOCK_DGRAM: {
-                       struct sockaddr *from;
+                       const struct sockaddr *from;
 
                        if (nam) {
                                if (unp->unp_conn) {
@@ -351,14 +362,14 @@ u_long    unpst_recvspace = PIPSIZ;
 u_long unpdg_sendspace = 2*1024;       /* really max datagram size */
 u_long unpdg_recvspace = 4*1024;
 
-int    unp_rights;                     /* file descriptors in flight */
-
 int
 uipc_attach(struct socket *so, int proto)
 {
        struct unpcb *unp;
        int error;
 
+       rw_assert_wrlock(&unp_lock);
+
        if (so->so_pcb)
                return EISCONN;
        if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
@@ -407,25 +418,48 @@ uipc_detach(struct socket *so)
 void
 unp_detach(struct unpcb *unp)
 {
-       struct vnode *vp;
+       struct socket *so = unp->unp_socket;
+       struct vnode *vp = NULL;
+
+       rw_assert_wrlock(&unp_lock);
 
        LIST_REMOVE(unp, unp_link);
        if (unp->unp_vnode) {
+               /*
+                * `v_socket' is only read in unp_connect and
+                * unplock prevents concurrent access.
+                */
+
                unp->unp_vnode->v_socket = NULL;
                vp = unp->unp_vnode;
                unp->unp_vnode = NULL;
-               vrele(vp);
        }
+
        if (unp->unp_conn)
                unp_disconnect(unp);
        while (!SLIST_EMPTY(&unp->unp_refs))
                unp_drop(SLIST_FIRST(&unp->unp_refs), ECONNRESET);
-       soisdisconnected(unp->unp_socket);
-       unp->unp_socket->so_pcb = NULL;
+       soisdisconnected(so);
+       so->so_pcb = NULL;
        m_freem(unp->unp_addr);
        pool_put(&unpcb_pool, unp);
        if (unp_rights)
                task_add(systq, &unp_gc_task);
+
+       if (vp != NULL) {
+               /*
+                * Enforce `i_lock' -> `unplock' because fifo subsystem
+                * requires it. The socket can't be closed concurrently
+                * because the file descriptor reference is
+                * still hold.
+                */
+
+               sounlock(so, SL_LOCKED);
+               KERNEL_LOCK();
+               vrele(vp);
+               KERNEL_UNLOCK();
+               solock(so);
+       }
 }
 
 int
@@ -439,6 +473,8 @@ unp_bind(struct unpcb *unp, struct mbuf 
        struct nameidata nd;
        size_t pathlen;
 
+       if (unp->unp_flags & (UNP_BINDING | UNP_CONNECTING))
+               return (EINVAL);
        if (unp->unp_vnode != NULL)
                return (EINVAL);
        if ((error = unp_nam2sun(nam, &soun, &pathlen)))
@@ -458,10 +494,24 @@ unp_bind(struct unpcb *unp, struct mbuf 
        NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT, UIO_SYSSPACE,
            soun->sun_path, p);
        nd.ni_pledge = PLEDGE_UNIX;
+
+       unp->unp_flags |= UNP_BINDING;
+
+       /*
+        * Enforce `i_lock' -> `unplock' because fifo subsystem
+        * requires it. The socket can't be closed concurrently
+        * because the file descriptor reference is still held.
+        */
+
+       sounlock(unp->unp_socket, SL_LOCKED);
+
+       KERNEL_LOCK();
 /* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
-       if ((error = namei(&nd)) != 0) {
+       error = namei(&nd);
+       if (error != 0) {
                m_freem(nam2);
-               return (error);
+               solock(unp->unp_socket);
+               goto out;
        }
        vp = nd.ni_vp;
        if (vp != NULL) {
@@ -472,7 +522,9 @@ unp_bind(struct unpcb *unp, struct mbuf 
                        vput(nd.ni_dvp);
                vrele(vp);
                m_freem(nam2);
-               return (EADDRINUSE);
+               error = EADDRINUSE;
+               solock(unp->unp_socket);
+               goto out;
        }
        VATTR_NULL(&vattr);
        vattr.va_type = VSOCK;
@@ -481,8 +533,10 @@ unp_bind(struct unpcb *unp, struct mbuf 
        vput(nd.ni_dvp);
        if (error) {
                m_freem(nam2);
-               return (error);
+               solock(unp->unp_socket);
+               goto out;
        }
+       solock(unp->unp_socket);
        unp->unp_addr = nam2;
        vp = nd.ni_vp;
        vp->v_socket = unp->unp_socket;
@@ -492,7 +546,11 @@ unp_bind(struct unpcb *unp, struct mbuf 
        unp->unp_connid.pid = p->p_p->ps_pid;
        unp->unp_flags |= UNP_FEIDSBIND;
        VOP_UNLOCK(vp);
-       return (0);
+out:
+       KERNEL_UNLOCK();
+       unp->unp_flags &= ~UNP_BINDING;
+
+       return (error);
 }
 
 int
@@ -505,36 +563,52 @@ unp_connect(struct socket *so, struct mb
        struct nameidata nd;
        int error;
 
+       unp = sotounpcb(so);
+       if (unp->unp_flags & (UNP_BINDING | UNP_CONNECTING))
+               return (EISCONN);
        if ((error = unp_nam2sun(nam, &soun, NULL)))
                return (error);
 
        NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, soun->sun_path, p);
        nd.ni_pledge = PLEDGE_UNIX;
-       if ((error = namei(&nd)) != 0)
-               return (error);
+
+       unp->unp_flags |= UNP_CONNECTING;
+
+       /*
+        * Enforce `i_lock' -> `unplock' because fifo subsystem
+        * requires it. The socket can't be closed concurrently
+        * because the file descriptor reference is still held.
+        */
+
+       sounlock(so, SL_LOCKED);
+
+       KERNEL_LOCK();
+       error = namei(&nd);
+       if (error != 0)
+               goto unlock;
        vp = nd.ni_vp;
        if (vp->v_type != VSOCK) {
                error = ENOTSOCK;
-               goto bad;
+               goto put;
        }
        if ((error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p)) != 0)
-               goto bad;
+               goto put;
+       solock(so);
        so2 = vp->v_socket;
        if (so2 == NULL) {
                error = ECONNREFUSED;
-               goto bad;
+               goto put_locked;
        }
        if (so->so_type != so2->so_type) {
                error = EPROTOTYPE;
-               goto bad;
+               goto put_locked;
        }
        if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
                if ((so2->so_options & SO_ACCEPTCONN) == 0 ||
                    (so3 = sonewconn(so2, 0)) == 0) {
                        error = ECONNREFUSED;
-                       goto bad;
+                       goto put_locked;
                }
-               unp = sotounpcb(so);
                unp2 = sotounpcb(so2);
                unp3 = sotounpcb(so3);
                if (unp2->unp_addr)
@@ -551,8 +625,15 @@ unp_connect(struct socket *so, struct mb
                }
        }
        error = unp_connect2(so, so2);
-bad:
+put_locked:
+       sounlock(so, SL_LOCKED);
+put:
        vput(vp);
+unlock:
+       KERNEL_UNLOCK();
+       solock(so);
+       unp->unp_flags &= ~UNP_CONNECTING;
+
        return (error);
 }
 
@@ -562,6 +643,8 @@ unp_connect2(struct socket *so, struct s
        struct unpcb *unp = sotounpcb(so);
        struct unpcb *unp2;
 
+       rw_assert_wrlock(&unp_lock);
+
        if (so2->so_type != so->so_type)
                return (EPROTOTYPE);
        unp2 = sotounpcb(so2);
@@ -635,15 +718,16 @@ unp_drop(struct unpcb *unp, int errno)
 {
        struct socket *so = unp->unp_socket;
 
-       KERNEL_ASSERT_LOCKED();
+       rw_assert_wrlock(&unp_lock);
 
        so->so_error = errno;
        unp_disconnect(unp);
        if (so->so_head) {
                so->so_pcb = NULL;
                /*
-                * As long as the KERNEL_LOCK() is the default lock for Unix
-                * sockets, do not release it.
+                * As long as `unp_lock' is taken before entering
+                * uipc_usrreq() releasing it here would lead to a
+                * double unlock.
                 */
                sofree(so, SL_NOUNLOCK);
                m_freem(unp->unp_addr);
@@ -685,6 +769,8 @@ unp_externalize(struct mbuf *rights, soc
        struct file *fp;
        int nfds, error = 0;
 
+       rw_assert_wrlock(&unp_lock);
+
        /*
         * This code only works because SCM_RIGHTS is the only supported
         * control message type on unix sockets. Enforce this here.
@@ -705,6 +791,10 @@ unp_externalize(struct mbuf *rights, soc
 
        /* Make sure the recipient should be able to see the descriptors.. */
        rp = (struct fdpass *)CMSG_DATA(cm);
+
+       /* fdp->fd_rdir requires KERNEL_LOCK() */
+       KERNEL_LOCK();
+
        for (i = 0; i < nfds; i++) {
                fp = rp->fp;
                rp++;
@@ -728,6 +818,8 @@ unp_externalize(struct mbuf *rights, soc
                }
        }
 
+       KERNEL_UNLOCK();
+
        fds = mallocarray(nfds, sizeof(int), M_TEMP, M_WAITOK);
 
 restart:
@@ -825,6 +917,8 @@ unp_internalize(struct mbuf *control, st
        int i, error;
        int nfds, *ip, fd, neededspace;
 
+       rw_assert_wrlock(&unp_lock);
+
        /*
         * Check for two potential msg_controllen values because
         * IETF stuck their nose in a place it does not belong.
@@ -923,8 +1017,6 @@ fail:
        return (error);
 }
 
-int    unp_defer, unp_gcing;
-
 void
 unp_gc(void *arg __unused)
 {
@@ -934,8 +1026,10 @@ unp_gc(void *arg __unused)
        struct unpcb *unp;
        int nunref, i;
 
+       rw_enter_write(&unp_lock);
+
        if (unp_gcing)
-               return;
+               goto unlock;
        unp_gcing = 1;
 
        /* close any fds on the deferred list */
@@ -950,7 +1044,9 @@ unp_gc(void *arg __unused)
                        if ((unp = fptounp(fp)) != NULL)
                                unp->unp_msgcount--;
                        unp_rights--;
+                       rw_exit_write(&unp_lock);
                        (void) closef(fp, NULL);
+                       rw_enter_write(&unp_lock);
                }
                free(defer, M_TEMP, sizeof(*defer) +
                    sizeof(struct fdpass) * defer->ud_n);
@@ -1021,6 +1117,8 @@ unp_gc(void *arg __unused)
                }
        }
        unp_gcing = 0;
+unlock:
+       rw_exit_write(&unp_lock);
 }
 
 void
@@ -1066,6 +1164,8 @@ unp_mark(struct fdpass *rp, int nfds)
        struct unpcb *unp;
        int i;
 
+       rw_assert_wrlock(&unp_lock);
+
        for (i = 0; i < nfds; i++) {
                if (rp[i].fp == NULL)
                        continue;
@@ -1087,6 +1187,8 @@ void
 unp_discard(struct fdpass *rp, int nfds)
 {
        struct unp_deferral *defer;
+
+       rw_assert_wrlock(&unp_lock);
 
        /* copy the file pointers to a deferral structure */
        defer = malloc(sizeof(*defer) + sizeof(*rp) * nfds, M_TEMP, M_WAITOK);
Index: sys/sys/unpcb.h
===================================================================
RCS file: /cvs/src/sys/sys/unpcb.h,v
retrieving revision 1.17
diff -u -p -r1.17 unpcb.h
--- sys/sys/unpcb.h     15 Jul 2019 12:28:06 -0000      1.17
+++ sys/sys/unpcb.h     4 Feb 2021 11:19:30 -0000
@@ -56,22 +56,27 @@
  * Stream sockets keep copies of receive sockbuf sb_cc and sb_mbcnt
  * so that changes in the sockbuf may be computed to modify
  * back pressure on the sender accordingly.
+ *
+ * Locks used to protect struct members:
+ *      I       immutable after creation
+ *      U       unp_lock
  */
 
+
 struct unpcb {
-       struct  socket *unp_socket;     /* pointer back to socket */
-       struct  vnode *unp_vnode;       /* if associated with file */
-       struct  file *unp_file;         /* backpointer for unp_gc() */
-       struct  unpcb *unp_conn;        /* control block of connected socket */
-       ino_t   unp_ino;                /* fake inode number */
-       SLIST_HEAD(,unpcb) unp_refs;    /* referencing socket linked list */
-       SLIST_ENTRY(unpcb) unp_nextref; /* link in unp_refs list */
-       struct  mbuf *unp_addr;         /* bound address of socket */
-       long    unp_msgcount;           /* references from socket rcv buf */
-       int     unp_flags;              /* this unpcb contains peer eids */
-       struct  sockpeercred unp_connid;/* id of peer process */
-       struct  timespec unp_ctime;     /* holds creation time */
-       LIST_ENTRY(unpcb) unp_link;     /* link in per-AF list of sockets */
+       struct  socket *unp_socket;     /* [I] pointer back to socket */
+       struct  vnode *unp_vnode;       /* [U] if associated with file */
+       struct  file *unp_file;         /* [U] backpointer for unp_gc() */
+       struct  unpcb *unp_conn;        /* [U] control block of connected 
socket */
+       ino_t   unp_ino;                /* [U] fake inode number */
+       SLIST_HEAD(,unpcb) unp_refs;    /* [U] referencing socket linked list */
+       SLIST_ENTRY(unpcb) unp_nextref; /* [U] link in unp_refs list */
+       struct  mbuf *unp_addr;         /* [U] bound address of socket */
+       long    unp_msgcount;           /* [U] references from socket rcv buf */
+       int     unp_flags;              /* [U] this unpcb contains peer eids */
+       struct  sockpeercred unp_connid;/* [U] id of peer process */
+       struct  timespec unp_ctime;     /* [I] holds creation time */
+       LIST_ENTRY(unpcb) unp_link;     /* [U] link in per-AF list of sockets */
 };
 
 /*
@@ -82,6 +87,8 @@ struct        unpcb {
 #define UNP_GCMARK     0x04            /* mark during unp_gc() */
 #define UNP_GCDEFER    0x08            /* ref'd, but not marked in this pass */
 #define UNP_GCDEAD     0x10            /* unref'd in this pass */
+#define UNP_BINDING    0x20            /* unp is binding now */
+#define UNP_CONNECTING 0x40            /* unp is connecting now */
 
 #define        sotounpcb(so)   ((struct unpcb *)((so)->so_pcb))
 

Reply via email to