I want to move a little bit forward with fine grained locking in UNIX
domain sockets area. My goal is to introduce the new rwlock(9) for
garbage collector data and keep existing `unp_lock' only for per-socket
data. This will allow to replace it by per-socket `so_lock' in the
future.

I like to start from unp_externalize(). It touches only garbage
collector data such as `unp_rights', `unp_msgcount' directly and
`unp_deferred' through unp_discard(). So `unp_lock' could be dropped
outside unp_externalize(). I don't want to introduce the new gc related
rwlock(9) with this diff, so `unp_lock' is still taken around garbage
collector data access. But right now this diff removes M_WAITOK
allocation from rwlock(9) and useless and weird fdplock() dances and
memory allocation from the error path. Also with this diff the only
place where we have `unp_lock' and fdplock() are both taken will be the
unp_internalize(). This allows to change their existing lock order or
completely avoid the case where these locks are both taken. This should
help to mpi@'s knote(9) work.

I want to do the unp_internalize() related work and introduce the new
garbage collector related rwlock(9) in the further diffs.

The socket locking could be not clean to somebody, so I like to put a
little explanation here. The solock() (`unp_lock') release is absolutely
safe around unp_externalize() within soreceive() path. The socket's
`so_rcv' buffer is still locked by sblock(). Concurrent thread could
disconnect this socket, but our soreceive() thread will be finished fine.
Such races are application specific. Also the lock sequence:

        s = solock(so);
        sblock(so, &so->so_rcv, ...);
        sounlock(so, s);
        s = solock(so);

does not provide lock order issue because the rwsleep_nsec(9) called
within sblock() releases passed rwlock(9). Since sblock() and sosleep()
release rwlock(9) taken by preceding solock(), the only `so_state' flags
and the PCB flags like `unp_flags' make socket consistent but not
solock().

Index: sys/kern/uipc_socket.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.264
diff -u -p -r1.264 uipc_socket.c
--- sys/kern/uipc_socket.c      26 Jul 2021 05:51:13 -0000      1.264
+++ sys/kern/uipc_socket.c      2 Oct 2021 23:07:04 -0000
@@ -892,9 +892,11 @@ dontblock:
                        sbsync(&so->so_rcv, nextrecord);
                        if (controlp) {
                                if (pr->pr_domain->dom_externalize) {
+                                       sounlock(so, s);
                                        error =
                                            (*pr->pr_domain->dom_externalize)
                                            (cm, controllen, flags);
+                                       s = solock(so);
                                }
                                *controlp = cm;
                        } else {
Index: sys/kern/uipc_usrreq.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v
retrieving revision 1.148
diff -u -p -r1.148 uipc_usrreq.c
--- sys/kern/uipc_usrreq.c      25 May 2021 22:45:09 -0000      1.148
+++ sys/kern/uipc_usrreq.c      2 Oct 2021 23:07:04 -0000
@@ -817,8 +817,6 @@ unp_externalize(struct mbuf *rights, soc
        struct file *fp;
        int nfds, error = 0;
 
-       rw_assert_wrlock(&unp_lock);
-
        /*
         * This code only works because SCM_RIGHTS is the only supported
         * control message type on unix sockets. Enforce this here.
@@ -834,7 +832,7 @@ unp_externalize(struct mbuf *rights, soc
                controllen -= CMSG_ALIGN(sizeof(struct cmsghdr));
        if (nfds > controllen / sizeof(int)) {
                error = EMSGSIZE;
-               goto restart;
+               goto out;
        }
 
        /* Make sure the recipient should be able to see the descriptors.. */
@@ -868,18 +866,13 @@ unp_externalize(struct mbuf *rights, soc
 
        KERNEL_UNLOCK();
 
+       if (error)
+               goto out;
+
        fds = mallocarray(nfds, sizeof(int), M_TEMP, M_WAITOK);
 
-restart:
        fdplock(fdp);
-       if (error != 0) {
-               if (nfds > 0) {
-                       rp = ((struct fdpass *)CMSG_DATA(cm));
-                       unp_discard(rp, nfds);
-               }
-               goto out;
-       }
-
+restart:
        /*
         * First loop -- allocate file descriptor table slots for the
         * new descriptors.
@@ -895,17 +888,19 @@ restart:
 
                        if (error == ENOSPC) {
                                fdexpand(p);
-                               error = 0;
-                       } else {
-                               /*
-                                * This is the error that has historically
-                                * been returned, and some callers may
-                                * expect it.
-                                */
-                               error = EMSGSIZE;
+                               goto restart;
                        }
+
                        fdpunlock(fdp);
-                       goto restart;
+
+                       /*
+                        * This is the error that has historically
+                        * been returned, and some callers may
+                        * expect it.
+                        */
+
+                       error = EMSGSIZE;
+                       goto out;
                }
 
                /*
@@ -924,12 +919,15 @@ restart:
 
                rp++;
        }
+       fdpunlock(fdp);
 
        /*
         * Now that adding them has succeeded, update all of the
         * descriptor passing state.
         */
        rp = (struct fdpass *)CMSG_DATA(cm);
+
+       rw_enter_write(&unp_lock);
        for (i = 0; i < nfds; i++) {
                struct unpcb *unp;
 
@@ -939,6 +937,7 @@ restart:
                        unp->unp_msgcount--;
                unp_rights--;
        }
+       rw_exit_write(&unp_lock);
 
        /*
         * Copy temporary array to message and adjust length, in case of
@@ -947,10 +946,19 @@ restart:
        memcpy(CMSG_DATA(cm), fds, nfds * sizeof(int));
        cm->cmsg_len = CMSG_LEN(nfds * sizeof(int));
        rights->m_len = CMSG_LEN(nfds * sizeof(int));
- out:
-       fdpunlock(fdp);
+out:
        if (fds != NULL)
                free(fds, M_TEMP, nfds * sizeof(int));
+
+       if (error) {
+               if (nfds > 0) {
+                       rp = ((struct fdpass *)CMSG_DATA(cm));
+                       rw_enter_write(&unp_lock);
+                       unp_discard(rp, nfds);
+                       rw_exit_write(&unp_lock);
+               }
+       }
+
        return (error);
 }
 

Reply via email to