The branch main has been updated by glebius:

URL: https://cgit.FreeBSD.org/src/commit/?id=a982ce04428eb3ed3eda38b31b4bdf2b763fb50c

commit a982ce04428eb3ed3eda38b31b4bdf2b763fb50c
Author:     Gleb Smirnoff <[email protected]>
AuthorDate: 2022-05-09 17:43:01 +0000
Commit:     Gleb Smirnoff <[email protected]>
CommitDate: 2022-05-09 17:43:01 +0000

    sockets: remove the socket-on-stack hack from sorflush()
    
    The hack can be traced back to 4.4BSD, where the copy was made
    under splimp() and dom_dispose was called only after splx().
    Stevens has a chapter on this function, but he doesn't explain why
    the trick is needed, or why dom_dispose can't be called under
    splimp().  In any case, on a multithreaded kernel the hack appears
    necessary to avoid LORs between the socket buffer lock and various
    filesystem locks, especially those of network file systems.
    
    The new socket buffer KPI sbcut() from 1d2df300e9b allows us to
    get rid of the hack.
    
    Reviewed by:            markj
    Differential revision:  https://reviews.freebsd.org/D35125
---
 sys/kern/uipc_socket.c | 33 +++++----------------------------
 sys/kern/uipc_usrreq.c | 18 +++++++++++++++++-
 sys/sys/sockbuf.h      |  2 --
 sys/sys/socketvar.h    |  2 ++
 4 files changed, 24 insertions(+), 31 deletions(-)
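
A minimal sketch of the sbcut()-based pattern that the unp_dispose()
hunk below implements, for readers following the locking change: the
mbuf chain is detached and the buffer released while the socket buffer
lock is held, the locks are dropped, and only then are the in-flight
rights disposed of.  The function name dispose_sketch() is
hypothetical; sbcut_locked(), sbrelease_locked(), unp_dispose_mbuf()
and the SOCK_* macros are the ones used in the commit.

static void
dispose_sketch(struct socket *so)
{
        struct sockbuf *sb = &so->so_rcv;
        struct mbuf *m;

        /* Detach the whole mbuf chain while the buffer lock is held. */
        SOCK_RECVBUF_LOCK(so);
        m = sbcut_locked(sb, sb->sb_ccc);
        sbrelease_locked(sb, so);
        SOCK_RECVBUF_UNLOCK(so);

        /*
         * sorflush() now enters dom_dispose() with the receive I/O
         * lock held; other callers may not hold it, hence the
         * ownership check.
         */
        if (SOCK_IO_RECV_OWNED(so))
                SOCK_IO_RECV_UNLOCK(so);

        /*
         * The detached chain is private to this thread, so disposing
         * of the rights (which may take filesystem locks) is now safe.
         */
        unp_dispose_mbuf(m);
}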

diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c
index 299610477068..2989d53c223e 100644
--- a/sys/kern/uipc_socket.c
+++ b/sys/kern/uipc_socket.c
@@ -2944,22 +2944,12 @@ done:
 void
 sorflush(struct socket *so)
 {
-       struct socket aso;
        struct protosw *pr;
        int error;
 
        VNET_SO_ASSERT(so);
 
        /*
-        * In order to avoid calling dom_dispose with the socket buffer mutex
-        * held, we make a partial copy of the socket buffer and clear the
-        * original.  The new socket buffer copy won't have initialized locks so
-        * we can only call routines that won't use or assert those locks.
-        * Ideally calling socantrcvmore() would prevent data from being added
-        * to the buffer, but currently it merely prevents buffered data from
-        * being read by userspace.  We make this effort to free buffered data
-        * nonetheless.
-        *
         * Dislodge threads currently blocked in receive and wait to acquire
         * a lock against other simultaneous readers before clearing the
         * socket buffer.  Don't let our acquire be interrupted by a signal
@@ -2974,28 +2964,15 @@ sorflush(struct socket *so)
                return;
        }
 
-       SOCK_RECVBUF_LOCK(so);
-       bzero(&aso, sizeof(aso));
-       aso.so_pcb = so->so_pcb;
-       bcopy(&so->so_rcv.sb_startzero, &aso.so_rcv.sb_startzero,
-           offsetof(struct sockbuf, sb_endzero) -
-           offsetof(struct sockbuf, sb_startzero));
-       bzero(&so->so_rcv.sb_startzero,
-           offsetof(struct sockbuf, sb_endzero) -
-           offsetof(struct sockbuf, sb_startzero));
-       SOCK_RECVBUF_UNLOCK(so);
-       SOCK_IO_RECV_UNLOCK(so);
-
-       /*
-        * Dispose of special rights and flush the copied socket.  Don't call
-        * any unsafe routines (that rely on locks being initialized) on aso.
-        */
        pr = so->so_proto;
        if (pr->pr_flags & PR_RIGHTS) {
                MPASS(pr->pr_domain->dom_dispose != NULL);
-               (*pr->pr_domain->dom_dispose)(&aso);
+               (*pr->pr_domain->dom_dispose)(so);
+       } else {
+               sbrelease(&so->so_rcv, so);
+               SOCK_IO_RECV_UNLOCK(so);
        }
-       sbrelease_internal(&aso.so_rcv, so);
+
 }
 
 /*
diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c
index d56922c6fa3a..82b291fa835d 100644
--- a/sys/kern/uipc_usrreq.c
+++ b/sys/kern/uipc_usrreq.c
@@ -2767,7 +2767,9 @@ unp_dispose_mbuf(struct mbuf *m)
 static void
 unp_dispose(struct socket *so)
 {
+       struct sockbuf *sb = &so->so_rcv;
        struct unpcb *unp;
+       struct mbuf *m;
 
        MPASS(!SOLISTENING(so));
 
@@ -2775,7 +2777,21 @@ unp_dispose(struct socket *so)
        UNP_LINK_WLOCK();
        unp->unp_gcflag |= UNPGC_IGNORE_RIGHTS;
        UNP_LINK_WUNLOCK();
-       unp_dispose_mbuf(so->so_rcv.sb_mb);
+
+       /*
+        * Grab our special mbufs before calling sbrelease().
+        */
+       SOCK_RECVBUF_LOCK(so);
+       m = sbcut_locked(sb, sb->sb_ccc);
+       KASSERT(sb->sb_ccc == 0 && sb->sb_mb == 0 && sb->sb_mbcnt == 0,
+           ("%s: ccc %u mb %p mbcnt %u", __func__,
+           sb->sb_ccc, (void *)sb->sb_mb, sb->sb_mbcnt));
+       sbrelease_locked(sb, so);
+       SOCK_RECVBUF_UNLOCK(so);
+       if (SOCK_IO_RECV_OWNED(so))
+               SOCK_IO_RECV_UNLOCK(so);
+
+       unp_dispose_mbuf(m);
 }
 
 static void
diff --git a/sys/sys/sockbuf.h b/sys/sys/sockbuf.h
index eb35a372cae5..ef323bf59da7 100644
--- a/sys/sys/sockbuf.h
+++ b/sys/sys/sockbuf.h
@@ -83,7 +83,6 @@ struct sockbuf {
        struct  mtx *sb_mtx;            /* sockbuf lock */
        struct  selinfo *sb_sel;        /* process selecting read/write */
        short   sb_state;       /* (a) socket state on sockbuf */
-#define        sb_startzero    sb_flags
        short   sb_flags;       /* (a) flags, see above */
        struct  mbuf *sb_mb;    /* (a) the mbuf chain */
        struct  mbuf *sb_mbtail; /* (a) the last mbuf in the chain */
@@ -108,7 +107,6 @@ struct sockbuf {
        struct  mbuf *sb_mtlstail; /* (a) last mbuf in TLS chain */
        int     (*sb_upcall)(struct socket *, void *, int); /* (a) */
        void    *sb_upcallarg;  /* (a) */
-#define        sb_endzero      sb_tls_seqno
        uint64_t sb_tls_seqno;  /* (a) TLS seqno */
        struct  ktls_session *sb_tls_info; /* (a + b) TLS state */
        TAILQ_HEAD(, kaiocb) sb_aiojobq; /* (a) pending AIO ops */
diff --git a/sys/sys/socketvar.h b/sys/sys/socketvar.h
index b379dc319cea..fe6faa842bda 100644
--- a/sys/sys/socketvar.h
+++ b/sys/sys/socketvar.h
@@ -301,10 +301,12 @@ typedef enum { SO_RCV, SO_SND } sb_which;
        soiolock((so), &(so)->so_snd_sx, (flags))
 #define        SOCK_IO_SEND_UNLOCK(so)                                 \
        soiounlock(&(so)->so_snd_sx)
+#define        SOCK_IO_SEND_OWNED(so)  sx_xlocked(&(so)->so_snd_sx)
 #define        SOCK_IO_RECV_LOCK(so, flags)                            \
        soiolock((so), &(so)->so_rcv_sx, (flags))
 #define        SOCK_IO_RECV_UNLOCK(so)                                 \
        soiounlock(&(so)->so_rcv_sx)
+#define        SOCK_IO_RECV_OWNED(so)  sx_xlocked(&(so)->so_rcv_sx)
+#define        SOCK_IO_RECV_OWNED(so)  sx_xlocked(&(so)->so_rcv_sx)
 
 /*
  * Do we need to notify the other side when I/O is possible?
