Author: glebius
Date: Thu Jun  8 21:30:34 2017
New Revision: 319722
URL: https://svnweb.freebsd.org/changeset/base/319722

Log:
  Listening sockets improvements.
  
  o Separate fields of struct socket that belong to listening from
    fields that belong to normal dataflow, and unionize them.  This
    shrinks the structure a bit.
    - Move the selinfos out of the socket buffers and into the socket. The
      first reason is to support the braindamaged scenario where a socket
      is added to kevent(2) and then listen(2) is called on it. The second
      reason is that there is a future plan to make socket buffers
      pluggable, so that for a dataflow socket a socket buffer can be
      changed, and in this case we also want to keep the same selinfos
      through the lifetime of a socket.
    - Remove struct so_accf. Since the listening stuff no longer affects
      the size of struct socket, just move its fields into the listening
      part of the union.
    - Provide sol_upcall field and enforce that so_upcall_set() may be called
      only on a dataflow socket, which has buffers, and for listening sockets
      provide solisten_upcall_set().
  
  o Remove ACCEPT_LOCK() global.
    - Add a mutex to socket, to be used instead of socket buffer lock to lock
      fields of struct socket that don't belong to a socket buffer.
    - Allow acquiring two socket locks, but the first one must belong to a
      listening socket.
    - Make soref()/sorele() use atomic(9).  In some situations this allows
      doing soref() without owning the socket lock.  There is room for
      improvement here: it is possible to make sorele() lock optionally
      as well.
    - Most protocols aren't touched by this change, except UNIX local sockets.
      See below for more information.
  
  o Reduce copy-and-paste in kernel modules that accept connections from
    listening sockets: provide function solisten_dequeue(), and use it in
    the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
    infiniband, rpc.
  
  o UNIX local sockets.
    - Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
      local sockets.  Most races exist around spawning a new socket, when we
      are connecting to a local listening socket.  To cover them, we need to
      hold locks on both PCBs when spawning a third one.  This means holding
      them across sonewconn().  This creates a LOR between pcb locks and
      unp_list_lock.
    - To fix the new LOR, abandon the global unp_list_lock in favor of the
      global unp_link_lock.  Indeed, separating these two locks didn't
      provide us any extra parallelism in the UNIX sockets.
    - Now a call into uipc_attach() may happen with unp_link_lock held, if
      we are accepting, or without unp_link_lock, if we are just creating
      a socket.
    - Another problem in UNIX sockets is that uipc_close() basically did
      nothing for a listening socket.  The vnode remained open for
      connections.  This is fixed by removing the vnode in uipc_close().
      Maybe the right way would be to do it for all sockets (not only
      listening ones), i.e. simply move the vnode teardown from
      uipc_detach() to uipc_close()?
  
  Sponsored by:         Netflix
  Differential Revision:        https://reviews.freebsd.org/D9770

Modified:
  head/sys/cam/ctl/ctl_ha.c
  head/sys/dev/iscsi/icl_soft_proxy.c
  head/sys/kern/sys_socket.c
  head/sys/kern/uipc_accf.c
  head/sys/kern/uipc_debug.c
  head/sys/kern/uipc_sockbuf.c
  head/sys/kern/uipc_socket.c
  head/sys/kern/uipc_syscalls.c
  head/sys/kern/uipc_usrreq.c
  head/sys/netgraph/bluetooth/socket/ng_btsocket_l2cap.c
  head/sys/netgraph/bluetooth/socket/ng_btsocket_rfcomm.c
  head/sys/netgraph/bluetooth/socket/ng_btsocket_sco.c
  head/sys/netgraph/ng_ksocket.c
  head/sys/netinet/sctp_input.c
  head/sys/netinet/sctp_syscalls.c
  head/sys/netinet/sctp_sysctl.c
  head/sys/netinet/sctp_usrreq.c
  head/sys/netinet/tcp_subr.c
  head/sys/netinet/tcp_syncache.c
  head/sys/netinet/tcp_timewait.c
  head/sys/ofed/drivers/infiniband/core/iwcm.c
  head/sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c
  head/sys/rpc/svc_vc.c
  head/sys/sys/sockbuf.h
  head/sys/sys/socket.h
  head/sys/sys/socketvar.h
  head/usr.bin/netstat/inet.c

Modified: head/sys/cam/ctl/ctl_ha.c
==============================================================================
--- head/sys/cam/ctl/ctl_ha.c   Thu Jun  8 21:06:18 2017        (r319721)
+++ head/sys/cam/ctl/ctl_ha.c   Thu Jun  8 21:30:34 2017        (r319722)
@@ -458,45 +458,20 @@ out:
 static int
 ctl_ha_accept(struct ha_softc *softc)
 {
-       struct socket *so;
+       struct socket *lso, *so;
        struct sockaddr *sap;
        int error;
 
-       ACCEPT_LOCK();
-       if (softc->ha_lso->so_rcv.sb_state & SBS_CANTRCVMORE)
-               softc->ha_lso->so_error = ECONNABORTED;
-       if (softc->ha_lso->so_error) {
-               error = softc->ha_lso->so_error;
-               softc->ha_lso->so_error = 0;
-               ACCEPT_UNLOCK();
+       lso = softc->ha_lso;
+       SOLISTEN_LOCK(lso);
+       error = solisten_dequeue(lso, &so, 0);
+       if (error == EWOULDBLOCK)
+               return (error);
+       if (error) {
                printf("%s: socket error %d\n", __func__, error);
                goto out;
        }
-       so = TAILQ_FIRST(&softc->ha_lso->so_comp);
-       if (so == NULL) {
-               ACCEPT_UNLOCK();
-               return (EWOULDBLOCK);
-       }
-       KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP"));
-       KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP"));
 
-       /*
-        * Before changing the flags on the socket, we have to bump the
-        * reference count.  Otherwise, if the protocol calls sofree(),
-        * the socket will be released due to a zero refcount.
-        */
-       SOCK_LOCK(so);                  /* soref() and so_state update */
-       soref(so);                      /* file descriptor reference */
-
-       TAILQ_REMOVE(&softc->ha_lso->so_comp, so, so_list);
-       softc->ha_lso->so_qlen--;
-       so->so_state |= SS_NBIO;
-       so->so_qstate &= ~SQ_COMP;
-       so->so_head = NULL;
-
-       SOCK_UNLOCK(so);
-       ACCEPT_UNLOCK();
-
        sap = NULL;
        error = soaccept(so, &sap);
        if (error != 0) {
@@ -556,9 +531,6 @@ ctl_ha_listen(struct ha_softc *softc)
                        printf("%s: REUSEPORT setting failed %d\n",
                            __func__, error);
                }
-               SOCKBUF_LOCK(&softc->ha_lso->so_rcv);
-               soupcall_set(softc->ha_lso, SO_RCV, ctl_ha_lupcall, softc);
-               SOCKBUF_UNLOCK(&softc->ha_lso->so_rcv);
        }
 
        memcpy(&sa, &softc->ha_peer_in, sizeof(sa));
@@ -572,6 +544,10 @@ ctl_ha_listen(struct ha_softc *softc)
                printf("%s: solisten() error %d\n", __func__, error);
                goto out;
        }
+       SOLISTEN_LOCK(softc->ha_lso);
+       softc->ha_lso->so_state |= SS_NBIO;
+       solisten_upcall_set(softc->ha_lso, ctl_ha_lupcall, softc);
+       SOLISTEN_UNLOCK(softc->ha_lso);
        return (0);
 
 out:

Modified: head/sys/dev/iscsi/icl_soft_proxy.c
==============================================================================
--- head/sys/dev/iscsi/icl_soft_proxy.c Thu Jun  8 21:06:18 2017        (r319721)
+++ head/sys/dev/iscsi/icl_soft_proxy.c Thu Jun  8 21:30:34 2017        (r319722)
@@ -92,7 +92,6 @@ struct icl_listen_sock {
        struct icl_listen               *ils_listen;
        struct socket                   *ils_socket;
        bool                            ils_running;
-       bool                            ils_disconnecting;
        int                             ils_id;
 };
 
@@ -184,7 +183,9 @@ icl_listen_free(struct icl_listen *il)
                while (ils->ils_running) {
                        ICL_DEBUG("waiting for accept thread to terminate");
                        sx_xunlock(&il->il_lock);
-                       ils->ils_disconnecting = true;
+                       SOLISTEN_LOCK(ils->ils_socket);
+                       ils->ils_socket->so_error = ENOTCONN;
+                       SOLISTEN_UNLOCK(ils->ils_socket);
                        wakeup(&ils->ils_socket->so_timeo);
                        pause("icl_unlisten", 1 * hz);
                        sx_xlock(&il->il_lock);
@@ -200,9 +201,9 @@ icl_listen_free(struct icl_listen *il)
 }
 
 /*
- * XXX: Doing accept in a separate thread in each socket might not be the best way
- *     to do stuff, but it's pretty clean and debuggable - and you probably won't
- *     have hundreds of listening sockets anyway.
+ * XXX: Doing accept in a separate thread in each socket might not be the
+ * best way to do stuff, but it's pretty clean and debuggable - and you
+ * probably won't have hundreds of listening sockets anyway.
  */
 static void
 icl_accept_thread(void *arg)
@@ -218,55 +219,22 @@ icl_accept_thread(void *arg)
        ils->ils_running = true;
 
        for (;;) {
-               ACCEPT_LOCK();
-               while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0 && ils->ils_disconnecting == false) {
-                       if (head->so_rcv.sb_state & SBS_CANTRCVMORE) {
-                               head->so_error = ECONNABORTED;
-                               break;
-                       }
-                       error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH,
-                           "accept", 0);
-                       if (error) {
-                               ACCEPT_UNLOCK();
-                               ICL_WARN("msleep failed with error %d", error);
-                               continue;
-                       }
-                       if (ils->ils_disconnecting) {
-                               ACCEPT_UNLOCK();
-                               ICL_DEBUG("terminating");
-                               ils->ils_running = false;
-                               kthread_exit();
-                               return;
-                       }
+               SOLISTEN_LOCK(head);
+               error = solisten_dequeue(head, &so, 0);
+               if (error == ENOTCONN) {
+                       /*
+                        * XXXGL: ENOTCONN is our mark from icl_listen_free().
+                        * Neither socket code, nor msleep(9) may return it.
+                        */
+                       ICL_DEBUG("terminating");
+                       ils->ils_running = false;
+                       kthread_exit();
+                       return;
                }
-               if (head->so_error) {
-                       error = head->so_error;
-                       head->so_error = 0;
-                       ACCEPT_UNLOCK();
-                       ICL_WARN("socket error %d", error);
+               if (error) {
+                       ICL_WARN("solisten_dequeue error %d", error);
                        continue;
                }
-               so = TAILQ_FIRST(&head->so_comp);
-               KASSERT(so != NULL, ("NULL so"));
-               KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP"));
-               KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP"));
-
-               /*
-                * Before changing the flags on the socket, we have to bump the
-                * reference count.  Otherwise, if the protocol calls sofree(),
-                * the socket will be released due to a zero refcount.
-                */
-               SOCK_LOCK(so);                  /* soref() and so_state update */
-               soref(so);                      /* file descriptor reference */
-
-               TAILQ_REMOVE(&head->so_comp, so, so_list);
-               head->so_qlen--;
-               so->so_state |= (head->so_state & SS_NBIO);
-               so->so_qstate &= ~SQ_COMP;
-               so->so_head = NULL;
-
-               SOCK_UNLOCK(so);
-               ACCEPT_UNLOCK();
 
                sa = NULL;
                error = soaccept(so, &sa);

Modified: head/sys/kern/sys_socket.c
==============================================================================
--- head/sys/kern/sys_socket.c  Thu Jun  8 21:06:18 2017        (r319721)
+++ head/sys/kern/sys_socket.c  Thu Jun  8 21:30:34 2017        (r319722)
@@ -170,32 +170,36 @@ soo_ioctl(struct file *fp, u_long cmd, void *data, str
                break;
 
        case FIOASYNC:
-               /*
-                * XXXRW: This code separately acquires SOCK_LOCK(so) and
-                * SOCKBUF_LOCK(&so->so_rcv) even though they are the same
-                * mutex to avoid introducing the assumption that they are
-                * the same.
-                */
                if (*(int *)data) {
                        SOCK_LOCK(so);
                        so->so_state |= SS_ASYNC;
+                       if (SOLISTENING(so)) {
+                               so->sol_sbrcv_flags |= SB_ASYNC;
+                               so->sol_sbsnd_flags |= SB_ASYNC;
+                       } else {
+                               SOCKBUF_LOCK(&so->so_rcv);
+                               so->so_rcv.sb_flags |= SB_ASYNC;
+                               SOCKBUF_UNLOCK(&so->so_rcv);
+                               SOCKBUF_LOCK(&so->so_snd);
+                               so->so_snd.sb_flags |= SB_ASYNC;
+                               SOCKBUF_UNLOCK(&so->so_snd);
+                       }
                        SOCK_UNLOCK(so);
-                       SOCKBUF_LOCK(&so->so_rcv);
-                       so->so_rcv.sb_flags |= SB_ASYNC;
-                       SOCKBUF_UNLOCK(&so->so_rcv);
-                       SOCKBUF_LOCK(&so->so_snd);
-                       so->so_snd.sb_flags |= SB_ASYNC;
-                       SOCKBUF_UNLOCK(&so->so_snd);
                } else {
                        SOCK_LOCK(so);
                        so->so_state &= ~SS_ASYNC;
+                       if (SOLISTENING(so)) {
+                               so->sol_sbrcv_flags &= ~SB_ASYNC;
+                               so->sol_sbsnd_flags &= ~SB_ASYNC;
+                       } else {
+                               SOCKBUF_LOCK(&so->so_rcv);
+                               so->so_rcv.sb_flags &= ~SB_ASYNC;
+                               SOCKBUF_UNLOCK(&so->so_rcv);
+                               SOCKBUF_LOCK(&so->so_snd);
+                               so->so_snd.sb_flags &= ~SB_ASYNC;
+                               SOCKBUF_UNLOCK(&so->so_snd);
+                       }
                        SOCK_UNLOCK(so);
-                       SOCKBUF_LOCK(&so->so_rcv);
-                       so->so_rcv.sb_flags &= ~SB_ASYNC;
-                       SOCKBUF_UNLOCK(&so->so_rcv);
-                       SOCKBUF_LOCK(&so->so_snd);
-                       so->so_snd.sb_flags &= ~SB_ASYNC;
-                       SOCKBUF_UNLOCK(&so->so_snd);
                }
                break;
 
@@ -706,7 +710,6 @@ soaio_process_sb(struct socket *so, struct sockbuf *sb
        sb->sb_flags &= ~SB_AIO_RUNNING;
        SOCKBUF_UNLOCK(sb);
 
-       ACCEPT_LOCK();
        SOCK_LOCK(so);
        sorele(so);
 }

Modified: head/sys/kern/uipc_accf.c
==============================================================================
--- head/sys/kern/uipc_accf.c   Thu Jun  8 21:06:18 2017        (r319721)
+++ head/sys/kern/uipc_accf.c   Thu Jun  8 21:30:34 2017        (r319722)
@@ -173,13 +173,13 @@ accept_filt_getopt(struct socket *so, struct sockopt *
                error = EINVAL;
                goto out;
        }
-       if ((so->so_options & SO_ACCEPTFILTER) == 0) {
+       if (so->sol_accept_filter == NULL) {
                error = EINVAL;
                goto out;
        }
-       strcpy(afap->af_name, so->so_accf->so_accept_filter->accf_name);
-       if (so->so_accf->so_accept_filter_str != NULL)
-               strcpy(afap->af_arg, so->so_accf->so_accept_filter_str);
+       strcpy(afap->af_name, so->sol_accept_filter->accf_name);
+       if (so->sol_accept_filter_str != NULL)
+               strcpy(afap->af_arg, so->sol_accept_filter_str);
 out:
        SOCK_UNLOCK(so);
        if (error == 0)
@@ -193,31 +193,57 @@ accept_filt_setopt(struct socket *so, struct sockopt *
 {
        struct accept_filter_arg *afap;
        struct accept_filter *afp;
-       struct so_accf *newaf;
-       int error = 0;
+       char *accept_filter_str = NULL;
+       void *accept_filter_arg = NULL;
+       int error;
 
        /*
         * Handle the simple delete case first.
         */
        if (sopt == NULL || sopt->sopt_val == NULL) {
+               struct socket *sp, *sp1;
+               int wakeup;
+
                SOCK_LOCK(so);
                if ((so->so_options & SO_ACCEPTCONN) == 0) {
                        SOCK_UNLOCK(so);
                        return (EINVAL);
                }
-               if (so->so_accf != NULL) {
-                       struct so_accf *af = so->so_accf;
-                       if (af->so_accept_filter != NULL &&
-                               af->so_accept_filter->accf_destroy != NULL) {
-                               af->so_accept_filter->accf_destroy(so);
-                       }
-                       if (af->so_accept_filter_str != NULL)
-                               free(af->so_accept_filter_str, M_ACCF);
-                       free(af, M_ACCF);
-                       so->so_accf = NULL;
+               if (so->sol_accept_filter == NULL) {
+                       SOCK_UNLOCK(so);
+                       return (0);
                }
+               if (so->sol_accept_filter->accf_destroy != NULL)
+                       so->sol_accept_filter->accf_destroy(so);
+               if (so->sol_accept_filter_str != NULL)
+                       free(so->sol_accept_filter_str, M_ACCF);
+               so->sol_accept_filter = NULL;
+               so->sol_accept_filter_arg = NULL;
+               so->sol_accept_filter_str = NULL;
                so->so_options &= ~SO_ACCEPTFILTER;
-               SOCK_UNLOCK(so);
+
+               /*
+                * Move from incomplete queue to complete only those
+                * connections, that are blocked by us.
+                */
+               wakeup = 0;
+               TAILQ_FOREACH_SAFE(sp, &so->sol_incomp, so_list, sp1) {
+                       SOCK_LOCK(sp);
+                       if (sp->so_options & SO_ACCEPTFILTER) {
+                               TAILQ_REMOVE(&so->sol_incomp, sp, so_list);
+                               TAILQ_INSERT_TAIL(&so->sol_comp, sp, so_list);
+                               sp->so_qstate = SQ_COMP;
+                               sp->so_options &= ~SO_ACCEPTFILTER;
+                               so->sol_incqlen--;
+                               so->sol_qlen++;
+                               wakeup = 1;
+                       }
+                       SOCK_UNLOCK(sp);
+               }
+               if (wakeup)
+                       solisten_wakeup(so);  /* unlocks */
+               else
+                       SOLISTEN_UNLOCK(so);
                return (0);
        }
 
@@ -238,17 +264,10 @@ accept_filt_setopt(struct socket *so, struct sockopt *
                free(afap, M_TEMP);
                return (ENOENT);
        }
-       /*
-        * Allocate the new accept filter instance storage.  We may
-        * have to free it again later if we fail to attach it.  If
-        * attached properly, 'newaf' is NULLed to avoid a free()
-        * while in use.
-        */
-       newaf = malloc(sizeof(*newaf), M_ACCF, M_WAITOK | M_ZERO);
        if (afp->accf_create != NULL && afap->af_name[0] != '\0') {
                size_t len = strlen(afap->af_name) + 1;
-               newaf->so_accept_filter_str = malloc(len, M_ACCF, M_WAITOK);
-               strcpy(newaf->so_accept_filter_str, afap->af_name);
+               accept_filter_str = malloc(len, M_ACCF, M_WAITOK);
+               strcpy(accept_filter_str, afap->af_name);
        }
 
        /*
@@ -256,8 +275,8 @@ accept_filt_setopt(struct socket *so, struct sockopt *
         * without first removing it.
         */
        SOCK_LOCK(so);
-       if (((so->so_options & SO_ACCEPTCONN) == 0) ||
-           (so->so_accf != NULL)) {
+       if ((so->so_options & SO_ACCEPTCONN) == 0 ||
+           so->sol_accept_filter != NULL) {
                error = EINVAL;
                goto out;
        }
@@ -268,25 +287,20 @@ accept_filt_setopt(struct socket *so, struct sockopt *
         * can't block.
         */
        if (afp->accf_create != NULL) {
-               newaf->so_accept_filter_arg =
-                   afp->accf_create(so, afap->af_arg);
-               if (newaf->so_accept_filter_arg == NULL) {
+               accept_filter_arg = afp->accf_create(so, afap->af_arg);
+               if (accept_filter_arg == NULL) {
                        error = EINVAL;
                        goto out;
                }
        }
-       newaf->so_accept_filter = afp;
-       so->so_accf = newaf;
+       so->sol_accept_filter = afp;
+       so->sol_accept_filter_arg = accept_filter_arg;
+       so->sol_accept_filter_str = accept_filter_str;
        so->so_options |= SO_ACCEPTFILTER;
-       newaf = NULL;
 out:
        SOCK_UNLOCK(so);
-       if (newaf != NULL) {
-               if (newaf->so_accept_filter_str != NULL)
-                       free(newaf->so_accept_filter_str, M_ACCF);
-               free(newaf, M_ACCF);
-       }
-       if (afap != NULL)
-               free(afap, M_TEMP);
+       if (accept_filter_str != NULL)
+               free(accept_filter_str, M_ACCF);
+       free(afap, M_TEMP);
        return (error);
 }

Modified: head/sys/kern/uipc_debug.c
==============================================================================
--- head/sys/kern/uipc_debug.c  Thu Jun  8 21:06:18 2017        (r319721)
+++ head/sys/kern/uipc_debug.c  Thu Jun  8 21:30:34 2017        (r319722)
@@ -448,8 +448,6 @@ db_print_socket(struct socket *so, const char *socketn
        db_printf(")\n");
 
        db_print_indent(indent);
-       db_printf("so_qstate: 0x%x (", so->so_qstate);
-       db_print_soqstate(so->so_qstate);
        db_printf(")   ");
        db_printf("so_pcb: %p   ", so->so_pcb);
        db_printf("so_proto: %p\n", so->so_proto);
@@ -458,24 +456,28 @@ db_print_socket(struct socket *so, const char *socketn
                db_print_protosw(so->so_proto, "so_proto", indent);
 
        db_print_indent(indent);
-       db_printf("so_head: %p   ", so->so_head);
-       db_printf("so_incomp first: %p   ", TAILQ_FIRST(&so->so_incomp));
-       db_printf("so_comp first: %p\n", TAILQ_FIRST(&so->so_comp));
+       if (so->so_options & SO_ACCEPTCONN) {
+               db_printf("sol_incomp first: %p   ",
+                   TAILQ_FIRST(&so->sol_incomp));
+               db_printf("sol_comp first: %p\n", TAILQ_FIRST(&so->sol_comp));
+               db_printf("sol_qlen: %d   ", so->sol_qlen);
+               db_printf("sol_incqlen: %d   ", so->sol_incqlen);
+               db_printf("sol_qlimit: %d   ", so->sol_qlimit);
+       } else {
+               db_printf("so_qstate: 0x%x (", so->so_qstate);
+               db_print_soqstate(so->so_qstate);
+               db_printf("so_listen: %p   ", so->so_listen);
+               /* so_list skipped */
+               db_printf("so_timeo: %d   ", so->so_timeo);
+               db_printf("so_error: %d\n", so->so_error);
 
-       db_print_indent(indent);
-       /* so_list skipped */
-       db_printf("so_qlen: %u   ", so->so_qlen);
-       db_printf("so_incqlen: %u   ", so->so_incqlen);
-       db_printf("so_qlimit: %u   ", so->so_qlimit);
-       db_printf("so_timeo: %d   ", so->so_timeo);
-       db_printf("so_error: %d\n", so->so_error);
+               db_print_indent(indent);
+               db_printf("so_sigio: %p   ", so->so_sigio);
+               db_printf("so_oobmark: %lu   ", so->so_oobmark);
 
-       db_print_indent(indent);
-       db_printf("so_sigio: %p   ", so->so_sigio);
-       db_printf("so_oobmark: %lu   ", so->so_oobmark);
-
-       db_print_sockbuf(&so->so_rcv, "so_rcv", indent);
-       db_print_sockbuf(&so->so_snd, "so_snd", indent);
+               db_print_sockbuf(&so->so_rcv, "so_rcv", indent);
+               db_print_sockbuf(&so->so_snd, "so_snd", indent);
+       }
 }
 
 DB_SHOW_COMMAND(socket, db_show_socket)

Modified: head/sys/kern/uipc_sockbuf.c
==============================================================================
--- head/sys/kern/uipc_sockbuf.c        Thu Jun  8 21:06:18 2017        (r319721)
+++ head/sys/kern/uipc_sockbuf.c        Thu Jun  8 21:30:34 2017        (r319722)
@@ -314,14 +314,14 @@ sowakeup(struct socket *so, struct sockbuf *sb)
 
        SOCKBUF_LOCK_ASSERT(sb);
 
-       selwakeuppri(&sb->sb_sel, PSOCK);
-       if (!SEL_WAITING(&sb->sb_sel))
+       selwakeuppri(sb->sb_sel, PSOCK);
+       if (!SEL_WAITING(sb->sb_sel))
                sb->sb_flags &= ~SB_SEL;
        if (sb->sb_flags & SB_WAIT) {
                sb->sb_flags &= ~SB_WAIT;
                wakeup(&sb->sb_acc);
        }
-       KNOTE_LOCKED(&sb->sb_sel.si_note, 0);
+       KNOTE_LOCKED(&sb->sb_sel->si_note, 0);
        if (sb->sb_upcall != NULL && !(so->so_state & SS_ISDISCONNECTED)) {
                ret = sb->sb_upcall(so, sb->sb_upcallarg, M_NOWAIT);
                if (ret == SU_ISCONNECTED) {

Modified: head/sys/kern/uipc_socket.c
==============================================================================
--- head/sys/kern/uipc_socket.c Thu Jun  8 21:06:18 2017        (r319721)
+++ head/sys/kern/uipc_socket.c Thu Jun  8 21:30:34 2017        (r319722)
@@ -106,6 +106,7 @@ __FBSDID("$FreeBSD$");
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_compat.h"
+#include "opt_sctp.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -154,13 +155,21 @@ __FBSDID("$FreeBSD$");
 
 static int     soreceive_rcvoob(struct socket *so, struct uio *uio,
                    int flags);
+static void    so_rdknl_lock(void *);
+static void    so_rdknl_unlock(void *);
+static void    so_rdknl_assert_locked(void *);
+static void    so_rdknl_assert_unlocked(void *);
+static void    so_wrknl_lock(void *);
+static void    so_wrknl_unlock(void *);
+static void    so_wrknl_assert_locked(void *);
+static void    so_wrknl_assert_unlocked(void *);
 
 static void    filt_sordetach(struct knote *kn);
 static int     filt_soread(struct knote *kn, long hint);
 static void    filt_sowdetach(struct knote *kn);
 static int     filt_sowrite(struct knote *kn, long hint);
-static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id);
 static int     filt_soempty(struct knote *kn, long hint);
+static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id);
 fo_kqfilter_t  soo_kqfilter;
 
 static struct filterops soread_filtops = {
@@ -393,8 +402,16 @@ soalloc(struct vnet *vnet)
                return (NULL);
        }
 
+       /*
+        * The socket locking protocol allows to lock 2 sockets at a time,
+        * however, the first one must be a listening socket.  WITNESS lacks
+        * a feature to change class of an existing lock, so we use DUPOK.
+        */
+       mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK);
        SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
        SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
+       so->so_rcv.sb_sel = &so->so_rdsel;
+       so->so_snd.sb_sel = &so->so_wrsel;
        sx_init(&so->so_snd.sb_sx, "so_snd_sx");
        sx_init(&so->so_rcv.sb_sx, "so_rcv_sx");
        TAILQ_INIT(&so->so_snd.sb_aiojobq);
@@ -450,9 +467,6 @@ sodealloc(struct socket *so)
        if (so->so_snd.sb_hiwat)
                (void)chgsbsize(so->so_cred->cr_uidinfo,
                    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
-       /* remove accept filter if one is present. */
-       if (so->so_accf != NULL)
-               accept_filt_setopt(so, NULL);
 #ifdef MAC
        mac_socket_destroy(so);
 #endif
@@ -460,10 +474,16 @@ sodealloc(struct socket *so)
 
        crfree(so->so_cred);
        khelp_destroy_osd(&so->osd);
-       sx_destroy(&so->so_snd.sb_sx);
-       sx_destroy(&so->so_rcv.sb_sx);
-       SOCKBUF_LOCK_DESTROY(&so->so_snd);
-       SOCKBUF_LOCK_DESTROY(&so->so_rcv);
+       if (SOLISTENING(so)) {
+               if (so->sol_accept_filter != NULL)
+                       accept_filt_setopt(so, NULL);
+       } else {
+               sx_destroy(&so->so_snd.sb_sx);
+               sx_destroy(&so->so_rcv.sb_sx);
+               SOCKBUF_LOCK_DESTROY(&so->so_snd);
+               SOCKBUF_LOCK_DESTROY(&so->so_rcv);
+       }
+       mtx_destroy(&so->so_lock);
        uma_zfree(socket_zone, so);
 }
 
@@ -506,8 +526,6 @@ socreate(int dom, struct socket **aso, int type, int p
        if (so == NULL)
                return (ENOBUFS);
 
-       TAILQ_INIT(&so->so_incomp);
-       TAILQ_INIT(&so->so_comp);
        so->so_type = type;
        so->so_cred = crhold(cred);
        if ((prp->pr_domain->dom_family == PF_INET) ||
@@ -520,9 +538,10 @@ socreate(int dom, struct socket **aso, int type, int p
 #ifdef MAC
        mac_socket_create(cred, so);
 #endif
-       knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
-       knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
-       so->so_count = 1;
+       knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
+           so_rdknl_assert_locked, so_rdknl_assert_unlocked);
+       knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
+           so_wrknl_assert_locked, so_wrknl_assert_unlocked);
        /*
         * Auto-sizing of socket buffers is managed by the protocols and
         * the appropriate flags must be set in the pru_attach function.
@@ -531,12 +550,10 @@ socreate(int dom, struct socket **aso, int type, int p
        error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
        CURVNET_RESTORE();
        if (error) {
-               KASSERT(so->so_count == 1, ("socreate: so_count %d",
-                   so->so_count));
-               so->so_count = 0;
                sodealloc(so);
                return (error);
        }
+       soref(so);
        *aso = so;
        return (0);
 }
@@ -564,11 +581,11 @@ sonewconn(struct socket *head, int connstatus)
        static int overcount;
 
        struct socket *so;
-       int over;
+       u_int over;
 
-       ACCEPT_LOCK();
-       over = (head->so_qlen > 3 * head->so_qlimit / 2);
-       ACCEPT_UNLOCK();
+       SOLISTEN_LOCK(head);
+       over = (head->sol_qlen > 3 * head->sol_qlimit / 2);
+       SOLISTEN_UNLOCK(head);
 #ifdef REGRESSION
        if (regression_sonewconn_earlytest && over) {
 #else
@@ -580,15 +597,15 @@ sonewconn(struct socket *head, int connstatus)
                        log(LOG_DEBUG, "%s: pcb %p: Listen queue overflow: "
                            "%i already in queue awaiting acceptance "
                            "(%d occurrences)\n",
-                           __func__, head->so_pcb, head->so_qlen, overcount);
+                           __func__, head->so_pcb, head->sol_qlen, overcount);
 
                        overcount = 0;
                }
 
                return (NULL);
        }
-       VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p",
-           __func__, __LINE__, head));
+       VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL",
+           __func__, head));
        so = soalloc(head->so_vnet);
        if (so == NULL) {
                log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
@@ -596,11 +613,8 @@ sonewconn(struct socket *head, int connstatus)
                    __func__, head->so_pcb);
                return (NULL);
        }
-       if ((head->so_options & SO_ACCEPTFILTER) != 0)
-               connstatus = 0;
-       so->so_head = head;
+       so->so_listen = head;
        so->so_type = head->so_type;
-       so->so_options = head->so_options &~ SO_ACCEPTCONN;
        so->so_linger = head->so_linger;
        so->so_state = head->so_state | SS_NOFDREF;
        so->so_fibnum = head->so_fibnum;
@@ -609,10 +623,12 @@ sonewconn(struct socket *head, int connstatus)
 #ifdef MAC
        mac_socket_newconn(head, so);
 #endif
-       knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
-       knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
+       knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
+           so_rdknl_assert_locked, so_rdknl_assert_unlocked);
+       knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
+           so_wrknl_assert_locked, so_wrknl_assert_unlocked);
        VNET_SO_ASSERT(head);
-       if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
+       if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) {
                sodealloc(so);
                log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
                    __func__, head->so_pcb);
@@ -624,32 +640,24 @@ sonewconn(struct socket *head, int connstatus)
                    __func__, head->so_pcb);
                return (NULL);
        }
-       so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
-       so->so_snd.sb_lowat = head->so_snd.sb_lowat;
-       so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
-       so->so_snd.sb_timeo = head->so_snd.sb_timeo;
-       so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
-       so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
+       so->so_rcv.sb_lowat = head->sol_sbrcv_lowat;
+       so->so_snd.sb_lowat = head->sol_sbsnd_lowat;
+       so->so_rcv.sb_timeo = head->sol_sbrcv_timeo;
+       so->so_snd.sb_timeo = head->sol_sbsnd_timeo;
+       so->so_rcv.sb_flags |= head->sol_sbrcv_flags & SB_AUTOSIZE;
+       so->so_snd.sb_flags |= head->sol_sbsnd_flags & SB_AUTOSIZE;
+
+       SOLISTEN_LOCK(head);
+       if (head->sol_accept_filter != NULL)
+               connstatus = 0;
        so->so_state |= connstatus;
-       ACCEPT_LOCK();
-       /*
-        * The accept socket may be tearing down but we just
-        * won a race on the ACCEPT_LOCK.
-        * However, if sctp_peeloff() is called on a 1-to-many
-        * style socket, the SO_ACCEPTCONN doesn't need to be set.
-        */
-       if (!(head->so_options & SO_ACCEPTCONN) &&
-           ((head->so_proto->pr_protocol != IPPROTO_SCTP) ||
-            (head->so_type != SOCK_SEQPACKET))) {
-               SOCK_LOCK(so);
-               so->so_head = NULL;
-               sofree(so);             /* NB: returns ACCEPT_UNLOCK'ed. */
-               return (NULL);
-       }
+       so->so_options = head->so_options & ~SO_ACCEPTCONN;
+       soref(head); /* A socket on (in)complete queue refs head. */
        if (connstatus) {
-               TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
-               so->so_qstate |= SQ_COMP;
-               head->so_qlen++;
+               TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list);
+               so->so_qstate = SQ_COMP;
+               head->sol_qlen++;
+               solisten_wakeup(head);  /* unlocks */
        } else {
                /*
                 * Keep removing sockets from the head until there's room for
@@ -658,28 +666,86 @@ sonewconn(struct socket *head, int connstatus)
                 * threads and soabort() requires dropping locks, we must
                 * loop waiting for the condition to be true.
                 */
-               while (head->so_incqlen > head->so_qlimit) {
+               while (head->sol_incqlen > head->sol_qlimit) {
                        struct socket *sp;
-                       sp = TAILQ_FIRST(&head->so_incomp);
-                       TAILQ_REMOVE(&head->so_incomp, sp, so_list);
-                       head->so_incqlen--;
-                       sp->so_qstate &= ~SQ_INCOMP;
-                       sp->so_head = NULL;
-                       ACCEPT_UNLOCK();
+
+                       sp = TAILQ_FIRST(&head->sol_incomp);
+                       TAILQ_REMOVE(&head->sol_incomp, sp, so_list);
+                       head->sol_incqlen--;
+                       SOCK_LOCK(sp);
+                       sp->so_qstate = SQ_NONE;
+                       sp->so_listen = NULL;
+                       SOCK_UNLOCK(sp);
+                       sorele(head);   /* does SOLISTEN_UNLOCK, head stays */
                        soabort(sp);
-                       ACCEPT_LOCK();
+                       SOLISTEN_LOCK(head);
                }
-               TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
-               so->so_qstate |= SQ_INCOMP;
-               head->so_incqlen++;
+               TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list);
+               so->so_qstate = SQ_INCOMP;
+               head->sol_incqlen++;
+               SOLISTEN_UNLOCK(head);
        }
-       ACCEPT_UNLOCK();
-       if (connstatus) {
-               sorwakeup(head);
-               wakeup_one(&head->so_timeo);
+       return (so);
+}
+
+#ifdef SCTP
+/*
+ * Socket part of sctp_peeloff().  Detach a new socket from an
+ * association.  The new socket is returned with a reference.
+ */
+struct socket *
+sopeeloff(struct socket *head)
+{
+       struct socket *so;
+
+       VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p",
+           __func__, __LINE__, head));
+       so = soalloc(head->so_vnet);
+       if (so == NULL) {
+               log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
+                   "limit reached or out of memory\n",
+                   __func__, head->so_pcb);
+               return (NULL);
        }
+       so->so_type = head->so_type;
+       so->so_options = head->so_options;
+       so->so_linger = head->so_linger;
+       so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED;
+       so->so_fibnum = head->so_fibnum;
+       so->so_proto = head->so_proto;
+       so->so_cred = crhold(head->so_cred);
+#ifdef MAC
+       mac_socket_newconn(head, so);
+#endif
+       knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
+           so_rdknl_assert_locked, so_rdknl_assert_unlocked);
+       knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
+           so_wrknl_assert_locked, so_wrknl_assert_unlocked);
+       VNET_SO_ASSERT(head);
+       if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
+               sodealloc(so);
+               log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
+                   __func__, head->so_pcb);
+               return (NULL);
+       }
+       if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
+               sodealloc(so);
+               log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n",
+                   __func__, head->so_pcb);
+               return (NULL);
+       }
+       so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
+       so->so_snd.sb_lowat = head->so_snd.sb_lowat;
+       so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
+       so->so_snd.sb_timeo = head->so_snd.sb_timeo;
+       so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
+       so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
+
+       soref(so);
+
        return (so);
 }
+#endif /* SCTP */
 
 int
 sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
@@ -741,16 +807,140 @@ solisten_proto_check(struct socket *so)
 void
 solisten_proto(struct socket *so, int backlog)
 {
+       int sbrcv_lowat, sbsnd_lowat;
+       u_int sbrcv_hiwat, sbsnd_hiwat;
+       short sbrcv_flags, sbsnd_flags;
+       sbintime_t sbrcv_timeo, sbsnd_timeo;
 
        SOCK_LOCK_ASSERT(so);
 
+       if (SOLISTENING(so))
+               goto listening;
+
+       /*
+        * Change this socket to listening state.
+        */
+       sbrcv_lowat = so->so_rcv.sb_lowat;
+       sbsnd_lowat = so->so_snd.sb_lowat;
+       sbrcv_hiwat = so->so_rcv.sb_hiwat;
+       sbsnd_hiwat = so->so_snd.sb_hiwat;
+       sbrcv_flags = so->so_rcv.sb_flags;
+       sbsnd_flags = so->so_snd.sb_flags;
+       sbrcv_timeo = so->so_rcv.sb_timeo;
+       sbsnd_timeo = so->so_snd.sb_timeo;
+
+       sbdestroy(&so->so_snd, so);
+       sbdestroy(&so->so_rcv, so);
+       sx_destroy(&so->so_snd.sb_sx);
+       sx_destroy(&so->so_rcv.sb_sx);
+       SOCKBUF_LOCK_DESTROY(&so->so_snd);
+       SOCKBUF_LOCK_DESTROY(&so->so_rcv);
+
+#ifdef INVARIANTS
+       bzero(&so->so_rcv,
+           sizeof(struct socket) - offsetof(struct socket, so_rcv));
+#endif
+
+       so->sol_sbrcv_lowat = sbrcv_lowat;
+       so->sol_sbsnd_lowat = sbsnd_lowat;
+       so->sol_sbrcv_hiwat = sbrcv_hiwat;
+       so->sol_sbsnd_hiwat = sbsnd_hiwat;
+       so->sol_sbrcv_flags = sbrcv_flags;
+       so->sol_sbsnd_flags = sbsnd_flags;
+       so->sol_sbrcv_timeo = sbrcv_timeo;
+       so->sol_sbsnd_timeo = sbsnd_timeo;
+
+       so->sol_qlen = so->sol_incqlen = 0;
+       TAILQ_INIT(&so->sol_incomp);
+       TAILQ_INIT(&so->sol_comp);
+
+       so->sol_accept_filter = NULL;
+       so->sol_accept_filter_arg = NULL;
+       so->sol_accept_filter_str = NULL;
+
+       so->so_options |= SO_ACCEPTCONN;
+
+listening:
        if (backlog < 0 || backlog > somaxconn)
                backlog = somaxconn;
-       so->so_qlimit = backlog;
-       so->so_options |= SO_ACCEPTCONN;
+       so->sol_qlimit = backlog;
 }
 
 /*
+ * Wake up listeners/subsystems once we have a complete connection.
+ * Enters with the listening socket locked, returns with it unlocked.
+ */
+void
+solisten_wakeup(struct socket *sol)
+{
+
+       if (sol->sol_upcall != NULL)
+               (void )sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT);
+       else {
+               selwakeuppri(&sol->so_rdsel, PSOCK);
+               KNOTE_LOCKED(&sol->so_rdsel.si_note, 0);
+       }
+       SOLISTEN_UNLOCK(sol);
+       wakeup_one(&sol->sol_comp);
+}
+
+/*
+ * Return a single connection off a listening socket's queue.  The main
+ * consumer of the function is kern_accept4().  Some modules that do their
+ * own accept management also use the function.
+ *
+ * The listening socket must be locked on entry and is unlocked on
+ * return.
+ * The flags argument is a set of accept4(2) flags and ACCEPT4_INHERIT.
+ */
+int
+solisten_dequeue(struct socket *head, struct socket **ret, int flags)
+{
+       struct socket *so;
+       int error;
+
+       SOLISTEN_LOCK_ASSERT(head);
+
+       while (!(head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp) &&
+           head->so_error == 0) {
+               error = msleep(&head->sol_comp, &head->so_lock, PSOCK | PCATCH,
+                   "accept", 0);
+               if (error != 0) {
+                       SOLISTEN_UNLOCK(head);
+                       return (error);
+               }
+       }
+       if (head->so_error) {
+               error = head->so_error;
+               head->so_error = 0;
+               SOLISTEN_UNLOCK(head);
+               return (error);
+        }
+       if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp)) {
+               SOLISTEN_UNLOCK(head);
+               return (EWOULDBLOCK);
+       }
+       so = TAILQ_FIRST(&head->sol_comp);
+       SOCK_LOCK(so);
+       KASSERT(so->so_qstate == SQ_COMP,
+           ("%s: so %p not SQ_COMP", __func__, so));
+       soref(so);
+       head->sol_qlen--;
+       so->so_qstate = SQ_NONE;
+       so->so_listen = NULL;
+       TAILQ_REMOVE(&head->sol_comp, so, so_list);
+       if (flags & ACCEPT4_INHERIT)
+               so->so_state |= (head->so_state & SS_NBIO);
+       else
+               so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0;
+       SOCK_UNLOCK(so);
+       sorele(head);
+
+       *ret = so;
+       return (0);
+}
+
+/*
  * Evaluate the reference count and named references on a socket; if no
  * references remain, free it.  This should be called whenever a reference is
  * released, such as in sorele(), but also when named reference flags are
@@ -774,44 +964,62 @@ void
 sofree(struct socket *so)
 {
        struct protosw *pr = so->so_proto;
-       struct socket *head;
 
-       ACCEPT_LOCK_ASSERT();
        SOCK_LOCK_ASSERT(so);
 
        if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
-           (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
+           (so->so_state & SS_PROTOREF) || (so->so_qstate == SQ_COMP)) {
                SOCK_UNLOCK(so);
-               ACCEPT_UNLOCK();
                return;
        }
 
-       head = so->so_head;
-       if (head != NULL) {
-               KASSERT((so->so_qstate & SQ_COMP) != 0 ||
-                   (so->so_qstate & SQ_INCOMP) != 0,
-                   ("sofree: so_head != NULL, but neither SQ_COMP nor "
-                   "SQ_INCOMP"));
-               KASSERT((so->so_qstate & SQ_COMP) == 0 ||
-                   (so->so_qstate & SQ_INCOMP) == 0,
-                   ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
-               TAILQ_REMOVE(&head->so_incomp, so, so_list);
-               head->so_incqlen--;
-               so->so_qstate &= ~SQ_INCOMP;
-               so->so_head = NULL;
+       if (!SOLISTENING(so) && so->so_qstate == SQ_INCOMP) {
+               struct socket *sol;
+
+               sol = so->so_listen;
+               KASSERT(sol, ("%s: so %p on incomp of NULL", __func__, so));
+
+               /*
+                * To solve race between close of a listening socket and
+                * a socket on its incomplete queue, we need to lock both.
+                * The order is first listening socket, then regular.
+                * Since so has no file descriptor reference (SS_NOFDREF) and
+                * no protocol reference (SS_PROTOREF), this function and the
+                * listening socket hold the only pointers to so.  To preserve
+                * so and sol, we reference both and then relock.
+                * After relock the socket may not move to so_comp since it
+                * doesn't have PCB already, but it may be removed from
+                * so_incomp. If that happens, we share responsibility on
+                * freeing the socket, but soclose() has already removed

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
_______________________________________________
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"

Reply via email to