Diff below adds a 'struct socket *' argument to sounlock() in order to
prepare the stack for per-socket locks.

That means sofree() will now unlock a given socket before freeing it.
But since we do not want to release the NET_LOCK() when processing
incoming TCP packets, in_pcbdetach() needs special treatment.  That's
also true for unp_drop() as long as Unix sockets require the
KERNEL_LOCK().

This is on top of my previous diff to reduce the number of sofree().

Comments?  Oks?

diff --git sys/kern/sys_socket.c sys/kern/sys_socket.c
index 916c33a0c1a..a754a7b2698 100644
--- sys/kern/sys_socket.c
+++ sys/kern/sys_socket.c
@@ -88,7 +88,7 @@ soo_ioctl(struct file *fp, u_long cmd, caddr_t data, struct proc *p)
                        so->so_state |= SS_NBIO;
                else
                        so->so_state &= ~SS_NBIO;
-               sounlock(s);
+               sounlock(so, s);
                break;
 
        case FIOASYNC:
@@ -102,7 +102,7 @@ soo_ioctl(struct file *fp, u_long cmd, caddr_t data, struct proc *p)
                        so->so_rcv.sb_flags &= ~SB_ASYNC;
                        so->so_snd.sb_flags &= ~SB_ASYNC;
                }
-               sounlock(s);
+               sounlock(so, s);
                break;
 
        case FIONREAD:
@@ -176,7 +176,7 @@ soo_poll(struct file *fp, int events, struct proc *p)
                        so->so_snd.sb_flags |= SB_SEL;
                }
        }
-       sounlock(s);
+       sounlock(so, s);
        return (revents);
 }
 
@@ -197,7 +197,7 @@ soo_stat(struct file *fp, struct stat *ub, struct proc *p)
        ub->st_gid = so->so_egid;
        (void) ((*so->so_proto->pr_usrreq)(so, PRU_SENSE,
            (struct mbuf *)ub, NULL, NULL, p));
-       sounlock(s);
+       sounlock(so, s);
        return (0);
 }
 
diff --git sys/kern/uipc_socket.c sys/kern/uipc_socket.c
index 211966c79c8..aa789d403cc 100644
--- sys/kern/uipc_socket.c
+++ sys/kern/uipc_socket.c
@@ -142,11 +142,11 @@ socreate(int dom, struct socket **aso, int type, int proto)
        error = (*prp->pr_attach)(so, proto);
        if (error) {
                so->so_state |= SS_NOFDREF;
-               sofree(so);
-               sounlock(s);
+               /* sofree() calls sounlock(). */
+               sofree(so, s);
                return (error);
        }
-       sounlock(s);
+       sounlock(so, s);
        *aso = so;
        return (0);
 }
@@ -177,7 +177,7 @@ solisten(struct socket *so, int backlog)
        error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, NULL, NULL, NULL,
            curproc);
        if (error) {
-               sounlock(s);
+               sounlock(so, s);
                return (error);
        }
        if (TAILQ_FIRST(&so->so_q) == NULL)
@@ -187,25 +187,29 @@ solisten(struct socket *so, int backlog)
        if (backlog < sominconn)
                backlog = sominconn;
        so->so_qlimit = backlog;
-       sounlock(s);
+       sounlock(so, s);
        return (0);
 }
 
 void
-sofree(struct socket *so)
+sofree(struct socket *so, int s)
 {
        soassertlocked(so);
 
-       if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
+       if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) {
+               sounlock(so, s);
                return;
+       }
        if (so->so_head) {
                /*
                 * We must not decommission a socket that's on the accept(2)
                 * queue.  If we do, then accept(2) may hang after select(2)
                 * indicated that the listening socket was ready.
                 */
-               if (!soqremque(so, 0))
+               if (!soqremque(so, 0)) {
+                       sounlock(so, s);
                        return;
+               }
        }
 #ifdef SOCKET_SPLICE
        if (so->so_sp) {
@@ -218,6 +222,7 @@ sofree(struct socket *so)
 #endif /* SOCKET_SPLICE */
        sbrelease(so, &so->so_snd);
        sorflush(so);
+       sounlock(so, s);
 #ifdef SOCKET_SPLICE
        if (so->so_sp) {
                /* Reuse splice idle, sounsplice() has been called before. */
@@ -284,8 +289,8 @@ drop:
 discard:
        KASSERT((so->so_state & SS_NOFDREF) == 0);
        so->so_state |= SS_NOFDREF;
-       sofree(so);
-       sounlock(s);
+       /* sofree() calls sounlock(). */
+       sofree(so, s);
        return (error);
 }
 
@@ -349,7 +354,7 @@ soconnect2(struct socket *so1, struct socket *so2)
        s = solock(so1);
        error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2, NULL,
            (struct mbuf *)so2, NULL, curproc);
-       sounlock(s);
+       sounlock(so1, s);
        return (error);
 }
 
@@ -478,7 +483,7 @@ restart:
                                if (flags & MSG_EOR)
                                        top->m_flags |= M_EOR;
                        } else {
-                               sounlock(s);
+                               sounlock(so, s);
                                error = m_getuio(&top, atomic, space, uio);
                                s = solock(so);
                                if (error)
@@ -507,7 +512,7 @@ release:
        so->so_state &= ~SS_ISSENDING;
        sbunlock(so, &so->so_snd);
 out:
-       sounlock(s);
+       sounlock(so, s);
        m_freem(top);
        m_freem(control);
        return (error);
@@ -661,7 +666,7 @@ soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
                s = solock(so);
                error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m,
                    (struct mbuf *)(long)(flags & MSG_PEEK), NULL, curproc);
-               sounlock(s);
+               sounlock(so, s);
                if (error)
                        goto bad;
                do {
@@ -679,7 +684,7 @@ bad:
        s = solock(so);
 restart:
        if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0) {
-               sounlock(s);
+               sounlock(so, s);
                return (error);
        }
 
@@ -747,7 +752,7 @@ restart:
                sbunlock(so, &so->so_rcv);
                error = sbwait(so, &so->so_rcv);
                if (error) {
-                       sounlock(s);
+                       sounlock(so, s);
                        return (error);
                }
                goto restart;
@@ -883,7 +888,7 @@ dontblock:
                        SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
                        SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
                        resid = uio->uio_resid;
-                       sounlock(s);
+                       sounlock(so, s);
                        uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio);
                        s = solock(so);
                        if (uio_error)
@@ -967,7 +972,7 @@ dontblock:
                        error = sbwait(so, &so->so_rcv);
                        if (error) {
                                sbunlock(so, &so->so_rcv);
-                               sounlock(s);
+                               sounlock(so, s);
                                return (0);
                        }
                        if ((m = so->so_rcv.sb_mb) != NULL)
@@ -1013,7 +1018,7 @@ dontblock:
                *flagsp |= flags;
 release:
        sbunlock(so, &so->so_rcv);
-       sounlock(s);
+       sounlock(so, s);
        return (error);
 }
 
@@ -1039,7 +1044,7 @@ soshutdown(struct socket *so, int how)
                error = EINVAL;
                break;
        }
-       sounlock(s);
+       sounlock(so, s);
 
        return (error);
 }
@@ -1218,7 +1223,7 @@ soidle(void *arg)
                so->so_error = ETIMEDOUT;
                sounsplice(so, so->so_sp->ssp_socket, 1);
        }
-       sounlock(s);
+       sounlock(so, s);
 }
 
 void
@@ -1236,7 +1241,7 @@ sotask(void *arg)
                 */
                somove(so, M_DONTWAIT);
        }
-       sounlock(s);
+       sounlock(so, s);
 
        /* Avoid user land starvation. */
        yield();
diff --git sys/kern/uipc_socket2.c sys/kern/uipc_socket2.c
index 8bb11fd97a1..0cb8b6dc98f 100644
--- sys/kern/uipc_socket2.c
+++ sys/kern/uipc_socket2.c
@@ -277,25 +277,38 @@ solock(struct socket *so)
 {
        int s = 0;
 
-       if ((so->so_proto->pr_domain->dom_family != PF_UNIX) &&
-           (so->so_proto->pr_domain->dom_family != PF_ROUTE) &&
-           (so->so_proto->pr_domain->dom_family != PF_KEY))
+       switch (so->so_proto->pr_domain->dom_family) {
+       case PF_INET:
+       case PF_INET6:
+               s = -42;
                NET_LOCK();
-       else {
+               break;
+       case PF_UNIX:
+       case PF_ROUTE:
+       case PF_KEY:
+       default:
                KERNEL_LOCK();
-               s = -42;
+               break;
        }
 
        return (s);
 }
 
 void
-sounlock(int s)
+sounlock(struct socket *so, int s)
 {
-       if (s != -42)
-               NET_UNLOCK();
-       else {
+       switch (so->so_proto->pr_domain->dom_family) {
+       case PF_INET:
+       case PF_INET6:
+               if (s == -42)
+                       NET_UNLOCK();
+               break;
+       case PF_UNIX:
+       case PF_ROUTE:
+       case PF_KEY:
+       default:
                KERNEL_UNLOCK();
+               break;
        }
 }
 
diff --git sys/kern/uipc_syscalls.c sys/kern/uipc_syscalls.c
index 1c23bb59091..a6a6aee173d 100644
--- sys/kern/uipc_syscalls.c
+++ sys/kern/uipc_syscalls.c
@@ -209,7 +209,7 @@ sys_bind(struct proc *p, void *v, register_t *retval)
 #endif
        s = solock(so);
        error = sobind(so, nam, p);
-       sounlock(s);
+       sounlock(so, s);
        m_freem(nam);
 out:
        FRELE(fp, p);
@@ -351,7 +351,7 @@ out:
                        so->so_state |= SS_NBIO;
                else
                        so->so_state &= ~SS_NBIO;
-               sounlock(s);
+               sounlock(head, s);
                fdplock(fdp);
                fp->f_data = so;
                fdinsert(fdp, tmpfd, cloexec, fp);
@@ -359,7 +359,7 @@ out:
                FRELE(fp, p);
                *retval = tmpfd;
        } else {
-               sounlock(s);
+               sounlock(head, s);
                fdplock(fdp);
                fdremove(fdp, tmpfd);
                closef(fp, p);
@@ -437,7 +437,7 @@ bad:
        if (!interrupted)
                so->so_state &= ~SS_ISCONNECTING;
 out:
-       sounlock(s);
+       sounlock(so, s);
        FRELE(fp, p);
        m_freem(nam);
        if (error == ERESTART)
@@ -1000,7 +1000,7 @@ sys_setsockopt(struct proc *p, void *v, register_t *retval)
        so = fp->f_data;
        s = solock(so);
        error = sosetopt(so, SCARG(uap, level), SCARG(uap, name), m);
-       sounlock(s);
+       sounlock(so, s);
 bad:
        m_freem(m);
        FRELE(fp, p);
@@ -1039,7 +1039,7 @@ sys_getsockopt(struct proc *p, void *v, register_t *retval)
        so = fp->f_data;
        s = solock(so);
        error = sogetopt(so, SCARG(uap, level), SCARG(uap, name), m);
-       sounlock(s);
+       sounlock(so, s);
        if (error == 0 && SCARG(uap, val) && valsize && m != NULL) {
                if (valsize > m->m_len)
                        valsize = m->m_len;
@@ -1083,7 +1083,7 @@ sys_getsockname(struct proc *p, void *v, register_t *retval)
        m = m_getclr(M_WAIT, MT_SONAME);
        s = solock(so);
        error = (*so->so_proto->pr_usrreq)(so, PRU_SOCKADDR, 0, m, 0, p);
-       sounlock(s);
+       sounlock(so, s);
        if (error)
                goto bad;
        error = copyaddrout(p, m, SCARG(uap, asa), len, SCARG(uap, alen));
@@ -1126,7 +1126,7 @@ sys_getpeername(struct proc *p, void *v, register_t *retval)
        m = m_getclr(M_WAIT, MT_SONAME);
        s = solock(so);
        error = (*so->so_proto->pr_usrreq)(so, PRU_PEERADDR, 0, m, 0, p);
-       sounlock(s);
+       sounlock(so, s);
        if (error)
                goto bad;
        error = copyaddrout(p, m, SCARG(uap, asa), len, SCARG(uap, alen));
diff --git sys/kern/uipc_usrreq.c sys/kern/uipc_usrreq.c
index 5d95208adc6..81a64a8554a 100644
--- sys/kern/uipc_usrreq.c
+++ sys/kern/uipc_usrreq.c
@@ -612,11 +612,19 @@ unp_drop(struct unpcb *unp, int errno)
 {
        struct socket *so = unp->unp_socket;
 
+       KERNEL_ASSERT_LOCKED();
+
        so->so_error = errno;
        unp_disconnect(unp);
        if (so->so_head) {
                so->so_pcb = NULL;
-               sofree(so);
+               /*
+                * sofree() releases the socket lock, so we need to
+                * grab it beforehand as long as Unix sockets rely on
+                * the KERNEL_LOCK();
+                */
+               KERNEL_LOCK();
+               sofree(so, 0);
                m_freem(unp->unp_addr);
                free(unp, M_PCB, sizeof *unp);
        }
diff --git sys/miscfs/fifofs/fifo_vnops.c sys/miscfs/fifofs/fifo_vnops.c
index 03a5677a05d..472bfd408a0 100644
--- sys/miscfs/fifofs/fifo_vnops.c
+++ sys/miscfs/fifofs/fifo_vnops.c
@@ -170,7 +170,7 @@ fifo_open(void *v)
                fip->fi_writers++;
                if ((ap->a_mode & O_NONBLOCK) && fip->fi_readers == 0) {
                        error = ENXIO;
-                       sounlock(s);
+                       sounlock(wso, s);
                        goto bad;
                }
                if (fip->fi_writers == 1) {
@@ -179,7 +179,7 @@ fifo_open(void *v)
                                wakeup(&fip->fi_readers);
                }
        }
-       sounlock(s);
+       sounlock(wso, s);
        if ((ap->a_mode & O_NONBLOCK) == 0) {
                if ((ap->a_mode & FREAD) && fip->fi_writers == 0) {
                        VOP_UNLOCK(vp);
@@ -334,7 +334,7 @@ fifo_poll(void *v)
                        wso->so_snd.sb_flags |= SB_SEL;
                }
        }
-       sounlock(s);
+       sounlock(rso, s);
        return (revents);
 }
 
@@ -369,7 +369,7 @@ fifo_close(void *v)
 
                        s = solock(wso);
                        socantsendmore(wso);
-                       sounlock(s);
+                       sounlock(wso, s);
                }
        }
        if (ap->a_fflag & FWRITE) {
@@ -380,7 +380,7 @@ fifo_close(void *v)
                        /* SS_ISDISCONNECTED will result in POLLHUP */
                        rso->so_state |= SS_ISDISCONNECTED;
                        socantrcvmore(rso);
-                       sounlock(s);
+                       sounlock(rso, s);
                }
        }
        if (fip->fi_readers == 0 && fip->fi_writers == 0) {
diff --git sys/net/bfd.c sys/net/bfd.c
index e3e557e0d37..8bcfb305c99 100644
--- sys/net/bfd.c
+++ sys/net/bfd.c
@@ -611,7 +611,7 @@ bfd_sender(struct bfd_config *bfd, unsigned int port)
 
        s = solock(so);
        error = soconnect(so, m);
-       sounlock(s);
+       sounlock(so, s);
        if (error && error != ECONNREFUSED) {
                printf("%s: soconnect error %d\n",
                    __func__, error);
diff --git sys/net/if_pflow.c sys/net/if_pflow.c
index 9ba382d5069..1b1c2a4e33e 100644
--- sys/net/if_pflow.c
+++ sys/net/if_pflow.c
@@ -442,7 +442,7 @@ pflow_set(struct pflow_softc *sc, struct pflowreq *pflowr)
 
                                s = solock(so);
                                error = sobind(so, m, p);
-                               sounlock(s);
+                               sounlock(so, s);
                                m_freem(m);
                                if (error) {
                                        soclose(so);
diff --git sys/netinet/in_pcb.c sys/netinet/in_pcb.c
index 413f9de9df5..62bb9af000c 100644
--- sys/netinet/in_pcb.c
+++ sys/netinet/in_pcb.c
@@ -584,8 +584,13 @@ in_pcbdetach(struct inpcb *inp)
 
        NET_ASSERT_LOCKED();
 
-       so->so_pcb = 0;
-       sofree(so);
+       so->so_pcb = NULL;
+       /*
+        * As long as the NET_LOCK() is the default lock for Internet
+        * sockets, do not release it to not introduce new sleeping
+        * points.
+        */
+       sofree(so, 0);
        m_freem(inp->inp_options);
        if (inp->inp_route.ro_rt) {
                rtfree(inp->inp_route.ro_rt);
diff --git sys/nfs/krpc_subr.c sys/nfs/krpc_subr.c
index 346ff9ec989..e487867db1b 100644
--- sys/nfs/krpc_subr.c
+++ sys/nfs/krpc_subr.c
@@ -241,7 +241,7 @@ krpc_call(struct sockaddr_in *sa, u_int prog, u_int vers, u_int func,
        m->m_len = sizeof(tv);
        s = solock(so);
        error = sosetopt(so, SOL_SOCKET, SO_RCVTIMEO, m);
-       sounlock(s);
+       sounlock(so, s);
        m_freem(m);
        if (error)
                goto out;
@@ -257,7 +257,7 @@ krpc_call(struct sockaddr_in *sa, u_int prog, u_int vers, u_int func,
                *on = 1;
                s = solock(so);
                error = sosetopt(so, SOL_SOCKET, SO_BROADCAST, m);
-               sounlock(s);
+               sounlock(so, s);
                m_freem(m);
                if (error)
                        goto out;
@@ -274,7 +274,7 @@ krpc_call(struct sockaddr_in *sa, u_int prog, u_int vers, u_int func,
        *ip = IP_PORTRANGE_LOW;
        s = solock(so);
        error = sosetopt(so, IPPROTO_IP, IP_PORTRANGE, mopt);
-       sounlock(s);
+       sounlock(so, s);
        m_freem(mopt);
        if (error)
                goto out;
@@ -288,7 +288,7 @@ krpc_call(struct sockaddr_in *sa, u_int prog, u_int vers, u_int func,
        sin->sin_port = htons(0);
        s = solock(so);
        error = sobind(so, m, &proc0);
-       sounlock(s);
+       sounlock(so, s);
        m_freem(m);
        if (error) {
                printf("bind failed\n");
@@ -301,7 +301,7 @@ krpc_call(struct sockaddr_in *sa, u_int prog, u_int vers, u_int func,
        *ip = IP_PORTRANGE_DEFAULT;
        s = solock(so);
        error = sosetopt(so, IPPROTO_IP, IP_PORTRANGE, mopt);
-       sounlock(s);
+       sounlock(so, s);
        m_freem(mopt);
        if (error)
                goto out;
diff --git sys/nfs/nfs_socket.c sys/nfs/nfs_socket.c
index 383db1cb930..0119f135ef2 100644
--- sys/nfs/nfs_socket.c
+++ sys/nfs/nfs_socket.c
@@ -365,7 +365,7 @@ nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
                goto bad;
        so->so_rcv.sb_flags |= SB_NOINTR;
        so->so_snd.sb_flags |= SB_NOINTR;
-       sounlock(s);
+       sounlock(so, s);
 
        m_freem(mopt);
        m_freem(nam);
@@ -378,7 +378,7 @@ nfs_connect(struct nfsmount *nmp, struct nfsreq *rep)
        return (0);
 
 bad:
-       sounlock(s);
+       sounlock(so, s);
 
        m_freem(mopt);
        m_freem(nam);
diff --git sys/nfs/nfs_syscalls.c sys/nfs/nfs_syscalls.c
index 527a61a37de..6b501cc419f 100644
--- sys/nfs/nfs_syscalls.c
+++ sys/nfs/nfs_syscalls.c
@@ -250,7 +250,7 @@ nfssvc_addsock(struct file *fp, struct mbuf *mynam)
        s = solock(so);
        error = soreserve(so, siz, siz); 
        if (error) {
-               sounlock(s);
+               sounlock(so, s);
                m_freem(mynam);
                return (error);
        }
@@ -279,7 +279,7 @@ nfssvc_addsock(struct file *fp, struct mbuf *mynam)
        so->so_rcv.sb_timeo = 0;
        so->so_snd.sb_flags &= ~SB_NOINTR;
        so->so_snd.sb_timeo = 0;
-       sounlock(s);
+       sounlock(so, s);
        if (tslp)
                slp = tslp;
        else {
diff --git sys/sys/socketvar.h sys/sys/socketvar.h
index 097ae3a4ab9..ce8bbdf9f2d 100644
--- sys/sys/socketvar.h
+++ sys/sys/socketvar.h
@@ -311,7 +311,7 @@ int soconnect(struct socket *so, struct mbuf *nam);
 int    soconnect2(struct socket *so1, struct socket *so2);
 int    socreate(int dom, struct socket **aso, int type, int proto);
 int    sodisconnect(struct socket *so);
-void   sofree(struct socket *so);
+void   sofree(struct socket *so, int);
 int    sogetopt(struct socket *so, int level, int optname, struct mbuf *m);
 void   sohasoutofband(struct socket *so);
 void   soisconnected(struct socket *so);
@@ -338,7 +338,7 @@ int sockargs(struct mbuf **, const void *, size_t, int);
 
 int    sosleep(struct socket *, void *, int, const char *, int);
 int    solock(struct socket *);
-void   sounlock(int);
+void   sounlock(struct socket *, int);
 
 int    sendit(struct proc *, int, struct msghdr *, int, register_t *);
 int    recvit(struct proc *, int, struct msghdr *, caddr_t,

Reply via email to