Diff below adds a 'struct socket *' argument to sounlock() in order to prepare the stack for per-socket locks.
That means sofree() will now unlock a given socket before freeing it. But since we do not want to release the NET_LOCK() when processing incoming TCP packets, in_pcbdetach() needs special treatment. That's also true for unp_drop() as long as Unix sockets require the KERNEL_LOCK(). This is on top of my previous diff to reduce the number of sofree() calls. Comments? OKs? diff --git sys/kern/sys_socket.c sys/kern/sys_socket.c index 916c33a0c1a..a754a7b2698 100644 --- sys/kern/sys_socket.c +++ sys/kern/sys_socket.c @@ -88,7 +88,7 @@ soo_ioctl(struct file *fp, u_long cmd, caddr_t data, struct proc *p) so->so_state |= SS_NBIO; else so->so_state &= ~SS_NBIO; - sounlock(s); + sounlock(so, s); break; case FIOASYNC: @@ -102,7 +102,7 @@ soo_ioctl(struct file *fp, u_long cmd, caddr_t data, struct proc *p) so->so_rcv.sb_flags &= ~SB_ASYNC; so->so_snd.sb_flags &= ~SB_ASYNC; } - sounlock(s); + sounlock(so, s); break; case FIONREAD: @@ -176,7 +176,7 @@ soo_poll(struct file *fp, int events, struct proc *p) so->so_snd.sb_flags |= SB_SEL; } } - sounlock(s); + sounlock(so, s); return (revents); } @@ -197,7 +197,7 @@ soo_stat(struct file *fp, struct stat *ub, struct proc *p) ub->st_gid = so->so_egid; (void) ((*so->so_proto->pr_usrreq)(so, PRU_SENSE, (struct mbuf *)ub, NULL, NULL, p)); - sounlock(s); + sounlock(so, s); return (0); } diff --git sys/kern/uipc_socket.c sys/kern/uipc_socket.c index 211966c79c8..aa789d403cc 100644 --- sys/kern/uipc_socket.c +++ sys/kern/uipc_socket.c @@ -142,11 +142,11 @@ socreate(int dom, struct socket **aso, int type, int proto) error = (*prp->pr_attach)(so, proto); if (error) { so->so_state |= SS_NOFDREF; - sofree(so); - sounlock(s); + /* sofree() calls sounlock(). 
*/ + sofree(so, s); return (error); } - sounlock(s); + sounlock(so, s); *aso = so; return (0); } @@ -177,7 +177,7 @@ solisten(struct socket *so, int backlog) error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, NULL, NULL, NULL, curproc); if (error) { - sounlock(s); + sounlock(so, s); return (error); } if (TAILQ_FIRST(&so->so_q) == NULL) @@ -187,25 +187,29 @@ solisten(struct socket *so, int backlog) if (backlog < sominconn) backlog = sominconn; so->so_qlimit = backlog; - sounlock(s); + sounlock(so, s); return (0); } void -sofree(struct socket *so) +sofree(struct socket *so, int s) { soassertlocked(so); - if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) + if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) { + sounlock(so, s); return; + } if (so->so_head) { /* * We must not decommission a socket that's on the accept(2) * queue. If we do, then accept(2) may hang after select(2) * indicated that the listening socket was ready. */ - if (!soqremque(so, 0)) + if (!soqremque(so, 0)) { + sounlock(so, s); return; + } } #ifdef SOCKET_SPLICE if (so->so_sp) { @@ -218,6 +222,7 @@ sofree(struct socket *so) #endif /* SOCKET_SPLICE */ sbrelease(so, &so->so_snd); sorflush(so); + sounlock(so, s); #ifdef SOCKET_SPLICE if (so->so_sp) { /* Reuse splice idle, sounsplice() has been called before. */ @@ -284,8 +289,8 @@ drop: discard: KASSERT((so->so_state & SS_NOFDREF) == 0); so->so_state |= SS_NOFDREF; - sofree(so); - sounlock(s); + /* sofree() calls sounlock(). 
*/ + sofree(so, s); return (error); } @@ -349,7 +354,7 @@ soconnect2(struct socket *so1, struct socket *so2) s = solock(so1); error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2, NULL, (struct mbuf *)so2, NULL, curproc); - sounlock(s); + sounlock(so1, s); return (error); } @@ -478,7 +483,7 @@ restart: if (flags & MSG_EOR) top->m_flags |= M_EOR; } else { - sounlock(s); + sounlock(so, s); error = m_getuio(&top, atomic, space, uio); s = solock(so); if (error) @@ -507,7 +512,7 @@ release: so->so_state &= ~SS_ISSENDING; sbunlock(so, &so->so_snd); out: - sounlock(s); + sounlock(so, s); m_freem(top); m_freem(control); return (error); @@ -661,7 +666,7 @@ soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio, s = solock(so); error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m, (struct mbuf *)(long)(flags & MSG_PEEK), NULL, curproc); - sounlock(s); + sounlock(so, s); if (error) goto bad; do { @@ -679,7 +684,7 @@ bad: s = solock(so); restart: if ((error = sblock(so, &so->so_rcv, SBLOCKWAIT(flags))) != 0) { - sounlock(s); + sounlock(so, s); return (error); } @@ -747,7 +752,7 @@ restart: sbunlock(so, &so->so_rcv); error = sbwait(so, &so->so_rcv); if (error) { - sounlock(s); + sounlock(so, s); return (error); } goto restart; @@ -883,7 +888,7 @@ dontblock: SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove"); SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove"); resid = uio->uio_resid; - sounlock(s); + sounlock(so, s); uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio); s = solock(so); if (uio_error) @@ -967,7 +972,7 @@ dontblock: error = sbwait(so, &so->so_rcv); if (error) { sbunlock(so, &so->so_rcv); - sounlock(s); + sounlock(so, s); return (0); } if ((m = so->so_rcv.sb_mb) != NULL) @@ -1013,7 +1018,7 @@ dontblock: *flagsp |= flags; release: sbunlock(so, &so->so_rcv); - sounlock(s); + sounlock(so, s); return (error); } @@ -1039,7 +1044,7 @@ soshutdown(struct socket *so, int how) error = EINVAL; break; } - sounlock(s); + sounlock(so, s); return (error); } @@ -1218,7 +1223,7 @@ 
soidle(void *arg) so->so_error = ETIMEDOUT; sounsplice(so, so->so_sp->ssp_socket, 1); } - sounlock(s); + sounlock(so, s); } void @@ -1236,7 +1241,7 @@ sotask(void *arg) */ somove(so, M_DONTWAIT); } - sounlock(s); + sounlock(so, s); /* Avoid user land starvation. */ yield(); diff --git sys/kern/uipc_socket2.c sys/kern/uipc_socket2.c index 8bb11fd97a1..0cb8b6dc98f 100644 --- sys/kern/uipc_socket2.c +++ sys/kern/uipc_socket2.c @@ -277,25 +277,38 @@ solock(struct socket *so) { int s = 0; - if ((so->so_proto->pr_domain->dom_family != PF_UNIX) && - (so->so_proto->pr_domain->dom_family != PF_ROUTE) && - (so->so_proto->pr_domain->dom_family != PF_KEY)) + switch (so->so_proto->pr_domain->dom_family) { + case PF_INET: + case PF_INET6: + s = -42; NET_LOCK(); - else { + break; + case PF_UNIX: + case PF_ROUTE: + case PF_KEY: + default: KERNEL_LOCK(); - s = -42; + break; } return (s); } void -sounlock(int s) +sounlock(struct socket *so, int s) { - if (s != -42) - NET_UNLOCK(); - else { + switch (so->so_proto->pr_domain->dom_family) { + case PF_INET: + case PF_INET6: + if (s == -42) + NET_UNLOCK(); + break; + case PF_UNIX: + case PF_ROUTE: + case PF_KEY: + default: KERNEL_UNLOCK(); + break; } } diff --git sys/kern/uipc_syscalls.c sys/kern/uipc_syscalls.c index 1c23bb59091..a6a6aee173d 100644 --- sys/kern/uipc_syscalls.c +++ sys/kern/uipc_syscalls.c @@ -209,7 +209,7 @@ sys_bind(struct proc *p, void *v, register_t *retval) #endif s = solock(so); error = sobind(so, nam, p); - sounlock(s); + sounlock(so, s); m_freem(nam); out: FRELE(fp, p); @@ -351,7 +351,7 @@ out: so->so_state |= SS_NBIO; else so->so_state &= ~SS_NBIO; - sounlock(s); + sounlock(head, s); fdplock(fdp); fp->f_data = so; fdinsert(fdp, tmpfd, cloexec, fp); @@ -359,7 +359,7 @@ out: FRELE(fp, p); *retval = tmpfd; } else { - sounlock(s); + sounlock(head, s); fdplock(fdp); fdremove(fdp, tmpfd); closef(fp, p); @@ -437,7 +437,7 @@ bad: if (!interrupted) so->so_state &= ~SS_ISCONNECTING; out: - sounlock(s); + sounlock(so, s); 
FRELE(fp, p); m_freem(nam); if (error == ERESTART) @@ -1000,7 +1000,7 @@ sys_setsockopt(struct proc *p, void *v, register_t *retval) so = fp->f_data; s = solock(so); error = sosetopt(so, SCARG(uap, level), SCARG(uap, name), m); - sounlock(s); + sounlock(so, s); bad: m_freem(m); FRELE(fp, p); @@ -1039,7 +1039,7 @@ sys_getsockopt(struct proc *p, void *v, register_t *retval) so = fp->f_data; s = solock(so); error = sogetopt(so, SCARG(uap, level), SCARG(uap, name), m); - sounlock(s); + sounlock(so, s); if (error == 0 && SCARG(uap, val) && valsize && m != NULL) { if (valsize > m->m_len) valsize = m->m_len; @@ -1083,7 +1083,7 @@ sys_getsockname(struct proc *p, void *v, register_t *retval) m = m_getclr(M_WAIT, MT_SONAME); s = solock(so); error = (*so->so_proto->pr_usrreq)(so, PRU_SOCKADDR, 0, m, 0, p); - sounlock(s); + sounlock(so, s); if (error) goto bad; error = copyaddrout(p, m, SCARG(uap, asa), len, SCARG(uap, alen)); @@ -1126,7 +1126,7 @@ sys_getpeername(struct proc *p, void *v, register_t *retval) m = m_getclr(M_WAIT, MT_SONAME); s = solock(so); error = (*so->so_proto->pr_usrreq)(so, PRU_PEERADDR, 0, m, 0, p); - sounlock(s); + sounlock(so, s); if (error) goto bad; error = copyaddrout(p, m, SCARG(uap, asa), len, SCARG(uap, alen)); diff --git sys/kern/uipc_usrreq.c sys/kern/uipc_usrreq.c index 5d95208adc6..81a64a8554a 100644 --- sys/kern/uipc_usrreq.c +++ sys/kern/uipc_usrreq.c @@ -612,11 +612,19 @@ unp_drop(struct unpcb *unp, int errno) { struct socket *so = unp->unp_socket; + KERNEL_ASSERT_LOCKED(); + so->so_error = errno; unp_disconnect(unp); if (so->so_head) { so->so_pcb = NULL; - sofree(so); + /* + * sofree() releases the socket lock, so we need to + * grab it beforehand as long as Unix sockets rely on + * the KERNEL_LOCK(); + */ + KERNEL_LOCK(); + sofree(so, 0); m_freem(unp->unp_addr); free(unp, M_PCB, sizeof *unp); } diff --git sys/miscfs/fifofs/fifo_vnops.c sys/miscfs/fifofs/fifo_vnops.c index 03a5677a05d..472bfd408a0 100644 --- sys/miscfs/fifofs/fifo_vnops.c 
+++ sys/miscfs/fifofs/fifo_vnops.c @@ -170,7 +170,7 @@ fifo_open(void *v) fip->fi_writers++; if ((ap->a_mode & O_NONBLOCK) && fip->fi_readers == 0) { error = ENXIO; - sounlock(s); + sounlock(wso, s); goto bad; } if (fip->fi_writers == 1) { @@ -179,7 +179,7 @@ fifo_open(void *v) wakeup(&fip->fi_readers); } } - sounlock(s); + sounlock(wso, s); if ((ap->a_mode & O_NONBLOCK) == 0) { if ((ap->a_mode & FREAD) && fip->fi_writers == 0) { VOP_UNLOCK(vp); @@ -334,7 +334,7 @@ fifo_poll(void *v) wso->so_snd.sb_flags |= SB_SEL; } } - sounlock(s); + sounlock(rso, s); return (revents); } @@ -369,7 +369,7 @@ fifo_close(void *v) s = solock(wso); socantsendmore(wso); - sounlock(s); + sounlock(wso, s); } } if (ap->a_fflag & FWRITE) { @@ -380,7 +380,7 @@ fifo_close(void *v) /* SS_ISDISCONNECTED will result in POLLHUP */ rso->so_state |= SS_ISDISCONNECTED; socantrcvmore(rso); - sounlock(s); + sounlock(rso, s); } } if (fip->fi_readers == 0 && fip->fi_writers == 0) { diff --git sys/net/bfd.c sys/net/bfd.c index e3e557e0d37..8bcfb305c99 100644 --- sys/net/bfd.c +++ sys/net/bfd.c @@ -611,7 +611,7 @@ bfd_sender(struct bfd_config *bfd, unsigned int port) s = solock(so); error = soconnect(so, m); - sounlock(s); + sounlock(so, s); if (error && error != ECONNREFUSED) { printf("%s: soconnect error %d\n", __func__, error); diff --git sys/net/if_pflow.c sys/net/if_pflow.c index 9ba382d5069..1b1c2a4e33e 100644 --- sys/net/if_pflow.c +++ sys/net/if_pflow.c @@ -442,7 +442,7 @@ pflow_set(struct pflow_softc *sc, struct pflowreq *pflowr) s = solock(so); error = sobind(so, m, p); - sounlock(s); + sounlock(so, s); m_freem(m); if (error) { soclose(so); diff --git sys/netinet/in_pcb.c sys/netinet/in_pcb.c index 413f9de9df5..62bb9af000c 100644 --- sys/netinet/in_pcb.c +++ sys/netinet/in_pcb.c @@ -584,8 +584,13 @@ in_pcbdetach(struct inpcb *inp) NET_ASSERT_LOCKED(); - so->so_pcb = 0; - sofree(so); + so->so_pcb = NULL; + /* + * As long as the NET_LOCK() is the default lock for Internet + * sockets, do not 
release it to not introduce new sleeping + * points. + */ + sofree(so, 0); m_freem(inp->inp_options); if (inp->inp_route.ro_rt) { rtfree(inp->inp_route.ro_rt); diff --git sys/nfs/krpc_subr.c sys/nfs/krpc_subr.c index 346ff9ec989..e487867db1b 100644 --- sys/nfs/krpc_subr.c +++ sys/nfs/krpc_subr.c @@ -241,7 +241,7 @@ krpc_call(struct sockaddr_in *sa, u_int prog, u_int vers, u_int func, m->m_len = sizeof(tv); s = solock(so); error = sosetopt(so, SOL_SOCKET, SO_RCVTIMEO, m); - sounlock(s); + sounlock(so, s); m_freem(m); if (error) goto out; @@ -257,7 +257,7 @@ krpc_call(struct sockaddr_in *sa, u_int prog, u_int vers, u_int func, *on = 1; s = solock(so); error = sosetopt(so, SOL_SOCKET, SO_BROADCAST, m); - sounlock(s); + sounlock(so, s); m_freem(m); if (error) goto out; @@ -274,7 +274,7 @@ krpc_call(struct sockaddr_in *sa, u_int prog, u_int vers, u_int func, *ip = IP_PORTRANGE_LOW; s = solock(so); error = sosetopt(so, IPPROTO_IP, IP_PORTRANGE, mopt); - sounlock(s); + sounlock(so, s); m_freem(mopt); if (error) goto out; @@ -288,7 +288,7 @@ krpc_call(struct sockaddr_in *sa, u_int prog, u_int vers, u_int func, sin->sin_port = htons(0); s = solock(so); error = sobind(so, m, &proc0); - sounlock(s); + sounlock(so, s); m_freem(m); if (error) { printf("bind failed\n"); @@ -301,7 +301,7 @@ krpc_call(struct sockaddr_in *sa, u_int prog, u_int vers, u_int func, *ip = IP_PORTRANGE_DEFAULT; s = solock(so); error = sosetopt(so, IPPROTO_IP, IP_PORTRANGE, mopt); - sounlock(s); + sounlock(so, s); m_freem(mopt); if (error) goto out; diff --git sys/nfs/nfs_socket.c sys/nfs/nfs_socket.c index 383db1cb930..0119f135ef2 100644 --- sys/nfs/nfs_socket.c +++ sys/nfs/nfs_socket.c @@ -365,7 +365,7 @@ nfs_connect(struct nfsmount *nmp, struct nfsreq *rep) goto bad; so->so_rcv.sb_flags |= SB_NOINTR; so->so_snd.sb_flags |= SB_NOINTR; - sounlock(s); + sounlock(so, s); m_freem(mopt); m_freem(nam); @@ -378,7 +378,7 @@ nfs_connect(struct nfsmount *nmp, struct nfsreq *rep) return (0); bad: - sounlock(s); + 
sounlock(so, s); m_freem(mopt); m_freem(nam); diff --git sys/nfs/nfs_syscalls.c sys/nfs/nfs_syscalls.c index 527a61a37de..6b501cc419f 100644 --- sys/nfs/nfs_syscalls.c +++ sys/nfs/nfs_syscalls.c @@ -250,7 +250,7 @@ nfssvc_addsock(struct file *fp, struct mbuf *mynam) s = solock(so); error = soreserve(so, siz, siz); if (error) { - sounlock(s); + sounlock(so, s); m_freem(mynam); return (error); } @@ -279,7 +279,7 @@ nfssvc_addsock(struct file *fp, struct mbuf *mynam) so->so_rcv.sb_timeo = 0; so->so_snd.sb_flags &= ~SB_NOINTR; so->so_snd.sb_timeo = 0; - sounlock(s); + sounlock(so, s); if (tslp) slp = tslp; else { diff --git sys/sys/socketvar.h sys/sys/socketvar.h index 097ae3a4ab9..ce8bbdf9f2d 100644 --- sys/sys/socketvar.h +++ sys/sys/socketvar.h @@ -311,7 +311,7 @@ int soconnect(struct socket *so, struct mbuf *nam); int soconnect2(struct socket *so1, struct socket *so2); int socreate(int dom, struct socket **aso, int type, int proto); int sodisconnect(struct socket *so); -void sofree(struct socket *so); +void sofree(struct socket *so, int); int sogetopt(struct socket *so, int level, int optname, struct mbuf *m); void sohasoutofband(struct socket *so); void soisconnected(struct socket *so); @@ -338,7 +338,7 @@ int sockargs(struct mbuf **, const void *, size_t, int); int sosleep(struct socket *, void *, int, const char *, int); int solock(struct socket *); -void sounlock(int); +void sounlock(struct socket *, int); int sendit(struct proc *, int, struct msghdr *, int, register_t *); int recvit(struct proc *, int, struct msghdr *, caddr_t,