On 26/07/21(Mon) 09:23, Martin Pieuchot wrote:
> On 26/07/21(Mon) 08:55, Martin Pieuchot wrote:
> > On 21/07/21(Wed) 10:18, Martin Pieuchot wrote:
> > > On 11/07/21(Sun) 14:45, Visa Hankala wrote:
> > > > On Sat, Jul 10, 2021 at 05:26:57PM +0200, Martin Pieuchot wrote:
> > > > > One of the reasons for the drop of performance in the kqueue-based
> > > > > poll/select is the fact that kqueue filters are called up to 3 times
> > > > > per syscall and that they all spin on the NET_LOCK() for TCP/UDP
> > > > > packets.
> > > > >
> > > > > The diff below is an RFC for improving the situation.
> > > > >
> > > > > Socket kqueue filters mainly check for the amount of available items
> > > > > to read/write.  This involves comparing various socket buffer fields
> > > > > (sb_cc, sb_lowat, etc).  The diff below introduces a new mutex to
> > > > > serialize updates of those fields with reads in the kqueue filters.
> > > > >
> > > > > Since these fields are always modified with the socket lock held,
> > > > > either the mutex or the solock is enough to have a coherent view of
> > > > > them.  Note that either of these locks is necessary only if multiple
> > > > > fields have to be read (like in sbspace()).
> > > > >
> > > > > Other per-socket fields accessed in the kqueue filters are never
> > > > > combined (with &&) to determine a condition.  So assuming it is fine
> > > > > to read register-sized fields w/o the socket lock we can safely
> > > > > remove it there.
> > > > >
> > > > > Could such a mutex also be used to serialize klist updates?
> > > >
> > > > I think the lock should be such that it can serialize socket klists.
> > > >
> > > > As the main motivator for this change is kqueue, the viability of using
> > > > the mutex for the klist locking should be checked now.  The mutex has
> > > > to be held whenever calling KNOTE() on sb_sel.si_note, or selwakeup()
> > > > on sb_sel.  Then the socket f_event callbacks will not need to lock the
> > > > mutex themselves.
> > > >
> > > > I had a diff that serialized socket klists using solock().  It did not
> > > > work well because it increased lock contention, especially when using
> > > > kqueue as backend for poll(2) and select(2).  The diff is not even
> > > > correct any longer since recent changes to socket locking have
> > > > introduced new lock order constraints that conflict with it.
> > >
> > > Updated diff below does that.  It also uses a single per-socket mutex as
> > > suggested by bluhm@.
> > >
> > > Note that as long as poll(2) & select(2) use the current implementation
> > > a KERNEL_LOCK()/UNLOCK() dance is necessary in sowakeup().  The goal of
> > > this change combined with the poll/select rewrite is to get rid of this
> > > dance.
> >
> > Updated diff after recent commits.  More comments?  Oks?
>
> Previous diff had a double mtx_enter() in filt_fifowrite_common(); this
> one uses the *locked() version of sbspace() to prevent it.
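To restate the locking rule being discussed, here is a minimal sketch.  It is
illustrative only and not part of the diff below; sb_write_side() and
sb_filter_side() are made-up names, everything else mirrors the patch.

	/* Writer: solock is already held, take the mutex around the update. */
	void
	sb_write_side(struct socket *so, struct mbuf *m)
	{
		soassertlocked(so);
		mtx_enter(&so->so_mtx);
		so->so_rcv.sb_cc += m->m_len;
		so->so_rcv.sb_mbcnt += MSIZE;
		mtx_leave(&so->so_mtx);
	}

	/* Reader (kqueue filter): the mutex alone gives a coherent view. */
	int
	sb_filter_side(struct socket *so)
	{
		int ready;

		mtx_enter(&so->so_mtx);
		ready = (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat);
		mtx_leave(&so->so_mtx);

		return (ready);
	}

Since every writer holds both the solock and the mutex, a reader holding
either one sees consistent values; one of the locks only needs to span the
whole check when more than one field is combined, as in sbspace().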
New diff fixing a locking dance pointed out by visa@.

Index: kern/uipc_socket.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.264
diff -u -p -r1.264 uipc_socket.c
--- kern/uipc_socket.c	26 Jul 2021 05:51:13 -0000	1.264
+++ kern/uipc_socket.c	29 Jul 2021 07:31:32 -0000
@@ -84,7 +84,7 @@ int filt_solistenprocess(struct knote *k
 int filt_solisten_common(struct knote *kn, struct socket *so);
 
 const struct filterops solisten_filtops = {
-	.f_flags = FILTEROP_ISFD,
+	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
 	.f_attach = NULL,
 	.f_detach = filt_sordetach,
 	.f_event = filt_solisten,
@@ -93,7 +93,7 @@ const struct filterops solisten_filtops
 };
 
 const struct filterops soread_filtops = {
-	.f_flags = FILTEROP_ISFD,
+	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
 	.f_attach = NULL,
 	.f_detach = filt_sordetach,
 	.f_event = filt_soread,
@@ -102,7 +102,7 @@ const struct filterops soread_filtops =
 };
 
 const struct filterops sowrite_filtops = {
-	.f_flags = FILTEROP_ISFD,
+	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
 	.f_attach = NULL,
 	.f_detach = filt_sowdetach,
 	.f_event = filt_sowrite,
@@ -111,7 +111,7 @@ const struct filterops sowrite_filtops =
 };
 
 const struct filterops soexcept_filtops = {
-	.f_flags = FILTEROP_ISFD,
+	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
 	.f_attach = NULL,
 	.f_detach = filt_sordetach,
 	.f_event = filt_soread,
@@ -181,6 +181,9 @@ socreate(int dom, struct socket **aso, i
 	so->so_egid = p->p_ucred->cr_gid;
 	so->so_cpid = p->p_p->ps_pid;
 	so->so_proto = prp;
+	mtx_init(&so->so_mtx, IPL_MPFLOOR);
+	klist_init_mutex(&so->so_snd.sb_sel.si_note, &so->so_mtx);
+	klist_init_mutex(&so->so_rcv.sb_sel.si_note, &so->so_mtx);
 	so->so_snd.sb_timeo_nsecs = INFSLP;
 	so->so_rcv.sb_timeo_nsecs = INFSLP;
 
@@ -276,7 +279,9 @@ sofree(struct socket *so, int s)
 		}
 	}
 #endif /* SOCKET_SPLICE */
+	mtx_enter(&so->so_mtx);
 	sbrelease(so, &so->so_snd);
+	mtx_leave(&so->so_mtx);
 	sorflush(so);
 	sounlock(so, s);
 #ifdef SOCKET_SPLICE
@@ -1019,8 +1024,10 @@ dontblock:
 					*mp = m_copym(m, 0, len, M_WAIT);
 				m->m_data += len;
 				m->m_len -= len;
+				mtx_enter(&so->so_mtx);
 				so->so_rcv.sb_cc -= len;
 				so->so_rcv.sb_datacc -= len;
+				mtx_leave(&so->so_mtx);
 			}
 		}
 		if (so->so_oobmark) {
@@ -1537,8 +1544,10 @@ somove(struct socket *so, int wait)
 		}
 		so->so_rcv.sb_mb->m_data += size;
 		so->so_rcv.sb_mb->m_len -= size;
+		mtx_enter(&so->so_mtx);
 		so->so_rcv.sb_cc -= size;
 		so->so_rcv.sb_datacc -= size;
+		mtx_leave(&so->so_mtx);
 	} else {
 		*mp = so->so_rcv.sb_mb;
 		sbfree(so, &so->so_rcv, *mp);
@@ -1777,30 +1786,40 @@ sosetopt(struct socket *so, int level, i
 		case SO_SNDBUF:
 			if (so->so_state & SS_CANTSENDMORE)
 				return (EINVAL);
+			mtx_enter(&so->so_mtx);
 			if (sbcheckreserve(cnt, so->so_snd.sb_wat) ||
 			    sbreserve(so, &so->so_snd, cnt))
-				return (ENOBUFS);
-			so->so_snd.sb_wat = cnt;
+				error = ENOBUFS;
+			if (error == 0)
+				so->so_snd.sb_wat = cnt;
+			mtx_leave(&so->so_mtx);
 			break;
 
 		case SO_RCVBUF:
 			if (so->so_state & SS_CANTRCVMORE)
 				return (EINVAL);
+			mtx_enter(&so->so_mtx);
 			if (sbcheckreserve(cnt, so->so_rcv.sb_wat) ||
 			    sbreserve(so, &so->so_rcv, cnt))
-				return (ENOBUFS);
-			so->so_rcv.sb_wat = cnt;
+				error = ENOBUFS;
+			if (error == 0)
+				so->so_rcv.sb_wat = cnt;
+			mtx_leave(&so->so_mtx);
 			break;
 
 		case SO_SNDLOWAT:
+			mtx_enter(&so->so_mtx);
 			so->so_snd.sb_lowat =
 			    (cnt > so->so_snd.sb_hiwat) ?
 			    so->so_snd.sb_hiwat : cnt;
+			mtx_leave(&so->so_mtx);
 			break;
 		case SO_RCVLOWAT:
+			mtx_enter(&so->so_mtx);
 			so->so_rcv.sb_lowat =
 			    (cnt > so->so_rcv.sb_hiwat) ?
 			    so->so_rcv.sb_hiwat : cnt;
+			mtx_leave(&so->so_mtx);
 			break;
 		}
 		break;
@@ -2028,7 +2047,12 @@ void
 sohasoutofband(struct socket *so)
 {
 	pgsigio(&so->so_sigio, SIGURG, 0);
+	/* XXX KERNEL_LOCK() needed because of legacy poll/select */
+	KERNEL_LOCK();
+	mtx_enter(&so->so_mtx);
 	selwakeup(&so->so_rcv.sb_sel);
+	mtx_leave(&so->so_mtx);
+	KERNEL_UNLOCK();
 }
 
 int
@@ -2037,8 +2061,6 @@ soo_kqfilter(struct file *fp, struct kno
 	struct socket *so = kn->kn_fp->f_data;
 	struct sockbuf *sb;
 
-	KERNEL_ASSERT_LOCKED();
-
 	switch (kn->kn_filter) {
 	case EVFILT_READ:
 		if (so->so_options & SO_ACCEPTCONN)
@@ -2059,7 +2081,7 @@ soo_kqfilter(struct file *fp, struct kno
 		return (EINVAL);
 	}
 
-	klist_insert_locked(&sb->sb_sel.si_note, kn);
+	klist_insert(&sb->sb_sel.si_note, kn);
 
 	return (0);
 }
@@ -2069,17 +2091,16 @@ filt_sordetach(struct knote *kn)
 {
 	struct socket *so = kn->kn_fp->f_data;
 
-	KERNEL_ASSERT_LOCKED();
-
-	klist_remove_locked(&so->so_rcv.sb_sel.si_note, kn);
+	klist_remove(&so->so_rcv.sb_sel.si_note, kn);
 }
 
 int
 filt_soread_common(struct knote *kn, struct socket *so)
 {
+	u_int sostate = so->so_state;
 	int rv = 0;
 
-	soassertlocked(so);
+	MUTEX_ASSERT_LOCKED(&so->so_mtx);
 
 	kn->kn_data = so->so_rcv.sb_cc;
 #ifdef SOCKET_SPLICE
@@ -2088,15 +2109,17 @@ filt_soread_common(struct knote *kn, str
 	} else
 #endif /* SOCKET_SPLICE */
 	if (kn->kn_sfflags & NOTE_OOB) {
-		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
+		u_long oobmark = so->so_oobmark;
+
+		if (oobmark || (sostate & SS_RCVATMARK)) {
 			kn->kn_fflags |= NOTE_OOB;
-			kn->kn_data -= so->so_oobmark;
+			kn->kn_data -= oobmark;
 			rv = 1;
 		}
-	} else if (so->so_state & SS_CANTRCVMORE) {
+	} else if (sostate & SS_CANTRCVMORE) {
 		kn->kn_flags |= EV_EOF;
 		if (kn->kn_flags & __EV_POLL) {
-			if (so->so_state & SS_ISDISCONNECTED)
+			if (sostate & SS_ISDISCONNECTED)
 				kn->kn_flags |= __EV_HUP;
 		}
 		kn->kn_fflags = so->so_error;
@@ -2124,12 +2147,12 @@ int
 filt_soreadmodify(struct kevent *kev, struct knote *kn)
 {
 	struct socket *so = kn->kn_fp->f_data;
-	int rv, s;
+	int rv;
 
-	s = solock(so);
+	mtx_enter(&so->so_mtx);
 	knote_modify(kev, kn);
 	rv = filt_soread_common(kn, so);
-	sounlock(so, s);
+	mtx_leave(&so->so_mtx);
 
 	return (rv);
 }
@@ -2138,16 +2161,16 @@ int
 filt_soreadprocess(struct knote *kn, struct kevent *kev)
 {
 	struct socket *so = kn->kn_fp->f_data;
-	int rv, s;
+	int rv;
 
-	s = solock(so);
+	mtx_enter(&so->so_mtx);
 	if (kev != NULL && (kn->kn_flags & EV_ONESHOT))
 		rv = 1;
 	else
 		rv = filt_soread_common(kn, so);
 	if (rv != 0)
 		knote_submit(kn, kev);
-	sounlock(so, s);
+	mtx_leave(&so->so_mtx);
 
 	return (rv);
 }
@@ -2157,30 +2180,29 @@ filt_sowdetach(struct knote *kn)
 {
 	struct socket *so = kn->kn_fp->f_data;
 
-	KERNEL_ASSERT_LOCKED();
-
-	klist_remove_locked(&so->so_snd.sb_sel.si_note, kn);
+	klist_remove(&so->so_snd.sb_sel.si_note, kn);
 }
 
 int
 filt_sowrite_common(struct knote *kn, struct socket *so)
 {
+	u_int sostate = so->so_state;
 	int rv;
 
-	soassertlocked(so);
+	MUTEX_ASSERT_LOCKED(&so->so_mtx);
 
-	kn->kn_data = sbspace(so, &so->so_snd);
-	if (so->so_state & SS_CANTSENDMORE) {
+	kn->kn_data = sbspace_locked(so, &so->so_snd);
+	if (sostate & SS_CANTSENDMORE) {
 		kn->kn_flags |= EV_EOF;
 		if (kn->kn_flags & __EV_POLL) {
-			if (so->so_state & SS_ISDISCONNECTED)
+			if (sostate & SS_ISDISCONNECTED)
 				kn->kn_flags |= __EV_HUP;
 		}
 		kn->kn_fflags = so->so_error;
 		rv = 1;
 	} else if (so->so_error) {	/* temporary udp error */
 		rv = 1;
-	} else if (((so->so_state & SS_ISCONNECTED) == 0) &&
+	} else if (((sostate & SS_ISCONNECTED) == 0) &&
 	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
 		rv = 0;
 	} else if (kn->kn_sfflags & NOTE_LOWAT) {
@@ -2204,12 +2226,12 @@ int
 filt_sowritemodify(struct kevent *kev, struct knote *kn)
 {
 	struct socket *so = kn->kn_fp->f_data;
-	int rv, s;
+	int rv;
 
-	s = solock(so);
+	mtx_enter(&so->so_mtx);
 	knote_modify(kev, kn);
 	rv = filt_sowrite_common(kn, so);
-	sounlock(so, s);
+	mtx_leave(&so->so_mtx);
 
 	return (rv);
 }
@@ -2218,16 +2240,16 @@ int
 filt_sowriteprocess(struct knote *kn, struct kevent *kev)
 {
 	struct socket *so = kn->kn_fp->f_data;
-	int rv, s;
+	int rv;
 
-	s = solock(so);
+	mtx_enter(&so->so_mtx);
 	if (kev != NULL && (kn->kn_flags & EV_ONESHOT))
 		rv = 1;
 	else
 		rv = filt_sowrite_common(kn, so);
 	if (rv != 0)
 		knote_submit(kn, kev);
-	sounlock(so, s);
+	mtx_leave(&so->so_mtx);
 
 	return (rv);
 }
@@ -2235,8 +2257,6 @@ filt_sowriteprocess(struct knote *kn, st
 int
 filt_solisten_common(struct knote *kn, struct socket *so)
 {
-	soassertlocked(so);
-
 	kn->kn_data = so->so_qlen;
 
 	return (kn->kn_data != 0);
@@ -2254,12 +2274,12 @@ int
 filt_solistenmodify(struct kevent *kev, struct knote *kn)
 {
 	struct socket *so = kn->kn_fp->f_data;
-	int rv, s;
+	int rv;
 
-	s = solock(so);
+	mtx_enter(&so->so_mtx);
 	knote_modify(kev, kn);
 	rv = filt_solisten_common(kn, so);
-	sounlock(so, s);
+	mtx_leave(&so->so_mtx);
 
 	return (rv);
 }
@@ -2268,16 +2288,16 @@ int
 filt_solistenprocess(struct knote *kn, struct kevent *kev)
 {
 	struct socket *so = kn->kn_fp->f_data;
-	int rv, s;
+	int rv;
 
-	s = solock(so);
+	mtx_enter(&so->so_mtx);
 	if (kev != NULL && (kn->kn_flags & EV_ONESHOT))
 		rv = 1;
 	else
 		rv = filt_solisten_common(kn, so);
 	if (rv != 0)
 		knote_submit(kn, kev);
-	sounlock(so, s);
+	mtx_leave(&so->so_mtx);
 
 	return (rv);
 }
Index: kern/uipc_socket2.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket2.c,v
retrieving revision 1.113
diff -u -p -r1.113 uipc_socket2.c
--- kern/uipc_socket2.c	26 Jul 2021 05:51:13 -0000	1.113
+++ kern/uipc_socket2.c	26 Jul 2021 05:52:29 -0000
@@ -34,7 +34,6 @@
 
 #include <sys/param.h>
 #include <sys/systm.h>
-#include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/protosw.h>
 #include <sys/domain.h>
@@ -163,6 +162,9 @@ sonewconn(struct socket *head, int conns
 	if (so == NULL)
 		return (NULL);
 	rw_init(&so->so_lock, "solock");
+	mtx_init(&so->so_mtx, IPL_MPFLOOR);
+	klist_init_mutex(&so->so_snd.sb_sel.si_note, &so->so_mtx);
+	klist_init_mutex(&so->so_rcv.sb_sel.si_note, &so->so_mtx);
 	so->so_type = head->so_type;
 	so->so_options = head->so_options &~ SO_ACCEPTCONN;
 	so->so_linger = head->so_linger;
@@ -423,7 +425,12 @@ sowakeup(struct socket *so, struct sockb
 	}
 	if (sb->sb_flags & SB_ASYNC)
 		pgsigio(&so->so_sigio, SIGIO, 0);
+	/* XXX KERNEL_LOCK() needed because of legacy poll/select */
+	KERNEL_LOCK();
+	mtx_enter(&so->so_mtx);
 	selwakeup(&sb->sb_sel);
+	mtx_leave(&so->so_mtx);
+	KERNEL_UNLOCK();
 }
 
 /*
@@ -463,6 +470,7 @@ soreserve(struct socket *so, u_long sndc
 {
 	soassertlocked(so);
+	mtx_enter(&so->so_mtx);
 	if (sbreserve(so, &so->so_snd, sndcc))
 		goto bad;
 	if (sbreserve(so, &so->so_rcv, rcvcc))
 		goto bad2;
@@ -475,10 +483,12 @@ soreserve(struct socket *so, u_long sndc
 		so->so_snd.sb_lowat = MCLBYTES;
 	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
 		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
+	mtx_leave(&so->so_mtx);
 	return (0);
 bad2:
 	sbrelease(so, &so->so_snd);
 bad:
+	mtx_leave(&so->so_mtx);
 	return (ENOBUFS);
 }
 
@@ -492,6 +502,7 @@ sbreserve(struct socket *so, struct sock
 {
 	KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
 	soassertlocked(so);
+	MUTEX_ASSERT_LOCKED(&so->so_mtx);
 
 	if (cc == 0 || cc > sb_max)
 		return (1);
@@ -533,6 +544,7 @@ sbchecklowmem(void)
 void
 sbrelease(struct socket *so, struct sockbuf *sb)
 {
+	MUTEX_ASSERT_LOCKED(&so->so_mtx);
 
 	sbflush(so, sb);
 	sb->sb_hiwat = sb->sb_mbmax = 0;
@@ -686,6 +698,7 @@ sbcheck(struct socket *so, struct sockbu
 	struct mbuf *m, *n;
 	u_long len = 0, mbcnt = 0;
 
+	mtx_enter(&so->so_mtx);
 	for (m = sb->sb_mb; m; m = m->m_nextpkt) {
 		for (n = m; n; n = n->m_next) {
 			len += n->m_len;
@@ -701,6 +714,7 @@ sbcheck(struct socket *so, struct sockbu
 		    mbcnt, sb->sb_mbcnt);
 		panic("sbcheck");
 	}
+	mtx_leave(&so->so_mtx);
 }
 #endif
 
@@ -860,9 +874,11 @@ sbcompress(struct socket *so, struct soc
 			memcpy(mtod(n, caddr_t) + n->m_len, mtod(m, caddr_t),
 			    m->m_len);
 			n->m_len += m->m_len;
+			mtx_enter(&so->so_mtx);
 			sb->sb_cc += m->m_len;
 			if (m->m_type != MT_CONTROL && m->m_type != MT_SONAME)
 				sb->sb_datacc += m->m_len;
+			mtx_leave(&so->so_mtx);
 			m = m_free(m);
 			continue;
 		}
@@ -895,6 +911,7 @@ sbflush(struct socket *so, struct sockbu
 {
 	KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
 	KASSERT((sb->sb_flags & SB_LOCK) == 0);
+	MUTEX_ASSERT_LOCKED(&so->so_mtx);
 
 	while (sb->sb_mbcnt)
 		sbdrop(so, sb, (int)sb->sb_cc);
@@ -917,6 +934,7 @@ sbdrop(struct socket *so, struct sockbuf
 
 	KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
 	soassertlocked(so);
+	MUTEX_ASSERT_LOCKED(&so->so_mtx);
 
 	next = (m = sb->sb_mb) ? m->m_nextpkt : NULL;
 	while (len > 0) {
@@ -936,12 +954,12 @@ sbdrop(struct socket *so, struct sockbuf
 			break;
 		}
 		len -= m->m_len;
-		sbfree(so, sb, m);
+		sbfree_locked(so, sb, m);
 		mn = m_free(m);
 		m = mn;
 	}
 	while (m && m->m_len == 0) {
-		sbfree(so, sb, m);
+		sbfree_locked(so, sb, m);
 		mn = m_free(m);
 		m = mn;
 	}
Index: kern/uipc_syscalls.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_syscalls.c,v
retrieving revision 1.193
diff -u -p -r1.193 uipc_syscalls.c
--- kern/uipc_syscalls.c	2 Jul 2021 12:17:41 -0000	1.193
+++ kern/uipc_syscalls.c	26 Jul 2021 05:51:52 -0000
@@ -308,7 +308,9 @@ doaccept(struct proc *p, int sock, struc
 	    : (flags & SOCK_NONBLOCK ? FNONBLOCK : 0);
 
 	/* connection has been removed from the listen queue */
+	mtx_enter(&head->so_mtx);
 	KNOTE(&head->so_rcv.sb_sel.si_note, 0);
+	mtx_leave(&head->so_mtx);
 
 	fp->f_type = DTYPE_SOCKET;
 	fp->f_flag = FREAD | FWRITE | nflag;
Index: kern/uipc_usrreq.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v
retrieving revision 1.148
diff -u -p -r1.148 uipc_usrreq.c
--- kern/uipc_usrreq.c	25 May 2021 22:45:09 -0000	1.148
+++ kern/uipc_usrreq.c	26 Jul 2021 05:51:52 -0000
@@ -208,8 +208,10 @@ uipc_usrreq(struct socket *so, int req,
 		 * Adjust backpressure on sender
 		 * and wakeup any waiting to write.
 		 */
+		mtx_enter(&so2->so_mtx);
 		so2->so_snd.sb_mbcnt = so->so_rcv.sb_mbcnt;
 		so2->so_snd.sb_cc = so->so_rcv.sb_cc;
+		mtx_leave(&so2->so_mtx);
 		sowwakeup(so2);
 		break;
 
@@ -284,8 +286,10 @@ uipc_usrreq(struct socket *so, int req,
 			sbappendrecord(so2, &so2->so_rcv, m);
 		else
 			sbappend(so2, &so2->so_rcv, m);
+		mtx_enter(&so->so_mtx);
 		so->so_snd.sb_mbcnt = so2->so_rcv.sb_mbcnt;
 		so->so_snd.sb_cc = so2->so_rcv.sb_cc;
+		mtx_leave(&so->so_mtx);
 		if (so2->so_rcv.sb_cc > 0)
 			sorwakeup(so2);
 		m = NULL;
@@ -736,12 +740,16 @@ unp_disconnect(struct unpcb *unp)
 
 	case SOCK_STREAM:
 	case SOCK_SEQPACKET:
+		mtx_enter(&unp->unp_socket->so_mtx);
 		unp->unp_socket->so_snd.sb_mbcnt = 0;
 		unp->unp_socket->so_snd.sb_cc = 0;
+		mtx_leave(&unp->unp_socket->so_mtx);
 		soisdisconnected(unp->unp_socket);
 		unp2->unp_conn = NULL;
+		mtx_enter(&unp2->unp_socket->so_mtx);
 		unp2->unp_socket->so_snd.sb_mbcnt = 0;
 		unp2->unp_socket->so_snd.sb_cc = 0;
+		mtx_leave(&unp2->unp_socket->so_mtx);
 		soisdisconnected(unp2->unp_socket);
 		break;
 	}
Index: miscfs/fifofs/fifo_vnops.c
===================================================================
RCS file: /cvs/src/sys/miscfs/fifofs/fifo_vnops.c,v
retrieving revision 1.80
diff -u -p -r1.80 fifo_vnops.c
--- miscfs/fifofs/fifo_vnops.c	13 Jul 2021 07:37:50 -0000	1.80
+++ miscfs/fifofs/fifo_vnops.c	26 Jul 2021 07:02:12 -0000
@@ -114,7 +114,7 @@ int filt_fifowriteprocess(struct knote *
 int filt_fifowrite_common(struct knote *kn, struct socket *so);
 
 const struct filterops fiforead_filtops = {
-	.f_flags = FILTEROP_ISFD,
+	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
 	.f_attach = NULL,
 	.f_detach = filt_fifordetach,
 	.f_event = filt_fiforead,
@@ -123,7 +123,7 @@ const struct filterops fiforead_filtops
 };
 
 const struct filterops fifowrite_filtops = {
-	.f_flags = FILTEROP_ISFD,
+	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
 	.f_attach = NULL,
 	.f_detach = filt_fifowdetach,
 	.f_event = filt_fifowrite,
@@ -542,7 +542,7 @@ fifo_kqfilter(void *v)
 
 	ap->a_kn->kn_hook = so;
 
-	klist_insert_locked(&sb->sb_sel.si_note, ap->a_kn);
+	klist_insert(&sb->sb_sel.si_note, ap->a_kn);
 
 	return (0);
 }
@@ -552,7 +552,7 @@ filt_fifordetach(struct knote *kn)
 {
 	struct socket *so = (struct socket *)kn->kn_hook;
 
-	klist_remove_locked(&so->so_rcv.sb_sel.si_note, kn);
+	klist_remove(&so->so_rcv.sb_sel.si_note, kn);
 }
 
 int
@@ -560,7 +560,7 @@ filt_fiforead_common(struct knote *kn, s
 {
 	int rv;
 
-	soassertlocked(so);
+	MUTEX_ASSERT_LOCKED(&so->so_mtx);
 
 	kn->kn_data = so->so_rcv.sb_cc;
 	if (so->so_state & SS_CANTRCVMORE) {
@@ -590,12 +590,12 @@ int
 filt_fiforeadmodify(struct kevent *kev, struct knote *kn)
 {
 	struct socket *so = kn->kn_hook;
-	int rv, s;
+	int rv;
 
-	s = solock(so);
+	mtx_enter(&so->so_mtx);
 	knote_modify(kev, kn);
 	rv = filt_fiforead_common(kn, so);
-	sounlock(so, s);
+	mtx_leave(&so->so_mtx);
 
 	return (rv);
 }
@@ -604,16 +604,16 @@ int
 filt_fiforeadprocess(struct knote *kn, struct kevent *kev)
 {
 	struct socket *so = kn->kn_hook;
-	int rv, s;
+	int rv;
 
-	s = solock(so);
+	mtx_enter(&so->so_mtx);
 	if (kev != NULL && (kn->kn_flags & EV_ONESHOT))
 		rv = 1;
 	else
 		rv = filt_fiforead_common(kn, so);
 	if (rv != 0)
 		knote_submit(kn, kev);
-	sounlock(so, s);
+	mtx_leave(&so->so_mtx);
 
 	return (rv);
 }
@@ -623,7 +623,7 @@ filt_fifowdetach(struct knote *kn)
 {
 	struct socket *so = (struct socket *)kn->kn_hook;
 
-	klist_remove_locked(&so->so_snd.sb_sel.si_note, kn);
+	klist_remove(&so->so_snd.sb_sel.si_note, kn);
 }
 
 int
@@ -631,9 +631,9 @@ filt_fifowrite_common(struct knote *kn,
 {
 	int rv;
 
-	soassertlocked(so);
+	MUTEX_ASSERT_LOCKED(&so->so_mtx);
 
-	kn->kn_data = sbspace(so, &so->so_snd);
+	kn->kn_data = sbspace_locked(so, &so->so_snd);
 	if (so->so_state & SS_CANTSENDMORE) {
 		kn->kn_flags |= EV_EOF;
 		rv = 1;
@@ -657,12 +657,12 @@ int
 filt_fifowritemodify(struct kevent *kev, struct knote *kn)
 {
 	struct socket *so = kn->kn_hook;
-	int rv, s;
+	int rv;
 
-	s = solock(so);
+	mtx_enter(&so->so_mtx);
 	knote_modify(kev, kn);
 	rv = filt_fifowrite_common(kn, so);
-	sounlock(so, s);
+	mtx_leave(&so->so_mtx);
 
 	return (rv);
 }
@@ -671,16 +671,16 @@ int
 filt_fifowriteprocess(struct knote *kn, struct kevent *kev)
 {
 	struct socket *so = kn->kn_hook;
-	int rv, s;
+	int rv;
 
-	s = solock(so);
+	mtx_enter(&so->so_mtx);
 	if (kev != NULL && (kn->kn_flags & EV_ONESHOT))
 		rv = 1;
 	else
 		rv = filt_fifowrite_common(kn, so);
 	if (rv != 0)
 		knote_submit(kn, kev);
-	sounlock(so, s);
+	mtx_leave(&so->so_mtx);
 
 	return (rv);
 }
Index: netinet/tcp_input.c
===================================================================
RCS file: /cvs/src/sys/netinet/tcp_input.c,v
retrieving revision 1.368
diff -u -p -r1.368 tcp_input.c
--- netinet/tcp_input.c	16 Apr 2021 12:08:25 -0000	1.368
+++ netinet/tcp_input.c	26 Jul 2021 05:51:52 -0000
@@ -946,7 +946,9 @@ findpcb:
 			tcpstat_pkt(tcps_rcvackpack, tcps_rcvackbyte,
 			    acked);
 			ND6_HINT(tp);
+			mtx_enter(&so->so_mtx);
 			sbdrop(so, &so->so_snd, acked);
+			mtx_leave(&so->so_mtx);
 
 			/*
 			 * If we had a pending ICMP message that
@@ -1714,6 +1716,7 @@ trimthenstep6:
 			    TCP_MAXWIN << tp->snd_scale);
 		}
 		ND6_HINT(tp);
+		mtx_enter(&so->so_mtx);
 		if (acked > so->so_snd.sb_cc) {
 			if (tp->snd_wnd > so->so_snd.sb_cc)
 				tp->snd_wnd -= so->so_snd.sb_cc;
@@ -1729,6 +1732,7 @@ trimthenstep6:
 			tp->snd_wnd = 0;
 			ourfinisacked = 0;
 		}
+		mtx_leave(&so->so_mtx);
 
 		tcp_update_sndspace(tp);
 		if (sb_notify(so, &so->so_snd)) {
@@ -2967,7 +2971,9 @@ tcp_mss_update(struct tcpcb *tp)
 		bufsize = roundup(bufsize, mss);
 		if (bufsize > sb_max)
 			bufsize = sb_max;
+		mtx_enter(&so->so_mtx);
 		(void)sbreserve(so, &so->so_snd, bufsize);
+		mtx_leave(&so->so_mtx);
 	}
 
 	bufsize = so->so_rcv.sb_hiwat;
@@ -2975,7 +2981,9 @@ tcp_mss_update(struct tcpcb *tp)
 		bufsize = roundup(bufsize, mss);
 		if (bufsize > sb_max)
 			bufsize = sb_max;
+		mtx_enter(&so->so_mtx);
 		(void)sbreserve(so, &so->so_rcv, bufsize);
+		mtx_leave(&so->so_mtx);
 	}
 }
 
Index: netinet/tcp_usrreq.c
===================================================================
RCS file: /cvs/src/sys/netinet/tcp_usrreq.c,v
retrieving revision 1.181
diff -u -p -r1.181 tcp_usrreq.c
--- netinet/tcp_usrreq.c	30 Apr 2021 13:52:48 -0000	1.181
+++ netinet/tcp_usrreq.c	26 Jul 2021 05:51:52 -0000
@@ -688,7 +688,9 @@ tcp_disconnect(struct tcpcb *tp)
 		tp = tcp_drop(tp, 0);
 	else {
 		soisdisconnecting(so);
+		mtx_enter(&so->so_mtx);
 		sbflush(so, &so->so_rcv);
+		mtx_leave(&so->so_mtx);
 		tp = tcp_usrclosed(tp);
 		if (tp)
 			(void) tcp_output(tp);
@@ -1115,6 +1117,7 @@ tcp_update_sndspace(struct tcpcb *tp)
 	struct socket *so = tp->t_inpcb->inp_socket;
 	u_long nmax = so->so_snd.sb_hiwat;
 
+	mtx_enter(&so->so_mtx);
 	if (sbchecklowmem()) {
 		/* low on memory try to get rid of some */
 		if (tcp_sendspace < nmax)
@@ -1128,7 +1131,7 @@ tcp_update_sndspace(struct tcpcb *tp)
 		    tp->snd_una);
 
 	/* a writable socket must be preserved because of poll(2) semantics */
-	if (sbspace(so, &so->so_snd) >= so->so_snd.sb_lowat) {
+	if (sbspace_locked(so, &so->so_snd) >= so->so_snd.sb_lowat) {
 		if (nmax < so->so_snd.sb_cc + so->so_snd.sb_lowat)
 			nmax = so->so_snd.sb_cc + so->so_snd.sb_lowat;
 		/* keep in sync with sbreserve() calculation */
@@ -1141,6 +1144,7 @@ tcp_update_sndspace(struct tcpcb *tp)
 
 	if (nmax != so->so_snd.sb_hiwat)
 		sbreserve(so, &so->so_snd, nmax);
+	mtx_leave(&so->so_mtx);
 }
 
 /*
@@ -1179,5 +1183,7 @@ tcp_update_rcvspace(struct tcpcb *tp)
 
 	/* round to MSS boundary */
 	nmax = roundup(nmax, tp->t_maxseg);
+	mtx_enter(&so->so_mtx);
 	sbreserve(so, &so->so_rcv, nmax);
+	mtx_leave(&so->so_mtx);
 }
Index: sys/socketvar.h
===================================================================
RCS file: /cvs/src/sys/sys/socketvar.h,v
retrieving revision 1.100
diff -u -p -r1.100 socketvar.h
--- sys/socketvar.h	26 Jul 2021 05:51:13 -0000	1.100
+++ sys/socketvar.h	26 Jul 2021 05:57:31 -0000
@@ -33,10 +33,12 @@
  */
 
 #include <sys/selinfo.h>		/* for struct selinfo */
+#include <sys/systm.h>			/* panicstr for MUTEX_ASSERT */
 #include <sys/queue.h>
 #include <sys/sigio.h>			/* for struct sigio_ref */
 #include <sys/task.h>
 #include <sys/timeout.h>
+#include <sys/mutex.h>
 #include <sys/rwlock.h>
 
 #ifndef _SOCKLEN_T_DEFINED_
@@ -51,10 +53,15 @@ TAILQ_HEAD(soqhead, socket);
  * Contains send and receive buffer queues,
 * handle on protocol and pointer to protocol
 * private data and error information.
+ *
+ * Locks used to protect struct members in this file:
+ *	s	this socket solock
+ *	m	this socket `so_mtx'
 */
 struct socket {
 	const struct protosw *so_proto;	/* protocol handle */
 	struct rwlock so_lock;		/* this socket lock */
+	struct mutex so_mtx;
 	void *so_pcb;			/* protocol control block */
 	u_int so_state;			/* internal state flags SS_*, below */
 	short so_type;			/* generic type, see socket.h */
@@ -101,13 +108,13 @@ struct socket {
 	struct sockbuf {
 /* The following fields are all zeroed on flush. */
 #define	sb_startzero	sb_cc
-		u_long	sb_cc;		/* actual chars in buffer */
-		u_long	sb_datacc;	/* data only chars in buffer */
-		u_long	sb_hiwat;	/* max actual char count */
-		u_long	sb_wat;		/* default watermark */
-		u_long	sb_mbcnt;	/* chars of mbufs used */
-		u_long	sb_mbmax;	/* max chars of mbufs to use */
-		long	sb_lowat;	/* low water mark */
+		u_long	sb_cc;		/* [s|m] actual chars in buffer */
+		u_long	sb_datacc;	/* [s|m] data only chars in buffer */
+		u_long	sb_hiwat;	/* [s|m] max actual char count */
+		u_long	sb_wat;		/* [s|m] default watermark */
+		u_long	sb_mbcnt;	/* [s|m] chars of mbufs used */
+		u_long	sb_mbmax;	/* [s|m] max chars of mbufs to use */
+		long	sb_lowat;	/* [s|m] low water mark */
 		struct mbuf *sb_mb;	/* the mbuf chain */
 		struct mbuf *sb_mbtail;	/* the last mbuf in the chain */
 		struct mbuf *sb_lastrecord;/* first mbuf of last record in
@@ -189,13 +196,27 @@ sb_notify(struct socket *so, struct sock
 * overflow and return 0.
 */
 static inline long
-sbspace(struct socket *so, struct sockbuf *sb)
+sbspace_locked(struct socket *so, struct sockbuf *sb)
 {
 	KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
-	soassertlocked(so);
+	MUTEX_ASSERT_LOCKED(&so->so_mtx);
+
 	return lmin(sb->sb_hiwat - sb->sb_cc, sb->sb_mbmax - sb->sb_mbcnt);
 }
 
+static inline long
+sbspace(struct socket *so, struct sockbuf *sb)
+{
+	long space;
+
+	mtx_enter(&so->so_mtx);
+	space = sbspace_locked(so, sb);
+	mtx_leave(&so->so_mtx);
+
+	return space;
+}
+
+
 /* do we have to send all at once on a socket? */
 #define sosendallatonce(so) \
     ((so)->so_proto->pr_flags & PR_ATOMIC)
@@ -224,22 +245,31 @@ soreadable(struct socket *so)
 
 /* adjust counters in sb reflecting allocation of m */
 #define	sballoc(so, sb, m) do {						\
+	mtx_enter(&(so)->so_mtx);					\
 	(sb)->sb_cc += (m)->m_len;					\
 	if ((m)->m_type != MT_CONTROL && (m)->m_type != MT_SONAME)	\
 		(sb)->sb_datacc += (m)->m_len;				\
 	(sb)->sb_mbcnt += MSIZE;					\
 	if ((m)->m_flags & M_EXT)					\
 		(sb)->sb_mbcnt += (m)->m_ext.ext_size;			\
+	mtx_leave(&(so)->so_mtx);					\
 } while (/* CONSTCOND */ 0)
 
 /* adjust counters in sb reflecting freeing of m */
-#define	sbfree(so, sb, m) do {						\
+#define	sbfree_locked(so, sb, m) do {					\
+	MUTEX_ASSERT_LOCKED(&so->so_mtx);				\
 	(sb)->sb_cc -= (m)->m_len;					\
 	if ((m)->m_type != MT_CONTROL && (m)->m_type != MT_SONAME)	\
 		(sb)->sb_datacc -= (m)->m_len;				\
 	(sb)->sb_mbcnt -= MSIZE;					\
 	if ((m)->m_flags & M_EXT)					\
 		(sb)->sb_mbcnt -= (m)->m_ext.ext_size;			\
+} while (/* CONSTCOND */ 0)
+
+#define	sbfree(so, sb, m) do {						\
+	mtx_enter(&(so)->so_mtx);					\
+	sbfree_locked((so), (sb), (m));					\
+	mtx_leave(&(so)->so_mtx);					\
} while (/* CONSTCOND */ 0)
 
 /*
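
For reviewers, a short illustrative note on the calling convention the diff
introduces (the fragment below is a sketch, not part of the patch; `space' is
just a local long in a hypothetical caller):

	/* code that already holds `so_mtx', e.g. an f_event filter */
	MUTEX_ASSERT_LOCKED(&so->so_mtx);
	kn->kn_data = sbspace_locked(so, &so->so_snd);

	/* code that does not hold `so_mtx': the wrapper takes and drops it */
	space = sbspace(so, &so->so_snd);

The same split applies to sbfree() vs. sbfree_locked(): code such as sbdrop()
that already runs with the mutex held uses the *_locked() variant, which is
what avoids the double mtx_enter() mentioned at the top of this mail.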