On 20/11/17(Mon) 16:22, Martin Pieuchot wrote: > Diff below removes the KERNEL_LOCK() around all pr_input() routines. > It's a bit rough so I'd appreciate more tests before splitting it into > pieces. > > I'm using tasks to delay selwakeup/csignal calls, just like I did for > bpf(4).
Updated version that should fix previous issues. I'm using a new flag to prevent notification tasks to be scheduled. Reusing SS_NOFDREF would lead to a race in sofree() since we need to release the NET_LOCK() around taskq_barrier(). I'm also merging `sb_flagsintr' and `sb_flags'. Both of them are currently protected by the KERNEL_LOCK(). I'll plan to use atomic set/clear bits operations to avoid taking the socketlock() when setting SB_KNOTE. Questions, comments, tests? Index: kern/sys_socket.c =================================================================== RCS file: /cvs/src/sys/kern/sys_socket.c,v retrieving revision 1.34 diff -u -p -r1.34 sys_socket.c --- kern/sys_socket.c 14 Nov 2017 16:01:55 -0000 1.34 +++ kern/sys_socket.c 27 Nov 2017 10:46:15 -0000 @@ -166,11 +166,11 @@ soo_poll(struct file *fp, int events, st if (revents == 0) { if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) { selrecord(p, &so->so_rcv.sb_sel); - so->so_rcv.sb_flagsintr |= SB_SEL; + so->so_rcv.sb_flags |= SB_SEL; } if (events & (POLLOUT | POLLWRNORM)) { selrecord(p, &so->so_snd.sb_sel); - so->so_snd.sb_flagsintr |= SB_SEL; + so->so_snd.sb_flags |= SB_SEL; } } sounlock(s); Index: kern/uipc_socket.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_socket.c,v retrieving revision 1.209 diff -u -p -r1.209 uipc_socket.c --- kern/uipc_socket.c 23 Nov 2017 13:45:46 -0000 1.209 +++ kern/uipc_socket.c 27 Nov 2017 10:53:44 -0000 @@ -135,6 +135,8 @@ socreate(int dom, struct socket **aso, i so->so_egid = p->p_ucred->cr_gid; so->so_cpid = p->p_p->ps_pid; so->so_proto = prp; + task_set(&so->so_rcv.sb_wtask, sorwakeup_cb, so); + task_set(&so->so_snd.sb_wtask, sowwakeup_cb, so); s = solock(so); error = (*prp->pr_attach)(so, proto); @@ -194,7 +196,8 @@ sofree(struct socket *so) { soassertlocked(so); - if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) + if ((so->so_pcb != NULL) || + (so->so_state & (SS_NOFDREF|SS_NONOTIF)) != 
(SS_NOFDREF|SS_NONOTIF)) return; if (so->so_head) { /* @@ -273,8 +276,15 @@ drop: error = error2; } discard: - if (so->so_state & SS_NOFDREF) - panic("soclose NOFDREF: so %p, so_type %d", so, so->so_type); + KASSERT((so->so_state & (SS_NOFDREF|SS_NONOTIF)) == 0); + /* Make sure possible delayed notification are delivered. */ + so->so_state |= SS_NONOTIF; + if (!task_del(systq, &so->so_rcv.sb_wtask) || + !task_del(systq, &so->so_snd.sb_wtask)) { + NET_UNLOCK(); + taskq_barrier(systq); + NET_LOCK(); + } so->so_state |= SS_NOFDREF; sofree(so); sounlock(s); @@ -296,9 +306,8 @@ soaccept(struct socket *so, struct mbuf int error = 0; soassertlocked(so); + KASSERT(so->so_state & SS_NOFDREF); - if ((so->so_state & SS_NOFDREF) == 0) - panic("soaccept !NOFDREF: so %p, so_type %d", so, so->so_type); so->so_state &= ~SS_NOFDREF; if ((so->so_state & SS_ISDISCONNECTED) == 0 || (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0) @@ -1052,12 +1061,8 @@ sorflush(struct socket *so) sbunlock(so, sb); aso.so_proto = pr; aso.so_rcv = *sb; - memset(sb, 0, sizeof (*sb)); - /* XXX - the memset stomps all over so_rcv */ - if (aso.so_rcv.sb_flags & SB_KNOTE) { - sb->sb_sel.si_note = aso.so_rcv.sb_sel.si_note; - sb->sb_flags = SB_KNOTE; - } + memset(&sb->sb_startzero, 0, + (caddr_t)&sb->sb_endzero - (caddr_t)&sb->sb_startzero); if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose) (*pr->pr_domain->dom_dispose)(aso.so_rcv.sb_mb); sbrelease(&aso, &aso.so_rcv); @@ -1178,8 +1183,8 @@ sosplice(struct socket *so, int fd, off_ * we sleep, the socket buffers are not marked as spliced yet. 
*/ if (somove(so, M_WAIT)) { - so->so_rcv.sb_flagsintr |= SB_SPLICE; - sosp->so_snd.sb_flagsintr |= SB_SPLICE; + so->so_rcv.sb_flags |= SB_SPLICE; + sosp->so_snd.sb_flags |= SB_SPLICE; } release: @@ -1196,8 +1201,8 @@ sounsplice(struct socket *so, struct soc task_del(sosplice_taskq, &so->so_splicetask); timeout_del(&so->so_idleto); - sosp->so_snd.sb_flagsintr &= ~SB_SPLICE; - so->so_rcv.sb_flagsintr &= ~SB_SPLICE; + sosp->so_snd.sb_flags &= ~SB_SPLICE; + so->so_rcv.sb_flags &= ~SB_SPLICE; so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL; if (wakeup && soreadable(so)) sorwakeup(so); @@ -1210,7 +1215,7 @@ soidle(void *arg) int s; s = solock(so); - if (so->so_rcv.sb_flagsintr & SB_SPLICE) { + if (so->so_rcv.sb_flags & SB_SPLICE) { so->so_error = ETIMEDOUT; sounsplice(so, so->so_sp->ssp_socket, 1); } @@ -1224,7 +1229,7 @@ sotask(void *arg) int s; s = solock(so); - if (so->so_rcv.sb_flagsintr & SB_SPLICE) { + if (so->so_rcv.sb_flags & SB_SPLICE) { /* * We may not sleep here as sofree() and unsplice() may be * called from softnet interrupt context. This would remove @@ -1527,7 +1532,7 @@ sorwakeup(struct socket *so) soassertlocked(so); #ifdef SOCKET_SPLICE - if (so->so_rcv.sb_flagsintr & SB_SPLICE) { + if (so->so_rcv.sb_flags & SB_SPLICE) { /* * TCP has a sendbuffer that can handle multiple packets * at once. So queue the stream a bit to accumulate data. 
@@ -1544,7 +1549,8 @@ sorwakeup(struct socket *so) if (isspliced(so)) return; #endif - sowakeup(so, &so->so_rcv); + if ((so->so_state & SS_NONOTIF) == 0) + task_add(systq, &so->so_rcv.sb_wtask); if (so->so_upcall) (*(so->so_upcall))(so, so->so_upcallarg, M_DONTWAIT); } @@ -1555,10 +1561,12 @@ sowwakeup(struct socket *so) soassertlocked(so); #ifdef SOCKET_SPLICE - if (so->so_snd.sb_flagsintr & SB_SPLICE) + if (so->so_snd.sb_flags & SB_SPLICE) task_add(sosplice_taskq, &so->so_sp->ssp_soback->so_splicetask); #endif - sowakeup(so, &so->so_snd); + + if ((so->so_state & SS_NONOTIF) == 0) + task_add(systq, &so->so_snd.sb_wtask); } int @@ -2025,7 +2033,6 @@ sobuf_print(struct sockbuf *sb, (*pr)("\tsb_mbtail: %p\n", sb->sb_mbtail); (*pr)("\tsb_lastrecord: %p\n", sb->sb_lastrecord); (*pr)("\tsb_sel: ...\n"); - (*pr)("\tsb_flagsintr: %d\n", sb->sb_flagsintr); (*pr)("\tsb_flags: %i\n", sb->sb_flags); (*pr)("\tsb_timeo: %i\n", sb->sb_timeo); } Index: kern/uipc_socket2.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_socket2.c,v retrieving revision 1.87 diff -u -p -r1.87 uipc_socket2.c --- kern/uipc_socket2.c 23 Nov 2017 13:42:53 -0000 1.87 +++ kern/uipc_socket2.c 27 Nov 2017 11:10:57 -0000 @@ -189,6 +189,8 @@ sonewconn(struct socket *head, int conns so->so_rcv.sb_wat = head->so_rcv.sb_wat; so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; + task_set(&so->so_rcv.sb_wtask, sorwakeup_cb, so); + task_set(&so->so_snd.sb_wtask, sowwakeup_cb, so); soqinsque(head, so, soqueue); if ((*so->so_proto->pr_attach)(so, 0)) { @@ -329,12 +331,12 @@ sosleep(struct socket *so, void *ident, int sbwait(struct socket *so, struct sockbuf *sb) { + int prio = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH; + soassertlocked(so); - sb->sb_flagsintr |= SB_WAIT; - return (sosleep(so, &sb->sb_cc, - (sb->sb_flags & SB_NOINTR) ? 
PSOCK : PSOCK | PCATCH, "netio", - sb->sb_timeo)); + sb->sb_flags |= SB_WAIT; + return (sosleep(so, &sb->sb_cc, prio, "netio", sb->sb_timeo)); } int @@ -351,6 +353,9 @@ sblock(struct socket *so, struct sockbuf if (wait & M_NOWAIT) return (EWOULDBLOCK); + /* XXXSMP ensure sosleep() isn't called from input path. */ + KERNEL_ASSERT_LOCKED(); + while (sb->sb_flags & SB_LOCK) { sb->sb_flags |= SB_WANT; error = sosleep(so, &sb->sb_flags, prio, "netlck", 0); @@ -364,13 +369,35 @@ sblock(struct socket *so, struct sockbuf void sbunlock(struct socket *so, struct sockbuf *sb) { + int flags = sb->sb_flags; + soassertlocked(so); - sb->sb_flags &= ~SB_LOCK; - if (sb->sb_flags & SB_WANT) { - sb->sb_flags &= ~SB_WANT; + sb->sb_flags &= ~(SB_LOCK|SB_WANT); + if (flags & SB_WANT) wakeup(&sb->sb_flags); - } +} + +void +sorwakeup_cb(void *xso) +{ + struct socket *so = xso; + int s; + + s = solock(so); + sowakeup(so, &so->so_rcv); + sounlock(s); +} + +void +sowwakeup_cb(void *xso) +{ + struct socket *so = xso; + int s; + + s = solock(so); + sowakeup(so, &so->so_snd); + sounlock(s); } /* @@ -381,15 +408,15 @@ sbunlock(struct socket *so, struct sockb void sowakeup(struct socket *so, struct sockbuf *sb) { + int flags = sb->sb_flags; + KERNEL_ASSERT_LOCKED(); soassertlocked(so); selwakeup(&sb->sb_sel); - sb->sb_flagsintr &= ~SB_SEL; - if (sb->sb_flagsintr & SB_WAIT) { - sb->sb_flagsintr &= ~SB_WAIT; + sb->sb_flags &= ~(SB_SEL|SB_WAIT); + if (flags & SB_WAIT) wakeup(&sb->sb_cc); - } if (so->so_state & SS_ASYNC) csignal(so->so_pgid, SIGIO, so->so_siguid, so->so_sigeuid); } Index: miscfs/fifofs/fifo_vnops.c =================================================================== RCS file: /cvs/src/sys/miscfs/fifofs/fifo_vnops.c,v retrieving revision 1.59 diff -u -p -r1.59 fifo_vnops.c --- miscfs/fifofs/fifo_vnops.c 4 Nov 2017 14:13:53 -0000 1.59 +++ miscfs/fifofs/fifo_vnops.c 27 Nov 2017 10:48:00 -0000 @@ -333,11 +333,11 @@ fifo_poll(void *v) events = POLLIN; if (events & (POLLIN | POLLRDNORM)) { 
selrecord(ap->a_p, &rso->so_rcv.sb_sel); - rso->so_rcv.sb_flagsintr |= SB_SEL; + rso->so_rcv.sb_flags |= SB_SEL; } if (events & (POLLOUT | POLLWRNORM)) { selrecord(ap->a_p, &wso->so_snd.sb_sel); - wso->so_snd.sb_flagsintr |= SB_SEL; + wso->so_snd.sb_flags |= SB_SEL; } } return (revents); Index: net/if.c =================================================================== RCS file: /cvs/src/sys/net/if.c,v retrieving revision 1.530 diff -u -p -r1.530 if.c --- net/if.c 20 Nov 2017 10:16:25 -0000 1.530 +++ net/if.c 27 Nov 2017 10:13:22 -0000 @@ -933,7 +933,6 @@ if_netisr(void *unused) { int n, t = 0; - KERNEL_LOCK(); NET_LOCK(); while ((n = netisr) != 0) { @@ -947,8 +946,11 @@ if_netisr(void *unused) atomic_clearbits_int(&netisr, n); #if NETHER > 0 - if (n & (1 << NETISR_ARP)) + if (n & (1 << NETISR_ARP)) { + KERNEL_LOCK(); arpintr(); + KERNEL_UNLOCK(); + } #endif if (n & (1 << NETISR_IP)) ipintr(); @@ -957,35 +959,52 @@ if_netisr(void *unused) ip6intr(); #endif #if NPPP > 0 - if (n & (1 << NETISR_PPP)) + if (n & (1 << NETISR_PPP)) { + KERNEL_LOCK(); pppintr(); + KERNEL_UNLOCK(); + } #endif #if NBRIDGE > 0 - if (n & (1 << NETISR_BRIDGE)) + if (n & (1 << NETISR_BRIDGE)) { + KERNEL_LOCK(); bridgeintr(); + KERNEL_UNLOCK(); + } #endif #if NSWITCH > 0 - if (n & (1 << NETISR_SWITCH)) + if (n & (1 << NETISR_SWITCH)) { + KERNEL_LOCK(); switchintr(); + KERNEL_UNLOCK(); + } #endif #if NPPPOE > 0 - if (n & (1 << NETISR_PPPOE)) + if (n & (1 << NETISR_PPPOE)) { + KERNEL_LOCK(); pppoeintr(); + KERNEL_UNLOCK(); + } #endif #ifdef PIPEX - if (n & (1 << NETISR_PIPEX)) + if (n & (1 << NETISR_PIPEX)) { + KERNEL_LOCK(); pipexintr(); + KERNEL_UNLOCK(); + } #endif t |= n; } #if NPFSYNC > 0 - if (t & (1 << NETISR_PFSYNC)) + if (t & (1 << NETISR_PFSYNC)) { + KERNEL_LOCK(); pfsyncintr(); + KERNEL_UNLOCK(); + } #endif NET_UNLOCK(); - KERNEL_UNLOCK(); } void Index: sys/socketvar.h =================================================================== RCS file: /cvs/src/sys/sys/socketvar.h,v 
retrieving revision 1.79 diff -u -p -r1.79 socketvar.h --- sys/socketvar.h 23 Nov 2017 13:45:46 -0000 1.79 +++ sys/socketvar.h 27 Nov 2017 10:48:45 -0000 @@ -98,6 +98,8 @@ struct socket { * Variables for socket buffering. */ struct sockbuf { +/* The following fields are all zeroed on flush. */ +#define sb_startzero sb_cc u_long sb_cc; /* actual chars in buffer */ u_long sb_datacc; /* data only chars in buffer */ u_long sb_hiwat; /* max actual char count */ @@ -109,10 +111,13 @@ struct socket { struct mbuf *sb_mbtail; /* the last mbuf in the chain */ struct mbuf *sb_lastrecord;/* first mbuf of last record in socket buffer */ +/* End area that is zeroed on flush. */ +#define sb_endzero sb_sel struct selinfo sb_sel; /* process selecting read/write */ - int sb_flagsintr; /* flags, changed during interrupt */ - short sb_flags; /* flags, see below */ + int sb_flags ; /* flags, see below */ + short __pad; u_short sb_timeo; /* timeout for read/write */ + struct task sb_wtask; /* delay csignal() and selwakeup() */ } so_rcv, so_snd; #define SB_MAX (2*1024*1024) /* default for max chars in sockbuf */ #define SB_LOCK 0x01 /* lock on data queue */ @@ -149,6 +154,7 @@ struct socket { #define SS_CONNECTOUT 0x1000 /* connect, not accept, at this end */ #define SS_ISSENDING 0x2000 /* hint for lower layer */ #define SS_DNS 0x4000 /* created using SOCK_DNS socket(2) */ +#define SS_NONOTIF 0x8000 /* no notification allowed */ #ifdef _KERNEL @@ -169,7 +175,7 @@ void soassertlocked(struct socket *); static inline int sb_notify(struct socket *so, struct sockbuf *sb) { - int flags = (sb->sb_flags | sb->sb_flagsintr); + int flags = sb->sb_flags; KASSERT(sb == &so->so_rcv || sb == &so->so_snd); soassertlocked(so); @@ -329,6 +335,8 @@ int sosend(struct socket *so, struct mbu int sosetopt(struct socket *so, int level, int optname, struct mbuf *m); int soshutdown(struct socket *so, int how); void sowakeup(struct socket *so, struct sockbuf *sb); +void sorwakeup_cb(void *); +void 
sowwakeup_cb(void *); void sorwakeup(struct socket *); void sowwakeup(struct socket *); int sockargs(struct mbuf **, const void *, size_t, int);