On 20/11/17(Mon) 16:22, Martin Pieuchot wrote:
> Diff below remove the KERNEL_LOCK() around all pr_input() routines.
> It's a bit rough so I'd appreciate more tests before splitting it into
> pieces.
> 
> I'm using tasks to delay selwakeup/csignal calls, just like I did for
> bpf(4).

Updated version that should fix previous issues.

I'm using a new flag to prevent notification tasks to be scheduled.
Reusing SS_NOFDREF would lead to a race in sofree() since we need to
release the NET_LOCK() around taskq_barrier().

I'm also merging `sb_flagsintr' and `sb_flags'.  Both of them are
currently protected by the KERNEL_LOCK().  I'll plan to use atomic
set/clear bits operations to avoid taking the socketlock() when
setting SB_KNOTE.

Questions, comments, tests?

Index: kern/sys_socket.c
===================================================================
RCS file: /cvs/src/sys/kern/sys_socket.c,v
retrieving revision 1.34
diff -u -p -r1.34 sys_socket.c
--- kern/sys_socket.c   14 Nov 2017 16:01:55 -0000      1.34
+++ kern/sys_socket.c   27 Nov 2017 10:46:15 -0000
@@ -166,11 +166,11 @@ soo_poll(struct file *fp, int events, st
        if (revents == 0) {
                if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
                        selrecord(p, &so->so_rcv.sb_sel);
-                       so->so_rcv.sb_flagsintr |= SB_SEL;
+                       so->so_rcv.sb_flags |= SB_SEL;
                }
                if (events & (POLLOUT | POLLWRNORM)) {
                        selrecord(p, &so->so_snd.sb_sel);
-                       so->so_snd.sb_flagsintr |= SB_SEL;
+                       so->so_snd.sb_flags |= SB_SEL;
                }
        }
        sounlock(s);
Index: kern/uipc_socket.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.209
diff -u -p -r1.209 uipc_socket.c
--- kern/uipc_socket.c  23 Nov 2017 13:45:46 -0000      1.209
+++ kern/uipc_socket.c  27 Nov 2017 10:53:44 -0000
@@ -135,6 +135,8 @@ socreate(int dom, struct socket **aso, i
        so->so_egid = p->p_ucred->cr_gid;
        so->so_cpid = p->p_p->ps_pid;
        so->so_proto = prp;
+       task_set(&so->so_rcv.sb_wtask, sorwakeup_cb, so);
+       task_set(&so->so_snd.sb_wtask, sowwakeup_cb, so);
 
        s = solock(so);
        error = (*prp->pr_attach)(so, proto);
@@ -194,7 +196,8 @@ sofree(struct socket *so)
 {
        soassertlocked(so);
 
-       if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
+       if ((so->so_pcb != NULL) ||
+           (so->so_state & (SS_NOFDREF|SS_NONOTIF)) != (SS_NOFDREF|SS_NONOTIF))
                return;
        if (so->so_head) {
                /*
@@ -273,8 +276,15 @@ drop:
                        error = error2;
        }
 discard:
-       if (so->so_state & SS_NOFDREF)
-               panic("soclose NOFDREF: so %p, so_type %d", so, so->so_type);
+       KASSERT((so->so_state & (SS_NOFDREF|SS_NONOTIF)) == 0);
+       /* Make sure possible delayed notification are delivered. */
+       so->so_state |= SS_NONOTIF;
+       if (!task_del(systq, &so->so_rcv.sb_wtask) ||
+           !task_del(systq, &so->so_snd.sb_wtask)) {
+               NET_UNLOCK();
+               taskq_barrier(systq);
+               NET_LOCK();
+       }
        so->so_state |= SS_NOFDREF;
        sofree(so);
        sounlock(s);
@@ -296,9 +306,8 @@ soaccept(struct socket *so, struct mbuf 
        int error = 0;
 
        soassertlocked(so);
+       KASSERT(so->so_state & SS_NOFDREF);
 
-       if ((so->so_state & SS_NOFDREF) == 0)
-               panic("soaccept !NOFDREF: so %p, so_type %d", so, so->so_type);
        so->so_state &= ~SS_NOFDREF;
        if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
            (so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
@@ -1052,12 +1061,8 @@ sorflush(struct socket *so)
        sbunlock(so, sb);
        aso.so_proto = pr;
        aso.so_rcv = *sb;
-       memset(sb, 0, sizeof (*sb));
-       /* XXX - the memset stomps all over so_rcv */
-       if (aso.so_rcv.sb_flags & SB_KNOTE) {
-               sb->sb_sel.si_note = aso.so_rcv.sb_sel.si_note;
-               sb->sb_flags = SB_KNOTE;
-       }
+       memset(&sb->sb_startzero, 0,
+           (caddr_t)&sb->sb_endzero - (caddr_t)&sb->sb_startzero);
        if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose)
                (*pr->pr_domain->dom_dispose)(aso.so_rcv.sb_mb);
        sbrelease(&aso, &aso.so_rcv);
@@ -1178,8 +1183,8 @@ sosplice(struct socket *so, int fd, off_
         * we sleep, the socket buffers are not marked as spliced yet.
         */
        if (somove(so, M_WAIT)) {
-               so->so_rcv.sb_flagsintr |= SB_SPLICE;
-               sosp->so_snd.sb_flagsintr |= SB_SPLICE;
+               so->so_rcv.sb_flags |= SB_SPLICE;
+               sosp->so_snd.sb_flags |= SB_SPLICE;
        }
 
  release:
@@ -1196,8 +1201,8 @@ sounsplice(struct socket *so, struct soc
 
        task_del(sosplice_taskq, &so->so_splicetask);
        timeout_del(&so->so_idleto);
-       sosp->so_snd.sb_flagsintr &= ~SB_SPLICE;
-       so->so_rcv.sb_flagsintr &= ~SB_SPLICE;
+       sosp->so_snd.sb_flags &= ~SB_SPLICE;
+       so->so_rcv.sb_flags &= ~SB_SPLICE;
        so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL;
        if (wakeup && soreadable(so))
                sorwakeup(so);
@@ -1210,7 +1215,7 @@ soidle(void *arg)
        int s;
 
        s = solock(so);
-       if (so->so_rcv.sb_flagsintr & SB_SPLICE) {
+       if (so->so_rcv.sb_flags & SB_SPLICE) {
                so->so_error = ETIMEDOUT;
                sounsplice(so, so->so_sp->ssp_socket, 1);
        }
@@ -1224,7 +1229,7 @@ sotask(void *arg)
        int s;
 
        s = solock(so);
-       if (so->so_rcv.sb_flagsintr & SB_SPLICE) {
+       if (so->so_rcv.sb_flags & SB_SPLICE) {
                /*
                 * We may not sleep here as sofree() and unsplice() may be
                 * called from softnet interrupt context.  This would remove
@@ -1527,7 +1532,7 @@ sorwakeup(struct socket *so)
        soassertlocked(so);
 
 #ifdef SOCKET_SPLICE
-       if (so->so_rcv.sb_flagsintr & SB_SPLICE) {
+       if (so->so_rcv.sb_flags & SB_SPLICE) {
                /*
                 * TCP has a sendbuffer that can handle multiple packets
                 * at once.  So queue the stream a bit to accumulate data.
@@ -1544,7 +1549,8 @@ sorwakeup(struct socket *so)
        if (isspliced(so))
                return;
 #endif
-       sowakeup(so, &so->so_rcv);
+       if ((so->so_state & SS_NONOTIF) == 0)
+               task_add(systq, &so->so_rcv.sb_wtask);
        if (so->so_upcall)
                (*(so->so_upcall))(so, so->so_upcallarg, M_DONTWAIT);
 }
@@ -1555,10 +1561,12 @@ sowwakeup(struct socket *so)
        soassertlocked(so);
 
 #ifdef SOCKET_SPLICE
-       if (so->so_snd.sb_flagsintr & SB_SPLICE)
+       if (so->so_snd.sb_flags & SB_SPLICE)
                task_add(sosplice_taskq, &so->so_sp->ssp_soback->so_splicetask);
 #endif
-       sowakeup(so, &so->so_snd);
+
+       if ((so->so_state & SS_NONOTIF) == 0)
+               task_add(systq, &so->so_snd.sb_wtask);
 }
 
 int
@@ -2025,7 +2033,6 @@ sobuf_print(struct sockbuf *sb,
        (*pr)("\tsb_mbtail: %p\n", sb->sb_mbtail);
        (*pr)("\tsb_lastrecord: %p\n", sb->sb_lastrecord);
        (*pr)("\tsb_sel: ...\n");
-       (*pr)("\tsb_flagsintr: %d\n", sb->sb_flagsintr);
        (*pr)("\tsb_flags: %i\n", sb->sb_flags);
        (*pr)("\tsb_timeo: %i\n", sb->sb_timeo);
 }
Index: kern/uipc_socket2.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket2.c,v
retrieving revision 1.87
diff -u -p -r1.87 uipc_socket2.c
--- kern/uipc_socket2.c 23 Nov 2017 13:42:53 -0000      1.87
+++ kern/uipc_socket2.c 27 Nov 2017 11:10:57 -0000
@@ -189,6 +189,8 @@ sonewconn(struct socket *head, int conns
        so->so_rcv.sb_wat = head->so_rcv.sb_wat;
        so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
        so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
+       task_set(&so->so_rcv.sb_wtask, sorwakeup_cb, so);
+       task_set(&so->so_snd.sb_wtask, sowwakeup_cb, so);
 
        soqinsque(head, so, soqueue);
        if ((*so->so_proto->pr_attach)(so, 0)) {
@@ -329,12 +331,12 @@ sosleep(struct socket *so, void *ident, 
 int
 sbwait(struct socket *so, struct sockbuf *sb)
 {
+       int prio = (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH;
+
        soassertlocked(so);
 
-       sb->sb_flagsintr |= SB_WAIT;
-       return (sosleep(so, &sb->sb_cc,
-           (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "netio",
-           sb->sb_timeo));
+       sb->sb_flags |= SB_WAIT;
+       return (sosleep(so, &sb->sb_cc, prio, "netio", sb->sb_timeo));
 }
 
 int
@@ -351,6 +353,9 @@ sblock(struct socket *so, struct sockbuf
        if (wait & M_NOWAIT)
                return (EWOULDBLOCK);
 
+       /* XXXSMP ensure sosleep() isn't called from input path. */
+       KERNEL_ASSERT_LOCKED();
+
        while (sb->sb_flags & SB_LOCK) {
                sb->sb_flags |= SB_WANT;
                error = sosleep(so, &sb->sb_flags, prio, "netlck", 0);
@@ -364,13 +369,35 @@ sblock(struct socket *so, struct sockbuf
 void
 sbunlock(struct socket *so, struct sockbuf *sb)
 {
+       int flags = sb->sb_flags;
+
        soassertlocked(so);
 
-       sb->sb_flags &= ~SB_LOCK;
-       if (sb->sb_flags & SB_WANT) {
-               sb->sb_flags &= ~SB_WANT;
+       sb->sb_flags &= ~(SB_LOCK|SB_WANT);
+       if (flags & SB_WANT)
                wakeup(&sb->sb_flags);
-       }
+}
+
+void
+sorwakeup_cb(void *xso)
+{
+       struct socket *so = xso;
+       int s;
+
+       s = solock(so);
+       sowakeup(so, &so->so_rcv);
+       sounlock(s);
+}
+
+void
+sowwakeup_cb(void *xso)
+{
+       struct socket *so = xso;
+       int s;
+
+       s = solock(so);
+       sowakeup(so, &so->so_snd);
+       sounlock(s);
 }
 
 /*
@@ -381,15 +408,15 @@ sbunlock(struct socket *so, struct sockb
 void
 sowakeup(struct socket *so, struct sockbuf *sb)
 {
+       int flags = sb->sb_flags;
+
        KERNEL_ASSERT_LOCKED();
        soassertlocked(so);
 
        selwakeup(&sb->sb_sel);
-       sb->sb_flagsintr &= ~SB_SEL;
-       if (sb->sb_flagsintr & SB_WAIT) {
-               sb->sb_flagsintr &= ~SB_WAIT;
+       sb->sb_flags &= ~(SB_SEL|SB_WAIT);
+       if (flags & SB_WAIT)
                wakeup(&sb->sb_cc);
-       }
        if (so->so_state & SS_ASYNC)
                csignal(so->so_pgid, SIGIO, so->so_siguid, so->so_sigeuid);
 }
Index: miscfs/fifofs/fifo_vnops.c
===================================================================
RCS file: /cvs/src/sys/miscfs/fifofs/fifo_vnops.c,v
retrieving revision 1.59
diff -u -p -r1.59 fifo_vnops.c
--- miscfs/fifofs/fifo_vnops.c  4 Nov 2017 14:13:53 -0000       1.59
+++ miscfs/fifofs/fifo_vnops.c  27 Nov 2017 10:48:00 -0000
@@ -333,11 +333,11 @@ fifo_poll(void *v)
                        events = POLLIN;
                if (events & (POLLIN | POLLRDNORM)) {
                        selrecord(ap->a_p, &rso->so_rcv.sb_sel);
-                       rso->so_rcv.sb_flagsintr |= SB_SEL;
+                       rso->so_rcv.sb_flags |= SB_SEL;
                }
                if (events & (POLLOUT | POLLWRNORM)) {
                        selrecord(ap->a_p, &wso->so_snd.sb_sel);
-                       wso->so_snd.sb_flagsintr |= SB_SEL;
+                       wso->so_snd.sb_flags |= SB_SEL;
                }
        }
        return (revents);
Index: net/if.c
===================================================================
RCS file: /cvs/src/sys/net/if.c,v
retrieving revision 1.530
diff -u -p -r1.530 if.c
--- net/if.c    20 Nov 2017 10:16:25 -0000      1.530
+++ net/if.c    27 Nov 2017 10:13:22 -0000
@@ -933,7 +933,6 @@ if_netisr(void *unused)
 {
        int n, t = 0;
 
-       KERNEL_LOCK();
        NET_LOCK();
 
        while ((n = netisr) != 0) {
@@ -947,8 +946,11 @@ if_netisr(void *unused)
                atomic_clearbits_int(&netisr, n);
 
 #if NETHER > 0
-               if (n & (1 << NETISR_ARP))
+               if (n & (1 << NETISR_ARP)) {
+                       KERNEL_LOCK();
                        arpintr();
+                       KERNEL_UNLOCK();
+               }
 #endif
                if (n & (1 << NETISR_IP))
                        ipintr();
@@ -957,35 +959,52 @@ if_netisr(void *unused)
                        ip6intr();
 #endif
 #if NPPP > 0
-               if (n & (1 << NETISR_PPP))
+               if (n & (1 << NETISR_PPP)) {
+                       KERNEL_LOCK();
                        pppintr();
+                       KERNEL_UNLOCK();
+               }
 #endif
 #if NBRIDGE > 0
-               if (n & (1 << NETISR_BRIDGE))
+               if (n & (1 << NETISR_BRIDGE)) {
+                       KERNEL_LOCK();
                        bridgeintr();
+                       KERNEL_UNLOCK();
+               }
 #endif
 #if NSWITCH > 0
-               if (n & (1 << NETISR_SWITCH))
+               if (n & (1 << NETISR_SWITCH)) {
+                       KERNEL_LOCK();
                        switchintr();
+                       KERNEL_UNLOCK();
+               }
 #endif
 #if NPPPOE > 0
-               if (n & (1 << NETISR_PPPOE))
+               if (n & (1 << NETISR_PPPOE)) {
+                       KERNEL_LOCK();
                        pppoeintr();
+                       KERNEL_UNLOCK();
+               }
 #endif
 #ifdef PIPEX
-               if (n & (1 << NETISR_PIPEX))
+               if (n & (1 << NETISR_PIPEX)) {
+                       KERNEL_LOCK();
                        pipexintr();
+                       KERNEL_UNLOCK();
+               }
 #endif
                t |= n;
        }
 
 #if NPFSYNC > 0
-       if (t & (1 << NETISR_PFSYNC))
+       if (t & (1 << NETISR_PFSYNC)) {
+               KERNEL_LOCK();
                pfsyncintr();
+               KERNEL_UNLOCK();
+       }
 #endif
 
        NET_UNLOCK();
-       KERNEL_UNLOCK();
 }
 
 void
Index: sys/socketvar.h
===================================================================
RCS file: /cvs/src/sys/sys/socketvar.h,v
retrieving revision 1.79
diff -u -p -r1.79 socketvar.h
--- sys/socketvar.h     23 Nov 2017 13:45:46 -0000      1.79
+++ sys/socketvar.h     27 Nov 2017 10:48:45 -0000
@@ -98,6 +98,8 @@ struct socket {
  * Variables for socket buffering.
  */
        struct  sockbuf {
+/* The following fields are all zeroed on flush. */
+#define        sb_startzero    sb_cc
                u_long  sb_cc;          /* actual chars in buffer */
                u_long  sb_datacc;      /* data only chars in buffer */
                u_long  sb_hiwat;       /* max actual char count */
@@ -109,10 +111,13 @@ struct socket {
                struct mbuf *sb_mbtail; /* the last mbuf in the chain */
                struct mbuf *sb_lastrecord;/* first mbuf of last record in
                                              socket buffer */
+/* End area that is zeroed on flush. */
+#define        sb_endzero      sb_sel
                struct  selinfo sb_sel; /* process selecting read/write */
-               int     sb_flagsintr;   /* flags, changed during interrupt */
-               short   sb_flags;       /* flags, see below */
+               int     sb_flags ;      /* flags, see below */
+               short   __pad;
                u_short sb_timeo;       /* timeout for read/write */
+               struct  task sb_wtask;  /* delay csignal() and selwakeup() */
        } so_rcv, so_snd;
 #define        SB_MAX          (2*1024*1024)   /* default for max chars in 
sockbuf */
 #define        SB_LOCK         0x01            /* lock on data queue */
@@ -149,6 +154,7 @@ struct socket {
 #define        SS_CONNECTOUT           0x1000  /* connect, not accept, at this 
end */
 #define        SS_ISSENDING            0x2000  /* hint for lower layer */
 #define        SS_DNS                  0x4000  /* created using SOCK_DNS 
socket(2) */
+#define        SS_NONOTIF              0x8000  /* no notification allowed */
 
 #ifdef _KERNEL
 
@@ -169,7 +175,7 @@ void        soassertlocked(struct socket *);
 static inline int
 sb_notify(struct socket *so, struct sockbuf *sb)
 {
-       int flags = (sb->sb_flags | sb->sb_flagsintr);
+       int flags = sb->sb_flags;
 
        KASSERT(sb == &so->so_rcv || sb == &so->so_snd);
        soassertlocked(so);
@@ -329,6 +335,8 @@ int sosend(struct socket *so, struct mbu
 int    sosetopt(struct socket *so, int level, int optname, struct mbuf *m);
 int    soshutdown(struct socket *so, int how);
 void   sowakeup(struct socket *so, struct sockbuf *sb);
+void   sorwakeup_cb(void *);
+void   sowwakeup_cb(void *);
 void   sorwakeup(struct socket *);
 void   sowwakeup(struct socket *);
 int    sockargs(struct mbuf **, const void *, size_t, int);

Reply via email to