On 10/04/16 16:44, Martin Pieuchot wrote:
On 10/03/16 16:43, Martin Pieuchot wrote:
Diff below introduces a single write lock that will be used to serialize
access to ip_output().

This lock will be then split in multiple readers and writers to allow
multiple forwarding paths to run in parallel with each other but still
serialized with the socket layer.

I'm currently looking for people wanting to run this diff and try to
break it.  In other words, your machine might panic with it and if it
does report the panic to me so the diff can be improved.

I tested NFS v2 and v3 so I'm quite confident, but I might have missed
some obvious stuff.

Updated diff attached including a fix for syn_cache_timer(), a problem
reported by Chris Jackman.

Thanks to all testers!

Here's a newer version that includes a fix for rt_timer_timer() also
found by Chris Jackman.

Index: kern/kern_rwlock.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_rwlock.c,v
retrieving revision 1.27
diff -u -p -r1.27 kern_rwlock.c
--- kern/kern_rwlock.c	14 Mar 2015 07:33:42 -0000	1.27
+++ kern/kern_rwlock.c	5 Oct 2016 08:11:09 -0000
@@ -98,6 +98,12 @@ rw_enter_read(struct rwlock *rwl)
 		membar_enter();
 }
 
+#if 1
+#include <machine/db_machdep.h>
+#include <ddb/db_output.h>
+#include <ddb/db_interface.h>
+#endif
+
 void
 rw_enter_write(struct rwlock *rwl)
 {
@@ -108,6 +114,15 @@ rw_enter_write(struct rwlock *rwl)
 		rw_enter(rwl, RW_WRITE);
 	else
 		membar_enter();
+
+#if 1
+	if ((rwl == &netlock) && (splassert_ctl == 3)) {
+		printf("ENTER::%d::", cpu_number());
+		db_stack_trace_print(
+		    (db_expr_t)__builtin_frame_address(1),
+		    TRUE, 1, "", printf);
+	}
+#endif
 }
 
 void
@@ -129,6 +144,15 @@ rw_exit_write(struct rwlock *rwl)
 	unsigned long owner = rwl->rwl_owner;
 
 	rw_assert_wrlock(rwl);
+
+#if 1
+	if ((rwl == &netlock) && (splassert_ctl == 3)) {
+		printf("EXIT::%d::", cpu_number());
+		db_stack_trace_print(
+		    (db_expr_t)__builtin_frame_address(1),
+		    TRUE, 1, "", printf);
+	}
+#endif
 
 	membar_exit();
 	if (__predict_false((owner & RWLOCK_WAIT) ||
Index: kern/sys_socket.c
===================================================================
RCS file: /cvs/src/sys/kern/sys_socket.c,v
retrieving revision 1.21
diff -u -p -r1.21 sys_socket.c
--- kern/sys_socket.c	5 Dec 2015 10:11:53 -0000	1.21
+++ kern/sys_socket.c	5 Oct 2016 08:11:09 -0000
@@ -131,8 +131,10 @@ soo_poll(struct file *fp, int events, st
 {
 	struct socket *so = fp->f_data;
 	int revents = 0;
-	int s = splsoftnet();
+	int s;
 
+	rw_enter_write(&netlock);
+	s = splsoftnet();
 	if (events & (POLLIN | POLLRDNORM)) {
 		if (soreadable(so))
 			revents |= events & (POLLIN | POLLRDNORM);
@@ -159,6 +161,7 @@ soo_poll(struct file *fp, int events, st
 		}
 	}
 	splx(s);
+	rw_exit_write(&netlock);
 	return (revents);
 }
 
Index: kern/uipc_socket.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.161
diff -u -p -r1.161 uipc_socket.c
--- kern/uipc_socket.c	20 Sep 2016 14:27:43 -0000	1.161
+++ kern/uipc_socket.c	5 Oct 2016 08:11:10 -0000
@@ -123,6 +123,7 @@ socreate(int dom, struct socket **aso, i
 		return (EPROTONOSUPPORT);
 	if (prp->pr_type != type)
 		return (EPROTOTYPE);
+	rw_enter_write(&netlock);
 	s = splsoftnet();
 	so = pool_get(&socket_pool, PR_WAITOK | PR_ZERO);
 	TAILQ_INIT(&so->so_q0);
@@ -142,9 +143,11 @@ socreate(int dom, struct socket **aso, i
 		so->so_state |= SS_NOFDREF;
 		sofree(so);
 		splx(s);
+		rw_exit_write(&netlock);
 		return (error);
 	}
 	splx(s);
+	rw_exit_write(&netlock);
 	*aso = so;
 	return (0);
 }
@@ -152,11 +155,13 @@ socreate(int dom, struct socket **aso, i
 int
 sobind(struct socket *so, struct mbuf *nam, struct proc *p)
 {
-	int s = splsoftnet();
-	int error;
+	int s, error;
 
+	rw_enter_write(&netlock);
+	s = splsoftnet();
 	error = (*so->so_proto->pr_usrreq)(so, PRU_BIND, NULL, nam, NULL, p);
 	splx(s);
+	rw_exit_write(&netlock);
 	return (error);
 }
 
@@ -171,11 +176,13 @@ solisten(struct socket *so, int backlog)
 	if (isspliced(so) || issplicedback(so))
 		return (EOPNOTSUPP);
 #endif /* SOCKET_SPLICE */
+	rw_enter_write(&netlock);
 	s = splsoftnet();
 	error = (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, NULL, NULL, NULL,
 	    curproc);
 	if (error) {
 		splx(s);
+		rw_exit_write(&netlock);
 		return (error);
 	}
 	if (TAILQ_FIRST(&so->so_q) == NULL)
@@ -186,6 +193,7 @@ solisten(struct socket *so, int backlog)
 		backlog = sominconn;
 	so->so_qlimit = backlog;
 	splx(s);
+	rw_exit_write(&netlock);
 	return (0);
 }
 
@@ -196,6 +204,7 @@ solisten(struct socket *so, int backlog)
 void
 sofree(struct socket *so)
 {
+	rw_assert_wrlock(&netlock);
 	splsoftassert(IPL_SOFTNET);
 
 	if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
@@ -234,9 +243,10 @@ int
 soclose(struct socket *so)
 {
 	struct socket *so2;
-	int s = splsoftnet();		/* conservative */
-	int error = 0;
+	int s, error = 0;
 
+	rw_enter_write(&netlock);
+	s = splsoftnet();		/* conservative */
 	if (so->so_options & SO_ACCEPTCONN) {
 		while ((so2 = TAILQ_FIRST(&so->so_q0)) != NULL) {
 			(void) soqremque(so2, 0);
@@ -260,7 +270,7 @@ soclose(struct socket *so)
 			    (so->so_state & SS_NBIO))
 				goto drop;
 			while (so->so_state & SS_ISCONNECTED) {
-				error = tsleep(&so->so_timeo,
+				error = rwsleep(&so->so_timeo, &netlock,
 				    PSOCK | PCATCH, "netcls",
 				    so->so_linger * hz);
 				if (error)
@@ -281,6 +291,7 @@ discard:
 	so->so_state |= SS_NOFDREF;
 	sofree(so);
 	splx(s);
+	rw_exit_write(&netlock);
 	return (error);
 }
 
@@ -290,6 +301,7 @@ discard:
 int
 soabort(struct socket *so)
 {
+	rw_assert_wrlock(&netlock);
 	splsoftassert(IPL_SOFTNET);
 
 	return (*so->so_proto->pr_usrreq)(so, PRU_ABORT, NULL, NULL, NULL,
@@ -301,6 +313,7 @@ soaccept(struct socket *so, struct mbuf 
 {
 	int error = 0;
 
+	rw_assert_wrlock(&netlock);
 	splsoftassert(IPL_SOFTNET);
 
 	if ((so->so_state & SS_NOFDREF) == 0)
@@ -318,11 +331,11 @@ soaccept(struct socket *so, struct mbuf 
 int
 soconnect(struct socket *so, struct mbuf *nam)
 {
-	int s;
-	int error;
+	int s, error;
 
 	if (so->so_options & SO_ACCEPTCONN)
 		return (EOPNOTSUPP);
+	rw_enter_write(&netlock);
 	s = splsoftnet();
 	/*
 	 * If protocol is connection-based, can only connect once.
@@ -338,18 +351,21 @@ soconnect(struct socket *so, struct mbuf
 		error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT,
 		    NULL, nam, NULL, curproc);
 	splx(s);
+	rw_exit_write(&netlock);
 	return (error);
 }
 
 int
 soconnect2(struct socket *so1, struct socket *so2)
 {
-	int s = splsoftnet();
-	int error;
+	int s, error;
 
+	rw_enter_write(&netlock);
+	s = splsoftnet();
 	error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2, NULL,
 	    (struct mbuf *)so2, NULL, curproc);
 	splx(s);
+	rw_exit_write(&netlock);
 	return (error);
 }
 
@@ -358,14 +374,20 @@ sodisconnect(struct socket *so)
 {
 	int error;
 
+	rw_assert_wrlock(&netlock);
 	splsoftassert(IPL_SOFTNET);
 
-	if ((so->so_state & SS_ISCONNECTED) == 0)
-		return (ENOTCONN);
-	if (so->so_state & SS_ISDISCONNECTING)
-		return (EALREADY);
+	if ((so->so_state & SS_ISCONNECTED) == 0) {
+		error = ENOTCONN;
+		goto bad;
+	}
+	if (so->so_state & SS_ISDISCONNECTING) {
+		error = EALREADY;
+		goto bad;
+	}
 	error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT, NULL, NULL,
 	    NULL, curproc);
+bad:
 	return (error);
 }
 
@@ -426,21 +448,21 @@ sosend(struct socket *so, struct mbuf *a
 			    (sizeof(struct file *) / sizeof(int)));
 	}
 
-#define	snderr(errno)	{ error = errno; splx(s); goto release; }
+#define	snderr(e) { error = e; splx(s); rw_exit_write(&netlock); goto release; }
 
 restart:
 	if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0)
 		goto out;
 	so->so_state |= SS_ISSENDING;
 	do {
+		rw_enter_write(&netlock);
 		s = splsoftnet();
 		if (so->so_state & SS_CANTSENDMORE)
 			snderr(EPIPE);
 		if (so->so_error) {
 			error = so->so_error;
 			so->so_error = 0;
-			splx(s);
-			goto release;
+			snderr(error);
 		}
 		if ((so->so_state & SS_ISCONNECTED) == 0) {
 			if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
@@ -465,11 +487,13 @@ restart:
 			error = sbwait(&so->so_snd);
 			so->so_state &= ~SS_ISSENDING;
 			splx(s);
+			rw_exit_write(&netlock);
 			if (error)
 				goto out;
 			goto restart;
 		}
 		splx(s);
+		rw_exit_write(&netlock);
 		space -= clen;
 		do {
 			if (uio == NULL) {
@@ -489,6 +513,7 @@ restart:
 				if (flags & MSG_EOR)
 					top->m_flags |= M_EOR;
 			}
+			rw_enter_write(&netlock);
 			s = splsoftnet();		/* XXX */
 			if (resid == 0)
 				so->so_state &= ~SS_ISSENDING;
@@ -496,6 +521,7 @@ restart:
 			    (flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND,
 			    top, addr, control, curproc);
 			splx(s);
+			rw_exit_write(&netlock);
 			clen = 0;
 			control = NULL;
 			top = NULL;
@@ -625,8 +651,8 @@ sbsync(struct sockbuf *sb, struct mbuf *
  * must begin with an address if the protocol so specifies,
  * followed by an optional mbuf or mbufs containing ancillary data,
  * and then zero or more mbufs of data.
- * In order to avoid blocking network interrupts for the entire time here,
- * we splx() while doing the actual copy to user space.
+ * In order to avoid blocking network for the entire time here, we splx()
+ * and release ``netlock'' while doing the actual copy to user space.
  * Although the sockbuf is locked, new data may still be appended,
  * and thus we must maintain consistency of the sockbuf during that time.
  *
@@ -680,6 +706,8 @@ bad:
 restart:
 	if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0)
 		return (error);
+
+	rw_enter_write(&netlock);
 	s = splsoftnet();
 
 	m = so->so_rcv.sb_mb;
@@ -746,6 +774,7 @@ restart:
 		sbunlock(&so->so_rcv);
 		error = sbwait(&so->so_rcv);
 		splx(s);
+		rw_exit_write(&netlock);
 		if (error)
 			return (error);
 		goto restart;
@@ -880,7 +909,9 @@ dontblock:
 			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
 			resid = uio->uio_resid;
 			splx(s);
+			rw_exit_write(&netlock);
 			uio_error = uiomove(mtod(m, caddr_t) + moff, len, uio);
+			rw_enter_write(&netlock);
 			s = splsoftnet();
 			if (uio_error)
 				uio->uio_resid = resid - len;
@@ -964,6 +995,7 @@ dontblock:
 			if (error) {
 				sbunlock(&so->so_rcv);
 				splx(s);
+				rw_exit_write(&netlock);
 				return (0);
 			}
 			if ((m = so->so_rcv.sb_mb) != NULL)
@@ -1000,6 +1032,7 @@ dontblock:
 	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
 		sbunlock(&so->so_rcv);
 		splx(s);
+		rw_exit_write(&netlock);
 		goto restart;
 	}
 
@@ -1011,6 +1044,7 @@ dontblock:
 release:
 	sbunlock(&so->so_rcv);
 	splx(s);
+	rw_exit_write(&netlock);
 	return (error);
 }
 
@@ -1020,6 +1054,7 @@ soshutdown(struct socket *so, int how)
 	struct protosw *pr = so->so_proto;
 	int s, error = 0;
 
+	rw_enter_write(&netlock);
 	s = splsoftnet();
 	switch (how) {
 	case SHUT_RD:
@@ -1037,6 +1072,8 @@ soshutdown(struct socket *so, int how)
 		break;
 	}
 	splx(s);
+	rw_exit_write(&netlock);
+
 	return (error);
 }
 
@@ -1050,6 +1087,7 @@ sorflush(struct socket *so)
 
 	sb->sb_flags |= SB_NOINTR;
 	(void) sblock(sb, M_WAITOK);
+	/* XXXSMP */
 	s = splnet();
 	socantrcvmore(so);
 	sbunlock(sb);
@@ -1103,10 +1141,12 @@ sosplice(struct socket *so, int fd, off_
 		if ((error = sblock(&so->so_rcv,
 		    (so->so_state & SS_NBIO) ? M_NOWAIT : M_WAITOK)) != 0)
 			return (error);
+		rw_enter_write(&netlock);
 		s = splsoftnet();
 		if (so->so_sp->ssp_socket)
 			sounsplice(so, so->so_sp->ssp_socket, 1);
 		splx(s);
+		rw_exit_write(&netlock);
 		sbunlock(&so->so_rcv);
 		return (0);
 	}
@@ -1135,6 +1175,7 @@ sosplice(struct socket *so, int fd, off_
 		FRELE(fp, curproc);
 		return (error);
 	}
+	rw_enter_write(&netlock);
 	s = splsoftnet();
 
 	if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) {
@@ -1177,6 +1218,7 @@ sosplice(struct socket *so, int fd, off_
 
  release:
 	splx(s);
+	rw_exit_write(&netlock);
 	sbunlock(&sosp->so_snd);
 	sbunlock(&so->so_rcv);
 	FRELE(fp, curproc);
@@ -1186,6 +1228,7 @@ sosplice(struct socket *so, int fd, off_
 void
 sounsplice(struct socket *so, struct socket *sosp, int wakeup)
 {
+	rw_assert_wrlock(&netlock);
 	splsoftassert(IPL_SOFTNET);
 
 	task_del(sosplice_taskq, &so->so_splicetask);
@@ -1203,12 +1246,14 @@ soidle(void *arg)
 	struct socket *so = arg;
 	int s;
 
+	rw_enter_write(&netlock);
 	s = splsoftnet();
 	if (so->so_rcv.sb_flagsintr & SB_SPLICE) {
 		so->so_error = ETIMEDOUT;
 		sounsplice(so, so->so_sp->ssp_socket, 1);
 	}
 	splx(s);
+	rw_exit_write(&netlock);
 }
 
 void
@@ -1217,6 +1262,7 @@ sotask(void *arg)
 	struct socket *so = arg;
 	int s;
 
+	rw_enter_write(&netlock);
 	s = splsoftnet();
 	if (so->so_rcv.sb_flagsintr & SB_SPLICE) {
 		/*
@@ -1227,6 +1273,7 @@ sotask(void *arg)
 		somove(so, M_DONTWAIT);
 	}
 	splx(s);
+	rw_exit_write(&netlock);
 
 	/* Avoid user land starvation. */
 	yield();
@@ -1248,6 +1295,7 @@ somove(struct socket *so, int wait)
 	int		 error = 0, maxreached = 0;
 	short		 state;
 
+	rw_assert_wrlock(&netlock);
 	splsoftassert(IPL_SOFTNET);
 
  nextpkt:
@@ -1510,6 +1558,7 @@ somove(struct socket *so, int wait)
 void
 sorwakeup(struct socket *so)
 {
+	rw_assert_wrlock(&netlock);
 	splsoftassert(IPL_SOFTNET);
 
 #ifdef SOCKET_SPLICE
@@ -1531,13 +1580,17 @@ sorwakeup(struct socket *so)
 		return;
 #endif
 	sowakeup(so, &so->so_rcv);
-	if (so->so_upcall)
+	if (so->so_upcall) {
+		rw_exit_write(&netlock);
 		(*(so->so_upcall))(so, so->so_upcallarg, M_DONTWAIT);
+		rw_enter_write(&netlock);
+	}
 }
 
 void
 sowwakeup(struct socket *so)
 {
+	rw_assert_wrlock(&netlock);
 	splsoftassert(IPL_SOFTNET);
 
 #ifdef SOCKET_SPLICE
@@ -1884,7 +1937,8 @@ soo_kqfilter(struct file *fp, struct kno
 {
 	struct socket *so = kn->kn_fp->f_data;
 	struct sockbuf *sb;
-	int s;
+
+	KERNEL_ASSERT_LOCKED();
 
 	switch (kn->kn_filter) {
 	case EVFILT_READ:
@@ -1902,10 +1956,9 @@ soo_kqfilter(struct file *fp, struct kno
 		return (EINVAL);
 	}
 
-	s = splnet();
 	SLIST_INSERT_HEAD(&sb->sb_sel.si_note, kn, kn_selnext);
 	sb->sb_flags |= SB_KNOTE;
-	splx(s);
+
 	return (0);
 }
 
@@ -1913,12 +1966,12 @@ void
 filt_sordetach(struct knote *kn)
 {
 	struct socket *so = kn->kn_fp->f_data;
-	int s = splnet();
+
+	KERNEL_ASSERT_LOCKED();
 
 	SLIST_REMOVE(&so->so_rcv.sb_sel.si_note, kn, knote, kn_selnext);
 	if (SLIST_EMPTY(&so->so_rcv.sb_sel.si_note))
 		so->so_rcv.sb_flags &= ~SB_KNOTE;
-	splx(s);
 }
 
 int
@@ -1947,12 +2000,12 @@ void
 filt_sowdetach(struct knote *kn)
 {
 	struct socket *so = kn->kn_fp->f_data;
-	int s = splnet();
+
+	KERNEL_ASSERT_LOCKED();
 
 	SLIST_REMOVE(&so->so_snd.sb_sel.si_note, kn, knote, kn_selnext);
 	if (SLIST_EMPTY(&so->so_snd.sb_sel.si_note))
 		so->so_snd.sb_flags &= ~SB_KNOTE;
-	splx(s);
 }
 
 int
Index: kern/uipc_socket2.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket2.c,v
retrieving revision 1.65
diff -u -p -r1.65 uipc_socket2.c
--- kern/uipc_socket2.c	2 Sep 2016 13:28:21 -0000	1.65
+++ kern/uipc_socket2.c	5 Oct 2016 08:11:10 -0000
@@ -138,8 +138,6 @@ soisdisconnected(struct socket *so)
  * then we allocate a new structure, properly linked into the
  * data structure of the original socket, and return this.
  * Connstatus may be 0 or SS_ISCONNECTED.
- *
- * Must be called at splsoftnet()
  */
 struct socket *
 sonewconn(struct socket *head, int connstatus)
@@ -147,6 +145,7 @@ sonewconn(struct socket *head, int conns
 	struct socket *so;
 	int soqueue = connstatus ? 1 : 0;
 
+	rw_assert_wrlock(&netlock);
 	splsoftassert(IPL_SOFTNET);
 
 	if (mclpools[0].pr_nout > mclpools[0].pr_hardlimit * 95 / 100)
@@ -276,10 +275,11 @@ socantrcvmore(struct socket *so)
 int
 sbwait(struct sockbuf *sb)
 {
+	rw_assert_wrlock(&netlock);
 	splsoftassert(IPL_SOFTNET);
 
 	sb->sb_flagsintr |= SB_WAIT;
-	return (tsleep(&sb->sb_cc,
+	return (rwsleep(&sb->sb_cc, &netlock,
 	    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "netio",
 	    sb->sb_timeo));
 }
@@ -317,7 +317,8 @@ sbunlock(struct sockbuf *sb)
 void
 sowakeup(struct socket *so, struct sockbuf *sb)
 {
-	int s = splsoftnet();
+	rw_assert_wrlock(&netlock);
+	splassert(IPL_SOFTNET);
 
 	selwakeup(&sb->sb_sel);
 	sb->sb_flagsintr &= ~SB_SEL;
@@ -325,7 +326,7 @@ sowakeup(struct socket *so, struct sockb
 		sb->sb_flagsintr &= ~SB_WAIT;
 		wakeup(&sb->sb_cc);
 	}
-	splx(s);
+
 	if (so->so_state & SS_ASYNC)
 		csignal(so->so_pgid, SIGIO, so->so_siguid, so->so_sigeuid);
 }
Index: kern/uipc_syscalls.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_syscalls.c,v
retrieving revision 1.133
diff -u -p -r1.133 uipc_syscalls.c
--- kern/uipc_syscalls.c	9 Aug 2016 02:25:35 -0000	1.133
+++ kern/uipc_syscalls.c	5 Oct 2016 08:11:10 -0000
@@ -250,6 +250,7 @@ doaccept(struct proc *p, int sock, struc
 	if ((error = getsock(p, sock, &fp)) != 0)
 		return (error);
 
+	rw_enter_write(&netlock);
 	s = splsoftnet();
 	headfp = fp;
 	head = fp->f_data;
@@ -275,7 +276,8 @@ redo:
 			head->so_error = ECONNABORTED;
 			break;
 		}
-		error = tsleep(&head->so_timeo, PSOCK | PCATCH, "netcon", 0);
+		error = rwsleep(&head->so_timeo, &netlock, PSOCK | PCATCH,
+		    "netcon", 0);
 		if (error) {
 			goto bad;
 		}
@@ -352,6 +354,7 @@ redo:
 	m_freem(nam);
 bad:
 	splx(s);
+	rw_exit_write(&netlock);
 	FRELE(headfp, p);
 	return (error);
 }
@@ -406,9 +409,11 @@ sys_connect(struct proc *p, void *v, reg
 		m_freem(nam);
 		return (EINPROGRESS);
 	}
+	rw_enter_write(&netlock);
 	s = splsoftnet();
 	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
-		error = tsleep(&so->so_timeo, PSOCK | PCATCH, "netcon2", 0);
+		error = rwsleep(&so->so_timeo, &netlock, PSOCK | PCATCH,
+		    "netcon2", 0);
 		if (error) {
 			if (error == EINTR || error == ERESTART)
 				interrupted = 1;
@@ -420,6 +425,7 @@ sys_connect(struct proc *p, void *v, reg
 		so->so_error = 0;
 	}
 	splx(s);
+	rw_exit_write(&netlock);
 bad:
 	if (!interrupted)
 		so->so_state &= ~SS_ISCONNECTING;
Index: kern/uipc_usrreq.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v
retrieving revision 1.102
diff -u -p -r1.102 uipc_usrreq.c
--- kern/uipc_usrreq.c	26 Aug 2016 07:12:30 -0000	1.102
+++ kern/uipc_usrreq.c	5 Oct 2016 08:11:10 -0000
@@ -131,7 +131,10 @@ uipc_usrreq(struct socket *so, int req, 
 		break;
 
 	case PRU_BIND:
+		rw_assert_wrlock(&netlock);
+		rw_exit_write(&netlock);
 		error = unp_bind(unp, nam, p);
+		rw_enter_write(&netlock);
 		break;
 
 	case PRU_LISTEN:
Index: net/if.c
===================================================================
RCS file: /cvs/src/sys/net/if.c,v
retrieving revision 1.452
diff -u -p -r1.452 if.c
--- net/if.c	3 Oct 2016 12:26:13 -0000	1.452
+++ net/if.c	5 Oct 2016 08:11:11 -0000
@@ -163,7 +163,13 @@ void	if_netisr(void *);
 void	ifa_print_all(void);
 #endif
 
-void	if_start_locked(struct ifnet *ifp);
+void	if_start_locked(struct ifnet *);
+int	if_ioctl_locked(struct socket *, u_long, caddr_t, struct proc *);
+
+/*
+ * Network lock: serialize socket operations.
+ */
+struct rwlock netlock = RWLOCK_INITIALIZER("netlock");
 
 /*
  * interface index map
@@ -836,10 +842,16 @@ if_netisr(void *unused)
 	int s;
 
 	KERNEL_LOCK();
+	rw_enter_write(&netlock);
 	s = splsoftnet();
 
 	while ((n = netisr) != 0) {
-		sched_pause();
+		/* Like sched_pause() but with a rwlock dance. */
+		if (curcpu()->ci_schedstate.spc_schedflags & SPCF_SHOULDYIELD) {
+			rw_exit_write(&netlock);
+			yield();
+			rw_enter_write(&netlock);
+		}
 
 		atomic_clearbits_int(&netisr, n);
 
@@ -878,6 +890,7 @@ if_netisr(void *unused)
 #endif
 
 	splx(s);
+	rw_exit_write(&netlock);
 	KERNEL_UNLOCK();
 }
 
@@ -1435,6 +1448,7 @@ if_downall(void)
 	struct ifnet *ifp;
 	int s;
 
+	rw_enter_write(&netlock);
 	s = splnet();
 	TAILQ_FOREACH(ifp, &ifnet, if_list) {
 		if ((ifp->if_flags & IFF_UP) == 0)
@@ -1449,6 +1463,7 @@ if_downall(void)
 		}
 	}
 	splx(s);
+	rw_exit_write(&netlock);
 }
 
 /*
@@ -1508,9 +1523,11 @@ if_linkstate_task(void *xifidx)
 	if (ifp == NULL)
 		return;
 
+	rw_enter_write(&netlock);
 	s = splsoftnet();
 	if_linkstate(ifp);
 	splx(s);
+	rw_exit_write(&netlock);
 
 	if_put(ifp);
 }
@@ -1518,6 +1535,7 @@ if_linkstate_task(void *xifidx)
 void
 if_linkstate(struct ifnet *ifp)
 {
+	rw_assert_wrlock(&netlock);
 	splsoftassert(IPL_SOFTNET);
 
 	rt_ifmsg(ifp);
@@ -1708,6 +1726,18 @@ if_setrdomain(struct ifnet *ifp, int rdo
  */
 int
 ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p)
+{
+	int error;
+
+	rw_enter_write(&netlock);
+	error = if_ioctl_locked(so, cmd, data, p);
+	rw_exit_write(&netlock);
+
+	return (error);
+}
+
+int
+if_ioctl_locked(struct socket *so, u_long cmd, caddr_t data, struct proc *p)
 {
 	struct ifnet *ifp;
 	struct ifreq *ifr;
Index: net/rtsock.c
===================================================================
RCS file: /cvs/src/sys/net/rtsock.c,v
retrieving revision 1.207
diff -u -p -r1.207 rtsock.c
--- net/rtsock.c	27 Sep 2016 18:41:11 -0000	1.207
+++ net/rtsock.c	5 Oct 2016 08:11:11 -0000
@@ -296,6 +296,7 @@ route_ctloutput(int op, struct socket *s
 	return (error);
 }
 
+/* XXXSMP */
 void
 rt_senddesync(void *data)
 {
Index: net/route.c
===================================================================
RCS file: /cvs/src/sys/net/route.c,v
retrieving revision 1.332
diff -u -p -r1.332 route.c
--- net/route.c	5 Oct 2016 08:07:46 -0000	1.332
+++ net/route.c	5 Oct 2016 08:11:11 -0000
@@ -533,8 +533,6 @@ ifafree(struct ifaddr *ifa)
  * destination to go through the given gateway.
  * Normally called as a result of a routing redirect
  * message from the network layer.
- *
- * N.B.: must be called at splsoftnet
  */
 void
 rtredirect(struct sockaddr *dst, struct sockaddr *gateway,
@@ -549,6 +547,7 @@ rtredirect(struct sockaddr *dst, struct 
 	int 			 flags = RTF_GATEWAY|RTF_HOST;
 	uint8_t			 prio = RTP_NONE;
 
+	rw_assert_wrlock(&netlock);
 	splsoftassert(IPL_SOFTNET);
 
 	/* verify the gateway is directly reachable */
@@ -1592,6 +1591,7 @@ rt_timer_timer(void *arg)
 
 	current_time = time_uptime;
 
+	rw_enter_write(&netlock);
 	s = splsoftnet();
 	for (rtq = LIST_FIRST(&rttimer_queue_head); rtq != NULL;
 	     rtq = LIST_NEXT(rtq, rtq_link)) {
@@ -1608,6 +1608,7 @@ rt_timer_timer(void *arg)
 		}
 	}
 	splx(s);
+	rw_exit_write(&netlock);
 
 	timeout_add_sec(to, 1);
 }
Index: netinet/ip_carp.c
===================================================================
RCS file: /cvs/src/sys/netinet/ip_carp.c,v
retrieving revision 1.294
diff -u -p -r1.294 ip_carp.c
--- netinet/ip_carp.c	4 Oct 2016 13:54:32 -0000	1.294
+++ netinet/ip_carp.c	5 Oct 2016 08:11:11 -0000
@@ -1045,6 +1045,7 @@ carp_send_ad(void *v)
 		return;
 	}
 
+	rw_enter_write(&netlock);
 	s = splsoftnet();
 
 	/* bow out if we've gone to backup (the carp interface is going down) */
@@ -1247,6 +1248,7 @@ carp_send_ad(void *v)
 retry_later:
 	sc->cur_vhe = NULL;
 	splx(s);
+	rw_exit_write(&netlock);
 	if (advbase != 255 || advskew != 255)
 		timeout_add(&vhe->ad_tmo, tvtohz(&tv));
 }
Index: netinet/ip_input.c
===================================================================
RCS file: /cvs/src/sys/netinet/ip_input.c,v
retrieving revision 1.282
diff -u -p -r1.282 ip_input.c
--- netinet/ip_input.c	22 Sep 2016 10:12:25 -0000	1.282
+++ netinet/ip_input.c	5 Oct 2016 08:11:11 -0000
@@ -1755,12 +1755,17 @@ ip_send_dispatch(void *xmq)
 	int s;
 
 	mq_delist(mq, &ml);
+	if (ml_empty(&ml))
+		return;
+
 	KERNEL_LOCK();
+	rw_enter_write(&netlock);
 	s = splsoftnet();
 	while ((m = ml_dequeue(&ml)) != NULL) {
 		ip_output(m, NULL, NULL, 0, NULL, NULL, 0);
 	}
 	splx(s);
+	rw_exit_write(&netlock);
 	KERNEL_UNLOCK();
 }
 
Index: netinet/ip_output.c
===================================================================
RCS file: /cvs/src/sys/netinet/ip_output.c,v
retrieving revision 1.327
diff -u -p -r1.327 ip_output.c
--- netinet/ip_output.c	4 Sep 2016 17:18:56 -0000	1.327
+++ netinet/ip_output.c	5 Oct 2016 08:11:11 -0000
@@ -109,6 +109,9 @@ ip_output(struct mbuf *m0, struct mbuf *
 	int rv;
 #endif
 
+	/* Make sure this thread hold the correct lock. */
+	KASSERT(rw_status(&netlock) == RW_WRITE);
+
 #ifdef IPSEC
 	if (inp && (inp->inp_flags & INP_IPV6) != 0)
 		panic("ip_output: IPv6 pcb is passed");
Index: netinet/tcp_timer.c
===================================================================
RCS file: /cvs/src/sys/netinet/tcp_timer.c,v
retrieving revision 1.50
diff -u -p -r1.50 tcp_timer.c
--- netinet/tcp_timer.c	24 Sep 2016 14:51:37 -0000	1.50
+++ netinet/tcp_timer.c	5 Oct 2016 08:11:11 -0000
@@ -112,15 +112,15 @@ tcp_delack(void *arg)
 	 * for whatever reason, it will restart the delayed
 	 * ACK callout.
 	 */
-
+	rw_enter_write(&netlock);
 	s = splsoftnet();
-	if (tp->t_flags & TF_DEAD) {
-		splx(s);
-		return;
-	}
+	if (tp->t_flags & TF_DEAD)
+		goto out;
 	tp->t_flags |= TF_ACKNOW;
 	(void) tcp_output(tp);
+ out:
 	splx(s);
+	rw_exit_write(&netlock);
 }
 
 /*
@@ -193,11 +193,10 @@ tcp_timer_rexmt(void *arg)
 	uint32_t rto;
 	int s;
 
+	rw_enter_write(&netlock);
 	s = splsoftnet();
-	if (tp->t_flags & TF_DEAD) {
-		splx(s);
-		return;
-	}
+	if (tp->t_flags & TF_DEAD)
+		goto out;
 
 	if ((tp->t_flags & TF_PMTUD_PEND) && tp->t_inpcb &&
 	    SEQ_GEQ(tp->t_pmtud_th_seq, tp->snd_una) &&
@@ -224,8 +223,7 @@ tcp_timer_rexmt(void *arg)
 		sin.sin_addr = tp->t_inpcb->inp_faddr;
 		in_pcbnotifyall(&tcbtable, sintosa(&sin),
 		    tp->t_inpcb->inp_rtableid, EMSGSIZE, tcp_mtudisc);
-		splx(s);
-		return;
+		goto out;
 	}
 
 #ifdef TCP_SACK
@@ -377,6 +375,7 @@ tcp_timer_rexmt(void *arg)
 
  out:
 	splx(s);
+	rw_exit_write(&netlock);
 }
 
 void
@@ -386,11 +385,11 @@ tcp_timer_persist(void *arg)
 	uint32_t rto;
 	int s;
 
+	rw_enter_write(&netlock);
 	s = splsoftnet();
 	if ((tp->t_flags & TF_DEAD) ||
             TCP_TIMER_ISARMED(tp, TCPT_REXMT)) {
-		splx(s);
-		return;
+		goto out;
 	}
 	tcpstat.tcps_persisttimeo++;
 	/*
@@ -416,6 +415,7 @@ tcp_timer_persist(void *arg)
 	tp->t_force = 0;
  out:
 	splx(s);
+	rw_exit_write(&netlock);
 }
 
 void
@@ -424,11 +424,10 @@ tcp_timer_keep(void *arg)
 	struct tcpcb *tp = arg;
 	int s;
 
+	rw_enter_write(&netlock);
 	s = splsoftnet();
-	if (tp->t_flags & TF_DEAD) {
-		splx(s);
-		return;
-	}
+	if (tp->t_flags & TF_DEAD)
+		goto out;
 
 	tcpstat.tcps_keeptimeo++;
 	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
@@ -457,8 +456,9 @@ tcp_timer_keep(void *arg)
 		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepintvl);
 	} else
 		TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
-
+ out:
 	splx(s);
+	rw_exit_write(&netlock);
 	return;
 
  dropit:
@@ -466,6 +466,7 @@ tcp_timer_keep(void *arg)
 	tp = tcp_drop(tp, ETIMEDOUT);
 
 	splx(s);
+	rw_exit_write(&netlock);
 }
 
 void
@@ -474,11 +475,10 @@ tcp_timer_2msl(void *arg)
 	struct tcpcb *tp = arg;
 	int s;
 
+	rw_enter_write(&netlock);
 	s = splsoftnet();
-	if (tp->t_flags & TF_DEAD) {
-		splx(s);
-		return;
-	}
+	if (tp->t_flags & TF_DEAD)
+		goto out;
 
 #ifdef TCP_SACK
 	tcp_timer_freesack(tp);
@@ -490,5 +490,7 @@ tcp_timer_2msl(void *arg)
 	else
 		tp = tcp_close(tp);
 
+ out:
 	splx(s);
+	rw_exit_write(&netlock);
 }
Index: netinet/tcp_input.c
===================================================================
RCS file: /cvs/src/sys/netinet/tcp_input.c,v
retrieving revision 1.329
diff -u -p -r1.329 tcp_input.c
--- netinet/tcp_input.c	4 Oct 2016 13:56:50 -0000	1.329
+++ netinet/tcp_input.c	5 Oct 2016 08:11:12 -0000
@@ -3522,11 +3522,10 @@ syn_cache_timer(void *arg)
 	struct syn_cache *sc = arg;
 	int s;
 
+	rw_enter_write(&netlock);
 	s = splsoftnet();
-	if (sc->sc_flags & SCF_DEAD) {
-		splx(s);
-		return;
-	}
+	if (sc->sc_flags & SCF_DEAD)
+		goto out;
 
 	if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) {
 		/* Drop it -- too many retransmissions. */
@@ -3549,7 +3548,9 @@ syn_cache_timer(void *arg)
 	sc->sc_rxtshift++;
 	SYN_CACHE_TIMER_ARM(sc);
 
+ out:
 	splx(s);
+	rw_exit_write(&netlock);
 	return;
 
  dropit:
@@ -3557,6 +3558,7 @@ syn_cache_timer(void *arg)
 	syn_cache_rm(sc);
 	syn_cache_put(sc);
 	splx(s);
+	rw_exit_write(&netlock);
 }
 
 void
Index: netinet6/ip6_input.c
===================================================================
RCS file: /cvs/src/sys/netinet6/ip6_input.c,v
retrieving revision 1.168
diff -u -p -r1.168 ip6_input.c
--- netinet6/ip6_input.c	24 Aug 2016 09:41:12 -0000	1.168
+++ netinet6/ip6_input.c	5 Oct 2016 08:11:12 -0000
@@ -1429,12 +1429,17 @@ ip6_send_dispatch(void *xmq)
 	int s;
 
 	mq_delist(mq, &ml);
+	if (ml_empty(&ml))
+		return;
+
 	KERNEL_LOCK();
+	rw_enter_write(&netlock);
 	s = splsoftnet();
 	while ((m = ml_dequeue(&ml)) != NULL) {
 		ip6_output(m, NULL, NULL, IPV6_MINMTU, NULL, NULL);
 	}
 	splx(s);
+	rw_exit_write(&netlock);
 	KERNEL_UNLOCK();
 }
 
Index: sys/systm.h
===================================================================
RCS file: /cvs/src/sys/sys/systm.h,v
retrieving revision 1.119
diff -u -p -r1.119 systm.h
--- sys/systm.h	24 Sep 2016 18:35:52 -0000	1.119
+++ sys/systm.h	5 Oct 2016 08:11:12 -0000
@@ -290,6 +290,11 @@ struct uio;
 int	uiomove(void *, size_t, struct uio *);
 
 #if defined(_KERNEL)
+/*
+ * Network lock: serialize socket operations.
+ */
+extern struct rwlock netlock;
+
 __returns_twice int	setjmp(label_t *);
 __dead void	longjmp(label_t *);
 #endif

Reply via email to