Diff below moves IPv4 & IPv6 incoming/forwarding path, PIPEX ppp processing and IPv4 & IPv6 dispatch functions outside the KERNEL_LOCK().
We currently rely on the NET_LOCK() serializing access to most global data structures for that. IP input queues are no longer used in the forwarding case. They still exist as boundary between the network and transport layers because TCP/UDP & friends still need the KERNEL_LOCK(). Since we do not want to grab the NET_LOCK() for every packet, the softnet thread will do it once before processing a batch. That means the L2 processing path, which is currently running without lock, will now run with the NET_LOCK(). IPsec is the bridge of this layer. A bad player. Since IPsec isn't ready to run without KERNEL_LOCK(), the softnet thread will grab the KERNEL_LOCK() as soon as ``ipsec_in_use'' is set. I tried to document as much as possible the current design in my commit messages and in the comment below. Please ask if something isn't clear. Tests and ok welcome. Index: net/if.c =================================================================== RCS file: /cvs/src/sys/net/if.c,v retrieving revision 1.502 diff -u -p -r1.502 if.c --- net/if.c 30 May 2017 07:50:37 -0000 1.502 +++ net/if.c 30 May 2017 08:34:49 -0000 @@ -874,7 +874,10 @@ if_input_process(void *xifidx) struct ifnet *ifp; struct ifih *ifih; struct srp_ref sr; - int s; + int s, s2; +#ifdef IPSEC + int locked = 0; +#endif /* IPSEC */ ifp = if_get(ifidx); if (ifp == NULL) @@ -887,6 +890,32 @@ if_input_process(void *xifidx) if (!ISSET(ifp->if_xflags, IFXF_CLONED)) add_net_randomness(ml_len(&ml)); +#ifdef IPSEC + /* + * IPsec is not ready to run without KERNEL_LOCK(). So all + * the traffic on your machine is punished if you have IPsec + * enabled. + */ + extern int ipsec_in_use; + if (ipsec_in_use) { + KERNEL_LOCK(); + locked = 1; + } +#endif /* IPSEC */ + + /* + * We grab the NET_LOCK() before processing any packet to + * ensure there's no contention on the routing table lock. + * + * Without it we could race with a userland thread to insert + * a L2 entry in ip{6,}_output(). 
Such race would result in + * one of the threads sleeping *inside* the IP output path. + * + * Since we have a NET_LOCK() we also use it to serialize access + * to PF globals, pipex globals, unicast and multicast addresses + * lists. + */ + NET_LOCK(s2); s = splnet(); while ((m = ml_dequeue(&ml)) != NULL) { /* @@ -903,7 +932,12 @@ if_input_process(void *xifidx) m_freem(m); } splx(s); + NET_UNLOCK(s2); +#ifdef IPSEC + if (locked) + KERNEL_UNLOCK(); +#endif /* IPSEC */ out: if_put(ifp); } Index: net/if_ethersubr.c =================================================================== RCS file: /cvs/src/sys/net/if_ethersubr.c,v retrieving revision 1.245 diff -u -p -r1.245 if_ethersubr.c --- net/if_ethersubr.c 30 May 2017 07:50:37 -0000 1.245 +++ net/if_ethersubr.c 30 May 2017 08:34:49 -0000 @@ -416,15 +416,11 @@ decapsulate: #ifdef PIPEX if (pipex_enable) { struct pipex_session *session; - int s; - NET_LOCK(s); if ((session = pipex_pppoe_lookup_session(m)) != NULL) { pipex_pppoe_input(m, session); - NET_UNLOCK(s); return (1); } - NET_UNLOCK(s); } #endif if (etype == ETHERTYPE_PPPOEDISC) Index: net/if_switch.c =================================================================== RCS file: /cvs/src/sys/net/if_switch.c,v retrieving revision 1.19 diff -u -p -r1.19 if_switch.c --- net/if_switch.c 12 May 2017 13:40:29 -0000 1.19 +++ net/if_switch.c 30 May 2017 08:34:49 -0000 @@ -388,9 +388,8 @@ switch_ioctl(struct ifnet *ifp, unsigned struct bstp_port *bp; struct ifnet *ifs; struct switch_port *swpo; - int s, error = 0; + int error = 0; - s = splnet(); switch (cmd) { case SIOCBRDGADD: if ((error = suser(curproc, 0)) != 0) @@ -481,7 +480,6 @@ switch_ioctl(struct ifnet *ifp, unsigned break; } - splx(s); return (error); } Index: netinet/ip_input.c =================================================================== RCS file: /cvs/src/sys/netinet/ip_input.c,v retrieving revision 1.308 diff -u -p -r1.308 ip_input.c --- netinet/ip_input.c 30 May 2017 07:50:37 -0000 1.308 +++ 
netinet/ip_input.c 30 May 2017 08:34:49 -0000 @@ -127,6 +127,7 @@ int ip_sysctl_ipstat(void *, size_t *, v static struct mbuf_queue ipsend_mq; void ip_ours(struct mbuf *); +void ip_local(struct mbuf *); int ip_dooptions(struct mbuf *, struct ifnet *); int in_ouraddr(struct mbuf *, struct ifnet *, struct rtentry **); @@ -207,27 +208,31 @@ ip_init(void) mq_init(&ipsend_mq, 64, IPL_SOFTNET); } +/* + * Enqueue packet for local delivery. Queuing is used as a boundary + * between the network layer (input/forward path) running without + * KERNEL_LOCK() and the transport layer still needing it. + */ void -ipv4_input(struct ifnet *ifp, struct mbuf *m) +ip_ours(struct mbuf *m) { niq_enqueue(&ipintrq, m); } +/* + * Dequeue and process locally delivered packets. + */ void ipintr(void) { struct mbuf *m; - /* - * Get next datagram off input queue and get IP header - * in first mbuf. - */ while ((m = niq_dequeue(&ipintrq)) != NULL) { -#ifdef DIAGNOSTIC +#ifdef DIAGNOSTIC if ((m->m_flags & M_PKTHDR) == 0) panic("ipintr no HDR"); #endif - ip_input(m); + ip_local(m); } } @@ -237,18 +242,13 @@ ipintr(void) * Checksum and byte swap header. Process options. Forward or deliver. */ void -ip_input(struct mbuf *m) +ipv4_input(struct ifnet *ifp, struct mbuf *m) { - struct ifnet *ifp; struct rtentry *rt = NULL; struct ip *ip; int hlen, len; in_addr_t pfrdr = 0; - ifp = if_get(m->m_pkthdr.ph_ifidx); - if (ifp == NULL) - goto bad; - ipstat_inc(ips_total); if (m->m_len < sizeof (struct ip) && (m = m_pullup(m, sizeof (struct ip))) == NULL) { @@ -462,13 +462,11 @@ ip_input(struct mbuf *m) #endif /* IPSEC */ ip_forward(m, ifp, rt, pfrdr); - if_put(ifp); return; bad: m_freem(m); out: rtfree(rt); - if_put(ifp); } /* @@ -477,13 +475,15 @@ out: * If fragmented try to reassemble. Pass to next level. 
*/ void -ip_ours(struct mbuf *m) +ip_local(struct mbuf *m) { struct ip *ip = mtod(m, struct ip *); struct ipq *fp; struct ipqent *ipqe; int mff, hlen; + KERNEL_ASSERT_LOCKED(); + hlen = ip->ip_hl << 2; /* @@ -1685,13 +1685,11 @@ ip_send_dispatch(void *xmq) if (ml_empty(&ml)) return; - KERNEL_LOCK(); NET_LOCK(s); while ((m = ml_dequeue(&ml)) != NULL) { ip_output(m, NULL, NULL, 0, NULL, NULL, 0); } NET_UNLOCK(s); - KERNEL_UNLOCK(); } void Index: netinet/ip_var.h =================================================================== RCS file: /cvs/src/sys/netinet/ip_var.h,v retrieving revision 1.77 diff -u -p -r1.77 ip_var.h --- netinet/ip_var.h 30 May 2017 07:50:37 -0000 1.77 +++ netinet/ip_var.h 30 May 2017 08:34:49 -0000 @@ -248,7 +248,6 @@ int ip_sysctl(int *, u_int, void *, siz void ip_savecontrol(struct inpcb *, struct mbuf **, struct ip *, struct mbuf *); void ipintr(void); -void ip_input(struct mbuf *); void ip_deliver(struct mbuf **, int *, int, int); void ip_forward(struct mbuf *, struct ifnet *, struct rtentry *, int); int rip_ctloutput(int, struct socket *, int, int, struct mbuf *); Index: netinet6/ip6_input.c =================================================================== RCS file: /cvs/src/sys/netinet6/ip6_input.c,v retrieving revision 1.192 diff -u -p -r1.192 ip6_input.c --- netinet6/ip6_input.c 30 May 2017 07:50:37 -0000 1.192 +++ netinet6/ip6_input.c 30 May 2017 08:34:49 -0000 @@ -119,6 +119,7 @@ struct niqueue ip6intrq = NIQUEUE_INITIA struct cpumem *ip6counters; void ip6_ours(struct mbuf *); +void ip6_local(struct mbuf *); int ip6_check_rh0hdr(struct mbuf *, int *); int ip6_hbhchcheck(struct mbuf *, int *, int *, int *); int ip6_hopopts_input(u_int32_t *, u_int32_t *, struct mbuf **, int *); @@ -160,28 +161,37 @@ ip6_init(void) ip6counters = counters_alloc(ip6s_ncounters); } +/* + * Enqueue packet for local delivery. 
Queuing is used as a boundary + * between the network layer (input/forward path) running without + * KERNEL_LOCK() and the transport layer still needing it. + */ void -ipv6_input(struct ifnet *ifp, struct mbuf *m) +ip6_ours(struct mbuf *m) { niq_enqueue(&ip6intrq, m); } /* - * IP6 input interrupt handling. Just pass the packet to ip6_input. + * Dequeue and process locally delivered packets. */ void ip6intr(void) { struct mbuf *m; - while ((m = niq_dequeue(&ip6intrq)) != NULL) - ip6_input(m); + while ((m = niq_dequeue(&ip6intrq)) != NULL) { +#ifdef DIAGNOSTIC + if ((m->m_flags & M_PKTHDR) == 0) + panic("ip6intr no HDR"); +#endif + ip6_local(m); + } } void -ip6_input(struct mbuf *m) +ipv6_input(struct ifnet *ifp, struct mbuf *m) { - struct ifnet *ifp; struct ip6_hdr *ip6; struct sockaddr_in6 sin6; struct rtentry *rt = NULL; @@ -192,10 +202,6 @@ ip6_input(struct mbuf *m) #endif int srcrt = 0; - ifp = if_get(m->m_pkthdr.ph_ifidx); - if (ifp == NULL) - goto bad; - ip6stat_inc(ip6s_total); if (m->m_len < sizeof(struct ip6_hdr)) { @@ -441,8 +447,8 @@ ip6_input(struct mbuf *m) inet_ntop(AF_INET6, &ip6->ip6_dst, dst, sizeof(dst)); /* address is not ready, so discard the packet. 
*/ nd6log((LOG_INFO, - "ip6_input: packet to an unready address %s->%s\n", - src, dst)); + "%s: packet to an unready address %s->%s\n", + __func__, src, dst)); goto bad; } else { @@ -500,11 +506,10 @@ ip6_input(struct mbuf *m) m_freem(m); out: rtfree(rt); - if_put(ifp); } void -ip6_ours(struct mbuf *m) +ip6_local(struct mbuf *m) { int off, nxt; @@ -1461,13 +1466,11 @@ ip6_send_dispatch(void *xmq) if (ml_empty(&ml)) return; - KERNEL_LOCK(); NET_LOCK(s); while ((m = ml_dequeue(&ml)) != NULL) { ip6_output(m, NULL, NULL, IPV6_MINMTU, NULL, NULL); } NET_UNLOCK(s); - KERNEL_UNLOCK(); } void Index: netinet6/ip6_var.h =================================================================== RCS file: /cvs/src/sys/netinet6/ip6_var.h,v retrieving revision 1.74 diff -u -p -r1.74 ip6_var.h --- netinet6/ip6_var.h 28 May 2017 09:25:51 -0000 1.74 +++ netinet6/ip6_var.h 30 May 2017 08:34:49 -0000 @@ -303,7 +303,6 @@ int icmp6_ctloutput(int, struct socket * void ip6_init(void); void ip6intr(void); -void ip6_input(struct mbuf *); void ip6_deliver(struct mbuf **, int *, int, int); void ip6_freepcbopts(struct ip6_pktopts *); void ip6_freemoptions(struct ip6_moptions *);