The diff below moves the IPv4 & IPv6 input/forwarding paths, PIPEX PPP
processing and the IPv4 & IPv6 dispatch functions outside the KERNEL_LOCK().

We currently rely on the NET_LOCK() to serialize access to most of the
global data structures involved.  The IP input queues are no longer used
in the forwarding case.  They still exist as a boundary between the
network and transport layers, because TCP/UDP & friends still need the
KERNEL_LOCK().
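
In other words, the split now looks roughly like this (simplified from
the ipv4_input() hunks below; the IPv6 side is analogous):

	if (in_ouraddr(m, ifp, &rt)) {
		/*
		 * Local delivery crosses the queue boundary: ipintr()
		 * later calls ip_local() with the KERNEL_LOCK() held.
		 */
		ip_ours(m);	/* just niq_enqueue(&ipintrq, m) */
	} else {
		/* Forwarding stays entirely in the unlocked path. */
		ip_forward(m, ifp, rt, pfrdr);
	}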

Since we do not want to grab the NET_LOCK() for every packet, the
softnet thread grabs it once before processing a batch.  That means the
L2 processing path, which currently runs without any lock, will now run
with the NET_LOCK() held.
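
Schematically, if_input_process() becomes (trimmed from the hunk below;
splnet() and the input-handler iteration are left out):

	NET_LOCK(s);
	while ((m = ml_dequeue(&ml)) != NULL) {
		/*
		 * ether_input() and, from there, ipv4_input() or
		 * ipv6_input() now run with the NET_LOCK() held,
		 * taken once per batch instead of once per packet.
		 */
		(*ifih->ifih_input)(ifp, m);
	}
	NET_UNLOCK(s);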

IPsec is the bad player of this layer.  Since it isn't ready to run
without the KERNEL_LOCK(), the softnet thread will grab the
KERNEL_LOCK() as soon as ``ipsec_in_use'' is set, punishing all the
traffic on the machine.
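
The pattern, lifted from the if_input_process() hunk below:

#ifdef IPSEC
	extern int ipsec_in_use;

	if (ipsec_in_use) {
		KERNEL_LOCK();		/* every packet pays the price */
		locked = 1;
	}
#endif
	/* ... process the batch under the NET_LOCK() ... */
#ifdef IPSEC
	if (locked)
		KERNEL_UNLOCK();
#endif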

I tried to document the current design as much as possible in my
commit messages and in the comments below.  Please ask if something
isn't clear.

Tests and ok welcome.

Index: net/if.c
===================================================================
RCS file: /cvs/src/sys/net/if.c,v
retrieving revision 1.502
diff -u -p -r1.502 if.c
--- net/if.c    30 May 2017 07:50:37 -0000      1.502
+++ net/if.c    30 May 2017 08:34:49 -0000
@@ -874,7 +874,10 @@ if_input_process(void *xifidx)
        struct ifnet *ifp;
        struct ifih *ifih;
        struct srp_ref sr;
-       int s;
+       int s, s2;
+#ifdef IPSEC
+       int locked = 0;
+#endif /* IPSEC */
 
        ifp = if_get(ifidx);
        if (ifp == NULL)
@@ -887,6 +890,32 @@ if_input_process(void *xifidx)
        if (!ISSET(ifp->if_xflags, IFXF_CLONED))
                add_net_randomness(ml_len(&ml));
 
+#ifdef IPSEC
+       /*
+        * IPsec is not ready to run without KERNEL_LOCK().  So all
+        * the traffic on your machine is punished if you have IPsec
+        * enabled.
+        */
+       extern int ipsec_in_use;
+       if (ipsec_in_use) {
+               KERNEL_LOCK();
+               locked = 1;
+       }
+#endif /* IPSEC */
+
+       /*
+        * We grab the NET_LOCK() before processing any packet to
+        * ensure there's no contention on the routing table lock.
+        *
+        * Without it we could race with a userland thread to insert
+        * a L2 entry in ip{6,}_output().  Such race would result in
+        * one of the threads sleeping *inside* the IP output path.
+        *
+        * Since we have a NET_LOCK() we also use it to serialize access
+        * to PF globals, pipex globals, unicast and multicast addresses
+        * lists.
+        */
+       NET_LOCK(s2);
        s = splnet();
        while ((m = ml_dequeue(&ml)) != NULL) {
                /*
@@ -903,7 +932,12 @@ if_input_process(void *xifidx)
                        m_freem(m);
        }
        splx(s);
+       NET_UNLOCK(s2);
 
+#ifdef IPSEC
+       if (locked)
+               KERNEL_UNLOCK();
+#endif /* IPSEC */
 out:
        if_put(ifp);
 }
Index: net/if_ethersubr.c
===================================================================
RCS file: /cvs/src/sys/net/if_ethersubr.c,v
retrieving revision 1.245
diff -u -p -r1.245 if_ethersubr.c
--- net/if_ethersubr.c  30 May 2017 07:50:37 -0000      1.245
+++ net/if_ethersubr.c  30 May 2017 08:34:49 -0000
@@ -416,15 +416,11 @@ decapsulate:
 #ifdef PIPEX
                if (pipex_enable) {
                        struct pipex_session *session;
-                       int s;
 
-                       NET_LOCK(s);
                        if ((session = pipex_pppoe_lookup_session(m)) != NULL) {
                                pipex_pppoe_input(m, session);
-                               NET_UNLOCK(s);
                                return (1);
                        }
-                       NET_UNLOCK(s);
                }
 #endif
                if (etype == ETHERTYPE_PPPOEDISC)
Index: net/if_switch.c
===================================================================
RCS file: /cvs/src/sys/net/if_switch.c,v
retrieving revision 1.19
diff -u -p -r1.19 if_switch.c
--- net/if_switch.c     12 May 2017 13:40:29 -0000      1.19
+++ net/if_switch.c     30 May 2017 08:34:49 -0000
@@ -388,9 +388,8 @@ switch_ioctl(struct ifnet *ifp, unsigned
        struct bstp_port        *bp;
        struct ifnet            *ifs;
        struct switch_port      *swpo;
-       int                      s, error = 0;
+       int                      error = 0;
 
-       s = splnet();
        switch (cmd) {
        case SIOCBRDGADD:
                if ((error = suser(curproc, 0)) != 0)
@@ -481,7 +480,6 @@ switch_ioctl(struct ifnet *ifp, unsigned
                break;
        }
 
-       splx(s);
        return (error);
 }
 
Index: netinet/ip_input.c
===================================================================
RCS file: /cvs/src/sys/netinet/ip_input.c,v
retrieving revision 1.308
diff -u -p -r1.308 ip_input.c
--- netinet/ip_input.c  30 May 2017 07:50:37 -0000      1.308
+++ netinet/ip_input.c  30 May 2017 08:34:49 -0000
@@ -127,6 +127,7 @@ int ip_sysctl_ipstat(void *, size_t *, v
 static struct mbuf_queue       ipsend_mq;
 
 void   ip_ours(struct mbuf *);
+void   ip_local(struct mbuf *);
 int    ip_dooptions(struct mbuf *, struct ifnet *);
 int    in_ouraddr(struct mbuf *, struct ifnet *, struct rtentry **);
 
@@ -207,27 +208,31 @@ ip_init(void)
        mq_init(&ipsend_mq, 64, IPL_SOFTNET);
 }
 
+/*
+ * Enqueue packet for local delivery.  Queuing is used as a boundary
+ * between the network layer (input/forward path) running without
+ * KERNEL_LOCK() and the transport layer still needing it.
+ */
 void
-ipv4_input(struct ifnet *ifp, struct mbuf *m)
+ip_ours(struct mbuf *m)
 {
        niq_enqueue(&ipintrq, m);
 }
 
+/*
+ * Dequeue and process locally delivered packets.
+ */
 void
 ipintr(void)
 {
        struct mbuf *m;
 
-       /*
-        * Get next datagram off input queue and get IP header
-        * in first mbuf.
-        */
        while ((m = niq_dequeue(&ipintrq)) != NULL) {
-#ifdef DIAGNOSTIC
+#ifdef DIAGNOSTIC
                if ((m->m_flags & M_PKTHDR) == 0)
                        panic("ipintr no HDR");
 #endif
-               ip_input(m);
+               ip_local(m);
        }
 }
 
@@ -237,18 +242,13 @@ ipintr(void)
  * Checksum and byte swap header.  Process options. Forward or deliver.
  */
 void
-ip_input(struct mbuf *m)
+ipv4_input(struct ifnet *ifp, struct mbuf *m)
 {
-       struct ifnet    *ifp;
        struct rtentry  *rt = NULL;
        struct ip       *ip;
        int hlen, len;
        in_addr_t pfrdr = 0;
 
-       ifp = if_get(m->m_pkthdr.ph_ifidx);
-       if (ifp == NULL)
-               goto bad;
-
        ipstat_inc(ips_total);
        if (m->m_len < sizeof (struct ip) &&
            (m = m_pullup(m, sizeof (struct ip))) == NULL) {
@@ -462,13 +462,11 @@ ip_input(struct mbuf *m)
 #endif /* IPSEC */
 
        ip_forward(m, ifp, rt, pfrdr);
-       if_put(ifp);
        return;
 bad:
        m_freem(m);
 out:
        rtfree(rt);
-       if_put(ifp);
 }
 
 /*
@@ -477,13 +475,15 @@ out:
  * If fragmented try to reassemble.  Pass to next level.
  */
 void
-ip_ours(struct mbuf *m)
+ip_local(struct mbuf *m)
 {
        struct ip *ip = mtod(m, struct ip *);
        struct ipq *fp;
        struct ipqent *ipqe;
        int mff, hlen;
 
+       KERNEL_ASSERT_LOCKED();
+
        hlen = ip->ip_hl << 2;
 
        /*
@@ -1685,13 +1685,11 @@ ip_send_dispatch(void *xmq)
        if (ml_empty(&ml))
                return;
 
-       KERNEL_LOCK();
        NET_LOCK(s);
        while ((m = ml_dequeue(&ml)) != NULL) {
                ip_output(m, NULL, NULL, 0, NULL, NULL, 0);
        }
        NET_UNLOCK(s);
-       KERNEL_UNLOCK();
 }
 
 void
Index: netinet/ip_var.h
===================================================================
RCS file: /cvs/src/sys/netinet/ip_var.h,v
retrieving revision 1.77
diff -u -p -r1.77 ip_var.h
--- netinet/ip_var.h    30 May 2017 07:50:37 -0000      1.77
+++ netinet/ip_var.h    30 May 2017 08:34:49 -0000
@@ -248,7 +248,6 @@ int  ip_sysctl(int *, u_int, void *, siz
 void    ip_savecontrol(struct inpcb *, struct mbuf **, struct ip *,
            struct mbuf *);
 void    ipintr(void);
-void    ip_input(struct mbuf *);
 void    ip_deliver(struct mbuf **, int *, int, int);
 void    ip_forward(struct mbuf *, struct ifnet *, struct rtentry *, int);
 int     rip_ctloutput(int, struct socket *, int, int, struct mbuf *);
Index: netinet6/ip6_input.c
===================================================================
RCS file: /cvs/src/sys/netinet6/ip6_input.c,v
retrieving revision 1.192
diff -u -p -r1.192 ip6_input.c
--- netinet6/ip6_input.c        30 May 2017 07:50:37 -0000      1.192
+++ netinet6/ip6_input.c        30 May 2017 08:34:49 -0000
@@ -119,6 +119,7 @@ struct niqueue ip6intrq = NIQUEUE_INITIA
 struct cpumem *ip6counters;
 
 void ip6_ours(struct mbuf *);
+void ip6_local(struct mbuf *);
 int ip6_check_rh0hdr(struct mbuf *, int *);
 int ip6_hbhchcheck(struct mbuf *, int *, int *, int *);
 int ip6_hopopts_input(u_int32_t *, u_int32_t *, struct mbuf **, int *);
@@ -160,28 +161,37 @@ ip6_init(void)
        ip6counters = counters_alloc(ip6s_ncounters);
 }
 
+/*
+ * Enqueue packet for local delivery.  Queuing is used as a boundary
+ * between the network layer (input/forward path) running without
+ * KERNEL_LOCK() and the transport layer still needing it.
+ */
 void
-ipv6_input(struct ifnet *ifp, struct mbuf *m)
+ip6_ours(struct mbuf *m)
 {
        niq_enqueue(&ip6intrq, m);
 }
 
 /*
- * IP6 input interrupt handling. Just pass the packet to ip6_input.
+ * Dequeue and process locally delivered packets.
  */
 void
 ip6intr(void)
 {
        struct mbuf *m;
 
-       while ((m = niq_dequeue(&ip6intrq)) != NULL)
-               ip6_input(m);
+       while ((m = niq_dequeue(&ip6intrq)) != NULL) {
+#ifdef DIAGNOSTIC
+               if ((m->m_flags & M_PKTHDR) == 0)
+                       panic("ip6intr no HDR");
+#endif
+               ip6_local(m);
+       }
 }
 
 void
-ip6_input(struct mbuf *m)
+ipv6_input(struct ifnet *ifp, struct mbuf *m)
 {
-       struct ifnet *ifp;
        struct ip6_hdr *ip6;
        struct sockaddr_in6 sin6;
        struct rtentry *rt = NULL;
@@ -192,10 +202,6 @@ ip6_input(struct mbuf *m)
 #endif
        int srcrt = 0;
 
-       ifp = if_get(m->m_pkthdr.ph_ifidx);
-       if (ifp == NULL)
-               goto bad;
-
        ip6stat_inc(ip6s_total);
 
        if (m->m_len < sizeof(struct ip6_hdr)) {
@@ -441,8 +447,8 @@ ip6_input(struct mbuf *m)
                        inet_ntop(AF_INET6, &ip6->ip6_dst, dst, sizeof(dst));
                        /* address is not ready, so discard the packet. */
                        nd6log((LOG_INFO,
-                           "ip6_input: packet to an unready address %s->%s\n",
-                           src, dst));
+                           "%s: packet to an unready address %s->%s\n",
+                           __func__, src, dst));
 
                        goto bad;
                } else {
@@ -500,11 +506,10 @@ ip6_input(struct mbuf *m)
        m_freem(m);
  out:
        rtfree(rt);
-       if_put(ifp);
 }
 
 void
-ip6_ours(struct mbuf *m)
+ip6_local(struct mbuf *m)
 {
        int off, nxt;
 
@@ -1461,13 +1466,11 @@ ip6_send_dispatch(void *xmq)
        if (ml_empty(&ml))
                return;
 
-       KERNEL_LOCK();
        NET_LOCK(s);
        while ((m = ml_dequeue(&ml)) != NULL) {
                ip6_output(m, NULL, NULL, IPV6_MINMTU, NULL, NULL);
        }
        NET_UNLOCK(s);
-       KERNEL_UNLOCK();
 }
 
 void
Index: netinet6/ip6_var.h
===================================================================
RCS file: /cvs/src/sys/netinet6/ip6_var.h,v
retrieving revision 1.74
diff -u -p -r1.74 ip6_var.h
--- netinet6/ip6_var.h  28 May 2017 09:25:51 -0000      1.74
+++ netinet6/ip6_var.h  30 May 2017 08:34:49 -0000
@@ -303,7 +303,6 @@ int icmp6_ctloutput(int, struct socket *
 
 void   ip6_init(void);
 void   ip6intr(void);
-void   ip6_input(struct mbuf *);
 void   ip6_deliver(struct mbuf **, int *, int, int);
 void   ip6_freepcbopts(struct ip6_pktopts *);
 void   ip6_freemoptions(struct ip6_moptions *);
