On 2016-10-06, Christian Weisgerber <na...@mips.inka.de> wrote:

> Something is very broken at the intersection of IPv6, NDP, and IPsec
> in -current.

Found by bisection.  The culprit is this commit:

------------------------------------------------------------------------
CVSROOT:        /cvs
Module name:    src
Changes by:     mar...@cvs.openbsd.org  2016/09/13 13:56:55

Modified files:
        sys/kern       : uipc_mbuf.c 
        sys/netinet    : ip_ah.c ip_esp.c ip_ipcomp.c ipsec_output.c 
        sys/sys        : mbuf.h 
        share/man/man9 : mbuf.9 

Log message:
avoid extensive mbuf allocation for IPsec by replacing m_inject(4)
with m_makespace(4) from freebsd; ok mpi@, bluhm@, mikeb@, dlg@
------------------------------------------------------------------------

Kudos to vgross@ for calling it a few days ago:

> If so, I think we are dealing with some kind of mbuf mangling.

A backout is somewhat involved due to further changes that have been
layered on top.  Backout patch below for completeness, but I don't
suggest that this be actually committed.

Index: sys/kern/uipc_mbuf.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_mbuf.c,v
retrieving revision 1.231
diff -u -p -r1.231 uipc_mbuf.c
--- sys/kern/uipc_mbuf.c        15 Sep 2016 02:00:16 -0000      1.231
+++ sys/kern/uipc_mbuf.c        9 Oct 2016 00:20:34 -0000
@@ -126,6 +126,7 @@ int max_hdr;                        /* largest 
link+protocol 
 struct mutex m_extref_mtx = MUTEX_INITIALIZER(IPL_NET);
 
 void   m_extfree(struct mbuf *);
+struct mbuf *m_copym0(struct mbuf *, int, int, int, int);
 void   nmbclust_update(void);
 void   m_zero(struct mbuf *);
 
@@ -567,7 +568,23 @@ m_prepend(struct mbuf *m, int len, int h
  * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
  */
 struct mbuf *
-m_copym(struct mbuf *m0, int off, int len, int wait)
+m_copym(struct mbuf *m, int off, int len, int wait)
+{
+       return m_copym0(m, off, len, wait, 0);  /* shallow copy on M_EXT */
+}
+
+/*
+ * m_copym2() is like m_copym(), except it COPIES cluster mbufs, instead
+ * of merely bumping the reference count.
+ */
+struct mbuf *
+m_copym2(struct mbuf *m, int off, int len, int wait)
+{
+       return m_copym0(m, off, len, wait, 1);  /* deep copy */
+}
+
+struct mbuf *
+m_copym0(struct mbuf *m0, int off, int len, int wait, int deep)
 {
        struct mbuf *m, *n, **np;
        struct mbuf *top;
@@ -600,9 +617,23 @@ m_copym(struct mbuf *m0, int off, int le
                }
                n->m_len = min(len, m->m_len - off);
                if (m->m_flags & M_EXT) {
-                       n->m_data = m->m_data + off;
-                       n->m_ext = m->m_ext;
-                       MCLADDREFERENCE(m, n);
+                       if (!deep) {
+                               n->m_data = m->m_data + off;
+                               n->m_ext = m->m_ext;
+                               MCLADDREFERENCE(m, n);
+                       } else {
+                               /*
+                                * we are unsure about the way m was allocated.
+                                * copy into multiple MCLBYTES cluster mbufs.
+                                */
+                               MCLGET(n, wait);
+                               n->m_len = 0;
+                               n->m_len = M_TRAILINGSPACE(n);
+                               n->m_len = min(n->m_len, len);
+                               n->m_len = min(n->m_len, m->m_len - off);
+                               memcpy(mtod(n, caddr_t), mtod(m, caddr_t) + off,
+                                   n->m_len);
+                       }
                } else
                        memcpy(mtod(n, caddr_t), mtod(m, caddr_t) + off,
                            n->m_len);
@@ -933,6 +964,67 @@ m_getptr(struct mbuf *m, int loc, int *o
 }
 
 /*
+ * Inject a new mbuf chain of length siz in mbuf chain m0 at
+ * position len0. Returns a pointer to the first injected mbuf, or
+ * NULL on failure (m0 is left undisturbed). Note that if there is
+ * enough space for an object of size siz in the appropriate position,
+ * no memory will be allocated. Also, there will be no data movement in
+ * the first len0 bytes (pointers to that will remain valid).
+ *
+ * XXX It is assumed that siz is less than the size of an mbuf at the moment.
+ */
+struct mbuf *
+m_inject(struct mbuf *m0, int len0, int siz, int wait)
+{
+       struct mbuf *m, *n, *n2 = NULL, *n3;
+       unsigned len = len0, remain;
+
+       if ((siz >= MHLEN) || (len0 <= 0))
+               return (NULL);
+       for (m = m0; m && len > m->m_len; m = m->m_next)
+               len -= m->m_len;
+       if (m == NULL)
+               return (NULL);
+       remain = m->m_len - len;
+       if (remain == 0) {
+               if ((m->m_next) && (M_LEADINGSPACE(m->m_next) >= siz)) {
+                       m->m_next->m_len += siz;
+                       if (m0->m_flags & M_PKTHDR)
+                               m0->m_pkthdr.len += siz;
+                       m->m_next->m_data -= siz;
+                       return m->m_next;
+               }
+       } else {
+               n2 = m_copym2(m, len, remain, wait);
+               if (n2 == NULL)
+                       return (NULL);
+       }
+
+       MGET(n, wait, MT_DATA);
+       if (n == NULL) {
+               if (n2)
+                       m_freem(n2);
+               return (NULL);
+       }
+
+       n->m_len = siz;
+       if (m0->m_flags & M_PKTHDR)
+               m0->m_pkthdr.len += siz;
+       m->m_len -= remain; /* Trim */
+       if (n2) {
+               for (n3 = n; n3->m_next != NULL; n3 = n3->m_next)
+                       ;
+               n3->m_next = n2;
+       } else
+               n3 = n;
+       for (; n3->m_next != NULL; n3 = n3->m_next)
+               ;
+       n3->m_next = m->m_next;
+       m->m_next = n;
+       return n;
+}
+
+/*
  * Partition an mbuf chain in two pieces, returning the tail --
  * all but the first len0 bytes.  In case of failure, it returns NULL and
  * attempts to restore the chain to its original state.
@@ -997,121 +1089,6 @@ extpacket:
        m->m_next = NULL;
        return (n);
 }
-
-/*
- * Make space for a new header of length hlen at skip bytes
- * into the packet.  When doing this we allocate new mbufs only
- * when absolutely necessary.  The mbuf where the new header
- * is to go is returned together with an offset into the mbuf.
- * If NULL is returned then the mbuf chain may have been modified;
- * the caller is assumed to always free the chain.
- */
-struct mbuf *
-m_makespace(struct mbuf *m0, int skip, int hlen, int *off)
-{
-       struct mbuf *m;
-       unsigned remain;
-
-       KASSERT(m0 != NULL);
-       KASSERT(hlen < MHLEN);
-
-       for (m = m0; m && skip > m->m_len; m = m->m_next)
-               skip -= m->m_len;
-       if (m == NULL)
-               return (NULL);
-       /*
-        * At this point skip is the offset into the mbuf m
-        * where the new header should be placed.  Figure out
-        * if there's space to insert the new header.  If so,
-        * and copying the remainder makese sense then do so.
-        * Otherwise insert a new mbuf in the chain, splitting
-        * the contents of m as needed.
-        */
-       remain = m->m_len - skip;               /* data to move */
-       if (skip < remain && hlen <= M_LEADINGSPACE(m)) {
-               if (skip)
-                       memmove(m->m_data-hlen, m->m_data, skip);
-               m->m_data -= hlen;
-               m->m_len += hlen;
-               (*off) = skip;
-       } else if (hlen > M_TRAILINGSPACE(m)) {
-               struct mbuf *n0, *n, **np;
-               int todo, len, done, alloc;
-
-               n0 = NULL;
-               np = &n0;
-               alloc = 0;
-               done = 0;
-               todo = remain;
-               while (todo > 0) {
-                       MGET(n, M_DONTWAIT, m->m_type);
-                       len = MHLEN;
-                       if (n && todo > MHLEN) {
-                               MCLGET(n, M_DONTWAIT);
-                               len = MCLBYTES;
-                               if ((n->m_flags & M_EXT) == 0) {
-                                       m_free(n);
-                                       n = NULL;
-                               }
-                       }
-                       if (n == NULL) {
-                               m_freem(n0);
-                               return NULL;
-                       }
-                       *np = n;
-                       np = &n->m_next;
-                       alloc++;
-                       len = min(todo, len);
-                       memcpy(n->m_data, mtod(m, char *) + skip + done, len);
-                       n->m_len = len;
-                       done += len;
-                       todo -= len;
-               }
-
-               if (hlen <= M_TRAILINGSPACE(m) + remain) {
-                       m->m_len = skip + hlen;
-                       *off = skip;
-                       if (n0 != NULL) {
-                               *np = m->m_next;
-                               m->m_next = n0;
-                       }
-               }
-               else {
-                       n = m_get(M_DONTWAIT, m->m_type);
-                       if (n == NULL) {
-                               m_freem(n0);
-                               return NULL;
-                       }
-                       alloc++;
-
-                       if ((n->m_next = n0) == NULL)
-                               np = &n->m_next;
-                       n0 = n;
-
-                       *np = m->m_next;
-                       m->m_next = n0;
-
-                       n->m_len = hlen;
-                       m->m_len = skip;
-
-                       m = n;                  /* header is at front ... */
-                       *off = 0;               /* ... of new mbuf */
-               }
-       } else {
-               /*
-                * Copy the remainder to the back of the mbuf
-                * so there's space to write the new header.
-                */
-               if (remain > 0)
-                       memmove(mtod(m, caddr_t) + skip + hlen,
-                             mtod(m, caddr_t) + skip, remain);
-               m->m_len += hlen;
-               *off = skip;
-       }
-       m0->m_pkthdr.len += hlen;               /* adjust packet length */
-       return m;
-}
-
 
 /*
  * Routine to copy from device local memory into mbufs.
Index: sys/netinet/ip_ah.c
===================================================================
RCS file: /cvs/src/sys/netinet/ip_ah.c,v
retrieving revision 1.123
diff -u -p -r1.123 ip_ah.c
--- sys/netinet/ip_ah.c 19 Sep 2016 18:09:22 -0000      1.123
+++ sys/netinet/ip_ah.c 9 Oct 2016 00:03:45 -0000
@@ -932,7 +932,7 @@ ah_output(struct mbuf *m, struct tdb *td
        struct mbuf *mi;
        struct cryptop *crp;
        u_int16_t iplen;
-       int len, rplen, roff;
+       int len, rplen;
        u_int8_t prot;
        struct ah *ah;
 #if NBPFILTER > 0
@@ -1057,7 +1057,7 @@ ah_output(struct mbuf *m, struct tdb *td
        }
 
        /* Inject AH header. */
-       mi = m_makespace(m, skip, rplen + ahx->authsize, &roff);
+       mi = m_inject(m, skip, rplen + ahx->authsize, M_DONTWAIT);
        if (mi == NULL) {
                DPRINTF(("ah_output(): failed to inject AH header for SA "
                    "%s/%08x\n", ipsp_address(&tdb->tdb_dst, buf, sizeof(buf)),
@@ -1069,10 +1069,10 @@ ah_output(struct mbuf *m, struct tdb *td
        }
 
        /*
-        * The AH header is guaranteed by m_makespace() to be in
-        * contiguous memory, at 'roff' of the returned mbuf.
+        * The AH header is guaranteed by m_inject() to be in
+        * contiguous memory, at the beginning of the returned mbuf.
         */
-       ah = (struct ah *)(mtod(mi, caddr_t) + roff);
+       ah = mtod(mi, struct ah *);
 
        /* Initialize the AH header. */
        m_copydata(m, protoff, sizeof(u_int8_t), (caddr_t) &ah->ah_nh);
Index: sys/netinet/ip_esp.c
===================================================================
RCS file: /cvs/src/sys/netinet/ip_esp.c,v
retrieving revision 1.141
diff -u -p -r1.141 ip_esp.c
--- sys/netinet/ip_esp.c        19 Sep 2016 18:09:22 -0000      1.141
+++ sys/netinet/ip_esp.c        9 Oct 2016 00:03:45 -0000
@@ -775,7 +775,7 @@ esp_output(struct mbuf *m, struct tdb *t
 {
        struct enc_xform *espx = (struct enc_xform *) tdb->tdb_encalgxform;
        struct auth_hash *esph = (struct auth_hash *) tdb->tdb_authalgxform;
-       int ilen, hlen, rlen, padding, blks, alen, roff;
+       int ilen, hlen, rlen, padding, blks, alen;
        u_int32_t replay;
        struct mbuf *mi, *mo = (struct mbuf *) NULL;
        struct tdb_crypto *tc;
@@ -907,7 +907,7 @@ esp_output(struct mbuf *m, struct tdb *t
        }
 
        /* Inject ESP header. */
-       mo = m_makespace(m, skip, hlen, &roff);
+       mo = m_inject(m, skip, hlen, M_DONTWAIT);
        if (mo == NULL) {
                DPRINTF(("esp_output(): failed to inject ESP header for "
                    "SA %s/%08x\n", ipsp_address(&tdb->tdb_dst, buf,
@@ -918,11 +918,10 @@ esp_output(struct mbuf *m, struct tdb *t
        }
 
        /* Initialize ESP header. */
-       bcopy((caddr_t) &tdb->tdb_spi, mtod(mo, caddr_t) + roff,
-           sizeof(u_int32_t));
+       bcopy((caddr_t) &tdb->tdb_spi, mtod(mo, caddr_t), sizeof(u_int32_t));
        tdb->tdb_rpl++;
        replay = htonl((u_int32_t)tdb->tdb_rpl);
-       memcpy(mtod(mo, caddr_t) + roff + sizeof(u_int32_t), (caddr_t) &replay,
+       memcpy(mtod(mo, caddr_t) + sizeof(u_int32_t), (caddr_t) &replay,
            sizeof(u_int32_t));
 
 #if NPFSYNC > 0
@@ -933,15 +932,15 @@ esp_output(struct mbuf *m, struct tdb *t
         * Add padding -- better to do it ourselves than use the crypto engine,
         * although if/when we support compression, we'd have to do that.
         */
-       mo = m_makespace(m, m->m_pkthdr.len, padding + alen, &roff);
+       mo = m_inject(m, m->m_pkthdr.len, padding + alen, M_DONTWAIT);
        if (mo == NULL) {
-               DPRINTF(("esp_output(): m_makespace() failed for SA %s/%08x\n",
+               DPRINTF(("esp_output(): m_inject failed for SA %s/%08x\n",
                    ipsp_address(&tdb->tdb_dst, buf, sizeof(buf)),
                    ntohl(tdb->tdb_spi)));
                m_freem(m);
                return ENOBUFS;
        }
-       pad = mtod(mo, caddr_t) + roff;
+       pad = mtod(mo, u_char *);
 
        /* Apply self-describing padding */
        for (ilen = 0; ilen < padding - 2; ilen++)
Index: sys/netinet/ip_ipcomp.c
===================================================================
RCS file: /cvs/src/sys/netinet/ip_ipcomp.c,v
retrieving revision 1.48
diff -u -p -r1.48 ip_ipcomp.c
--- sys/netinet/ip_ipcomp.c     24 Sep 2016 14:51:37 -0000      1.48
+++ sys/netinet/ip_ipcomp.c     9 Oct 2016 00:03:45 -0000
@@ -528,7 +528,7 @@ ipcomp_output_cb(struct cryptop *crp)
        struct tdb_crypto *tc;
        struct tdb *tdb;
        struct mbuf *m, *mo;
-       int error, s, skip, rlen, roff;
+       int error, s, skip, rlen;
        u_int16_t cpi;
        struct ip *ip;
 #ifdef INET6
@@ -593,7 +593,7 @@ ipcomp_output_cb(struct cryptop *crp)
        }
 
        /* Inject IPCOMP header */
-       mo = m_makespace(m, skip, IPCOMP_HLENGTH, &roff);
+       mo = m_inject(m, skip, IPCOMP_HLENGTH, M_DONTWAIT);
        if (mo == NULL) {
                DPRINTF(("ipcomp_output_cb(): failed to inject IPCOMP header "
                    "for IPCA %s/%08x\n", ipsp_address(&tdb->tdb_dst, buf,
@@ -604,7 +604,7 @@ ipcomp_output_cb(struct cryptop *crp)
        }
 
        /* Initialize the IPCOMP header */
-       ipcomp = (struct ipcomp *)(mtod(mo, caddr_t) + roff);
+       ipcomp = mtod(mo, struct ipcomp *);
        memset(ipcomp, 0, sizeof(struct ipcomp));
        cpi = (u_int16_t) ntohl(tdb->tdb_spi);
        ipcomp->ipcomp_cpi = htons(cpi);
Index: sys/netinet/ipsec_output.c
===================================================================
RCS file: /cvs/src/sys/netinet/ipsec_output.c,v
retrieving revision 1.63
diff -u -p -r1.63 ipsec_output.c
--- sys/netinet/ipsec_output.c  13 Sep 2016 19:56:55 -0000      1.63
+++ sys/netinet/ipsec_output.c  9 Oct 2016 00:03:45 -0000
@@ -362,12 +362,13 @@ int
 ipsp_process_done(struct mbuf *m, struct tdb *tdb)
 {
        struct ip *ip;
+
 #ifdef INET6
        struct ip6_hdr *ip6;
 #endif /* INET6 */
+
        struct tdb_ident *tdbi;
        struct m_tag *mtag;
-       int roff;
 
        tdb->tdb_last_used = time_second;
 
@@ -397,12 +398,12 @@ ipsp_process_done(struct mbuf *m, struct
                        return ENXIO;
                }
 
-               mi = m_makespace(m, iphlen, sizeof(struct udphdr), &roff);
+               mi = m_inject(m, iphlen, sizeof(struct udphdr), M_DONTWAIT);
                if (mi == NULL) {
                        m_freem(m);
                        return ENOMEM;
                }
-               uh = (struct udphdr *)(mtod(mi, caddr_t) + roff);
+               uh = mtod(mi, struct udphdr *);
                uh->uh_sport = uh->uh_dport = htons(udpencap_port);
                if (tdb->tdb_udpencap_port)
                        uh->uh_dport = tdb->tdb_udpencap_port;
Index: sys/sys/mbuf.h
===================================================================
RCS file: /cvs/src/sys/sys/mbuf.h,v
retrieving revision 1.220
diff -u -p -r1.220 mbuf.h
--- sys/sys/mbuf.h      17 Sep 2016 00:38:43 -0000      1.220
+++ sys/sys/mbuf.h      9 Oct 2016 00:20:34 -0000
@@ -419,6 +419,7 @@ extern      int max_protohdr;               /* largest pro
 extern int max_hdr;                    /* largest link+protocol header */
 
 void   mbinit(void);
+struct mbuf *m_copym2(struct mbuf *, int, int, int);
 struct mbuf *m_copym(struct mbuf *, int, int, int);
 struct mbuf *m_free(struct mbuf *);
 struct mbuf *m_get(int, int);
@@ -431,7 +432,7 @@ struct      mbuf *m_prepend(struct mbuf *, in
 struct mbuf *m_pulldown(struct mbuf *, int, int, int *);
 struct mbuf *m_pullup(struct mbuf *, int);
 struct mbuf *m_split(struct mbuf *, int, int);
-struct mbuf *m_makespace(struct mbuf *, int, int, int *);
+struct  mbuf *m_inject(struct mbuf *, int, int, int);
 struct  mbuf *m_getptr(struct mbuf *, int, int *);
 int    m_leadingspace(struct mbuf *);
 int    m_trailingspace(struct mbuf *);
Index: share/man/man9/mbuf.9
===================================================================
RCS file: /cvs/src/share/man/man9/mbuf.9,v
retrieving revision 1.104
diff -u -p -r1.104 mbuf.9
--- share/man/man9/mbuf.9       15 Sep 2016 00:00:40 -0000      1.104
+++ share/man/man9/mbuf.9       9 Oct 2016 00:03:45 -0000
@@ -42,7 +42,7 @@
 .Nm m_pulldown ,
 .Nm m_pullup ,
 .Nm m_split ,
-.Nm m_makespace ,
+.Nm m_inject ,
 .Nm m_getptr ,
 .Nm m_adj ,
 .Nm m_copyback ,
@@ -92,7 +92,7 @@
 .Ft struct mbuf *
 .Fn m_split "struct mbuf *m0" "int len0" "int wait"
 .Ft struct mbuf *
-.Fn m_makespace "struct mbuf *m0" "int skip" "int hlen" "int *off"
+.Fn m_inject "struct mbuf *m0" "int len0" "int siz" "int wait"
 .Ft struct mbuf *
 .Fn m_getptr "struct mbuf *m" "int loc" "int *off"
 .Ft void
@@ -559,18 +559,18 @@ Split an mbuf chain in two pieces, retur
 the tail (which is made of the previous mbuf chain except the first
 .Fa len0
 bytes).
-.It Fn m_makespace "struct mbuf *m0" "int skip" "int hlen" "int *off"
-Make space for a continuous memory region of length
-.Fa hlen
-at
-.Fa skip
-bytes into the mbuf chain.
-On success, the mbuf of the continuous memory is returned
-together with an offset
-.Fa off
-into the mbuf.
-On failure, NULL is returned and the mbuf chain may have been modified.
-The caller is assumed to always free the chain.
+.It Fn m_inject "struct mbuf *m0" "int len0" "int siz" "int wait"
+Inject a new mbuf chain of length
+.Fa siz
+into the mbuf chain pointed to by
+.Fa m0
+at position
+.Fa len0 .
+If there is enough space for an object of size
+.Fa siz
+in the appropriate location, no memory will be allocated.
+On failure, the function returns NULL (the mbuf is left untouched) and
+on success, a pointer to the first injected mbuf is returned.
 .It Fn m_getptr "struct mbuf *m" "int loc" "int *off"
 Returns a pointer to the mbuf containing the data located at
 .Fa loc
-- 
Christian "naddy" Weisgerber                          na...@mips.inka.de

Reply via email to