On 2016-10-06, Christian Weisgerber <na...@mips.inka.de> wrote: > Something is very broken at the intersection of IPv6, NDP, and IPsec > in -current.
Found by bisection. The culprit is this commit: ------------------------------------------------------------------------ CVSROOT: /cvs Module name: src Changes by: mar...@cvs.openbsd.org 2016/09/13 13:56:55 Modified files: sys/kern : uipc_mbuf.c sys/netinet : ip_ah.c ip_esp.c ip_ipcomp.c ipsec_output.c sys/sys : mbuf.h share/man/man9 : mbuf.9 Log message: avoid extensive mbuf allocation for IPsec by replacing m_inject(4) with m_makespace(4) from freebsd; ok mpi@, bluhm@, mikeb@, dlg@ ------------------------------------------------------------------------ Kudos to vgross@ for calling it a few days ago: > If so, I think we are dealing with some kind of mbuf mangling. A backout is somewhat involved due to further changes that have been layered on top. Backout patch below for completeness, but I don't suggest that this be actually committed. Index: sys/kern/uipc_mbuf.c =================================================================== RCS file: /cvs/src/sys/kern/uipc_mbuf.c,v retrieving revision 1.231 diff -u -p -r1.231 uipc_mbuf.c --- sys/kern/uipc_mbuf.c 15 Sep 2016 02:00:16 -0000 1.231 +++ sys/kern/uipc_mbuf.c 9 Oct 2016 00:20:34 -0000 @@ -126,6 +126,7 @@ int max_hdr; /* largest link+protocol struct mutex m_extref_mtx = MUTEX_INITIALIZER(IPL_NET); void m_extfree(struct mbuf *); +struct mbuf *m_copym0(struct mbuf *, int, int, int, int); void nmbclust_update(void); void m_zero(struct mbuf *); @@ -567,7 +568,23 @@ m_prepend(struct mbuf *m, int len, int h * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller. */ struct mbuf * -m_copym(struct mbuf *m0, int off, int len, int wait) +m_copym(struct mbuf *m, int off, int len, int wait) +{ + return m_copym0(m, off, len, wait, 0); /* shallow copy on M_EXT */ +} + +/* + * m_copym2() is like m_copym(), except it COPIES cluster mbufs, instead + * of merely bumping the reference count. + */ +struct mbuf * +m_copym2(struct mbuf *m, int off, int len, int wait) +{ + return m_copym0(m, off, len, wait, 1); /* deep copy */ +} + +struct mbuf * +m_copym0(struct mbuf *m0, int off, int len, int wait, int deep) { struct mbuf *m, *n, **np; struct mbuf *top; @@ -600,9 +617,23 @@ m_copym(struct mbuf *m0, int off, int le } n->m_len = min(len, m->m_len - off); if (m->m_flags & M_EXT) { - n->m_data = m->m_data + off; - n->m_ext = m->m_ext; - MCLADDREFERENCE(m, n); + if (!deep) { + n->m_data = m->m_data + off; + n->m_ext = m->m_ext; + MCLADDREFERENCE(m, n); + } else { + /* + * we are unsure about the way m was allocated. + * copy into multiple MCLBYTES cluster mbufs. + */ + MCLGET(n, wait); + n->m_len = 0; + n->m_len = M_TRAILINGSPACE(n); + n->m_len = min(n->m_len, len); + n->m_len = min(n->m_len, m->m_len - off); + memcpy(mtod(n, caddr_t), mtod(m, caddr_t) + off, + n->m_len); + } } else memcpy(mtod(n, caddr_t), mtod(m, caddr_t) + off, n->m_len); @@ -933,6 +964,67 @@ m_getptr(struct mbuf *m, int loc, int *o } /* + * Inject a new mbuf chain of length siz in mbuf chain m0 at + * position len0. Returns a pointer to the first injected mbuf, or + * NULL on failure (m0 is left undisturbed). Note that if there is + * enough space for an object of size siz in the appropriate position, + * no memory will be allocated. Also, there will be no data movement in + * the first len0 bytes (pointers to that will remain valid). + * + * XXX It is assumed that siz is less than the size of an mbuf at the moment. + */ +struct mbuf * +m_inject(struct mbuf *m0, int len0, int siz, int wait) +{ + struct mbuf *m, *n, *n2 = NULL, *n3; + unsigned len = len0, remain; + + if ((siz >= MHLEN) || (len0 <= 0)) + return (NULL); + for (m = m0; m && len > m->m_len; m = m->m_next) + len -= m->m_len; + if (m == NULL) + return (NULL); + remain = m->m_len - len; + if (remain == 0) { + if ((m->m_next) && (M_LEADINGSPACE(m->m_next) >= siz)) { + m->m_next->m_len += siz; + if (m0->m_flags & M_PKTHDR) + m0->m_pkthdr.len += siz; + m->m_next->m_data -= siz; + return m->m_next; + } + } else { + n2 = m_copym2(m, len, remain, wait); + if (n2 == NULL) + return (NULL); + } + + MGET(n, wait, MT_DATA); + if (n == NULL) { + if (n2) + m_freem(n2); + return (NULL); + } + + n->m_len = siz; + if (m0->m_flags & M_PKTHDR) + m0->m_pkthdr.len += siz; + m->m_len -= remain; /* Trim */ + if (n2) { + for (n3 = n; n3->m_next != NULL; n3 = n3->m_next) + ; + n3->m_next = n2; + } else + n3 = n; + for (; n3->m_next != NULL; n3 = n3->m_next) + ; + n3->m_next = m->m_next; + m->m_next = n; + return n; +} + +/* * Partition an mbuf chain in two pieces, returning the tail -- * all but the first len0 bytes. In case of failure, it returns NULL and * attempts to restore the chain to its original state. @@ -997,121 +1089,6 @@ extpacket: m->m_next = NULL; return (n); } - -/* - * Make space for a new header of length hlen at skip bytes - * into the packet. When doing this we allocate new mbufs only - * when absolutely necessary. The mbuf where the new header - * is to go is returned together with an offset into the mbuf. - * If NULL is returned then the mbuf chain may have been modified; - * the caller is assumed to always free the chain. - */ -struct mbuf * -m_makespace(struct mbuf *m0, int skip, int hlen, int *off) -{ - struct mbuf *m; - unsigned remain; - - KASSERT(m0 != NULL); - KASSERT(hlen < MHLEN); - - for (m = m0; m && skip > m->m_len; m = m->m_next) - skip -= m->m_len; - if (m == NULL) - return (NULL); - /* - * At this point skip is the offset into the mbuf m - * where the new header should be placed. Figure out - * if there's space to insert the new header. If so, - * and copying the remainder makese sense then do so. - * Otherwise insert a new mbuf in the chain, splitting - * the contents of m as needed. - */ - remain = m->m_len - skip; /* data to move */ - if (skip < remain && hlen <= M_LEADINGSPACE(m)) { - if (skip) - memmove(m->m_data-hlen, m->m_data, skip); - m->m_data -= hlen; - m->m_len += hlen; - (*off) = skip; - } else if (hlen > M_TRAILINGSPACE(m)) { - struct mbuf *n0, *n, **np; - int todo, len, done, alloc; - - n0 = NULL; - np = &n0; - alloc = 0; - done = 0; - todo = remain; - while (todo > 0) { - MGET(n, M_DONTWAIT, m->m_type); - len = MHLEN; - if (n && todo > MHLEN) { - MCLGET(n, M_DONTWAIT); - len = MCLBYTES; - if ((n->m_flags & M_EXT) == 0) { - m_free(n); - n = NULL; - } - } - if (n == NULL) { - m_freem(n0); - return NULL; - } - *np = n; - np = &n->m_next; - alloc++; - len = min(todo, len); - memcpy(n->m_data, mtod(m, char *) + skip + done, len); - n->m_len = len; - done += len; - todo -= len; - } - - if (hlen <= M_TRAILINGSPACE(m) + remain) { - m->m_len = skip + hlen; - *off = skip; - if (n0 != NULL) { - *np = m->m_next; - m->m_next = n0; - } - } - else { - n = m_get(M_DONTWAIT, m->m_type); - if (n == NULL) { - m_freem(n0); - return NULL; - } - alloc++; - - if ((n->m_next = n0) == NULL) - np = &n->m_next; - n0 = n; - - *np = m->m_next; - m->m_next = n0; - - n->m_len = hlen; - m->m_len = skip; - - m = n; /* header is at front ... */ - *off = 0; /* ... of new mbuf */ - } - } else { - /* - * Copy the remainder to the back of the mbuf - * so there's space to write the new header. - */ - if (remain > 0) - memmove(mtod(m, caddr_t) + skip + hlen, - mtod(m, caddr_t) + skip, remain); - m->m_len += hlen; - *off = skip; - } - m0->m_pkthdr.len += hlen; /* adjust packet length */ - return m; -} - /* * Routine to copy from device local memory into mbufs. Index: sys/netinet/ip_ah.c =================================================================== RCS file: /cvs/src/sys/netinet/ip_ah.c,v retrieving revision 1.123 diff -u -p -r1.123 ip_ah.c --- sys/netinet/ip_ah.c 19 Sep 2016 18:09:22 -0000 1.123 +++ sys/netinet/ip_ah.c 9 Oct 2016 00:03:45 -0000 @@ -932,7 +932,7 @@ ah_output(struct mbuf *m, struct tdb *td struct mbuf *mi; struct cryptop *crp; u_int16_t iplen; - int len, rplen, roff; + int len, rplen; u_int8_t prot; struct ah *ah; #if NBPFILTER > 0 @@ -1057,7 +1057,7 @@ ah_output(struct mbuf *m, struct tdb *td } /* Inject AH header. */ - mi = m_makespace(m, skip, rplen + ahx->authsize, &roff); + mi = m_inject(m, skip, rplen + ahx->authsize, M_DONTWAIT); if (mi == NULL) { DPRINTF(("ah_output(): failed to inject AH header for SA " "%s/%08x\n", ipsp_address(&tdb->tdb_dst, buf, sizeof(buf)), @@ -1069,10 +1069,10 @@ ah_output(struct mbuf *m, struct tdb *td } /* - * The AH header is guaranteed by m_makespace() to be in - * contiguous memory, at 'roff' of the returned mbuf. + * The AH header is guaranteed by m_inject() to be in + * contiguous memory, at the beginning of the returned mbuf. */ - ah = (struct ah *)(mtod(mi, caddr_t) + roff); + ah = mtod(mi, struct ah *); /* Initialize the AH header. */ m_copydata(m, protoff, sizeof(u_int8_t), (caddr_t) &ah->ah_nh); Index: sys/netinet/ip_esp.c =================================================================== RCS file: /cvs/src/sys/netinet/ip_esp.c,v retrieving revision 1.141 diff -u -p -r1.141 ip_esp.c --- sys/netinet/ip_esp.c 19 Sep 2016 18:09:22 -0000 1.141 +++ sys/netinet/ip_esp.c 9 Oct 2016 00:03:45 -0000 @@ -775,7 +775,7 @@ esp_output(struct mbuf *m, struct tdb *t { struct enc_xform *espx = (struct enc_xform *) tdb->tdb_encalgxform; struct auth_hash *esph = (struct auth_hash *) tdb->tdb_authalgxform; - int ilen, hlen, rlen, padding, blks, alen, roff; + int ilen, hlen, rlen, padding, blks, alen; u_int32_t replay; struct mbuf *mi, *mo = (struct mbuf *) NULL; struct tdb_crypto *tc; @@ -907,7 +907,7 @@ esp_output(struct mbuf *m, struct tdb *t } /* Inject ESP header. */ - mo = m_makespace(m, skip, hlen, &roff); + mo = m_inject(m, skip, hlen, M_DONTWAIT); if (mo == NULL) { DPRINTF(("esp_output(): failed to inject ESP header for " "SA %s/%08x\n", ipsp_address(&tdb->tdb_dst, buf, @@ -918,11 +918,10 @@ esp_output(struct mbuf *m, struct tdb *t } /* Initialize ESP header. */ - bcopy((caddr_t) &tdb->tdb_spi, mtod(mo, caddr_t) + roff, - sizeof(u_int32_t)); + bcopy((caddr_t) &tdb->tdb_spi, mtod(mo, caddr_t), sizeof(u_int32_t)); tdb->tdb_rpl++; replay = htonl((u_int32_t)tdb->tdb_rpl); - memcpy(mtod(mo, caddr_t) + roff + sizeof(u_int32_t), (caddr_t) &replay, + memcpy(mtod(mo, caddr_t) + sizeof(u_int32_t), (caddr_t) &replay, sizeof(u_int32_t)); #if NPFSYNC > 0 @@ -933,15 +932,15 @@ esp_output(struct mbuf *m, struct tdb *t * Add padding -- better to do it ourselves than use the crypto engine, * although if/when we support compression, we'd have to do that. */ - mo = m_makespace(m, m->m_pkthdr.len, padding + alen, &roff); + mo = m_inject(m, m->m_pkthdr.len, padding + alen, M_DONTWAIT); if (mo == NULL) { - DPRINTF(("esp_output(): m_makespace() failed for SA %s/%08x\n", + DPRINTF(("esp_output(): m_inject failed for SA %s/%08x\n", ipsp_address(&tdb->tdb_dst, buf, sizeof(buf)), ntohl(tdb->tdb_spi))); m_freem(m); return ENOBUFS; } - pad = mtod(mo, caddr_t) + roff; + pad = mtod(mo, u_char *); /* Apply self-describing padding */ for (ilen = 0; ilen < padding - 2; ilen++) Index: sys/netinet/ip_ipcomp.c =================================================================== RCS file: /cvs/src/sys/netinet/ip_ipcomp.c,v retrieving revision 1.48 diff -u -p -r1.48 ip_ipcomp.c --- sys/netinet/ip_ipcomp.c 24 Sep 2016 14:51:37 -0000 1.48 +++ sys/netinet/ip_ipcomp.c 9 Oct 2016 00:03:45 -0000 @@ -528,7 +528,7 @@ ipcomp_output_cb(struct cryptop *crp) struct tdb_crypto *tc; struct tdb *tdb; struct mbuf *m, *mo; - int error, s, skip, rlen, roff; + int error, s, skip, rlen; u_int16_t cpi; struct ip *ip; #ifdef INET6 @@ -593,7 +593,7 @@ ipcomp_output_cb(struct cryptop *crp) } /* Inject IPCOMP header */ - mo = m_makespace(m, skip, IPCOMP_HLENGTH, &roff); + mo = m_inject(m, skip, IPCOMP_HLENGTH, M_DONTWAIT); if (mo == NULL) { DPRINTF(("ipcomp_output_cb(): failed to inject IPCOMP header " "for IPCA %s/%08x\n", ipsp_address(&tdb->tdb_dst, buf, @@ -604,7 +604,7 @@ ipcomp_output_cb(struct cryptop *crp) } /* Initialize the IPCOMP header */ - ipcomp = (struct ipcomp *)(mtod(mo, caddr_t) + roff); + ipcomp = mtod(mo, struct ipcomp *); memset(ipcomp, 0, sizeof(struct ipcomp)); cpi = (u_int16_t) ntohl(tdb->tdb_spi); ipcomp->ipcomp_cpi = htons(cpi); Index: sys/netinet/ipsec_output.c =================================================================== RCS file: /cvs/src/sys/netinet/ipsec_output.c,v retrieving revision 1.63 diff -u -p -r1.63 ipsec_output.c --- sys/netinet/ipsec_output.c 13 Sep 2016 19:56:55 -0000 1.63 +++ sys/netinet/ipsec_output.c 9 Oct 2016 00:03:45 -0000 @@ -362,12 +362,13 @@ int ipsp_process_done(struct mbuf *m, struct tdb *tdb) { struct ip *ip; + #ifdef INET6 struct ip6_hdr *ip6; #endif /* INET6 */ + struct tdb_ident *tdbi; struct m_tag *mtag; - int roff; tdb->tdb_last_used = time_second; @@ -397,12 +398,12 @@ ipsp_process_done(struct mbuf *m, struct return ENXIO; } - mi = m_makespace(m, iphlen, sizeof(struct udphdr), &roff); + mi = m_inject(m, iphlen, sizeof(struct udphdr), M_DONTWAIT); if (mi == NULL) { m_freem(m); return ENOMEM; } - uh = (struct udphdr *)(mtod(mi, caddr_t) + roff); + uh = mtod(mi, struct udphdr *); uh->uh_sport = uh->uh_dport = htons(udpencap_port); if (tdb->tdb_udpencap_port) uh->uh_dport = tdb->tdb_udpencap_port; Index: sys/sys/mbuf.h =================================================================== RCS file: /cvs/src/sys/sys/mbuf.h,v retrieving revision 1.220 diff -u -p -r1.220 mbuf.h --- sys/sys/mbuf.h 17 Sep 2016 00:38:43 -0000 1.220 +++ sys/sys/mbuf.h 9 Oct 2016 00:20:34 -0000 @@ -419,6 +419,7 @@ extern int max_protohdr; /* largest pro extern int max_hdr; /* largest link+protocol header */ void mbinit(void); +struct mbuf *m_copym2(struct mbuf *, int, int, int); struct mbuf *m_copym(struct mbuf *, int, int, int); struct mbuf *m_free(struct mbuf *); struct mbuf *m_get(int, int); @@ -431,7 +432,7 @@ struct mbuf *m_prepend(struct mbuf *, in struct mbuf *m_pulldown(struct mbuf *, int, int, int *); struct mbuf *m_pullup(struct mbuf *, int); struct mbuf *m_split(struct mbuf *, int, int); -struct mbuf *m_makespace(struct mbuf *, int, int, int *); +struct mbuf *m_inject(struct mbuf *, int, int, int); struct mbuf *m_getptr(struct mbuf *, int, int *); int m_leadingspace(struct mbuf *); int m_trailingspace(struct mbuf *); Index: share/man/man9/mbuf.9 =================================================================== RCS file: /cvs/src/share/man/man9/mbuf.9,v retrieving revision 1.104 diff -u -p -r1.104 mbuf.9 --- share/man/man9/mbuf.9 15 Sep 2016 00:00:40 -0000 1.104 +++ share/man/man9/mbuf.9 9 Oct 2016 00:03:45 -0000 @@ -42,7 +42,7 @@ .Nm m_pulldown , .Nm m_pullup , .Nm m_split , -.Nm m_makespace , +.Nm m_inject , .Nm m_getptr , .Nm m_adj , .Nm m_copyback , @@ -92,7 +92,7 @@ .Ft struct mbuf * .Fn m_split "struct mbuf *m0" "int len0" "int wait" .Ft struct mbuf * -.Fn m_makespace "struct mbuf *m0" "int skip" "int hlen" "int *off" +.Fn m_inject "struct mbuf *m0" "int len0" "int siz" "int wait" .Ft struct mbuf * .Fn m_getptr "struct mbuf *m" "int loc" "int *off" .Ft void @@ -559,18 +559,18 @@ Split an mbuf chain in two pieces, retur the tail (which is made of the previous mbuf chain except the first .Fa len0 bytes). -.It Fn m_makespace "struct mbuf *m0" "int skip" "int hlen" "int *off" -Make space for a continuous memory region of length -.Fa hlen -at -.Fa skip -bytes into the mbuf chain. -On success, the mbuf of the continuous memory is returned -together with an offset -.Fa off -into the mbuf. -On failure, NULL is returned and the mbuf chain may have been modified. -The caller is assumed to always free the chain. +.It Fn m_inject "struct mbuf *m0" "int len0" "int siz" "int wait" +Inject a new mbuf chain of length +.Fa siz +into the mbuf chain pointed to by +.Fa m0 +at position +.Fa len0 . +If there is enough space for an object of size +.Fa siz +in the appropriate location, no memory will be allocated. +On failure, the function returns NULL (the mbuf is left untouched) and +on success, a pointer to the first injected mbuf is returned. .It Fn m_getptr "struct mbuf *m" "int loc" "int *off" Returns a pointer to the mbuf containing the data located at .Fa loc -- Christian "naddy" Weisgerber na...@mips.inka.de