Add initial support for ICMP error handling inside MPLS. This includes
support for extended ICMP messages (RFC 4884 and 4950). I had to massage
icmp_reflect() to avoid some code duplication. Because of this I did some
minimal code cleanup.
This allows me to traceroute through a pure OpenBSD MPLS cloud and get the
full MPLS path back.
traceroute -x 192.168.237.242
traceroute to 192.168.237.242 (192.168.237.242), 64 hops max, 40 byte packets
1 192.168.237.2 (192.168.237.2) 3.16 ms 0.305 ms 0.189 ms
2 10.42.3.1 (10.42.3.1) 4.568 ms 1.327 ms 0.955 ms [MPLS Label 23, 666]
3 10.42.7.1 (10.42.7.1) 2.706 ms 1.171 ms 0.910 ms [MPLS Label 23, 666]
4 10.42.42.2 (10.42.42.2) 2.892 ms 1.215 ms 0.926 ms [MPLS Label 16, 666]
5 192.168.237.242 (192.168.237.242) 2.360 ms 1.167 ms 0.894 ms
--
:wq Claudio
Index: netinet/ip_icmp.c
===================================================================
RCS file: /cvs/src/sys/netinet/ip_icmp.c,v
retrieving revision 1.91
diff -u -p -r1.91 ip_icmp.c
--- netinet/ip_icmp.c 9 Jul 2010 15:44:20 -0000 1.91
+++ netinet/ip_icmp.c 9 Sep 2010 13:37:53 -0000
@@ -291,7 +291,8 @@ icmp_error(struct mbuf *n, int type, int
m = icmp_do_error(n, type, code, dest, destmtu);
if (m != NULL)
- icmp_reflect(m);
+ if (!icmp_reflect(m, NULL, NULL))
+ icmp_send(m, NULL);
}
struct sockaddr_in icmpsrc = { sizeof (struct sockaddr_in), AF_INET };
@@ -315,6 +316,7 @@ icmp_input(struct mbuf *m, ...)
int hlen;
va_list ap;
struct rtentry *rt;
+ struct mbuf *opts;
va_start(ap, m);
hlen = va_arg(ap, int);
@@ -508,14 +510,14 @@ icmp_input(struct mbuf *m, ...)
case ICMP_MASKREQ:
if (icmpmaskrepl == 0)
break;
- /*
- * We are not able to respond with all ones broadcast
- * unless we receive it over a point-to-point interface.
- */
if (icmplen < ICMP_MASKLEN) {
icmpstat.icps_badlen++;
break;
}
+ /*
+ * We are not able to respond with all ones broadcast
+ * unless we receive it over a point-to-point interface.
+ */
if (ip->ip_dst.s_addr == INADDR_BROADCAST ||
ip->ip_dst.s_addr == INADDR_ANY)
icmpdst.sin_addr = ip->ip_src;
@@ -548,7 +550,8 @@ reflect:
icmpstat.icps_reflect++;
icmpstat.icps_outhist[icp->icmp_type]++;
- icmp_reflect(m);
+ if (!icmp_reflect(m, &opts, NULL))
+ icmp_send(m, opts);
return;
case ICMP_REDIRECT:
@@ -637,11 +640,10 @@ freeit:
/*
* Reflect the ip packet back to the source
*/
-void
-icmp_reflect(struct mbuf *m)
+int
+icmp_reflect(struct mbuf *m, struct mbuf **op, struct in_ifaddr *ia)
{
struct ip *ip = mtod(m, struct ip *);
- struct in_ifaddr *ia;
struct in_addr t;
struct mbuf *opts = 0;
int optlen = (ip->ip_hl << 2) - sizeof(struct ip);
@@ -649,8 +651,8 @@ icmp_reflect(struct mbuf *m)
if (!in_canforward(ip->ip_src) &&
((ip->ip_src.s_addr & IN_CLASSA_NET) !=
htonl(IN_LOOPBACKNET << IN_CLASSA_NSHIFT))) {
- m_freem(m); /* Bad return address */
- goto done; /* ip_output() will check for broadcast */
+ m_freem(m); /* Bad return address */
+ return (EHOSTUNREACH);
}
#if NPF > 0
@@ -663,21 +665,24 @@ icmp_reflect(struct mbuf *m)
* use dst as the src for the reply. For broadcast, use
* the address which corresponds to the incoming interface.
*/
- TAILQ_FOREACH(ia, &in_ifaddr, ia_list) {
- if (ia->ia_ifp->if_rdomain != rtable_l2(m->m_pkthdr.rdomain))
- continue;
- if (t.s_addr == ia->ia_addr.sin_addr.s_addr)
- break;
- if ((ia->ia_ifp->if_flags & IFF_BROADCAST) &&
- t.s_addr == ia->ia_broadaddr.sin_addr.s_addr)
- break;
+ if (ia == NULL) {
+ TAILQ_FOREACH(ia, &in_ifaddr, ia_list) {
+ if (ia->ia_ifp->if_rdomain !=
+ rtable_l2(m->m_pkthdr.rdomain))
+ continue;
+ if (t.s_addr == ia->ia_addr.sin_addr.s_addr)
+ break;
+ if ((ia->ia_ifp->if_flags & IFF_BROADCAST) &&
+ t.s_addr == ia->ia_broadaddr.sin_addr.s_addr)
+ break;
+ }
}
/*
* The following happens if the packet was not addressed to us.
* Use the new source address and do a route lookup. If it fails
* drop the packet as there is no path to the host.
*/
- if (ia == (struct in_ifaddr *)0) {
+ if (ia == NULL) {
struct sockaddr_in *dst;
struct route ro;
@@ -693,7 +698,7 @@ icmp_reflect(struct mbuf *m)
if (ro.ro_rt == 0) {
ipstat.ips_noroute++;
m_freem(m);
- goto done;
+ return (EHOSTUNREACH);
}
ia = ifatoia(ro.ro_rt->rt_ifa);
@@ -715,12 +720,12 @@ icmp_reflect(struct mbuf *m)
* add on any record-route or timestamp options.
*/
cp = (u_char *) (ip + 1);
- if ((opts = ip_srcroute()) == 0 &&
+ if (op && (opts = ip_srcroute()) == 0 &&
(opts = m_gethdr(M_DONTWAIT, MT_HEADER))) {
opts->m_len = sizeof(struct in_addr);
mtod(opts, struct in_addr *)->s_addr = 0;
}
- if (opts) {
+ if (op && opts) {
#ifdef ICMPPRINTFS
if (icmpprintfs)
printf("icmp_reflect optlen %d rt %d => ",
@@ -778,10 +783,10 @@ icmp_reflect(struct mbuf *m)
(unsigned)(m->m_len - sizeof(struct ip)));
}
m->m_flags &= ~(M_BCAST|M_MCAST);
- icmp_send(m, opts);
-done:
- if (opts)
- (void)m_free(opts);
+ if (op)
+ *op = opts;
+
+ return (0);
}
/*
@@ -1051,4 +1056,83 @@ icmp_redirect_timeout(struct rtentry *rt
rtrequest1(RTM_DELETE, &info, rt->rt_priority, NULL,
r->rtt_tableid);
}
+}
+
+/*
+ * Append an ICMP extension structure (RFC 4884) holding a single object of
+ * the given class and ctype, with len bytes of payload copied from buf, to
+ * the ICMP error message in m.
+ *
+ * Returns 0 when the extension was added, and also when m was left alone
+ * because its ICMP type takes no extension or its length field is already
+ * set.  If m_copyback() fails, m is freed and ENOBUFS is returned.
+ */
+int
+icmp_do_exthdr(struct mbuf *m, u_int16_t class, u_int8_t ctype, void *buf,
+    size_t len)
+{
+	struct {
+		struct icmp_ext_hdr ieh;
+		struct icmp_ext_obj_hdr ieo;
+	} ext;
+	struct ip *ip = mtod(m, struct ip *);
+	struct icmp_ext_hdr *cksum_hdr;
+	struct icmp *icp;
+	struct mbuf *cm;
+	int hlen, off, skip;
+
+	hlen = ip->ip_hl << 2;
+	icp = (struct icmp *)(mtod(m, caddr_t) + hlen);
+
+	switch (icp->icmp_type) {
+	case ICMP_TIMXCEED:
+	case ICMP_UNREACH:
+	case ICMP_PARAMPROB:
+		break;
+	default:
+		/* this ICMP type does not take an extension */
+		return (0);
+	}
+
+	/* a length that is already set means an extension is present */
+	if (icp->icmp_length != 0)
+		return (0);
+
+	/* from here on offsets are counted past the common ICMP header */
+	hlen += ICMP_MINLEN;
+	/* start on a word boundary, no earlier than ICMP_EXT_OFFSET */
+	off = roundup(ntohs(ip->ip_len) - hlen, sizeof(u_int32_t));
+	off = max(off, ICMP_EXT_OFFSET);
+	icp->icmp_length = off / sizeof(u_int32_t);
+
+	bzero(&ext, sizeof(ext));
+	ext.ieh.ieh_version = ICMP_EXT_HDR_VERSION;
+	ext.ieo.ieo_length = htons(sizeof(struct icmp_ext_obj_hdr) + len);
+	ext.ieo.ieo_cnum = class;
+	ext.ieo.ieo_ctype = ctype;
+
+	if (m_copyback(m, hlen + off, sizeof(ext), &ext, M_NOWAIT) != 0 ||
+	    m_copyback(m, hlen + off + sizeof(ext), len, buf, M_NOWAIT) != 0) {
+		m_freem(m);
+		return (ENOBUFS);
+	}
+
+	/*
+	 * Checksum the extension.  in_cksum() is given no starting offset
+	 * here, so the mbuf holding the extension header is temporarily
+	 * advanced to begin at that header and put back afterwards.
+	 */
+	cm = m_getptr(m, hlen + off, &skip);
+	if (cm == NULL)
+		panic("icmp_do_exthdr: m_getptr failure");
+	cm->m_data += skip;
+	cm->m_len -= skip;
+	cksum_hdr = mtod(cm, struct icmp_ext_hdr *);
+	cksum_hdr->ieh_cksum = in_cksum(cm, sizeof(ext) + len);
+	cm->m_data -= skip;
+	cm->m_len += skip;
+
+	ip->ip_len = htons(m->m_pkthdr.len);
+
+	return (0);
}
Index: netinet/ip_icmp.h
===================================================================
RCS file: /cvs/src/sys/netinet/ip_icmp.h,v
retrieving revision 1.23
diff -u -p -r1.23 ip_icmp.h
--- netinet/ip_icmp.h 8 Jul 2010 20:20:11 -0000 1.23
+++ netinet/ip_icmp.h 9 Sep 2010 09:41:36 -0000
@@ -234,11 +234,12 @@ struct mbuf *
void icmp_error(struct mbuf *, int, int, n_long, int);
void icmp_input(struct mbuf *, ...);
void icmp_init(void);
-void icmp_reflect(struct mbuf *);
+int icmp_reflect(struct mbuf *, struct mbuf **, struct in_ifaddr *);
void icmp_send(struct mbuf *, struct mbuf *);
int icmp_sysctl(int *, u_int, void *, size_t *, void *, size_t);
struct rtentry *
icmp_mtudisc_clone(struct sockaddr *, u_int);
void icmp_mtudisc(struct icmp *, u_int);
+int icmp_do_exthdr(struct mbuf *, u_int16_t, u_int8_t, void *, size_t);
#endif /* _KERNEL */
#endif /* _NETINET_IP_ICMP_H_ */
Index: netmpls/mpls_input.c
===================================================================
RCS file: /cvs/src/sys/netmpls/mpls_input.c,v
retrieving revision 1.28
diff -u -p -r1.28 mpls_input.c
--- netmpls/mpls_input.c 7 Jul 2010 20:58:25 -0000 1.28
+++ netmpls/mpls_input.c 9 Sep 2010 10:41:09 -0000
@@ -33,6 +33,8 @@
#include <netinet/in_var.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_icmp.h>
#endif
#ifdef INET6
@@ -53,12 +55,11 @@ extern int mpls_inkloop;
#define MPLS_TTL_GET(l) (ntohl((l) & MPLS_TTL_MASK))
#endif
-extern int mpls_mapttl_ip;
-extern int mpls_mapttl_ip6;
-
int mpls_ip_adjttl(struct mbuf *, u_int8_t);
int mpls_ip6_adjttl(struct mbuf *, u_int8_t);
+struct mbuf *mpls_do_error(struct mbuf *, int, int, int);
+
void
mpls_init(void)
{
@@ -124,16 +125,14 @@ mpls_input(struct mbuf *m)
/* check and decrement TTL */
ttl = ntohl(shim->shim_label & MPLS_TTL_MASK);
- if (ttl <= 1) {
+ if (ttl-- <= 1) {
/* TTL exceeded */
- /*
- * XXX if possible hand packet up to network layer so that an
- * ICMP TTL exceeded can be sent back.
- */
- m_freem(m);
- return;
+ m = mpls_do_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, 0);
+ if (m == NULL)
+ return;
+ shim = mtod(m, struct shim_hdr *);
+ ttl = ntohl(shim->shim_label & MPLS_TTL_MASK);
}
- ttl--;
bzero(&sa_mpls, sizeof(sa_mpls));
smpls = &sa_mpls;
@@ -341,11 +340,11 @@ done:
int
mpls_ip_adjttl(struct mbuf *m, u_int8_t ttl)
{
- struct ip *ip;
- int hlen;
+ struct ip *ip;
+ int hlen;
if (mpls_mapttl_ip) {
- if (m->m_len < sizeof (struct ip) &&
+ if (m->m_len < sizeof(struct ip) &&
(m = m_pullup(m, sizeof(struct ip))) == NULL)
return -1;
ip = mtod(m, struct ip *);
@@ -377,7 +376,7 @@ mpls_ip6_adjttl(struct mbuf *m, u_int8_t
struct ip6_hdr *ip6hdr;
if (mpls_mapttl_ip6) {
- if (m->m_len < sizeof (struct ip6_hdr) &&
+ if (m->m_len < sizeof(struct ip6_hdr) &&
(m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL)
return -1;
@@ -387,4 +386,126 @@ mpls_ip6_adjttl(struct mbuf *m, u_int8_t
ip6hdr->ip6_hlim = ttl;
}
return 0;
+}
+
+/*
+ * Turn the MPLS frame in m into an ICMP error of the given type and code
+ * (destmtu is handed through to icmp_do_error()).  The received label stack
+ * is attached to the ICMP message as an ICMP_EXT_MPLS extension object and
+ * then put back in front of the new packet with the TTL of the top label
+ * reset to mpls_defttl, so what comes back is an MPLS frame again.
+ *
+ * Only IPv4 payloads are handled.  Returns the new frame, or NULL if the
+ * packet was dropped; m must not be used by the caller in that case.
+ */
+struct mbuf *
+mpls_do_error(struct mbuf *m, int type, int code, int destmtu)
+{
+	struct shim_hdr stack[MPLS_INKERNEL_LOOP_MAX];
+	struct sockaddr_mpls sa_mpls;
+	struct rtentry *rt = NULL;
+	struct shim_hdr *shim;
+	struct in_ifaddr *ia;
+	struct icmp *icp;
+	struct ip *ip;
+	int nstk;
+
+	/* pop the label stack, remembering every entry up to bottom of stack */
+	for (nstk = 0; nstk < MPLS_INKERNEL_LOOP_MAX; nstk++) {
+		if (m->m_len < sizeof(*shim) &&
+		    (m = m_pullup(m, sizeof(*shim))) == NULL)
+			return (NULL);
+		stack[nstk] = *mtod(m, struct shim_hdr *);
+		m_adj(m, sizeof(*shim));
+		if (MPLS_BOS_ISSET(stack[nstk].shim_label))
+			break;
+	}
+	if (nstk >= MPLS_INKERNEL_LOOP_MAX) {
+		/* more labels than stack[] holds and no bottom of stack */
+		m_freem(m);
+		return (NULL);
+	}
+
+	/* the version nibble must be in the first mbuf before it is read */
+	if (m->m_len < sizeof(u_char) &&
+	    (m = m_pullup(m, sizeof(u_char))) == NULL)
+		return (NULL);
+	switch (*mtod(m, u_char *) >> 4) {
+	case IPVERSION:
+		if (m->m_len < sizeof(*ip) &&
+		    (m = m_pullup(m, sizeof(*ip))) == NULL)
+			return (NULL);
+		m = icmp_do_error(m, type, code, 0, destmtu);
+		if (m == NULL)
+			return (NULL);
+
+		/* icmp_do_exthdr() frees m when it fails */
+		if (icmp_do_exthdr(m, ICMP_EXT_MPLS, 1, stack,
+		    (nstk + 1) * sizeof(*shim)))
+			return (NULL);
+
+		/* set ip_src to something usable, based on the MPLS label */
+		bzero(&sa_mpls, sizeof(sa_mpls));
+		sa_mpls.smpls_family = AF_MPLS;
+		sa_mpls.smpls_len = sizeof(sa_mpls);
+		sa_mpls.smpls_label = stack[0].shim_label & MPLS_LABEL_MASK;
+
+		rt = rtalloc1(smplstosa(&sa_mpls), RT_REPORT, 0);
+		if (rt == NULL) {
+			/* no entry for this label */
+			m_freem(m);
+			return (NULL);
+		}
+		if (rt->rt_ifa->ifa_addr->sa_family != AF_INET) {
+			/*
+			 * XXX this needs fixing, if the MPLS is on an IP
+			 * less interface we need to find some other IP to
+			 * use as source.
+			 */
+			RTFREE(rt);
+			m_freem(m);
+			return (NULL);
+		}
+		ia = ifatoia(rt->rt_ifa);
+		rt->rt_use++;
+		RTFREE(rt);
+		/* icmp_reflect() frees m when it fails */
+		if (icmp_reflect(m, NULL, ia))
+			return (NULL);
+
+		ip = mtod(m, struct ip *);
+		/* stuff to fix up which is normally done in ip_output */
+		ip->ip_v = IPVERSION;
+		ip->ip_id = htons(ip_randomid());
+		ip->ip_sum = 0;
+		ip->ip_sum = in_cksum(m, sizeof(*ip));
+
+		/* stolen from icmp_send() */
+		m->m_data += sizeof(*ip);
+		m->m_len -= sizeof(*ip);
+		icp = mtod(m, struct icmp *);
+		icp->icmp_cksum = 0;
+		icp->icmp_cksum = in_cksum(m, ntohs(ip->ip_len) - sizeof(*ip));
+		m->m_data -= sizeof(*ip);
+		m->m_len += sizeof(*ip);
+
+		break;
+	case IPV6_VERSION >> 4:
+	default:
+		m_freem(m);
+		return (NULL);
+	}
+
+	/* add mpls stack back to new packet */
+	M_PREPEND(m, (nstk + 1) * sizeof(*shim), M_NOWAIT);
+	if (m == NULL)
+		return (NULL);
+	m_copyback(m, 0, (nstk + 1) * sizeof(*shim), stack, M_NOWAIT);
+
+	/* change TTL to default */
+	shim = mtod(m, struct shim_hdr *);
+	shim->shim_label =
+	    (shim->shim_label & ~MPLS_TTL_MASK) | htonl(mpls_defttl);
+
+	return (m);
}