Add initial support for ICMP error handling inside MPLS. This includes
support for extended ICMP messages (RFC 4884 and RFC 4950). I had to rework
icmp_reflect() to avoid some code duplication, and while there I did some
minimal code cleanup.

This allows me to traceroute through a pure OpenBSD MPLS cloud and get the
full MPLS path back.
 traceroute -x 192.168.237.242
traceroute to 192.168.237.242 (192.168.237.242), 64 hops max, 40 byte packets
 1  192.168.237.2 (192.168.237.2)  3.16 ms  0.305 ms  0.189 ms
 2  10.42.3.1 (10.42.3.1)  4.568 ms  1.327 ms  0.955 ms [MPLS Label 23, 666]
 3  10.42.7.1 (10.42.7.1)  2.706 ms  1.171 ms  0.910 ms [MPLS Label 23, 666]
 4  10.42.42.2 (10.42.42.2)  2.892 ms  1.215 ms  0.926 ms [MPLS Label 16, 666]
 5  192.168.237.242 (192.168.237.242)  2.360 ms  1.167 ms  0.894 ms

-- 
:wq Claudio

Index: netinet/ip_icmp.c
===================================================================
RCS file: /cvs/src/sys/netinet/ip_icmp.c,v
retrieving revision 1.91
diff -u -p -r1.91 ip_icmp.c
--- netinet/ip_icmp.c   9 Jul 2010 15:44:20 -0000       1.91
+++ netinet/ip_icmp.c   9 Sep 2010 13:37:53 -0000
@@ -291,7 +291,8 @@ icmp_error(struct mbuf *n, int type, int
 
        m = icmp_do_error(n, type, code, dest, destmtu);
        if (m != NULL)
-               icmp_reflect(m);
+               if (!icmp_reflect(m, NULL, NULL))
+                       icmp_send(m, NULL);
 }
 
 struct sockaddr_in icmpsrc = { sizeof (struct sockaddr_in), AF_INET };
@@ -315,6 +316,7 @@ icmp_input(struct mbuf *m, ...)
        int hlen;
        va_list ap;
        struct rtentry *rt;
+       struct mbuf *opts;
 
        va_start(ap, m);
        hlen = va_arg(ap, int);
@@ -508,14 +510,14 @@ icmp_input(struct mbuf *m, ...)
        case ICMP_MASKREQ:
                if (icmpmaskrepl == 0)
                        break;
-               /*
-                * We are not able to respond with all ones broadcast
-                * unless we receive it over a point-to-point interface.
-                */
                if (icmplen < ICMP_MASKLEN) {
                        icmpstat.icps_badlen++;
                        break;
                }
+               /*
+                * We are not able to respond with all ones broadcast
+                * unless we receive it over a point-to-point interface.
+                */
                if (ip->ip_dst.s_addr == INADDR_BROADCAST ||
                    ip->ip_dst.s_addr == INADDR_ANY)
                        icmpdst.sin_addr = ip->ip_src;
@@ -548,7 +550,12 @@ reflect:
 
                icmpstat.icps_reflect++;
                icmpstat.icps_outhist[icp->icmp_type]++;
-               icmp_reflect(m);
+               if (!icmp_reflect(m, &opts, NULL)) {
+                       icmp_send(m, opts);
+                       /* icmp_reflect() no longer frees opts, do it here */
+                       if (opts)
+                               (void)m_free(opts);
+               }
                return;
 
        case ICMP_REDIRECT:
@@ -637,11 +640,10 @@ freeit:
 /*
  * Reflect the ip packet back to the source
  */
-void
-icmp_reflect(struct mbuf *m)
+int
+icmp_reflect(struct mbuf *m, struct mbuf **op, struct in_ifaddr *ia)
 {
        struct ip *ip = mtod(m, struct ip *);
-       struct in_ifaddr *ia;
        struct in_addr t;
        struct mbuf *opts = 0;
        int optlen = (ip->ip_hl << 2) - sizeof(struct ip);
@@ -649,8 +651,8 @@ icmp_reflect(struct mbuf *m)
        if (!in_canforward(ip->ip_src) &&
            ((ip->ip_src.s_addr & IN_CLASSA_NET) !=
            htonl(IN_LOOPBACKNET << IN_CLASSA_NSHIFT))) {
-               m_freem(m);     /* Bad return address */
-               goto done;      /* ip_output() will check for broadcast */
+               m_freem(m);             /* Bad return address */
+               return (EHOSTUNREACH);
        }
 
 #if NPF > 0
@@ -663,21 +665,24 @@ icmp_reflect(struct mbuf *m)
         * use dst as the src for the reply.  For broadcast, use
         * the address which corresponds to the incoming interface.
         */
-       TAILQ_FOREACH(ia, &in_ifaddr, ia_list) {
-               if (ia->ia_ifp->if_rdomain != rtable_l2(m->m_pkthdr.rdomain))
-                       continue;
-               if (t.s_addr == ia->ia_addr.sin_addr.s_addr)
-                       break;
-               if ((ia->ia_ifp->if_flags & IFF_BROADCAST) &&
-                   t.s_addr == ia->ia_broadaddr.sin_addr.s_addr)
-                       break;
+       if (ia == NULL) {
+               TAILQ_FOREACH(ia, &in_ifaddr, ia_list) {
+                       if (ia->ia_ifp->if_rdomain !=
+                           rtable_l2(m->m_pkthdr.rdomain))
+                               continue;
+                       if (t.s_addr == ia->ia_addr.sin_addr.s_addr)
+                               break;
+                       if ((ia->ia_ifp->if_flags & IFF_BROADCAST) &&
+                           t.s_addr == ia->ia_broadaddr.sin_addr.s_addr)
+                               break;
+               }
        }
        /*
         * The following happens if the packet was not addressed to us.
         * Use the new source address and do a route lookup. If it fails
         * drop the packet as there is no path to the host.
         */
-       if (ia == (struct in_ifaddr *)0) {
+       if (ia == NULL) {
                struct sockaddr_in *dst;
                struct route ro;
 
@@ -693,7 +698,7 @@ icmp_reflect(struct mbuf *m)
                if (ro.ro_rt == 0) {
                        ipstat.ips_noroute++;
                        m_freem(m);
-                       goto done;
+                       return (EHOSTUNREACH);
                }
 
                ia = ifatoia(ro.ro_rt->rt_ifa);
@@ -715,12 +720,12 @@ icmp_reflect(struct mbuf *m)
                 * add on any record-route or timestamp options.
                 */
                cp = (u_char *) (ip + 1);
-               if ((opts = ip_srcroute()) == 0 &&
+               if (op && (opts = ip_srcroute()) == 0 &&
                    (opts = m_gethdr(M_DONTWAIT, MT_HEADER))) {
                        opts->m_len = sizeof(struct in_addr);
                        mtod(opts, struct in_addr *)->s_addr = 0;
                }
-               if (opts) {
+               if (op && opts) {
 #ifdef ICMPPRINTFS
                        if (icmpprintfs)
                                printf("icmp_reflect optlen %d rt %d => ",
@@ -778,10 +783,10 @@ icmp_reflect(struct mbuf *m)
                    (unsigned)(m->m_len - sizeof(struct ip)));
        }
        m->m_flags &= ~(M_BCAST|M_MCAST);
-       icmp_send(m, opts);
-done:
-       if (opts)
-               (void)m_free(opts);
+       if (op)
+               *op = opts;
+
+       return (0);
 }
 
 /*
@@ -1051,4 +1056,66 @@ icmp_redirect_timeout(struct rtentry *rt
                rtrequest1(RTM_DELETE, &info, rt->rt_priority, NULL, 
                    r->rtt_tableid);
        }
+}
+
+int    /* 0 on success or when nothing was added, ENOBUFS after freeing m */
+icmp_do_exthdr(struct mbuf *m, u_int16_t class, u_int8_t ctype, void *buf,
+    size_t len)
+{
+       struct ip *ip = mtod(m, struct ip *);   /* m starts with the IP header */
+       int hlen, off;
+       struct mbuf *n;
+       struct icmp *icp;
+       struct icmp_ext_hdr *ieh;
+       struct {
+               struct icmp_ext_hdr     ieh;
+               struct icmp_ext_obj_hdr ieo;
+       } hdr;  /* extension header directly followed by one object header */
+
+       hlen = ip->ip_hl << 2;  /* IP header length in bytes */
+       icp = (struct icmp *)(mtod(m, caddr_t) + hlen);
+       if (icp->icmp_type != ICMP_TIMXCEED && icp->icmp_type != ICMP_UNREACH &&
+           icp->icmp_type != ICMP_PARAMPROB)
+               /* exthdr not supported for this ICMP type, leave m untouched */
+               return (0);
+       
+       if (icp->icmp_length != 0)
+               /* exthdr already present, giving up */
+               return (0);
+
+       /* the actual offset starts after the common ICMP header */
+       hlen += ICMP_MINLEN;
+       /* exthdr must start on a word boundary */
+       off = roundup(ntohs(ip->ip_len) - hlen, sizeof(u_int32_t));
+       /* ... and at an offset of ICMP_EXT_OFFSET or bigger */
+       off = max(off, ICMP_EXT_OFFSET);
+       icp->icmp_length = off / sizeof(u_int32_t);     /* in 32-bit words */
+
+       bzero(&hdr, sizeof(hdr));
+       hdr.ieh.ieh_version = ICMP_EXT_HDR_VERSION;
+       hdr.ieo.ieo_length = htons(sizeof(struct icmp_ext_obj_hdr) + len);
+       hdr.ieo.ieo_cnum = class;
+       hdr.ieo.ieo_ctype = ctype;
+
+       if (m_copyback(m, hlen + off, sizeof(hdr), &hdr, M_NOWAIT) ||
+           m_copyback(m, hlen + off + sizeof(hdr), len, buf, M_NOWAIT)) {
+               m_freem(m);     /* caller must not use m after ENOBUFS */
+               return (ENOBUFS);
+       }
+
+       /* calculate checksum over the extension header and the object */
+       n = m_getptr(m, hlen + off, &off);
+       if (n == NULL)
+               panic("icmp_do_exthdr: m_getptr failure");
+       /* ugly: in_cksum() sums from m_data, temporarily skip off bytes */
+       n->m_data += off;
+       n->m_len -= off;
+       ieh = mtod(n, struct icmp_ext_hdr *);
+       ieh->ieh_cksum = in_cksum(n, sizeof(hdr) + len);
+       n->m_data -= off;       /* undo the temporary adjustment */
+       n->m_len += off;
+
+       ip->ip_len = htons(m->m_pkthdr.len);    /* resync with the chain length */
+
+       return (0);
 }
Index: netinet/ip_icmp.h
===================================================================
RCS file: /cvs/src/sys/netinet/ip_icmp.h,v
retrieving revision 1.23
diff -u -p -r1.23 ip_icmp.h
--- netinet/ip_icmp.h   8 Jul 2010 20:20:11 -0000       1.23
+++ netinet/ip_icmp.h   9 Sep 2010 09:41:36 -0000
@@ -234,11 +234,12 @@ struct mbuf *
 void   icmp_error(struct mbuf *, int, int, n_long, int);
 void   icmp_input(struct mbuf *, ...);
 void   icmp_init(void);
-void   icmp_reflect(struct mbuf *);
+int    icmp_reflect(struct mbuf *, struct mbuf **, struct in_ifaddr *);
 void   icmp_send(struct mbuf *, struct mbuf *);
 int    icmp_sysctl(int *, u_int, void *, size_t *, void *, size_t);
 struct rtentry *
        icmp_mtudisc_clone(struct sockaddr *, u_int);
 void   icmp_mtudisc(struct icmp *, u_int);
+int    icmp_do_exthdr(struct mbuf *, u_int16_t, u_int8_t, void *, size_t);
 #endif /* _KERNEL */
 #endif /* _NETINET_IP_ICMP_H_ */
Index: netmpls/mpls_input.c
===================================================================
RCS file: /cvs/src/sys/netmpls/mpls_input.c,v
retrieving revision 1.28
diff -u -p -r1.28 mpls_input.c
--- netmpls/mpls_input.c        7 Jul 2010 20:58:25 -0000       1.28
+++ netmpls/mpls_input.c        9 Sep 2010 10:41:09 -0000
@@ -33,6 +33,8 @@
 #include <netinet/in_var.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_icmp.h>
 #endif
 
 #ifdef INET6
@@ -53,12 +55,11 @@ extern int  mpls_inkloop;
 #define MPLS_TTL_GET(l)                (ntohl((l) & MPLS_TTL_MASK))
 #endif
 
-extern int     mpls_mapttl_ip;
-extern int     mpls_mapttl_ip6;
-
 int    mpls_ip_adjttl(struct mbuf *, u_int8_t);
 int    mpls_ip6_adjttl(struct mbuf *, u_int8_t);
 
+struct mbuf    *mpls_do_error(struct mbuf *, int, int, int);
+
 void
 mpls_init(void)
 {
@@ -124,16 +125,14 @@ mpls_input(struct mbuf *m)
 
        /* check and decrement TTL */
        ttl = ntohl(shim->shim_label & MPLS_TTL_MASK);
-       if (ttl <= 1) {
+       if (ttl-- <= 1) {
                /* TTL exceeded */
-               /*
-                * XXX if possible hand packet up to network layer so that an
-                * ICMP TTL exceeded can be sent back.
-                */
-               m_freem(m);
-               return;
+               m = mpls_do_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, 0);
+               if (m == NULL)
+                       return;
+               shim = mtod(m, struct shim_hdr *);
+               ttl = ntohl(shim->shim_label & MPLS_TTL_MASK);
        }
-       ttl--;
 
        bzero(&sa_mpls, sizeof(sa_mpls));
        smpls = &sa_mpls;
@@ -341,11 +340,11 @@ done:
 int
 mpls_ip_adjttl(struct mbuf *m, u_int8_t ttl)
 {
-       struct ip       *ip;
-       int              hlen;
+       struct ip *ip;
+       int hlen;
 
        if (mpls_mapttl_ip) {
-               if (m->m_len < sizeof (struct ip) &&
+               if (m->m_len < sizeof(struct ip) &&
                    (m = m_pullup(m, sizeof(struct ip))) == NULL)
                        return -1;
                ip = mtod(m, struct ip *);
@@ -377,7 +376,7 @@ mpls_ip6_adjttl(struct mbuf *m, u_int8_t
        struct ip6_hdr *ip6hdr;
 
        if (mpls_mapttl_ip6) {
-               if (m->m_len < sizeof (struct ip6_hdr) &&
+               if (m->m_len < sizeof(struct ip6_hdr) &&
                    (m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL)
                        return -1;
 
@@ -387,4 +386,138 @@ mpls_ip6_adjttl(struct mbuf *m, u_int8_t
                ip6hdr->ip6_hlim = ttl;
        }
        return 0;
+}
+
+/*
+ * Build an ICMP error for a labelled packet that cannot be forwarded.
+ *
+ * The label stack is stripped from m and saved, an ICMP error of the given
+ * type and code is generated for the inner IPv4 packet, the saved stack is
+ * appended as an MPLS extension object (RFC 4950) and the stack is then put
+ * back in front of the reply with the TTL of the top label set to
+ * mpls_defttl.
+ *
+ * m is consumed; the caller continues with the returned chain only.  NULL
+ * is returned when nothing is left to send: the payload is not IPv4, no
+ * bottom of stack was found within MPLS_INKERNEL_LOOP_MAX labels, no IPv4
+ * source address could be derived from the label, or a helper failed.
+ */
+struct mbuf *
+mpls_do_error(struct mbuf *m, int type, int code, int destmtu)
+{
+       struct shim_hdr stack[MPLS_INKERNEL_LOOP_MAX];
+       struct sockaddr_mpls sa_mpls;
+       struct sockaddr_mpls *smpls;
+       struct rtentry *rt = NULL;
+       struct shim_hdr *shim;
+       struct in_ifaddr *ia;
+       struct icmp *icp;
+       struct ip *ip;
+       int nstk, error;
+
+       /* pop and remember every label up to and including bottom of stack */
+       for (nstk = 0; nstk < MPLS_INKERNEL_LOOP_MAX; nstk++) {
+               if (m->m_len < sizeof(*shim) &&
+                   (m = m_pullup(m, sizeof(*shim))) == NULL)
+                       return (NULL);
+               stack[nstk] = *mtod(m, struct shim_hdr *);
+               m_adj(m, sizeof(*shim));
+               if (MPLS_BOS_ISSET(stack[nstk].shim_label))
+                       break;
+       }
+       if (nstk == MPLS_INKERNEL_LOOP_MAX) {
+               /*
+                * No bottom of stack within the supported depth.  Using
+                * nstk + 1 entries below would read past stack[].
+                */
+               m_freem(m);
+               return (NULL);
+       }
+       shim = &stack[0];
+
+       /* the leading mbuf may be empty after m_adj() */
+       if (m->m_len < 1 && (m = m_pullup(m, 1)) == NULL)
+               return (NULL);
+
+       switch (*mtod(m, u_char *) >> 4) {
+       case IPVERSION:
+               if (m->m_len < sizeof(*ip) &&
+                   (m = m_pullup(m, sizeof(*ip))) == NULL)
+                       return (NULL);
+               m = icmp_do_error(m, type, code, 0, destmtu);
+               if (m == NULL)
+                       return (NULL);
+
+               /* class ICMP_EXT_MPLS, C-Type 1: incoming label stack */
+               if (icmp_do_exthdr(m, ICMP_EXT_MPLS, 1, stack,
+                   (nstk + 1) * sizeof(*shim)))
+                       return (NULL);  /* m was freed by icmp_do_exthdr() */
+
+               /* set ip_src to something usable, based on the MPLS label */
+               bzero(&sa_mpls, sizeof(sa_mpls));
+               smpls = &sa_mpls;
+               smpls->smpls_family = AF_MPLS;
+               smpls->smpls_len = sizeof(*smpls);
+               smpls->smpls_label = shim->shim_label & MPLS_LABEL_MASK;
+
+               rt = rtalloc1(smplstosa(smpls), RT_REPORT, 0);
+               if (rt == NULL) {
+                       /* no entry for this label */
+                       m_freem(m);
+                       return (NULL);
+               }
+               if (rt->rt_ifa->ifa_addr->sa_family == AF_INET)
+                       ia = ifatoia(rt->rt_ifa);
+               else {
+                       /* XXX this needs fixing, if the MPLS is on an IP
+                        * less interface we need to find some other IP to
+                        * use as source.
+                        */
+                       RTFREE(rt);
+                       m_freem(m);
+                       return (NULL);
+               }
+               rt->rt_use++;
+               /* ia belongs to rt, release rt after use */
+               error = icmp_reflect(m, NULL, ia);
+               RTFREE(rt);
+               if (error)
+                       return (NULL);  /* m was freed by icmp_reflect() */
+
+               ip = mtod(m, struct ip *);
+               /* stuff to fix up which is normally done in ip_output */
+               ip->ip_v = IPVERSION;
+               ip->ip_id = htons(ip_randomid());
+               ip->ip_sum = 0;
+               ip->ip_sum = in_cksum(m, sizeof(*ip));
+
+               /* stolen from icmp_send() */
+               m->m_data += sizeof(*ip);
+               m->m_len -= sizeof(*ip);
+               icp = mtod(m, struct icmp *);
+               icp->icmp_cksum = 0;
+               icp->icmp_cksum = in_cksum(m, ntohs(ip->ip_len) - sizeof(*ip));
+               m->m_data -= sizeof(*ip);
+               m->m_len += sizeof(*ip);
+
+               break;
+       case IPV6_VERSION >> 4:
+       default:
+               /* only IPv4 payloads are handled so far */
+               m_freem(m);
+               return (NULL);
+       }
+
+       /* add mpls stack back to new packet */
+       M_PREPEND(m, (nstk + 1) * sizeof(*shim), M_NOWAIT);
+       if (m == NULL)
+               return (NULL);
+       m_copyback(m, 0, (nstk + 1) * sizeof(*shim), stack, M_NOWAIT);
+
+       /* change TTL to default */
+       shim = mtod(m, struct shim_hdr *);
+       shim->shim_label =
+           (shim->shim_label & ~MPLS_TTL_MASK) | htonl(mpls_defttl);
+
+       return (m);
 }

Reply via email to