On Tue, Nov 08, 2016 at 03:36:22PM +0100, Martin Pieuchot wrote:
> On 04/11/16(Fri) 10:45, Claudio Jeker wrote:
> > On Wed, Nov 02, 2016 at 05:44:14PM +0100, Martin Pieuchot wrote:
> > > [..] 
> > > Diff below should fix that by automagically creating a loopback
> > > interface when a new routing domain is created.  That mean loX will
> > > now correspond to routing domain X.
> > 
> > I like this a lot. I think having a loopback interface per rdomain is kind
> > of required but it was something I never finished.
> > If we could assign 127.0.0.1 and ::1 to those interfaces automatically
> > that would be even better (but that can be done as a second step).
> 
> Adding ::1 is trivial, it's a one line change.
> 
> To add 127.0.0.1 properly it's another story as currently netstart(8)
> sets it.

I would love to kill this part out of netstart(8). 127.0.0.1 should always
exists.
 
> > > Not that if you are currently using a lo2 and a rdomain 2 this diff
> > > will break your rdomain setup.
> > 
> > Should we split loopbacks into lo(4) and e.g. loop(4) one is for loopback
> > interfaces from userland and the other is only there for the 127.0.0.1 and
> > ::1 loopbacks? At least that we the conflict would be solved since we then
> > have different namespaces. Again this is something we can do on top of
> > this.
> 
> I'm not sure to understand the benefit.  What's the use case for loop(4)? 

2 name spaces, so that I don't have a conflict if I use lo1 for my
loopback IPs and then later on create rdomain 1.
 
> > > I'll write the manual page for rtable_loindex(9) if you're ok with
> > > the approach.
> > 
> > I think this approach is fine. The only thing I dislike is how you cram 
> > ifindex and rdomain id into the rtable 2 rdomain mapping. It feels dirty
> > to me to do it that way instead of using a little struct holding the two
> > values as independent data.
> 
> It is dirty but using a small struct is not possible without revamping the
> mp-safeness of rtable_get(9) which is, IMHO, not worth the effort. 
> 

Hmmm. That sucks. It makes the code hard to read and understand but I
understand the reason and so maybe I need to swallow this bitter pill.

> Here's an updated diff with man page, I'm asking for oks now that
> semarie confirmed the m_pullup(9) regression isn't part of my diff. 

A few things inline.
 
> Index: sys/kern/init_main.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/init_main.c,v
> retrieving revision 1.262
> diff -u -p -r1.262 init_main.c
> --- sys/kern/init_main.c      7 Nov 2016 00:26:32 -0000       1.262
> +++ sys/kern/init_main.c      8 Nov 2016 14:22:12 -0000
> @@ -389,6 +389,9 @@ main(void *framep)
>       msginit();
>  #endif
>  
> +     /* Create default routing table before attaching lo0. */
> +     rtable_init();
> +
>       /* Attach pseudo-devices. */
>       for (pdev = pdevinit; pdev->pdev_attach != NULL; pdev++)
>               if (pdev->pdev_count > 0)
> @@ -398,8 +401,6 @@ main(void *framep)
>       crypto_init();
>       swcr_init();
>  #endif /* CRYPTO */
> -
> -     rtable_init();
>  
>       /*
>        * Initialize protocols.  Block reception of incoming packets
> Index: sys/net/if.c
> ===================================================================
> RCS file: /cvs/src/sys/net/if.c,v
> retrieving revision 1.458
> diff -u -p -r1.458 if.c
> --- sys/net/if.c      8 Nov 2016 10:47:10 -0000       1.458
> +++ sys/net/if.c      8 Nov 2016 14:22:12 -0000
> @@ -258,7 +258,6 @@ struct srp_gc if_ifp_gc = SRP_GC_INITIAL
>  struct srp_gc if_map_gc = SRP_GC_INITIALIZER(if_map_dtor, NULL);
>  
>  struct ifnet_head ifnet = TAILQ_HEAD_INITIALIZER(ifnet);
> -unsigned int lo0ifidx;
>  
>  void
>  if_idxmap_init(unsigned int limit)
> @@ -1350,12 +1349,7 @@ p2p_rtrequest(struct ifnet *ifp, int req
>  
>               KASSERT(ifa == rt->rt_ifa);
>  
> -             /*
> -              * XXX Since lo0 is in the default rdomain we should not
> -              * (ab)use it for any route related to an interface of a
> -              * different rdomain.
> -              */
> -             lo0ifp = if_get(lo0ifidx);
> +             lo0ifp = if_get(rtable_loindex(ifp->if_rdomain));
>               KASSERT(lo0ifp != NULL);
>               TAILQ_FOREACH(lo0ifa, &lo0ifp->if_addrlist, ifa_list) {
>                       if (lo0ifa->ifa_addr->sa_family ==
> @@ -1438,7 +1432,7 @@ if_up(struct ifnet *ifp)
>  
>  #ifdef INET6
>       /* Userland expects the kernel to set ::1 on lo0. */
> -     if (ifp->if_index == lo0ifidx)
> +     if (ifp->if_index == rtable_loindex(0))
>               in6_ifattach(ifp);
>  #endif
>  
> @@ -1605,14 +1599,31 @@ if_setrdomain(struct ifnet *ifp, int rdo
>       if (rdomain < 0 || rdomain > RT_TABLEID_MAX)
>               return (EINVAL);
>  
> -     /* make sure that the routing table exists */
> +     /*
> +      * Create the routing table if it does not exist, including its
> +      * loopback interface with unit == rdomain.
> +      */
>       if (!rtable_exists(rdomain)) {
> +             struct ifnet *loifp;
> +             char loifname[IFNAMSIZ];
> +
> +             snprintf(loifname, sizeof(loifname), "lo%d", rdomain);
> +             if ((error = if_clone_create(loifname, 0)))
> +                     return (error);
> +
> +             if ((loifp = ifunit(loifname)) == NULL)
> +                     return (ENXIO);
> +
>               s = splsoftnet();
>               if ((error = rtable_add(rdomain)) == 0)
> -                     rtable_l2set(rdomain, rdomain);
> +                     rtable_l2set(rdomain, rdomain, loifp->if_index);
>               splx(s);
> -             if (error)
> +             if (error) {
> +                     if_clone_destroy(loifname);
>                       return (error);
> +             }
> +
> +             loifp->if_rdomain = rdomain;
>       }
>  
>       /* make sure that the routing table is a real rdomain */
> Index: sys/net/if_loop.c
> ===================================================================
> RCS file: /cvs/src/sys/net/if_loop.c,v
> retrieving revision 1.76
> diff -u -p -r1.76 if_loop.c
> --- sys/net/if_loop.c 13 Apr 2016 11:41:15 -0000      1.76
> +++ sys/net/if_loop.c 8 Nov 2016 14:22:12 -0000
> @@ -121,6 +121,7 @@
>  #include <net/if_var.h>
>  #include <net/if_types.h>
>  #include <net/netisr.h>
> +#include <net/rtable.h>
>  #include <net/route.h>
>  
>  #include <netinet/in.h>
> @@ -182,7 +183,7 @@ loop_clone_create(struct if_clone *ifc, 
>       if (unit == 0) {
>               if_attachhead(ifp);
>               if_addgroup(ifp, ifc->ifc_name);
> -             lo0ifidx = ifp->if_index;
> +             rtable_l2set(0, 0, ifp->if_index);
>       } else
>               if_attach(ifp);
>       if_alloc_sadl(ifp);
> @@ -195,7 +196,7 @@ loop_clone_create(struct if_clone *ifc, 
>  int
>  loop_clone_destroy(struct ifnet *ifp)
>  {
> -     if (ifp->if_index == lo0ifidx)
> +     if (ifp->if_index == rtable_loindex(ifp->if_rdomain))
>               return (EPERM);
>  
>       if_detach(ifp);
> Index: sys/net/if_var.h
> ===================================================================
> RCS file: /cvs/src/sys/net/if_var.h,v
> retrieving revision 1.76
> diff -u -p -r1.76 if_var.h
> --- sys/net/if_var.h  8 Nov 2016 10:46:05 -0000       1.76
> +++ sys/net/if_var.h  8 Nov 2016 14:22:12 -0000
> @@ -291,7 +291,6 @@ int               niq_enlist(struct niqueue *, struct
>      sysctl_mq((_n), (_l), (_op), (_olp), (_np), (_nl), &(_niq)->ni_q)
>  
>  extern struct ifnet_head ifnet;
> -extern unsigned int lo0ifidx;
>  extern struct taskq *softnettq;
>  
>  void if_start(struct ifnet *);
> Index: sys/net/route.c
> ===================================================================
> RCS file: /cvs/src/sys/net/route.c,v
> retrieving revision 1.334
> diff -u -p -r1.334 route.c
> --- sys/net/route.c   8 Nov 2016 10:39:32 -0000       1.334
> +++ sys/net/route.c   8 Nov 2016 14:22:12 -0000
> @@ -197,8 +197,6 @@ route_init(void)
>       while (rt_hashjitter == 0)
>               rt_hashjitter = arc4random();
>  
> -     if (rtable_add(0) != 0)
> -             panic("route_init rtable_add");
>  #ifdef BFD
>       bfdinit();
>  #endif
> Index: sys/net/rtable.c
> ===================================================================
> RCS file: /cvs/src/sys/net/rtable.c,v
> retrieving revision 1.52
> diff -u -p -r1.52 rtable.c
> --- sys/net/rtable.c  7 Sep 2016 09:36:49 -0000       1.52
> +++ sys/net/rtable.c  8 Nov 2016 14:22:12 -0000
> @@ -1,7 +1,7 @@
>  /*   $OpenBSD: rtable.c,v 1.52 2016/09/07 09:36:49 mpi Exp $ */
>  
>  /*
> - * Copyright (c) 2014-2015 Martin Pieuchot
> + * Copyright (c) 2014-2016 Martin Pieuchot
>   *
>   * Permission to use, copy, modify, and distribute this software for any
>   * purpose with or without fee is hereby granted, provided that the above
> @@ -41,7 +41,7 @@
>   *   afmap               rtmap/dommp
>   *   -----------          ---------     -----
>   *   |   0     |--------> | 0 | 0 | ... | 0 |        Array mapping rtableid 
> (=index)
> - *   -----------          ---------     -----   to rdomain (=value).
> + *   -----------          ---------     -----   to rdomain/loopback (=value).
>   *   | AF_INET |.
>   *   ----------- `.       .---------.     .---------.
>   *       ...    `----> | rtable0 | ... | rtableN |   Array of pointers for
> @@ -59,10 +59,20 @@ struct rtmap {
>       void             **tbl;
>  };
>  
> -/* Array of rtableid -> rdomain mapping. */
> +/*
> + * Array of rtableid -> rdomain mapping.
> + *
> + * Only used for the first index as describbed above.
> + */
>  struct dommp {
>       unsigned int       limit;
> -     unsigned int      *dom;
> +     /*
> +      * Array to get the routing domain and loopback interface related to
> +      * a routing table. Format:
> +      *
> +      * 8 unused bits | 16 bits for loopback index | 8 bits for rdomain
> +      */
> +     unsigned int      *value;
>  };
>  
>  unsigned int    rtmap_limit = 0;
> @@ -146,6 +156,8 @@ rtable_init(void)
>       unsigned int     keylen = 0;
>       int              i;
>  
> +     KASSERT(sizeof(struct rtmap) == sizeof(struct dommp));
> +
>       /* We use index 0 for the rtable/rdomain map. */
>       af2idx_max = 1;
>       memset(af2idx, 0, sizeof(af2idx));
> @@ -173,6 +185,9 @@ rtable_init(void)
>           M_WAITOK|M_ZERO);
>  
>       rtmap_init();
> +
> +     if (rtable_add(0) != 0)
> +             panic("unable to create default routing table");
>  }
>  
>  int
> @@ -221,7 +236,7 @@ rtable_add(unsigned int id)
>  
>       /* Use main rtable/rdomain by default. */
>       dmm = srp_get_locked(&afmap[0]);
> -     dmm->dom[id] = 0;
> +     dmm->value[id] = 0;
>  
>       return (0);
>  }
> @@ -272,24 +287,42 @@ rtable_l2(unsigned int rtableid)
>  
>       dmm = srp_enter(&sr, &afmap[0]);
>       if (rtableid < dmm->limit)
> -             rdomain = dmm->dom[rtableid];
> +             rdomain = (dmm->value[rtableid] & RT_TABLEID_MASK);
>       srp_leave(&sr);
>  
>       return (rdomain);
>  }
>  
> +unsigned int
> +rtable_loindex(unsigned int rtableid)
> +{
> +     struct dommp    *dmm;
> +     unsigned int     loifidx = 0;
> +     struct srp_ref   sr;
> +
> +     dmm = srp_enter(&sr, &afmap[0]);
> +     if (rtableid < dmm->limit)
> +             loifidx = (dmm->value[rtableid] >> RT_TABLEID_BITS);
> +     srp_leave(&sr);
> +
> +     return (loifidx);
> +}
> +
>  void
> -rtable_l2set(unsigned int rtableid, unsigned int rdomain)
> +rtable_l2set(unsigned int rtableid, unsigned int rdomain, unsigned int 
> loifidx)
>  {
>       struct dommp    *dmm;
> +     unsigned int     value;
>  
>       KERNEL_ASSERT_LOCKED();
>  
>       if (!rtable_exists(rtableid) || !rtable_exists(rdomain))
>               return;
>  
> +     value = (rdomain & RT_TABLEID_MASK) + (loifidx << RT_TABLEID_BITS);

I would use a | here instead of the +. Seems cleaner to me.

> +
>       dmm = srp_get_locked(&afmap[0]);
> -     dmm->dom[rtableid] = rdomain;
> +     dmm->value[rtableid] = value;
>  }
>  
>  #ifndef ART
> Index: sys/net/rtable.h
> ===================================================================
> RCS file: /cvs/src/sys/net/rtable.h,v
> retrieving revision 1.16
> diff -u -p -r1.16 rtable.h
> --- sys/net/rtable.h  7 Sep 2016 09:36:49 -0000       1.16
> +++ sys/net/rtable.h  8 Nov 2016 14:22:12 -0000
> @@ -54,7 +54,8 @@ void                 rtable_init(void);
>  int           rtable_exists(unsigned int);
>  int           rtable_add(unsigned int);
>  unsigned int  rtable_l2(unsigned int);
> -void          rtable_l2set(unsigned int, unsigned int);
> +unsigned int  rtable_loindex(unsigned int);
> +void          rtable_l2set(unsigned int, unsigned int, unsigned int);
>  
>  struct rtentry       *rtable_lookup(unsigned int, struct sockaddr *,
>                    struct sockaddr *, struct sockaddr *, uint8_t);
> Index: sys/netinet/ip_output.c
> ===================================================================
> RCS file: /cvs/src/sys/netinet/ip_output.c,v
> retrieving revision 1.327
> diff -u -p -r1.327 ip_output.c
> --- sys/netinet/ip_output.c   4 Sep 2016 17:18:56 -0000       1.327
> +++ sys/netinet/ip_output.c   8 Nov 2016 14:22:12 -0000
> @@ -211,7 +211,7 @@ reroute:
>  
>               ia = ifatoia(ro->ro_rt->rt_ifa);
>               if (ISSET(ro->ro_rt->rt_flags, RTF_LOCAL))
> -                     ifp = if_get(lo0ifidx);
> +                     ifp = if_get(rtable_loindex(m->m_pkthdr.ph_rtableid));
>               else
>                       ifp = if_get(ro->ro_rt->rt_ifidx);
>               if (ifp == NULL) {
> Index: sys/netinet6/ip6_input.c
> ===================================================================
> RCS file: /cvs/src/sys/netinet6/ip6_input.c,v
> retrieving revision 1.168
> diff -u -p -r1.168 ip6_input.c
> --- sys/netinet6/ip6_input.c  24 Aug 2016 09:41:12 -0000      1.168
> +++ sys/netinet6/ip6_input.c  8 Nov 2016 14:22:12 -0000
> @@ -211,7 +211,10 @@ ip6_input(struct mbuf *m)
>       } else {
>               if (m->m_next) {
>                       if (m->m_flags & M_LOOP) {
> -                             ip6stat.ip6s_m2m[lo0ifidx]++;   /*XXX*/
> +                             int ifidx;
> +
> +                             ifidx = rtable_loindex(m->m_pkthdr.ph_rtableid);
> +                             ip6stat.ip6s_m2m[ifidx]++;

This is not save because ip6s_m2m has only 32 elements. So it will
overflow for rdomains > 32 (or in some other cases).
A possible option is:
                if (m->m_next) {
                        int ifidx = ifp->if_index;
                        if (m->m_flags & M_LOOP)
                                ifidx = rtable_loindex(m->m_pkthdr.ph_rtableid);
                        if (ifidx < nitems(ip6stat.ip6s_m2m))
                                ip6stat.ip6s_m2m[ifidx]++;
                        else
                                ip6stat.ip6s_m2m[0]++;
                } else
                        ip6stat.ip6s_m1++;

But not sure if we should just drop the per interface list (probably a
different diff).

>                       } else if (ifp->if_index < nitems(ip6stat.ip6s_m2m))
>                               ip6stat.ip6s_m2m[ifp->if_index]++;
>                       else
> Index: sys/netinet6/ip6_output.c
> ===================================================================
> RCS file: /cvs/src/sys/netinet6/ip6_output.c,v
> retrieving revision 1.216
> diff -u -p -r1.216 ip6_output.c
> --- sys/netinet6/ip6_output.c 19 Sep 2016 18:09:09 -0000      1.216
> +++ sys/netinet6/ip6_output.c 8 Nov 2016 14:22:12 -0000
> @@ -460,7 +460,7 @@ reroute:
>                       goto bad;
>               }
>               if (ISSET(rt->rt_flags, RTF_LOCAL))
> -                     ifp = if_get(lo0ifidx);
> +                     ifp = if_get(rtable_loindex(m->m_pkthdr.ph_rtableid));
>               else
>                       ifp = if_get(rt->rt_ifidx);
>       } else {
> Index: sys/sys/socket.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/socket.h,v
> retrieving revision 1.92
> diff -u -p -r1.92 socket.h
> --- sys/sys/socket.h  28 Sep 2016 18:50:20 -0000      1.92
> +++ sys/sys/socket.h  8 Nov 2016 14:22:12 -0000
> @@ -143,7 +143,9 @@ struct    splice {
>  /*
>   * Maximum number of alternate routing tables
>   */
> -#define      RT_TABLEID_MAX  255
> +#define      RT_TABLEID_MAX          255
> +#define      RT_TABLEID_BITS         8
> +#define      RT_TABLEID_MASK         0xff
>  
>  #endif /* __BSD_VISIBLE */
>  
> Index: share/man/man9/rtable_add.9
> ===================================================================
> RCS file: /cvs/src/share/man/man9/rtable_add.9,v
> retrieving revision 1.7
> diff -u -p -r1.7 rtable_add.9
> --- share/man/man9/rtable_add.9       11 Dec 2015 16:08:30 -0000      1.7
> +++ share/man/man9/rtable_add.9       8 Nov 2016 14:28:25 -0000
> @@ -21,6 +21,7 @@
>  .Sh NAME
>  .Nm rtable_add ,
>  .Nm rtable_exists ,
> +.Nm rtable_loindex ,
>  .Nm rtable_l2 ,
>  .Nm rtable_l2set
>  .Nd routing tables and routing domains interface
> @@ -31,6 +32,8 @@
>  .Ft int
>  .Fn rtable_exists "unsigned int id"
>  .Ft unsigned int
> +.Fn rtable_loindex "unsigned int id"
> +.Ft unsigned int
>  .Fn rtable_l2 "unsigned int id"
>  .Ft void
>  .Fn rtable_l2set "unsigned int id" "unsigned int rdomain"
> @@ -53,6 +56,11 @@ if routing table with ID
>  exists,
>  .Fa 0
>  otherwise.
> +.It Fn rtable_loindex "unsigned int id"
> +Return the default
> +.Xr lo 4
> +interface index for the routing table with ID of
> +.Fa id .
>  .It Fn rtable_l2 "unsigned int id"
>  Get the routing domain of routing table with ID of
>  .Fa id .
> @@ -65,6 +73,7 @@ under the routing domain with ID of
>  .Sh CONTEXT
>  .Fn rtable_add ,
>  .Fn rtable_exists ,
> +.Fn rtable_loindex ,
>  .Fn rtable_l2 ,
>  and
>  .Fn task_l2set
> @@ -82,5 +91,6 @@ already exists.
>  Memory could not be allocated to extend the list of routing domains.
>  .El
>  .Sh SEE ALSO
> +.Xr lo 4 ,
>  .Xr route 4 ,
>  .Xr route 8
> 

-- 
:wq Claudio

Reply via email to