We introduce ecmp support for the babel protocol by extending its
definition of a route being selected to mean the route being in the ECMP
set.

In order to keep code changes minimal we keep the pointer to an arbitrary
member of the ECMP set in the FIB entry and add a new flag to babel_route
which indicates which routes were actually announced to the core.

Since keeping this flag up to date at all times is a hassle we take a lazy
approach and simply check the metric of the selected route against the
route in question whenever we want to know for sure if a route is in the
ECMP set.
---
 doc/bird.sgml        |  32 ++++++++++++++
 proto/babel/babel.c  | 103 +++++++++++++++++++++++++++++++++++--------
 proto/babel/babel.h  |  16 +++++++
 proto/babel/config.Y |   5 +++
 4 files changed, 138 insertions(+), 18 deletions(-)

Changes in v3:
 - Squash with ecmp weight patch
 - Add babel_route_is_selected() as replacement for e->selected
     I'm not totally sure the lazy approach is safe yet. We might need
     additional bookkeeping to reset r->active_nexthop on route
     retraction/flush instead.

diff --git a/doc/bird.sgml b/doc/bird.sgml
index 1580facd..8d159b22 100644
--- a/doc/bird.sgml
+++ b/doc/bird.sgml
@@ -1865,6 +1865,7 @@ protocol babel [<name>] {
        ipv4 { <channel config> };
        ipv6 [sadr] { <channel config> };
         randomize router id <switch>;
+       ecmp <switch> [limit <num>];
        interface <interface pattern> {
                type <wired|wireless>;
                rxcost <number>;
@@ -1879,6 +1880,7 @@ protocol babel [<name>] {
                check link <switch>;
                next hop ipv4 <address>;
                next hop ipv6 <address>;
+               ecmp weight <num>;
                authentication none|mac [permissive];
                password "&lt;text&gt;";
                password "&lt;text&gt;" {
@@ -1909,6 +1911,18 @@ protocol babel [<name>] {
       router ID every time it starts up, which avoids this problem at the cost
       of not having stable router IDs in the network. Default: no.
 
+      <tag><label id="babel-ecmp">ecmp <m>switch</m> [limit 
<m>number</m>]</tag>
+
+      Determines whether babel will emit ECMP (equal-cost multipath)
+      routes, allowing traffic to be load-balanced across multiple paths. If
+      enabled the maximum number of next-hops to allow can be specified,
+      defaulting to 16.
+
+      When neighbors are using a dynamic link-quality metric this is
+      unlikely to be useful. For best results <ref id="babel-type"
+      name="type wired"> should be used throughout the network to get what
+      amounts to a hop count metric.
+
       <tag><label id="babel-type">type wired|wireless </tag>
       This option specifies the interface type: Wired or wireless. On wired
       interfaces a neighbor is considered unreachable after a small number of
@@ -1928,6 +1942,18 @@ protocol babel [<name>] {
       selection and not local route selection. Default: 96 for wired 
interfaces,
       256 for wireless.
 
+      <tag><label id="babel-ecmp">ecmp <m>switch</m> [limit 
<m>number</m>]</tag>
+
+      Determines whether babel will emit ECMP (equal-cost multipath)
+      routes, allowing traffic to be load-balanced across multiple paths. If
+      enabled the maximum number of next-hops to allow can be specified,
+      defaulting to 16.
+
+      When neighbors are using a dynamic link-quality metric this is
+      unlikely to be useful. For best results <ref id="babel-type"
+      name="type wired"> should be used throughout the network to get what
+      amounts to a hop count metric.
+
       <tag><label id="babel-limit">limit <m/num/</tag>
       BIRD keeps track of received Hello messages from each neighbor to
       establish neighbor reachability. For wired type interfaces, this option
@@ -1983,6 +2009,12 @@ protocol babel [<name>] {
       source for Babel packets will be used. In normal operation, it should not
       be necessary to set this option.
 
+      <tag><label id="babel-ecmp-weight">ecmp weight <m>number</m></tag>
+      This specifies the relative weight used for nexthops going through
+      the iface when ECMP is enabled. Larger weight values relative to other
+      nexthops attract more traffic. Valid values are 1-256. Default value
+      is 1.
+
       <tag><label id="babel-authentication">authentication none|mac 
[permissive]</tag>
       Selects authentication method to be used. <cf/none/ means that packets
       are not authenticated at all, <cf/mac/ means MAC authentication is
diff --git a/proto/babel/babel.c b/proto/babel/babel.c
index 4a7d550f..cd5e7a20 100644
--- a/proto/babel/babel.c
+++ b/proto/babel/babel.c
@@ -164,12 +164,23 @@ babel_get_route(struct babel_proto *p, struct babel_entry 
*e, struct babel_neigh
   return r;
 }
 
+/* Check if a route is currently active as part of any RTE nexthop.
+ */
+static inline u8
+babel_route_is_selected(struct babel_route *r)
+{
+  if (!r->e->selected || r->e->selected->metric != r->metric)
+    r->active_nexthop = 0;
+
+  return r->active_nexthop;
+}
+
 static inline void
 babel_retract_route(struct babel_proto *p, struct babel_route *r)
 {
   r->metric = r->advert_metric = BABEL_INFINITY;
 
-  if (r == r->e->selected)
+  if (babel_route_is_selected(r))
     babel_select_route(p, r->e, r);
 }
 
@@ -210,7 +221,7 @@ babel_expire_route(struct babel_proto *p, struct 
babel_route *r)
 static void
 babel_refresh_route(struct babel_proto *p, struct babel_route *r)
 {
-  if (r == r->e->selected)
+  if (babel_route_is_selected(r))
     babel_send_route_request(p, r->e, r->neigh);
 
   r->refresh_time = 0;
@@ -238,7 +249,7 @@ loop:
 
       if (r->expires && r->expires <= now_)
       {
-       changed = changed || (r == e->selected);
+       changed = changed || babel_route_is_selected(r);
        babel_expire_route(p, r);
       }
     }
@@ -624,7 +635,38 @@ done:
 }
 
 /**
- * babel_announce_rte - announce selected route to the core
+ * babel_nexthop_insert - add next_hop of route to nexthop list
+ * @p: Babel protocol instance
+ * @r: Babel route
+ * @nhs: nexthop list head to append onto
+ * @nh: freshly allocated buffer to fill
+ */
+static void
+babel_nexthop_insert(
+  struct babel_proto *p,
+  struct babel_route *r,
+  struct nexthop **nhs,
+  struct nexthop *nh)
+{
+  r->active_nexthop = 1;
+
+  nh->gw = r->next_hop;
+  nh->iface = r->neigh->ifa->iface;
+  nh->weight = r->neigh->ifa->cf->ecmp_weight;
+
+  /*
+   * If we cannot find a reachable neighbour, set the entry to be onlink. This
+   * makes it possible to, e.g., assign /32 addresses on a mesh interface and
+   * have routing work.
+   */
+  if (!neigh_find(&p->p, r->next_hop, r->neigh->ifa->iface, 0))
+    nh->flags = RNF_ONLINK;
+
+  nexthop_insert(nhs, nh);
+}
+
+/**
+ * babel_announce_rte - announce selected routes to the core
  * @p: Babel protocol instance
  * @e: Babel route entry to announce
  *
@@ -635,6 +677,7 @@ done:
 static void
 babel_announce_rte(struct babel_proto *p, struct babel_entry *e)
 {
+  struct babel_config *cf = (void *) p->p.cf;
   struct babel_route *r = e->selected;
   struct channel *c = (e->n.addr->type == NET_IP4) ? p->ip4_channel : 
p->ip6_channel;
 
@@ -645,18 +688,24 @@ babel_announce_rte(struct babel_proto *p, struct 
babel_entry *e)
       .source = RTS_BABEL,
       .scope = SCOPE_UNIVERSE,
       .dest = RTD_UNICAST,
-      .from = r->neigh->addr,
-      .nh.gw = r->next_hop,
-      .nh.iface = r->neigh->ifa->iface,
     };
 
-    /*
-     * If we cannot find a reachable neighbour, set the entry to be onlink. 
This
-     * makes it possible to, e.g., assign /32 addresses on a mesh interface and
-     * have routing work.
-     */
-    if (!neigh_find(&p->p, r->next_hop, r->neigh->ifa->iface, 0))
-      a0.nh.flags = RNF_ONLINK;
+    struct nexthop *nhs = NULL;
+    babel_nexthop_insert(p, r, &nhs, allocz(sizeof(struct nexthop)));
+    int num_nexthops = 1;
+
+    struct babel_route *cr;
+    WALK_LIST(cr, e->routes) {
+      if (cr == r || !cr->feasible || cr->metric != r->metric)
+       continue;
+
+      if (num_nexthops++ >= cf->max_nexthops)
+       break;
+
+      babel_nexthop_insert(p, cr, &nhs, allocz(sizeof(struct nexthop)));
+    }
+
+    a0.nh = *nhs;
 
     rta *a = rta_lookup(&a0);
     rte *rte = rte_get_temp(a);
@@ -736,6 +785,7 @@ babel_announce_retraction(struct babel_proto *p, struct 
babel_entry *e)
 static void
 babel_select_route(struct babel_proto *p, struct babel_entry *e, struct 
babel_route *mod)
 {
+  struct babel_config *cf = (void *) p->p.cf;
   struct babel_route *r, *best = e->selected;
 
   /* Shortcut if only non-best was modified */
@@ -744,8 +794,10 @@ babel_select_route(struct babel_proto *p, struct 
babel_entry *e, struct babel_ro
     /* Either select modified route, or keep old best route */
     if ((mod->metric < (best ? best->metric : BABEL_INFINITY)) && 
mod->feasible)
       best = mod;
-    else
+    else if (cf->max_nexthops == 1)
       return;
+    /* With ecmp one of the non-selected but equal metric routes might have
+     * changed so continue on with the announcement in that case. */
   }
   else
   {
@@ -754,9 +806,10 @@ babel_select_route(struct babel_proto *p, struct 
babel_entry *e, struct babel_ro
       best = NULL;
 
     /* Find the best feasible route from all routes */
-    WALK_LIST(r, e->routes)
+    WALK_LIST(r, e->routes) {
       if ((r->metric < (best ? best->metric : BABEL_INFINITY)) && r->feasible)
        best = r;
+    }
   }
 
   if (best)
@@ -1956,7 +2009,7 @@ babel_dump_entry(struct babel_entry *e)
   WALK_LIST(r,e->routes)
   {
     debug(" ");
-    if (r == e->selected) debug("*");
+    if (babel_route_is_selected(r)) debug("*");
     babel_dump_route(r);
   }
 }
@@ -2170,7 +2223,7 @@ babel_show_routes_(struct babel_proto *p, struct fib 
*rtable)
     struct babel_route *r;
     WALK_LIST(r, e->routes)
     {
-      char c = (r == e->selected) ? '*' : (r->feasible ? '+' : ' ');
+      char c = (babel_route_is_selected(r)) ? '*' : (r->feasible ? '+' : ' ');
       btime time = r->expires ? r->expires - current_time() : 0;
       cli_msg(-1025, "%-*N %-25I %-10s %5u %c %5u %7t", width,
              e->n.addr, r->next_hop, r->neigh->ifa->ifname,
@@ -2441,6 +2494,16 @@ babel_shutdown(struct proto *P)
   return PS_DOWN;
 }
 
+static void
+babel_reconfigure_routes(struct babel_proto *p, struct fib *rtable)
+{
+  struct fib_iterator fit;
+  FIB_ITERATE_INIT(&fit, rtable);
+  FIB_ITERATE_START(rtable, &fit, struct babel_entry, e)
+    babel_announce_rte(p, e);
+  FIB_ITERATE_END;
+}
+
 static int
 babel_reconfigure(struct proto *P, struct proto_config *CF)
 {
@@ -2460,6 +2523,10 @@ babel_reconfigure(struct proto *P, struct proto_config 
*CF)
   p->p.cf = CF;
   babel_reconfigure_ifaces(p, new);
 
+  /* Update all routes to refresh ecmp settings. */
+  babel_reconfigure_routes(p, &p->ip6_rtable);
+  babel_reconfigure_routes(p, &p->ip4_rtable);
+
   babel_trigger_update(p);
   babel_kick_timer(p);
 
diff --git a/proto/babel/babel.h b/proto/babel/babel.h
index 84feb085..153ef78c 100644
--- a/proto/babel/babel.h
+++ b/proto/babel/babel.h
@@ -62,6 +62,8 @@
 #define BABEL_OVERHEAD         (IP6_HEADER_LENGTH+UDP_HEADER_LENGTH)
 #define BABEL_MIN_MTU          (512 + BABEL_OVERHEAD)
 
+#define BABEL_DEFAULT_ECMP_LIMIT       16
+
 #define BABEL_AUTH_NONE                        0
 #define BABEL_AUTH_MAC                 1
 
@@ -120,6 +122,9 @@ struct babel_config {
   list iface_list;                     /* List of iface configs (struct 
babel_iface_config) */
   uint hold_time;                      /* Time to hold stale entries and 
unreachable routes */
   u8 randomize_router_id;
+  u8 max_nexthops;                      /* Maximum number of nexthops to
+                                          install.  Defaults to 1. It's >1 if
+                                          ECMP is enabled. */
 
   struct channel_config *ip4_channel;
   struct channel_config *ip6_channel;
@@ -142,6 +147,8 @@ struct babel_iface_config {
   int tx_tos;
   int tx_priority;
 
+  u8 ecmp_weight;
+
   ip_addr next_hop_ip4;
   ip_addr next_hop_ip6;
 
@@ -258,6 +265,11 @@ struct babel_route {
   struct babel_neighbor *neigh;
 
   u8 feasible;
+  u8 active_nexthop;
+ /* If true route could be in the ecmp set (i.e. it's in use as a nexthop), if
+  * false definitely not. Use babel_route_is_selected() to get a boolean
+  * answer. */
+
   u16 seqno;
   u16 metric;
   u16 advert_metric;
@@ -280,6 +292,10 @@ struct babel_seqno_request {
 
 struct babel_entry {
   struct babel_route *selected;
+  /* Route currently being announced to neighbours. When ECMP is enabled this is
+   * but one of the full set of routes in use for nexthop purposes. To check if
+   * a particular route is part of the ECMP set use babel_route_is_selected().
+   */
 
   list routes;                         /* Routes for this prefix (struct 
babel_route) */
   list sources;                                /* Source entries for this 
prefix (struct babel_source). */
diff --git a/proto/babel/config.Y b/proto/babel/config.Y
index 05210fa4..0da5025d 100644
--- a/proto/babel/config.Y
+++ b/proto/babel/config.Y
@@ -36,6 +36,7 @@ babel_proto_start: proto_start BABEL
   this_proto = proto_config_new(&proto_babel, $1);
   init_list(&BABEL_CFG->iface_list);
   BABEL_CFG->hold_time = 1 S_;
+  BABEL_CFG->max_nexthops = 1;
 };
 
 babel_proto_item:
@@ -43,6 +44,8 @@ babel_proto_item:
  | proto_channel
  | INTERFACE babel_iface
  | RANDOMIZE ROUTER ID bool { BABEL_CFG->randomize_router_id = $4; }
+ | ECMP bool { BABEL_CFG->max_nexthops = $2 ? BABEL_DEFAULT_ECMP_LIMIT : 1; }
+ | ECMP bool LIMIT expr { BABEL_CFG->max_nexthops = $2 ? $4 : 1; }
  ;
 
 babel_proto_opts:
@@ -67,6 +70,7 @@ babel_iface_start:
   BABEL_IFACE->tx_tos = IP_PREC_INTERNET_CONTROL;
   BABEL_IFACE->tx_priority = sk_priority_control;
   BABEL_IFACE->check_link = 1;
+  BABEL_IFACE->ecmp_weight = 0;
 };
 
 
@@ -146,6 +150,7 @@ babel_iface_item:
  | AUTHENTICATION NONE { BABEL_IFACE->auth_type = BABEL_AUTH_NONE; }
  | AUTHENTICATION MAC { BABEL_IFACE->auth_type = BABEL_AUTH_MAC; 
BABEL_IFACE->auth_permissive = 0; }
  | AUTHENTICATION MAC PERMISSIVE { BABEL_IFACE->auth_type = BABEL_AUTH_MAC; 
BABEL_IFACE->auth_permissive = 1; }
+ | ECMP WEIGHT expr { BABEL_IFACE->ecmp_weight = $3 - 1; if (($3<1) || 
($3>256)) cf_error("ECMP weight must be in range 1-256"); }
  | password_list
  ;
 
-- 
2.30.2

Reply via email to