We introduce ECMP support for the Babel protocol by extending its definition of a route being selected to mean the route being in the ECMP set.
In order to keep code changes minimal, we keep the pointer to an arbitrary member of the ECMP set in the FIB entry and add a new flag to babel_route which indicates which routes were actually announced to the core. Since keeping this flag up to date at all times is a hassle, we take a lazy approach and simply check the metric of the selected route against the route in question whenever we want to know for sure if a route is in the ECMP set. --- doc/bird.sgml | 32 ++++++++++++++ proto/babel/babel.c | 103 +++++++++++++++++++++++++++++++++++-------- proto/babel/babel.h | 16 +++++++ proto/babel/config.Y | 5 +++ 4 files changed, 138 insertions(+), 18 deletions(-) Changes in v3: - Squash with ecmp weight patch - Add babel_route_is_selected() as replacement for e->selected I'm not totally sure the lazy approach is safe yet. We might need additional bookkeeping to reset r->active_nexthop on route retraction/flush instead. diff --git a/doc/bird.sgml b/doc/bird.sgml index 1580facd..8d159b22 100644 --- a/doc/bird.sgml +++ b/doc/bird.sgml @@ -1865,6 +1865,7 @@ protocol babel [<name>] { ipv4 { <channel config> }; ipv6 [sadr] { <channel config> }; randomize router id <switch>; + ecmp <switch> [limit <num>]; interface <interface pattern> { type <wired|wireless>; rxcost <number>; @@ -1879,6 +1880,7 @@ protocol babel [<name>] { check link <switch>; next hop ipv4 <address>; next hop ipv6 <address>; + ecmp weight <num>; authentication none|mac [permissive]; password "<text>"; password "<text>" { @@ -1909,6 +1911,18 @@ protocol babel [<name>] { router ID every time it starts up, which avoids this problem at the cost of not having stable router IDs in the network. Default: no. + <tag><label id="babel-ecmp">ecmp <m>switch</m> [limit <m>number</m>]</tag> + + Determines whether babel will emit ECMP (equal-cost multipath) + routes, allowing load-balancing of traffic across multiple paths. If + enabled the maximum number of next-hops to allow can be specified, + defaulting to 16. 
+ + When neighbors are using a dynamic link-quality metric this is + unlikely to be useful. For best results <ref id="babel-type" + name="type wired"> should be used throughout the network to get what + amounts to a hop count metric. + <tag><label id="babel-type">type wired|wireless </tag> This option specifies the interface type: Wired or wireless. On wired interfaces a neighbor is considered unreachable after a small number of @@ -1928,6 +1942,18 @@ protocol babel [<name>] { selection and not local route selection. Default: 96 for wired interfaces, 256 for wireless. + <tag><label id="babel-ecmp">ecmp <m>switch</m> [limit <m>number</m>]</tag> + + Determines whether babel will emit ECMP (equal-cost multipath) + routes, allowing load-balancing of traffic across multiple paths. If + enabled the maximum number of next-hops to allow can be specified, + defaulting to 16. + + When neighbors are using a dynamic link-quality metric this is + unlikely to be useful. For best results <ref id="babel-type" + name="type wired"> should be used throughout the network to get what + amounts to a hop count metric. + <tag><label id="babel-limit">limit <m/num/</tag> BIRD keeps track of received Hello messages from each neighbor to establish neighbor reachability. For wired type interfaces, this option @@ -1983,6 +2009,12 @@ protocol babel [<name>] { source for Babel packets will be used. In normal operation, it should not be necessary to set this option. + <tag><label id="babel-ecmp-weight">ecmp weight <m>number</m></tag> + This specifies the relative weight used for nexthops going through + the iface when ECMP is enabled. Larger weight values relative to other + nexthops attract more traffic. Valid values are 1-256. Default value + is 1. + <tag><label id="babel-authentication">authentication none|mac [permissive]</tag> Selects authentication method to be used. 
<cf/none/ means that packets are not authenticated at all, <cf/mac/ means MAC authentication is diff --git a/proto/babel/babel.c b/proto/babel/babel.c index 4a7d550f..cd5e7a20 100644 --- a/proto/babel/babel.c +++ b/proto/babel/babel.c @@ -164,12 +164,23 @@ babel_get_route(struct babel_proto *p, struct babel_entry *e, struct babel_neigh return r; } +/* Check if a route is currently active as part of any RTE nexthop. + */ +static inline u8 +babel_route_is_selected(struct babel_route *r) +{ + if (!r->e->selected || r->e->selected->metric != r->metric) + r->active_nexthop = 0; + + return r->active_nexthop; +} + static inline void babel_retract_route(struct babel_proto *p, struct babel_route *r) { r->metric = r->advert_metric = BABEL_INFINITY; - if (r == r->e->selected) + if (babel_route_is_selected(r)) babel_select_route(p, r->e, r); } @@ -210,7 +221,7 @@ babel_expire_route(struct babel_proto *p, struct babel_route *r) static void babel_refresh_route(struct babel_proto *p, struct babel_route *r) { - if (r == r->e->selected) + if (babel_route_is_selected(r)) babel_send_route_request(p, r->e, r->neigh); r->refresh_time = 0; @@ -238,7 +249,7 @@ loop: if (r->expires && r->expires <= now_) { - changed = changed || (r == e->selected); + changed = changed || babel_route_is_selected(r); babel_expire_route(p, r); } } @@ -624,7 +635,38 @@ done: } /** - * babel_announce_rte - announce selected route to the core + * babel_nexthop_insert - add next_hop of route to nexthop list + * @p: Babel protocol instance + * @r: Babel route + * @nhs: nexthop list head to append onto + * @nh: freshly allocated buffer to fill + */ +static void +babel_nexthop_insert( + struct babel_proto *p, + struct babel_route *r, + struct nexthop **nhs, + struct nexthop *nh) +{ + r->active_nexthop = 1; + + nh->gw = r->next_hop; + nh->iface = r->neigh->ifa->iface; + nh->weight = r->neigh->ifa->cf->ecmp_weight; + + /* + * If we cannot find a reachable neighbour, set the entry to be onlink. 
This + * makes it possible to, e.g., assign /32 addresses on a mesh interface and + * have routing work. + */ + if (!neigh_find(&p->p, r->next_hop, r->neigh->ifa->iface, 0)) + nh->flags = RNF_ONLINK; + + nexthop_insert(nhs, nh); +} + +/** + * babel_announce_rte - announce selected routes to the core * @p: Babel protocol instance * @e: Babel route entry to announce * @@ -635,6 +677,7 @@ done: static void babel_announce_rte(struct babel_proto *p, struct babel_entry *e) { + struct babel_config *cf = (void *) p->p.cf; struct babel_route *r = e->selected; struct channel *c = (e->n.addr->type == NET_IP4) ? p->ip4_channel : p->ip6_channel; @@ -645,18 +688,24 @@ babel_announce_rte(struct babel_proto *p, struct babel_entry *e) .source = RTS_BABEL, .scope = SCOPE_UNIVERSE, .dest = RTD_UNICAST, - .from = r->neigh->addr, - .nh.gw = r->next_hop, - .nh.iface = r->neigh->ifa->iface, }; - /* - * If we cannot find a reachable neighbour, set the entry to be onlink. This - * makes it possible to, e.g., assign /32 addresses on a mesh interface and - * have routing work. 
- */ - if (!neigh_find(&p->p, r->next_hop, r->neigh->ifa->iface, 0)) - a0.nh.flags = RNF_ONLINK; + struct nexthop *nhs = NULL; + babel_nexthop_insert(p, r, &nhs, allocz(sizeof(struct nexthop))); + int num_nexthops = 1; + + struct babel_route *cr; + WALK_LIST(cr, e->routes) { + if (cr == r || !cr->feasible || cr->metric != r->metric) + continue; + + if (num_nexthops++ >= cf->max_nexthops) + break; + + babel_nexthop_insert(p, cr, &nhs, allocz(sizeof(struct nexthop))); + } + + a0.nh = *nhs; rta *a = rta_lookup(&a0); rte *rte = rte_get_temp(a); @@ -736,6 +785,7 @@ babel_announce_retraction(struct babel_proto *p, struct babel_entry *e) static void babel_select_route(struct babel_proto *p, struct babel_entry *e, struct babel_route *mod) { + struct babel_config *cf = (void *) p->p.cf; struct babel_route *r, *best = e->selected; /* Shortcut if only non-best was modified */ @@ -744,8 +794,10 @@ babel_select_route(struct babel_proto *p, struct babel_entry *e, struct babel_ro /* Either select modified route, or keep old best route */ if ((mod->metric < (best ? best->metric : BABEL_INFINITY)) && mod->feasible) best = mod; - else + else if (cf->max_nexthops == 1) return; + /* With ecmp one of the non-selected but equal metric routes might have + * changed so contine on with the announcement in that case. */ } else { @@ -754,9 +806,10 @@ babel_select_route(struct babel_proto *p, struct babel_entry *e, struct babel_ro best = NULL; /* Find the best feasible route from all routes */ - WALK_LIST(r, e->routes) + WALK_LIST(r, e->routes) { if ((r->metric < (best ? 
best->metric : BABEL_INFINITY)) && r->feasible) best = r; + } } if (best) @@ -1956,7 +2009,7 @@ babel_dump_entry(struct babel_entry *e) WALK_LIST(r,e->routes) { debug(" "); - if (r == e->selected) debug("*"); + if (babel_route_is_selected(r)) debug("*"); babel_dump_route(r); } } @@ -2170,7 +2223,7 @@ babel_show_routes_(struct babel_proto *p, struct fib *rtable) struct babel_route *r; WALK_LIST(r, e->routes) { - char c = (r == e->selected) ? '*' : (r->feasible ? '+' : ' '); + char c = (babel_route_is_selected(r)) ? '*' : (r->feasible ? '+' : ' '); btime time = r->expires ? r->expires - current_time() : 0; cli_msg(-1025, "%-*N %-25I %-10s %5u %c %5u %7t", width, e->n.addr, r->next_hop, r->neigh->ifa->ifname, @@ -2441,6 +2494,16 @@ babel_shutdown(struct proto *P) return PS_DOWN; } +static void +babel_reconfigure_routes(struct babel_proto *p, struct fib *rtable) +{ + struct fib_iterator fit; + FIB_ITERATE_INIT(&fit, rtable); + FIB_ITERATE_START(rtable, &fit, struct babel_entry, e) + babel_announce_rte(p, e); + FIB_ITERATE_END; +} + static int babel_reconfigure(struct proto *P, struct proto_config *CF) { @@ -2460,6 +2523,10 @@ babel_reconfigure(struct proto *P, struct proto_config *CF) p->p.cf = CF; babel_reconfigure_ifaces(p, new); + /* Update all routes to refresh ecmp settings. 
*/ + babel_reconfigure_routes(p, &p->ip6_rtable); + babel_reconfigure_routes(p, &p->ip4_rtable); + babel_trigger_update(p); babel_kick_timer(p); diff --git a/proto/babel/babel.h b/proto/babel/babel.h index 84feb085..153ef78c 100644 --- a/proto/babel/babel.h +++ b/proto/babel/babel.h @@ -62,6 +62,8 @@ #define BABEL_OVERHEAD (IP6_HEADER_LENGTH+UDP_HEADER_LENGTH) #define BABEL_MIN_MTU (512 + BABEL_OVERHEAD) +#define BABEL_DEFAULT_ECMP_LIMIT 16 + #define BABEL_AUTH_NONE 0 #define BABEL_AUTH_MAC 1 @@ -120,6 +122,9 @@ struct babel_config { list iface_list; /* List of iface configs (struct babel_iface_config) */ uint hold_time; /* Time to hold stale entries and unreachable routes */ u8 randomize_router_id; + u8 max_nexthops; /* Maximum number of nexthops to + install. Defaults to 1. It's >1 if + ECMP is enabled. */ struct channel_config *ip4_channel; struct channel_config *ip6_channel; @@ -142,6 +147,8 @@ struct babel_iface_config { int tx_tos; int tx_priority; + u8 ecmp_weight; + ip_addr next_hop_ip4; ip_addr next_hop_ip6; @@ -258,6 +265,11 @@ struct babel_route { struct babel_neighbor *neigh; u8 feasible; + u8 active_nexthop; + /* If true route could be in the ecmp set (i.e. it's in use as a nexthop), if + * false definetly not. Use babel_route_is_selected() to get a boolean + * answer. */ + u16 seqno; u16 metric; u16 advert_metric; @@ -280,6 +292,10 @@ struct babel_seqno_request { struct babel_entry { struct babel_route *selected; + /* Route currently being announced to neigbours. When ECMP is enabled this is + * but one of the full set of routes in use for nexthop purposes. To check if + * a particular route is part of the ECMP set use babel_route_is_selected(). + */ list routes; /* Routes for this prefix (struct babel_route) */ list sources; /* Source entries for this prefix (struct babel_source). 
*/ diff --git a/proto/babel/config.Y b/proto/babel/config.Y index 05210fa4..0da5025d 100644 --- a/proto/babel/config.Y +++ b/proto/babel/config.Y @@ -36,6 +36,7 @@ babel_proto_start: proto_start BABEL this_proto = proto_config_new(&proto_babel, $1); init_list(&BABEL_CFG->iface_list); BABEL_CFG->hold_time = 1 S_; + BABEL_CFG->max_nexthops = 1; }; babel_proto_item: @@ -43,6 +44,8 @@ babel_proto_item: | proto_channel | INTERFACE babel_iface | RANDOMIZE ROUTER ID bool { BABEL_CFG->randomize_router_id = $4; } + | ECMP bool { BABEL_CFG->max_nexthops = $2 ? BABEL_DEFAULT_ECMP_LIMIT : 1; } + | ECMP bool LIMIT expr { BABEL_CFG->max_nexthops = $2 ? $4 : 1; } ; babel_proto_opts: @@ -67,6 +70,7 @@ babel_iface_start: BABEL_IFACE->tx_tos = IP_PREC_INTERNET_CONTROL; BABEL_IFACE->tx_priority = sk_priority_control; BABEL_IFACE->check_link = 1; + BABEL_IFACE->ecmp_weight = 0; }; @@ -146,6 +150,7 @@ babel_iface_item: | AUTHENTICATION NONE { BABEL_IFACE->auth_type = BABEL_AUTH_NONE; } | AUTHENTICATION MAC { BABEL_IFACE->auth_type = BABEL_AUTH_MAC; BABEL_IFACE->auth_permissive = 0; } | AUTHENTICATION MAC PERMISSIVE { BABEL_IFACE->auth_type = BABEL_AUTH_MAC; BABEL_IFACE->auth_permissive = 1; } + | ECMP WEIGHT expr { BABEL_IFACE->ecmp_weight = $3 - 1; if (($3<1) || ($3>256)) cf_error("ECMP weight must be in range 1-256"); } | password_list ; -- 2.30.2