There is currently a race in bgpd when multiple nexthops become invalid at
the same time. The problem is that the decision process may select an
alternative path that also has a no-longer-valid nexthop, because the process
that does all the adjustments has not reached that prefix yet.
The main issue here is that the RDE uses the true_nexthop to communicate
this change but the true_nexthop is 0 or :: in this case.
This diff solves the issue by no longer using true_nexthop in the kroute
message and instead having the kroute code do the lookup. The state
in kroute is always up to date, so the system knows whether the nexthop is
valid and issues either a change or a remove accordingly.
With this the rde no longer uses the true_nexthop (it is only there to be
reported to bgpctl). It only cares about exit_nexthop, validity and the
connected network info. All of those should not cause any problems during
nexthop flips.
--
:wq Claudio
Index: kroute.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/kroute.c,v
retrieving revision 1.288
diff -u -p -r1.288 kroute.c
--- kroute.c 10 Aug 2022 14:21:24 -0000 1.288
+++ kroute.c 11 Aug 2022 09:31:15 -0000
@@ -160,6 +160,7 @@ void kif_clear(void);
int kroute_validate(struct kroute *);
int kroute6_validate(struct kroute6 *);
+int knexthop_true_nexthop(struct ktable *, struct kroute_full *);
void knexthop_validate(struct ktable *, struct knexthop *);
void knexthop_track(struct ktable *, u_short);
void knexthop_send_update(struct knexthop *);
@@ -453,6 +454,8 @@ kr_change(u_int rtableid, struct kroute_
return (0);
kf->flags |= F_BGPD;
kf->priority = RTP_MINE;
+ if (!knexthop_true_nexthop(kt, kf))
+ return kroute_remove(kt, kf, 1);
switch (kf->prefix.aid) {
case AID_INET:
return (kr4_change(kt, kf));
@@ -2122,6 +2125,51 @@ kroute6_validate(struct kroute6 *kr)
}
return (kif->k.nh_reachable);
+}
+
+int
+knexthop_true_nexthop(struct ktable *kt, struct kroute_full *kf)
+{
+ struct bgpd_addr gateway = { 0 };
+ struct knexthop *kn;
+ struct kroute *kr;
+ struct kroute6 *kr6;
+
+ /*
+ * Ignore the nexthop for VPN routes. The gateway is forced
+ * to an mpe(4) interface route using an MPLS label.
+ */
+ switch (kf->prefix.aid) {
+ case AID_VPN_IPv4:
+ case AID_VPN_IPv6:
+ return 1;
+ }
+
+ kn = knexthop_find(kt, &kf->nexthop);
+ if (kn == NULL) {
+ log_warnx("%s: nexthop %s not found", __func__,
+ log_addr(&kf->nexthop));
+ return 0;
+ }
+ if (kn->kroute == NULL)
+ return 0;
+
+ switch (kn->nexthop.aid) {
+ case AID_INET:
+ kr = kn->kroute;
+ gateway.aid = AID_INET;
+ gateway.v4.s_addr = kr->nexthop.s_addr;
+ break;
+ case AID_INET6:
+ kr6 = kn->kroute;
+ gateway.aid = AID_INET6;
+ memcpy(&gateway.v6, &kr6->nexthop, sizeof(struct in6_addr));
+ gateway.scope_id = kr6->nexthop_scope_id;
+ break;
+ }
+
+ kf->nexthop = gateway;
+ return 1;
}
void
Index: rde.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde.c,v
retrieving revision 1.562
diff -u -p -r1.562 rde.c
--- rde.c 10 Aug 2022 14:17:01 -0000 1.562
+++ rde.c 11 Aug 2022 09:03:03 -0000
@@ -3056,8 +3056,7 @@ rde_send_kroute(struct rib *rib, struct
kf.flags |= F_REJECT;
if (prefix_nhflags(p) == NEXTHOP_BLACKHOLE)
kf.flags |= F_BLACKHOLE;
- memcpy(&kf.nexthop, &prefix_nexthop(p)->true_nexthop,
- sizeof(kf.nexthop));
+ kf.nexthop = prefix_nexthop(p)->exit_nexthop;
strlcpy(kf.label, rtlabel_id2name(prefix_aspath(p)->rtlabelid),
sizeof(kf.label));
}
@@ -3072,13 +3071,6 @@ rde_send_kroute(struct rib *rib, struct
SIMPLEQ_FOREACH(vpn, &conf->l3vpns, entry) {
if (!rde_l3vpn_import(prefix_communities(p), vpn))
continue;
- /* must send exit_nexthop so that correct MPLS tunnel
- * is chosen
- */
- if (type == IMSG_KROUTE_CHANGE)
- memcpy(&kf.nexthop,
- &prefix_nexthop(p)->exit_nexthop,
- sizeof(kf.nexthop));
/* XXX not ideal but this will change */
kf.ifindex = if_nametoindex(vpn->ifmpe);
if (imsg_compose(ibuf_main, type, vpn->rtableid, 0, -1,