On Fri, Aug 12, 2022 at 03:30:16PM +0200, Claudio Jeker wrote:
> On Fri, Aug 12, 2022 at 03:06:14PM +0200, Theo Buehler wrote:
> > On Fri, Aug 12, 2022 at 12:43:09PM +0200, Claudio Jeker wrote:
> > > There is currently a race in bgpd when multiple nexthop become invalid at
> > > the same time. The problem is that the decision process may select an
> > > alternative path that also has a no longer valid nexthop but the process
> > > that does all the adjustments did not reach that prefix yet.
> > > The main issue here is that the RDE uses the true_nexthop to communicate
> > > this change but the true_nexthop is 0 or :: in this case.
> > >
> > > This diff solves the issue by no longer using true_nexthop in the kroute
> > > message but instead have the kroute code do the lookup instead. The state
> > > in kroute is always up to date so the system knows if the nexthop is valid
> > > or not and either issues a change or remove depending on that.
> > >
> > > With this the rde no longer uses the true_nexthop (it is only there to be
> > > reported to bgpctl). It only cares about exit_nexthop, validity and the
> > > connected network info. All of those should not cause any problems during
> > > nexthop flips.
> >
> > This reads fine. If you fix the grammar of the comment, then it's ok, e.g.:
> >
> > > + * Ignore the nexthop for VPN routes. The gateway is a forced
> >
> > s/a //
> >
> > > + * to an mpe(4) interface route using an MPLS label.
>
> Something is still missing in this diff. Since changing a route does not
> update the kroutes. Guess the kroute code needs to do this proper now, I
> thought I could still use the old slow path for this but it looks like the
> update is surpressed.
>
> Will look more into this and come up with a new version (with the grammar
> fix from above).
Here we go. The bug I was chasing was actually introduced a few weeks ago.
The problem is that when you RTM_CHANGE a route knexthop_track() fails to
notice the change (okr and kr remain the same since the route did not
change) and so no update is issued.
Fix this by introducing knexthop_update() that just forces an update if
the kroute covering this knexthop is modified. The rest of the diff is
unmodified.
--
:wq Claudio
Index: kroute.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/kroute.c,v
retrieving revision 1.288
diff -u -p -r1.288 kroute.c
--- kroute.c 10 Aug 2022 14:21:24 -0000 1.288
+++ kroute.c 15 Aug 2022 12:19:54 -0000
@@ -160,8 +160,10 @@ void kif_clear(void);
int kroute_validate(struct kroute *);
int kroute6_validate(struct kroute6 *);
+int knexthop_true_nexthop(struct ktable *, struct kroute_full *);
void knexthop_validate(struct ktable *, struct knexthop *);
void knexthop_track(struct ktable *, u_short);
+void knexthop_update(struct ktable *, struct kroute_full *);
void knexthop_send_update(struct knexthop *);
struct kroute *kroute_match(struct ktable *, struct bgpd_addr *, int);
struct kroute6 *kroute6_match(struct ktable *, struct bgpd_addr *, int);
@@ -453,6 +455,8 @@ kr_change(u_int rtableid, struct kroute_
return (0);
kf->flags |= F_BGPD;
kf->priority = RTP_MINE;
+ if (!knexthop_true_nexthop(kt, kf))
+ return kroute_remove(kt, kf, 1);
switch (kf->prefix.aid) {
case AID_INET:
return (kr4_change(kt, kf));
@@ -1406,10 +1410,10 @@ kr6_tofull(struct kroute6 *kr6)
bzero(&kf, sizeof(kf));
kf.prefix.aid = AID_INET6;
- memcpy(&kf.prefix.v6, &kr6->prefix, sizeof(struct in6_addr));
+ kf.prefix.v6 = kr6->prefix;
kf.prefix.scope_id = kr6->prefix_scope_id;
kf.nexthop.aid = AID_INET6;
- memcpy(&kf.nexthop.v6, &kr6->nexthop, sizeof(struct in6_addr));
+ kf.nexthop.v6 = kr6->nexthop;
kf.nexthop.scope_id = kr6->nexthop_scope_id;
strlcpy(kf.label, rtlabel_id2name(kr6->labelid), sizeof(kf.label));
kf.flags = kr6->flags;
@@ -2124,6 +2128,51 @@ kroute6_validate(struct kroute6 *kr)
return (kif->k.nh_reachable);
}
+int
+knexthop_true_nexthop(struct ktable *kt, struct kroute_full *kf)
+{
+ struct bgpd_addr gateway = { 0 };
+ struct knexthop *kn;
+ struct kroute *kr;
+ struct kroute6 *kr6;
+
+ /*
+ * Ignore the nexthop for VPN routes. The gateway is forced
+ * to an mpe(4) interface route using an MPLS label.
+ */
+ switch (kf->prefix.aid) {
+ case AID_VPN_IPv4:
+ case AID_VPN_IPv6:
+ return 1;
+ }
+
+ kn = knexthop_find(kt, &kf->nexthop);
+ if (kn == NULL) {
+ log_warnx("%s: nexthop %s not found", __func__,
+ log_addr(&kf->nexthop));
+ return 0;
+ }
+ if (kn->kroute == NULL)
+ return 0;
+
+ switch (kn->nexthop.aid) {
+ case AID_INET:
+ kr = kn->kroute;
+ gateway.aid = AID_INET;
+ gateway.v4.s_addr = kr->nexthop.s_addr;
+ break;
+ case AID_INET6:
+ kr6 = kn->kroute;
+ gateway.aid = AID_INET6;
+ gateway.v6 = kr6->nexthop;
+ gateway.scope_id = kr6->nexthop_scope_id;
+ break;
+ }
+
+ kf->nexthop = gateway;
+ return 1;
+}
+
void
knexthop_validate(struct ktable *kt, struct knexthop *kn)
{
@@ -2150,7 +2199,7 @@ knexthop_validate(struct ktable *kt, str
/*
* Send update if nexthop route changed under us if
* the route remains the same then the NH state has not
- * changed. State changes are tracked by knexthop_track().
+ * changed.
*/
if (kr != oldk)
knexthop_send_update(kn);
@@ -2170,6 +2219,9 @@ knexthop_validate(struct ktable *kt, str
}
}
+/*
+ * Called on interface state change.
+ */
void
knexthop_track(struct ktable *kt, u_short ifindex)
{
@@ -2180,6 +2232,20 @@ knexthop_track(struct ktable *kt, u_shor
knexthop_validate(kt, kn);
}
+/*
+ * Called on route change.
+ */
+void
+knexthop_update(struct ktable *kt, struct kroute_full *kf)
+{
+ struct knexthop *kn;
+
+ RB_FOREACH(kn, knexthop_tree, KT2KNT(kt))
+ if (prefix_compare(&kf->prefix, &kn->nexthop,
+ kf->prefixlen) == 0)
+ knexthop_send_update(kn);
+}
+
void
knexthop_send_update(struct knexthop *kn)
{
@@ -2187,8 +2253,8 @@ knexthop_send_update(struct knexthop *kn
struct kroute *kr;
struct kroute6 *kr6;
- bzero(&n, sizeof(n));
- memcpy(&n.nexthop, &kn->nexthop, sizeof(n.nexthop));
+ memset(&n, 0, sizeof(n));
+ n.nexthop = kn->nexthop;
if (kn->kroute == NULL) {
n.valid = 0; /* NH is not valid */
@@ -2218,14 +2284,12 @@ knexthop_send_update(struct knexthop *kn
if (memcmp(&kr6->nexthop, &in6addr_any,
sizeof(struct in6_addr)) != 0) {
n.gateway.aid = AID_INET6;
- memcpy(&n.gateway.v6, &kr6->nexthop,
- sizeof(struct in6_addr));
+ n.gateway.v6 = kr6->nexthop;
n.gateway.scope_id = kr6->nexthop_scope_id;
}
if (n.connected) {
n.net.aid = AID_INET6;
- memcpy(&n.net.v6, &kr6->prefix,
- sizeof(struct in6_addr));
+ n.net.v6 = kr6->prefix;
n.net.scope_id = kr6->prefix_scope_id;
n.netlen = kr6->prefixlen;
}
@@ -3104,7 +3168,7 @@ kr_fib_change(struct ktable *kt, struct
kr_redistribute(IMSG_NETWORK_ADD,
kt, kr_tofull(kr));
if (kr->flags & F_NEXTHOP && changed)
- knexthop_track(kt, kf->ifindex);
+ knexthop_update(kt, kf);
} else {
kr->flags &= ~F_BGPD_INSERTED;
}
@@ -3177,7 +3241,7 @@ add4:
kt, kr6_tofull(kr6));
if (kr6->flags & F_NEXTHOP && changed)
- knexthop_track(kt, kf->ifindex);
+ knexthop_update(kt, kf);
} else {
kr6->flags &= ~F_BGPD_INSERTED;
}
Index: rde.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde.c,v
retrieving revision 1.562
diff -u -p -r1.562 rde.c
--- rde.c 10 Aug 2022 14:17:01 -0000 1.562
+++ rde.c 11 Aug 2022 09:03:03 -0000
@@ -3056,8 +3056,7 @@ rde_send_kroute(struct rib *rib, struct
kf.flags |= F_REJECT;
if (prefix_nhflags(p) == NEXTHOP_BLACKHOLE)
kf.flags |= F_BLACKHOLE;
- memcpy(&kf.nexthop, &prefix_nexthop(p)->true_nexthop,
- sizeof(kf.nexthop));
+ kf.nexthop = prefix_nexthop(p)->exit_nexthop;
strlcpy(kf.label, rtlabel_id2name(prefix_aspath(p)->rtlabelid),
sizeof(kf.label));
}
@@ -3072,13 +3071,6 @@ rde_send_kroute(struct rib *rib, struct
SIMPLEQ_FOREACH(vpn, &conf->l3vpns, entry) {
if (!rde_l3vpn_import(prefix_communities(p), vpn))
continue;
- /* must send exit_nexthop so that correct MPLS tunnel
- * is chosen
- */
- if (type == IMSG_KROUTE_CHANGE)
- memcpy(&kf.nexthop,
- &prefix_nexthop(p)->exit_nexthop,
- sizeof(kf.nexthop));
/* XXX not ideal but this will change */
kf.ifindex = if_nametoindex(vpn->ifmpe);
if (imsg_compose(ibuf_main, type, vpn->rtableid, 0, -1,