On Fri, Aug 12, 2022 at 03:30:16PM +0200, Claudio Jeker wrote:
> On Fri, Aug 12, 2022 at 03:06:14PM +0200, Theo Buehler wrote:
> > On Fri, Aug 12, 2022 at 12:43:09PM +0200, Claudio Jeker wrote:
> > > There is currently a race in bgpd when multiple nexthop become invalid at
> > > the same time. The problem is that the decision process may select an
> > > alternative path that also has a no longer valid nexthop but the process
> > > that does all the adjustments did not reach that prefix yet.
> > > The main issue here is that the RDE uses the true_nexthop to communicate
> > > this change but the true_nexthop is 0 or :: in this case.
> > > 
> > > This diff solves the issue by no longer using true_nexthop in the kroute
> > > message but instead have the kroute code do the lookup instead. The state
> > > in kroute is always up to date so the system knows if the nexthop is valid
> > > or not and either issues a change or remove depending on that.
> > > 
> > > With this the rde no longer uses the true_nexthop (it is only there to be
> > > reported to bgpctl). It only cares about exit_nexthop, validity and the
> > > connected network info. All of those should not cause any problems during
> > > nexthop flips.
> > 
> > This reads fine. If you fix the grammar of the comment, then it's ok, e.g.:
> > 
> > > +  * Ignore the nexthop for VPN routes. The gateway is a forced
> > 
> > s/a //
> > 
> > > +  * to an mpe(4) interface route using an MPLS label.
> 
> Something is still missing in this diff. Since changing a route does not
> update the kroutes. Guess the kroute code needs to do this proper now, I
> thought I could still use the old slow path for this but it looks like the
> update is surpressed. 
> 
> Will look more into this and come up with a new version (with the grammar
> fix from above).

Here we go. The bug I was chasing was actually introduced a few weeks ago.
The problem is that when you RTM_CHANGE a route knexthop_track() fails to
notice the change (okr and kr remain the same since the route did not
change) and so no update is issued.
Fix this by introducing knexthop_update() that just forces an update if
the kroute covering this knexthop is modified. The rest of the diff is
unmodified.

-- 
:wq Claudio

Index: kroute.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/kroute.c,v
retrieving revision 1.288
diff -u -p -r1.288 kroute.c
--- kroute.c    10 Aug 2022 14:21:24 -0000      1.288
+++ kroute.c    15 Aug 2022 12:19:54 -0000
@@ -160,8 +160,10 @@ void                kif_clear(void);
 
 int             kroute_validate(struct kroute *);
 int             kroute6_validate(struct kroute6 *);
+int             knexthop_true_nexthop(struct ktable *, struct kroute_full *);
 void            knexthop_validate(struct ktable *, struct knexthop *);
 void            knexthop_track(struct ktable *, u_short);
+void            knexthop_update(struct ktable *, struct kroute_full *);
 void            knexthop_send_update(struct knexthop *);
 struct kroute  *kroute_match(struct ktable *, struct bgpd_addr *, int);
 struct kroute6 *kroute6_match(struct ktable *, struct bgpd_addr *, int);
@@ -453,6 +455,8 @@ kr_change(u_int rtableid, struct kroute_
                return (0);
        kf->flags |= F_BGPD;
        kf->priority = RTP_MINE;
+       if (!knexthop_true_nexthop(kt, kf))
+               return kroute_remove(kt, kf, 1);
        switch (kf->prefix.aid) {
        case AID_INET:
                return (kr4_change(kt, kf));
@@ -1406,10 +1410,10 @@ kr6_tofull(struct kroute6 *kr6)
        bzero(&kf, sizeof(kf));
 
        kf.prefix.aid = AID_INET6;
-       memcpy(&kf.prefix.v6, &kr6->prefix, sizeof(struct in6_addr));
+       kf.prefix.v6 = kr6->prefix;
        kf.prefix.scope_id = kr6->prefix_scope_id;
        kf.nexthop.aid = AID_INET6;
-       memcpy(&kf.nexthop.v6, &kr6->nexthop, sizeof(struct in6_addr));
+       kf.nexthop.v6 = kr6->nexthop;
        kf.nexthop.scope_id = kr6->nexthop_scope_id;
        strlcpy(kf.label, rtlabel_id2name(kr6->labelid), sizeof(kf.label));
        kf.flags = kr6->flags;
@@ -2124,6 +2128,51 @@ kroute6_validate(struct kroute6 *kr)
        return (kif->k.nh_reachable);
 }
 
+int
+knexthop_true_nexthop(struct ktable *kt, struct kroute_full *kf)
+{
+       struct bgpd_addr gateway = { 0 };
+       struct knexthop *kn;
+       struct kroute   *kr;
+       struct kroute6  *kr6;
+
+       /*
+        * Ignore the nexthop for VPN routes. The gateway is forced
+        * to an mpe(4) interface route using an MPLS label.
+        */
+       switch (kf->prefix.aid) {
+       case AID_VPN_IPv4:
+       case AID_VPN_IPv6:
+               return 1;
+       }
+
+       kn = knexthop_find(kt, &kf->nexthop);
+       if (kn == NULL) {
+               log_warnx("%s: nexthop %s not found", __func__,
+                   log_addr(&kf->nexthop));
+               return 0;
+       }
+       if (kn->kroute == NULL)
+               return 0;
+
+       switch (kn->nexthop.aid) {
+       case AID_INET:
+               kr = kn->kroute;
+               gateway.aid = AID_INET;
+               gateway.v4.s_addr = kr->nexthop.s_addr;
+               break;
+       case AID_INET6:
+               kr6 = kn->kroute;
+               gateway.aid = AID_INET6;
+               gateway.v6 = kr6->nexthop;
+               gateway.scope_id = kr6->nexthop_scope_id;
+               break;
+       }
+
+       kf->nexthop = gateway;
+       return 1;
+}
+
 void
 knexthop_validate(struct ktable *kt, struct knexthop *kn)
 {
@@ -2150,7 +2199,7 @@ knexthop_validate(struct ktable *kt, str
                /*
                 * Send update if nexthop route changed under us if
                 * the route remains the same then the NH state has not
-                * changed. State changes are tracked by knexthop_track().
+                * changed.
                 */
                if (kr != oldk)
                        knexthop_send_update(kn);
@@ -2170,6 +2219,9 @@ knexthop_validate(struct ktable *kt, str
        }
 }
 
+/*
+ * Called on interface state change.
+ */
 void
 knexthop_track(struct ktable *kt, u_short ifindex)
 {
@@ -2180,6 +2232,20 @@ knexthop_track(struct ktable *kt, u_shor
                        knexthop_validate(kt, kn);
 }
 
+/*
+ * Called on route change.
+ */
+void
+knexthop_update(struct ktable *kt, struct kroute_full *kf)
+{
+       struct knexthop *kn;
+
+       RB_FOREACH(kn, knexthop_tree, KT2KNT(kt))
+               if (prefix_compare(&kf->prefix, &kn->nexthop,
+                   kf->prefixlen) == 0)
+                       knexthop_send_update(kn);
+}
+
 void
 knexthop_send_update(struct knexthop *kn)
 {
@@ -2187,8 +2253,8 @@ knexthop_send_update(struct knexthop *kn
        struct kroute           *kr;
        struct kroute6          *kr6;
 
-       bzero(&n, sizeof(n));
-       memcpy(&n.nexthop, &kn->nexthop, sizeof(n.nexthop));
+       memset(&n, 0, sizeof(n));
+       n.nexthop = kn->nexthop;
 
        if (kn->kroute == NULL) {
                n.valid = 0;    /* NH is not valid */
@@ -2218,14 +2284,12 @@ knexthop_send_update(struct knexthop *kn
                if (memcmp(&kr6->nexthop, &in6addr_any,
                    sizeof(struct in6_addr)) != 0) {
                        n.gateway.aid = AID_INET6;
-                       memcpy(&n.gateway.v6, &kr6->nexthop,
-                           sizeof(struct in6_addr));
+                       n.gateway.v6 = kr6->nexthop;
                        n.gateway.scope_id = kr6->nexthop_scope_id;
                }
                if (n.connected) {
                        n.net.aid = AID_INET6;
-                       memcpy(&n.net.v6, &kr6->prefix,
-                           sizeof(struct in6_addr));
+                       n.net.v6 = kr6->prefix;
                        n.net.scope_id = kr6->prefix_scope_id;
                        n.netlen = kr6->prefixlen;
                }
@@ -3104,7 +3168,7 @@ kr_fib_change(struct ktable *kt, struct 
                                        kr_redistribute(IMSG_NETWORK_ADD,
                                            kt, kr_tofull(kr));
                                if (kr->flags & F_NEXTHOP && changed)
-                                       knexthop_track(kt, kf->ifindex);
+                                       knexthop_update(kt, kf);
                        } else {
                                kr->flags &= ~F_BGPD_INSERTED;
                        }
@@ -3177,7 +3241,7 @@ add4:
                                            kt, kr6_tofull(kr6));
 
                                if (kr6->flags & F_NEXTHOP && changed)
-                                       knexthop_track(kt, kf->ifindex);
+                                       knexthop_update(kt, kf);
                        } else {
                                kr6->flags &= ~F_BGPD_INSERTED;
                        }
Index: rde.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde.c,v
retrieving revision 1.562
diff -u -p -r1.562 rde.c
--- rde.c       10 Aug 2022 14:17:01 -0000      1.562
+++ rde.c       11 Aug 2022 09:03:03 -0000
@@ -3056,8 +3056,7 @@ rde_send_kroute(struct rib *rib, struct 
                        kf.flags |= F_REJECT;
                if (prefix_nhflags(p) == NEXTHOP_BLACKHOLE)
                        kf.flags |= F_BLACKHOLE;
-               memcpy(&kf.nexthop, &prefix_nexthop(p)->true_nexthop,
-                   sizeof(kf.nexthop));
+               kf.nexthop = prefix_nexthop(p)->exit_nexthop;
                strlcpy(kf.label, rtlabel_id2name(prefix_aspath(p)->rtlabelid),
                    sizeof(kf.label));
        }
@@ -3072,13 +3071,6 @@ rde_send_kroute(struct rib *rib, struct 
                SIMPLEQ_FOREACH(vpn, &conf->l3vpns, entry) {
                        if (!rde_l3vpn_import(prefix_communities(p), vpn))
                                continue;
-                       /* must send exit_nexthop so that correct MPLS tunnel
-                        * is chosen
-                        */
-                       if (type == IMSG_KROUTE_CHANGE)
-                               memcpy(&kf.nexthop,
-                                   &prefix_nexthop(p)->exit_nexthop,
-                                   sizeof(kf.nexthop));
                        /* XXX not ideal but this will change */
                        kf.ifindex = if_nametoindex(vpn->ifmpe);
                        if (imsg_compose(ibuf_main, type, vpn->rtableid, 0, -1,

Reply via email to