On Thu, May 31, 2007 at 11:45:08PM +0200, Claudio Jeker wrote:
> On Wed, May 30, 2007 at 08:04:45PM +0200, Christian Plattner wrote:
> > Hi,
> >
> > I am testing OpenBGPD and OpenOSPFD on a couple of Soekris boxes.
> > Even though I am using the latest code (-stable with ospfd kroute.c
> > revision 1.48), I am having problems with the kernel routing table
> > when OSPFD has to react to changes in the topology. I verified the
> > problem on a virtual setup (a couple of OpenBSD machines on an ESX
> > server), same result.
> >
> > The problem can be summarized as follows: When I take down an interface
> > on one machine manually (e.g., ifconfig em1 down), then the OpenOSPFD
> > on another machine has no problem detecting this, and routes to subnets
> > in the same AS are adapted. However, the kernel continues to route
> > packets to destinations outside of the AS over the dead link.
> >
> > Fix: When I restart ospfd, the kernel routing table is OK again.
> >
> > Here is an example with 3 routers that I have put together using
> > ESX/VMWare:
> >
> >                    /em1-(.1) --- 10.74.96.0/27 --- (.2)--em0\
> >  +-- (.22)-em0-[R1]                                           [R2]
> >  |                 \em2-(.33) -- 10.74.96.32/27 -- (.34)--em1/
> > 10.0.0.0/24
> >  |
> >  +--- (.1)-em1-[R0]-em0 -- (62.2.0.0/16)
> >
> > Router R0: AS65002 announces 62.2.0.0/16 to R1
> > Router R1: AS65001 announces 10.74.96.0/21 to R0
> > Router R2: AS65001 has an IBGP session with R1
> > Loopback (lo1) addresses: R1=10.74.97.1, R2=10.74.97.2
> >
> > This setting works fine, I can ping from R2 to machines in 62.2.0.0/16.
> > Traffic between R1 and R2 flows over the upper link.
> >
> > However, let's assume that one of the links between R1 and R2 fails.
> >
> > [R1] # ifconfig em1 down (so eventually R2 will find out that it does
> > not receive any OSPF packets on em0 anymore).
> >
> > It takes a while, but then ospfd on R2 has calculated the new topology:
> >
> > [R2] # ospfctl show rib
> > Destination Nexthop Path Type Type Cost
> > 0.0.0.1 10.74.96.33 Intra-Area Router 11
> > 10.74.96.0/27 10.74.96.33 Intra-Area Network 21
> > 10.74.96.32/27 10.74.96.34 Intra-Area Network 11
> > 10.74.97.1/32 10.74.96.33 Intra-Area Network 21
> > 10.0.0.0/24 10.74.96.33 Type 1 ext Network 111
> > (uptime column deleted, to comply with the 72 char restriction
> > of the mailing list).
> >
> > [R2] # ospfctl show fib
> > flags: * = valid, O = OSPF, C = Connected, S = Static
> > Flags Destination Nexthop
> > *O 10.0.0.0/24 10.74.96.33
> > * 10.74.96.0/21 10.74.96.1
> > *C 10.74.96.0/27 link#1
> > *C 10.74.96.32/27 link#2
> > *O 10.74.97.1/32 10.74.96.33
> > * 10.74.97.2/32 10.74.97.2
> > * 62.2.0.0/16 10.74.96.1
> > *S 127.0.0.0/8 127.0.0.1
> > *C 127.0.0.1/8 link#0
> > * 127.0.0.1/32 127.0.0.1
> > *S 224.0.0.0/4 127.0.0.1
> >
> > This is not good, as the (via IBGP learned) route to 62.2.0.0/16 still
> > points to 10.74.96.1 (which is not directly reachable anymore).
> >
> > Now let's kill and restart ospfd on R2, then check again:
> >
> > # ospfctl show fib
> > flags: * = valid, O = OSPF, C = Connected, S = Static
> > Flags Destination Nexthop
> > *O 10.0.0.0/24 10.74.96.33
> > * 10.74.96.0/21 10.74.96.33
> > *C 10.74.96.0/27 link#1
> > *C 10.74.96.32/27 link#2
> > *O 10.74.97.1/32 10.74.96.33
> > * 10.74.97.2/32 10.74.97.2
> > * 62.2.0.0/16 10.74.96.33
> > *S 127.0.0.0/8 127.0.0.1
> > *C 127.0.0.1/8 link#0
> > * 127.0.0.1/32 127.0.0.1
> > *S 224.0.0.0/4 127.0.0.1
> >
> > Voilà, now it looks OK =)
> >
> > This is the ospfd.conf of R2:
> >
> > password="gurke"
> > router-id 0.0.0.2
> > redistribute connected
> > redistribute static
> >
> > area 0.0.0.0 {
> >
> > interface lo1
> >
> > interface em0 {
> > metric 10
> > auth-type simple
> > auth-key $password
> > }
> > interface em1 {
> > metric 11
> > auth-type simple
> > auth-key $password
> > }
> > }
> >
> > Any suggestions? Am I making a substantial error?
> >
> > I did not want to make this posting too long, so if somebody is
> > interested in the detailed config files then I can make them
> > available.
> >
>
> This is a bgpd bug. Because the 62.2/16 network is handled by bgpd.
> I'm currently having a look at this. Not sure why the network does not
> swing over to the working link but hopefully I will find it out.
>
And here is a preliminary diff for all the curious ones. bgpd needs to
track changes of routes with F_NEXTHOP checked and report them to the RDE.
The RDE will then update all active routes that use this nexthop. Seems to
work for me.
--
:wq Claudio
? obj
Index: kroute.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/kroute.c,v
retrieving revision 1.154
diff -u -p -r1.154 kroute.c
--- kroute.c 11 May 2007 11:27:59 -0000 1.154
+++ kroute.c 1 Jun 2007 05:45:29 -0000
@@ -118,6 +118,7 @@ int kif_validate(struct kif *);
int kroute_validate(struct kroute *);
int kroute6_validate(struct kroute6 *);
void knexthop_validate(struct knexthop_node *);
+void knexthop_track(void *);
struct kroute_node *kroute_match(in_addr_t, int);
struct kroute6_node *kroute6_match(struct in6_addr *, int);
void kroute_detach_nexthop(struct knexthop_node *);
@@ -1365,6 +1366,46 @@ knexthop_validate(struct knexthop_node *
}
}
+void
+knexthop_track(void *krn)
+{
+ struct knexthop_node *kn;
+ struct kroute_node *kr;
+ struct kroute6_node *kr6;
+ struct kroute_nexthop n;
+
+ RB_FOREACH(kn, knexthop_tree, &knt)
+ if (kn->kroute == krn) {
+ bzero(&n, sizeof(n));
+ memcpy(&n.nexthop, &kn->nexthop, sizeof(n.nexthop));
+
+ switch (kn->nexthop.af) {
+ case AF_INET:
+ kr = krn;
+ n.valid = 1;
+ n.connected = kr->r.flags & F_CONNECTED;
+ if ((n.gateway.v4.s_addr =
+ kr->r.nexthop.s_addr) != 0)
+ n.gateway.af = AF_INET;
+ memcpy(&n.kr.kr4, &kr->r, sizeof(n.kr.kr4));
+ break;
+ case AF_INET6:
+ kr6 = krn;
+ n.valid = 1;
+ n.connected = kr6->r.flags & F_CONNECTED;
+ if (memcmp(&kr6->r.nexthop, &in6addr_any,
+ sizeof(struct in6_addr)) != 0) {
+ n.gateway.af = AF_INET6;
+ memcpy(&n.gateway.v6, &kr6->r.nexthop,
+ sizeof(struct in6_addr));
+ }
+ memcpy(&n.kr.kr6, &kr6->r, sizeof(n.kr.kr6));
+ break;
+ }
+ send_nexthop_update(&n);
+ }
+}
+
struct kroute_node *
kroute_match(in_addr_t key, int matchall)
{
@@ -1447,7 +1488,6 @@ kroute_detach_nexthop(struct knexthop_no
kn->kroute = NULL;
}
-
/*
* misc helpers
*/
@@ -2397,6 +2437,8 @@ dispatch_rtmsg_addr(struct rt_msghdr *rt
kr_redistribute(IMSG_NETWORK_ADD,
&kr->r);
}
+ if (kr->r.flags & F_NEXTHOP)
+ knexthop_track(kr);
}
} else if (rtm->rtm_type == RTM_CHANGE) {
log_warnx("change req for %s/%u: not in table",
@@ -2449,6 +2491,8 @@ dispatch_rtmsg_addr(struct rt_msghdr *rt
kr_redistribute6(IMSG_NETWORK_ADD,
&kr6->r);
}
+ if (kr6->r.flags & F_NEXTHOP)
+ knexthop_track(kr6);
}
} else if (rtm->rtm_type == RTM_CHANGE) {
log_warnx("change req for %s/%u: not in table",
Index: rde.h
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde.h,v
retrieving revision 1.100
diff -u -p -r1.100 rde.h
--- rde.h 1 Jun 2007 04:17:30 -0000 1.100
+++ rde.h 1 Jun 2007 05:45:29 -0000
@@ -350,7 +350,8 @@ void prefix_remove(struct rde_peer *,
u_int32_t);
int prefix_write(u_char *, int, struct bgpd_addr *, u_int8_t);
struct prefix *prefix_bypeer(struct pt_entry *, struct rde_peer *, u_int32_t);
-void prefix_updateall(struct rde_aspath *, enum nexthop_state);
+void prefix_updateall(struct rde_aspath *, enum nexthop_state,
+ enum nexthop_state);
void prefix_destroy(struct prefix *);
void prefix_network_clean(struct rde_peer *, time_t);
Index: rde_rib.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde_rib.c,v
retrieving revision 1.96
diff -u -p -r1.96 rde_rib.c
--- rde_rib.c 1 Jun 2007 04:17:30 -0000 1.96
+++ rde_rib.c 1 Jun 2007 05:45:29 -0000
@@ -616,7 +616,8 @@ prefix_bypeer(struct pt_entry *pte, stru
}
void
-prefix_updateall(struct rde_aspath *asp, enum nexthop_state state)
+prefix_updateall(struct rde_aspath *asp, enum nexthop_state state,
+ enum nexthop_state oldstate)
{
struct prefix *p;
@@ -632,6 +633,18 @@ prefix_updateall(struct rde_aspath *asp,
if (!(p->flags & F_LOCAL))
continue;
+ if (oldstate == state && state == NEXTHOP_REACH) {
+ /*
+ * The state of the nexthop did not change. The only
+ * thing that may have changed is the true_nexthop
+ * or other internal infos. This will not change
+ * the routing decision so shortcut here.
+ */
+ if (p == p->prefix->active)
+ rde_send_kroute(p, NULL);
+ continue;
+ }
+
/* redo the route decision */
LIST_REMOVE(p, prefix_l);
/*
@@ -817,6 +830,7 @@ nexthop_update(struct kroute_nexthop *ms
{
struct nexthop *nh;
struct rde_aspath *asp;
+ enum nexthop_state oldstate;
nh = nexthop_lookup(&msg->nexthop);
if (nh == NULL) {
@@ -825,15 +839,16 @@ nexthop_update(struct kroute_nexthop *ms
return;
}
+ if (nexthop_delete(nh))
+ /* nexthop no longer used */
+ return;
+
+ oldstate = nh->state;
if (msg->valid)
nh->state = NEXTHOP_REACH;
else
nh->state = NEXTHOP_UNREACH;
- if (nexthop_delete(nh))
- /* nexthop no longer used */
- return;
-
if (msg->connected) {
nh->flags |= NEXTHOP_CONNECTED;
memcpy(&nh->true_nexthop, &nh->exit_nexthop,
@@ -866,7 +881,7 @@ nexthop_update(struct kroute_nexthop *ms
return;
LIST_FOREACH(asp, &nh->path_h, nexthop_l) {
- prefix_updateall(asp, nh->state);
+ prefix_updateall(asp, nh->state, oldstate);
}
}