On Tue, Jun 20, 2023 at 05:31:34PM +0100, Stuart Henderson wrote: > This hasn't blown up yet... any interest?
Some comments to the kroute.c changes below. Everything else is fine. > ----- Forwarded message from Stuart Henderson <[email protected]> ----- > > From: Stuart Henderson <[email protected]> > Date: Fri, 26 May 2023 14:40:45 +0100 > To: [email protected] > Subject: ospf6d fib reload [Re: bgpd fix for possible crash in SE] > Mail-Followup-To: [email protected] > > On 2023/05/26 13:52, Stuart Henderson wrote: > > I think my main issues come around LS_REFRESH_TIME intervals, when > > there's loads of churn and "ospf6d: ospf engine" can be busy for > > minutes at a time (not always, but very often). Don't know if that rings > > any bells for anyone... (I am now reminded that RTM_DESYNC isn't handled > > by ospf6d which probably doesn't help matters). > > Here's a first attempt at porting the fib reload/desync diffs from > ospfd to ospf6d ... Not sure if it's good yet, but it didn't immediately > crash and burn when I ran "ospf6ctl fib reload", at least. > ... > Index: ospf6d/kroute.c > =================================================================== > RCS file: /cvs/src/usr.sbin/ospf6d/kroute.c,v > retrieving revision 1.67 > diff -u -p -r1.67 kroute.c > --- ospf6d/kroute.c 8 Mar 2023 04:43:14 -0000 1.67 > +++ ospf6d/kroute.c 26 May 2023 13:37:55 -0000 > @@ -45,16 +45,22 @@ struct { > u_int32_t rtseq; > pid_t pid; > int fib_sync; > + int fib_serial; > u_int8_t fib_prio; > int fd; > - struct event ev; > + struct event ev, reload; Please put reload on its own line. > u_int rdomain; > +#define KR_RELOAD_IDLE 0 > +#define KR_RELOAD_FETCH 1 > +#define KR_RELOAD_HOLD 2 > + int reload_state; > } kr_state; > > struct kroute_node { > RB_ENTRY(kroute_node) entry; > struct kroute_node *next; > struct kroute r; > + int serial; > }; > > void kr_redist_remove(struct kroute_node *, struct kroute_node *); > @@ -90,7 +96,10 @@ void if_announce(void *); > int send_rtmsg(int, int, struct kroute *); > int dispatch_rtmsg(void); > int fetchtable(void); > -int rtmsg_process(char *, size_t); > +int refetchtable(void); > +int rtmsg_process(char *, size_t); > +void kr_fib_reload_timer(int, short, void *); > +void kr_fib_reload_arm_timer(int); > > RB_HEAD(kroute_tree, kroute_node) krt; > RB_PROTOTYPE(kroute_tree, kroute_node, entry, kroute_compare) > @@ -165,6 +174,9 @@ kr_init(int fs, u_int rdomain, int redis > kr_dispatch_msg, NULL); > event_add(&kr_state.ev, NULL); > > + kr_state.reload_state = KR_RELOAD_IDLE; > + evtimer_set(&kr_state.reload, kr_fib_reload_timer, NULL); > + > return (0); > } > > @@ -374,6 +386,62 @@ kr_fib_decouple(void) > } > > void > +kr_fib_reload_timer(int fd, short event, void *bula) > +{ > + if (kr_state.reload_state == KR_RELOAD_FETCH) { > + kr_fib_reload(); > + kr_state.reload_state = KR_RELOAD_HOLD; > + kr_fib_reload_arm_timer(KR_RELOAD_HOLD_TIMER); > + } else { > + kr_state.reload_state = KR_RELOAD_IDLE; > + } > +} > + > +void > +kr_fib_reload_arm_timer(int delay) > +{ > + struct timeval tv; > + > + timerclear(&tv); > + tv.tv_sec = delay / 1000; > + tv.tv_usec = (delay % 1000) * 1000; > + > + if (evtimer_add(&kr_state.reload, &tv) == -1) > + fatal("add_reload_timer"); > +} > + > +void > +kr_fib_reload(void) > +{ > + struct kroute_node *krn, *kr, *kn; > + Maybe include the: log_info("reloading interface list and routing table"); line from ospfd/kroute.c here as well. > + kr_state.fib_serial++; > + > + if (fetchifs(0) != 0 || fetchtable() != 0) > + return; > + > + for (kr = RB_MIN(kroute_tree, &krt); kr != NULL; kr = krn) { > + krn = RB_NEXT(kroute_tree, &krt, kr); > + > + do { > + kn = kr->next; > + > + if (kr->serial != kr_state.fib_serial) { > + > + if (kr->r.priority == RTP_OSPF) { Here you should use kr_state.fib_prio instead of RTP_OSPF. > + kr->serial = kr_state.fib_serial; > + if (send_rtmsg(kr_state.fd, > + RTM_ADD, &kr->r) != 0) > + break; > + } else > + kroute_remove(kr); > + } > + > + } while ((kr = kn) != NULL); > + } > +} > + > +void > kr_fib_update_prio(u_int8_t fib_prio) > { > struct kroute_node *kr; > @@ -664,6 +732,8 @@ kroute_insert(struct kroute_node *kr) > { > struct kroute_node *krm, *krh; > > + kr->serial = kr_state.fib_serial; > + > if ((krh = RB_INSERT(kroute_tree, &krt, kr)) != NULL) { > /* > * Multipath route, add at end of list. > @@ -1279,7 +1349,7 @@ rtmsg_process(char *buf, size_t len) > int flags, mpath; > unsigned int scope; > u_short ifindex = 0; > - int rv; > + int rv, delay; > size_t offset; > char *next; > > @@ -1395,13 +1465,10 @@ rtmsg_process(char *buf, size_t len) > > if ((okr = kroute_find(&prefix, prefixlen, prio)) > != NULL) { > - /* just add new multipath routes */ > - if (mpath && rtm->rtm_type == RTM_ADD) > - goto add; > - /* get the correct route */ > kr = okr; > - if (mpath && (kr = kroute_matchgw(okr, > - &nexthop, scope)) == NULL) { > + if ((mpath || prio == kr_state.fib_prio) && > + (kr = kroute_matchgw(okr, &nexthop, scope)) > == > + NULL) { > log_warnx("rtmsg_process: mpath route" > " not found"); > /* add routes we missed out earlier */ > @@ -1432,13 +1499,15 @@ rtmsg_process(char *buf, size_t len) > kr->r.flags |= F_DOWN; > > /* just readd, the RDE will care */ > - kr_redistribute(okr); > + okr->serial = kr_state.fib_serial; > + kr_redistribute(kr); I think this should be: kr->serial = kr_state.fib_serial; kr_redistribute(okr); kr_redistribute() needs to be called with the head kroute but the serial needs to be set on the route itself. > } else { > add: > if ((kr = calloc(1, > sizeof(struct kroute_node))) == NULL) { > log_warn("rtmsg_process calloc"); > - return (-1); > + rv = -1; > + break; This change is wrong. This must remain as return (-1); > } > kr->r.prefix = prefix; > kr->r.prefixlen = prefixlen; > @@ -1517,6 +1586,23 @@ add: > break; > case RTM_IFANNOUNCE: > if_announce(next); > + break; > + case RTM_DESYNC: > + /* > + * We lost some routing packets. Schedule a reload > + * of the kernel route/interface information. > + */ > + if (kr_state.reload_state == KR_RELOAD_IDLE) { > + delay = KR_RELOAD_TIMER; > + log_info("desync; scheduling fib reload"); > + } else { > + delay = KR_RELOAD_HOLD_TIMER; > + log_debug("desync during KR_RELOAD_%s", > + kr_state.reload_state == > + KR_RELOAD_FETCH ? "FETCH" : "HOLD"); > + } > + kr_state.reload_state = KR_RELOAD_FETCH; > + kr_fib_reload_arm_timer(delay); > break; > default: > /* ignore for now */ As said above everything else is OK. -- :wq Claudio
