The Adj-RIB-Out is a bottleneck on route servers with many peers. The problem is that all outgoing prefixes are part of a linear list in struct rib_entry. Solving this requires that the Adj-RIB-Out no longer uses the rib_entry pointer.
So the first step is to add a pointer to the prefix table entry (the actual IP prefix) directly into struct prefix. This diff implements this and replaces all p->re->prefix with direct p->pt references. Most of it is mechanical. I also adjusted pt_ref and pt_unref slightly to simplify the code, and prefix_cmp() now calls pt_prefix_cmp() directly and no longer goes via rib_compare(). OK? -- :wq Claudio Index: mrt.c =================================================================== RCS file: /cvs/src/usr.sbin/bgpd/mrt.c,v retrieving revision 1.94 diff -u -p -r1.94 mrt.c --- mrt.c 7 Mar 2019 07:42:36 -0000 1.94 +++ mrt.c 10 Mar 2019 11:32:47 -0000 @@ -290,7 +290,7 @@ mrt_dump_entry_mp(struct mrt *mrt, struc DUMP_SHORT(h2buf, /* ifindex */ 0); /* XXX is this for peer self? */ - aid = peer->remote_addr.aid == AID_UNSPEC ? p->re->prefix->aid : + aid = peer->remote_addr.aid == AID_UNSPEC ? p->pt->aid : peer->remote_addr.aid; switch (aid) { case AID_INET: @@ -317,7 +317,7 @@ mrt_dump_entry_mp(struct mrt *mrt, struc DUMP_SHORT(h2buf, 1); /* status */ DUMP_LONG(h2buf, p->lastchange); /* originated */ - pt_getaddr(p->re->prefix, &addr); + pt_getaddr(p->pt, &addr); n = prefix_nexthop(p); if (n == NULL) { @@ -348,7 +348,7 @@ mrt_dump_entry_mp(struct mrt *mrt, struc goto fail; } - if (prefix_writebuf(h2buf, &addr, p->re->prefix->prefixlen) == -1) { + if (prefix_writebuf(h2buf, &addr, p->pt->prefixlen) == -1) { log_warn("mrt_dump_entry_mp: prefix_writebuf error"); goto fail; } @@ -384,8 +384,8 @@ mrt_dump_entry(struct mrt *mrt, struct p u_int16_t subtype; u_int8_t dummy; - if (p->re->prefix->aid != peer->remote_addr.aid && - p->re->prefix->aid != AID_INET && p->re->prefix->aid != AID_INET6) + if (p->pt->aid != peer->remote_addr.aid && + p->pt->aid != AID_INET && p->pt->aid != AID_INET6) /* only able to dump pure IPv4/IPv6 */ return (0); @@ -397,7 +397,7 @@ mrt_dump_entry(struct mrt *mrt, struct p nexthop = prefix_nexthop(p); if (nexthop == NULL) { bzero(&addr, 
sizeof(struct bgpd_addr)); - addr.aid = p->re->prefix->aid; + addr.aid = p->pt->aid; nh = &addr; } else nh = &nexthop->exit_nexthop; @@ -407,7 +407,7 @@ mrt_dump_entry(struct mrt *mrt, struct p return (-1); } len = ibuf_size(buf); - aid2afi(p->re->prefix->aid, &subtype, &dummy); + aid2afi(p->pt->aid, &subtype, &dummy); if (mrt_dump_hdr_rde(&hbuf, MSG_TABLE_DUMP, subtype, len) == -1) { ibuf_free(buf); return (-1); @@ -416,8 +416,8 @@ mrt_dump_entry(struct mrt *mrt, struct p DUMP_SHORT(hbuf, 0); DUMP_SHORT(hbuf, snum); - pt_getaddr(p->re->prefix, &addr); - switch (p->re->prefix->aid) { + pt_getaddr(p->pt, &addr); + switch (p->pt->aid) { case AID_INET: DUMP_NLONG(hbuf, addr.v4.s_addr); break; @@ -428,11 +428,11 @@ mrt_dump_entry(struct mrt *mrt, struct p } break; } - DUMP_BYTE(hbuf, p->re->prefix->prefixlen); + DUMP_BYTE(hbuf, p->pt->prefixlen); DUMP_BYTE(hbuf, 1); /* state */ DUMP_LONG(hbuf, p->lastchange); /* originated */ - switch (p->re->prefix->aid) { + switch (p->pt->aid) { case AID_INET: DUMP_NLONG(hbuf, peer->remote_addr.v4.s_addr); break; Index: rde.c =================================================================== RCS file: /cvs/src/usr.sbin/bgpd/rde.c,v retrieving revision 1.471 diff -u -p -r1.471 rde.c --- rde.c 20 Jun 2019 13:18:19 -0000 1.471 +++ rde.c 20 Jun 2019 13:22:20 -0000 @@ -2234,11 +2234,11 @@ rde_dump_rib_as(struct prefix *p, struct /* announced network may have a NULL nexthop */ bzero(&rib.true_nexthop, sizeof(rib.true_nexthop)); bzero(&rib.exit_nexthop, sizeof(rib.exit_nexthop)); - rib.true_nexthop.aid = p->re->prefix->aid; - rib.exit_nexthop.aid = p->re->prefix->aid; + rib.true_nexthop.aid = p->pt->aid; + rib.exit_nexthop.aid = p->pt->aid; } - pt_getaddr(p->re->prefix, &rib.prefix); - rib.prefixlen = p->re->prefix->prefixlen; + pt_getaddr(p->pt, &rib.prefix); + rib.prefixlen = p->pt->prefixlen; rib.origin = asp->origin; rib.validation_state = p->validation_state; rib.flags = 0; @@ -2254,7 +2254,7 @@ rde_dump_rib_as(struct prefix *p, 
struct rib.flags &= ~F_PREF_ELIGIBLE; if (asp->flags & F_ATTR_PARSE_ERR) rib.flags |= F_PREF_INVALID; - staletime = prefix_peer(p)->staletime[p->re->prefix->aid]; + staletime = prefix_peer(p)->staletime[p->pt->aid]; if (staletime && p->lastchange <= staletime) rib.flags |= F_PREF_STALE; rib.aspath_len = aspath_length(asp->aspath); @@ -2610,10 +2610,10 @@ rde_send_kroute(struct rib *rib, struct } asp = prefix_aspath(p); - pt_getaddr(p->re->prefix, &addr); + pt_getaddr(p->pt, &addr); bzero(&kr, sizeof(kr)); memcpy(&kr.prefix, &addr, sizeof(kr.prefix)); - kr.prefixlen = p->re->prefix->prefixlen; + kr.prefixlen = p->pt->prefixlen; if (prefix_nhflags(p) == NEXTHOP_REJECT) kr.flags |= F_REJECT; if (prefix_nhflags(p) == NEXTHOP_BLACKHOLE) @@ -3891,7 +3891,7 @@ network_dump_upcall(struct rib_entry *re asp = prefix_aspath(p); if (!(asp->flags & F_PREFIX_ANNOUNCED)) continue; - pt_getaddr(p->re->prefix, &addr); + pt_getaddr(p->pt, &addr); bzero(&k, sizeof(k)); memcpy(&k.prefix, &addr, sizeof(k.prefix)); @@ -3901,7 +3901,7 @@ network_dump_upcall(struct rib_entry *re else memcpy(&k.nexthop, &prefix_nexthop(p)->true_nexthop, sizeof(k.nexthop)); - k.prefixlen = p->re->prefix->prefixlen; + k.prefixlen = p->pt->prefixlen; k.flags = F_KERNEL; if ((asp->flags & F_ANN_DYNAMIC) == 0) k.flags = F_STATIC; Index: rde.h =================================================================== RCS file: /cvs/src/usr.sbin/bgpd/rde.h,v retrieving revision 1.216 diff -u -p -r1.216 rde.h --- rde.h 20 Jun 2019 13:38:21 -0000 1.216 +++ rde.h 20 Jun 2019 14:42:17 -0000 @@ -308,6 +308,7 @@ struct pt_entry_vpn6 { struct prefix { LIST_ENTRY(prefix) rib_l, nexthop_l; RB_ENTRY(prefix) entry; + struct pt_entry *pt; struct rib_entry *re; struct rde_aspath *aspath; struct rde_community *communities; @@ -464,20 +465,22 @@ pt_empty(struct pt_entry *pt) return (pt->refcnt == 0); } -static inline void +static inline struct pt_entry * pt_ref(struct pt_entry *pt) { ++pt->refcnt; if (pt->refcnt == 0) fatalx("pt_ref: 
overflow"); + return pt; } -static inline void +static inline int pt_unref(struct pt_entry *pt) { if (pt->refcnt == 0) fatalx("pt_unref: underflow"); --pt->refcnt; + return pt_empty(pt); } void pt_init(void); Index: rde_filter.c =================================================================== RCS file: /cvs/src/usr.sbin/bgpd/rde_filter.c,v retrieving revision 1.119 diff -u -p -r1.119 rde_filter.c --- rde_filter.c 17 Jun 2019 11:02:19 -0000 1.119 +++ rde_filter.c 17 Jun 2019 11:35:39 -0000 @@ -248,8 +248,8 @@ rde_filter_match(struct filter_rule *f, struct bgpd_addr addr, *prefix = &addr; u_int8_t plen; - pt_getaddr(p->re->prefix, prefix); - plen = p->re->prefix->prefixlen; + pt_getaddr(p->pt, prefix); + plen = p->pt->prefixlen; if (trie_roa_check(&f->match.originset.ps->th, prefix, plen, aspath_origin(asp->aspath)) != ROA_VALID) return (0); @@ -262,8 +262,8 @@ rde_filter_match(struct filter_rule *f, struct bgpd_addr addr, *prefix = &addr; u_int8_t plen; - pt_getaddr(p->re->prefix, prefix); - plen = p->re->prefix->prefixlen; + pt_getaddr(p->pt, prefix); + plen = p->pt->prefixlen; if (f->match.prefixset.ps == NULL || !trie_match(&f->match.prefixset.ps->th, prefix, plen, (f->match.prefixset.flags & PREFIXSET_FLAG_LONGER))) @@ -282,8 +282,8 @@ rde_prefix_match(struct filter_prefix *f struct bgpd_addr addr, *prefix = &addr; u_int8_t plen; - pt_getaddr(p->re->prefix, prefix); - plen = p->re->prefix->prefixlen; + pt_getaddr(p->pt, prefix); + plen = p->pt->prefixlen; if (fp->addr.aid != prefix->aid) /* don't use IPv4 rules for IPv6 and vice versa */ @@ -784,7 +784,7 @@ rde_filter(struct filter_head *rules, st f->skip[RDE_FILTER_SKIP_PEERID]); if (rde_filter_match(f, peer, state, p)) { - rde_apply_set(&f->set, state, p->re->prefix->aid, + rde_apply_set(&f->set, state, p->pt->aid, prefix_peer(p), peer); if (f->action != ACTION_NONE) action = f->action; Index: rde_rib.c =================================================================== RCS file: 
/cvs/src/usr.sbin/bgpd/rde_rib.c,v retrieving revision 1.194 diff -u -p -r1.194 rde_rib.c --- rde_rib.c 20 Jun 2019 19:32:16 -0000 1.194 +++ rde_rib.c 21 Jun 2019 08:32:31 -0000 @@ -103,13 +103,6 @@ rib_tree(struct rib *rib) static inline int rib_compare(const struct rib_entry *a, const struct rib_entry *b) { - /* need to handle NULL entries because of EoR marker */ - if (a == NULL && b == NULL) - return (0); - else if (b == NULL) - return (1); - else if (a == NULL) - return (-1); return (pt_prefix_cmp(a->prefix, b->prefix)); } @@ -207,10 +200,10 @@ rib_free(struct rib *rib) if (asp && asp->pftableid) { struct bgpd_addr addr; - pt_getaddr(p->re->prefix, &addr); + pt_getaddr(p->pt, &addr); /* Commit is done in peer_down() */ rde_send_pftable(asp->pftableid, &addr, - p->re->prefix->prefixlen, 1); + p->pt->prefixlen, 1); } prefix_destroy(p); if (np == NULL) @@ -308,7 +301,7 @@ rib_add(struct rib *rib, struct bgpd_add fatal("rib_add"); LIST_INIT(&re->prefix_h); - re->prefix = pte; + re->prefix = pt_ref(pte); re->rib_id = rib->id; if (RB_INSERT(rib_tree, rib_tree(rib), re) != NULL) { @@ -317,7 +310,6 @@ rib_add(struct rib *rib, struct bgpd_add return (NULL); } - pt_ref(pte); rdemem.rib_cnt++; @@ -334,8 +326,7 @@ rib_remove(struct rib_entry *re) /* entry is locked, don't free it. */ return; - pt_unref(re->prefix); - if (pt_empty(re->prefix)) + if (pt_unref(re->prefix)) pt_remove(re->prefix); if (RB_REMOVE(rib_tree, rib_tree(re_rib(re)), re) == NULL) @@ -510,7 +501,6 @@ SIPHASH_KEY pathtablekey; #define PATH_HASH(x) &pathtable.path_hashtbl[x & pathtable.path_hashmask] - static inline int path_empty(struct rde_aspath *asp) { @@ -879,6 +869,10 @@ prefix_cmp(struct prefix *a, struct pref { if (a->eor != b->eor) return a->eor - b->eor; + /* if EOR marker no need to check the rest also a->eor == b->eor */ + if (a->eor) + return 0; + if (a->aspath != b->aspath) return (a->aspath > b->aspath ? 
1 : -1); if (a->communities != b->communities) @@ -887,7 +881,7 @@ prefix_cmp(struct prefix *a, struct pref return (a->nexthop > b->nexthop ? 1 : -1); if (a->nhflags != b->nhflags) return (a->nhflags > b->nhflags ? 1 : -1); - return rib_compare(a->re, b->re); + return pt_prefix_cmp(a->pt, b->pt); } RB_GENERATE(prefix_tree, prefix, entry, prefix_cmp) @@ -946,6 +940,7 @@ prefix_move(struct prefix *p, struct rde np->aspath = path_ref(asp); np->communities = communities_get(comm); np->peer = peer; + np->pt = p->pt; /* skip refcnt update since ref is moved */ np->re = p->re; np->validation_state = vstate; np->nhflags = nhflags; @@ -984,6 +979,7 @@ prefix_move(struct prefix *p, struct rde p->aspath = NULL; p->peer = NULL; p->re = NULL; + p->pt = NULL; prefix_free(p); /* destroy old path if empty */ @@ -1296,6 +1292,7 @@ prefix_link(struct prefix *p, struct rib p->aspath = path_ref(asp); p->communities = communities_get(comm); p->peer = peer; + p->pt = pt_ref(re->prefix); p->re = re; p->validation_state = vstate; p->nhflags = nhflags; @@ -1328,11 +1325,14 @@ prefix_unlink(struct prefix *p) communities_put(p->communities); if (p->aspath) path_unref(p->aspath); + if (pt_unref(p->pt)) + pt_remove(p->pt); p->communities = NULL; p->nexthop = NULL; p->aspath = NULL; p->peer = NULL; p->re = NULL; + p->pt = NULL; if (rib_empty(re)) rib_remove(re); Index: rde_update.c =================================================================== RCS file: /cvs/src/usr.sbin/bgpd/rde_update.c,v retrieving revision 1.117 diff -u -p -r1.117 rde_update.c --- rde_update.c 19 Jun 2019 08:15:07 -0000 1.117 +++ rde_update.c 20 Jun 2019 13:07:45 -0000 @@ -70,7 +70,7 @@ up_test_update(struct rde_peer *peer, st if (asp->flags & F_ATTR_LOOP) fatalx("try to send out a looped path"); - pt_getaddr(p->re->prefix, &addr); + pt_getaddr(p->pt, &addr); if (peer->capa.mp[addr.aid] == 0) return (-1); @@ -142,9 +142,9 @@ withdraw: return; /* withdraw prefix */ - pt_getaddr(old->re->prefix, &addr); + 
pt_getaddr(old->pt, &addr); if (prefix_withdraw(&ribs[RIB_ADJ_OUT].rib, peer, &addr, - old->re->prefix->prefixlen) == 1) + old->pt->prefixlen) == 1) peer->up_wcnt++; } else { switch (up_test_update(peer, new)) { @@ -164,12 +164,12 @@ withdraw: goto withdraw; } - pt_getaddr(new->re->prefix, &addr); + pt_getaddr(new->pt, &addr); if (path_update(&ribs[RIB_ADJ_OUT].rib, peer, &state, &addr, - new->re->prefix->prefixlen, prefix_vstate(new)) != 2) { + new->pt->prefixlen, prefix_vstate(new)) != 2) { /* only send update if path changed */ prefix_update(&ribs[RIB_ADJ_OUT].rib, peer, &addr, - new->re->prefix->prefixlen); + new->pt->prefixlen); peer->up_nlricnt++; } @@ -598,9 +598,9 @@ up_dump_prefix(u_char *buf, int len, str int r, wpos = 0, done = 0; RB_FOREACH_SAFE(p, prefix_tree, prefix_head, np) { - pt_getaddr(p->re->prefix, &addr); + pt_getaddr(p->pt, &addr); if ((r = prefix_write(buf + wpos, len - wpos, - &addr, p->re->prefix->prefixlen, withdraw)) == -1) + &addr, p->pt->prefixlen, withdraw)) == -1) break; wpos += r;