The Adj-RIB-Out is a bottleneck on route servers with many peers. The
problem is that all outgoing prefixes are part of a linear list in struct
rib_entry. Solving this requires that the Adj-RIB-Out no longer uses the
rib_entry pointer.

So the first step is to add a pointer to the prefix table entry (the
actual IP prefix) directly into struct prefix.

This diff implements this and replaces all p->re->prefix with direct
p->pt references. Most of it is mechanical. I also adjusted pt_ref() and
pt_unref() slightly to simplify the code, and prefix_cmp() now calls
pt_prefix_cmp() directly and no longer goes via rib_compare().

OK?
-- 
:wq Claudio

Index: mrt.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/mrt.c,v
retrieving revision 1.94
diff -u -p -r1.94 mrt.c
--- mrt.c       7 Mar 2019 07:42:36 -0000       1.94
+++ mrt.c       10 Mar 2019 11:32:47 -0000
@@ -290,7 +290,7 @@ mrt_dump_entry_mp(struct mrt *mrt, struc
        DUMP_SHORT(h2buf, /* ifindex */ 0);
 
        /* XXX is this for peer self? */
-       aid = peer->remote_addr.aid == AID_UNSPEC ? p->re->prefix->aid :
+       aid = peer->remote_addr.aid == AID_UNSPEC ? p->pt->aid :
             peer->remote_addr.aid;
        switch (aid) {
        case AID_INET:
@@ -317,7 +317,7 @@ mrt_dump_entry_mp(struct mrt *mrt, struc
        DUMP_SHORT(h2buf, 1);           /* status */
        DUMP_LONG(h2buf, p->lastchange);        /* originated */
 
-       pt_getaddr(p->re->prefix, &addr);
+       pt_getaddr(p->pt, &addr);
 
        n = prefix_nexthop(p);
        if (n == NULL) {
@@ -348,7 +348,7 @@ mrt_dump_entry_mp(struct mrt *mrt, struc
                goto fail;
        }
 
-       if (prefix_writebuf(h2buf, &addr, p->re->prefix->prefixlen) == -1) {
+       if (prefix_writebuf(h2buf, &addr, p->pt->prefixlen) == -1) {
                log_warn("mrt_dump_entry_mp: prefix_writebuf error");
                goto fail;
        }
@@ -384,8 +384,8 @@ mrt_dump_entry(struct mrt *mrt, struct p
        u_int16_t        subtype;
        u_int8_t         dummy;
 
-       if (p->re->prefix->aid != peer->remote_addr.aid &&
-           p->re->prefix->aid != AID_INET && p->re->prefix->aid != AID_INET6)
+       if (p->pt->aid != peer->remote_addr.aid &&
+           p->pt->aid != AID_INET && p->pt->aid != AID_INET6)
                /* only able to dump pure IPv4/IPv6 */
                return (0);
 
@@ -397,7 +397,7 @@ mrt_dump_entry(struct mrt *mrt, struct p
        nexthop = prefix_nexthop(p);
        if (nexthop == NULL) {
                bzero(&addr, sizeof(struct bgpd_addr));
-               addr.aid = p->re->prefix->aid;
+               addr.aid = p->pt->aid;
                nh = &addr;
        } else
                nh = &nexthop->exit_nexthop;
@@ -407,7 +407,7 @@ mrt_dump_entry(struct mrt *mrt, struct p
                return (-1);
        }
        len = ibuf_size(buf);
-       aid2afi(p->re->prefix->aid, &subtype, &dummy);
+       aid2afi(p->pt->aid, &subtype, &dummy);
        if (mrt_dump_hdr_rde(&hbuf, MSG_TABLE_DUMP, subtype, len) == -1) {
                ibuf_free(buf);
                return (-1);
@@ -416,8 +416,8 @@ mrt_dump_entry(struct mrt *mrt, struct p
        DUMP_SHORT(hbuf, 0);
        DUMP_SHORT(hbuf, snum);
 
-       pt_getaddr(p->re->prefix, &addr);
-       switch (p->re->prefix->aid) {
+       pt_getaddr(p->pt, &addr);
+       switch (p->pt->aid) {
        case AID_INET:
                DUMP_NLONG(hbuf, addr.v4.s_addr);
                break;
@@ -428,11 +428,11 @@ mrt_dump_entry(struct mrt *mrt, struct p
                }
                break;
        }
-       DUMP_BYTE(hbuf, p->re->prefix->prefixlen);
+       DUMP_BYTE(hbuf, p->pt->prefixlen);
 
        DUMP_BYTE(hbuf, 1);             /* state */
        DUMP_LONG(hbuf, p->lastchange); /* originated */
-       switch (p->re->prefix->aid) {
+       switch (p->pt->aid) {
        case AID_INET:
                DUMP_NLONG(hbuf, peer->remote_addr.v4.s_addr);
                break;
Index: rde.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde.c,v
retrieving revision 1.471
diff -u -p -r1.471 rde.c
--- rde.c       20 Jun 2019 13:18:19 -0000      1.471
+++ rde.c       20 Jun 2019 13:22:20 -0000
@@ -2234,11 +2234,11 @@ rde_dump_rib_as(struct prefix *p, struct
                /* announced network may have a NULL nexthop */
                bzero(&rib.true_nexthop, sizeof(rib.true_nexthop));
                bzero(&rib.exit_nexthop, sizeof(rib.exit_nexthop));
-               rib.true_nexthop.aid = p->re->prefix->aid;
-               rib.exit_nexthop.aid = p->re->prefix->aid;
+               rib.true_nexthop.aid = p->pt->aid;
+               rib.exit_nexthop.aid = p->pt->aid;
        }
-       pt_getaddr(p->re->prefix, &rib.prefix);
-       rib.prefixlen = p->re->prefix->prefixlen;
+       pt_getaddr(p->pt, &rib.prefix);
+       rib.prefixlen = p->pt->prefixlen;
        rib.origin = asp->origin;
        rib.validation_state = p->validation_state;
        rib.flags = 0;
@@ -2254,7 +2254,7 @@ rde_dump_rib_as(struct prefix *p, struct
                rib.flags &= ~F_PREF_ELIGIBLE;
        if (asp->flags & F_ATTR_PARSE_ERR)
                rib.flags |= F_PREF_INVALID;
-       staletime = prefix_peer(p)->staletime[p->re->prefix->aid];
+       staletime = prefix_peer(p)->staletime[p->pt->aid];
        if (staletime && p->lastchange <= staletime)
                rib.flags |= F_PREF_STALE;
        rib.aspath_len = aspath_length(asp->aspath);
@@ -2610,10 +2610,10 @@ rde_send_kroute(struct rib *rib, struct 
        }
 
        asp = prefix_aspath(p);
-       pt_getaddr(p->re->prefix, &addr);
+       pt_getaddr(p->pt, &addr);
        bzero(&kr, sizeof(kr));
        memcpy(&kr.prefix, &addr, sizeof(kr.prefix));
-       kr.prefixlen = p->re->prefix->prefixlen;
+       kr.prefixlen = p->pt->prefixlen;
        if (prefix_nhflags(p) == NEXTHOP_REJECT)
                kr.flags |= F_REJECT;
        if (prefix_nhflags(p) == NEXTHOP_BLACKHOLE)
@@ -3891,7 +3891,7 @@ network_dump_upcall(struct rib_entry *re
                asp = prefix_aspath(p);
                if (!(asp->flags & F_PREFIX_ANNOUNCED))
                        continue;
-               pt_getaddr(p->re->prefix, &addr);
+               pt_getaddr(p->pt, &addr);
 
                bzero(&k, sizeof(k));
                memcpy(&k.prefix, &addr, sizeof(k.prefix));
@@ -3901,7 +3901,7 @@ network_dump_upcall(struct rib_entry *re
                else
                        memcpy(&k.nexthop, &prefix_nexthop(p)->true_nexthop,
                            sizeof(k.nexthop));
-               k.prefixlen = p->re->prefix->prefixlen;
+               k.prefixlen = p->pt->prefixlen;
                k.flags = F_KERNEL;
                if ((asp->flags & F_ANN_DYNAMIC) == 0)
                        k.flags = F_STATIC;
Index: rde.h
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde.h,v
retrieving revision 1.216
diff -u -p -r1.216 rde.h
--- rde.h       20 Jun 2019 13:38:21 -0000      1.216
+++ rde.h       20 Jun 2019 14:42:17 -0000
@@ -308,6 +308,7 @@ struct pt_entry_vpn6 {
 struct prefix {
        LIST_ENTRY(prefix)               rib_l, nexthop_l;
        RB_ENTRY(prefix)                 entry;
+       struct pt_entry                 *pt;
        struct rib_entry                *re;
        struct rde_aspath               *aspath;
        struct rde_community            *communities;
@@ -464,20 +465,22 @@ pt_empty(struct pt_entry *pt)
        return (pt->refcnt == 0);
 }
 
-static inline void
+static inline struct pt_entry *
 pt_ref(struct pt_entry *pt)
 {
        ++pt->refcnt;
        if (pt->refcnt == 0)
                fatalx("pt_ref: overflow");
+       return pt;
 }
 
-static inline void
+static inline int
 pt_unref(struct pt_entry *pt)
 {
        if (pt->refcnt == 0)
                fatalx("pt_unref: underflow");
        --pt->refcnt;
+       return pt_empty(pt);
 }
 
 void    pt_init(void);
Index: rde_filter.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde_filter.c,v
retrieving revision 1.119
diff -u -p -r1.119 rde_filter.c
--- rde_filter.c        17 Jun 2019 11:02:19 -0000      1.119
+++ rde_filter.c        17 Jun 2019 11:35:39 -0000
@@ -248,8 +248,8 @@ rde_filter_match(struct filter_rule *f, 
                struct bgpd_addr addr, *prefix = &addr;
                u_int8_t plen;
 
-               pt_getaddr(p->re->prefix, prefix);
-               plen = p->re->prefix->prefixlen;
+               pt_getaddr(p->pt, prefix);
+               plen = p->pt->prefixlen;
                if (trie_roa_check(&f->match.originset.ps->th, prefix, plen,
                    aspath_origin(asp->aspath)) != ROA_VALID)
                        return (0);
@@ -262,8 +262,8 @@ rde_filter_match(struct filter_rule *f, 
                struct bgpd_addr addr, *prefix = &addr;
                u_int8_t plen;
 
-               pt_getaddr(p->re->prefix, prefix);
-               plen = p->re->prefix->prefixlen;
+               pt_getaddr(p->pt, prefix);
+               plen = p->pt->prefixlen;
                if (f->match.prefixset.ps == NULL ||
                    !trie_match(&f->match.prefixset.ps->th, prefix, plen,
                    (f->match.prefixset.flags & PREFIXSET_FLAG_LONGER)))
@@ -282,8 +282,8 @@ rde_prefix_match(struct filter_prefix *f
        struct bgpd_addr addr, *prefix = &addr;
        u_int8_t plen;
 
-       pt_getaddr(p->re->prefix, prefix);
-       plen = p->re->prefix->prefixlen;
+       pt_getaddr(p->pt, prefix);
+       plen = p->pt->prefixlen;
 
        if (fp->addr.aid != prefix->aid)
                /* don't use IPv4 rules for IPv6 and vice versa */
@@ -784,7 +784,7 @@ rde_filter(struct filter_head *rules, st
                     f->skip[RDE_FILTER_SKIP_PEERID]);
 
                if (rde_filter_match(f, peer, state, p)) {
-                       rde_apply_set(&f->set, state, p->re->prefix->aid,
+                       rde_apply_set(&f->set, state, p->pt->aid,
                            prefix_peer(p), peer);
                        if (f->action != ACTION_NONE)
                                action = f->action;
Index: rde_rib.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde_rib.c,v
retrieving revision 1.194
diff -u -p -r1.194 rde_rib.c
--- rde_rib.c   20 Jun 2019 19:32:16 -0000      1.194
+++ rde_rib.c   21 Jun 2019 08:32:31 -0000
@@ -103,13 +103,6 @@ rib_tree(struct rib *rib)
 static inline int
 rib_compare(const struct rib_entry *a, const struct rib_entry *b)
 {
-       /* need to handle NULL entries because of EoR marker */
-       if (a == NULL && b == NULL)
-               return (0);
-       else if (b == NULL)
-               return (1);
-       else if (a == NULL)
-               return (-1);
        return (pt_prefix_cmp(a->prefix, b->prefix));
 }
 
@@ -207,10 +200,10 @@ rib_free(struct rib *rib)
                        if (asp && asp->pftableid) {
                                struct bgpd_addr addr;
 
-                               pt_getaddr(p->re->prefix, &addr);
+                               pt_getaddr(p->pt, &addr);
                                /* Commit is done in peer_down() */
                                rde_send_pftable(asp->pftableid, &addr,
-                                   p->re->prefix->prefixlen, 1);
+                                   p->pt->prefixlen, 1);
                        }
                        prefix_destroy(p);
                        if (np == NULL)
@@ -308,7 +301,7 @@ rib_add(struct rib *rib, struct bgpd_add
                fatal("rib_add");
 
        LIST_INIT(&re->prefix_h);
-       re->prefix = pte;
+       re->prefix = pt_ref(pte);
        re->rib_id = rib->id;
 
        if (RB_INSERT(rib_tree, rib_tree(rib), re) != NULL) {
@@ -317,7 +310,6 @@ rib_add(struct rib *rib, struct bgpd_add
                return (NULL);
        }
 
-       pt_ref(pte);
 
        rdemem.rib_cnt++;
 
@@ -334,8 +326,7 @@ rib_remove(struct rib_entry *re)
                /* entry is locked, don't free it. */
                return;
 
-       pt_unref(re->prefix);
-       if (pt_empty(re->prefix))
+       if (pt_unref(re->prefix))
                pt_remove(re->prefix);
 
        if (RB_REMOVE(rib_tree, rib_tree(re_rib(re)), re) == NULL)
@@ -510,7 +501,6 @@ SIPHASH_KEY pathtablekey;
 
 #define        PATH_HASH(x)    &pathtable.path_hashtbl[x & pathtable.path_hashmask]
 
-
 static inline int
 path_empty(struct rde_aspath *asp)
 {
@@ -879,6 +869,10 @@ prefix_cmp(struct prefix *a, struct pref
 {
        if (a->eor != b->eor)
                return a->eor - b->eor;
+       /* if EOR marker no need to check the rest also a->eor == b->eor */
+       if (a->eor)
+               return 0;
+
        if (a->aspath != b->aspath)
                return (a->aspath > b->aspath ? 1 : -1);
        if (a->communities != b->communities)
@@ -887,7 +881,7 @@ prefix_cmp(struct prefix *a, struct pref
                return (a->nexthop > b->nexthop ? 1 : -1);
        if (a->nhflags != b->nhflags)
                return (a->nhflags > b->nhflags ? 1 : -1);
-       return rib_compare(a->re, b->re);
+       return pt_prefix_cmp(a->pt, b->pt);
 }
 
 RB_GENERATE(prefix_tree, prefix, entry, prefix_cmp)
@@ -946,6 +940,7 @@ prefix_move(struct prefix *p, struct rde
        np->aspath = path_ref(asp);
        np->communities = communities_get(comm);
        np->peer = peer;
+       np->pt = p->pt; /* skip refcnt update since ref is moved */
        np->re = p->re;
        np->validation_state = vstate;
        np->nhflags = nhflags;
@@ -984,6 +979,7 @@ prefix_move(struct prefix *p, struct rde
        p->aspath = NULL;
        p->peer = NULL;
        p->re = NULL;
+       p->pt = NULL;
        prefix_free(p);
 
        /* destroy old path if empty */
@@ -1296,6 +1292,7 @@ prefix_link(struct prefix *p, struct rib
        p->aspath = path_ref(asp);
        p->communities = communities_get(comm);
        p->peer = peer;
+       p->pt = pt_ref(re->prefix);
        p->re = re;
        p->validation_state = vstate;
        p->nhflags = nhflags;
@@ -1328,11 +1325,14 @@ prefix_unlink(struct prefix *p)
        communities_put(p->communities);
        if (p->aspath)
                path_unref(p->aspath);
+       if (pt_unref(p->pt))
+               pt_remove(p->pt);
        p->communities = NULL;
        p->nexthop = NULL;
        p->aspath = NULL;
        p->peer = NULL;
        p->re = NULL;
+       p->pt = NULL;
 
        if (rib_empty(re))
                rib_remove(re);
Index: rde_update.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde_update.c,v
retrieving revision 1.117
diff -u -p -r1.117 rde_update.c
--- rde_update.c        19 Jun 2019 08:15:07 -0000      1.117
+++ rde_update.c        20 Jun 2019 13:07:45 -0000
@@ -70,7 +70,7 @@ up_test_update(struct rde_peer *peer, st
        if (asp->flags & F_ATTR_LOOP)
                fatalx("try to send out a looped path");
 
-       pt_getaddr(p->re->prefix, &addr);
+       pt_getaddr(p->pt, &addr);
        if (peer->capa.mp[addr.aid] == 0)
                return (-1);
 
@@ -142,9 +142,9 @@ withdraw:
                        return;
 
                /* withdraw prefix */
-               pt_getaddr(old->re->prefix, &addr);
+               pt_getaddr(old->pt, &addr);
                if (prefix_withdraw(&ribs[RIB_ADJ_OUT].rib, peer, &addr,
-                   old->re->prefix->prefixlen) == 1)
+                   old->pt->prefixlen) == 1)
                        peer->up_wcnt++;
        } else {
                switch (up_test_update(peer, new)) {
@@ -164,12 +164,12 @@ withdraw:
                        goto withdraw;
                }
 
-               pt_getaddr(new->re->prefix, &addr);
+               pt_getaddr(new->pt, &addr);
                if (path_update(&ribs[RIB_ADJ_OUT].rib, peer, &state, &addr,
-                   new->re->prefix->prefixlen, prefix_vstate(new)) != 2) {
+                   new->pt->prefixlen, prefix_vstate(new)) != 2) {
                        /* only send update if path changed */
                        prefix_update(&ribs[RIB_ADJ_OUT].rib, peer, &addr,
-                           new->re->prefix->prefixlen);
+                           new->pt->prefixlen);
                        peer->up_nlricnt++;
                }
 
@@ -598,9 +598,9 @@ up_dump_prefix(u_char *buf, int len, str
        int              r, wpos = 0, done = 0;
 
        RB_FOREACH_SAFE(p, prefix_tree, prefix_head, np) {
-               pt_getaddr(p->re->prefix, &addr);
+               pt_getaddr(p->pt, &addr);
                if ((r = prefix_write(buf + wpos, len - wpos,
-                   &addr, p->re->prefix->prefixlen, withdraw)) == -1)
+                   &addr, p->pt->prefixlen, withdraw)) == -1)
                        break;
                wpos += r;
 

Reply via email to