On 7/23/25 3:12 PM, Dima Chumak via dev wrote:
> Introduce support for route lookup across several routing tables
> following a priority from highest to lowest.
> 
> To properly prioritize route lookup order, the single router table in
> OVS is now split into three tables: local, main and default.
> Corresponding routing rules are created by default, where local table
> has the highest priority and main and default tables have two lowest
> priorities respectively. All non-standard tables can be added with a
> priority in between local and main.
> 
> Non-standard routing tables are created by reading the Routing Policy

Why only the non-standard tables?

> Database (RPDB) from the kernel, on systems that support it, and
> importing only those tables which are referenced in the RPDB rules with
> a table lookup action. The table IDs and rule priority are copied from
> the kernel RPDB as is.
> 
> Current implementation only supports RPDB rules with a source address
> selector, in form of '[not] from IP' match.
> 
> Signed-off-by: Dima Chumak <dchu...@nvidia.com>
> ---
>  lib/ovs-router.c             | 231 +++++++++++++++++++++++++++++------
>  lib/ovs-router.h             |   7 ++
>  lib/packets.c                |  20 +++
>  lib/packets.h                |   2 +
>  lib/route-table-bsd.c        |   4 +-
>  lib/route-table-stub.c       |   4 +-
>  lib/route-table.c            | 225 ++++++++++++++++++++++++++++++++--
>  lib/route-table.h            |  18 ++-
>  tests/test-lib-route-table.c |   5 +-
>  9 files changed, 465 insertions(+), 51 deletions(-)
> 
> diff --git a/lib/ovs-router.c b/lib/ovs-router.c
> index 02dd94fd18c0..2addc7b43d07 100644
> --- a/lib/ovs-router.c
> +++ b/lib/ovs-router.c
> @@ -42,6 +42,7 @@
>  #include "seq.h"
>  #include "ovs-thread.h"
>  #include "route-table.h"
> +#include "pvector.h"
>  #include "tnl-ports.h"
>  #include "unixctl.h"
>  #include "util.h"
> @@ -56,11 +57,22 @@ struct clsmap_node {
>      uint32_t table;
>  };
>  
> +struct router_rule {
> +    uint32_t prio;
> +    bool invert;
> +    uint8_t src_len;
> +    struct in6_addr from_addr;
> +    uint32_t lookup_table;
> +};
> +
>  static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
>  
>  static struct ovs_mutex mutex = OVS_MUTEX_INITIALIZER;
>  static struct hmap clsmap OVS_GUARDED_BY(mutex) = HMAP_INITIALIZER(&clsmap);
> +static struct classifier cls_local;
>  static struct classifier cls_main;
> +static struct classifier cls_default;
> +static struct pvector rules;
>  
>  /* By default, use the system routing table.  For system-independent testing,
>   * the unit tests disable using the system routing table. */
> @@ -87,7 +99,16 @@ cls_find(uint32_t table)
>      struct clsmap_node *node;
>  
>      if (route_table_is_standard_id(table)) {
> -        return &cls_main;
> +        switch (table) {
> +        case CLS_DEFAULT:
> +            return &cls_default;
> +        case CLS_MAIN:
> +            return &cls_main;
> +        case CLS_LOCAL:
> +            return &cls_local;
> +        default:
> +            OVS_NOT_REACHED();
> +        }
>      }

Not sure why we need to special-case these.  See the comments for the previous 
patch.

>  
>      ovs_mutex_lock(&mutex);
> @@ -131,6 +152,14 @@ cls_destroy(struct classifier *cls, bool flush_all)
>      classifier_publish(cls);
>  }
>  
> +bool
> +ovs_router_is_empty(uint32_t table)
> +{
> +    struct classifier *cls = cls_find(table);
> +
> +    return !cls || !cls->n_rules;
> +}
> +
>  static struct ovs_router_entry *
>  ovs_router_entry_cast(const struct cls_rule *cr)
>  {
> @@ -170,26 +199,63 @@ ovs_router_lookup(uint32_t mark, const struct in6_addr 
> *ip6_dst,
>                    char output_netdev[],
>                    struct in6_addr *src, struct in6_addr *gw)
>  {
> -    const struct cls_rule *cr;
>      struct flow flow = {.ipv6_dst = *ip6_dst, .pkt_mark = mark};
> +    const struct in6_addr *from_src = src;
> +    const struct cls_rule *cr;
> +    struct router_rule *rule;
>  
>      if (src && ipv6_addr_is_set(src)) {
> -        const struct cls_rule *cr_src;
> +        struct classifier *src_cls[] = { &cls_local, &cls_main, &cls_default 
> };

Why would a local route be in a non-local classifier?
We should also, likely, drop the 'local' field from the structure,
as it doesn't carry a lot of meaning now that tables are separated.

>          struct flow flow_src = {.ipv6_dst = *src, .pkt_mark = mark};
> +        struct ovs_router_entry *p_src;
> +        const struct cls_rule *cr_src;
> +        bool local = false;
>  
> -        cr_src = classifier_lookup(&cls_main, OVS_VERSION_MAX, &flow_src,
> -                                   NULL, NULL);
> -        if (cr_src) {
> -            struct ovs_router_entry *p_src = ovs_router_entry_cast(cr_src);
> -            if (!p_src->local) {
> -                return false;
> +        for (int i = 0; i < ARRAY_SIZE(src_cls); i++) {
> +            cr_src = classifier_lookup(src_cls[i], OVS_VERSION_MAX, 
> &flow_src,
> +                                       NULL, NULL);
> +            if (!cr_src) {
> +                continue;
>              }
> -        } else {
> +            p_src = ovs_router_entry_cast(cr_src);
> +            local = p_src->local;
> +            if (local) {
> +                break;
> +            }
> +        }
> +        if (!local) {
>              return false;
>          }
>      }
>  
> -    cr = classifier_lookup(&cls_main, OVS_VERSION_MAX, &flow, NULL, NULL);
> +    if (!from_src) {
> +        from_src = &in6addr_any;
> +    }
> +
> +    PVECTOR_FOR_EACH (rule, &rules) {
> +        bool matched = !!(!rule->src_len ||
> +                          ipv6_addr_equals_masked(&rule->from_addr,
> +                                                  from_src, rule->src_len));
> +
> +        if (rule->invert) {
> +            matched = !matched;
> +        }
> +
> +        if (matched) {
> +            struct classifier *cls = cls_find(rule->lookup_table);
> +
> +            if (!cls) {
> +                /* A rule can be added before the table is created */
> +                continue;
> +            }
> +            cr = classifier_lookup(cls, OVS_VERSION_MAX, &flow, NULL,
> +                                    NULL);
> +            if (cr) {
> +                break;
> +            }
> +        }
> +    }
> +
>      if (cr) {
>          struct ovs_router_entry *p = ovs_router_entry_cast(cr);
>  
> @@ -644,37 +710,40 @@ out:
>  static void
>  ovs_router_show_text(struct ds *ds)
>  {
> +    struct classifier *std_cls[] = { &cls_local, &cls_main, &cls_default };
>      struct ovs_router_entry *rt;
>  
>      ds_put_format(ds, "Route Table:\n");
> -    CLS_FOR_EACH (rt, cr, &cls_main) {
> -        uint8_t plen;
> -        if (rt->priority == rt->plen || rt->local) {
> -            ds_put_format(ds, "Cached: ");
> -        } else {
> -            ds_put_format(ds, "User: ");
> -        }
> -        ipv6_format_mapped(&rt->nw_addr, ds);
> -        plen = rt->plen;
> -        if (IN6_IS_ADDR_V4MAPPED(&rt->nw_addr)) {
> -            plen -= 96;
> -        }
> -        ds_put_format(ds, "/%"PRIu8, plen);
> -        if (rt->mark) {
> -            ds_put_format(ds, " MARK %"PRIu32, rt->mark);
> -        }
> +    for (int i = 0; i < ARRAY_SIZE(std_cls); i++) {
> +        CLS_FOR_EACH (rt, cr, std_cls[i]) {
> +            uint8_t plen;
> +            if (rt->priority == rt->plen || rt->local) {
> +                ds_put_format(ds, "Cached: ");
> +            } else {
> +                ds_put_format(ds, "User: ");
> +            }
> +            ipv6_format_mapped(&rt->nw_addr, ds);
> +            plen = rt->plen;
> +            if (IN6_IS_ADDR_V4MAPPED(&rt->nw_addr)) {
> +                plen -= 96;
> +            }
> +            ds_put_format(ds, "/%"PRIu8, plen);
> +            if (rt->mark) {
> +                ds_put_format(ds, " MARK %"PRIu32, rt->mark);
> +            }
>  
> -        ds_put_format(ds, " dev %s", rt->output_netdev);
> -        if (ipv6_addr_is_set(&rt->gw)) {
> -            ds_put_format(ds, " GW ");
> -            ipv6_format_mapped(&rt->gw, ds);
> -        }
> -        ds_put_format(ds, " SRC ");
> -        ipv6_format_mapped(&rt->src_addr, ds);
> -        if (rt->local) {
> -            ds_put_format(ds, " local");
> +            ds_put_format(ds, " dev %s", rt->output_netdev);
> +            if (ipv6_addr_is_set(&rt->gw)) {
> +                ds_put_format(ds, " GW ");
> +                ipv6_format_mapped(&rt->gw, ds);
> +            }
> +            ds_put_format(ds, " SRC ");
> +            ipv6_format_mapped(&rt->src_addr, ds);
> +            if (rt->local) {
> +                ds_put_format(ds, " local");
> +            }
> +            ds_put_format(ds, "\n");
>          }
> -        ds_put_format(ds, "\n");
>      }
>  }
>  
> @@ -747,22 +816,106 @@ ovs_router_flush(bool flush_all)
>              free(node);
>          }
>      }
> +    cls_destroy(&cls_local, flush_all);
>      cls_destroy(&cls_main, flush_all);
> +    cls_destroy(&cls_default, flush_all);
>      ovs_mutex_unlock(&mutex);
>      seq_change(tnl_conf_seq);
>  }
>  
> +static void
> +init_standard_rules(void)
> +{
> +    /* Add default rules using same priorities as Linux kernel does. */
> +    ovs_router_rule_add(0, false, 0, &in6addr_any, CLS_LOCAL);
> +    ovs_router_rule_add(0x7FFE, false, 0, &in6addr_any, CLS_MAIN);
> +    ovs_router_rule_add(0x7FFF, false, 0, &in6addr_any, CLS_DEFAULT);
> +}
> +
> +static void
> +rule_destroy_cb(struct router_rule *rule)
> +{
> +    ovsrcu_postpone(free, rule);
> +}
> +
> +void
> +ovs_router_rules_flush(void)

A thread safety annotation would be good here.

> +{
> +    struct router_rule *rule;
> +
> +    PVECTOR_FOR_EACH (rule, &rules) {
> +        pvector_remove(&rules, rule);
> +        ovsrcu_postpone(rule_destroy_cb, rule);
> +    }
> +    pvector_publish(&rules);
> +    init_standard_rules();
> +}
> +
>  static void
>  ovs_router_flush_handler(void *aux OVS_UNUSED)
>  {
> +    ovs_router_rules_flush();
>      ovs_router_flush(true);
>  
>      ovs_mutex_lock(&mutex);
> +    pvector_destroy(&rules);
>      hmap_destroy(&clsmap);
>      hmap_init(&clsmap);
>      ovs_mutex_unlock(&mutex);
>  }
>  
> +bool
> +ovs_router_is_referenced(uint32_t table)
> +{
> +    struct router_rule *rule;
> +
> +    if (route_table_is_standard_id(table)) {
> +        return true;
> +    }
> +
> +    PVECTOR_FOR_EACH (rule, &rules) {
> +        if (rule->lookup_table == table) {
> +            return true;
> +        }
> +    }

A lookup in the hash map may be better here instead of iteration.

> +    return false;
> +}
> +
> +static int
> +rule_pvec_prio(uint32_t prio)
> +{
> +    /* Invert the priority of a pvector entry to reverse the default sorting
> +     * order (descending) to maintain the standard rules semantic where 0 is
> +     * the highest priority and UINT_MAX is the lowest. The mapping is the
> +     * following:
> +     *
> +     *     0        -> INT_MAX
> +     *     INT_MAX  -> 0
> +     *     UINT_MAX -> INT_MIN
> +     */
> +    if (prio <= INT_MAX) {
> +        return -(INT_MIN + (int) prio + 1);
> +    } else {
> +        return -(INT_MIN + INT_MAX + (int) (prio - INT_MAX - 1) + 1) - 1;

The math seems fishy here, especially the INT_MIN + INT_MAX part that
is just -1.  Unless I'm missing something, this should be exquivalent
to -(int) (prio - INT_MAX - 1) - 1, as INT_MIN + INT_MAX + 1 is just 0.

This also assumes that int is 4 bytes long, which is not always the case.

Alternatively we may just ignore rules above INT_MAX with a WARN_ONCE(),
as it is unlikely that we'll have more rules above UNIT16_MAX in most
cases, or convert pvector to use int64_t.

> +    }
> +}
> +
> +void
> +ovs_router_rule_add(uint32_t prio, bool invert, uint8_t src_len,
> +                    const struct in6_addr *from, uint32_t lookup_table)
> +{
> +    struct router_rule *rule = xzalloc(sizeof *rule);
> +
> +    rule->prio = prio;
> +    rule->invert = invert;
> +    rule->src_len = src_len;
> +    rule->from_addr = *from;
> +    rule->lookup_table = lookup_table;
> +
> +    pvector_insert(&rules, rule, rule_pvec_prio(prio));
> +    pvector_publish(&rules);
> +}
> +
>  void
>  ovs_router_init(void)
>  {
> @@ -771,7 +924,11 @@ ovs_router_init(void)
>      if (ovsthread_once_start(&once)) {
>          ovs_mutex_lock(&mutex);
>          hmap_init(&clsmap);
> +        classifier_init(&cls_local, NULL);
>          classifier_init(&cls_main, NULL);
> +        classifier_init(&cls_default, NULL);
> +        pvector_init(&rules);
> +        init_standard_rules();
>          ovs_mutex_unlock(&mutex);
>          fatal_signal_add_hook(ovs_router_flush_handler, NULL, NULL, true);
>          unixctl_command_register("ovs/route/add",
> diff --git a/lib/ovs-router.h b/lib/ovs-router.h
> index e2d995f3a8dc..a6e100e2f79d 100644
> --- a/lib/ovs-router.h
> +++ b/lib/ovs-router.h
> @@ -27,7 +27,9 @@ extern "C" {
>  #endif
>  
>  enum {
> +    CLS_DEFAULT = 253, /* RT_TABLE_DEFAULT */
>      CLS_MAIN = 254,    /* RT_TABLE_MAIN */
> +    CLS_LOCAL = 255,   /* RT_TABLE_LOCAL */

We need build assertions for these in route-table.c.  And then we
can remove these comments.

>      CLS_ALL = UINT_MAX,

UINT32_MAX

>  };
>  
> @@ -35,6 +37,8 @@ bool ovs_router_lookup(uint32_t mark, const struct in6_addr 
> *ip_dst,
>                         char output_netdev[],
>                         struct in6_addr *src, struct in6_addr *gw);
>  void ovs_router_init(void);
> +bool ovs_router_is_empty(uint32_t table);
> +bool ovs_router_is_referenced(uint32_t table);
>  void ovs_router_insert(uint32_t table, uint32_t mark,
>                         const struct in6_addr *ip_dst,
>                         uint8_t plen, bool local,
> @@ -46,7 +50,10 @@ void ovs_router_force_insert(uint32_t table, uint32_t mark,
>                               const char output_netdev[],
>                               const struct in6_addr *gw,
>                               const struct in6_addr *prefsrc);
> +void ovs_router_rule_add(uint32_t prio, bool invert, uint8_t src_len,
> +                         const struct in6_addr *from, uint32_t lookup_table);
>  void ovs_router_flush(bool flush_all);
> +void ovs_router_rules_flush(void);
>  
>  void ovs_router_disable_system_routing_table(void);
>  
> diff --git a/lib/packets.c b/lib/packets.c
> index a0bb2ad482f1..1bb1f3f779fa 100644
> --- a/lib/packets.c
> +++ b/lib/packets.c
> @@ -1081,6 +1081,26 @@ ipv6_is_cidr(const struct in6_addr *netmask)
>      return true;
>  }
>  
> +bool
> +ipv6_addr_equals_masked(const struct in6_addr *a, const struct in6_addr *b,
> +                        int m)
> +{
> +    struct in6_addr mask;
> +    struct in6_addr ma;
> +    struct in6_addr mb;
> +
> +    if (((IN6_IS_ADDR_V4MAPPED(a) || IN6_IS_ADDR_V4MAPPED(b)) && m == 32) ||
> +        m == 128) {
> +        return ipv6_addr_equals(a, b);
> +    }
> +
> +    mask = ipv6_create_mask(m);
> +    ma = ipv6_addr_bitand(a, &mask);
> +    mb = ipv6_addr_bitand(b, &mask);
> +
> +    return ipv6_addr_equals(&ma, &mb);
> +}
> +
>  /* Populates 'b' with an Ethernet II packet headed with the given 'eth_dst',
>   * 'eth_src' and 'eth_type' parameters.  A payload of 'size' bytes is 
> allocated
>   * in 'b' and returned.  This payload may be populated with appropriate
> diff --git a/lib/packets.h b/lib/packets.h
> index 6eba07700a69..6e263d81c6e4 100644
> --- a/lib/packets.h
> +++ b/lib/packets.h
> @@ -1608,6 +1608,8 @@ bool ipv6_is_zero(const struct in6_addr *a);
>  struct in6_addr ipv6_create_mask(int mask);
>  int ipv6_count_cidr_bits(const struct in6_addr *netmask);
>  bool ipv6_is_cidr(const struct in6_addr *netmask);
> +bool ipv6_addr_equals_masked(const struct in6_addr *a,
> +                             const struct in6_addr *b, int m);

Should expand the 'm' into something more meaningful, e.g. 'mask'
or 'plen'.

>  
>  bool ipv6_parse(const char *s, struct in6_addr *ip);
>  char *ipv6_parse_masked(const char *s, struct in6_addr *ipv6,
> diff --git a/lib/route-table-bsd.c b/lib/route-table-bsd.c
> index 6488df2637c8..ce6eb8771ac8 100644
> --- a/lib/route-table-bsd.c
> +++ b/lib/route-table-bsd.c
> @@ -47,7 +47,9 @@ bool
>  route_table_is_standard_id(uint32_t table_id)
>  {
>      return !table_id
> -           || table_id == CLS_MAIN;
> +           || table_id == CLS_DEFAULT
> +           || table_id == CLS_MAIN
> +           || table_id == CLS_LOCAL;
>  }
>  
>  bool
> diff --git a/lib/route-table-stub.c b/lib/route-table-stub.c
> index 5bf00e820796..761dea3c4699 100644
> --- a/lib/route-table-stub.c
> +++ b/lib/route-table-stub.c
> @@ -22,7 +22,9 @@ bool
>  route_table_is_standard_id(uint32_t table_id)
>  {
>      return !table_id
> -           || table_id == CLS_MAIN;
> +           || table_id == CLS_DEFAULT
> +           || table_id == CLS_MAIN
> +           || table_id == CLS_LOCAL;
>  }
>  
>  bool
> diff --git a/lib/route-table.c b/lib/route-table.c
> index da7b276544e5..85ded4716d7c 100644
> --- a/lib/route-table.c
> +++ b/lib/route-table.c
> @@ -23,6 +23,7 @@
>  #include <netinet/in.h>
>  #include <arpa/inet.h>
>  #include <sys/socket.h>
> +#include <linux/fib_rules.h>
>  #include <linux/rtnetlink.h>
>  #include <net/if.h>
>  
> @@ -59,13 +60,23 @@ static struct nln *nln = NULL;
>  static struct route_table_msg nln_rtmsg_change;
>  static struct nln_notifier *route_notifier = NULL;
>  static struct nln_notifier *route6_notifier = NULL;
> +static struct nln_notifier *rule_notifier = NULL;
> +static struct nln_notifier *rule6_notifier = NULL;
>  static struct nln_notifier *name_notifier = NULL;
>  
>  static bool route_table_valid = false;
> +static bool rules_valid = false;
> +
> +static int route_nln_parse(struct ofpbuf *, void *change);
> +
> +static void rule_handle_msg(const struct route_table_msg *);
> +static int rule_parse(struct ofpbuf *, void *change);
>  
>  static void route_table_reset(void);
> -static void route_table_handle_msg(const struct route_table_msg *, void 
> *aux);
> +static void route_table_handle_msg(const struct route_table_msg *, void *aux,
> +                                   uint32_t table);
>  static void route_table_change(struct route_table_msg *, void *aux);
> +static void rules_change(const struct route_table_msg *, void *aux);
>  static void route_map_clear(void);
>  
>  static void name_table_init(void);
> @@ -105,9 +116,11 @@ route_table_init(void)
>      ovs_assert(!nln);
>      ovs_assert(!route_notifier);
>      ovs_assert(!route6_notifier);
> +    ovs_assert(!rule_notifier);
> +    ovs_assert(!rule6_notifier);
>  
>      ovs_router_init();
> -    nln = nln_create(NETLINK_ROUTE, route_table_parse, &nln_rtmsg_change);
> +    nln = nln_create(NETLINK_ROUTE, route_nln_parse, &nln_rtmsg_change);
>  
>      route_notifier =
>          nln_notifier_create(nln, RTNLGRP_IPV4_ROUTE,
> @@ -116,6 +129,13 @@ route_table_init(void)
>          nln_notifier_create(nln, RTNLGRP_IPV6_ROUTE,
>                              (nln_notify_func *) route_table_change, NULL);
>  
> +    rule_notifier =
> +        nln_notifier_create(nln, RTNLGRP_IPV4_RULE,
> +                            (nln_notify_func *) rules_change, NULL);
> +    rule6_notifier =
> +        nln_notifier_create(nln, RTNLGRP_IPV6_RULE,
> +                            (nln_notify_func *) rules_change, NULL);
> +
>      route_table_reset();
>      name_table_init();
>  
> @@ -132,7 +152,7 @@ route_table_run(void)
>          rtnetlink_run();
>          nln_run(nln);
>  
> -        if (!route_table_valid) {
> +        if (!route_table_valid || !rules_valid) {
>              route_table_reset();
>          }
>      }
> @@ -191,7 +211,7 @@ route_table_dump_one_table(uint32_t id,
>              if (!(nlmsghdr->nlmsg_flags & NLM_F_DUMP_FILTERED)) {
>                  filtered = false;
>              }
> -            handle_msg_cb(&msg, aux);
> +            handle_msg_cb(&msg, aux, id);
>              route_data_destroy(&msg.rd);
>          }
>      }
> @@ -201,6 +221,36 @@ route_table_dump_one_table(uint32_t id,
>      return filtered;
>  }
>  
> +static void
> +rules_dump(void)
> +{
> +    uint64_t reply_stub[NL_DUMP_BUFSIZE / 8];
> +    struct ofpbuf request, reply, buf;
> +    struct fib_rule_hdr *rq_msg;
> +    struct nl_dump dump;
> +
> +    ofpbuf_init(&request, 0);
> +
> +    nl_msg_put_nlmsghdr(&request, sizeof *rq_msg, RTM_GETRULE, 
> NLM_F_REQUEST);
> +
> +    rq_msg = ofpbuf_put_zeros(&request, sizeof *rq_msg);
> +    rq_msg->family = AF_UNSPEC;
> +
> +    nl_dump_start(&dump, NETLINK_ROUTE, &request);
> +    ofpbuf_uninit(&request);
> +
> +    ofpbuf_use_stub(&buf, reply_stub, sizeof reply_stub);
> +    while (nl_dump_next(&dump, &reply, &buf)) {
> +        struct route_table_msg msg;
> +
> +        if (rule_parse(&reply, &msg)) {
> +            rule_handle_msg(&msg);
> +        }
> +    }
> +    ofpbuf_uninit(&buf);
> +    nl_dump_done(&dump);
> +}
> +
>  static void
>  route_table_reset(void)
>  {
> @@ -213,6 +263,7 @@ route_table_reset(void)
>      route_map_clear();
>      netdev_get_addrs_list_flush();
>      route_table_valid = true;
> +    rules_valid = true;
>      rt_change_seq++;
>  
>      COVERAGE_INC(route_table_dump);
> @@ -224,6 +275,156 @@ route_table_reset(void)
>              break;
>          }
>      }
> +    rules_dump();
> +}
> +
> +static void
> +rule_handle_msg(const struct route_table_msg *change)
> +{
> +    if (change->relevant) {
> +        const struct rule_data *rd = &change->rud;
> +
> +        if (!route_table_is_standard_id(rd->lookup_table)) {

Not sure why we need this check.

> +            route_table_dump_one_table(rd->lookup_table,
> +                                       route_table_handle_msg, NULL);
> +        }
> +        ovs_router_rule_add(rd->prio, rd->invert, rd->src_len, 
> &rd->from_addr,
> +                            rd->lookup_table);
> +    }
> +}
> +
> +static int route_nln_parse(struct ofpbuf *buf, void *change_)
> +{
> +    const struct nlmsghdr *nlmsg = buf->data;
> +
> +    if (nlmsg->nlmsg_type == RTM_NEWROUTE ||
> +        nlmsg->nlmsg_type == RTM_DELROUTE) {
> +        return route_table_parse(buf, change_);
> +    } else if (nlmsg->nlmsg_type == RTM_NEWRULE ||
> +               nlmsg->nlmsg_type == RTM_DELRULE) {
> +        return rule_parse(buf, change_);
> +    }
> +
> +    VLOG_DBG_RL(&rl, "received unsupported rtnetlink route message");
> +    return 0;
> +}
> +
> +/* Return RTNLGRP_IPV4_RULE or RTNLGRP_IPV6_RULE on success, 0 on parse
> + * error. */
> +static int
> +rule_parse(struct ofpbuf *buf, void *change_)
> +{
> +    struct route_table_msg *change = change_;
> +    bool parsed, ipv4 = false;
> +
> +    static const struct nl_policy policy[] = {
> +        [FRA_PRIORITY] = { .type = NL_A_U32, .optional = false  },
> +        [FRA_SRC] = { .type = NL_A_U32, .optional = true  },
> +        [FRA_TABLE] = { .type = NL_A_U32, .optional = true },
> +    };
> +
> +    static const struct nl_policy policy6[] = {
> +        [FRA_PRIORITY] = { .type = NL_A_U32, .optional = false  },

Too many spaces at the end here and above.

> +        [FRA_SRC] = { .type = NL_A_IPV6, .optional = true },
> +        [FRA_TABLE] = { .type = NL_A_U32, .optional = true },
> +    };
> +
> +    struct nlattr *attrs[ARRAY_SIZE(policy)];
> +    const struct fib_rule_hdr *frh;
> +
> +    frh = ofpbuf_at(buf, NLMSG_HDRLEN, sizeof *frh);
> +    if (frh->action != FR_ACT_TO_TBL || frh->tos || frh->dst_len) {

Should also check that frh is not NULL.

> +        /* unsupported rule */

Start with a capital, end with a dot.

> +        return 0;
> +    }
> +
> +    if (frh->family == AF_INET) {
> +        parsed = nl_policy_parse(buf, NLMSG_HDRLEN +
> +                                 sizeof(struct fib_rule_hdr), policy, attrs,
> +                                 ARRAY_SIZE(policy));
> +        ipv4 = true;
> +    } else if (frh->family == AF_INET6) {
> +        parsed = nl_policy_parse(buf, NLMSG_HDRLEN +
> +                                 sizeof(struct fib_rule_hdr), policy6, attrs,
> +                                 ARRAY_SIZE(policy6));
> +    } else {
> +        VLOG_DBG_RL(&rl, "received non AF_INET rtnetlink route message");
> +        return 0;
> +    }
> +
> +    if (parsed) {
> +        const struct nlmsghdr *nlmsg;
> +
> +        nlmsg = buf->data;
> +
> +        memset(change, 0, sizeof *change);
> +        change->relevant = true;
> +        change->nlmsg_type = nlmsg->nlmsg_type;
> +        change->rud.invert = false;
> +        change->rud.src_len = frh->src_len;
> +        change->rud.lookup_table = frh->table;
> +
> +        if (frh->flags & FIB_RULE_INVERT) {
> +            /* Invert the matching of rule selector */

Period at the end of the comment.

> +            change->rud.invert = true;
> +        }
> +
> +        if (attrs[FRA_PRIORITY]) {
> +            change->rud.prio = nl_attr_get_u32(attrs[FRA_PRIORITY]);
> +        }
> +
> +        if (attrs[FRA_SRC]) {
> +            if (ipv4) {
> +                ovs_be32 src;
> +                src = nl_attr_get_be32(attrs[FRA_SRC]);

Either add an empty line after the declatration, or combine these two lines.
The line might be good in both cases though.

> +                in6_addr_set_mapped_ipv4(&change->rud.from_addr, src);
> +            } else {
> +                change->rud.from_addr = nl_attr_get_in6_addr(attrs[FRA_SRC]);
> +            }
> +        } else if (ipv4) {
> +            in6_addr_set_mapped_ipv4(&change->rud.from_addr, 0);
> +        }
> +
> +        if (attrs[FRA_TABLE]) {
> +            change->rud.lookup_table = nl_attr_get_u32(attrs[FRA_TABLE]);
> +            if (route_table_is_standard_id(change->rud.lookup_table)) {

Not sure why these are not relevant.

> +                change->relevant = false;
> +            }
> +        } else {
> +            change->relevant = false;
> +        }
> +
> +        if (change->rud.invert && !change->rud.src_len) {
> +            change->relevant = false;
> +        }
> +    } else {
> +        VLOG_DBG_RL(&rl, "received unparseable rtnetlink rule message");
> +        return 0;
> +    }
> +
> +    /* Check if there are any additional attributes that aren't supported
> +     * currently by OVS rule-based route lookup. */
> +    if (change->relevant) {
> +        size_t offset = NLMSG_HDRLEN + sizeof(struct fib_rule_hdr);
> +        struct nlattr *nla;
> +        size_t left;
> +
> +        NL_ATTR_FOR_EACH (nla, left, ofpbuf_at(buf, offset, 0),
> +                          buf->size - offset) {
> +            uint16_t type = nl_attr_type(nla);
> +
> +            if ((type > FRA_SRC && type < FRA_PRIORITY) ||
> +                (type > FRA_PRIORITY && type < FRA_SUPPRESS_PREFIXLEN) ||
> +                (type > FRA_TABLE && type < FRA_PROTOCOL) ||
> +                type > FRA_PROTOCOL || type > FRA_MAX) {
> +                change->relevant = false;
> +                break;

Some of these values are relatively new.  We should have fallback
definitions for values above FRA_FLOW, in case they are not defined.
The type > FRA_MAX is a little pointless here as well.

We should also have a system test for this, i.e. add some rules that
OVS doesn't understand and make sure they are ignored.

> +            }
> +        }
> +    }
> +
> +    /* Success. */
> +    return ipv4 ? RTNLGRP_IPV4_RULE : RTNLGRP_IPV6_RULE;
>  }
>  
>  /* Returns true if the given route requires nexthop information (output
> @@ -533,7 +734,7 @@ route_table_change(struct route_table_msg *change, void 
> *aux OVS_UNUSED)
>  {
>      if (!change
>          || (change->relevant
> -            && route_table_is_standard_id(change->rd.rta_table_id))) {
> +            && ovs_router_is_referenced(change->rd.rta_table_id))) {
>          route_table_valid = false;
>      }
>      if (change) {
> @@ -543,7 +744,7 @@ route_table_change(struct route_table_msg *change, void 
> *aux OVS_UNUSED)
>  
>  static void
>  route_table_handle_msg(const struct route_table_msg *change,
> -                       void *aux OVS_UNUSED)
> +                       void *aux OVS_UNUSED, uint32_t table)
>  {
>      if (change->relevant && change->nlmsg_type == RTM_NEWROUTE
>              && !ovs_list_is_empty(&change->rd.nexthops)) {
> @@ -556,7 +757,7 @@ route_table_handle_msg(const struct route_table_msg 
> *change,
>          rdnh = CONTAINER_OF(ovs_list_front(&change->rd.nexthops),
>                              const struct route_data_nexthop, nexthop_node);
>  
> -        ovs_router_insert(CLS_MAIN, rd->rta_mark, &rd->rta_dst,
> +        ovs_router_insert(table, rd->rta_mark, &rd->rta_dst,
>                            IN6_IS_ADDR_V4MAPPED(&rd->rta_dst)
>                            ? rd->rtm_dst_len + 96 : rd->rtm_dst_len,
>                            rd->rtn_local, rdnh->ifname, &rdnh->addr,
> @@ -564,9 +765,19 @@ route_table_handle_msg(const struct route_table_msg 
> *change,
>      }
>  }
>  
> +static void
> +rules_change(const struct route_table_msg *change OVS_UNUSED,
> +             void *aux OVS_UNUSED)
> +{
> +    if (!change || change->relevant) {
> +        rules_valid = false;
> +    }
> +}
> +
>  static void
>  route_map_clear(void)
>  {
> +    ovs_router_rules_flush();
>      ovs_router_flush(false);
>  }
>  
> diff --git a/lib/route-table.h b/lib/route-table.h
> index 5e060581fd25..64858a357187 100644
> --- a/lib/route-table.h
> +++ b/lib/route-table.h
> @@ -143,12 +143,24 @@ struct route_data {
>      uint32_t rta_priority;       /* 0 if missing. */
>  };
>  
> +struct rule_data {
> +    bool invert;
> +    uint32_t prio;
> +    uint8_t src_len;
> +    struct in6_addr from_addr;
> +    uint32_t lookup_table;
> +};
> +
>  /* A digested version of a route message sent down by the kernel to indicate
> - * that a route has changed. */
> + * that a route or a rule has changed. */
>  struct route_table_msg {
>      bool relevant;        /* Should this message be processed? */
>      uint16_t nlmsg_type;  /* e.g. RTM_NEWROUTE, RTM_DELROUTE. */

Outdated comment.

> -    struct route_data rd; /* Data parsed from this message. */
> +    union {               /* Data parsed from this message, depending on
> +                           * nlmsg_type. */
> +        struct route_data rd;
> +        struct rule_data rud;
> +    };
>  };
>  
>  uint64_t route_table_get_change_seq(void);
> @@ -160,7 +172,7 @@ bool route_table_fallback_lookup(const struct in6_addr 
> *ip6_dst,
>                                   struct in6_addr *gw6);
>  
>  typedef void route_table_handle_msg_callback(const struct route_table_msg *,
> -                                             void *aux);
> +                                             void *aux, uint32_t table);
>  
>  bool route_table_is_standard_id(uint32_t table_id);
>  bool route_table_dump_one_table(uint32_t id,
> diff --git a/tests/test-lib-route-table.c b/tests/test-lib-route-table.c
> index 4d4fbe0d35d7..4f9695f07c30 100644
> --- a/tests/test-lib-route-table.c
> +++ b/tests/test-lib-route-table.c
> @@ -67,7 +67,8 @@ rt_table_name(uint32_t id)
>  
>  static void
>  test_lib_route_table_handle_msg(const struct route_table_msg *change,
> -                                void *data OVS_UNUSED)
> +                                void *data OVS_UNUSED,
> +                                uint32_t table OVS_UNUSED)
>  {
>      struct ds nexthop_addr = DS_EMPTY_INITIALIZER;
>      struct ds rta_prefsrc = DS_EMPTY_INITIALIZER;
> @@ -115,7 +116,7 @@ static void
>  test_lib_route_table_change(struct route_table_msg *change,
>                              void *aux OVS_UNUSED)
>  {
> -    test_lib_route_table_handle_msg(change, NULL);
> +    test_lib_route_table_handle_msg(change, NULL, 0);
>      route_data_destroy(&change->rd);
>  }
>  

_______________________________________________
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to