On 06/17/2015 01:08 PM, Peter Nørlund wrote:
This patch adds L3 and L4 hash-based multipath routing, selectable on a
per-route basis with the reintroduced RTA_MP_ALGO attribute. The default is
now RT_MP_ALG_L3_HASH.

Signed-off-by: Peter Nørlund <p...@ordbogen.com>
---
  include/net/ip_fib.h           |  4 ++-
  include/net/route.h            |  5 ++--
  include/uapi/linux/rtnetlink.h | 14 ++++++++++-
  net/ipv4/fib_frontend.c        |  4 +++
  net/ipv4/fib_semantics.c       | 34 ++++++++++++++++++++++---
  net/ipv4/icmp.c                |  4 +--
  net/ipv4/route.c               | 56 +++++++++++++++++++++++++++++++++++-------
  net/ipv4/xfrm4_policy.c        |  2 +-
  8 files changed, 103 insertions(+), 20 deletions(-)

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 4be4f25..250d98e 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -37,6 +37,7 @@ struct fib_config {
        u32                     fc_flags;
        u32                     fc_priority;
        __be32                  fc_prefsrc;
+       int                     fc_mp_alg;
        struct nlattr           *fc_mx;
        struct rtnexthop        *fc_mp;
        int                     fc_mx_len;
@@ -116,6 +117,7 @@ struct fib_info {
        int                     fib_nhs;
  #ifdef CONFIG_IP_ROUTE_MULTIPATH
        int                     fib_mp_weight;
+       int                     fib_mp_alg;
  #endif
        struct rcu_head         rcu;
        struct fib_nh           fib_nh[0];
@@ -308,7 +310,7 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev);
  int fib_sync_down_dev(struct net_device *dev, int force);
  int fib_sync_down_addr(struct net *net, __be32 local);
  int fib_sync_up(struct net_device *dev);
-void fib_select_multipath(struct fib_result *res);
+void fib_select_multipath(struct fib_result *res, const struct flowi4 *flow);

  /* Exported by fib_trie.c */
  void fib_trie_init(void);
diff --git a/include/net/route.h b/include/net/route.h
index fe22d03..1fc7deb 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -110,7 +110,8 @@ struct in_device;
  int ip_rt_init(void);
  void rt_cache_flush(struct net *net);
  void rt_flush_dev(struct net_device *dev);
-struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp);
+struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp,
+                                    const struct flowi4 *mp_flow);
  struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp,
                                    struct sock *sk);
  struct dst_entry *ipv4_blackhole_route(struct net *net,
@@ -267,7 +268,7 @@ static inline struct rtable *ip_route_connect(struct flowi4 *fl4,
                              sport, dport, sk);

        if (!dst || !src) {
-               rt = __ip_route_output_key(net, fl4);
+               rt = __ip_route_output_key(net, fl4, NULL);
                if (IS_ERR(rt))
                        return rt;
                ip_rt_put(rt);
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 17fb02f..dff4a72 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -271,6 +271,18 @@ enum rt_scope_t {
  #define RTM_F_EQUALIZE                0x400   /* Multipath equalizer: NI      */
  #define RTM_F_PREFIX          0x800   /* Prefix addresses             */

+/* Multipath algorithms */
+
+enum rt_mp_alg_t {
+       RT_MP_ALG_L3_HASH,      /* Was IP_MP_ALG_NONE */
+       RT_MP_ALG_PER_PACKET,   /* Was IP_MP_ALG_RR */
+       RT_MP_ALG_DRR,          /* not used */
+       RT_MP_ALG_RANDOM,       /* not used */
+       RT_MP_ALG_WRANDOM,      /* not used */
+       RT_MP_ALG_L4_HASH,
+       __RT_MP_ALG_MAX
+};
+
  /* Reserved table identifiers */

  enum rt_class_t {
@@ -301,7 +313,7 @@ enum rtattr_type_t {
        RTA_FLOW,
        RTA_CACHEINFO,
        RTA_SESSION, /* no longer used */
-       RTA_MP_ALGO, /* no longer used */
+       RTA_MP_ALGO,
        RTA_TABLE,
        RTA_MARK,
        RTA_MFC_STATS,
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 872494e..376e8c1 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -590,6 +590,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
        [RTA_PREFSRC]           = { .type = NLA_U32 },
        [RTA_METRICS]           = { .type = NLA_NESTED },
        [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
+       [RTA_MP_ALGO]           = { .type = NLA_U32 },
        [RTA_FLOW]              = { .type = NLA_U32 },
  };

@@ -650,6 +651,9 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
                        cfg->fc_mp = nla_data(attr);
                        cfg->fc_mp_len = nla_len(attr);
                        break;
+               case RTA_MP_ALGO:
+                       cfg->fc_mp_alg = nla_get_u32(attr);
+                       break;
                case RTA_FLOW:
                        cfg->fc_flow = nla_get_u32(attr);
                        break;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 8c8df80..da06e88 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -257,6 +257,11 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
  {
        const struct fib_nh *onh = ofi->fib_nh;

+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+       if (fi->fib_mp_alg != ofi->fib_mp_alg)
+               return -1;
+#endif
+
        for_nexthops(fi) {
                if (nh->nh_oif != onh->nh_oif ||
                    nh->nh_gw  != onh->nh_gw ||
@@ -896,6 +901,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)

        if (cfg->fc_mp) {
  #ifdef CONFIG_IP_ROUTE_MULTIPATH
+               fi->fib_mp_alg = cfg->fc_mp_alg;
                err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
                if (err != 0)
                        goto failure;
@@ -1085,6 +1091,10 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
                struct rtnexthop *rtnh;
                struct nlattr *mp;

+               if (fi->fib_mp_alg &&
+                   nla_put_u32(skb, RTA_MP_ALGO, fi->fib_mp_alg))
+                       goto nla_put_failure;
+
                mp = nla_nest_start(skb, RTA_MULTIPATH);
                if (!mp)
                        goto nla_put_failure;
@@ -1312,15 +1322,31 @@ int fib_sync_up(struct net_device *dev)
  }

  /*
- * The algorithm is suboptimal, but it provides really
- * fair weighted route distribution.
+ * Compute multipath hash based on 3- or 5-tuple
   */
-void fib_select_multipath(struct fib_result *res)
+static int fib_multipath_hash(const struct fib_result *res,
+                             const struct flowi4 *flow)
+{
+       u32 hash = flow->saddr ^ flow->daddr;
+
+       if (res->fi->fib_mp_alg == RT_MP_ALG_L4_HASH && flow->flowi4_proto != 0)
+               hash ^= flow->flowi4_proto ^ flow->fl4_sport ^ flow->fl4_dport;
+
+       hash ^= hash >> 16;
+       hash ^= hash >> 8;
+       return hash & 0xFF;
+}
+

This hash is still far from optimal. I really think you should look at something such as jhash_3words() for mixing the addresses. Simply XORing the values together, as you do now, will give a fairly high collision rate: for example, with two endpoints on the same subnet, the subnet bits cancel out when the source and destination addresses are XORed. Likewise, the port data is lost for protocols such as UDP, where the source and destination ports may be the same value.

+void fib_select_multipath(struct fib_result *res, const struct flowi4 *flow)
  {
        struct fib_info *fi = res->fi;
        u8 w;

-       w = bitrev8(this_cpu_inc_return(fib_mp_rr_counter));
+       if (res->fi->fib_mp_alg == RT_MP_ALG_PER_PACKET) {
+               w = bitrev8(this_cpu_inc_return(fib_mp_rr_counter));
+       } else {
+               w = fib_multipath_hash(res, flow);
+       }

        for_nexthops(fi) {
                if (w >= atomic_read(&nh->nh_mp_upper_bound))
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index f5203fb..3abcfea 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -459,7 +459,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
        fl4->fl4_icmp_type = type;
        fl4->fl4_icmp_code = code;
        security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
-       rt = __ip_route_output_key(net, fl4);
+       rt = __ip_route_output_key(net, fl4, NULL);
        if (IS_ERR(rt))
                return rt;

@@ -481,7 +481,7 @@ static struct rtable *icmp_route_lookup(struct net *net,
                goto relookup_failed;

        if (inet_addr_type(net, fl4_dec.saddr) == RTN_LOCAL) {
-               rt2 = __ip_route_output_key(net, &fl4_dec);
+               rt2 = __ip_route_output_key(net, &fl4_dec, NULL);
                if (IS_ERR(rt2))
                        err = PTR_ERR(rt2);
        } else {
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f605598..a1ec62c 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1006,7 +1006,7 @@ void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
-       rt = __ip_route_output_key(net, &fl4);
+       rt = __ip_route_output_key(net, &fl4, NULL);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
@@ -1025,7 +1025,7 @@ static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
        if (!fl4.flowi4_mark)
                fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

-       rt = __ip_route_output_key(sock_net(sk), &fl4);
+       rt = __ip_route_output_key(sock_net(sk), &fl4, NULL);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
@@ -1094,7 +1094,7 @@ void ipv4_redirect(struct sk_buff *skb, struct net *net,

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
-       rt = __ip_route_output_key(net, &fl4);
+       rt = __ip_route_output_key(net, &fl4, NULL);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
@@ -1109,7 +1109,7 @@ void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
-       rt = __ip_route_output_key(sock_net(sk), &fl4);
+       rt = __ip_route_output_key(sock_net(sk), &fl4, NULL);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
@@ -1631,6 +1631,39 @@ out:
        return err;
  }

+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+/* Fill flow key data based on packet for use in multipath routing. */
+static void ip_multipath_flow(const struct sk_buff *skb, struct flowi4 *flow)
+{
+       const struct iphdr *iph;
+
+       iph = ip_hdr(skb);
+
+       flow->saddr = iph->saddr;
+       flow->daddr = iph->daddr;
+       flow->flowi4_proto = iph->protocol;
+       flow->fl4_sport = 0;
+       flow->fl4_dport = 0;
+
+       if (unlikely(ip_is_fragment(iph)))
+               return;
+

I'm not sure that checking for fragmentation is enough. For example, if this system is routing and receives a flow of UDP packets, some fragmented and some not, it might end up spreading them over two separate next hops, since some packets will include L4 header data and some won't.

As such, you may want to provide an option to exclude UDP from the protocols listed below.

+       if (iph->protocol == IPPROTO_TCP ||
+           iph->protocol == IPPROTO_UDP ||
+           iph->protocol == IPPROTO_SCTP) {
+               __be16 _ports;
+               const __be16 *ports;
+
+               ports = skb_header_pointer(skb, iph->ihl * 4, sizeof(_ports),
+                                          &_ports);
+               if (ports) {
+                       flow->fl4_sport = ports[0];
+                       flow->fl4_dport = ports[1];
+               }
+       }
+}
+#endif /* CONFIG_IP_ROUTE_MULTIPATH */
+
  static int ip_mkroute_input(struct sk_buff *skb,
                            struct fib_result *res,
                            const struct flowi4 *fl4,
@@ -1638,8 +1671,12 @@ static int ip_mkroute_input(struct sk_buff *skb,
                            __be32 daddr, __be32 saddr, u32 tos)
  {
  #ifdef CONFIG_IP_ROUTE_MULTIPATH
-       if (res->fi && res->fi->fib_nhs > 1)
-               fib_select_multipath(res);
+       if (res->fi && res->fi->fib_nhs > 1) {
+               struct flowi4 mp_flow;
+
+               ip_multipath_flow(skb, &mp_flow);
+               fib_select_multipath(res, &mp_flow);
+       }

What is the point of populating mp_flow if you don't know whether it is going to be used? You populate it in ip_multipath_flow(), and then fib_select_multipath() might ignore it completely.

Maybe, instead of passing mp_flow, you could pass a function pointer that alters the flow for the multipath case.

  #endif

        /* create a routing cache entry */
@@ -2012,7 +2049,8 @@ add:
   * Major route resolver routine.
   */

-struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
+struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4,
+                                    const struct flowi4 *mp_flow)
  {
        struct net_device *dev_out = NULL;
        __u8 tos = RT_FL_TOS(fl4);
@@ -2170,7 +2208,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)

  #ifdef CONFIG_IP_ROUTE_MULTIPATH
        if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
-               fib_select_multipath(&res);
+               fib_select_multipath(&res, (mp_flow ? mp_flow : fl4));
        else
  #endif
        if (!res.prefixlen &&
@@ -2273,7 +2311,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
  struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
                                    struct sock *sk)
  {
-       struct rtable *rt = __ip_route_output_key(net, flp4);
+       struct rtable *rt = __ip_route_output_key(net, flp4, NULL);

        if (IS_ERR(rt))
                return rt;
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index bff6974..7eae158 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -31,7 +31,7 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
        if (saddr)
                fl4->saddr = saddr->a4;

-       rt = __ip_route_output_key(net, fl4);
+       rt = __ip_route_output_key(net, fl4, NULL);
        if (!IS_ERR(rt))
                return &rt->dst;


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to