[PATCH net-next v3 1/2] mpls: multipath route support

2015-10-11 Thread Roopa Prabhu
From: Roopa Prabhu 

This patch adds support for MPLS multipath routes.

Includes following changes to support multipath:
- splits struct mpls_route into 'struct mpls_route + struct mpls_nh'

- 'struct mpls_nh' represents a mpls nexthop label forwarding entry

- moves mpls route and nexthop structures into internal.h

- A mpls_route can point to multiple mpls_nh structs

- the nexthops are maintained as a array (similar to ipv4 fib)

- In the process of restructuring, this patch also consistently changes
  all labels to u8

- Adds support to parse/fill RTA_MULTIPATH netlink attribute for
multipath routes similar to ipv4/v6 fib

- In this patch, the multipath route nexthop selection algorithm
simply returns the first nexthop. It is replaced by a
hash based algorithm from Robert Shearman in the next patch

- mpls_route_update cleanup: remove 'dev' handling in mpls_route_update.
mpls_route_update though implemented to update based on dev, it was
never used that way. And the dev handling gets tricky with multiple nexthops.
Cannot match against any single nexthops dev. So, this patch removes the unused
'dev' handling in mpls_route_update.

Example:

$ip -f mpls route add 100 nexthop as 200 via inet 10.1.1.2 dev swp1 \
nexthop as 700 via inet 10.1.1.6 dev swp2 \
nexthop as 800 via inet 40.1.1.2 dev swp3

$ip  -f mpls route show
100
nexthop as to 200 via inet 10.1.1.2  dev swp1
nexthop as to 700 via inet 10.1.1.6  dev swp2
nexthop as to 800 via inet 40.1.1.2  dev swp3

Signed-off-by: Roopa Prabhu 
---
 include/net/mpls_iptunnel.h |   2 +-
 net/mpls/af_mpls.c  | 500 +++-
 net/mpls/internal.h |  52 -
 3 files changed, 404 insertions(+), 150 deletions(-)

diff --git a/include/net/mpls_iptunnel.h b/include/net/mpls_iptunnel.h
index 4757997..179253f 100644
--- a/include/net/mpls_iptunnel.h
+++ b/include/net/mpls_iptunnel.h
@@ -18,7 +18,7 @@
 
 struct mpls_iptunnel_encap {
u32 label[MAX_NEW_LABELS];
-   u32 labels;
+   u8  labels;
 };
 
 static inline struct mpls_iptunnel_encap *mpls_lwtunnel_encap(struct 
lwtunnel_state *lwtstate)
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index bb185a2..4d819df 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -19,37 +19,9 @@
 #include 
 #include 
 #endif
+#include 
 #include "internal.h"
 
-#define LABEL_NOT_SPECIFIED (1<<20)
-#define MAX_NEW_LABELS 2
-
-/* This maximum ha length copied from the definition of struct neighbour */
-#define MAX_VIA_ALEN (ALIGN(MAX_ADDR_LEN, sizeof(unsigned long)))
-
-enum mpls_payload_type {
-   MPT_UNSPEC, /* IPv4 or IPv6 */
-   MPT_IPV4 = 4,
-   MPT_IPV6 = 6,
-
-   /* Other types not implemented:
-*  - Pseudo-wire with or without control word (RFC4385)
-*  - GAL (RFC5586)
-*/
-};
-
-struct mpls_route { /* next hop label forwarding entry */
-   struct net_device __rcu *rt_dev;
-   struct rcu_head rt_rcu;
-   u32 rt_label[MAX_NEW_LABELS];
-   u8  rt_protocol; /* routing protocol that set this 
entry */
-   u8  rt_payload_type;
-   u8  rt_labels;
-   u8  rt_via_alen;
-   u8  rt_via_table;
-   u8  rt_via[0];
-};
-
 static int zero = 0;
 static int label_limit = (1 << 20) - 1;
 
@@ -80,10 +52,10 @@ bool mpls_output_possible(const struct net_device *dev)
 }
 EXPORT_SYMBOL_GPL(mpls_output_possible);
 
-static unsigned int mpls_rt_header_size(const struct mpls_route *rt)
+static unsigned int mpls_nh_header_size(const struct mpls_nh *nh)
 {
/* The size of the layer 2.5 labels to be added for this route */
-   return rt->rt_labels * sizeof(struct mpls_shim_hdr);
+   return nh->nh_labels * sizeof(struct mpls_shim_hdr);
 }
 
 unsigned int mpls_dev_mtu(const struct net_device *dev)
@@ -105,6 +77,12 @@ bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned 
int mtu)
 }
 EXPORT_SYMBOL_GPL(mpls_pkt_too_big);
 
+static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt)
+{
+   /* assume single nexthop for now */
+   return >rt_nh[0];
+}
+
 static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb,
struct mpls_entry_decoded dec)
 {
@@ -159,6 +137,7 @@ static int mpls_forward(struct sk_buff *skb, struct 
net_device *dev,
struct net *net = dev_net(dev);
struct mpls_shim_hdr *hdr;
struct mpls_route *rt;
+   struct mpls_nh *nh;
struct mpls_entry_decoded dec;
struct net_device *out_dev;
struct mpls_dev *mdev;
@@ -166,6 +145,7 @@ static int mpls_forward(struct sk_buff *skb, struct 
net_device *dev,
unsigned int new_header_size;
unsigned int mtu;
int err;
+   int nhidx;
 
/* Careful 

Re: [PATCH net-next v3 1/2] mpls: multipath route support

2015-10-11 Thread roopa
On 10/11/15, 1:41 PM, Eric W. Biederman wrote:
> Roopa Prabhu  writes:
>
>> From: Roopa Prabhu 
>>
>> This patch adds support for MPLS multipath routes.
>>
>> Includes following changes to support multipath:
>> - splits struct mpls_route into 'struct mpls_route + struct mpls_nh'
>>
>> - 'struct mpls_nh' represents a mpls nexthop label forwarding entry
>>
>> - moves mpls route and nexthop structures into internal.h
>>
>> - A mpls_route can point to multiple mpls_nh structs
>>
>> - the nexthops are maintained as a array (similar to ipv4 fib)
>>
>> - In the process of restructuring, this patch also consistently changes
>>   all labels to u8
>>
>> - Adds support to parse/fill RTA_MULTIPATH netlink attribute for
>> multipath routes similar to ipv4/v6 fib
>>
>> - In this patch, the multipath route nexthop selection algorithm
>> simply returns the first nexthop. It is replaced by a
>> hash based algorithm from Robert Shearman in the next patch
>>
>> - mpls_route_update cleanup: remove 'dev' handling in mpls_route_update.
>> mpls_route_update though implemented to update based on dev, it was
>> never used that way. And the dev handling gets tricky with multiple nexthops.
>> Cannot match against any single nexthops dev. So, this patch removes the 
>> unused
>> 'dev' handling in mpls_route_update.
> Ugh.  Looking at this patch I have found if not a bug a misfeature.
>
> The existing code has a function:
>
>> bool mpls_output_possible(const struct net_device *dev)
>> {
>>  return dev && (dev->flags & IFF_UP) && netif_carrier_ok(dev);
>> }
> Which makes a lot of sense if you only have a single path to choose
> from.   When multipath comes into play we need to do something to ensure
> we don't select a path that where no output is possible for a prolonged
> period of time.

I was planning to add support for this in incremental patches (they are in my 
todo). Basically introduce the required
support for RTNH_F_DEAD and RTNH_F_LINKDOWN and skipping dead and link down 
nexthops.
I will submit this as an incremental patch.
>
>> Example:
>>
>> $ip -f mpls route add 100 nexthop as 200 via inet 10.1.1.2 dev swp1 \
>> nexthop as 700 via inet 10.1.1.6 dev swp2 \
>> nexthop as 800 via inet 40.1.1.2 dev swp3
>>
>> $ip  -f mpls route show
>> 100
>> nexthop as to 200 via inet 10.1.1.2  dev swp1
>> nexthop as to 700 via inet 10.1.1.6  dev swp2
>> nexthop as to 800 via inet 40.1.1.2  dev swp3
>>
>> Signed-off-by: Roopa Prabhu 
>> ---
>>  include/net/mpls_iptunnel.h |   2 +-
>>  net/mpls/af_mpls.c  | 500 
>> +++-
>>  net/mpls/internal.h |  52 -
>>  3 files changed, 404 insertions(+), 150 deletions(-)
>>
> [...]
>> @@ -196,9 +176,13 @@ static int mpls_forward(struct sk_buff *skb, struct 
>> net_device *dev,
>>  if (!rt)
>>  goto drop;
>>  
>> +nh = mpls_select_multipath(rt);
>> +if (!nh)
>> +goto drop;
>> +
>>  /* Find the output device */
>> -out_dev = rcu_dereference(rt->rt_dev);
>> -if (!mpls_output_possible(out_dev))
>> +out_dev = rcu_dereference(nh->nh_dev);
>> +if (!out_dev || !mpls_output_possible(out_dev))
>>  goto drop;
>>
> nit: mpls_output_possible already embeds the !out_dev test, making this
> hunk redundant.
ack,
>
>>  if (skb_warn_if_lro(skb))
>> @@ -635,9 +763,11 @@ static void mpls_ifdown(struct net_device *dev)
>>  struct mpls_route *rt = rtnl_dereference(platform_label[index]);
>>  if (!rt)
>>  continue;
>> -if (rtnl_dereference(rt->rt_dev) != dev)
>> -continue;
>> -rt->rt_dev = NULL;
>> +for_nexthops(rt) {
>> +if (rtnl_dereference(nh->nh_dev) != dev)
>> +continue;
>> +nh->nh_dev = NULL;
>> +} endfor_nexthops(rt);
>>  }
>>  
>>  mdev = mpls_dev_get(dev);
>> diff --git a/net/mpls/internal.h b/net/mpls/internal.h
>> index 2681a4b..d7757be3 100644
>> --- a/net/mpls/internal.h
>> +++ b/net/mpls/internal.h
>> @@ -21,6 +21,54 @@ struct mpls_dev {
>>  
>>  struct sk_buff;
>>  
>> +#define LABEL_NOT_SPECIFIED (1 << 20)
>> +#define MAX_NEW_LABELS 2
>> +
>> +/* This maximum ha length copied from the definition of struct neighbour */
>> +#define MAX_VIA_ALEN (ALIGN(MAX_ADDR_LEN, sizeof(unsigned long)))
>>
>> +enum mpls_payload_type {
>> +MPT_UNSPEC, /* IPv4 or IPv6 */
>> +MPT_IPV4 = 4,
>> +MPT_IPV6 = 6,
>> +
>> +/* Other types not implemented:
>> + *  - Pseudo-wire with or without control word (RFC4385)
>> + *  - GAL (RFC5586)
>> + */
>> +};
>> +
>> +struct mpls_nh { /* next hop label forwarding entry */
>> +struct net_device __rcu *nh_dev;
>> +u32 nh_label[MAX_NEW_LABELS];
>> +u8  nh_labels;
>> 

Re: [PATCH net-next v3 1/2] mpls: multipath route support

2015-10-11 Thread Eric W. Biederman
Roopa Prabhu  writes:

> From: Roopa Prabhu 
>
> This patch adds support for MPLS multipath routes.
>
> Includes following changes to support multipath:
> - splits struct mpls_route into 'struct mpls_route + struct mpls_nh'
>
> - 'struct mpls_nh' represents a mpls nexthop label forwarding entry
>
> - moves mpls route and nexthop structures into internal.h
>
> - A mpls_route can point to multiple mpls_nh structs
>
> - the nexthops are maintained as a array (similar to ipv4 fib)
>
> - In the process of restructuring, this patch also consistently changes
>   all labels to u8
>
> - Adds support to parse/fill RTA_MULTIPATH netlink attribute for
> multipath routes similar to ipv4/v6 fib
>
> - In this patch, the multipath route nexthop selection algorithm
> simply returns the first nexthop. It is replaced by a
> hash based algorithm from Robert Shearman in the next patch
>
> - mpls_route_update cleanup: remove 'dev' handling in mpls_route_update.
> mpls_route_update though implemented to update based on dev, it was
> never used that way. And the dev handling gets tricky with multiple nexthops.
> Cannot match against any single nexthops dev. So, this patch removes the 
> unused
> 'dev' handling in mpls_route_update.

Ugh.  Looking at this patch I have found if not a bug a misfeature.

The existing code has a function:

> bool mpls_output_possible(const struct net_device *dev)
> {
>   return dev && (dev->flags & IFF_UP) && netif_carrier_ok(dev);
> }

Which makes a lot of sense if you only have a single path to choose
from.   When multipath comes into play we need to do something to ensure
we don't select a path that where no output is possible for a prolonged
period of time.

> Example:
>
> $ip -f mpls route add 100 nexthop as 200 via inet 10.1.1.2 dev swp1 \
> nexthop as 700 via inet 10.1.1.6 dev swp2 \
> nexthop as 800 via inet 40.1.1.2 dev swp3
>
> $ip  -f mpls route show
> 100
> nexthop as to 200 via inet 10.1.1.2  dev swp1
> nexthop as to 700 via inet 10.1.1.6  dev swp2
> nexthop as to 800 via inet 40.1.1.2  dev swp3
>
> Signed-off-by: Roopa Prabhu 
> ---
>  include/net/mpls_iptunnel.h |   2 +-
>  net/mpls/af_mpls.c  | 500 
> +++-
>  net/mpls/internal.h |  52 -
>  3 files changed, 404 insertions(+), 150 deletions(-)
>
[...]
> @@ -196,9 +176,13 @@ static int mpls_forward(struct sk_buff *skb, struct 
> net_device *dev,
>   if (!rt)
>   goto drop;
>  
> + nh = mpls_select_multipath(rt);
> + if (!nh)
> + goto drop;
> +
>   /* Find the output device */
> - out_dev = rcu_dereference(rt->rt_dev);
> - if (!mpls_output_possible(out_dev))
> + out_dev = rcu_dereference(nh->nh_dev);
> + if (!out_dev || !mpls_output_possible(out_dev))
>   goto drop;
>

nit: mpls_output_possible already embeds the !out_dev test, making this
hunk redundant.

>   if (skb_warn_if_lro(skb))

> @@ -635,9 +763,11 @@ static void mpls_ifdown(struct net_device *dev)
>   struct mpls_route *rt = rtnl_dereference(platform_label[index]);
>   if (!rt)
>   continue;
> - if (rtnl_dereference(rt->rt_dev) != dev)
> - continue;
> - rt->rt_dev = NULL;
> + for_nexthops(rt) {
> + if (rtnl_dereference(nh->nh_dev) != dev)
> + continue;
> + nh->nh_dev = NULL;
> + } endfor_nexthops(rt);
>   }
>  
>   mdev = mpls_dev_get(dev);
> diff --git a/net/mpls/internal.h b/net/mpls/internal.h
> index 2681a4b..d7757be3 100644
> --- a/net/mpls/internal.h
> +++ b/net/mpls/internal.h
> @@ -21,6 +21,54 @@ struct mpls_dev {
>  
>  struct sk_buff;
>  
> +#define LABEL_NOT_SPECIFIED (1 << 20)
> +#define MAX_NEW_LABELS 2
> +
> +/* This maximum ha length copied from the definition of struct neighbour */
> +#define MAX_VIA_ALEN (ALIGN(MAX_ADDR_LEN, sizeof(unsigned long)))
>
> +enum mpls_payload_type {
> + MPT_UNSPEC, /* IPv4 or IPv6 */
> + MPT_IPV4 = 4,
> + MPT_IPV6 = 6,
> +
> + /* Other types not implemented:
> +  *  - Pseudo-wire with or without control word (RFC4385)
> +  *  - GAL (RFC5586)
> +  */
> +};
> +
> +struct mpls_nh { /* next hop label forwarding entry */
> + struct net_device __rcu *nh_dev;
> + u32 nh_label[MAX_NEW_LABELS];
> + u8  nh_labels;
> + u8  nh_via_alen;
> + u8  nh_via_table;
> + u8  nh_via[MAX_VIA_ALEN];
> +};

I think for this patch this is a good version of mpls_nh.

We do have significant optimization opportunities with the mpls_nh
structure.

- We waste a byte on each mpls label.
- We need a full byte on any of the fields (nh_lables, nh_via_alen,