On Tue, Nov 20, 2018 at 6:23 AM Alexis Bauvin <abau...@scaleway.com> wrote:
>
> Creating a VXLAN device with its underlay in the non-default VRF makes
> egress route lookup fail or incorrect since it will resolve in the
> default VRF, and ingress fail because the socket listens in the default
> VRF.
>
> This patch binds the underlying UDP tunnel socket to the l3mdev of the
> lower device of the VXLAN device. This will listen in the proper VRF and
> output traffic from said l3mdev, matching l3mdev routing rules and
> looking up the correct routing table.
>
> When the VXLAN device does not have a lower device, or the lower device
> is in the default VRF, the socket will not be bound to any interface,
> keeping the previous behaviour.
>
> The underlay l3mdev is deduced from the VXLAN lower device
> (IFLA_VXLAN_LINK).
>
> The l3mdev_master_upper_ifindex_by_index function has been added to
> l3mdev. Its goal is to fetch the effective l3mdev of an interface which
> is not a direct slave of said l3mdev. It handles the following example,
> properly resolving the l3mdev of eth0 to vrf-blue:
>
> +----------+                         +---------+
> |          |                         |         |
> | vrf-blue |                         | vrf-red |
> |          |                         |         |
> +----+-----+                         +----+----+
>      |                                    |
>      |                                    |
> +----+-----+                         +----+----+
> |          |                         |         |
> | br-blue  |                         | br-red  |
> |          |                         |         |
> +----+-----+                         +---+-+---+
>      |                                   | |
>      |                             +-----+ +-----+
>      |                             |             |
> +----+-----+                +------+----+   +----+----+
> |          |  lower device  |           |   |         |
> |   eth0   | <- - - - - - - | vxlan-red |   | tap-red | (... more taps)
> |          |                |           |   |         |
> +----------+                +-----------+   +---------+
>
> Signed-off-by: Alexis Bauvin <abau...@scaleway.com>
> Reviewed-by: Amine Kherbouche <akherbou...@scaleway.com>
> Tested-by: Amine Kherbouche <akherbou...@scaleway.com>
> ---
>  drivers/net/vxlan.c  | 32 ++++++++++++++++++++++++--------
>  include/net/l3mdev.h | 22 ++++++++++++++++++++++
>  net/l3mdev/l3mdev.c  | 18 ++++++++++++++++++
>  3 files changed, 64 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
> index 27bd586b94b0..a3de08122269 100644
> --- a/drivers/net/vxlan.c
> +++ b/drivers/net/vxlan.c
> @@ -212,7 +212,7 @@ static inline struct vxlan_rdst *first_remote_rtnl(struct 
> vxlan_fdb *fdb)
>   * and enabled unshareable flags.
>   */
>  static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family,
> -                                         __be16 port, u32 flags)
> +                                         __be16 port, u32 flags, int ifindex)
>  {
>         struct vxlan_sock *vs;
>
> @@ -221,7 +221,8 @@ static struct vxlan_sock *vxlan_find_sock(struct net 
> *net, sa_family_t family,
>         hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) {
>                 if (inet_sk(vs->sock->sk)->inet_sport == port &&
>                     vxlan_get_sk_family(vs) == family &&
> -                   vs->flags == flags)
> +                   vs->flags == flags &&
> +                   vs->sock->sk->sk_bound_dev_if == ifindex)
>                         return vs;
>         }
>         return NULL;
> @@ -261,7 +262,7 @@ static struct vxlan_dev *vxlan_find_vni(struct net *net, 
> int ifindex,
>  {
>         struct vxlan_sock *vs;
>
> -       vs = vxlan_find_sock(net, family, port, flags);
> +       vs = vxlan_find_sock(net, family, port, flags, ifindex);
>         if (!vs)
>                 return NULL;
>
> @@ -2172,6 +2173,9 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct 
> net_device *dev,
>                 struct rtable *rt;
>                 __be16 df = 0;
>
> +               if (!ifindex)
> +                       ifindex = sock4->sock->sk->sk_bound_dev_if;
> +
>                 rt = vxlan_get_route(vxlan, dev, sock4, skb, ifindex, tos,
>                                      dst->sin.sin_addr.s_addr,
>                                      &local_ip.sin.sin_addr.s_addr,
> @@ -2210,6 +2214,9 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct 
> net_device *dev,
>         } else {
>                 struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock);
>
> +               if (!ifindex)
> +                       ifindex = sock6->sock->sk->sk_bound_dev_if;
> +
>                 ndst = vxlan6_get_route(vxlan, dev, sock6, skb, ifindex, tos,
>                                         label, &dst->sin6.sin6_addr,
>                                         &local_ip.sin6.sin6_addr,
> @@ -2813,7 +2820,7 @@ static const struct ethtool_ops vxlan_ethtool_ops = {
>  };
>
>  static struct socket *vxlan_create_sock(struct net *net, bool ipv6,
> -                                       __be16 port, u32 flags)
> +                                       __be16 port, u32 flags, int ifindex)
>  {
>         struct socket *sock;
>         struct udp_port_cfg udp_conf;
> @@ -2831,6 +2838,7 @@ static struct socket *vxlan_create_sock(struct net 
> *net, bool ipv6,
>         }
>
>         udp_conf.local_udp_port = port;
> +       udp_conf.bind_ifindex = ifindex;
>
>         /* Open UDP socket */
>         err = udp_sock_create(net, &udp_conf, &sock);
> @@ -2842,7 +2850,8 @@ static struct socket *vxlan_create_sock(struct net 
> *net, bool ipv6,
>
>  /* Create new listen socket if needed */
>  static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6,
> -                                             __be16 port, u32 flags)
> +                                             __be16 port, u32 flags,
> +                                             int ifindex)
>  {
>         struct vxlan_net *vn = net_generic(net, vxlan_net_id);
>         struct vxlan_sock *vs;
> @@ -2857,7 +2866,7 @@ static struct vxlan_sock *vxlan_socket_create(struct 
> net *net, bool ipv6,
>         for (h = 0; h < VNI_HASH_SIZE; ++h)
>                 INIT_HLIST_HEAD(&vs->vni_list[h]);
>
> -       sock = vxlan_create_sock(net, ipv6, port, flags);
> +       sock = vxlan_create_sock(net, ipv6, port, flags, ifindex);
>         if (IS_ERR(sock)) {
>                 kfree(vs);
>                 return ERR_CAST(sock);
> @@ -2894,11 +2903,17 @@ static int __vxlan_sock_add(struct vxlan_dev *vxlan, 
> bool ipv6)
>         struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
>         struct vxlan_sock *vs = NULL;
>         struct vxlan_dev_node *node;
> +       int l3mdev_index;
> +
> +       l3mdev_index =
> +               l3mdev_master_upper_ifindex_by_index(vxlan->net,
> +                                                    vxlan->cfg.remote_ifindex);

vxlan->cfg.remote_ifindex is optional, so we can avoid trying to
derive l3mdev_index in cases where it is not present.


>
>         if (!vxlan->cfg.no_share) {
>                 spin_lock(&vn->sock_lock);
>                 vs = vxlan_find_sock(vxlan->net, ipv6 ? AF_INET6 : AF_INET,
> -                                    vxlan->cfg.dst_port, vxlan->cfg.flags);
> +                                    vxlan->cfg.dst_port, vxlan->cfg.flags,
> +                                    l3mdev_index);
>                 if (vs && !refcount_inc_not_zero(&vs->refcnt)) {
>                         spin_unlock(&vn->sock_lock);
>                         return -EBUSY;
> @@ -2907,7 +2922,8 @@ static int __vxlan_sock_add(struct vxlan_dev *vxlan, 
> bool ipv6)
>         }
>         if (!vs)
>                 vs = vxlan_socket_create(vxlan->net, ipv6,
> -                                        vxlan->cfg.dst_port, vxlan->cfg.flags);
> +                                        vxlan->cfg.dst_port, vxlan->cfg.flags,
> +                                        l3mdev_index);
>         if (IS_ERR(vs))
>                 return PTR_ERR(vs);
>  #if IS_ENABLED(CONFIG_IPV6)
> diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h
> index 3832099289c5..78fa0ac4613c 100644
> --- a/include/net/l3mdev.h
> +++ b/include/net/l3mdev.h
> @@ -101,6 +101,17 @@ struct net_device *l3mdev_master_dev_rcu(const struct 
> net_device *_dev)
>         return master;
>  }
>
> +int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex);
> +static inline
> +int l3mdev_master_upper_ifindex_by_index(struct net *net, int ifindex)
> +{
> +       rcu_read_lock();
> +       ifindex = l3mdev_master_upper_ifindex_by_index_rcu(net, ifindex);
> +       rcu_read_unlock();
> +
> +       return ifindex;
> +}
> +
>  u32 l3mdev_fib_table_rcu(const struct net_device *dev);
>  u32 l3mdev_fib_table_by_index(struct net *net, int ifindex);
>  static inline u32 l3mdev_fib_table(const struct net_device *dev)
> @@ -207,6 +218,17 @@ static inline int l3mdev_master_ifindex_by_index(struct 
> net *net, int ifindex)
>         return 0;
>  }
>
> +static inline
> +int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex)
> +{
> +       return 0;
> +}
> +static inline
> +int l3mdev_master_upper_ifindex_by_index(struct net *net, int ifindex)
> +{
> +       return 0;
> +}
> +
>  static inline
>  struct net_device *l3mdev_master_dev_rcu(const struct net_device *dev)
>  {
> diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c
> index 8da86ceca33d..309dee76724e 100644
> --- a/net/l3mdev/l3mdev.c
> +++ b/net/l3mdev/l3mdev.c
> @@ -46,6 +46,24 @@ int l3mdev_master_ifindex_rcu(const struct net_device *dev)
>  }
>  EXPORT_SYMBOL_GPL(l3mdev_master_ifindex_rcu);
>
> +/**
> + *     l3mdev_master_upper_ifindex_by_index_rcu - get index of upper l3
> + *                                                master device
> + *     @net: network namespace for device index lookup
> + *     @ifindex: targeted interface
> + */
> +int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex)
> +{
> +       struct net_device *dev;
> +
> +       dev = dev_get_by_index_rcu(net, ifindex);
> +       while (dev && !netif_is_l3_master(dev))
> +               dev = netdev_master_upper_dev_get(dev);
> +
> +       return dev ? dev->ifindex : 0;
> +}
> +EXPORT_SYMBOL_GPL(l3mdev_master_upper_ifindex_by_index_rcu);
> +
>  /**
>   *     l3mdev_fib_table - get FIB table id associated with an L3
>   *                             master interface
> --
>

Reply via email to