Le 20 nov. 2018 à 16:25, Roopa Prabhu <ro...@cumulusnetworks.com> a écrit : > > On Tue, Nov 20, 2018 at 6:23 AM Alexis Bauvin <abau...@scaleway.com> wrote: >> >> Creating a VXLAN device with is underlay in the non-default VRF makes >> egress route lookup fail or incorrect since it will resolve in the >> default VRF, and ingress fail because the socket listens in the default >> VRF. >> >> This patch binds the underlying UDP tunnel socket to the l3mdev of the >> lower device of the VXLAN device. This will listen in the proper VRF and >> output traffic from said l3mdev, matching l3mdev routing rules and >> looking up the correct routing table. >> >> When the VXLAN device does not have a lower device, or the lower device >> is in the default VRF, the socket will not be bound to any interface, >> keeping the previous behaviour. >> >> The underlay l3mdev is deduced from the VXLAN lower device >> (IFLA_VXLAN_LINK). >> >> The l3mdev_master_upper_ifindex_by_index function has been added to >> l3mdev. Its goal is to fetch the effective l3mdev of an interface which >> is not a direct slave of said l3mdev. It handles the following example, >> properly resolving the l3mdev of eth0 to vrf-blue: >> >> +----------+ +---------+ >> | | | | >> | vrf-blue | | vrf-red | >> | | | | >> +----+-----+ +----+----+ >> | | >> | | >> +----+-----+ +----+----+ >> | | | | >> | br-blue | | br-red | >> | | | | >> +----+-----+ +---+-+---+ >> | | | >> | +-----+ +-----+ >> | | | >> +----+-----+ +------+----+ +----+----+ >> | | lower device | | | | >> | eth0 | <- - - - - - - | vxlan-red | | tap-red | (... more taps) >> | | | | | | >> +----------+ +-----------+ +---------+ >> >> Signed-off-by: Alexis Bauvin <abau...@scaleway.com> >> Reviewed-by: Amine Kherbouche <akherbou...@scaleway.com> >> Tested-by: Amine Kherbouche <akherbou...@scaleway.com> >> --- >> drivers/net/vxlan.c | 32 ++++++++++++++++++++++++-------- >> include/net/l3mdev.h | 22 ++++++++++++++++++++++ >> net/l3mdev/l3mdev.c | 18 ++++++++++++++++++ >> 3 files changed, 64 insertions(+), 8 deletions(-) >> >> diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c >> index 27bd586b94b0..a3de08122269 100644 >> --- a/drivers/net/vxlan.c >> +++ b/drivers/net/vxlan.c >> @@ -212,7 +212,7 @@ static inline struct vxlan_rdst >> *first_remote_rtnl(struct vxlan_fdb *fdb) >> * and enabled unshareable flags. >> */ >> static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t >> family, >> - __be16 port, u32 flags) >> + __be16 port, u32 flags, int >> ifindex) >> { >> struct vxlan_sock *vs; >> >> @@ -221,7 +221,8 @@ static struct vxlan_sock *vxlan_find_sock(struct net >> *net, sa_family_t family, >> hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) { >> if (inet_sk(vs->sock->sk)->inet_sport == port && >> vxlan_get_sk_family(vs) == family && >> - vs->flags == flags) >> + vs->flags == flags && >> + vs->sock->sk->sk_bound_dev_if == ifindex) >> return vs; >> } >> return NULL; >> @@ -261,7 +262,7 @@ static struct vxlan_dev *vxlan_find_vni(struct net *net, >> int ifindex, >> { >> struct vxlan_sock *vs; >> >> - vs = vxlan_find_sock(net, family, port, flags); >> + vs = vxlan_find_sock(net, family, port, flags, ifindex); >> if (!vs) >> return NULL; >> >> @@ -2172,6 +2173,9 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct >> net_device *dev, >> struct rtable *rt; >> __be16 df = 0; >> >> + if (!ifindex) >> + ifindex = sock4->sock->sk->sk_bound_dev_if; >> + >> rt = vxlan_get_route(vxlan, dev, sock4, skb, ifindex, tos, >> dst->sin.sin_addr.s_addr, >> &local_ip.sin.sin_addr.s_addr, >> @@ -2210,6 +2214,9 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct >> net_device *dev, >> } else { >> struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock); >> >> + if (!ifindex) >> + ifindex = sock6->sock->sk->sk_bound_dev_if; >> + >> ndst = vxlan6_get_route(vxlan, dev, sock6, skb, ifindex, tos, >> label, &dst->sin6.sin6_addr, >> &local_ip.sin6.sin6_addr, >> @@ -2813,7 +2820,7 @@ static const struct ethtool_ops vxlan_ethtool_ops = { >> }; >> >> static struct socket *vxlan_create_sock(struct net *net, bool ipv6, >> - __be16 port, u32 flags) >> + __be16 port, u32 flags, int ifindex) >> { >> struct socket *sock; >> struct udp_port_cfg udp_conf; >> @@ -2831,6 +2838,7 @@ static struct socket *vxlan_create_sock(struct net >> *net, bool ipv6, >> } >> >> udp_conf.local_udp_port = port; >> + udp_conf.bind_ifindex = ifindex; >> >> /* Open UDP socket */ >> err = udp_sock_create(net, &udp_conf, &sock); >> @@ -2842,7 +2850,8 @@ static struct socket *vxlan_create_sock(struct net >> *net, bool ipv6, >> >> /* Create new listen socket if needed */ >> static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6, >> - __be16 port, u32 flags) >> + __be16 port, u32 flags, >> + int ifindex) >> { >> struct vxlan_net *vn = net_generic(net, vxlan_net_id); >> struct vxlan_sock *vs; >> @@ -2857,7 +2866,7 @@ static struct vxlan_sock *vxlan_socket_create(struct >> net *net, bool ipv6, >> for (h = 0; h < VNI_HASH_SIZE; ++h) >> INIT_HLIST_HEAD(&vs->vni_list[h]); >> >> - sock = vxlan_create_sock(net, ipv6, port, flags); >> + sock = vxlan_create_sock(net, ipv6, port, flags, ifindex); >> if (IS_ERR(sock)) { >> kfree(vs); >> return ERR_CAST(sock); >> @@ -2894,11 +2903,17 @@ static int __vxlan_sock_add(struct vxlan_dev *vxlan, >> bool ipv6) >> struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); >> struct vxlan_sock *vs = NULL; >> struct vxlan_dev_node *node; >> + int l3mdev_index; >> + >> + l3mdev_index = >> + l3mdev_master_upper_ifindex_by_index(vxlan->net, >> + >> vxlan->cfg.remote_ifindex); > > vxlan->cfg.remote_ifindex is optional, so we can avoid trying to > derive the l3mdev_ifindex for cases where it is not present
So you do suggest to check first remote_ifindex, and derive the l3mdev_ifindex only if not zero? If so, I will add it in next version. >> >> if (!vxlan->cfg.no_share) { >> spin_lock(&vn->sock_lock); >> vs = vxlan_find_sock(vxlan->net, ipv6 ? AF_INET6 : AF_INET, >> - vxlan->cfg.dst_port, vxlan->cfg.flags); >> + vxlan->cfg.dst_port, vxlan->cfg.flags, >> + l3mdev_index); >> if (vs && !refcount_inc_not_zero(&vs->refcnt)) { >> spin_unlock(&vn->sock_lock); >> return -EBUSY; >> @@ -2907,7 +2922,8 @@ static int __vxlan_sock_add(struct vxlan_dev *vxlan, >> bool ipv6) >> } >> if (!vs) >> vs = vxlan_socket_create(vxlan->net, ipv6, >> - vxlan->cfg.dst_port, >> vxlan->cfg.flags); >> + vxlan->cfg.dst_port, >> vxlan->cfg.flags, >> + l3mdev_index); >> if (IS_ERR(vs)) >> return PTR_ERR(vs); >> #if IS_ENABLED(CONFIG_IPV6) >> diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h >> index 3832099289c5..78fa0ac4613c 100644 >> --- a/include/net/l3mdev.h >> +++ b/include/net/l3mdev.h >> @@ -101,6 +101,17 @@ struct net_device *l3mdev_master_dev_rcu(const struct >> net_device *_dev) >> return master; >> } >> >> +int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex); >> +static inline >> +int l3mdev_master_upper_ifindex_by_index(struct net *net, int ifindex) >> +{ >> + rcu_read_lock(); >> + ifindex = l3mdev_master_upper_ifindex_by_index_rcu(net, ifindex); >> + rcu_read_unlock(); >> + >> + return ifindex; >> +} >> + >> u32 l3mdev_fib_table_rcu(const struct net_device *dev); >> u32 l3mdev_fib_table_by_index(struct net *net, int ifindex); >> static inline u32 l3mdev_fib_table(const struct net_device *dev) >> @@ -207,6 +218,17 @@ static inline int l3mdev_master_ifindex_by_index(struct >> net *net, int ifindex) >> return 0; >> } >> >> +static inline >> +int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex) >> +{ >> + return 0; >> +} >> +static inline >> +int l3mdev_master_upper_ifindex_by_index(struct net *net, int ifindex) >> +{ >> + return 0; >> +} >> + >> static inline >> struct net_device *l3mdev_master_dev_rcu(const struct net_device *dev) >> { >> diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c >> index 8da86ceca33d..309dee76724e 100644 >> --- a/net/l3mdev/l3mdev.c >> +++ b/net/l3mdev/l3mdev.c >> @@ -46,6 +46,24 @@ int l3mdev_master_ifindex_rcu(const struct net_device >> *dev) >> } >> EXPORT_SYMBOL_GPL(l3mdev_master_ifindex_rcu); >> >> +/** >> + * l3mdev_master_upper_ifindex_by_index - get index of upper l3 master >> + * device >> + * @net: network namespace for device index lookup >> + * @ifindex: targeted interface >> + */ >> +int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex) >> +{ >> + struct net_device *dev; >> + >> + dev = dev_get_by_index_rcu(net, ifindex); >> + while (dev && !netif_is_l3_master(dev)) >> + dev = netdev_master_upper_dev_get(dev); >> + >> + return dev ? dev->ifindex : 0; >> +} >> +EXPORT_SYMBOL_GPL(l3mdev_master_upper_ifindex_by_index_rcu); >> + >> /** >> * l3mdev_fib_table - get FIB table id associated with an L3 >> * master interface >> --