Re: [PATCH net-next 14/22] vxlan: Flow based tunneling

2015-07-21 Thread Thomas Graf
On 07/21/15 at 10:30am, Alexei Starovoitov wrote:
 RX:
 +info-mode = IP_TUNNEL_INFO_RX;
 +info-key.tun_flags = TUNNEL_KEY;
 +info-key.tun_id = cpu_to_be64(vni  8);
 ...
 TX:
 +dst_port = info-key.tp_dst ? : vxlan-dst_port;
 +vni = be64_to_cpu(info-key.tun_id);
 
 I think the copy paste of ovs_tunnel_info into ip_tunnel_info
 can be improved. In particular instead of '__be64 tun_id'
 we can use '__u64 tun_id' which will avoid extra byteswaps for rx/tx
 paths.
 
 netlink for this part also seems inconsistent.
 In the patch 16:
 +static const struct nla_policy ip_tun_policy[IP_TUN_MAX + 1] = {
 + [IP_TUN_ID] = { .type = NLA_U64 },
 ...
 + if (tb[IP_TUN_ID])
 + tun_info-key.tun_id = nla_get_u64(tb[IP_TUN_ID]);
 
 I think nla_get_be64 should be there?
 and with my suggestion we can add be64_to_cpu() here instead
 of doing it per packet.
 Thoughts?

I like this. The be64 originates from how OVS stores the tun_id in the
flow key. I agree that it makes sense to limit and delay the byteswaps
to when OVS inherits the flow key from the ip_tunnel_info. I will send
a follow-up.
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next 14/22] vxlan: Flow based tunneling

2015-07-21 Thread Alexei Starovoitov

On 7/21/15 1:43 AM, Thomas Graf wrote:

This prepares the VXLAN device to be steered by the routing and other
subsystems which allows to support encapsulation for a large number
of tunnel endpoints and tunnel ids through a single net_device which
improves the scalability.


+1. looks very useful.

RX:

+   info-mode = IP_TUNNEL_INFO_RX;
+   info-key.tun_flags = TUNNEL_KEY;
+   info-key.tun_id = cpu_to_be64(vni  8);

...
TX:

+   dst_port = info-key.tp_dst ? : vxlan-dst_port;
+   vni = be64_to_cpu(info-key.tun_id);


I think the copy paste of ovs_tunnel_info into ip_tunnel_info
can be improved. In particular instead of '__be64 tun_id'
we can use '__u64 tun_id' which will avoid extra byteswaps for rx/tx
paths.

netlink for this part also seems inconsistent.
In the patch 16:
+static const struct nla_policy ip_tun_policy[IP_TUN_MAX + 1] = {
+   [IP_TUN_ID] = { .type = NLA_U64 },
...
+   if (tb[IP_TUN_ID])
+   tun_info-key.tun_id = nla_get_u64(tb[IP_TUN_ID]);

I think nla_get_be64 should be there?
and with my suggestion we can add be64_to_cpu() here instead
of doing it per packet.
Thoughts?

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 14/22] vxlan: Flow based tunneling

2015-07-21 Thread Thomas Graf
Allows putting a VXLAN device into a new flow-based mode in which
skbs with a ip_tunnel_info dst metadata attached will be encapsulated
according to the instructions stored in there with the VXLAN device
defaults taken into consideration.

Similar on the receive side, if the VXLAN_F_COLLECT_METADATA flag is
set, the packet processing will populate a ip_tunnel_info struct for
each packet received and attach it to the skb using the new metadata
dst.  The metadata structure will contain the outer header and tunnel
header fields which have been stripped off. Layers further up in the
stack such as routing, tc or netfitler can later match on these fields
and perform forwarding. It is the responsibility of upper layers to
ensure that the flag is set if the metadata is needed. The flag limits
the additional cost of metadata collecting based on demand.

This prepares the VXLAN device to be steered by the routing and other
subsystems which allows to support encapsulation for a large number
of tunnel endpoints and tunnel ids through a single net_device which
improves the scalability.

It also allows for OVS to leverage this mode which in turn allows for
the removal of the OVS specific VXLAN code.

Because the skb is currently scrubed in vxlan_rcv(), the attachment of
the new dst metadata is postponed until after scrubing which requires
the temporary addition of a new member to vxlan_metadata. This member
is removed again in a later commit after the indirect VXLAN receive API
has been removed.

Signed-off-by: Thomas Graf tg...@suug.ch
Signed-off-by: Pravin B Shelar pshe...@nicira.com
---
 drivers/net/vxlan.c  | 149 ---
 include/linux/skbuff.h   |   1 +
 include/net/dst_metadata.h   |  13 
 include/net/ip_tunnels.h |  14 
 include/net/vxlan.h  |  10 ++-
 include/uapi/linux/if_link.h |   1 +
 6 files changed, 165 insertions(+), 23 deletions(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index ec86a11..06c092b 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -49,6 +49,7 @@
 #include net/ip6_tunnel.h
 #include net/ip6_checksum.h
 #endif
+#include net/dst_metadata.h
 
 #define VXLAN_VERSION  0.1
 
@@ -140,6 +141,11 @@ struct vxlan_dev {
 static u32 vxlan_salt __read_mostly;
 static struct workqueue_struct *vxlan_wq;
 
+static inline bool vxlan_collect_metadata(struct vxlan_sock *vs)
+{
+   return vs-flags  VXLAN_F_COLLECT_METADATA;
+}
+
 #if IS_ENABLED(CONFIG_IPV6)
 static inline
 bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
@@ -1164,10 +1170,13 @@ static struct vxlanhdr *vxlan_remcsum(struct sk_buff 
*skb, struct vxlanhdr *vh,
 /* Callback from net/ipv4/udp.c to receive packets */
 static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 {
+   struct metadata_dst *tun_dst = NULL;
+   struct ip_tunnel_info *info;
struct vxlan_sock *vs;
struct vxlanhdr *vxh;
u32 flags, vni;
-   struct vxlan_metadata md = {0};
+   struct vxlan_metadata _md;
+   struct vxlan_metadata *md = _md;
 
/* Need Vxlan and inner Ethernet header to be present */
if (!pskb_may_pull(skb, VXLAN_HLEN))
@@ -1202,6 +1211,33 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct 
sk_buff *skb)
vni = VXLAN_VNI_MASK;
}
 
+   if (vxlan_collect_metadata(vs)) {
+   const struct iphdr *iph = ip_hdr(skb);
+
+   tun_dst = metadata_dst_alloc(sizeof(*md), GFP_ATOMIC);
+   if (!tun_dst)
+   goto drop;
+
+   info = tun_dst-u.tun_info;
+   info-key.ipv4_src = iph-saddr;
+   info-key.ipv4_dst = iph-daddr;
+   info-key.ipv4_tos = iph-tos;
+   info-key.ipv4_ttl = iph-ttl;
+   info-key.tp_src = udp_hdr(skb)-source;
+   info-key.tp_dst = udp_hdr(skb)-dest;
+
+   info-mode = IP_TUNNEL_INFO_RX;
+   info-key.tun_flags = TUNNEL_KEY;
+   info-key.tun_id = cpu_to_be64(vni  8);
+   if (udp_hdr(skb)-check != 0)
+   info-key.tun_flags |= TUNNEL_CSUM;
+
+   md = ip_tunnel_info_opts(info, sizeof(*md));
+   md-tun_dst = tun_dst;
+   } else {
+   memset(md, 0, sizeof(*md));
+   }
+
/* For backwards compatibility, only allow reserved fields to be
 * used by VXLAN extensions if explicitly requested.
 */
@@ -1209,13 +1245,16 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct 
sk_buff *skb)
struct vxlanhdr_gbp *gbp;
 
gbp = (struct vxlanhdr_gbp *)vxh;
-   md.gbp = ntohs(gbp-policy_id);
+   md-gbp = ntohs(gbp-policy_id);
+
+   if (tun_dst)
+   info-key.tun_flags |= TUNNEL_VXLAN_OPT;
 
if (gbp-dont_learn)
-   md.gbp |= VXLAN_GBP_DONT_LEARN;
+

Re: [PATCH net-next 14/22] vxlan: Flow based tunneling

2015-07-20 Thread Thomas Graf
On 07/17/15 at 11:41am, Alexei Starovoitov wrote:
 On 7/17/15 5:55 AM, Thomas Graf wrote:
 @@ -2373,6 +2470,12 @@ static void vxlan_setup(struct net_device *dev)
  netif_keep_dst(dev);
  dev-priv_flags |= IFF_LIVE_ADDR_CHANGE;
 
 +/* If in flow based mode, keep the dst including encapsulation
 + * instructions for vxlan_xmit().
 + */
 +if (vxlan-flags  VXLAN_F_FLOW_BASED)
 +netif_keep_dst(dev);
 
 hmm, isn't this done already few lines above? ;)

Good point, fixed ;)
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next 14/22] vxlan: Flow based tunneling

2015-07-20 Thread Marcelo Ricardo Leitner
On Mon, Jul 20, 2015 at 02:15:22PM -0300, Marcelo Ricardo Leitner wrote:
 On Fri, Jul 17, 2015 at 02:55:33PM +0200, Thomas Graf wrote:
 
 [ snip ]
 
  @@ -2373,6 +2470,12 @@ static void vxlan_setup(struct net_device *dev)
  netif_keep_dst(dev); --- (a)
  dev-priv_flags |= IFF_LIVE_ADDR_CHANGE;
   
  +   /* If in flow based mode, keep the dst including encapsulation
  +* instructions for vxlan_xmit().
  +*/
  +   if (vxlan-flags  VXLAN_F_FLOW_BASED)
  +   netif_keep_dst(dev); --- (b)
  +
  INIT_LIST_HEAD(vxlan-next);
  spin_lock_init(vxlan-hash_lock);
   
 
 Sounds like after 0287587884b1 (net: better IFF_XMIT_DST_RELEASE
 support), which introduced (a) above, (b) is not needed?

Oops, it was already reported by Alexei.

  Marcelo

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next 14/22] vxlan: Flow based tunneling

2015-07-20 Thread Marcelo Ricardo Leitner
On Fri, Jul 17, 2015 at 02:55:33PM +0200, Thomas Graf wrote:

[ snip ]

 @@ -2373,6 +2470,12 @@ static void vxlan_setup(struct net_device *dev)
   netif_keep_dst(dev); --- (a)
   dev-priv_flags |= IFF_LIVE_ADDR_CHANGE;
  
 + /* If in flow based mode, keep the dst including encapsulation
 +  * instructions for vxlan_xmit().
 +  */
 + if (vxlan-flags  VXLAN_F_FLOW_BASED)
 + netif_keep_dst(dev); --- (b)
 +
   INIT_LIST_HEAD(vxlan-next);
   spin_lock_init(vxlan-hash_lock);
  

Sounds like after 0287587884b1 (net: better IFF_XMIT_DST_RELEASE
support), which introduced (a) above, (b) is not needed?

  Marcelo

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 14/22] vxlan: Flow based tunneling

2015-07-17 Thread Thomas Graf
Allows putting a VXLAN device into a new flow-based mode in which
skbs with a ip_tunnel_info dst metadata attached will be encapsulated
according to the instructions stored in there with the VXLAN device
defaults taken into consideration.

Similar on the receive side, if the VXLAN_F_COLLECT_METADATA flag is
set, the packet processing will populate a ip_tunnel_info struct for
each packet received and attach it to the skb using the new metadata
dst.  The metadata structure will contain the outer header and tunnel
header fields which have been stripped off. Layers further up in the
stack such as routing, tc or netfitler can later match on these fields
and perform forwarding. It is the responsibility of upper layers to
ensure that the flag is set if the metadata is needed. The flag limits
the additional cost of metadata collecting based on demand.

This prepares the VXLAN device to be steered by the routing and other
subsystems which allows to support encapsulation for a large number
of tunnel endpoints and tunnel ids through a single net_device which
improves the scalability.

It also allows for OVS to leverage this mode which in turn allows for
the removal of the OVS specific VXLAN code.

Because the skb is currently scrubed in vxlan_rcv(), the attachment of
the new dst metadata is postponed until after scrubing which requires
the temporary addition of a new member to vxlan_metadata. This member
is removed again in a later commit after the indirect VXLAN receive API
has been removed.

Signed-off-by: Thomas Graf tg...@suug.ch
Signed-off-by: Pravin B Shelar pshe...@nicira.com
---
 drivers/net/vxlan.c  | 155 +--
 include/linux/skbuff.h   |   1 +
 include/net/dst_metadata.h   |  13 
 include/net/ip_tunnels.h |  14 
 include/net/vxlan.h  |  10 ++-
 include/uapi/linux/if_link.h |   1 +
 6 files changed, 171 insertions(+), 23 deletions(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 34c519e..994d89c 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -49,6 +49,7 @@
 #include net/ip6_tunnel.h
 #include net/ip6_checksum.h
 #endif
+#include net/dst_metadata.h
 
 #define VXLAN_VERSION  0.1
 
@@ -140,6 +141,11 @@ struct vxlan_dev {
 static u32 vxlan_salt __read_mostly;
 static struct workqueue_struct *vxlan_wq;
 
+static inline bool vxlan_collect_metadata(struct vxlan_sock *vs)
+{
+   return vs-flags  VXLAN_F_COLLECT_METADATA;
+}
+
 #if IS_ENABLED(CONFIG_IPV6)
 static inline
 bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
@@ -1164,10 +1170,13 @@ static struct vxlanhdr *vxlan_remcsum(struct sk_buff 
*skb, struct vxlanhdr *vh,
 /* Callback from net/ipv4/udp.c to receive packets */
 static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 {
+   struct metadata_dst *tun_dst = NULL;
+   struct ip_tunnel_info *info;
struct vxlan_sock *vs;
struct vxlanhdr *vxh;
u32 flags, vni;
-   struct vxlan_metadata md = {0};
+   struct vxlan_metadata _md;
+   struct vxlan_metadata *md = _md;
 
/* Need Vxlan and inner Ethernet header to be present */
if (!pskb_may_pull(skb, VXLAN_HLEN))
@@ -1202,6 +1211,33 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct 
sk_buff *skb)
vni = VXLAN_VNI_MASK;
}
 
+   if (vxlan_collect_metadata(vs)) {
+   const struct iphdr *iph = ip_hdr(skb);
+
+   tun_dst = metadata_dst_alloc(sizeof(*md), GFP_ATOMIC);
+   if (!tun_dst)
+   goto drop;
+
+   info = tun_dst-u.tun_info;
+   info-key.ipv4_src = iph-saddr;
+   info-key.ipv4_dst = iph-daddr;
+   info-key.ipv4_tos = iph-tos;
+   info-key.ipv4_ttl = iph-ttl;
+   info-key.tp_src = udp_hdr(skb)-source;
+   info-key.tp_dst = udp_hdr(skb)-dest;
+
+   info-mode = IP_TUNNEL_INFO_RX;
+   info-key.tun_flags = TUNNEL_KEY;
+   info-key.tun_id = cpu_to_be64(vni  8);
+   if (udp_hdr(skb)-check != 0)
+   info-key.tun_flags |= TUNNEL_CSUM;
+
+   md = ip_tunnel_info_opts(info, sizeof(*md));
+   md-tun_dst = tun_dst;
+   } else {
+   memset(md, 0, sizeof(*md));
+   }
+
/* For backwards compatibility, only allow reserved fields to be
 * used by VXLAN extensions if explicitly requested.
 */
@@ -1209,13 +1245,16 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct 
sk_buff *skb)
struct vxlanhdr_gbp *gbp;
 
gbp = (struct vxlanhdr_gbp *)vxh;
-   md.gbp = ntohs(gbp-policy_id);
+   md-gbp = ntohs(gbp-policy_id);
+
+   if (tun_dst)
+   info-key.tun_flags |= TUNNEL_VXLAN_OPT;
 
if (gbp-dont_learn)
-   md.gbp |= VXLAN_GBP_DONT_LEARN;
+

Re: [PATCH net-next 14/22] vxlan: Flow based tunneling

2015-07-17 Thread Alexei Starovoitov

On 7/17/15 5:55 AM, Thomas Graf wrote:

@@ -2373,6 +2470,12 @@ static void vxlan_setup(struct net_device *dev)
netif_keep_dst(dev);
dev-priv_flags |= IFF_LIVE_ADDR_CHANGE;

+   /* If in flow based mode, keep the dst including encapsulation
+* instructions for vxlan_xmit().
+*/
+   if (vxlan-flags  VXLAN_F_FLOW_BASED)
+   netif_keep_dst(dev);


hmm, isn't this done already few lines above? ;)
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html