date:20160506

[PATCH v2 next-next 06/12] fou: Split out {fou,gue}_build_header

2016-05-06 Thread Tom Herbert

Create __fou_build_header and __gue_build_header. These implement the
protocol generic parts of building the fou and gue header.
fou_build_header and gue_build_header implement the IPv4 specific
functions and call the __*_build_header functions.

Signed-off-by: Tom Herbert 
---
 include/net/fou.h |  8 
 net/ipv4/fou.c| 47 +--
 2 files changed, 41 insertions(+), 14 deletions(-)

diff --git a/include/net/fou.h b/include/net/fou.h
index 19b8a0c..7d2fda2 100644
--- a/include/net/fou.h
+++ b/include/net/fou.h
@@ -11,9 +11,9 @@
 size_t fou_encap_hlen(struct ip_tunnel_encap *e);
 static size_t gue_encap_hlen(struct ip_tunnel_encap *e);
 
-int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
-u8 *protocol, struct flowi4 *fl4);
-int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
-u8 *protocol, struct flowi4 *fl4);
+int __fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+  u8 *protocol, __be16 *sport, int type);
+int __gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+  u8 *protocol, __be16 *sport, int type);
 
 #endif
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index a8b5cbf..971c8c6 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -778,6 +778,22 @@ static void fou_build_udp(struct sk_buff *skb, struct 
ip_tunnel_encap *e,
*protocol = IPPROTO_UDP;
 }
 
+int __fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+  u8 *protocol, __be16 *sport, int type)
+{
+   int err;
+
+   err = iptunnel_handle_offloads(skb, type);
+   if (err)
+   return err;
+
+   *sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev),
+   skb, 0, 0, false);
+
+   return 0;
+}
+EXPORT_SYMBOL(__fou_build_header);
+
 int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
 u8 *protocol, struct flowi4 *fl4)
 {
@@ -786,26 +802,21 @@ int fou_build_header(struct sk_buff *skb, struct 
ip_tunnel_encap *e,
__be16 sport;
int err;
 
-   err = iptunnel_handle_offloads(skb, type);
+   err = __fou_build_header(skb, e, protocol, &sport, type);
if (err)
return err;
 
-   sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev),
-  skb, 0, 0, false);
fou_build_udp(skb, e, fl4, protocol, sport);
 
return 0;
 }
 EXPORT_SYMBOL(fou_build_header);
 
-int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
-u8 *protocol, struct flowi4 *fl4)
+int __gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+  u8 *protocol, __be16 *sport, int type)
 {
-   int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM :
-  SKB_GSO_UDP_TUNNEL;
struct guehdr *guehdr;
size_t hdrlen, optlen = 0;
-   __be16 sport;
void *data;
bool need_priv = false;
int err;
@@ -824,8 +835,8 @@ int gue_build_header(struct sk_buff *skb, struct 
ip_tunnel_encap *e,
return err;
 
/* Get source port (based on flow hash) before skb_push */
-   sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev),
-  skb, 0, 0, false);
+   *sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev),
+   skb, 0, 0, false);
 
hdrlen = sizeof(struct guehdr) + optlen;
 
@@ -870,6 +881,22 @@ int gue_build_header(struct sk_buff *skb, struct 
ip_tunnel_encap *e,
 
}
 
+   return 0;
+}
+EXPORT_SYMBOL(__gue_build_header);
+
+int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+u8 *protocol, struct flowi4 *fl4)
+{
+   int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM :
+  SKB_GSO_UDP_TUNNEL;
+   __be16 sport;
+   int err;
+
+   err = __gue_build_header(skb, e, protocol, &sport, type);
+   if (err)
+   return err;
+
fou_build_udp(skb, e, fl4, protocol, sport);
 
return 0;
-- 
2.8.0.rc2

[PATCH v2 next-next 07/12] fou: Add encap ops for IPv6 tunnels

2016-05-06 Thread Tom Herbert

This patch adds IP tunnel encapsulation operations for IPv6. This
includes the infrastructure to add and delete operations. IPv6 variants
for fou6_build_header and gue6_build_header are added in a new
fou6 module. These encapsulation operations for fou and gue are
automatically added when the fou6 module loads.

Signed-off-by: Tom Herbert 
---
 include/net/fou.h  |   2 +-
 include/net/ip6_tunnel.h   |  14 +
 net/ipv6/Makefile  |   4 +-
 net/ipv6/fou6.c| 140 +
 net/ipv6/ip6_tunnel_core.c |  44 ++
 5 files changed, 202 insertions(+), 2 deletions(-)
 create mode 100644 net/ipv6/fou6.c
 create mode 100644 net/ipv6/ip6_tunnel_core.c

diff --git a/include/net/fou.h b/include/net/fou.h
index 7d2fda2..f5cc691 100644
--- a/include/net/fou.h
+++ b/include/net/fou.h
@@ -9,7 +9,7 @@
 #include 
 
 size_t fou_encap_hlen(struct ip_tunnel_encap *e);
-static size_t gue_encap_hlen(struct ip_tunnel_encap *e);
+size_t gue_encap_hlen(struct ip_tunnel_encap *e);
 
 int __fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
   u8 *protocol, __be16 *sport, int type);
diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h
index fb9e015..1c14c27 100644
--- a/include/net/ip6_tunnel.h
+++ b/include/net/ip6_tunnel.h
@@ -34,6 +34,20 @@ struct __ip6_tnl_parm {
__be32  o_key;
 };
 
+struct ip6_tnl_encap_ops {
+   size_t (*encap_hlen)(struct ip_tunnel_encap *e);
+   int (*build_header)(struct sk_buff *skb, struct ip_tunnel_encap *e,
+   u8 *protocol, struct flowi6 *fl6);
+};
+
+extern const struct ip6_tnl_encap_ops __rcu *
+   ip6tun_encaps[MAX_IPTUN_ENCAP_OPS];
+
+int ip6_tnl_encap_add_ops(const struct ip6_tnl_encap_ops *op,
+ unsigned int num);
+int ip6_tnl_encap_del_ops(const struct ip6_tnl_encap_ops *op,
+ unsigned int num);
+
 /* IPv6 tunnel */
 struct ip6_tnl {
struct ip6_tnl __rcu *next; /* next tunnel in list */
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index 5e9d6bf..5cf4a1f 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -9,7 +9,7 @@ ipv6-objs :=af_inet6.o anycast.o ip6_output.o ip6_input.o 
addrconf.o \
route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \
raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o ping.o \
exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o \
-   udp_offload.o
+   udp_offload.o ip6_tunnel_core.o
 
 ipv6-offload :=ip6_offload.o tcpv6_offload.o exthdrs_offload.o
 
@@ -43,6 +43,8 @@ obj-$(CONFIG_IPV6_SIT) += sit.o
 obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o
 obj-$(CONFIG_IPV6_GRE) += ip6_gre.o
 
+obj-$(CONFIG_NET_FOU) += fou6.o
+
 obj-y += addrconf_core.o exthdrs_core.o ip6_checksum.o ip6_icmp.o
 obj-$(CONFIG_INET) += output_core.o protocol.o $(ipv6-offload)
 
diff --git a/net/ipv6/fou6.c b/net/ipv6/fou6.c
new file mode 100644
index 000..c972d0b
--- /dev/null
+++ b/net/ipv6/fou6.c
@@ -0,0 +1,140 @@
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+static void fou6_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e,
+  struct flowi6 *fl6, u8 *protocol, __be16 sport)
+{
+   struct udphdr *uh;
+
+   skb_push(skb, sizeof(struct udphdr));
+   skb_reset_transport_header(skb);
+
+   uh = udp_hdr(skb);
+
+   uh->dest = e->dport;
+   uh->source = sport;
+   uh->len = htons(skb->len);
+   udp6_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM6), skb,
+ &fl6->saddr, &fl6->daddr, skb->len);
+
+   *protocol = IPPROTO_UDP;
+}
+
+int fou6_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+ u8 *protocol, struct flowi6 *fl6)
+{
+   __be16 sport;
+   int err;
+   int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM6 ?
+   SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
+
+   err = __fou_build_header(skb, e, protocol, &sport, type);
+   if (err)
+   return err;
+
+   fou6_build_udp(skb, e, fl6, protocol, sport);
+
+   return 0;
+}
+EXPORT_SYMBOL(fou6_build_header);
+
+int gue6_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
+ u8 *protocol, struct flowi6 *fl6)
+{
+   __be16 sport;
+   int err;
+   int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM6 ?
+   SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
+
+   err = __gue_build_header(skb, e, protocol, &sport, type);
+   if (err)
+   return err;
+
+   fou6_build_udp(skb, e, fl6, protocol, sport);
+
+   return 0;
+}
+EXPORT_SYMBOL(gue6_build_header);
+
+#ifdef CONFIG_NET_FOU_IP_TUNNELS
+
+static const struct ip6_tnl_encap_ops fou_ip6tun_ops = {
+   .encap_hlen = fou

[PATCH v2 next-next 10/12] fou: Support IPv6 in fou

2016-05-06 Thread Tom Herbert

This patch adds receive path support for IPv6 with fou.

- Add address family to fou structure for open sockets. This supports
  AF_INET and AF_INET6. Lookups for fou ports are performed on both the
  port number and family.
- In fou and gue receive adjust tot_len in IPv4 header or payload_len
  based on address family.
- Allow AF_INET6 in FOU_ATTR_AF netlink attribute.

Signed-off-by: Tom Herbert 
---
 net/ipv4/fou.c | 47 +++
 1 file changed, 35 insertions(+), 12 deletions(-)

diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 971c8c6..75db828 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -21,6 +21,7 @@ struct fou {
u8 protocol;
u8 flags;
__be16 port;
+   u8 family;
u16 type;
struct list_head list;
struct rcu_head rcu;
@@ -47,14 +48,17 @@ static inline struct fou *fou_from_sock(struct sock *sk)
return sk->sk_user_data;
 }
 
-static int fou_recv_pull(struct sk_buff *skb, size_t len)
+static int fou_recv_pull(struct sk_buff *skb, struct fou *fou, size_t len)
 {
-   struct iphdr *iph = ip_hdr(skb);
-
/* Remove 'len' bytes from the packet (UDP header and
 * FOU header if present).
 */
-   iph->tot_len = htons(ntohs(iph->tot_len) - len);
+   if (fou->family == AF_INET)
+   ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(skb)->tot_len) - len);
+   else
+   ipv6_hdr(skb)->payload_len =
+   htons(ntohs(ipv6_hdr(skb)->payload_len) - len);
+
__skb_pull(skb, len);
skb_postpull_rcsum(skb, udp_hdr(skb), len);
skb_reset_transport_header(skb);
@@ -68,7 +72,7 @@ static int fou_udp_recv(struct sock *sk, struct sk_buff *skb)
if (!fou)
return 1;
 
-   if (fou_recv_pull(skb, sizeof(struct udphdr)))
+   if (fou_recv_pull(skb, fou, sizeof(struct udphdr)))
goto drop;
 
return -fou->protocol;
@@ -141,7 +145,11 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff 
*skb)
 
hdrlen = sizeof(struct guehdr) + optlen;
 
-   ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(skb)->tot_len) - len);
+   if (fou->family == AF_INET)
+   ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(skb)->tot_len) - len);
+   else
+   ipv6_hdr(skb)->payload_len =
+   htons(ntohs(ipv6_hdr(skb)->payload_len) - len);
 
/* Pull csum through the guehdr now . This can be used if
 * there is a remote checksum offload.
@@ -424,7 +432,8 @@ static int fou_add_to_port_list(struct net *net, struct fou 
*fou)
 
mutex_lock(&fn->fou_lock);
list_for_each_entry(fout, &fn->fou_list, list) {
-   if (fou->port == fout->port) {
+   if (fou->port == fout->port &&
+   fou->family == fout->family) {
mutex_unlock(&fn->fou_lock);
return -EALREADY;
}
@@ -469,8 +478,9 @@ static int fou_create(struct net *net, struct fou_cfg *cfg,
 
sk = sock->sk;
 
-   fou->flags = cfg->flags;
fou->port = cfg->udp_config.local_udp_port;
+   fou->family = cfg->udp_config.family;
+   fou->flags = cfg->flags;
fou->type = cfg->type;
fou->sock = sock;
 
@@ -522,12 +532,13 @@ static int fou_destroy(struct net *net, struct fou_cfg 
*cfg)
 {
struct fou_net *fn = net_generic(net, fou_net_id);
__be16 port = cfg->udp_config.local_udp_port;
+   u8 family = cfg->udp_config.family;
int err = -EINVAL;
struct fou *fou;
 
mutex_lock(&fn->fou_lock);
list_for_each_entry(fou, &fn->fou_list, list) {
-   if (fou->port == port) {
+   if (fou->port == port && fou->family == family) {
fou_release(fou);
err = 0;
break;
@@ -565,8 +576,15 @@ static int parse_nl_config(struct genl_info *info,
if (info->attrs[FOU_ATTR_AF]) {
u8 family = nla_get_u8(info->attrs[FOU_ATTR_AF]);
 
-   if (family != AF_INET)
-   return -EINVAL;
+   switch (family) {
+   case AF_INET:
+   break;
+   case AF_INET6:
+   cfg->udp_config.ipv6_v6only = 1;
+   break;
+   default:
+   return -EAFNOSUPPORT;
+   }
 
cfg->udp_config.family = family;
}
@@ -657,6 +675,7 @@ static int fou_nl_cmd_get_port(struct sk_buff *skb, struct 
genl_info *info)
struct fou_cfg cfg;
struct fou *fout;
__be16 port;
+   u8 family;
int ret;
 
ret = parse_nl_config(info, &cfg);
@@ -666,6 +685,10 @@ static int fou_nl_cmd_get_port(struct sk_buff *skb, struct 
genl_info *info)
if (port == 0)
return -EINVAL;
 
+   family = cfg.udp_config.family;
+   if (family != AF_I

[PATCH v2 next-next 09/12] ipv6: Change "final" protocol processing for encapsulation

2016-05-06 Thread Tom Herbert

When performing foo-over-UDP, UDP packets are received and processed by
the encapsulation handler, which returns another protocol to process.
This may result in processing two (or more) protocols in the
loop that are marked as INET6_PROTO_FINAL. The actions taken
for hitting a final protocol, in particular the skb_postpull_rcsum,
can only be performed once.

This patch adds a check for whether a final protocol has been seen. The
rules are:
  - If the final protocol has not been seen any protocol is processed
(final and non-final). In the case of a final protocol, the final
actions are taken (like the skb_postpull_rcsum)
  - If a final protocol has been seen (e.g. an encapsulating UDP
header) then no further non-final protocols are allowed
(e.g. extension headers). For more final protocols the
final actions are not taken (e.g. skb_postpull_rcsum).

Signed-off-by: Tom Herbert 
---
 net/ipv6/ip6_input.c | 15 ++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 2a0258a..7d98d01 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -216,6 +216,7 @@ static int ip6_input_finish(struct net *net, struct sock 
*sk, struct sk_buff *sk
unsigned int nhoff;
int nexthdr;
bool raw;
+   bool have_final = false;
 
/*
 *  Parse extension headers
@@ -235,9 +236,21 @@ resubmit:
if (ipprot) {
int ret;
 
-   if (ipprot->flags & INET6_PROTO_FINAL) {
+   if (have_final) {
+   if (!(ipprot->flags & INET6_PROTO_FINAL)) {
+   /* Once we've seen a final protocol don't
+* allow encapsulation on any non-final
+* ones. This allows foo in UDP encapsulation
+* to work.
+*/
+   goto discard;
+   }
+   } else if (ipprot->flags & INET6_PROTO_FINAL) {
const struct ipv6hdr *hdr;
 
+   /* Only do this once for first final protocol */
+   have_final = true;
+
/* Free reference early: we don't need it any more,
   and it may hold ip_conntrack module loaded
   indefinitely. */
-- 
2.8.0.rc2

[PATCH v2 next-next 03/12] gre6: Fix flag translations

2016-05-06 Thread Tom Herbert

GRE for IPv6 does not properly translate GRE flags to tunnel
flags and vice versa. This patch fixes that.

Signed-off-by: Tom Herbert 
---
 net/ipv6/ip6_gre.c | 20 
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 47b671a..70a1f72 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -799,8 +799,8 @@ static void ip6gre_tnl_parm_from_user(struct __ip6_tnl_parm 
*p,
p->link = u->link;
p->i_key = u->i_key;
p->o_key = u->o_key;
-   p->i_flags = u->i_flags;
-   p->o_flags = u->o_flags;
+   p->i_flags = gre_flags_to_tnl_flags(u->i_flags);
+   p->o_flags = gre_flags_to_tnl_flags(u->o_flags);
memcpy(p->name, u->name, sizeof(u->name));
 }
 
@@ -817,8 +817,8 @@ static void ip6gre_tnl_parm_to_user(struct ip6_tnl_parm2 *u,
u->link = p->link;
u->i_key = p->i_key;
u->o_key = p->o_key;
-   u->i_flags = p->i_flags;
-   u->o_flags = p->o_flags;
+   u->i_flags = gre_tnl_flags_to_gre_flags(p->i_flags);
+   u->o_flags = gre_tnl_flags_to_gre_flags(p->o_flags);
memcpy(u->name, p->name, sizeof(u->name));
 }
 
@@ -1217,10 +1217,12 @@ static void ip6gre_netlink_parms(struct nlattr *data[],
parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
 
if (data[IFLA_GRE_IFLAGS])
-   parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
+   parms->i_flags = gre_flags_to_tnl_flags(
+   nla_get_be16(data[IFLA_GRE_IFLAGS]));
 
if (data[IFLA_GRE_OFLAGS])
-   parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
+   parms->o_flags = gre_flags_to_tnl_flags(
+   nla_get_be16(data[IFLA_GRE_OFLAGS]));
 
if (data[IFLA_GRE_IKEY])
parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
@@ -1412,8 +1414,10 @@ static int ip6gre_fill_info(struct sk_buff *skb, const 
struct net_device *dev)
struct __ip6_tnl_parm *p = &t->parms;
 
if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
-   nla_put_be16(skb, IFLA_GRE_IFLAGS, p->i_flags) ||
-   nla_put_be16(skb, IFLA_GRE_OFLAGS, p->o_flags) ||
+   nla_put_be16(skb, IFLA_GRE_IFLAGS,
+gre_tnl_flags_to_gre_flags(p->i_flags)) ||
+   nla_put_be16(skb, IFLA_GRE_OFLAGS,
+gre_tnl_flags_to_gre_flags(p->o_flags)) ||
nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
nla_put_in6_addr(skb, IFLA_GRE_LOCAL, &p->laddr) ||
-- 
2.8.0.rc2

[PATCH v2 next-next 11/12] ip6_tun: Add infrastructure for doing encapsulation

2016-05-06 Thread Tom Herbert

Add encap_hlen and ip_tunnel_encap structure to ip6_tnl. Add functions
for getting encap hlen, setting up encap on a tunnel, and performing the
encapsulation operation.

Signed-off-by: Tom Herbert 
---
 include/net/ip6_tunnel.h   |  8 +-
 net/ipv6/ip6_tunnel.c  |  4 +++
 net/ipv6/ip6_tunnel_core.c | 64 ++
 3 files changed, 75 insertions(+), 1 deletion(-)

diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h
index 1c14c27..1b8db86 100644
--- a/include/net/ip6_tunnel.h
+++ b/include/net/ip6_tunnel.h
@@ -66,10 +66,16 @@ struct ip6_tnl {
__u32 o_seqno;  /* The last output seqno */
int hlen;   /* tun_hlen + encap_hlen */
int tun_hlen;   /* Precalculated header length */
+   int encap_hlen; /* Encap header length (FOU,GUE) */
+   struct ip_tunnel_encap encap;
int mlink;
-
 };
 
+int ip6_tnl_encap_setup(struct ip6_tnl *t,
+   struct ip_tunnel_encap *ipencap);
+int ip6_tnl_encap(struct sk_buff *skb, struct ip6_tnl *t,
+ u8 *protocol, struct flowi6 *fl6);
+
 /* Tunnel encapsulation limit destination sub-option */
 
 struct ipv6_tlv_tnl_enc_lim {
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index ade55af..2c096ab 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1013,6 +1013,10 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device 
*dev, __u8 dsfield,
unsigned int max_headroom = sizeof(struct ipv6hdr);
int err = -1;
 
+   err = ip6_tnl_encap(skb, t, &proto, fl6);
+   if (err)
+   return err;
+
/* NBMA tunnel */
if (ipv6_addr_any(&t->parms.raddr)) {
struct in6_addr *addr6;
diff --git a/net/ipv6/ip6_tunnel_core.c b/net/ipv6/ip6_tunnel_core.c
index 5f5b79e..94aa414 100644
--- a/net/ipv6/ip6_tunnel_core.c
+++ b/net/ipv6/ip6_tunnel_core.c
@@ -42,3 +42,67 @@ int ip6_tnl_encap_del_ops(const struct ip6_tnl_encap_ops 
*ops,
 }
 EXPORT_SYMBOL(ip6_tnl_encap_del_ops);
 
+static int ip6_encap_hlen(struct ip_tunnel_encap *e)
+{
+   const struct ip6_tnl_encap_ops *ops;
+   int hlen = -EINVAL;
+
+   if (e->type == TUNNEL_ENCAP_NONE)
+   return 0;
+
+   if (e->type >= MAX_IPTUN_ENCAP_OPS)
+   return -EINVAL;
+
+   rcu_read_lock();
+   ops = rcu_dereference(ip6tun_encaps[e->type]);
+   if (likely(ops && ops->encap_hlen))
+   hlen = ops->encap_hlen(e);
+   rcu_read_unlock();
+
+   return hlen;
+}
+
+int ip6_tnl_encap_setup(struct ip6_tnl *t,
+   struct ip_tunnel_encap *ipencap)
+{
+   int hlen;
+
+   memset(&t->encap, 0, sizeof(t->encap));
+
+   hlen = ip6_encap_hlen(ipencap);
+   if (hlen < 0)
+   return hlen;
+
+   t->encap.type = ipencap->type;
+   t->encap.sport = ipencap->sport;
+   t->encap.dport = ipencap->dport;
+   t->encap.flags = ipencap->flags;
+
+   t->encap_hlen = hlen;
+   t->hlen = t->encap_hlen + t->tun_hlen;
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(ip6_tnl_encap_setup);
+
+int ip6_tnl_encap(struct sk_buff *skb, struct ip6_tnl *t,
+ u8 *protocol, struct flowi6 *fl6)
+{
+   const struct ip6_tnl_encap_ops *ops;
+   int ret = -EINVAL;
+
+   if (t->encap.type == TUNNEL_ENCAP_NONE)
+   return 0;
+
+   if (t->encap.type >= MAX_IPTUN_ENCAP_OPS)
+   return -EINVAL;
+
+   rcu_read_lock();
+   ops = rcu_dereference(ip6tun_encaps[t->encap.type]);
+   if (likely(ops && ops->build_header))
+   ret = ops->build_header(skb, &t->encap, protocol, fl6);
+   rcu_read_unlock();
+
+   return ret;
+}
+EXPORT_SYMBOL(ip6_tnl_encap);
-- 
2.8.0.rc2

[PATCH v2 next-next 00/12] ipv6: Enable GUEoIPv6 and more fixes for v6 tunneling

2016-05-06 Thread Tom Herbert

This patch set:
  - Fixes GRE6 to translate flags correctly from configuration
  - Adds support for GSO and GRO for ip6ip6 and ip4ip6
  - Add support for FOU and GUE in IPv6
  - Support GRE, ip6ip6 and ip4ip6 over FOU/GUE
  - Fixes ip6_input to deal with UDP encapsulations
  - Some other minor fixes

v2:
  - Removed a check of GSO types in MPLS
  - Define GSO type SKB_GSO_IPXIP6 and SKB_GSO_IPXIP4 (based on input
from Alexander)
  - Don't define GSO types specifically for IP6IP6 and IP4IP6, above
fix makes that unnecessary
  - Don't bother clearing encapsulation flag in UDP tunnel segment
(another item suggested by Alexander).

Tested:
   Tested a variety of cases, but not the full matrix (which is quite
   large now). Most of the obvious cases (e.g. GRE) work fine. Still
   some issues probably with GSO/GRO being effective in all cases.

- IPv4/GRE/GUE/IPv6 with RCO
  1 TCP_STREAM
6616 Mbps
  200 TCP_RR
1244043 tps
141/243/446 90/95/99% latencies
86.61% CPU utilization
- IPv6/GRE/GUE/IPv6 with RCO
  1 TCP_STREAM
6940 Mbps
  200 TCP_RR
1270903 tps
138/236/440 90/95/99% latencies
87.51% CPU utilization

 - IP6IP6
  1 TCP_STREAM
2576 Mbps
  200 TCP_RR
498981 tps
388/498/631 90/95/99% latencies
19.75% CPU utilization (1 CPU saturated)

 - IP6IP6/GUE/IPv6 with RCO
  1 TCP_STREAM
1854 Mbps
  200 TCP_RR
1233818 tps
143/244/451 90/95/99% latencies
87.57% CPU utilization

 - IP4IP6
  1 TCP_STREAM
  200 TCP_RR
763774 tps
250/318/466 90/95/99% latencies
35.25% CPU utilization (1 CPU saturated)

 - GRE with keyid
  200 TCP_RR
744173 tps
258/332/461 90/95/99% latencies
34.59% CPU utilization (1 CPU saturated)
  
Tom Herbert (12):
  gso: Remove arbitrary checks for unsupported GSO
  net: define gso types for IPx over IPv4 and IPv6
  gre6: Fix flag translations
  udp: Don't set skb->encapsulation with RCO
  fou: Call setup_udp_tunnel_sock
  fou: Split out {fou,gue}_build_header
  fou: Add encap ops for IPv6 tunnels
  ipv6: Fix nexthdr for reinjection
  ipv6: Change "final" protocol processing for encapsulation
  fou: Support IPv6 in fou
  ip6_tun: Add infrastructure for doing encapsulation
  ip6_gre: Add support for fou/gue encapsulation

 drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c  |   5 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt.c |   4 +-
 drivers/net/ethernet/intel/i40e/i40e_main.c   |   3 +-
 drivers/net/ethernet/intel/i40e/i40e_txrx.c   |   3 +-
 drivers/net/ethernet/intel/i40evf/i40e_txrx.c |   3 +-
 drivers/net/ethernet/intel/i40evf/i40evf_main.c   |   3 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   3 +-
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c |   3 +-
 include/linux/netdev_features.h   |  12 +-
 include/linux/netdevice.h |   4 +-
 include/linux/skbuff.h|   4 +-
 include/net/fou.h |  10 +-
 include/net/ip6_tunnel.h  |  22 +++-
 net/core/ethtool.c|   4 +-
 net/ipv4/af_inet.c|  20 +--
 net/ipv4/fou.c| 144 +-
 net/ipv4/gre_offload.c|  14 ---
 net/ipv4/ipip.c   |   2 +-
 net/ipv4/tcp_offload.c|  19 ---
 net/ipv4/udp_offload.c|  19 +--
 net/ipv6/Makefile |   4 +-
 net/ipv6/fou6.c   | 140 +
 net/ipv6/ip6_gre.c|  95 --
 net/ipv6/ip6_input.c  |  24 +++-
 net/ipv6/ip6_offload.c|  22 +---
 net/ipv6/ip6_tunnel.c |   4 +
 net/ipv6/ip6_tunnel_core.c| 108 
 net/ipv6/sit.c|   4 +-
 net/ipv6/udp_offload.c|  13 --
 net/mpls/mpls_gso.c   |   9 --
 net/netfilter/ipvs/ip_vs_xmit.c   |  11 +-
 31 files changed, 511 insertions(+), 224 deletions(-)
 create mode 100644 net/ipv6/fou6.c
 create mode 100644 net/ipv6/ip6_tunnel_core.c

-- 
2.8.0.rc2

[PATCH v2 next-next 01/12] gso: Remove arbitrary checks for unsupported GSO

2016-05-06 Thread Tom Herbert

In several gso_segment functions there are checks of gso_type against
a seemingly arbitrary list of SKB_GSO_* flags. This seems like an
attempt to identify unsupported GSO types, but since the stack is
the one that set these GSO types in the first place this seems
unnecessary to do. If a combination isn't valid in the first
place the stack should not allow setting it.

This is a code simplification, especially for adding new GSO types.

Signed-off-by: Tom Herbert 
---
 net/ipv4/af_inet.c | 18 --
 net/ipv4/gre_offload.c | 14 --
 net/ipv4/tcp_offload.c | 19 ---
 net/ipv4/udp_offload.c | 10 --
 net/ipv6/ip6_offload.c | 18 --
 net/ipv6/udp_offload.c | 13 -
 net/mpls/mpls_gso.c|  9 -
 7 files changed, 101 deletions(-)

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 2e6e65f..7f08d45 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1205,24 +1205,6 @@ static struct sk_buff *inet_gso_segment(struct sk_buff 
*skb,
int ihl;
int id;
 
-   if (unlikely(skb_shinfo(skb)->gso_type &
-~(SKB_GSO_TCPV4 |
-  SKB_GSO_UDP |
-  SKB_GSO_DODGY |
-  SKB_GSO_TCP_ECN |
-  SKB_GSO_GRE |
-  SKB_GSO_GRE_CSUM |
-  SKB_GSO_IPIP |
-  SKB_GSO_SIT |
-  SKB_GSO_TCPV6 |
-  SKB_GSO_UDP_TUNNEL |
-  SKB_GSO_UDP_TUNNEL_CSUM |
-  SKB_GSO_TCP_FIXEDID |
-  SKB_GSO_TUNNEL_REMCSUM |
-  SKB_GSO_PARTIAL |
-  0)))
-   goto out;
-
skb_reset_network_header(skb);
nhoff = skb_network_header(skb) - skb_mac_header(skb);
if (unlikely(!pskb_may_pull(skb, sizeof(*iph
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index e88190a..ecd1e09 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -26,20 +26,6 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
int gre_offset, outer_hlen;
bool need_csum, ufo;
 
-   if (unlikely(skb_shinfo(skb)->gso_type &
-   ~(SKB_GSO_TCPV4 |
- SKB_GSO_TCPV6 |
- SKB_GSO_UDP |
- SKB_GSO_DODGY |
- SKB_GSO_TCP_ECN |
- SKB_GSO_TCP_FIXEDID |
- SKB_GSO_GRE |
- SKB_GSO_GRE_CSUM |
- SKB_GSO_IPIP |
- SKB_GSO_SIT |
- SKB_GSO_PARTIAL)))
-   goto out;
-
if (!skb->encapsulation)
goto out;
 
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 02737b6..5c59649 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -83,25 +83,6 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
 
if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
/* Packet is from an untrusted source, reset gso_segs. */
-   int type = skb_shinfo(skb)->gso_type;
-
-   if (unlikely(type &
-~(SKB_GSO_TCPV4 |
-  SKB_GSO_DODGY |
-  SKB_GSO_TCP_ECN |
-  SKB_GSO_TCP_FIXEDID |
-  SKB_GSO_TCPV6 |
-  SKB_GSO_GRE |
-  SKB_GSO_GRE_CSUM |
-  SKB_GSO_IPIP |
-  SKB_GSO_SIT |
-  SKB_GSO_UDP_TUNNEL |
-  SKB_GSO_UDP_TUNNEL_CSUM |
-  SKB_GSO_TUNNEL_REMCSUM |
-  0) ||
-!(type & (SKB_GSO_TCPV4 |
-  SKB_GSO_TCPV6
-   goto out;
 
skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);
 
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 097060de..b556ef6 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -209,16 +209,6 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff 
*skb,
 
if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
/* Packet is from an untrusted source, reset gso_segs. */
-   int type = skb_shinfo(skb)->gso_type;
-
-   if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY |
- SKB_GSO_UDP_TUNNEL |
- SKB_GSO_UDP_TUNNEL_CSUM |
- SKB_GSO_TUNNEL_REMCSUM |
- SKB_GSO_IPIP |
- SKB_GSO_GRE | SKB_GSO_G

[PATCH v2 next-next 02/12] net: define gso types for IPx over IPv4 and IPv6

2016-05-06 Thread Tom Herbert

This patch defines two new GSO definitions SKB_GSO_IPXIP4 and
SKB_GSO_IPXIP6 along with corresponding NETIF_F_GSO_IPXIP4 and
NETIF_F_GSO_IPXIP6. These are used to describe an IP in IP
tunnel and what the outer protocol is. The inner protocol
can be deduced from other GSO types (e.g. SKB_GSO_TCPV4 and
SKB_GSO_TCPV6). The GSO types of SKB_GSO_IPIP and SKB_GSO_SIT
are removed (these are both instances of SKB_GSO_IPXIP4).
SKB_GSO_IPXIP6 will be used when support for GSO with IP
encapsulation over IPv6 is added.

Signed-off-by: Tom Herbert 
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c  |  5 ++---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c |  4 ++--
 drivers/net/ethernet/intel/i40e/i40e_main.c   |  3 +--
 drivers/net/ethernet/intel/i40e/i40e_txrx.c   |  3 +--
 drivers/net/ethernet/intel/i40evf/i40e_txrx.c |  3 +--
 drivers/net/ethernet/intel/i40evf/i40evf_main.c   |  3 +--
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |  3 +--
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c |  3 +--
 include/linux/netdev_features.h   | 12 ++--
 include/linux/netdevice.h |  4 ++--
 include/linux/skbuff.h|  4 ++--
 net/core/ethtool.c|  4 ++--
 net/ipv4/af_inet.c|  2 +-
 net/ipv4/ipip.c   |  2 +-
 net/ipv6/ip6_offload.c|  4 ++--
 net/ipv6/sit.c|  4 ++--
 net/netfilter/ipvs/ip_vs_xmit.c   | 11 ++-
 17 files changed, 30 insertions(+), 44 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c 
b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index d465bd7..0a5b770 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -13259,12 +13259,11 @@ static int bnx2x_init_dev(struct bnx2x *bp, struct 
pci_dev *pdev,
NETIF_F_RXHASH | NETIF_F_HW_VLAN_CTAG_TX;
if (!chip_is_e1x) {
dev->hw_features |= NETIF_F_GSO_GRE | NETIF_F_GSO_UDP_TUNNEL |
-   NETIF_F_GSO_IPIP | NETIF_F_GSO_SIT;
+   NETIF_F_GSO_IPXIP4;
dev->hw_enc_features =
NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | NETIF_F_SG |
NETIF_F_TSO | NETIF_F_TSO_ECN | NETIF_F_TSO6 |
-   NETIF_F_GSO_IPIP |
-   NETIF_F_GSO_SIT |
+   NETIF_F_GSO_IPXIP4 |
NETIF_F_GSO_GRE | NETIF_F_GSO_UDP_TUNNEL;
}
 
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c 
b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index fd85b6d..e449228 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -6218,7 +6218,7 @@ static int bnxt_init_one(struct pci_dev *pdev, const 
struct pci_device_id *ent)
dev->hw_features = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | NETIF_F_SG |
   NETIF_F_TSO | NETIF_F_TSO6 |
   NETIF_F_GSO_UDP_TUNNEL | NETIF_F_GSO_GRE |
-  NETIF_F_GSO_IPIP | NETIF_F_GSO_SIT |
+  NETIF_F_GSO_IPXIP4 |
   NETIF_F_GSO_UDP_TUNNEL_CSUM | NETIF_F_GSO_GRE_CSUM |
   NETIF_F_GSO_PARTIAL | NETIF_F_RXHASH |
   NETIF_F_RXCSUM | NETIF_F_LRO | NETIF_F_GRO;
@@ -6228,7 +6228,7 @@ static int bnxt_init_one(struct pci_dev *pdev, const 
struct pci_device_id *ent)
NETIF_F_TSO | NETIF_F_TSO6 |
NETIF_F_GSO_UDP_TUNNEL | NETIF_F_GSO_GRE |
NETIF_F_GSO_UDP_TUNNEL_CSUM | NETIF_F_GSO_GRE_CSUM |
-   NETIF_F_GSO_IPIP | NETIF_F_GSO_SIT |
+   NETIF_F_GSO_IPXIP4;
NETIF_F_GSO_PARTIAL;
dev->gso_partial_features = NETIF_F_GSO_UDP_TUNNEL_CSUM |
NETIF_F_GSO_GRE_CSUM;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index f6da6b7..c2a4c10 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -9131,8 +9131,7 @@ static int i40e_config_netdev(struct i40e_vsi *vsi)
   NETIF_F_TSO6 |
   NETIF_F_GSO_GRE  |
   NETIF_F_GSO_GRE_CSUM |
-  NETIF_F_GSO_IPIP |
-  NETIF_F_GSO_SIT  |
+  NETIF_F_GSO_IPXIP4   |
   NETIF_F_GSO_UDP_TUNNEL   |
   NETIF_F_GSO_UDP_TUNNEL_CSUM  |
   NETIF_F_GSO_PARTIA

[PATCH v2 next-next 05/12] fou: Call setup_udp_tunnel_sock

2016-05-06 Thread Tom Herbert

Use helper function to set up UDP tunnel related information for a fou
socket.

Signed-off-by: Tom Herbert 
---
 net/ipv4/fou.c | 50 --
 1 file changed, 16 insertions(+), 34 deletions(-)

diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 7ac5ec8..a8b5cbf 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -446,31 +446,13 @@ static void fou_release(struct fou *fou)
kfree_rcu(fou, rcu);
 }
 
-static int fou_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg 
*cfg)
-{
-   udp_sk(sk)->encap_rcv = fou_udp_recv;
-   udp_sk(sk)->gro_receive = fou_gro_receive;
-   udp_sk(sk)->gro_complete = fou_gro_complete;
-   fou_from_sock(sk)->protocol = cfg->protocol;
-
-   return 0;
-}
-
-static int gue_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg 
*cfg)
-{
-   udp_sk(sk)->encap_rcv = gue_udp_recv;
-   udp_sk(sk)->gro_receive = gue_gro_receive;
-   udp_sk(sk)->gro_complete = gue_gro_complete;
-
-   return 0;
-}
-
 static int fou_create(struct net *net, struct fou_cfg *cfg,
  struct socket **sockp)
 {
struct socket *sock = NULL;
struct fou *fou = NULL;
struct sock *sk;
+   struct udp_tunnel_sock_cfg tunnel_cfg;
int err;
 
/* Open UDP socket */
@@ -489,33 +471,33 @@ static int fou_create(struct net *net, struct fou_cfg 
*cfg,
 
fou->flags = cfg->flags;
fou->port = cfg->udp_config.local_udp_port;
+   fou->type = cfg->type;
+   fou->sock = sock;
+
+   memset(&tunnel_cfg, 0, sizeof(tunnel_cfg));
+   tunnel_cfg.encap_type = 1;
+   tunnel_cfg.sk_user_data = fou;
+   tunnel_cfg.encap_destroy = NULL;
 
/* Initial for fou type */
switch (cfg->type) {
case FOU_ENCAP_DIRECT:
-   err = fou_encap_init(sk, fou, cfg);
-   if (err)
-   goto error;
+   tunnel_cfg.encap_rcv = fou_udp_recv;
+   tunnel_cfg.gro_receive = fou_gro_receive;
+   tunnel_cfg.gro_complete = fou_gro_complete;
+   fou->protocol = cfg->protocol;
break;
case FOU_ENCAP_GUE:
-   err = gue_encap_init(sk, fou, cfg);
-   if (err)
-   goto error;
+   tunnel_cfg.encap_rcv = gue_udp_recv;
+   tunnel_cfg.gro_receive = gue_gro_receive;
+   tunnel_cfg.gro_complete = gue_gro_complete;
break;
default:
err = -EINVAL;
goto error;
}
 
-   fou->type = cfg->type;
-
-   udp_sk(sk)->encap_type = 1;
-   udp_encap_enable();
-
-   sk->sk_user_data = fou;
-   fou->sock = sock;
-
-   inet_inc_convert_csum(sk);
+   setup_udp_tunnel_sock(net, sock, &tunnel_cfg);
 
sk->sk_allocation = GFP_ATOMIC;
 
-- 
2.8.0.rc2

[PATCH v2 next-next 12/12] ip6_gre: Add support for fou/gue encapsulation

2016-05-06 Thread Tom Herbert

Add netlink and setup for encapsulation

Signed-off-by: Tom Herbert 
---
 net/ipv6/ip6_gre.c | 75 ++
 1 file changed, 75 insertions(+)

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 70a1f72..ed5ddcc 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -1027,6 +1027,8 @@ static int ip6gre_tunnel_init_common(struct net_device 
*dev)
 
tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
 
+   tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
+
t_hlen = tunnel->hlen + sizeof(struct ipv6hdr);
 
dev->needed_headroom= LL_MAX_HEADER + t_hlen + 4;
@@ -1293,15 +1295,57 @@ static void ip6gre_tap_setup(struct net_device *dev)
dev->priv_flags &= ~IFF_TX_SKB_SHARING;
 }
 
+static bool ip6gre_netlink_encap_parms(struct nlattr *data[],
+  struct ip_tunnel_encap *ipencap)
+{
+   bool ret = false;
+
+   memset(ipencap, 0, sizeof(*ipencap));
+
+   if (!data)
+   return ret;
+
+   if (data[IFLA_GRE_ENCAP_TYPE]) {
+   ret = true;
+   ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]);
+   }
+
+   if (data[IFLA_GRE_ENCAP_FLAGS]) {
+   ret = true;
+   ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]);
+   }
+
+   if (data[IFLA_GRE_ENCAP_SPORT]) {
+   ret = true;
+   ipencap->sport = nla_get_be16(data[IFLA_GRE_ENCAP_SPORT]);
+   }
+
+   if (data[IFLA_GRE_ENCAP_DPORT]) {
+   ret = true;
+   ipencap->dport = nla_get_be16(data[IFLA_GRE_ENCAP_DPORT]);
+   }
+
+   return ret;
+}
+
 static int ip6gre_newlink(struct net *src_net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[])
 {
struct ip6_tnl *nt;
struct net *net = dev_net(dev);
struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
+   struct ip_tunnel_encap ipencap;
int err;
 
nt = netdev_priv(dev);
+
+   if (ip6gre_netlink_encap_parms(data, &ipencap)) {
+   int err = ip6_tnl_encap_setup(nt, &ipencap);
+
+   if (err < 0)
+   return err;
+   }
+
ip6gre_netlink_parms(data, &nt->parms);
 
if (ip6gre_tunnel_find(net, &nt->parms, dev->type))
@@ -1348,10 +1392,18 @@ static int ip6gre_changelink(struct net_device *dev, 
struct nlattr *tb[],
struct net *net = nt->net;
struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
struct __ip6_tnl_parm p;
+   struct ip_tunnel_encap ipencap;
 
if (dev == ign->fb_tunnel_dev)
return -EINVAL;
 
+   if (ip6gre_netlink_encap_parms(data, &ipencap)) {
+   int err = ip6_tnl_encap_setup(nt, &ipencap);
+
+   if (err < 0)
+   return err;
+   }
+
ip6gre_netlink_parms(data, &p);
 
t = ip6gre_tunnel_locate(net, &p, 0);
@@ -1405,6 +1457,14 @@ static size_t ip6gre_get_size(const struct net_device 
*dev)
nla_total_size(4) +
/* IFLA_GRE_FLAGS */
nla_total_size(4) +
+   /* IFLA_GRE_ENCAP_TYPE */
+   nla_total_size(2) +
+   /* IFLA_GRE_ENCAP_FLAGS */
+   nla_total_size(2) +
+   /* IFLA_GRE_ENCAP_SPORT */
+   nla_total_size(2) +
+   /* IFLA_GRE_ENCAP_DPORT */
+   nla_total_size(2) +
0;
 }
 
@@ -1428,6 +1488,17 @@ static int ip6gre_fill_info(struct sk_buff *skb, const 
struct net_device *dev)
nla_put_be32(skb, IFLA_GRE_FLOWINFO, p->flowinfo) ||
nla_put_u32(skb, IFLA_GRE_FLAGS, p->flags))
goto nla_put_failure;
+
+   if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE,
+   t->encap.type) ||
+   nla_put_be16(skb, IFLA_GRE_ENCAP_SPORT,
+t->encap.sport) ||
+   nla_put_be16(skb, IFLA_GRE_ENCAP_DPORT,
+t->encap.dport) ||
+   nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS,
+   t->encap.flags))
+   goto nla_put_failure;
+
return 0;
 
 nla_put_failure:
@@ -1446,6 +1517,10 @@ static const struct nla_policy 
ip6gre_policy[IFLA_GRE_MAX + 1] = {
[IFLA_GRE_ENCAP_LIMIT] = { .type = NLA_U8 },
[IFLA_GRE_FLOWINFO]= { .type = NLA_U32 },
[IFLA_GRE_FLAGS]   = { .type = NLA_U32 },
+   [IFLA_GRE_ENCAP_TYPE]   = { .type = NLA_U16 },
+   [IFLA_GRE_ENCAP_FLAGS]  = { .type = NLA_U16 },
+   [IFLA_GRE_ENCAP_SPORT]  = { .type = NLA_U16 },
+   [IFLA_GRE_ENCAP_DPORT]  = { .type = NLA_U16 },
 };
 
 static struct rtnl_link_ops ip6gre_link_ops __read_mostly = {
-- 
2.8.0.rc2

[PATCH v2 next-next 04/12] udp: Don't set skb->encapsulation with RCO

2016-05-06 Thread Tom Herbert

When RCO is in effect we want to ensure that the outer checksum is
properly offloaded. Don't set skb->encapsulation in this case to
ensure that checksum offload is later considered for hw_features
instead of hw_enc_features.

Signed-off-by: Tom Herbert 
---
 net/ipv4/udp_offload.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index b556ef6..92a9222 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -94,11 +94,12 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct 
sk_buff *skb,
do {
unsigned int len;
 
-   if (remcsum)
+   if (remcsum) {
skb->ip_summed = CHECKSUM_NONE;
-
-   /* Set up inner headers if we are offloading inner checksum */
-   if (skb->ip_summed == CHECKSUM_PARTIAL) {
+   } else if (skb->ip_summed == CHECKSUM_PARTIAL) {
+   /* Set up inner headers if we are offloading inner
+* checksum
+*/
skb_reset_inner_headers(skb);
skb->encapsulation = 1;
}
-- 
2.8.0.rc2

[PATCH v2 next-next 08/12] ipv6: Fix nexthdr for reinjection

2016-05-06 Thread Tom Herbert

In ip6_input_finish, if the protocol handler returns a value greater than
zero, the packet needs to be resubmitted using the returned protocol.
The returned protocol is being ignored and each time through resubmit
nexthdr is taken from an offset in the packet. This patch fixes that
so that nexthdr is taken from the return value of the protocol handler.

Signed-off-by: Tom Herbert 
---
 net/ipv6/ip6_input.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 6ed5601..2a0258a 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -222,13 +222,14 @@ static int ip6_input_finish(struct net *net, struct sock 
*sk, struct sk_buff *sk
 */
 
rcu_read_lock();
-resubmit:
+
idev = ip6_dst_idev(skb_dst(skb));
if (!pskb_pull(skb, skb_transport_offset(skb)))
goto discard;
nhoff = IP6CB(skb)->nhoff;
nexthdr = skb_network_header(skb)[nhoff];
 
+resubmit:
raw = raw6_local_deliver(skb, nexthdr);
ipprot = rcu_dereference(inet6_protos[nexthdr]);
if (ipprot) {
@@ -256,10 +257,12 @@ resubmit:
goto discard;
 
ret = ipprot->handler(skb);
-   if (ret > 0)
+   if (ret > 0) {
+   nexthdr = ret;
goto resubmit;
-   else if (ret == 0)
+   } else if (ret == 0) {
__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDELIVERS);
+   }
} else {
if (!raw) {
if (xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
-- 
2.8.0.rc2

Re: [PATCH net-next] ipv4: tcp: ip_send_unicast_reply() is not BH safe

2016-05-06 Thread David Miller

From: Eric Dumazet 
Date: Fri, 06 May 2016 09:46:18 -0700

> From: Eric Dumazet 
> 
> I forgot that ip_send_unicast_reply() is not BH safe (yet).
> 
> Disabling preemption before calling it was not a good move.
> 
> Fixes: c10d9310edf5 ("tcp: do not assume TCP code is non preemptible")
> Signed-off-by: Eric Dumazet 
> Reported-by: Andres Lagar-Cavilla  

Applied, thanks.

Re: [PATCH net-next 0/7] bpf: introduce direct packet access

2016-05-06 Thread David Miller

From: Alexei Starovoitov 
Date: Thu, 5 May 2016 19:49:08 -0700

> This set of patches introduce 'direct packet access' from
> cls_bpf and act_bpf programs (which are root only).

Series applied, thanks Alexei.

Re: [patch net 0/3] mlxsw: Couple of fixes

2016-05-06 Thread David Miller

From: Jiri Pirko 
Date: Fri,  6 May 2016 11:17:21 +0200

> From: Jiri Pirko 
> 
> Ido Schimmel (2):
>   mlxsw: spectrum: Fix rollback order in LAG join failure
>   mlxsw: spectrum: Add missing rollback in flood configuration
> 
> Jiri Pirko (1):
>   mlxsw: spectrum: Fix ordering in mlxsw_sp_fini

What tree is this for?  Because on 'net' this makes the build fail.

drivers/net/ethernet/mellanox/mlxsw/spectrum.c: In function ‘mlxsw_sp_fini’:
drivers/net/ethernet/mellanox/mlxsw/spectrum.c:2162:2: error: implicit 
declaration of function ‘mlxsw_sp_buffers_fini’ 
[-Werror=implicit-function-declaration]

Re: [net-next 00/11][pull request] 40GbE Intel Wired LAN Driver Updates 2016-05-05

2016-05-06 Thread David Miller

From: Jeff Kirsher 
Date: Fri,  6 May 2016 00:03:37 -0700

> This series contains updates to i40e and i40evf.

Looks good, pulled, thanks!

Re: [PATCH net-next v2] net: vrf: Create FIB tables on link create

2016-05-06 Thread David Miller

From: David Ahern 
Date: Wed,  4 May 2016 21:46:12 -0700

> Tables have to exist for VRFs to function. Ensure they exist
> when VRF device is created.
> 
> Signed-off-by: David Ahern 
> ---
> v2
> - create table before rt6 allocation per comment from DaveM

Yep, this looks better, applied.

Re: [PATCH net 1/1] qede: prevent chip hang when increasing channels

2016-05-06 Thread David Miller

From: Sudarsana Reddy Kalluru 
Date: Thu, 5 May 2016 00:35:16 -0400

> qede requires qed to provide enough resources to accommodate 16 combined
> channels, but that upper-bound isn't actually being enforced by it.
> Instead, qed informs qede how many channels can be opened based on
> available resources - but that calculation doesn't really take into account
> the resources requested by qede; Instead it considers other FW/HW available
> resources.
> 
> As a result, if a user would increase the number of channels to more than
> 16 [e.g., using ethtool] the chip would hang.
> 
> This change increments the resources requested by qede to 64 combined
> channels instead of 16; This value is an upper bound on the possible
> available channels [due to other FW/HW resources].
> 
> Signed-off-by: Sudarsana Reddy Kalluru 
> Signed-off-by: Yuval Mintz 

Applied.

Re: [PATCH net v3 2/2] udp_offload: Set encapsulation before inner completes.

2016-05-06 Thread Alexander Duyck

On Fri, May 6, 2016 at 12:34 PM, David Miller  wrote:
> From: Jarno Rajahalme 
> Date: Tue,  3 May 2016 16:10:21 -0700
>
>> UDP tunnel segmentation code relies on the inner offsets being set for
>> an UDP tunnel GSO packet, but the inner *_complete() functions will
>> set the inner offsets only if 'encapsulation' is set before calling
>> them.  Currently, udp_gro_complete() sets 'encapsulation' only after
>> the inner *_complete() functions are done.  This causes the inner
>> offsets having invalid values after udp_gro_complete() returns, which
>> in turn will make it impossible to properly segment the packet in case
>> it needs to be forwarded, which would be visible to the user either as
>> invalid packets being sent or as packet loss.
>>
>> This patch fixes this by setting skb's 'encapsulation' in
>> udp_gro_complete() before calling into the inner complete functions,
>> and by making each possible UDP tunnel gro_complete() callback set the
>> inner_mac_header to the beginning of the tunnel payload.
>>
>> Signed-off-by: Jarno Rajahalme 
>> ---
>> v3: Added setting inner_mac_header from all possible callbacks to cover
>> cases where there is no inner mac header.
>
> Alex and Tom, can you please review this new version since you guys had
> so much feedback for v2?
>
> Thanks.

I had reviewed it a day or so ago.  It did address the issues I saw
with the original patch, and from what I can tell it is fixing the
original issue reported.

Reviewed-by: Alexander Duyck

Re: [PATCH] net: ipv6: tcp reset, icmp need to consider L3 domain

2016-05-06 Thread David Miller

From: David Ahern 
Date: Wed,  4 May 2016 21:26:08 -0700

> Responses for packets to unused ports are getting lost with L3 domains.
> 
> IPv4 has ip_send_unicast_reply for sending TCP responses which accounts
> for L3 domains; update the IPv6 counterpart tcp_v6_send_response.
> For icmp the L3 master check needs to be moved up in icmp6_send
> to properly respond to UDP packets to a port with no listener.
> 
> Fixes: ca254490c8df ("net: Add VRF support to IPv6 stack")
> Signed-off-by: David Ahern 

Applied and queued up for -stable, thanks.

Re: [PATCH net v3] vlan: Propagate MAC address to VLANs

2016-05-06 Thread Alexander Duyck

On Fri, May 6, 2016 at 12:36 PM, Mike Manning  wrote:
> On 05/06/2016 06:02 PM, Alexander Duyck wrote:
>> On Fri, May 6, 2016 at 6:26 AM, Mike Manning  wrote:
>>> The MAC address of the physical interface is only copied to the VLAN
>>> when it is first created, resulting in an inconsistency after MAC
>>> address changes of only newly created VLANs having an up-to-date MAC.
>>>
>>> The VLANs should continue inheriting the MAC address of the physical
>>> interface, unless explicitly changed to be different from this.
>>> This allows IPv6 EUI64 addresses for the VLAN to reflect any changes
>>> to the MAC of the physical interface and thus for DAD to behave as
>>> expected.
>>>
>>> Signed-off-by: Mike Manning 
>>> ---
>>>  include/linux/if_vlan.h |2 ++
>>>  net/8021q/vlan.c|   17 +++--
>>>  net/8021q/vlan_dev.c|   13 ++---
>>>  3 files changed, 23 insertions(+), 9 deletions(-)
>>>
>>> --- a/include/linux/if_vlan.h
>>> +++ b/include/linux/if_vlan.h
>>> @@ -138,6 +138,7 @@ struct netpoll;
>>>   * @flags: device flags
>>>   * @real_dev: underlying netdevice
>>>   * @real_dev_addr: address of underlying netdevice
>>> + * @addr_assign_type: address assignment type
>>>   * @dent: proc dir entry
>>>   * @vlan_pcpu_stats: ptr to percpu rx stats
>>>   */
>>> @@ -153,6 +154,7 @@ struct vlan_dev_priv {
>>>
>>> struct net_device   *real_dev;
>>> unsigned char   real_dev_addr[ETH_ALEN];
>>> +   unsigned char   addr_assign_type;
>>>
>>> struct proc_dir_entry   *dent;
>>> struct vlan_pcpu_stats __percpu *vlan_pcpu_stats;
>>
>> Please don't start adding new members to structures when it already
>> exists in the net_device.  If anything you should be able to drop
>> real_dev_addr if you do this correctly because you shouldn't need to
>> clone the lower dev address to watch for changes.  All you will need
>> to do is watch NET_ADDR_STOLEN.
>>
>
> Thanks for the detailed review. I had initially used the existing type
> in net_device, but the problem with this was that it got overwritten to
> NET_ADDR_SET in dev_set_mac_address(), which I was reluctant to modify.
> It would just be a case of setting the type earlier in that function
> (and caching the previous value in case there is an error).
>
> However, based on your later comment, it seems I should not bother with
> the approach I have here, namely that if the VLAN MAC is set to the same
> value as that of the lower device MAC, that is to be considered as
> resetting it and thus for MAC inheritance to resume. Instead, I will just
> make this a 1-shot transition, i.e. the VLAN MAC starts off as inherited,
> and if it is set to anything (even the value of the lower device MAC),
> inheritance is stopped. I agree this makes for a far simpler changeset.
>
> I don't think I can remove real_dev_addr, as that is still needed for
> the existing functionality in vlan_sync_address() to determine if the sync
> should be done, also as a way of caching it for handling in vlan_dev_open().

The thing is that logic isn't really needed anymore though if you are
going to be following the lower dev.  If you follow the code what it
is doing is adding the address via dev_uc_add if the lower address
moves away from the VLAN address.  With your changes you are updating
the VLAN MAC address to the lower value in the NET_ADDR_STOLEN case so
you don't need to add or remove an extra unicast address.  If the user
sets the MAC address you can then use the vlandev->dev_addr as the
address you add/remove from the unicast list and you probably don't
need to bother with tracking the lower device state anyway.

> As a matter of interest, what is the advantage of not updating the VLAN
> MAC when it is down? I appreciate that one should not add/delete
> secondary unicast addresses in this case, but there is no such
> restriction for copying the MAC.

Basically you are just wasting cycles messing with it while it is
down.  You don't need to bother with syncing up the addresses until
you bring the interface up.  At that point you essentially need to do
the vlan_sync_address type work anyway because you have to push your
address to the lower dev, or you have to pull it up from the lower dev
in the case of the stolen address.  You don't want to have MAC
addresses written to the device for an interface that is down.

Re: [Y2038] [RESEND PATCH 2/3] fs: poll/select/recvmmsg: use timespec64 for timeout events

2016-05-06 Thread David Miller

From: John Stultz 
Date: Wed, 4 May 2016 17:01:24 -0700

> On Wed, May 4, 2016 at 4:51 PM, Andrew Morton  
> wrote:
>> On Wed, 04 May 2016 23:08:11 +0200 Arnd Bergmann  wrote:
>>
>>> > But I'm less comfortable making the call on this one. It looks
>>> > relatively straight forward, but it would be good to have maintainer
>>> > acks before I add it to my tree.
>>>
>>> Agreed. Feel free to add my
>>>
>>> Reviewed-by: Arnd Bergmann 
>>>
>>> at least (whoever picks it up).
>>
>> In reply to [1/3] John said
>>
>> : Looks ok at the first glance. I've queued these up for testing,
>> : however I only got #1 and #3 of the set. Are you hoping these two
>> : patches will go through tip/timers/core or are you looking for acks so
>> : they can go via another tree?
>>
>> However none of the patches are in linux-next.
>>
>> John had qualms about [2/3], but it looks like a straightforward
>> substitution in areas which will get plenty of testing
> 
> Yea. My main concern is just not stepping on any other maintainers' toes.

The networking changes look fine to me:

Acked-by: David S. Miller

Re: [PATCH net-next] cnic: call cp->stop_hw() in cnic_start_hw() on allocation failure

2016-05-06 Thread David Miller

From: Jon Maxwell 
Date: Thu,  5 May 2016 09:55:51 +1000

> We recently had a system crash in the cnic module. Vmcore analysis confirmed 
> that "ip link up" was executed which failed due to an allocation failure 
> because of memory fragmentation. Further analysis revealed that the cnic irq 
> vector was still allocated after the "ip link up" that failed. When 
> "ip link down" was executed it called free_msi_irqs() which crashed the 
> system 
> because the cnic irq was still inuse.
 ...
> The cnic_start_hw() routine is not handling the allocation failure correctly. 
> Fix this by checking whether CNIC_DRV_STATE_HANDLES_IRQ flag is set 
> indicating 
> that the hardware has been started in cnic_start_hw(). If it has then call 
> cp->stop_hw() which frees the cnic irq vector and cnic resources. Otherwise 
> just maintain the previous behaviour and free cnic resources. 
> 
> I reproduced this by injecting an ENOMEM error into cnic_cm_alloc_mem()'s 
> return
> code. 
> 
> # ip link set dev enpX down
> # ip link set dev enpX up <--- hits allocation failure
> # ip link set dev enpX down <--- crashes here
> 
> With this patch I confirmed there was no crash in the reproducer.
> 
> Signed-off-by: Jon Maxwell 

Applied, thank you.

Re: OpenWRT wrong adjustment of fq_codel defaults (Was: [Codel] fq_codel_drop vs a udp flood)

2016-05-06 Thread Dave Taht

On Fri, May 6, 2016 at 11:56 AM, Roman Yeryomin  wrote:
> On 6 May 2016 at 21:43, Roman Yeryomin  wrote:
>> On 6 May 2016 at 15:47, Jesper Dangaard Brouer  wrote:
>>>
>>> I've created a OpenWRT ticket[1] on this issue, as it seems that someone[2]
>>> closed Felix'es OpenWRT email account (bad choice! emails bouncing).
>>> Sounds like OpenWRT and the LEDE https://www.lede-project.org/ project
>>> is in some kind of conflict.
>>>
>>> OpenWRT ticket [1] https://dev.openwrt.org/ticket/22349
>>>
>>> [2] 
>>> http://thread.gmane.org/gmane.comp.embedded.openwrt.devel/40298/focus=40335
>>
>> OK, so, after porting the patch to 4.1 openwrt kernel and playing a
>> bit with fq_codel limits I was able to get 420Mbps UDP like this:
>> tc qdisc replace dev wlan0 parent :1 fq_codel flows 16 limit 256
>
> Forgot to mention, I've reduced drop_batch_size down to 32

0) Not clear to me if that's the right line, there are 4 wifi queues,
and the third one
is the BE queue. That is too low a limit, also, for normal use. And:
for the purpose of this particular UDP test, flows 16 is ok, but not
ideal.

1) What's the tcp number (with a simultaneous ping) with this latest patchset?
(I care about tcp performance a lot more than udp floods - surviving a
udp flood yes, performance, no)

before/after?

tc -s qdisc show dev wlan0 during/after results?

IF you are doing builds for the archer c7v2, I can join in on this... (?)

I did do a test of the ath10k "before", fq_codel *never engaged*, and
tcp induced latencies under load, e at 100mbit, cracked 600ms, while
staying flat (20ms) at 100mbit. (not the same patches you are testing)
on x86. I have got tcp 300Mbit out of an osx box, similar latency,
have yet to get anything more on anything I currently have
before/after patchsets.

I'll go add flooding to the tests, I just finished a series comparing
two different speed stations and life was good on that.

"before" - fq_codel never engages, we see seconds of latency under load.

root@apu2:~# tc -s qdisc show dev wlp4s0
qdisc mq 0: root
 Sent 8570563893 bytes 6326983 pkt (dropped 0, overlimits 0 requeues 0)
 backlog 0b 0p requeues 0
qdisc fq_codel 0: parent :1 limit 10240p flows 1024 quantum 1514
target 5.0ms interval 100.0ms ecn
 Sent 2262 bytes 17 pkt (dropped 0, overlimits 0 requeues 0)
 backlog 0b 0p requeues 0
  maxpacket 0 drop_overlimit 0 new_flow_count 0 ecn_mark 0
  new_flows_len 0 old_flows_len 0
qdisc fq_codel 0: parent :2 limit 10240p flows 1024 quantum 1514
target 5.0ms interval 100.0ms ecn
 Sent 220486569 bytes 152058 pkt (dropped 0, overlimits 0 requeues 0)
 backlog 0b 0p requeues 0
  maxpacket 18168 drop_overlimit 0 new_flow_count 1 ecn_mark 0
  new_flows_len 0 old_flows_len 1
qdisc fq_codel 0: parent :3 limit 10240p flows 1024 quantum 1514
target 5.0ms interval 100.0ms ecn
 Sent 8340546509 bytes 6163431 pkt (dropped 0, overlimits 0 requeues 0)
 backlog 0b 0p requeues 0
  maxpacket 68130 drop_overlimit 0 new_flow_count 120050 ecn_mark 0
  new_flows_len 1 old_flows_len 3
qdisc fq_codel 0: parent :4 limit 10240p flows 1024 quantum 1514
target 5.0ms interval 100.0ms ecn
 Sent 9528553 bytes 11477 pkt (dropped 0, overlimits 0 requeues 0)
 backlog 0b 0p requeues 0
  maxpacket 66 drop_overlimit 0 new_flow_count 1 ecn_mark 0
  new_flows_len 1 old_flows_len 0

>> This is certainly better than 30Mbps but still more than two times
>> less than before (900).

The number that I still am not sure we got is that you were sending
900mbit udp and receiving 900mbit on the prior tests?

>> TCP also improved a little (550 to ~590).

The limit is probably a bit low, also.  You might want to try target
20ms as well.

>>
>> Felix, others, do you want to see the ported patch, maybe I did something 
>> wrong?
>> Doesn't look like it will save ath10k from performance regression.

what was tcp "before"? (I'm sorry, such a long thread)

>>
>>>
>>> On Fri, 6 May 2016 11:42:43 +0200
>>> Jesper Dangaard Brouer  wrote:
>>>
 Hi Felix,

 This is an important fix for OpenWRT, please read!

 OpenWRT changed the default fq_codel sch->limit from 10240 to 1024,
 without also adjusting q->flows_cnt.  Eric explains below that you must
 also adjust the buckets (q->flows_cnt) for this not to break. (Just
 adjust it to 128)

 Problematic OpenWRT commit in question:
  http://git.openwrt.org/?p=openwrt.git;a=patch;h=12cd6578084e
  12cd6578084e ("kernel: revert fq_codel quantum override to prevent it 
 from causing too much cpu load with higher speed (#21326)")

 I also highly recommend you cherry-pick this very recent commit:
  net-next: 9d18562a2278 ("fq_codel: add batch ability to fq_codel_drop()")
  https://git.kernel.org/davem/net-next/c/9d18562a227

 This should fix very high CPU usage in-case fq_codel goes into drop mode.
 The problem is that drop mode was considered rare, and implementation
 wise it was chosen to be more expensive (to save cycles on normal mode).
 U

Re: [PATCH 2/3] net/mlx5e: make VXLAN support conditional

2016-05-06 Thread David Miller

From: Arnd Bergmann 
Date: Thu, 05 May 2016 20:09:19 +0200

> For reference, I've tried it out on the MLX4 driver, and it does
> seem nicer that way, see below.

Is it possible to wind down this conversation and have someone submit
whatever final patch everyone agrees to?

Thanks.

[PATCH] e1000e: prevent division by zero if TIMINCA is zero

2016-05-06 Thread Denys Vlasenko

Users report that under VMWare, er32(TIMINCA) returns zero.
This causes division by zero at init time as follows:

 ==>incvalue = er32(TIMINCA) & E1000_TIMINCA_INCVALUE_MASK;
for (i = 0; i < E1000_MAX_82574_SYSTIM_REREADS; i++) {
/* latch SYSTIMH on read of SYSTIML */
systim_next = (cycle_t)er32(SYSTIML);
systim_next |= (cycle_t)er32(SYSTIMH) << 32;

time_delta = systim_next - systim;
temp = time_delta;
 >  rem = do_div(temp, incvalue);

This change makes kernel survive this, and users report that
NIC does work after this change.

Since on real hardware incvalue is never zero, this should not affect
real hardware use case.

Signed-off-by: Denys Vlasenko 
CC: Jeff Kirsher 
CC: "Ruinskiy, Dima" 
CC: intel-wired-...@lists.osuosl.org
CC: netdev@vger.kernel.org
CC: LKML 
---
 drivers/net/ethernet/intel/e1000e/netdev.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c 
b/drivers/net/ethernet/intel/e1000e/netdev.c
index 269087c..0626935 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -4315,7 +4315,8 @@ static cycle_t e1000e_cyclecounter_read(const struct cyclecounter *cc)
 
time_delta = systim_next - systim;
temp = time_delta;
-   rem = do_div(temp, incvalue);
+   /* VMWare users have seen incvalue of zero, don't div / 0 */
+   rem = incvalue ? do_div(temp, incvalue) : (time_delta != 0);
 
systim = systim_next;
 
-- 
1.8.1.4

Re: [PATCH net] netfilter: nf_conntrack: Use net_mutex for helper unregistration.

2016-05-06 Thread Joe Stringer

On 6 May 2016 at 04:03, Pablo Neira Ayuso  wrote:
> Hi Joe,
>
> On Thu, May 05, 2016 at 03:50:37PM -0700, Joe Stringer wrote:
>> diff --git a/net/netfilter/nf_conntrack_helper.c 
>> b/net/netfilter/nf_conntrack_helper.c
>> index 3b40ec575cd5..6860b19be406 100644
>> --- a/net/netfilter/nf_conntrack_helper.c
>> +++ b/net/netfilter/nf_conntrack_helper.c
>> @@ -449,10 +449,10 @@ void nf_conntrack_helper_unregister(struct 
>> nf_conntrack_helper *me)
>>*/
>>   synchronize_rcu();
>>
>> - rtnl_lock();
>> + mutex_lock(&net_mutex);
>>   for_each_net(net)
>>   __nf_conntrack_helper_unregister(me, net);
>> - rtnl_unlock();
>> + mutex_unlock(&net_mutex);
>
> This simple solution works because we have no .exit callbacks in any
> of our helpers. Otherwise, the helper code may be already gone by when
> the worker has a chance to run to release the netns.

I'm open to any alternative solutions, but if helper code isn't doing
this yet then perhaps this fix is sufficient?

> If so, probably I can append this as comment to this function so we
> don't forget. If we ever have .exit callbacks (I don't expect so), we
> would need to wait for worker completion.

Sounds reasonable to me.

I see there's a bunch of other unregister locations like
nf_nat_l3proto_clean(), nf_nat_l4proto_clean(), nf_unregister_hook()
which might need similar treatment?

Re: [PATCH net-next 1/2] sfc: Support setting rss_cpus to 'cores', 'packages' or 'hyperthreads'

2016-05-06 Thread David Miller

From: Edward Cree 
Date: Wed, 4 May 2016 18:01:52 +0100

> These settings autoconfigure the number of RSS channels to match the number of
> CPUs present.
> 
> Signed-off-by: Edward Cree 

I can't believe I allowed this 'rss_cpus' thing into the tree to begin with.

It's completely wrong and is exactly the kind of thing we are trying
to actively avoid in network drivers.

If another network driver wants to provide the same facility they will
add a module parameter with a slightly different name, a different
set of valid choices, and different semantics.

Define a proper global, stable, tree-wide mechanism to configure these
kinds of things and use that instead.

Thanks.

Re: [PATCH net v3] vlan: Propagate MAC address to VLANs

2016-05-06 Thread Mike Manning

On 05/06/2016 06:02 PM, Alexander Duyck wrote:
> On Fri, May 6, 2016 at 6:26 AM, Mike Manning  wrote:
>> The MAC address of the physical interface is only copied to the VLAN
>> when it is first created, resulting in an inconsistency after MAC
>> address changes of only newly created VLANs having an up-to-date MAC.
>>
>> The VLANs should continue inheriting the MAC address of the physical
>> interface, unless explicitly changed to be different from this.
>> This allows IPv6 EUI64 addresses for the VLAN to reflect any changes
>> to the MAC of the physical interface and thus for DAD to behave as
>> expected.
>>
>> Signed-off-by: Mike Manning 
>> ---
>>  include/linux/if_vlan.h |2 ++
>>  net/8021q/vlan.c|   17 +++--
>>  net/8021q/vlan_dev.c|   13 ++---
>>  3 files changed, 23 insertions(+), 9 deletions(-)
>>
>> --- a/include/linux/if_vlan.h
>> +++ b/include/linux/if_vlan.h
>> @@ -138,6 +138,7 @@ struct netpoll;
>>   * @flags: device flags
>>   * @real_dev: underlying netdevice
>>   * @real_dev_addr: address of underlying netdevice
>> + * @addr_assign_type: address assignment type
>>   * @dent: proc dir entry
>>   * @vlan_pcpu_stats: ptr to percpu rx stats
>>   */
>> @@ -153,6 +154,7 @@ struct vlan_dev_priv {
>>
>> struct net_device   *real_dev;
>> unsigned char   real_dev_addr[ETH_ALEN];
>> +   unsigned char   addr_assign_type;
>>
>> struct proc_dir_entry   *dent;
>> struct vlan_pcpu_stats __percpu *vlan_pcpu_stats;
> 
> Please don't start adding new members to structures when it already
> exists in the net_device.  If anything you should be able to drop
> real_dev_addr if you do this correctly because you shouldn't need to
> clone the lower dev address to watch for changes.  All you will need
> to do is watch NET_ADDR_STOLEN.
> 

Thanks for the detailed review. I had initially used the existing type
in net_device, but the problem with this was that it got overwritten to
NET_ADDR_SET in dev_set_mac_address(), which I was reluctant to modify.
It would just be a case of setting the type earlier in that function
(and caching the previous value in case there is an error).

However, based on your later comment, it seems I should not bother with
the approach I have here, namely that if the VLAN MAC is set to the same
value as that of the lower device MAC, that is to be considered as
resetting it and thus for MAC inheritance to resume. Instead, I will just
make this a 1-shot transition, i.e. the VLAN MAC starts off as inherited,
and if it is set to anything (even the value of the lower device MAC),
inheritance is stopped. I agree this makes for a far simpler changeset.

I don't think I can remove real_dev_addr, as that is still needed for
the existing functionality in vlan_sync_address() to determine if the sync
should be done, also as a way of caching it for handling in vlan_dev_open().

As a matter of interest, what is the advantage of not updating the VLAN
MAC when it is down? I appreciate that one should not add/delete
secondary unicast addresses in this case, but there is no such 
restriction for copying the MAC.


>> --- a/net/8021q/vlan.c
>> +++ b/net/8021q/vlan.c
>> @@ -291,6 +291,15 @@ static void vlan_sync_address(struct net
>> if (ether_addr_equal(vlan->real_dev_addr, dev->dev_addr))
>> return;
>>
>> +   /* vlan continues to inherit address of parent interface */
>> +   if (vlan->addr_assign_type == NET_ADDR_STOLEN) {
>> +   ether_addr_copy(vlandev->dev_addr, dev->dev_addr);
>> +   goto out;
>> +   }
>> +
>> +   if (!(vlandev->flags & IFF_UP))
>> +   goto out;
>> +
>> /* vlan address was different from the old address and is equal to
>>  * the new address */
>> if (!ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) &&
>> @@ -303,6 +312,7 @@ static void vlan_sync_address(struct net
>> !ether_addr_equal(vlandev->dev_addr, dev->dev_addr))
>> dev_uc_add(dev, vlandev->dev_addr);
>>
>> +out:
>> ether_addr_copy(vlan->real_dev_addr, dev->dev_addr);
>>  }
>>
>> @@ -389,13 +399,8 @@ static int vlan_device_event(struct noti
>>
>> case NETDEV_CHANGEADDR:
>> /* Adjust unicast filters on underlying device */
>> -   vlan_group_for_each_dev(grp, i, vlandev) {
>> -   flgs = vlandev->flags;
>> -   if (!(flgs & IFF_UP))
>> -   continue;
>> -
>> +   vlan_group_for_each_dev(grp, i, vlandev)
>> vlan_sync_address(dev, vlandev);
>> -   }
>> break;
>>
>> case NETDEV_CHANGEMTU:
> 
> So all of this is far more complicated than it needs to be.  If
> NET_ADDR_STOLEN is set you have to follow the lower device MAC
> address, otherwise you

Re: [PATCH net v3 2/2] udp_offload: Set encapsulation before inner completes.

2016-05-06 Thread David Miller

From: Jarno Rajahalme 
Date: Tue,  3 May 2016 16:10:21 -0700

> UDP tunnel segmentation code relies on the inner offsets being set for
> an UDP tunnel GSO packet, but the inner *_complete() functions will
> set the inner offsets only if 'encapsulation' is set before calling
> them.  Currently, udp_gro_complete() sets 'encapsulation' only after
> the inner *_complete() functions are done.  This causes the inner
> offsets having invalid values after udp_gro_complete() returns, which
> in turn will make it impossible to properly segment the packet in case
> it needs to be forwarded, which would be visible to the user either as
> invalid packets being sent or as packet loss.
> 
> This patch fixes this by setting skb's 'encapsulation' in
> udp_gro_complete() before calling into the inner complete functions,
> and by making each possible UDP tunnel gro_complete() callback set the
> inner_mac_header to the beginning of the tunnel payload.
> 
> Signed-off-by: Jarno Rajahalme 
> ---
> v3: Added setting inner_mac_header from all possible callbacks to cover
> cases where there is no inner mac header.

Alex and Tom, can you please review this new version since you guys had
so much feedback for v2?

Thanks.

Re: OpenWRT wrong adjustment of fq_codel defaults (Was: [Codel] fq_codel_drop vs a udp flood)

2016-05-06 Thread Roman Yeryomin

On 6 May 2016 at 21:43, Roman Yeryomin  wrote:
> On 6 May 2016 at 15:47, Jesper Dangaard Brouer  wrote:
>>
>> I've created a OpenWRT ticket[1] on this issue, as it seems that someone[2]
>> closed Felix'es OpenWRT email account (bad choice! emails bouncing).
>> Sounds like OpenWRT and the LEDE https://www.lede-project.org/ project
>> is in some kind of conflict.
>>
>> OpenWRT ticket [1] https://dev.openwrt.org/ticket/22349
>>
>> [2] 
>> http://thread.gmane.org/gmane.comp.embedded.openwrt.devel/40298/focus=40335
>
> OK, so, after porting the patch to 4.1 openwrt kernel and playing a
> bit with fq_codel limits I was able to get 420Mbps UDP like this:
> tc qdisc replace dev wlan0 parent :1 fq_codel flows 16 limit 256

Forgot to mention, I've reduced drop_batch_size down to 32

> This is certainly better than 30Mbps but still more than two times
> less than before (900).
> TCP also improved a little (550 to ~590).
>
> Felix, others, do you want to see the ported patch, maybe I did something 
> wrong?
> Doesn't look like it will save ath10k from performance regression.
>
>>
>> On Fri, 6 May 2016 11:42:43 +0200
>> Jesper Dangaard Brouer  wrote:
>>
>>> Hi Felix,
>>>
>>> This is an important fix for OpenWRT, please read!
>>>
>>> OpenWRT changed the default fq_codel sch->limit from 10240 to 1024,
>>> without also adjusting q->flows_cnt.  Eric explains below that you must
>>> also adjust the buckets (q->flows_cnt) for this not to break. (Just
>>> adjust it to 128)
>>>
>>> Problematic OpenWRT commit in question:
>>>  http://git.openwrt.org/?p=openwrt.git;a=patch;h=12cd6578084e
>>>  12cd6578084e ("kernel: revert fq_codel quantum override to prevent it from 
>>> causing too much cpu load with higher speed (#21326)")
>>>
>>>
>>> I also highly recommend you cherry-pick this very recent commit:
>>>  net-next: 9d18562a2278 ("fq_codel: add batch ability to fq_codel_drop()")
>>>  https://git.kernel.org/davem/net-next/c/9d18562a227
>>>
>>> This should fix very high CPU usage in-case fq_codel goes into drop mode.
>>> The problem is that drop mode was considered rare, and implementation
>>> wise it was chosen to be more expensive (to save cycles on normal mode).
>>> Unfortunately it is easy to trigger with a UDP flood. Drop mode is
>>> especially expensive for smaller devices, as it scans a 4K big array,
>>> thus 64 cache misses for small devices!
>>>
>>> The fix is to allow drop-mode to bulk-drop more packets when entering
>>> drop-mode (default 64 bulk drop).  That way we don't suddenly
>>> experience a significantly higher processing cost per packet, but
>>> instead can amortize this.
>>>
>>> To Eric, should we recommend OpenWRT to adjust default (max) 64 bulk
>>> drop, given we also recommend bucket size to be 128 ? (thus the amount
>>> of memory to scan is less, but their CPU is also much smaller).
>>>
>>> --Jesper
>>>
>>>
>>> On Thu, 05 May 2016 12:23:27 -0700 Eric Dumazet  
>>> wrote:
>>>
>>> > On Thu, 2016-05-05 at 19:25 +0300, Roman Yeryomin wrote:
>>> > > On 5 May 2016 at 19:12, Eric Dumazet  wrote:
>>> > > > On Thu, 2016-05-05 at 17:53 +0300, Roman Yeryomin wrote:
>>> > > >
>>> > > >>
>>> > > >> qdisc fq_codel 0: dev eth0 root refcnt 2 limit 1024p flows 1024
>>> > > >> quantum 1514 target 5.0ms interval 100.0ms ecn
>>> > > >>  Sent 12306 bytes 128 pkt (dropped 0, overlimits 0 requeues 0)
>>> > > >>  backlog 0b 0p requeues 0
>>> > > >>   maxpacket 0 drop_overlimit 0 new_flow_count 0 ecn_mark 0
>>> > > >>   new_flows_len 0 old_flows_len 0
>>> > > >
>>> > > >
>>> > > > Limit of 1024 packets and 1024 flows is not wise I think.
>>> > > >
>>> > > > (If all buckets are in use, each bucket has a virtual queue of 1 
>>> > > > packet,
>>> > > > which is almost the same than having no queue at all)
>>> > > >
>>> > > > I suggest to have at least 8 packets per bucket, to let Codel have a
>>> > > > chance to trigger.
>>> > > >
>>> > > > So you could either reduce number of buckets to 128 (if memory is
>>> > > > tight), or increase limit to 8192.
>>> > >
>>> > > Will try, but what I've posted is default, I didn't change/configure 
>>> > > that.
>>> >
>>> > fq_codel has a default of 10240 packets and 1024 buckets.
>>> >
>>> > http://lxr.free-electrons.com/source/net/sched/sch_fq_codel.c#L413
>>> >
>>> > If someone changed that in the linux variant you use, he probably should
>>> > explain the rationale.
>>
>> --
>> Best regards,
>>   Jesper Dangaard Brouer
>>   MSc.CS, Principal Kernel Engineer at Red Hat
>>   Author of http://www.iptv-analyzer.org
>>   LinkedIn: http://www.linkedin.com/in/brouer

Re: [PATCH] Add support for configuring Infiniband GUIDs

2016-05-06 Thread Stephen Hemminger

On Fri,  6 May 2016 10:43:25 -0500
Eli Cohen  wrote:

> Add two NLA's that allow configuration of Infiniband node or port GUIDs
> by referencing the IPoIB net device set over the physical function. The
> format to be used is as follows:
> 
> ip link set dev ib0 vf 0 node_guid 00:02:c9:03:00:21:6e:70
> ip link set dev ib0 vf 0 port_guid 00:02:c9:03:00:21:6e:78
> 
> Issue: 702759
> Change-Id: I5ffb54d6de7bfa8650bf5818f484279914991d6e
> Signed-off-by: Eli Cohen 

I am not that familiar with Infiniband, but the documentation seems
to use a non-colon form:
 # ip link set dev ib0 vf 0 node_guid 0002c90300216e70

Seems like ip should follow the lead of ibstat and friends.

Re: [PATCH iproute2 0/2] ip link gre: fix external mode handling

2016-05-06 Thread Stephen Hemminger

On Wed, 27 Apr 2016 16:11:12 +0200
Jiri Benc  wrote:

> Fix two bugs with handling of the 'external' keyword for GRE.
> 
> Jiri Benc (2):
>   ip link gre: create interfaces in external mode correctly
>   ip link gre: print only relevant info in external mode
> 
>  ip/link_gre.c | 43 +--
>  1 file changed, 25 insertions(+), 18 deletions(-)
> 

Applied

Re: OpenWRT wrong adjustment of fq_codel defaults (Was: [Codel] fq_codel_drop vs a udp flood)

2016-05-06 Thread Roman Yeryomin

On 6 May 2016 at 15:47, Jesper Dangaard Brouer  wrote:
>
> I've created a OpenWRT ticket[1] on this issue, as it seems that someone[2]
> closed Felix'es OpenWRT email account (bad choice! emails bouncing).
> Sounds like OpenWRT and the LEDE https://www.lede-project.org/ project
> is in some kind of conflict.
>
> OpenWRT ticket [1] https://dev.openwrt.org/ticket/22349
>
> [2] 
> http://thread.gmane.org/gmane.comp.embedded.openwrt.devel/40298/focus=40335

OK, so, after porting the patch to 4.1 openwrt kernel and playing a
bit with fq_codel limits I was able to get 420Mbps UDP like this:
tc qdisc replace dev wlan0 parent :1 fq_codel flows 16 limit 256

This is certainly better than 30Mbps but still more than two times
less than before (900).
TCP also improved a little (550 to ~590).

Felix, others, do you want to see the ported patch, maybe I did something wrong?
Doesn't look like it will save ath10k from performance regression.

>
> On Fri, 6 May 2016 11:42:43 +0200
> Jesper Dangaard Brouer  wrote:
>
>> Hi Felix,
>>
>> This is an important fix for OpenWRT, please read!
>>
>> OpenWRT changed the default fq_codel sch->limit from 10240 to 1024,
>> without also adjusting q->flows_cnt.  Eric explains below that you must
>> also adjust the buckets (q->flows_cnt) for this not to break. (Just
>> adjust it to 128)
>>
>> Problematic OpenWRT commit in question:
>>  http://git.openwrt.org/?p=openwrt.git;a=patch;h=12cd6578084e
>>  12cd6578084e ("kernel: revert fq_codel quantum override to prevent it from 
>> causing too much cpu load with higher speed (#21326)")
>>
>>
>> I also highly recommend you cherry-pick this very recent commit:
>>  net-next: 9d18562a2278 ("fq_codel: add batch ability to fq_codel_drop()")
>>  https://git.kernel.org/davem/net-next/c/9d18562a227
>>
>> This should fix very high CPU usage in-case fq_codel goes into drop mode.
>> The problem is that drop mode was considered rare, and implementation
>> wise it was chosen to be more expensive (to save cycles on normal mode).
>> Unfortunately it is easy to trigger with a UDP flood. Drop mode is
>> especially expensive for smaller devices, as it scans a 4K big array,
>> thus 64 cache misses for small devices!
>>
>> The fix is to allow drop-mode to bulk-drop more packets when entering
>> drop-mode (default 64 bulk drop).  That way we don't suddenly
>> experience a significantly higher processing cost per packet, but
>> instead can amortize this.
>>
>> To Eric, should we recommend OpenWRT to adjust default (max) 64 bulk
>> drop, given we also recommend bucket size to be 128 ? (thus the amount
>> of memory to scan is less, but their CPU is also much smaller).
>>
>> --Jesper
>>
>>
>> On Thu, 05 May 2016 12:23:27 -0700 Eric Dumazet  
>> wrote:
>>
>> > On Thu, 2016-05-05 at 19:25 +0300, Roman Yeryomin wrote:
>> > > On 5 May 2016 at 19:12, Eric Dumazet  wrote:
>> > > > On Thu, 2016-05-05 at 17:53 +0300, Roman Yeryomin wrote:
>> > > >
>> > > >>
>> > > >> qdisc fq_codel 0: dev eth0 root refcnt 2 limit 1024p flows 1024
>> > > >> quantum 1514 target 5.0ms interval 100.0ms ecn
>> > > >>  Sent 12306 bytes 128 pkt (dropped 0, overlimits 0 requeues 0)
>> > > >>  backlog 0b 0p requeues 0
>> > > >>   maxpacket 0 drop_overlimit 0 new_flow_count 0 ecn_mark 0
>> > > >>   new_flows_len 0 old_flows_len 0
>> > > >
>> > > >
>> > > > Limit of 1024 packets and 1024 flows is not wise I think.
>> > > >
>> > > > (If all buckets are in use, each bucket has a virtual queue of 1 
>> > > > packet,
>> > > > which is almost the same than having no queue at all)
>> > > >
>> > > > I suggest to have at least 8 packets per bucket, to let Codel have a
>> > > > chance to trigger.
>> > > >
>> > > > So you could either reduce number of buckets to 128 (if memory is
>> > > > tight), or increase limit to 8192.
>> > >
>> > > Will try, but what I've posted is default, I didn't change/configure 
>> > > that.
>> >
>> > fq_codel has a default of 10240 packets and 1024 buckets.
>> >
>> > http://lxr.free-electrons.com/source/net/sched/sch_fq_codel.c#L413
>> >
>> > If someone changed that in the linux variant you use, he probably should
>> > explain the rationale.
>
> --
> Best regards,
>   Jesper Dangaard Brouer
>   MSc.CS, Principal Kernel Engineer at Red Hat
>   Author of http://www.iptv-analyzer.org
>   LinkedIn: http://www.linkedin.com/in/brouer

Re: [REGRESSION] asix: Lots of asix_rx_fixup() errors and slow transmissions

2016-05-06 Thread John Stultz

On Tue, May 3, 2016 at 2:16 PM, Dean Jenkins  wrote:
> A good test would be to run "ping -c 1 -s $packet_length $ip_address" inside
> a script which has a loop with an increasing payload length $packet_length
> with a small delay between ping calls. This will show whether particular
> packet sizes trigger the failures.
>
> Then try with "ping -f -c 200 -s $packet_length $ip_address" to load up the
> USB link.

I've tried both of these on my x86_64 system.  I can send single pings
up to 65507 without triggering the issue (after which I get errors
sending on the host side as I think I cross a 64k boundary with
headers, not the asix errors).

Then when I try ping -f -c 200 -s 65507 $ip_address, I don't see any
failures. I did it for a count of 2000 as well without any issues.

I'll be adding more debug prints in soon.

thanks
-john

Re: [PATCH] netdev: enc28j60 kernel panic fix.

2016-05-06 Thread David Russell

I kind of thought my patch was at best incomplete.  When you state
this change silences the bug but does not fix it, what are the
implications of systems running this patch?  We have some production
systems using this patch.  They reboot daily, but have been solid.

In addition, if we sent you a pi and the ethernet controller and a
small but reasonable sum of money for your labor, would you be able to
properly fix it?  Short of that, do you have any recommendations on
quick overviews of the networking stack in the kernel and then
documentation on the various flags and such?

Thanks.

-David Russell
APRS World, LLC
http://www.aprsworld.com/


On Thu, May 5, 2016 at 3:51 AM, Francois Romieu  wrote:
> David Russell  :
>> When connected directly to another system (not via a switch)
>> eventually a condition where a NULL pointer dereference occurs in
>> enc28j60_hw_tx() and this patch simply checks for that condition and
>> returns gracefully without causing a kernel panic.  I believe, but
>> have not investigated this is caused by a packet collision and am not
>> sure if the kernel tracks collisions or counts them as errors, so that
>> should probably be added if this is what's happening.  I'm also not
>> familiar with the linux kernel, so may have fixed this in a less than
>> ideal way.
>
> Is it possible for EIR.EIR_TXERIF and EIR.EIR_TXIF to be set for the
> same packet ?
>
> If so the driver is intrinsically racy:
> - EIR.EIR_TXIF completes transmission, clears tx_skb and enables queueing
>   again (see netif_wake_queue in enc28j60_tx_clear)
>
> - insert start_xmit here: tx_skb is set and enc28j60_hw_tx is scheduled
>   for late execution (user context work)
>
> - EIR.EIR_EIR.EIR_TXERIF issues same enc28j60_tx_clear and clears tx_skb
>
> - enc28j60_hw_tx is run but tx_skb is NULL
>
>> diff --git a/drivers/net/ethernet/microchip/enc28j60.c
>> b/drivers/net/ethernet/microchip/enc28j60.c
>> index 86ea17e..36ac65f 100644
>> --- a/drivers/net/ethernet/microchip/enc28j60.c
>> +++ b/drivers/net/ethernet/microchip/enc28j60.c
>> @@ -1233,6 +1233,9 @@ static void enc28j60_irq_work_handler(struct
>> work_struct *work)
>>   */
>>  static void enc28j60_hw_tx(struct enc28j60_net *priv)
>>  {
>> +   if (!priv->tx_skb)
>> +   return;
>> +
>> if (netif_msg_tx_queued(priv))
>> printk(KERN_DEBUG DRV_NAME
>> ": Tx Packet Len:%d\n", priv->tx_skb->len);
>
> enc28j60_hw_tx isn't the culprit. It's the victim.
>
> This change silences the bug but it does not fix it at all.
>
> --
> Ueimor

Re: [PATCH v2] rtlwifi: pci: use dev_kfree_skb_irq instead of kfree_skb in rtl_pci_reset_trx_ring

2016-05-06 Thread Alexander Duyck

On Fri, May 6, 2016 at 11:01 AM, Larry Finger  wrote:
> On 05/06/2016 12:13 PM, Alexander Duyck wrote:
>>
>> On Fri, May 6, 2016 at 9:33 AM, Wang YanQing  wrote:
>>>
>>> We can't use kfree_skb in an irq-disabled context. Because
>>> spin_lock_irqsave ensures we are always in an irq-disabled context here,
>>> using dev_kfree_skb_irq instead of kfree_skb is better than
>>> dev_kfree_skb_any.
>>>
>>> This patch fixes the kernel warning below:
>>> [ 7612.095528] [ cut here ]
>>> [ 7612.095546] WARNING: CPU: 3 PID: 4460 at kernel/softirq.c:150
>>> __local_bh_enable_ip+0x58/0x80()
>>> [ 7612.095550] Modules linked in: rtl8723be x86_pkg_temp_thermal
>>> btcoexist rtl_pci rtlwifi rtl8723_common
>>> [ 7612.095567] CPU: 3 PID: 4460 Comm: ifconfig Tainted: GW
>>> 4.4.0+ #4
>>> [ 7612.095570] Hardware name: LENOVO 20DFA04FCD/20DFA04FCD, BIOS J5ET48WW
>>> (1.19 ) 08/27/2015
>>> [ 7612.095574]    da37fc70 c12ce7c5  da37fca0
>>> c104cc59 c19d4454
>>> [ 7612.095584]  0003 116c c19d4784 0096 c10508a8 c10508a8
>>> 0200 c1b42400
>>> [ 7612.095594]  f29be780 da37fcb0 c104ccad 0009  da37fcbc
>>> c10508a8 f21f08b8
>>> [ 7612.095604] Call Trace:
>>> [ 7612.095614]  [] dump_stack+0x41/0x5c
>>> [ 7612.095620]  [] warn_slowpath_common+0x89/0xc0
>>> [ 7612.095628]  [] ? __local_bh_enable_ip+0x58/0x80
>>> [ 7612.095634]  [] ? __local_bh_enable_ip+0x58/0x80
>>> [ 7612.095640]  [] warn_slowpath_null+0x1d/0x20
>>> [ 7612.095646]  [] __local_bh_enable_ip+0x58/0x80
>>> [ 7612.095653]  [] destroy_conntrack+0x64/0xa0
>>> [ 7612.095660]  [] nf_conntrack_destroy+0xf/0x20
>>> [ 7612.095665]  [] skb_release_head_state+0x55/0xa0
>>> [ 7612.095670]  [] skb_release_all+0xb/0x20
>>> [ 7612.095674]  [] __kfree_skb+0xb/0x60
>>> [ 7612.095679]  [] kfree_skb+0x30/0x70
>>> [ 7612.095686]  [] ? rtl_pci_reset_trx_ring+0x22d/0x370
>>> [rtl_pci]
>>> [ 7612.095692]  [] rtl_pci_reset_trx_ring+0x22d/0x370 [rtl_pci]
>>> [ 7612.095698]  [] rtl_pci_start+0x19/0x190 [rtl_pci]
>>> [ 7612.095705]  [] rtl_op_start+0x56/0x90 [rtlwifi]
>>> [ 7612.095712]  [] drv_start+0x36/0xc0
>>> [ 7612.095717]  [] ieee80211_do_open+0x2d3/0x890
>>> [ 7612.095725]  [] ? call_netdevice_notifiers_info+0x2e/0x60
>>> [ 7612.095730]  [] ieee80211_open+0x4d/0x50
>>> [ 7612.095736]  [] __dev_open+0xa3/0x130
>>> [ 7612.095742]  [] ? _raw_spin_unlock_bh+0x13/0x20
>>> [ 7612.095748]  [] __dev_change_flags+0x89/0x140
>>> [ 7612.095753]  [] ? selinux_capable+0xd/0x10
>>> [ 7612.095759]  [] dev_change_flags+0x29/0x60
>>> [ 7612.095765]  [] devinet_ioctl+0x553/0x670
>>> [ 7612.095772]  [] ? _copy_to_user+0x28/0x40
>>> [ 7612.095777]  [] inet_ioctl+0x85/0xb0
>>> [ 7612.095783]  [] sock_ioctl+0x67/0x260
>>> [ 7612.095788]  [] ? sock_fasync+0x80/0x80
>>> [ 7612.095795]  [] do_vfs_ioctl+0x6b/0x550
>>> [ 7612.095800]  [] ? selinux_file_ioctl+0x102/0x1e0
>>> [ 7612.095807]  [] ? timekeeping_suspend+0x294/0x320
>>> [ 7612.095813]  [] ? __hrtimer_run_queues+0x14a/0x210
>>> [ 7612.095820]  [] ? security_file_ioctl+0x34/0x50
>>> [ 7612.095827]  [] SyS_ioctl+0x70/0x80
>>> [ 7612.095832]  [] do_fast_syscall_32+0x84/0x120
>>> [ 7612.095839]  [] sysenter_past_esp+0x36/0x55
>>> [ 7612.095844] ---[ end trace 97e9c637a20e8348 ]---
>>>
>>> Signed-off-by: Wang YanQing 
>>> Cc: Stable 
>>> ---
>>>   Changes:
>>>   v1-v2:
>>>   1: add a Cc to stable.
>>>
>>>   drivers/net/wireless/realtek/rtlwifi/pci.c | 2 +-
>>>   1 file changed, 1 insertion(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/net/wireless/realtek/rtlwifi/pci.c
>>> b/drivers/net/wireless/realtek/rtlwifi/pci.c
>>> index 1ac41b8..99a3a03 100644
>>> --- a/drivers/net/wireless/realtek/rtlwifi/pci.c
>>> +++ b/drivers/net/wireless/realtek/rtlwifi/pci.c
>>> @@ -1572,7 +1572,7 @@ int rtl_pci_reset_trx_ring(struct ieee80211_hw *hw)
>>>   true,
>>>
>>> HW_DESC_TXBUFF_ADDR),
>>>   skb->len,
>>> PCI_DMA_TODEVICE);
>>> -   kfree_skb(skb);
>>> +   dev_kfree_skb_irq(skb);
>>>  ring->idx = (ring->idx + 1) %
>>> ring->entries;
>>>  }
>>>  ring->idx = 0;
>>
>>
>> Is this always called in IRQ context?  You might be better off using
>> dev_kfree_skb_any instead if this is something that can be called from
>> net_device_ops since that way you avoid having to call into the Tx
>> softirq cleanup routine to free the buffers later unless you really
>> need it.
>>
>> - Alex
>>
>
> Alex,
>
> Six lines below the change is a spin_unlock_irqrestore(), which is always
> called. I believe that the patch is correct.

Okay.  That works then.

Thanks.

- Alex

Re: [PATCH v2] rtlwifi: pci: use dev_kfree_skb_irq instead of kfree_skb in rtl_pci_reset_trx_ring

2016-05-06 Thread Larry Finger


On 05/06/2016 12:13 PM, Alexander Duyck wrote:

On Fri, May 6, 2016 at 9:33 AM, Wang YanQing  wrote:

We can't use kfree_skb in an irq-disabled context. Because
spin_lock_irqsave ensures we are always in an irq-disabled context here,
using dev_kfree_skb_irq instead of kfree_skb is better than
dev_kfree_skb_any.

This patch fixes the kernel warning below:
[ 7612.095528] [ cut here ]
[ 7612.095546] WARNING: CPU: 3 PID: 4460 at kernel/softirq.c:150 
__local_bh_enable_ip+0x58/0x80()
[ 7612.095550] Modules linked in: rtl8723be x86_pkg_temp_thermal btcoexist 
rtl_pci rtlwifi rtl8723_common
[ 7612.095567] CPU: 3 PID: 4460 Comm: ifconfig Tainted: GW   4.4.0+ 
#4
[ 7612.095570] Hardware name: LENOVO 20DFA04FCD/20DFA04FCD, BIOS J5ET48WW (1.19 
) 08/27/2015
[ 7612.095574]    da37fc70 c12ce7c5  da37fca0 c104cc59 
c19d4454
[ 7612.095584]  0003 116c c19d4784 0096 c10508a8 c10508a8 0200 
c1b42400
[ 7612.095594]  f29be780 da37fcb0 c104ccad 0009  da37fcbc c10508a8 
f21f08b8
[ 7612.095604] Call Trace:
[ 7612.095614]  [] dump_stack+0x41/0x5c
[ 7612.095620]  [] warn_slowpath_common+0x89/0xc0
[ 7612.095628]  [] ? __local_bh_enable_ip+0x58/0x80
[ 7612.095634]  [] ? __local_bh_enable_ip+0x58/0x80
[ 7612.095640]  [] warn_slowpath_null+0x1d/0x20
[ 7612.095646]  [] __local_bh_enable_ip+0x58/0x80
[ 7612.095653]  [] destroy_conntrack+0x64/0xa0
[ 7612.095660]  [] nf_conntrack_destroy+0xf/0x20
[ 7612.095665]  [] skb_release_head_state+0x55/0xa0
[ 7612.095670]  [] skb_release_all+0xb/0x20
[ 7612.095674]  [] __kfree_skb+0xb/0x60
[ 7612.095679]  [] kfree_skb+0x30/0x70
[ 7612.095686]  [] ? rtl_pci_reset_trx_ring+0x22d/0x370 [rtl_pci]
[ 7612.095692]  [] rtl_pci_reset_trx_ring+0x22d/0x370 [rtl_pci]
[ 7612.095698]  [] rtl_pci_start+0x19/0x190 [rtl_pci]
[ 7612.095705]  [] rtl_op_start+0x56/0x90 [rtlwifi]
[ 7612.095712]  [] drv_start+0x36/0xc0
[ 7612.095717]  [] ieee80211_do_open+0x2d3/0x890
[ 7612.095725]  [] ? call_netdevice_notifiers_info+0x2e/0x60
[ 7612.095730]  [] ieee80211_open+0x4d/0x50
[ 7612.095736]  [] __dev_open+0xa3/0x130
[ 7612.095742]  [] ? _raw_spin_unlock_bh+0x13/0x20
[ 7612.095748]  [] __dev_change_flags+0x89/0x140
[ 7612.095753]  [] ? selinux_capable+0xd/0x10
[ 7612.095759]  [] dev_change_flags+0x29/0x60
[ 7612.095765]  [] devinet_ioctl+0x553/0x670
[ 7612.095772]  [] ? _copy_to_user+0x28/0x40
[ 7612.095777]  [] inet_ioctl+0x85/0xb0
[ 7612.095783]  [] sock_ioctl+0x67/0x260
[ 7612.095788]  [] ? sock_fasync+0x80/0x80
[ 7612.095795]  [] do_vfs_ioctl+0x6b/0x550
[ 7612.095800]  [] ? selinux_file_ioctl+0x102/0x1e0
[ 7612.095807]  [] ? timekeeping_suspend+0x294/0x320
[ 7612.095813]  [] ? __hrtimer_run_queues+0x14a/0x210
[ 7612.095820]  [] ? security_file_ioctl+0x34/0x50
[ 7612.095827]  [] SyS_ioctl+0x70/0x80
[ 7612.095832]  [] do_fast_syscall_32+0x84/0x120
[ 7612.095839]  [] sysenter_past_esp+0x36/0x55
[ 7612.095844] ---[ end trace 97e9c637a20e8348 ]---

Signed-off-by: Wang YanQing 
Cc: Stable 
---
  Changes:
  v1-v2:
  1: add a Cc to stable.

  drivers/net/wireless/realtek/rtlwifi/pci.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/realtek/rtlwifi/pci.c 
b/drivers/net/wireless/realtek/rtlwifi/pci.c
index 1ac41b8..99a3a03 100644
--- a/drivers/net/wireless/realtek/rtlwifi/pci.c
+++ b/drivers/net/wireless/realtek/rtlwifi/pci.c
@@ -1572,7 +1572,7 @@ int rtl_pci_reset_trx_ring(struct ieee80211_hw *hw)
  true,
  HW_DESC_TXBUFF_ADDR),
  skb->len, PCI_DMA_TODEVICE);
-   kfree_skb(skb);
+   dev_kfree_skb_irq(skb);
 ring->idx = (ring->idx + 1) % ring->entries;
 }
 ring->idx = 0;


Is this always called in IRQ context?  You might be better off using
dev_kfree_skb_any instead if this is something that can be called from
net_device_ops since that way you avoid having to call into the Tx
softirq cleanup routine to free the buffers later unless you really
need it.

- Alex



Alex,

Six lines below the change is a spin_unlock_irqrestore(), which is always 
called. I believe that the patch is correct.


Larry

[PATCH v1 1/1] ISDN: eicon: replace custom hex_asc_lo() / hex_pack_byte()

2016-05-06 Thread Andy Shevchenko

Instead of custom approach re-use generic helpers to convert byte to hex
format.

Signed-off-by: Andy Shevchenko 
---
 drivers/isdn/hardware/eicon/message.c | 21 +++--
 1 file changed, 7 insertions(+), 14 deletions(-)

diff --git a/drivers/isdn/hardware/eicon/message.c 
b/drivers/isdn/hardware/eicon/message.c
index d7c2866..1a1d997 100644
--- a/drivers/isdn/hardware/eicon/message.c
+++ b/drivers/isdn/hardware/eicon/message.c
@@ -1147,8 +1147,6 @@ static byte test_c_ind_mask_bit(PLCI *plci, word b)
 
 static void dump_c_ind_mask(PLCI *plci)
 {
-   static char hex_digit_table[0x10] =
-   {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 
'c', 'd', 'e', 'f'};
word i, j, k;
dword d;
char *p;
@@ -1165,7 +1163,7 @@ static void dump_c_ind_mask(PLCI *plci)
d = plci->c_ind_mask_table[i + j];
for (k = 0; k < 8; k++)
{
-   *(--p) = hex_digit_table[d & 0xf];
+   *(--p) = hex_asc_lo(d);
d >>= 4;
}
}
@@ -10507,7 +10505,6 @@ static void mixer_set_bchannel_id(PLCI *plci, byte *chi)
 
 static void mixer_calculate_coefs(DIVA_CAPI_ADAPTER *a)
 {
-   static char hex_digit_table[0x10] = {'0', '1', '2', '3', '4', '5', '6', 
'7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'};
word n, i, j;
char *p;
char hex_line[2 * MIXER_MAX_DUMP_CHANNELS + MIXER_MAX_DUMP_CHANNELS / 8 
+ 4];
@@ -10690,13 +10687,13 @@ static void mixer_calculate_coefs(DIVA_CAPI_ADAPTER 
*a)
n = li_total_channels;
if (n > MIXER_MAX_DUMP_CHANNELS)
n = MIXER_MAX_DUMP_CHANNELS;
+
p = hex_line;
for (j = 0; j < n; j++)
{
if ((j & 0x7) == 0)
*(p++) = ' ';
-   *(p++) = hex_digit_table[li_config_table[j].curchnl >> 4];
-   *(p++) = hex_digit_table[li_config_table[j].curchnl & 0xf];
+   p = hex_byte_pack(p, li_config_table[j].curchnl);
}
*p = '\0';
dbug(1, dprintf("[%06lx] CURRENT %s",
@@ -10706,8 +10703,7 @@ static void mixer_calculate_coefs(DIVA_CAPI_ADAPTER *a)
{
if ((j & 0x7) == 0)
*(p++) = ' ';
-   *(p++) = hex_digit_table[li_config_table[j].channel >> 4];
-   *(p++) = hex_digit_table[li_config_table[j].channel & 0xf];
+   p = hex_byte_pack(p, li_config_table[j].channel);
}
*p = '\0';
dbug(1, dprintf("[%06lx] CHANNEL %s",
@@ -10717,8 +10713,7 @@ static void mixer_calculate_coefs(DIVA_CAPI_ADAPTER *a)
{
if ((j & 0x7) == 0)
*(p++) = ' ';
-   *(p++) = hex_digit_table[li_config_table[j].chflags >> 4];
-   *(p++) = hex_digit_table[li_config_table[j].chflags & 0xf];
+   p = hex_byte_pack(p, li_config_table[j].chflags);
}
*p = '\0';
dbug(1, dprintf("[%06lx] CHFLAG  %s",
@@ -10730,8 +10725,7 @@ static void mixer_calculate_coefs(DIVA_CAPI_ADAPTER *a)
{
if ((j & 0x7) == 0)
*(p++) = ' ';
-   *(p++) = 
hex_digit_table[li_config_table[i].flag_table[j] >> 4];
-   *(p++) = 
hex_digit_table[li_config_table[i].flag_table[j] & 0xf];
+   p = hex_byte_pack(p, li_config_table[i].flag_table[j]);
}
*p = '\0';
dbug(1, dprintf("[%06lx] FLAG[%02x]%s",
@@ -10744,8 +10738,7 @@ static void mixer_calculate_coefs(DIVA_CAPI_ADAPTER *a)
{
if ((j & 0x7) == 0)
*(p++) = ' ';
-   *(p++) = 
hex_digit_table[li_config_table[i].coef_table[j] >> 4];
-   *(p++) = 
hex_digit_table[li_config_table[i].coef_table[j] & 0xf];
+   p = hex_byte_pack(p, li_config_table[i].coef_table[j]);
}
*p = '\0';
dbug(1, dprintf("[%06lx] COEF[%02x]%s",
-- 
2.8.1

Re: [REGRESSION] asix: Lots of asix_rx_fixup() errors and slow transmissions

2016-05-06 Thread John Stultz

On Fri, May 6, 2016 at 8:00 AM, Dean Jenkins  wrote:
> My conclusion is that your USB to Ethernet Adaptor is not running at high
> speed (480Mbps) mode which is causing a partial loss (corruption) of
> Ethernet frames across the USB link. A USB Protocol Analyser or software
> tool usbmon could be used to confirm this scenario.
>
> Therefore please retest with a working high-speed USB hub or remove the
> full-speed USB hub from the test environment and directly connect the USB to
> Ethernet Adaptor to the root hub of the USB port. Then repeat the tests to
> see whether anything improved.
>
> In other words, you need to eliminate the dmesg messages saying "not running
> at top speed; connect to a high speed hub".

The aarch64 system has a quirk that at the moment limits it to the
slower full-speed mode, which also exacerbates the issue (basically
taking a fairly slow 1.1Mb/s network connection without your patch,
to an almost unusable 30Kb/s with it).

But that isn't the case on the x86_64 system, which is seeing a very
similar problem (though the performance effect isn't nearly as bad, as
the error rate in time seems relatively similar on both, and I think
my scp transmissions are cpu bound on this atom board :).

thanks
-john

[PATCH v2] Documentation/networking: more accurate LCO explanation

2016-05-06 Thread Shmulik Ladkani

In a few places the term "ones-complement sum" was used but the actual
meaning is "the complement of the ones-complement sum".

Also, avoid enclosing long statements with underscore, to ease
readability.

Signed-off-by: Shmulik Ladkani 
Acked-by: Edward Cree 

---
 v2:
  - Fixed one more occurrence where "complement of" was missing
  - Got rid of unreadable underscore wrapped statements

 Took the liberty having the underscore removal as part of this patch.
 Let me know if you feel this needs a patch split.

 Documentation/networking/checksum-offloads.txt | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/Documentation/networking/checksum-offloads.txt 
b/Documentation/networking/checksum-offloads.txt
index de2a327766..56e3686124 100644
--- a/Documentation/networking/checksum-offloads.txt
+++ b/Documentation/networking/checksum-offloads.txt
@@ -69,18 +69,18 @@ LCO: Local Checksum Offload
 LCO is a technique for efficiently computing the outer checksum of an
  encapsulated datagram when the inner checksum is due to be offloaded.
 The ones-complement sum of a correctly checksummed TCP or UDP packet is
- equal to the sum of the pseudo header, because everything else gets
- 'cancelled out' by the checksum field.  This is because the sum was
+ equal to the complement of the sum of the pseudo header, because everything
+ else gets 'cancelled out' by the checksum field.  This is because the sum was
  complemented before being written to the checksum field.
 More generally, this holds in any case where the 'IP-style' ones complement
  checksum is used, and thus any checksum that TX Checksum Offload supports.
 That is, if we have set up TX Checksum Offload with a start/offset pair, we
- know that _after the device has filled in that checksum_, the ones
+ know that after the device has filled in that checksum, the ones
  complement sum from csum_start to the end of the packet will be equal to
- _whatever value we put in the checksum field beforehand_.  This allows us
- to compute the outer checksum without looking at the payload: we simply
- stop summing when we get to csum_start, then add the 16-bit word at
- (csum_start + csum_offset).
+ the complement of whatever value we put in the checksum field beforehand.
+ This allows us to compute the outer checksum without looking at the payload:
+ we simply stop summing when we get to csum_start, then add the complement of
+ the 16-bit word at (csum_start + csum_offset).
 Then, when the true inner checksum is filled in (either by hardware or by
  skb_checksum_help()), the outer checksum will become correct by virtue of
  the arithmetic.
-- 
2.7.4

Re: [PATCH v2] rtlwifi: pci: use dev_kfree_skb_irq instead of kfree_skb in rtl_pci_reset_trx_ring

2016-05-06 Thread Alexander Duyck

On Fri, May 6, 2016 at 9:33 AM, Wang YanQing  wrote:
> We can't use kfree_skb in irq disable context, because spin_lock_irqsave
> make sure we are always in irq disable context, use dev_kfree_skb_irq
> instead of kfree_skb is better than dev_kfree_skb_any.
>
> This patch fix below kernel warning:
> [ 7612.095528] [ cut here ]
> [ 7612.095546] WARNING: CPU: 3 PID: 4460 at kernel/softirq.c:150 
> __local_bh_enable_ip+0x58/0x80()
> [ 7612.095550] Modules linked in: rtl8723be x86_pkg_temp_thermal btcoexist 
> rtl_pci rtlwifi rtl8723_common
> [ 7612.095567] CPU: 3 PID: 4460 Comm: ifconfig Tainted: GW   
> 4.4.0+ #4
> [ 7612.095570] Hardware name: LENOVO 20DFA04FCD/20DFA04FCD, BIOS J5ET48WW 
> (1.19 ) 08/27/2015
> [ 7612.095574]    da37fc70 c12ce7c5  da37fca0 
> c104cc59 c19d4454
> [ 7612.095584]  0003 116c c19d4784 0096 c10508a8 c10508a8 
> 0200 c1b42400
> [ 7612.095594]  f29be780 da37fcb0 c104ccad 0009  da37fcbc 
> c10508a8 f21f08b8
> [ 7612.095604] Call Trace:
> [ 7612.095614]  [] dump_stack+0x41/0x5c
> [ 7612.095620]  [] warn_slowpath_common+0x89/0xc0
> [ 7612.095628]  [] ? __local_bh_enable_ip+0x58/0x80
> [ 7612.095634]  [] ? __local_bh_enable_ip+0x58/0x80
> [ 7612.095640]  [] warn_slowpath_null+0x1d/0x20
> [ 7612.095646]  [] __local_bh_enable_ip+0x58/0x80
> [ 7612.095653]  [] destroy_conntrack+0x64/0xa0
> [ 7612.095660]  [] nf_conntrack_destroy+0xf/0x20
> [ 7612.095665]  [] skb_release_head_state+0x55/0xa0
> [ 7612.095670]  [] skb_release_all+0xb/0x20
> [ 7612.095674]  [] __kfree_skb+0xb/0x60
> [ 7612.095679]  [] kfree_skb+0x30/0x70
> [ 7612.095686]  [] ? rtl_pci_reset_trx_ring+0x22d/0x370 [rtl_pci]
> [ 7612.095692]  [] rtl_pci_reset_trx_ring+0x22d/0x370 [rtl_pci]
> [ 7612.095698]  [] rtl_pci_start+0x19/0x190 [rtl_pci]
> [ 7612.095705]  [] rtl_op_start+0x56/0x90 [rtlwifi]
> [ 7612.095712]  [] drv_start+0x36/0xc0
> [ 7612.095717]  [] ieee80211_do_open+0x2d3/0x890
> [ 7612.095725]  [] ? call_netdevice_notifiers_info+0x2e/0x60
> [ 7612.095730]  [] ieee80211_open+0x4d/0x50
> [ 7612.095736]  [] __dev_open+0xa3/0x130
> [ 7612.095742]  [] ? _raw_spin_unlock_bh+0x13/0x20
> [ 7612.095748]  [] __dev_change_flags+0x89/0x140
> [ 7612.095753]  [] ? selinux_capable+0xd/0x10
> [ 7612.095759]  [] dev_change_flags+0x29/0x60
> [ 7612.095765]  [] devinet_ioctl+0x553/0x670
> [ 7612.095772]  [] ? _copy_to_user+0x28/0x40
> [ 7612.095777]  [] inet_ioctl+0x85/0xb0
> [ 7612.095783]  [] sock_ioctl+0x67/0x260
> [ 7612.095788]  [] ? sock_fasync+0x80/0x80
> [ 7612.095795]  [] do_vfs_ioctl+0x6b/0x550
> [ 7612.095800]  [] ? selinux_file_ioctl+0x102/0x1e0
> [ 7612.095807]  [] ? timekeeping_suspend+0x294/0x320
> [ 7612.095813]  [] ? __hrtimer_run_queues+0x14a/0x210
> [ 7612.095820]  [] ? security_file_ioctl+0x34/0x50
> [ 7612.095827]  [] SyS_ioctl+0x70/0x80
> [ 7612.095832]  [] do_fast_syscall_32+0x84/0x120
> [ 7612.095839]  [] sysenter_past_esp+0x36/0x55
> [ 7612.095844] ---[ end trace 97e9c637a20e8348 ]---
>
> Signed-off-by: Wang YanQing 
> Cc: Stable 
> ---
>  Changes:
>  v1-v2:
>  1: add a Cc to stable.
>
>  drivers/net/wireless/realtek/rtlwifi/pci.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/net/wireless/realtek/rtlwifi/pci.c 
> b/drivers/net/wireless/realtek/rtlwifi/pci.c
> index 1ac41b8..99a3a03 100644
> --- a/drivers/net/wireless/realtek/rtlwifi/pci.c
> +++ b/drivers/net/wireless/realtek/rtlwifi/pci.c
> @@ -1572,7 +1572,7 @@ int rtl_pci_reset_trx_ring(struct ieee80211_hw *hw)
>  true,
>  HW_DESC_TXBUFF_ADDR),
>  skb->len, PCI_DMA_TODEVICE);
> -   kfree_skb(skb);
> +   dev_kfree_skb_irq(skb);
> ring->idx = (ring->idx + 1) % ring->entries;
> }
> ring->idx = 0;

Is this always called in IRQ context?  You might be better off using
dev_kfree_skb_any instead if this is something that can be called from
net_device_ops since that way you avoid having to call into the Tx
softirq cleanup routine to free the buffers later unless you really
need it.

- Alex

Re: [REGRESSION] asix: Lots of asix_rx_fixup() errors and slow transmissions

2016-05-06 Thread John Stultz

On Thu, May 5, 2016 at 1:11 AM, Dean Jenkins  wrote:
> On 05/05/16 00:45, John Stultz wrote:
>>
>> Just as a sample point, I have managed to reproduce exactly this issue
>> on an x86_64 system by simply scp'ing a large file.
>
> Please tell us the x86_64 kernel version number that you used and which
> Linux Distribution it was ? This allows other people a chance to reproduce
> your observations.

Sorry for being a little slow here, had some other issues I had to chase.

On my x86_64 system, it's Ubuntu 14.04.4, with a 4.6.0-rc2 kernel.


>> [  417.819276] asix 1-5:1.0 eth1: asix_rx_fixup() Data Header
>> synchronisation was lost, remaining 988
>
> It is interesting that the reported "remaining" value is 988. Is 988 always
> shown ? I mean that do you see any other "remaining" values for the "Data
> Header synchronisation was lost" error message ?

Yep. It's always the same 988 remaining, on either architecture.


>> [  417.823415] asix 1-5:1.0 eth1: asix_rx_fixup() Bad Header Length
>> 0xef830347, offset 4
>
> The gap in the timestamps shows 417.823415 - 417.819276 = 0.004139 = 4ms
> which is a large gap in terms of USB 2.0 high speed communications. This gap
> is expected to be in the 100us range for consecutive URBs. So 4ms is
> strange.
>
> The expectation is that the "Data Header synchronisation was lost" error
> message resets the 32-bit header word synchronisation to the start of the
> next URB buffer. The "Bad Header Length, offset 4" is the expected outcome
> for the next URB because it is unlikely the 32-bit header word is at the
> start of URB buffer due to Ethernet frames spanning across URBs.
>>
>> [  417.827502] asix 1-5:1.0 eth1: asix_rx_fixup() Bad Header Length
>> 0x31e2b348, offset 4
>
> Timestamps show the gap to be 4ms which is strange for USB 2.0 high speed,
> are you sure high speed mode is being used ?
>>

Yep, on my x86_64 system, it seems to be.

[3.101115] usb 1-5: new high-speed USB device number 2 using ehci-pci
[3.232309] usb 1-5: New USB device found, idVendor=0b95, idProduct=772b
[3.232327] usb 1-5: New USB device strings: Mfr=1, Product=2, SerialNumber=3
[3.232339] usb 1-5: Product: AX88772B
[3.232350] usb 1-5: Manufacturer: ASIX Elec. Corp.
[3.232360] usb 1-5: SerialNumber: 188298
[4.032206] asix 1-5:1.0 eth1: register 'asix' at
usb-:00:04.1-5, ASIX AX88772B USB 2.0 Ethernet, 00:50:b6:18:82:98


> Please can you supply the output of ifconfig for the USB to Ethernet
> adaptor, your example above shows eth1 as the device.
>
> Please show the output of ifconfig eth1 before and after the issue is seen.
> This will show us whether the kernel logs any network errors and how many
> bytes have been transferred.

Before:
$ ifconfig eth1
eth1  Link encap:Ethernet  HWaddr 00:50:b6:18:82:98
  inet addr:192.168.0.12  Bcast:192.168.0.255  Mask:255.255.255.0
  inet6 addr: 2601:1c2:1002:83f0:250:b6ff:fe18:8298/64 Scope:Global
  inet6 addr: fe80::250:b6ff:fe18:8298/64 Scope:Link
  inet6 addr: 2601:1c2:1002:83f0:b0f0:71a0:6c7e:346b/64 Scope:Global
  UP BROADCAST RUNNING MULTICAST  MTU:1500  Metric:1
  RX packets:372 errors:0 dropped:0 overruns:0 frame:0
  TX packets:385 errors:0 dropped:0 overruns:0 carrier:0
  collisions:0 txqueuelen:1000
  RX bytes:38523 (38.5 KB)  TX bytes:48801 (48.8 KB)


After:
$ ifconfig eth1
eth1  Link encap:Ethernet  HWaddr 00:50:b6:18:82:98
  inet addr:192.168.0.12  Bcast:192.168.0.255  Mask:255.255.255.0
  inet6 addr: 2601:1c2:1002:83f0:250:b6ff:fe18:8298/64 Scope:Global
  inet6 addr: fe80::250:b6ff:fe18:8298/64 Scope:Link
  inet6 addr: 2601:1c2:1002:83f0:b0f0:71a0:6c7e:346b/64 Scope:Global
  UP BROADCAST RUNNING MULTICAST  MTU:1500  Metric:1
  RX packets:151005 errors:169 dropped:0 overruns:0 frame:0
  TX packets:61351 errors:0 dropped:0 overruns:0 carrier:0
  collisions:0 txqueuelen:1000
  RX bytes:225874384 (225.8 MB)  TX bytes:4431098 (4.4 MB)




> After the issue is seen, please can you show us the output of "dmesg | grep
> asix" so that we can see status messages from the ASIX driver that the USB
> to Ethernet adaptor is using. In particular we need to check that USB high
> speed operation (480Mbps) is being used and not full speed operation
> (12Mbps).


[2.766525] usbcore: registered new interface driver asix
[4.031443] asix 1-5:1.0 eth1: register 'asix' at
usb-:00:04.1-5, ASIX AX88772B USB 2.0 Ethernet, 00:50:b6:18:82:98
[   31.578983] asix 1-5:1.0 eth1: link down
[   33.244743] asix 1-5:1.0 eth1: link up, 100Mbps, full-duplex, lpa 0xCDE1
[  171.959244] asix 1-5:1.0 eth1: asix_rx_fixup() Data Header
synchronisation was lost, remaining 988
[  171.959530] asix 1-5:1.0 eth1: asix_rx_fixup() Bad Header Length
0x1651c2bf, offset 4
[  171.959768] asix 1-5:1.0 eth1: asix_rx_fixup() Bad Header Length
0xfcf61092, offset 4
[  171.960001] asix 1-5:1.0 eth1: asix_rx_fixup() Ba

Re: [PATCH] Add support for configuring Infiniband GUIDs

2016-05-06 Thread Sergei Shtylyov


Hello.

On 05/06/2016 06:43 PM, Eli Cohen wrote:


Add two NLA's that allow configuration of Infiniband node or port GUIDs
by referencing the IPoIB net device set over the physical function. The
format to be used is as follows:

ip link set dev ib0 vf 0 node_guid 00:02:c9:03:00:21:6e:70
ip link set dev ib0 vf 0 port_guid 00:02:c9:03:00:21:6e:78

Issue: 702759
Change-Id: I5ffb54d6de7bfa8650bf5818f484279914991d6e
Signed-off-by: Eli Cohen 
---
  ip/iplink.c   | 40 
  man/man8/ip-link.8.in | 12 +++-
  2 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/ip/iplink.c b/ip/iplink.c
index d2e586b6d133..3f885defdfeb 100644
--- a/ip/iplink.c
+++ b/ip/iplink.c
@@ -237,6 +237,30 @@ struct iplink_req {
charbuf[1024];
  };

+static int extract_guid(__u64 *guid, char *arg)
+{
+   __u64 ret;
+   int g[8];
+   int err;
+
+   err = sscanf(arg, "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x",
+g, g + 1, g + 2, g + 3, g + 4, g + 5, g + 6, g + 7);
+   if (err != 8)


   Strange name for a variable, if sscanf() returns # of fields read... In 
fact, you don't even need this variable.



+   return -1;
+
+   ret = ((__u64)(g[0]) << 56) |
+ ((__u64)(g[1]) << 48) |
+ ((__u64)(g[2]) << 40) |
+ ((__u64)(g[3]) << 32) |
+ ((__u64)(g[4]) << 24) |
+ ((__u64)(g[5]) << 16) |
+ ((__u64)(g[6]) << 8) |
+ ((__u64)(g[7]));
+   *guid = ret;
+
+   return 0;
+}
+
  static int iplink_parse_vf(int vf, int *argcp, char ***argvp,
   struct iplink_req *req, int dev_index)
  {

[...]

MBR, Sergei

Re: [PATCH v2] rtlwifi: pci: use dev_kfree_skb_irq instead of kfree_skb in rtl_pci_reset_trx_ring

2016-05-06 Thread Larry Finger


On 05/06/2016 11:33 AM, Wang YanQing wrote:

We can't use kfree_skb in irq disable context, because spin_lock_irqsave
make sure we are always in irq disable context, use dev_kfree_skb_irq
instead of kfree_skb is better than dev_kfree_skb_any.

This patch fix below kernel warning:
[ 7612.095528] [ cut here ]
[ 7612.095546] WARNING: CPU: 3 PID: 4460 at kernel/softirq.c:150 
__local_bh_enable_ip+0x58/0x80()
[ 7612.095550] Modules linked in: rtl8723be x86_pkg_temp_thermal btcoexist 
rtl_pci rtlwifi rtl8723_common
[ 7612.095567] CPU: 3 PID: 4460 Comm: ifconfig Tainted: GW   4.4.0+ 
#4
[ 7612.095570] Hardware name: LENOVO 20DFA04FCD/20DFA04FCD, BIOS J5ET48WW (1.19 
) 08/27/2015
[ 7612.095574]    da37fc70 c12ce7c5  da37fca0 c104cc59 
c19d4454
[ 7612.095584]  0003 116c c19d4784 0096 c10508a8 c10508a8 0200 
c1b42400
[ 7612.095594]  f29be780 da37fcb0 c104ccad 0009  da37fcbc c10508a8 
f21f08b8
[ 7612.095604] Call Trace:
[ 7612.095614]  [] dump_stack+0x41/0x5c
[ 7612.095620]  [] warn_slowpath_common+0x89/0xc0
[ 7612.095628]  [] ? __local_bh_enable_ip+0x58/0x80
[ 7612.095634]  [] ? __local_bh_enable_ip+0x58/0x80
[ 7612.095640]  [] warn_slowpath_null+0x1d/0x20
[ 7612.095646]  [] __local_bh_enable_ip+0x58/0x80
[ 7612.095653]  [] destroy_conntrack+0x64/0xa0
[ 7612.095660]  [] nf_conntrack_destroy+0xf/0x20
[ 7612.095665]  [] skb_release_head_state+0x55/0xa0
[ 7612.095670]  [] skb_release_all+0xb/0x20
[ 7612.095674]  [] __kfree_skb+0xb/0x60
[ 7612.095679]  [] kfree_skb+0x30/0x70
[ 7612.095686]  [] ? rtl_pci_reset_trx_ring+0x22d/0x370 [rtl_pci]
[ 7612.095692]  [] rtl_pci_reset_trx_ring+0x22d/0x370 [rtl_pci]
[ 7612.095698]  [] rtl_pci_start+0x19/0x190 [rtl_pci]
[ 7612.095705]  [] rtl_op_start+0x56/0x90 [rtlwifi]
[ 7612.095712]  [] drv_start+0x36/0xc0
[ 7612.095717]  [] ieee80211_do_open+0x2d3/0x890
[ 7612.095725]  [] ? call_netdevice_notifiers_info+0x2e/0x60
[ 7612.095730]  [] ieee80211_open+0x4d/0x50
[ 7612.095736]  [] __dev_open+0xa3/0x130
[ 7612.095742]  [] ? _raw_spin_unlock_bh+0x13/0x20
[ 7612.095748]  [] __dev_change_flags+0x89/0x140
[ 7612.095753]  [] ? selinux_capable+0xd/0x10
[ 7612.095759]  [] dev_change_flags+0x29/0x60
[ 7612.095765]  [] devinet_ioctl+0x553/0x670
[ 7612.095772]  [] ? _copy_to_user+0x28/0x40
[ 7612.095777]  [] inet_ioctl+0x85/0xb0
[ 7612.095783]  [] sock_ioctl+0x67/0x260
[ 7612.095788]  [] ? sock_fasync+0x80/0x80
[ 7612.095795]  [] do_vfs_ioctl+0x6b/0x550
[ 7612.095800]  [] ? selinux_file_ioctl+0x102/0x1e0
[ 7612.095807]  [] ? timekeeping_suspend+0x294/0x320
[ 7612.095813]  [] ? __hrtimer_run_queues+0x14a/0x210
[ 7612.095820]  [] ? security_file_ioctl+0x34/0x50
[ 7612.095827]  [] SyS_ioctl+0x70/0x80
[ 7612.095832]  [] do_fast_syscall_32+0x84/0x120
[ 7612.095839]  [] sysenter_past_esp+0x36/0x55
[ 7612.095844] ---[ end trace 97e9c637a20e8348 ]---

Signed-off-by: Wang YanQing 
Cc: Stable 
---
  Changes:
  v1-v2:
  1: add a Cc to stable.

  drivers/net/wireless/realtek/rtlwifi/pci.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/realtek/rtlwifi/pci.c 
b/drivers/net/wireless/realtek/rtlwifi/pci.c
index 1ac41b8..99a3a03 100644
--- a/drivers/net/wireless/realtek/rtlwifi/pci.c
+++ b/drivers/net/wireless/realtek/rtlwifi/pci.c
@@ -1572,7 +1572,7 @@ int rtl_pci_reset_trx_ring(struct ieee80211_hw *hw)
 true,
 HW_DESC_TXBUFF_ADDR),
 skb->len, PCI_DMA_TODEVICE);
-   kfree_skb(skb);
+   dev_kfree_skb_irq(skb);
ring->idx = (ring->idx + 1) % ring->entries;
}
ring->idx = 0;


Acked-by: Larry Finger 

Thanks,

Larry

Re: [PATCH v9 net-next 1/2] hv_sock: introduce Hyper-V Sockets

2016-05-06 Thread David Miller

From: Dexuan Cui 
Date: Wed,  4 May 2016 09:56:57 -0700

> +#define VMBUS_RINGBUFFER_SIZE_HVSOCK_RECV (5 * PAGE_SIZE)
> +#define VMBUS_RINGBUFFER_SIZE_HVSOCK_SEND (5 * PAGE_SIZE)
> +
> +#define HVSOCK_RCV_BUF_SZ VMBUS_RINGBUFFER_SIZE_HVSOCK_RECV
 ...
> +struct hvsock_sock {
 ...
> + /* The 'hdr' and 'buf' in the below 'send' and 'recv' definitions must
> +  * be consecutive: see hvsock_send_data() and hvsock_recv_data().
> +  */
> + struct {
> + struct vmpipe_proto_header hdr;
> + u8 buf[HVSOCK_SND_BUF_SZ];
> + } send;
> +
> + struct {
> + struct vmpipe_proto_header hdr;
> + u8 buf[HVSOCK_RCV_BUF_SZ];
> +
> + unsigned int data_len;
> + unsigned int data_offset;
> + } recv;

I don't think allocating 5 pages of unswappable memory for every Hyper-V socket
created is reasonable.

Re: [PATCH net v3] vlan: Propagate MAC address to VLANs

2016-05-06 Thread Alexander Duyck

On Fri, May 6, 2016 at 6:26 AM, Mike Manning  wrote:
> The MAC address of the physical interface is only copied to the VLAN
> when it is first created, resulting in an inconsistency after MAC
> address changes of only newly created VLANs having an up-to-date MAC.
>
> The VLANs should continue inheriting the MAC address of the physical
> interface, unless explicitly changed to be different from this.
> This allows IPv6 EUI64 addresses for the VLAN to reflect any changes
> to the MAC of the physical interface and thus for DAD to behave as
> expected.
>
> Signed-off-by: Mike Manning 
> ---
>  include/linux/if_vlan.h |2 ++
>  net/8021q/vlan.c|   17 +++--
>  net/8021q/vlan_dev.c|   13 ++---
>  3 files changed, 23 insertions(+), 9 deletions(-)
>
> --- a/include/linux/if_vlan.h
> +++ b/include/linux/if_vlan.h
> @@ -138,6 +138,7 @@ struct netpoll;
>   * @flags: device flags
>   * @real_dev: underlying netdevice
>   * @real_dev_addr: address of underlying netdevice
> + * @addr_assign_type: address assignment type
>   * @dent: proc dir entry
>   * @vlan_pcpu_stats: ptr to percpu rx stats
>   */
> @@ -153,6 +154,7 @@ struct vlan_dev_priv {
>
> struct net_device   *real_dev;
> unsigned char   real_dev_addr[ETH_ALEN];
> +   unsigned char   addr_assign_type;
>
> struct proc_dir_entry   *dent;
> struct vlan_pcpu_stats __percpu *vlan_pcpu_stats;

Please don't start adding new members to structures when it already
exists in the net_device.  If anything you should be able to drop
real_dev_addr if you do this correctly because you shouldn't need to
clone the lower dev address to watch for changes.  All you will need
to do is watch NET_ADDR_STOLEN.

> --- a/net/8021q/vlan.c
> +++ b/net/8021q/vlan.c
> @@ -291,6 +291,15 @@ static void vlan_sync_address(struct net
> if (ether_addr_equal(vlan->real_dev_addr, dev->dev_addr))
> return;
>
> +   /* vlan continues to inherit address of parent interface */
> +   if (vlan->addr_assign_type == NET_ADDR_STOLEN) {
> +   ether_addr_copy(vlandev->dev_addr, dev->dev_addr);
> +   goto out;
> +   }
> +
> +   if (!(vlandev->flags & IFF_UP))
> +   goto out;
> +
> /* vlan address was different from the old address and is equal to
>  * the new address */
> if (!ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) &&
> @@ -303,6 +312,7 @@ static void vlan_sync_address(struct net
> !ether_addr_equal(vlandev->dev_addr, dev->dev_addr))
> dev_uc_add(dev, vlandev->dev_addr);
>
> +out:
> ether_addr_copy(vlan->real_dev_addr, dev->dev_addr);
>  }
>
> @@ -389,13 +399,8 @@ static int vlan_device_event(struct noti
>
> case NETDEV_CHANGEADDR:
> /* Adjust unicast filters on underlying device */
> -   vlan_group_for_each_dev(grp, i, vlandev) {
> -   flgs = vlandev->flags;
> -   if (!(flgs & IFF_UP))
> -   continue;
> -
> +   vlan_group_for_each_dev(grp, i, vlandev)
> vlan_sync_address(dev, vlandev);
> -   }
> break;
>
> case NETDEV_CHANGEMTU:

So all of this is far more complicated than it needs to be.  If
NET_ADDR_STOLEN is set you have to follow the lower device MAC
address, otherwise you maintain your own address and have to hold a
reference to it on the lower device.

You should also be able to maintain the current logic of not updating
a down interface on an address change.  You don't need to update a
stolen MAC address until the open routine is called for the interface.

> --- a/net/8021q/vlan_dev.c
> +++ b/net/8021q/vlan_dev.c
> @@ -315,17 +315,21 @@ static int vlan_dev_stop(struct net_devi
>
>  static int vlan_dev_set_mac_address(struct net_device *dev, void *p)
>  {
> -   struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
> +   struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
> +   struct net_device *real_dev = vlan->real_dev;
> struct sockaddr *addr = p;
> +   bool is_real_addr;
> int err;
>
> if (!is_valid_ether_addr(addr->sa_data))
> return -EADDRNOTAVAIL;
>
> +   is_real_addr = ether_addr_equal(addr->sa_data, real_dev->dev_addr);
> +
> if (!(dev->flags & IFF_UP))
> goto out;
>
> -   if (!ether_addr_equal(addr->sa_data, real_dev->dev_addr)) {
> +   if (!is_real_addr) {
> err = dev_uc_add(real_dev, addr->sa_data);
> if (err < 0)
> return err;
> @@ -336,6 +340,7 @@ static int vlan_dev_set_mac_address(stru
>
>  out:
> ether_addr_copy(dev->dev_addr, addr->sa_data);
> +   vlan->addr_assign_type = is_real_addr ? NET_ADDR_STOLEN : 
> NET_ADDR_SET;

Re: [PATCHv2 net] bridge: fix igmp / mld query parsing

2016-05-06 Thread David Miller

From: Linus Lüssing 
Date: Wed,  4 May 2016 17:25:02 +0200

> With the newly introduced helper functions the skb pulling is hidden
> in the checksumming function - and undone before returning to the
> caller.
> 
> The IGMP and MLD query parsing functions in the bridge still
> assumed that the skb is pointing to the beginning of the IGMP/MLD
> message while it is now kept at the beginning of the IPv4/6 header.
> 
> If there is a querier somewhere else, then this either causes
> the multicast snooping to stay disabled even though it could be
> enabled. Or, if we have the querier enabled too, then this can
> create unnecessary IGMP / MLD query messages on the link.
> 
> Fixing this by taking the offset between IP and IGMP/MLD header into
> account, too.
> 
> Fixes: 9afd85c9e455 ("net: Export IGMP/MLD message validation code")
> Reported-by: Simon Wunderlich 
> Signed-off-by: Linus Lüssing 

Applied and queued up for -stable, thanks.

Re: [REGRESSION] asix: Lots of asix_rx_fixup() errors and slow transmissions

2016-05-06 Thread Dean Jenkins


On 06/05/16 16:27, Andrew Lunn wrote:

In other words, the full-speed hub is restricting the USB to
Ethernet Adaptor to a 12Mbps (half-duplex) bandwidth to support
Ethernet 100Mbps (full-duplex) traffic. That is not going to work
very well because Ethernet frames (perhaps partial Ethernet frames)
need to be discarded within the USB link.

If that really is true, the design is broken. I would expect the
adaptor to reliably transfer whole frames over USB, and drop whole
frames from its receive queue when the USB is congested. TCP is also
going to see the USB bottleneck as just like any bottleneck in the
network and back off. So TCP streams should not cause major congestion
on the USB link.
The host's USB host controller polls the USB to Ethernet adaptor for 
more data. The USB to Ethernet adaptor cannot predict when the next poll 
request comes. The AX88772B can span Ethernet frames across multiple 
poll requests. This means it is possible to get a partial Ethernet frame 
received in the USB host controller on one poll and it is assumed that 
the next poll (sometime in the near future) will get the remaining part 
of the Ethernet frame.


However, the USB to Ethernet adaptor does not contain an infinitely 
sized RX Ethernet buffer for the incoming Ethernet frames. I believe the 
USB to Ethernet adaptor is just a pipe and does not directly implement 
flow control for Ethernet frames so the RX buffer is going to overflow 
causing loss of whole Ethernet frames. I suspect the IP stack in the 
host computer implements flow control for Ethernet frames.


Because the AX88772B can span Ethernet frames across multiple poll 
requests there is a risk that the designers of the device could have 
implemented a solution to discard the remaining part of the Ethernet 
frame before the next poll arrives due to the RX buffer overflowing. I 
don't know the algorithm used in the AX88772B but there will be loss of 
data due to the mismatch in bandwidths. I agree that dropping whole 
Ethernet frames would be preferable to dropping partial Ethernet frames 
which would corrupt the data stream.


My suspicion is that the URB buffers contain discontinuities in the 
data stream because of lost data due to insufficient bandwidth on the 
USB link.



Going over a 12Mbps USB link should be no different
to hitting an old Ethernet hub which can only do 10/Half.
Not exactly, because USB is a transport link which is agnostic to the 
type of data that is flowing. It is up to the layers above USB to manage 
the data content.


In other words, the USB speed needs to be higher than the Ethernet speed 
to avoid mismatches in bandwidth.

Therefore please retest with a working high-speed USB hub or remove
the full-speed USB hub from the test environment and directly
connect the USB to Ethernet Adaptor to the root hub of the USB port.
Then repeat the tests to see whether anything improved.

In other words, you need to eliminate the dmesg messages saying "not
running at top speed; connect to a high speed hub".

I would also suggest testing with the Ethernet at 10/half. You should
be able to use Ethtool to set that up. Your USB and Ethernet bandwidth
become more equal. If you still see errors, it suggests a protocol
implementation error somewhere.
I agree with the suggestion but I hope USB high speed (480Mbps) 
operation was the intended environment rather than the useless USB full 
speed (12Mbps) operation.


Let's hope that not using the USB hub improves things.

Regards,
Dean


 Andrew


--
Dean Jenkins
Embedded Software Engineer
Linux Transportation Solutions
Mentor Embedded Software Division
Mentor Graphics (UK) Ltd.

Re: [PATCH] Documentation/networking: more accurate LCO explanation

2016-05-06 Thread David Miller

From: Alexander Duyck 
Date: Fri, 6 May 2016 09:29:56 -0700

> I don't really see the point of using an underscore before and after
> that statement.  If it was only one or two words it might work for
> emphasis but the statement is large enough that starting it with an
> underscore just makes it harder to read.

Agreed.

[PATCH v2] rtlwifi: pci: use dev_kfree_skb_irq instead of kfree_skb in rtl_pci_reset_trx_ring

2016-05-06 Thread Wang YanQing

We can't use kfree_skb in irq disable context, because spin_lock_irqsave
make sure we are always in irq disable context, use dev_kfree_skb_irq
instead of kfree_skb is better than dev_kfree_skb_any.

This patch fix below kernel warning:
[ 7612.095528] [ cut here ]
[ 7612.095546] WARNING: CPU: 3 PID: 4460 at kernel/softirq.c:150 
__local_bh_enable_ip+0x58/0x80()
[ 7612.095550] Modules linked in: rtl8723be x86_pkg_temp_thermal btcoexist 
rtl_pci rtlwifi rtl8723_common
[ 7612.095567] CPU: 3 PID: 4460 Comm: ifconfig Tainted: GW   4.4.0+ 
#4
[ 7612.095570] Hardware name: LENOVO 20DFA04FCD/20DFA04FCD, BIOS J5ET48WW (1.19 
) 08/27/2015
[ 7612.095574]    da37fc70 c12ce7c5  da37fca0 c104cc59 
c19d4454
[ 7612.095584]  0003 116c c19d4784 0096 c10508a8 c10508a8 0200 
c1b42400
[ 7612.095594]  f29be780 da37fcb0 c104ccad 0009  da37fcbc c10508a8 
f21f08b8
[ 7612.095604] Call Trace:
[ 7612.095614]  [] dump_stack+0x41/0x5c
[ 7612.095620]  [] warn_slowpath_common+0x89/0xc0
[ 7612.095628]  [] ? __local_bh_enable_ip+0x58/0x80
[ 7612.095634]  [] ? __local_bh_enable_ip+0x58/0x80
[ 7612.095640]  [] warn_slowpath_null+0x1d/0x20
[ 7612.095646]  [] __local_bh_enable_ip+0x58/0x80
[ 7612.095653]  [] destroy_conntrack+0x64/0xa0
[ 7612.095660]  [] nf_conntrack_destroy+0xf/0x20
[ 7612.095665]  [] skb_release_head_state+0x55/0xa0
[ 7612.095670]  [] skb_release_all+0xb/0x20
[ 7612.095674]  [] __kfree_skb+0xb/0x60
[ 7612.095679]  [] kfree_skb+0x30/0x70
[ 7612.095686]  [] ? rtl_pci_reset_trx_ring+0x22d/0x370 [rtl_pci]
[ 7612.095692]  [] rtl_pci_reset_trx_ring+0x22d/0x370 [rtl_pci]
[ 7612.095698]  [] rtl_pci_start+0x19/0x190 [rtl_pci]
[ 7612.095705]  [] rtl_op_start+0x56/0x90 [rtlwifi]
[ 7612.095712]  [] drv_start+0x36/0xc0
[ 7612.095717]  [] ieee80211_do_open+0x2d3/0x890
[ 7612.095725]  [] ? call_netdevice_notifiers_info+0x2e/0x60
[ 7612.095730]  [] ieee80211_open+0x4d/0x50
[ 7612.095736]  [] __dev_open+0xa3/0x130
[ 7612.095742]  [] ? _raw_spin_unlock_bh+0x13/0x20
[ 7612.095748]  [] __dev_change_flags+0x89/0x140
[ 7612.095753]  [] ? selinux_capable+0xd/0x10
[ 7612.095759]  [] dev_change_flags+0x29/0x60
[ 7612.095765]  [] devinet_ioctl+0x553/0x670
[ 7612.095772]  [] ? _copy_to_user+0x28/0x40
[ 7612.095777]  [] inet_ioctl+0x85/0xb0
[ 7612.095783]  [] sock_ioctl+0x67/0x260
[ 7612.095788]  [] ? sock_fasync+0x80/0x80
[ 7612.095795]  [] do_vfs_ioctl+0x6b/0x550
[ 7612.095800]  [] ? selinux_file_ioctl+0x102/0x1e0
[ 7612.095807]  [] ? timekeeping_suspend+0x294/0x320
[ 7612.095813]  [] ? __hrtimer_run_queues+0x14a/0x210
[ 7612.095820]  [] ? security_file_ioctl+0x34/0x50
[ 7612.095827]  [] SyS_ioctl+0x70/0x80
[ 7612.095832]  [] do_fast_syscall_32+0x84/0x120
[ 7612.095839]  [] sysenter_past_esp+0x36/0x55
[ 7612.095844] ---[ end trace 97e9c637a20e8348 ]---

Signed-off-by: Wang YanQing 
Cc: Stable 
---
 Changes:
 v1-v2:
 1: add a Cc to stable.

 drivers/net/wireless/realtek/rtlwifi/pci.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/realtek/rtlwifi/pci.c 
b/drivers/net/wireless/realtek/rtlwifi/pci.c
index 1ac41b8..99a3a03 100644
--- a/drivers/net/wireless/realtek/rtlwifi/pci.c
+++ b/drivers/net/wireless/realtek/rtlwifi/pci.c
@@ -1572,7 +1572,7 @@ int rtl_pci_reset_trx_ring(struct ieee80211_hw *hw)
 true,
 HW_DESC_TXBUFF_ADDR),
 skb->len, PCI_DMA_TODEVICE);
-   kfree_skb(skb);
+   dev_kfree_skb_irq(skb);
ring->idx = (ring->idx + 1) % ring->entries;
}
ring->idx = 0;
-- 
1.8.5.6.2.g3d8a54e.dirty

[PATCH net-next] ipv4: tcp: ip_send_unicast_reply() is not BH safe

2016-05-06 Thread Eric Dumazet

From: Eric Dumazet 

I forgot that ip_send_unicast_reply() is not BH safe (yet).

Disabling preemption before calling it was not a good move.

Fixes: c10d9310edf5 ("tcp: do not assume TCP code is non preemptible")
Signed-off-by: Eric Dumazet 
Reported-by: Andres Lagar-Cavilla  
---
 net/ipv4/tcp_ipv4.c |8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 
a7ab9472d64560d86ea24ac1b6e1a7800f89989d..8219d0d8dc8370d0d3e6fc4cd17b4925617968ab
 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -692,7 +692,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct 
sk_buff *skb)
 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
 
arg.tos = ip_hdr(skb)->tos;
-   preempt_disable();
+   local_bh_disable();
ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
  skb, &TCP_SKB_CB(skb)->header.h4.opt,
  ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
@@ -700,7 +700,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct 
sk_buff *skb)
 
__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
-   preempt_enable();
+   local_bh_enable();
 
 #ifdef CONFIG_TCP_MD5SIG
 out:
@@ -776,14 +776,14 @@ static void tcp_v4_send_ack(struct net *net,
if (oif)
arg.bound_dev_if = oif;
arg.tos = tos;
-   preempt_disable();
+   local_bh_disable();
ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
  skb, &TCP_SKB_CB(skb)->header.h4.opt,
  ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
  &arg, arg.iov[0].iov_len);
 
__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
-   preempt_enable();
+   local_bh_enable();
 }
 
 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)

Re: [PATCH] Documentation/networking: more accurate LCO explanation

2016-05-06 Thread Alexander Duyck

On Fri, May 6, 2016 at 8:57 AM, Shmulik Ladkani
 wrote:
> In few places the term "ones-complement sum" was used but the actual
> meaning is "the complement of the ones-complement sum".

Looks like there might still be a few minor corrections needed.
Comments inline below.

>
> Signed-off-by: Shmulik Ladkani 
> ---
>
>  I assume readers interpret the term "ones-complement sum" as the sum
>  using one's complement arithmetic, without the final bitwise
>  complement of sum's result.
>  Hence I added "the complement of" where applicable.
>
>  Documentation/networking/checksum-offloads.txt | 10 +-
>  1 file changed, 5 insertions(+), 5 deletions(-)
>
> diff --git a/Documentation/networking/checksum-offloads.txt 
> b/Documentation/networking/checksum-offloads.txt
> index de2a327766..9567200e1f 100644
> --- a/Documentation/networking/checksum-offloads.txt
> +++ b/Documentation/networking/checksum-offloads.txt
> @@ -69,17 +69,17 @@ LCO: Local Checksum Offload
>  LCO is a technique for efficiently computing the outer checksum of an
>   encapsulated datagram when the inner checksum is due to be offloaded.
>  The ones-complement sum of a correctly checksummed TCP or UDP packet is
> - equal to the sum of the pseudo header, because everything else gets
> - 'cancelled out' by the checksum field.  This is because the sum was
> + equal to the complement of the sum of the pseudo header, because everything
> + else gets 'cancelled out' by the checksum field.  This is because the sum 
> was
>   complemented before being written to the checksum field.
>  More generally, this holds in any case where the 'IP-style' ones complement
>   checksum is used, and thus any checksum that TX Checksum Offload supports.
>  That is, if we have set up TX Checksum Offload with a start/offset pair, we
>   know that _after the device has filled in that checksum_, the ones
>   complement sum from csum_start to the end of the packet will be equal to
> - _whatever value we put in the checksum field beforehand_.  This allows us
> - to compute the outer checksum without looking at the payload: we simply
> - stop summing when we get to csum_start, then add the 16-bit word at
> + the complement of _whatever value we put in the checksum field beforehand_.

I don't really see the point of using an underscore before and after
that statement.  If it was only one or two words it might work for
emphasis but the statement is large enough that starting it with an
underscore just makes it harder to read.

> + This allows us to compute the outer checksum without looking at the payload:
> + we simply stop summing when we get to csum_start, then add the 16-bit word 
> at
>   (csum_start + csum_offset).

You don't add the 16-bit word; you add the complement of the 16-bit word.

RE: [Intel-wired-lan] NULL dereference on v4.1.x while enabling VF

2016-05-06 Thread Skidmore, Donald C

Hey William,

My validation hasn't been able to recreate the dereference on v4.1.x, v4.5.x or 
net_next.  Where exactly did you place the two line script in your rc scripts?  
Our validation was able to run it as soon as ~14 seconds after the first boot 
message logged in dmesg.  Is this anywhere close to where you were executing 
it?  Likewise he has attempted running at both run level 3 and 5 in case that 
changed any of the times of how soon the rc scripts are executed.  So the more 
detail you can give of setup the more it might help us recreate what you are 
seeing.

Thanks,
-Don Skidmore 

> -Original Message-
> From: William Dauchy [mailto:wdau...@gmail.com]
> Sent: Tuesday, May 03, 2016 5:33 AM
> To: Skidmore, Donald C 
> Cc: NETDEV ; intel-wired-...@lists.osuosl.org;
> Alex Duyck 
> Subject: Re: [Intel-wired-lan] NULL dereference on v4.1.x while enabling VF
> 
> Hello Don,
> 
> Thank you for your reply.
> 
> On Mon, May 2, 2016 at 11:33 PM, Skidmore, Donald C
>  wrote:
> > Thanks for reporting the dereference.  Could you provide a little more
> detail on how you created this issue?  Are you just running the two
> commands (ip, sriov_numvfs) in some rc script and if you put a few second
> sleep in front of it you don't see the failure?
> 
> Your understanding is correct; a rc script is run with ip and echo in numvfs
> commands. I tried to reduce it to the minimum. If I put a sleep
> 20 in front of it, it does not crash. I also forgot to add I did not have the
> issue in 3.14.x with the same script.
> 
> Best,
> --
> William

Re: [PATCH] Documentation/networking: more accurate LCO explanation

2016-05-06 Thread Edward Cree

On 06/05/16 16:57, Shmulik Ladkani wrote:
> In few places the term "ones-complement sum" was used but the actual
> meaning is "the complement of the ones-complement sum".
>
> Signed-off-by: Shmulik Ladkani 
Acked-by: Edward Cree

Re: [PATCH] rtlwifi: pci: use dev_kfree_skb_irq instead of kfree_skb in rtl_pci_reset_trx_ring

2016-05-06 Thread Larry Finger


On 05/05/2016 12:19 PM, Wang YanQing wrote:

We can't use kfree_skb in irq-disabled context. Because spin_lock_irqsave
makes sure we are always in irq-disabled context, using dev_kfree_skb_irq
instead of kfree_skb is better than using dev_kfree_skb_any.

This patch fixes the kernel warning below:
[ 7612.095528] [ cut here ]
[ 7612.095546] WARNING: CPU: 3 PID: 4460 at kernel/softirq.c:150 
__local_bh_enable_ip+0x58/0x80()
[ 7612.095550] Modules linked in: rtl8723be x86_pkg_temp_thermal btcoexist 
rtl_pci rtlwifi rtl8723_common
[ 7612.095567] CPU: 3 PID: 4460 Comm: ifconfig Tainted: GW   4.4.0+ 
#4
[ 7612.095570] Hardware name: LENOVO 20DFA04FCD/20DFA04FCD, BIOS J5ET48WW (1.19 
) 08/27/2015
[ 7612.095574]    da37fc70 c12ce7c5  da37fca0 c104cc59 
c19d4454
[ 7612.095584]  0003 116c c19d4784 0096 c10508a8 c10508a8 0200 
c1b42400
[ 7612.095594]  f29be780 da37fcb0 c104ccad 0009  da37fcbc c10508a8 
f21f08b8
[ 7612.095604] Call Trace:
[ 7612.095614]  [] dump_stack+0x41/0x5c
[ 7612.095620]  [] warn_slowpath_common+0x89/0xc0
[ 7612.095628]  [] ? __local_bh_enable_ip+0x58/0x80
[ 7612.095634]  [] ? __local_bh_enable_ip+0x58/0x80
[ 7612.095640]  [] warn_slowpath_null+0x1d/0x20
[ 7612.095646]  [] __local_bh_enable_ip+0x58/0x80
[ 7612.095653]  [] destroy_conntrack+0x64/0xa0
[ 7612.095660]  [] nf_conntrack_destroy+0xf/0x20
[ 7612.095665]  [] skb_release_head_state+0x55/0xa0
[ 7612.095670]  [] skb_release_all+0xb/0x20
[ 7612.095674]  [] __kfree_skb+0xb/0x60
[ 7612.095679]  [] kfree_skb+0x30/0x70
[ 7612.095686]  [] ? rtl_pci_reset_trx_ring+0x22d/0x370 [rtl_pci]
[ 7612.095692]  [] rtl_pci_reset_trx_ring+0x22d/0x370 [rtl_pci]
[ 7612.095698]  [] rtl_pci_start+0x19/0x190 [rtl_pci]
[ 7612.095705]  [] rtl_op_start+0x56/0x90 [rtlwifi]
[ 7612.095712]  [] drv_start+0x36/0xc0
[ 7612.095717]  [] ieee80211_do_open+0x2d3/0x890
[ 7612.095725]  [] ? call_netdevice_notifiers_info+0x2e/0x60
[ 7612.095730]  [] ieee80211_open+0x4d/0x50
[ 7612.095736]  [] __dev_open+0xa3/0x130
[ 7612.095742]  [] ? _raw_spin_unlock_bh+0x13/0x20
[ 7612.095748]  [] __dev_change_flags+0x89/0x140
[ 7612.095753]  [] ? selinux_capable+0xd/0x10
[ 7612.095759]  [] dev_change_flags+0x29/0x60
[ 7612.095765]  [] devinet_ioctl+0x553/0x670
[ 7612.095772]  [] ? _copy_to_user+0x28/0x40
[ 7612.095777]  [] inet_ioctl+0x85/0xb0
[ 7612.095783]  [] sock_ioctl+0x67/0x260
[ 7612.095788]  [] ? sock_fasync+0x80/0x80
[ 7612.095795]  [] do_vfs_ioctl+0x6b/0x550
[ 7612.095800]  [] ? selinux_file_ioctl+0x102/0x1e0
[ 7612.095807]  [] ? timekeeping_suspend+0x294/0x320
[ 7612.095813]  [] ? __hrtimer_run_queues+0x14a/0x210
[ 7612.095820]  [] ? security_file_ioctl+0x34/0x50
[ 7612.095827]  [] SyS_ioctl+0x70/0x80
[ 7612.095832]  [] do_fast_syscall_32+0x84/0x120
[ 7612.095839]  [] sysenter_past_esp+0x36/0x55
[ 7612.095844] ---[ end trace 97e9c637a20e8348 ]---

Signed-off-by: Wang YanQing 
---
  drivers/net/wireless/realtek/rtlwifi/pci.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/wireless/realtek/rtlwifi/pci.c 
b/drivers/net/wireless/realtek/rtlwifi/pci.c
index 1ac41b8..99a3a03 100644
--- a/drivers/net/wireless/realtek/rtlwifi/pci.c
+++ b/drivers/net/wireless/realtek/rtlwifi/pci.c
@@ -1572,7 +1572,7 @@ int rtl_pci_reset_trx_ring(struct ieee80211_hw *hw)
 true,
 HW_DESC_TXBUFF_ADDR),
 skb->len, PCI_DMA_TODEVICE);
-   kfree_skb(skb);
+   dev_kfree_skb_irq(skb);
ring->idx = (ring->idx + 1) % ring->entries;
}
ring->idx = 0;


After testing, this patch is OK other than needing a Cc to stable. Please fix 
that and resubmit V2.


Larry

[PATCH] Documentation/networking: more accurate LCO explanation

2016-05-06 Thread Shmulik Ladkani

In few places the term "ones-complement sum" was used but the actual
meaning is "the complement of the ones-complement sum".

Signed-off-by: Shmulik Ladkani 
---

 I assume readers interpret the term "ones-complement sum" as the sum
 using one's complement arithmetic, without the final bitwise
 complement of sum's result.
 Hence I added "the complement of" where applicable.

 Documentation/networking/checksum-offloads.txt | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/Documentation/networking/checksum-offloads.txt 
b/Documentation/networking/checksum-offloads.txt
index de2a327766..9567200e1f 100644
--- a/Documentation/networking/checksum-offloads.txt
+++ b/Documentation/networking/checksum-offloads.txt
@@ -69,17 +69,17 @@ LCO: Local Checksum Offload
 LCO is a technique for efficiently computing the outer checksum of an
  encapsulated datagram when the inner checksum is due to be offloaded.
 The ones-complement sum of a correctly checksummed TCP or UDP packet is
- equal to the sum of the pseudo header, because everything else gets
- 'cancelled out' by the checksum field.  This is because the sum was
+ equal to the complement of the sum of the pseudo header, because everything
+ else gets 'cancelled out' by the checksum field.  This is because the sum was
  complemented before being written to the checksum field.
 More generally, this holds in any case where the 'IP-style' ones complement
  checksum is used, and thus any checksum that TX Checksum Offload supports.
 That is, if we have set up TX Checksum Offload with a start/offset pair, we
  know that _after the device has filled in that checksum_, the ones
  complement sum from csum_start to the end of the packet will be equal to
- _whatever value we put in the checksum field beforehand_.  This allows us
- to compute the outer checksum without looking at the payload: we simply
- stop summing when we get to csum_start, then add the 16-bit word at
+ the complement of _whatever value we put in the checksum field beforehand_.
+ This allows us to compute the outer checksum without looking at the payload:
+ we simply stop summing when we get to csum_start, then add the 16-bit word at
  (csum_start + csum_offset).
 Then, when the true inner checksum is filled in (either by hardware or by
  skb_checksum_help()), the outer checksum will become correct by virtue of
-- 
2.7.4

[PATCH net-next] fq_codel: add memory limitation per queue

2016-05-06 Thread Eric Dumazet

From: Eric Dumazet 

On small embedded routers, one wants to control maximal amount of
memory used by fq_codel, instead of controlling number of packets or
bytes, since GRO/TSO make these not practical.

Assuming skb->truesize is accurate, we have to keep track of
skb->truesize sum for skbs in queue.

This patch adds a new TCA_FQ_CODEL_MEMORY_LIMIT attribute.

I chose a default value of 32 MBytes, which looks reasonable even
for heavy duty usages. (Prior fq_codel users should not be hurt
when they upgrade their kernels)

Two fields are added to tc_fq_codel_qd_stats to report :
 - Current memory usage
 - Number of drops caused by memory limits

# tc qd replace dev eth1 root est 1sec 4sec fq_codel memory_limit 4M
..
# tc -s -d qd sh dev eth1
qdisc fq_codel 8008: root refcnt 257 limit 10240p flows 1024
 quantum 1514 target 5.0ms interval 100.0ms memory_limit 4Mb ecn 
 Sent 2083566791363 bytes 1376214889 pkt (dropped 4994406, overlimits 0
requeues 21705223) 
 rate 9841Mbit 812549pps backlog 3906120b 376p requeues 21705223 
  maxpacket 68130 drop_overlimit 4994406 new_flow_count 28855414
  ecn_mark 0 memory_used 4190048 drop_overmemory 4994406
  new_flows_len 1 old_flows_len 177


Signed-off-by: Eric Dumazet 
Cc: Jesper Dangaard Brouer 
Cc: Dave Täht 
Cc: Sebastian Möller 
---
 include/uapi/linux/pkt_sched.h |3 +++
 net/sched/sch_fq_codel.c   |   27 ---
 2 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index a11afecd4482..2382eed50278 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -719,6 +719,7 @@ enum {
TCA_FQ_CODEL_QUANTUM,
TCA_FQ_CODEL_CE_THRESHOLD,
TCA_FQ_CODEL_DROP_BATCH_SIZE,
+   TCA_FQ_CODEL_MEMORY_LIMIT,
__TCA_FQ_CODEL_MAX
 };
 
@@ -743,6 +744,8 @@ struct tc_fq_codel_qd_stats {
__u32   new_flows_len;  /* count of flows in new list */
__u32   old_flows_len;  /* count of flows in old list */
__u32   ce_mark;/* packets above ce_threshold */
+   __u32   memory_usage;   /* in bytes */
+   __u32   drop_overmemory;
 };
 
 struct tc_fq_codel_cl_stats {
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index e7b42b0d5145..bb8bd9314629 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -60,8 +60,11 @@ struct fq_codel_sched_data {
u32 perturbation;   /* hash perturbation */
u32 quantum;/* psched_mtu(qdisc_dev(sch)); */
u32 drop_batch_size;
+   u32 memory_limit;
struct codel_params cparams;
struct codel_stats cstats;
+   u32 memory_usage;
+   u32 drop_overmemory;
u32 drop_overlimit;
u32 new_flow_count;
 
@@ -143,6 +146,7 @@ static unsigned int fq_codel_drop(struct Qdisc *sch, 
unsigned int max_packets)
unsigned int maxbacklog = 0, idx = 0, i, len;
struct fq_codel_flow *flow;
unsigned int threshold;
+   unsigned int mem = 0;
 
/* Queue is full! Find the fat flow and drop packet(s) from it.
 * This might sound expensive, but with 1024 flows, we scan
@@ -167,11 +171,13 @@ static unsigned int fq_codel_drop(struct Qdisc *sch, 
unsigned int max_packets)
do {
skb = dequeue_head(flow);
len += qdisc_pkt_len(skb);
+   mem += skb->truesize;
kfree_skb(skb);
} while (++i < max_packets && len < threshold);
 
flow->dropped += i;
q->backlogs[idx] -= len;
+   q->memory_usage -= mem;
sch->qstats.drops += i;
sch->qstats.backlog -= len;
sch->q.qlen -= i;
@@ -193,6 +199,7 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct 
Qdisc *sch)
unsigned int idx, prev_backlog, prev_qlen;
struct fq_codel_flow *flow;
int uninitialized_var(ret);
+   bool memory_limited;
 
idx = fq_codel_classify(skb, sch, &ret);
if (idx == 0) {
@@ -215,7 +222,9 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct 
Qdisc *sch)
flow->deficit = q->quantum;
flow->dropped = 0;
}
-   if (++sch->q.qlen <= sch->limit)
+   q->memory_usage += skb->truesize;
+   memory_limited = q->memory_usage > q->memory_limit;
+   if (++sch->q.qlen <= sch->limit && !memory_limited)
return NET_XMIT_SUCCESS;
 
prev_backlog = sch->qstats.backlog;
@@ -229,7 +238,8 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct 
Qdisc *sch)
ret = fq_codel_drop(sch, q->drop_batch_size);
 
q->drop_overlimit += prev_qlen - sch->q.qlen;
-
+   if (memory_limited)
+   q->drop_overmemory += prev_qlen - sch->q.qlen;
/* As we dropped packet(s), better let upper stack know this */
qdisc_tree_reduce_backlog(sch, prev_qlen - sch->q.qlen,

[PATCH] Add support for configuring Infiniband GUIDs

2016-05-06 Thread Eli Cohen

Add two NLA's that allow configuration of Infiniband node or port GUIDs
by referencing the IPoIB net device set over the physical function. The
format to be used is as follows:

ip link set dev ib0 vf 0 node_guid 00:02:c9:03:00:21:6e:70
ip link set dev ib0 vf 0 port_guid 00:02:c9:03:00:21:6e:78

Issue: 702759
Change-Id: I5ffb54d6de7bfa8650bf5818f484279914991d6e
Signed-off-by: Eli Cohen 
---
 ip/iplink.c   | 40 
 man/man8/ip-link.8.in | 12 +++-
 2 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/ip/iplink.c b/ip/iplink.c
index d2e586b6d133..3f885defdfeb 100644
--- a/ip/iplink.c
+++ b/ip/iplink.c
@@ -237,6 +237,30 @@ struct iplink_req {
charbuf[1024];
 };
 
+static int extract_guid(__u64 *guid, char *arg)
+{
+   __u64 ret;
+   int g[8];
+   int err;
+
+   err = sscanf(arg, "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x",
+g, g + 1, g + 2, g + 3, g + 4, g + 5, g + 6, g + 7);
+   if (err != 8)
+   return -1;
+
+   ret = ((__u64)(g[0]) << 56) |
+ ((__u64)(g[1]) << 48) |
+ ((__u64)(g[2]) << 40) |
+ ((__u64)(g[3]) << 32) |
+ ((__u64)(g[4]) << 24) |
+ ((__u64)(g[5]) << 16) |
+ ((__u64)(g[6]) << 8) |
+ ((__u64)(g[7]));
+   *guid = ret;
+
+   return 0;
+}
+
 static int iplink_parse_vf(int vf, int *argcp, char ***argvp,
   struct iplink_req *req, int dev_index)
 {
@@ -383,6 +407,22 @@ static int iplink_parse_vf(int vf, int *argcp, char 
***argvp,
invarg("Invalid \"state\" value\n", *argv);
ivl.vf = vf;
addattr_l(&req->n, sizeof(*req), IFLA_VF_LINK_STATE, 
&ivl, sizeof(ivl));
+   } else if (matches(*argv, "node_guid") == 0) {
+   struct ifla_vf_guid ivg;
+
+   NEXT_ARG();
+   ivg.vf = vf;
+   if (extract_guid(&ivg.guid, *argv))
+   return -1;
+   addattr_l(&req->n, sizeof(*req), IFLA_VF_IB_NODE_GUID, 
&ivg, sizeof(ivg));
+   } else if (matches(*argv, "port_guid") == 0) {
+   struct ifla_vf_guid ivg;
+
+   NEXT_ARG();
+   ivg.vf = vf;
+   if (extract_guid(&ivg.guid, *argv))
+   return -1;
+   addattr_l(&req->n, sizeof(*req), IFLA_VF_IB_PORT_GUID, 
&ivg, sizeof(ivg));
} else {
/* rewind arg */
PREV_ARG();
diff --git a/man/man8/ip-link.8.in b/man/man8/ip-link.8.in
index 805511423ef2..e143a5ec8a9a 100644
--- a/man/man8/ip-link.8.in
+++ b/man/man8/ip-link.8.in
@@ -143,7 +143,11 @@ ip-link \- network device configuration
 .br
 .RB "[ " state " { " auto " | " enable " | " disable " } ]"
 .br
-.RB "[ " trust " { " on " | " off " } ] ]"
+.RB "[ " trust " { " on " | " off " } ]"
+.br
+.RB "[ " node_guid " eui64 ]"
+.br
+.RB "[ " port_guid " eui64 ] ]"
 .br
 .in -9
 .RB "[ " master
@@ -1033,6 +1037,12 @@ sent by the VF.
 .BI trust " on|off"
 - trust the specified VF user. This enables that VF user can set a specific 
feature
 which may impact security and/or performance. (e.g. VF multicast promiscuous 
mode)
+.sp
+.BI node_guid " eui64"
+- configure node GUID for the VF.
+.sp
+.BI port_guid " eui64"
+- configure port GUID for the VF.
 .in -8
 
 .TP
-- 
2.8.1

Re: [PATCH iproute2] geneve: fix IPv6 remote address reporting

2016-05-06 Thread Phil Sutter

On Fri, May 06, 2016 at 04:14:11PM +0100, Edward Cree wrote:
> On 06/05/16 15:43, Phil Sutter wrote:
> > On Fri, May 06, 2016 at 03:28:25PM +0100, Edward Cree wrote:
> >> Since we can only configure unicast, we probably want to be able to
> >> display unicast, rather than multicast.
> > Furthermore, the kernel even rejects multicast peer addresses.
> Yes, but a future kernel might not, and iproute2 is meant to be forward-
> compatible.

Sorry, but I fail to see how this might break forward compatibility.
Quite the contrary, suppose geneve in future supported multicast peers,
current iproute2 would fail to recognize its existence. What am I
missing here?

> > Why do you then propose a dubious fix to a dubious check instead of
> > getting rid of it in the first place?
> Because John Linville clearly had some reason for putting a check there,
> and he probably knows better than me.  Chesterton's fence.

A valid point, indeed. In my opinion the same applies to your patch as
well, as instead of removing the fence you're moving it to the other
lane. :)

Cheers, Phil

Re: [REGRESSION] asix: Lots of asix_rx_fixup() errors and slow transmissions

2016-05-06 Thread Andrew Lunn

> In other words, the full-speed hub is restricting the USB to
> Ethernet Adaptor to a 12Mbps (half-duplex) bandwidth to support
> Ethernet 100Mbps (full-duplex) traffic. That is not going to work
> very well because Ethernet frames (perhaps partial Ethernet frames)
> need to be discarded within the USB link.

If that really is true, the design is broken. I would expect the
adaptor to reliably transfer whole frames over USB, and drop whole
frames from its receive queue when the USB is congested. TCP is also
going to see the USB bottleneck as just like any bottleneck in the
network and back off. So TCP streams should not cause major congestion
on the USB link. Going over a 12Mbps USB link should be no different
to hitting an old Ethernet hub which can only do 10/Half.

> Therefore please retest with a working high-speed USB hub or remove
> the full-speed USB hub from the test environment and directly
> connect the USB to Ethernet Adaptor to the root hub of the USB port.
> Then repeat the tests to see whether anything improved.
> 
> In other words, you need to eliminate the dmesg messages saying "not
> running at top speed; connect to a high speed hub".

I would also suggest testing with the Ethernet at 10/half. You should
be able to use Ethtool to set that up. Your USB and Ethernet bandwidth
become more equal. If you still see errors, it suggests a protocol
implementation error somewhere.

 Andrew

Re: [PATCH iproute2] geneve: fix IPv6 remote address reporting

2016-05-06 Thread Edward Cree

On 06/05/16 15:43, Phil Sutter wrote:
> On Fri, May 06, 2016 at 03:28:25PM +0100, Edward Cree wrote:
>> Since we can only configure unicast, we probably want to be able to
>> display unicast, rather than multicast.
> Furthermore, the kernel even rejects multicast peer addresses.
Yes, but a future kernel might not, and iproute2 is meant to be forward-
compatible.

> Why do you then propose a dubious fix to a dubious check instead of
> getting rid of it in the first place?
Because John Linville clearly had some reason for putting a check there,
and he probably knows better than me.  Chesterton's fence.

-Ed

Re: [RFC PATCH net-next 14/20] net: dsa: mv88e6xxx: factorize VLAN Ethertype

2016-05-06 Thread Vivien Didelot

Hi Andrew,

Andrew Lunn  writes:

>> @@ -55,6 +58,7 @@ static const struct mv88e6xxx_info mv88e6131_table[] = {
>>  .num_databases = 256,
>>  .num_ports = 10,
>>  .flags = MV88E6XXX_FLAG_ATU |
>> +MV88E6XXX_FLAG_CORE_TAG_TYPE |
>>  MV88E6XXX_FLAG_PPU |
>>  MV88E6XXX_FLAG_VLANTABLE |
>>  MV88E6XXX_FLAG_VTU,
>
> Rather than repeating these flags again and again, could you add one
> #define containing the flags, and then use that to initialise .flags.

Hum OK, I wasn't sure, but looking at the final mv88e6xxx_info table, I
can see that models from the same family all have the same set of flags,
even if they don't have the same number of ports or databases.

I'll add one MV88E6XXX_FLAGS_ per family.

Thanks,

Vivien

Re: [REGRESSION] asix: Lots of asix_rx_fixup() errors and slow transmissions

2016-05-06 Thread Dean Jenkins


On 05/05/16 13:19, Guodong Xu wrote:

Hi, Dean

I am not sure why do you insist 'not full speed'. Actually, the tests
I run on ARM-64bit is at USB full speed mode. I pasted my log here:
http://paste.ubuntu.com/16236442/
, which includes the information you requested above, ifconfig, dmesg.
The interval between two consecutive errors varies from 10 to 40ms.



Your log from http://paste.ubuntu.com/16236442/ shows high speed for 
device 3 is not being used:


[3.586968] usb 1-1: new full-speed USB device number 2 using dwc2
[3.792091] usb 1-1: not running at top speed; connect to a high 
speed hub

[3.800477] hub 1-1:1.0: USB hub found
[3.803658] hub 1-1:1.0: 3 ports detected
[4.086636] usb 1-1.2: new full-speed USB device number 3 using dwc2
[4.202209] usb 1-1.2: not running at top speed; connect to a high 
speed hub
[8.851236] asix 1-1.2:1.0 eth0: register 'asix' at 
usb-f72c.usb-1.2, ASIX AX88772B USB 2.0 Ethernet, 00:0e:c6:fa:bf:fd


Hopefully, you know USB 2.0 high speed (480Mbps) is faster than full 
speed (12Mbps) mode.


Therefore, your USB to Ethernet Adaptor is not running in its optimal 
"normal" high speed operation and there is a USB hub in the way that is 
not running at USB high speed mode. This is an abnormal configuration 
and potentially explains some of your failure observations.


Running at full-speed (12Mbps) mode would explain why the timestamps have 
gaps of ms rather than us gaps (for 480Mbps).


In other words, the full-speed hub is restricting the USB to Ethernet 
Adaptor to a 12Mbps (half-duplex) bandwidth to support Ethernet 100Mbps 
(full-duplex) traffic. That is not going to work very well because 
Ethernet frames (perhaps partial Ethernet frames) need to be discarded 
within the USB link.


Your ifconfig output from http://paste.ubuntu.com/16236442/ shows 249 errors

eth0  Link encap:Ethernet  HWaddr 00:0e:c6:fa:bf:fd
  inet addr:192.168.1.11  Bcast:192.168.1.255 Mask:255.255.255.0
  UP BROADCAST RUNNING MULTICAST  MTU:1500  Metric:1
  RX packets:865 errors:249 dropped:0 overruns:0 frame:0
  TX packets:880 errors:0 dropped:0 overruns:0 carrier:0
  collisions:0 txqueuelen:1000
  RX bytes:1228273 (1.1 MiB)  TX bytes:68955 (67.3 KiB)

Before the test
RX packets:28 errors:0 dropped:0 overruns:0 frame:0

After the test
RX packets:865 errors:249 dropped:0 overruns:0 frame:0

Good test packets = 865 - 28 = 837
Detected bad Ethernet frames = 249

Bad to good ratio is 249:837 = 1:3.36 so 1 detected bad Ethernet frame 
per 3.36 good Ethernet frames



Your ifconfig output from http://paste.ubuntu.com/16236764/ shows 1282 
errors


eth0  Link encap:Ethernet  HWaddr 00:0e:c6:fa:bf:fd
  inet addr:192.168.1.11  Bcast:192.168.1.255 Mask:255.255.255.0
  UP BROADCAST RUNNING MULTICAST  MTU:1500  Metric:1
  RX packets:55 errors:1282 dropped:0 overruns:0 frame:0
  TX packets:64 errors:0 dropped:0 overruns:0 carrier:0
  collisions:0 txqueuelen:1000
  RX bytes:14287 (13.9 KiB)  TX bytes:7639 (7.4 KiB)

Before the test
RX packets:19 errors:0 dropped:0 overruns:0 frame:0

After the test
RX packets:55 errors:1282 dropped:0 overruns:0 frame:0

Good test packets = 55 - 19 = 36
Detected bad Ethernet frames = 1282

Bad to good ratio is 1282:36 = 1:0.028 so 1 detected bad Ethernet frame 
per 0.028 good Ethernet frames


This suggests a very high error rate.



It is interesting that the reported "remaining" value is 988. Is 988 always
shown ? I mean that do you see any other "remaining" values for the "Data
Header synchronisation was lost" error message ?

Yes and No. When doing iperf test in TCP mode, always 988. I have
never seen other "remaining" value.

But,
1. I tried "ping -f -s 1400 [my.arm.64bit.board.ip]", but this cannot
trigger the error.
2. Tried iperf in UDP mode, I saw "Data Header synchronisation was
lost" remaining value is 984 (again, seemingly always in several
tries). Log is pasted here. http://paste.ubuntu.com/16236764/


In http://paste.ubuntu.com/16236764/ you see very many
[   41.938370] asix 1-1.2:1.0 eth0: asix_rx_fixup() Bad Header Length 
0x11400040, offset 4


but only a few
[   42.214607] asix 1-1.2:1.0 eth0: asix_rx_fixup() Data Header 
synchronisation was lost, remaining 984


This suggests that the "Bad Header Length" and "Data Header 
synchronisation was lost" error messages are not related to consecutive 
URBs. The expectation is that a "Data Header synchronisation was lost" 
error message is immediately followed by a "Bad Header Length" message 
with a timestamp much less than 1ms (for high speed USB). This is 
because an Ethernet frame that spans URBs needs low latency so should be 
sent quickly in consecutive URBs.


The Bad Header Length error messages with offset 4 indicates that 32-bit 
header word was not found in the expected location at the start of the 
URB buffer.
[   41.938370] asix 1-1.2:1.0 eth0: asix_rx_fixup() Bad Header Le

Re: [PATCH iproute2] geneve: fix IPv6 remote address reporting

2016-05-06 Thread Phil Sutter

On Fri, May 06, 2016 at 03:28:25PM +0100, Edward Cree wrote:
> Since we can only configure unicast, we probably want to be able to
> display unicast, rather than multicast.

Furthermore, the kernel even rejects multicast peer addresses.

> I'm assuming this is what was intended, but tbh I don't know why we
> need to check for multicast on the display side at all, rather than
> just displaying whatever the kernel gives us.

Why do you then propose a dubious fix to a dubious check instead of
getting rid of it in the first place? Reminds me a bit of this here (no
offense intended):

http://geekandpoke.typepad.com/geekandpoke/2011/07/good-coders.html

Cheers, Phil

[PATCH iproute2] geneve: fix IPv6 remote address reporting

2016-05-06 Thread Edward Cree

Since we can only configure unicast, we probably want to be able to
display unicast, rather than multicast.

Fixes: 906ac5437ab8 ("geneve: add support for IPv6 link partners")
Signed-off-by: Edward Cree 
---
I'm assuming this is what was intended, but tbh I don't know why we
need to check for multicast on the display side at all, rather than
just displaying whatever the kernel gives us.

 ip/iplink_geneve.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ip/iplink_geneve.c b/ip/iplink_geneve.c
index 84d948f..65af6b3 100644
--- a/ip/iplink_geneve.c
+++ b/ip/iplink_geneve.c
@@ -204,7 +204,7 @@ static void geneve_print_opt(struct link_util *lu, FILE *f, 
struct rtattr *tb[])
 
memcpy(&addr, RTA_DATA(tb[IFLA_GENEVE_REMOTE6]), sizeof(struct 
in6_addr));
if (memcmp(&addr, &in6addr_any, sizeof(addr)) != 0) {
-   if (IN6_IS_ADDR_MULTICAST(&addr))
+   if (!IN6_IS_ADDR_MULTICAST(&addr))
fprintf(f, "remote %s ",
format_host(AF_INET6, sizeof(struct 
in6_addr), &addr));
}
-- 
2.4.3

[PATCH v4 1/2] soc: qcom: smd: Introduce compile stubs

2016-05-06 Thread Bjorn Andersson

Introduce compile stubs for the SMD API, allowing consumers to be
compile tested.

Acked-by: Andy Gross 
Signed-off-by: Bjorn Andersson 
---

Changes since v3:
- None

Changes since v2:
- Introduce this patch, to allow compile testing of QRTR_SMD

 include/linux/soc/qcom/smd.h | 28 +++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/include/linux/soc/qcom/smd.h b/include/linux/soc/qcom/smd.h
index d0cb6d189a0a..46a984f5e3a3 100644
--- a/include/linux/soc/qcom/smd.h
+++ b/include/linux/soc/qcom/smd.h
@@ -45,13 +45,39 @@ struct qcom_smd_driver {
int (*callback)(struct qcom_smd_device *, const void *, size_t);
 };
 
+#if IS_ENABLED(CONFIG_QCOM_SMD)
+
 int qcom_smd_driver_register(struct qcom_smd_driver *drv);
 void qcom_smd_driver_unregister(struct qcom_smd_driver *drv);
 
+int qcom_smd_send(struct qcom_smd_channel *channel, const void *data, int len);
+
+#else
+
+static inline int qcom_smd_driver_register(struct qcom_smd_driver *drv)
+{
+   return -ENXIO;
+}
+
+static inline void qcom_smd_driver_unregister(struct qcom_smd_driver *drv)
+{
+   /* This shouldn't be possible */
+   WARN_ON(1);
+}
+
+static inline int qcom_smd_send(struct qcom_smd_channel *channel,
+   const void *data, int len)
+{
+   /* This shouldn't be possible */
+   WARN_ON(1);
+   return -ENXIO;
+}
+
+#endif
+
 #define module_qcom_smd_driver(__smd_driver) \
module_driver(__smd_driver, qcom_smd_driver_register, \
  qcom_smd_driver_unregister)
 
-int qcom_smd_send(struct qcom_smd_channel *channel, const void *data, int len);
 
 #endif
-- 
2.5.0

[PATCH v4 2/2] net: Add Qualcomm IPC router

2016-05-06 Thread Bjorn Andersson

From: Courtney Cavin 

Add an implementation of Qualcomm's IPC router protocol, used to
communicate with service providing remote processors.

Signed-off-by: Courtney Cavin 
Signed-off-by: Bjorn Andersson 
[bjorn: Cope with 0 being a valid node id and implement RTM_NEWADDR]
Signed-off-by: Bjorn Andersson 
---

Changes since v3:
- Made it possible to compile qrtr as module

Changes since v2:
- Altered Kconfig dependency for QRTR_SMD to be compile testable

Changes since v1:
- Made node 0 (normally the Qualcomm modem) a valid node
- Implemented RTM_NEWADDR for specifying the local node id

 include/linux/socket.h|4 +-
 include/uapi/linux/qrtr.h |   12 +
 net/Kconfig   |1 +
 net/Makefile  |1 +
 net/qrtr/Kconfig  |   24 ++
 net/qrtr/Makefile |2 +
 net/qrtr/qrtr.c   | 1007 +
 net/qrtr/qrtr.h   |   31 ++
 net/qrtr/smd.c|  117 ++
 9 files changed, 1198 insertions(+), 1 deletion(-)
 create mode 100644 include/uapi/linux/qrtr.h
 create mode 100644 net/qrtr/Kconfig
 create mode 100644 net/qrtr/Makefile
 create mode 100644 net/qrtr/qrtr.c
 create mode 100644 net/qrtr/qrtr.h
 create mode 100644 net/qrtr/smd.c

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 73bf6c6a833b..b5cc5a6d7011 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -201,8 +201,9 @@ struct ucred {
 #define AF_NFC 39  /* NFC sockets  */
 #define AF_VSOCK   40  /* vSockets */
 #define AF_KCM 41  /* Kernel Connection Multiplexor*/
+#define AF_QIPCRTR 42  /* Qualcomm IPC Router  */
 
-#define AF_MAX 42  /* For now.. */
+#define AF_MAX 43  /* For now.. */
 
 /* Protocol families, same as address families. */
 #define PF_UNSPEC  AF_UNSPEC
@@ -249,6 +250,7 @@ struct ucred {
 #define PF_NFC AF_NFC
 #define PF_VSOCK   AF_VSOCK
 #define PF_KCM AF_KCM
+#define PF_QIPCRTR AF_QIPCRTR
 #define PF_MAX AF_MAX
 
 /* Maximum queue length specifiable by listen.  */
diff --git a/include/uapi/linux/qrtr.h b/include/uapi/linux/qrtr.h
new file mode 100644
index ..66c0748d26e2
--- /dev/null
+++ b/include/uapi/linux/qrtr.h
@@ -0,0 +1,12 @@
+#ifndef _LINUX_QRTR_H
+#define _LINUX_QRTR_H
+
+#include 
+
+struct sockaddr_qrtr {
+   __kernel_sa_family_t sq_family;
+   __u32 sq_node;
+   __u32 sq_port;
+};
+
+#endif /* _LINUX_QRTR_H */
diff --git a/net/Kconfig b/net/Kconfig
index a8934d8c8fda..b841c42e5c9b 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -236,6 +236,7 @@ source "net/mpls/Kconfig"
 source "net/hsr/Kconfig"
 source "net/switchdev/Kconfig"
 source "net/l3mdev/Kconfig"
+source "net/qrtr/Kconfig"
 
 config RPS
bool
diff --git a/net/Makefile b/net/Makefile
index 81d14119eab5..bdd14553a774 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -78,3 +78,4 @@ endif
 ifneq ($(CONFIG_NET_L3_MASTER_DEV),)
 obj-y  += l3mdev/
 endif
+obj-$(CONFIG_QRTR) += qrtr/
diff --git a/net/qrtr/Kconfig b/net/qrtr/Kconfig
new file mode 100644
index ..673fd1f86ebe
--- /dev/null
+++ b/net/qrtr/Kconfig
@@ -0,0 +1,24 @@
+# Qualcomm IPC Router configuration
+#
+
+config QRTR
+   tristate "Qualcomm IPC Router support"
+   depends on ARCH_QCOM || COMPILE_TEST
+   ---help---
+ Say Y if you intend to use Qualcomm IPC router protocol.  The
+ protocol is used to communicate with services provided by other
+ hardware blocks in the system.
+
+ In order to do service lookups, a userspace daemon is required to
+ maintain a service listing.
+
+if QRTR
+
+config QRTR_SMD
+   tristate "SMD IPC Router channels"
+   depends on QCOM_SMD || COMPILE_TEST
+   ---help---
+ Say Y here to support SMD based ipcrouter channels.  SMD is the
+ most common transport for IPC Router.
+
+endif # QRTR
diff --git a/net/qrtr/Makefile b/net/qrtr/Makefile
new file mode 100644
index ..6c00dc623b7e
--- /dev/null
+++ b/net/qrtr/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_QRTR) := qrtr.o
+obj-$(CONFIG_QRTR_SMD) += smd.o
diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c
new file mode 100644
index ..c985ecbe9bd6
--- /dev/null
+++ b/net/qrtr/qrtr.c
@@ -0,0 +1,1007 @@
+/*
+ * Copyright (c) 2015, Sony Mobile Communications Inc.
+ * Copyright (c) 2013, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+#include

[PATCH net v3] vlan: Propagate MAC address to VLANs

2016-05-06 Thread Mike Manning

The MAC address of the physical interface is only copied to the VLAN
when it is first created. After a MAC address change this results in an
inconsistency: only newly created VLANs have an up-to-date MAC.

The VLANs should continue inheriting the MAC address of the physical
interface, unless explicitly changed to be different from this. 
This allows IPv6 EUI64 addresses for the VLAN to reflect any changes
to the MAC of the physical interface and thus for DAD to behave as
expected.

Signed-off-by: Mike Manning 
---
 include/linux/if_vlan.h |2 ++
 net/8021q/vlan.c|   17 +++--
 net/8021q/vlan_dev.c|   13 ++---
 3 files changed, 23 insertions(+), 9 deletions(-)

--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -138,6 +138,7 @@ struct netpoll;
  * @flags: device flags
  * @real_dev: underlying netdevice
  * @real_dev_addr: address of underlying netdevice
+ * @addr_assign_type: address assignment type
  * @dent: proc dir entry
  * @vlan_pcpu_stats: ptr to percpu rx stats
  */
@@ -153,6 +154,7 @@ struct vlan_dev_priv {
 
struct net_device   *real_dev;
unsigned char   real_dev_addr[ETH_ALEN];
+   unsigned char   addr_assign_type;
 
struct proc_dir_entry   *dent;
struct vlan_pcpu_stats __percpu *vlan_pcpu_stats;
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -291,6 +291,15 @@ static void vlan_sync_address(struct net
if (ether_addr_equal(vlan->real_dev_addr, dev->dev_addr))
return;
 
+   /* vlan continues to inherit address of parent interface */
+   if (vlan->addr_assign_type == NET_ADDR_STOLEN) {
+   ether_addr_copy(vlandev->dev_addr, dev->dev_addr);
+   goto out;
+   }
+
+   if (!(vlandev->flags & IFF_UP))
+   goto out;
+
/* vlan address was different from the old address and is equal to
 * the new address */
if (!ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) &&
@@ -303,6 +312,7 @@ static void vlan_sync_address(struct net
!ether_addr_equal(vlandev->dev_addr, dev->dev_addr))
dev_uc_add(dev, vlandev->dev_addr);
 
+out:
ether_addr_copy(vlan->real_dev_addr, dev->dev_addr);
 }
 
@@ -389,13 +399,8 @@ static int vlan_device_event(struct noti
 
case NETDEV_CHANGEADDR:
/* Adjust unicast filters on underlying device */
-   vlan_group_for_each_dev(grp, i, vlandev) {
-   flgs = vlandev->flags;
-   if (!(flgs & IFF_UP))
-   continue;
-
+   vlan_group_for_each_dev(grp, i, vlandev)
vlan_sync_address(dev, vlandev);
-   }
break;
 
case NETDEV_CHANGEMTU:
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -315,17 +315,21 @@ static int vlan_dev_stop(struct net_devi
 
 static int vlan_dev_set_mac_address(struct net_device *dev, void *p)
 {
-   struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
+   struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+   struct net_device *real_dev = vlan->real_dev;
struct sockaddr *addr = p;
+   bool is_real_addr;
int err;
 
if (!is_valid_ether_addr(addr->sa_data))
return -EADDRNOTAVAIL;
 
+   is_real_addr = ether_addr_equal(addr->sa_data, real_dev->dev_addr);
+
if (!(dev->flags & IFF_UP))
goto out;
 
-   if (!ether_addr_equal(addr->sa_data, real_dev->dev_addr)) {
+   if (!is_real_addr) {
err = dev_uc_add(real_dev, addr->sa_data);
if (err < 0)
return err;
@@ -336,6 +340,7 @@ static int vlan_dev_set_mac_address(stru
 
 out:
ether_addr_copy(dev->dev_addr, addr->sa_data);
+   vlan->addr_assign_type = is_real_addr ? NET_ADDR_STOLEN : NET_ADDR_SET;
return 0;
 }
 
@@ -558,8 +563,10 @@ static int vlan_dev_init(struct net_devi
/* ipv6 shared card related stuff */
dev->dev_id = real_dev->dev_id;
 
-   if (is_zero_ether_addr(dev->dev_addr))
+   if (is_zero_ether_addr(dev->dev_addr)) {
eth_hw_addr_inherit(dev, real_dev);
+   vlan_dev_priv(dev)->addr_assign_type = NET_ADDR_STOLEN;
+   }
if (is_zero_ether_addr(dev->broadcast))
memcpy(dev->broadcast, real_dev->broadcast, dev->addr_len);

-- 
1.7.10.4

[PATCH net] macvtap: segmented packet is consumed

2016-05-06 Thread Eric Dumazet

From: Eric Dumazet 

If GSO packet is segmented and its segments are properly queued,
we call consume_skb() instead of kfree_skb() to be drop monitor
friendly.

Fixes: 3e4f8b7873709 ("macvtap: Perform GSO on forwarding path.")
Signed-off-by: Eric Dumazet 
Cc: Vlad Yasevich 
---
 drivers/net/macvtap.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 95394edd1ed5..9a35aa462314 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -373,7 +373,7 @@ static rx_handler_result_t macvtap_handle_frame(struct 
sk_buff **pskb)
goto wake_up;
}
 
-   kfree_skb(skb);
+   consume_skb(skb);
while (segs) {
struct sk_buff *nskb = segs->next;

Re: OpenWRT wrong adjustment of fq_codel defaults (Was: [Codel] fq_codel_drop vs a udp flood)

2016-05-06 Thread Jesper Dangaard Brouer


I've created an OpenWRT ticket[1] on this issue, as it seems that someone[2]
closed Felix's OpenWRT email account (bad choice! emails bouncing).
Sounds like OpenWRT and the LEDE https://www.lede-project.org/ project
are in some kind of conflict.

OpenWRT ticket [1] https://dev.openwrt.org/ticket/22349

[2] http://thread.gmane.org/gmane.comp.embedded.openwrt.devel/40298/focus=40335


On Fri, 6 May 2016 11:42:43 +0200
Jesper Dangaard Brouer  wrote:

> Hi Felix,
> 
> This is an important fix for OpenWRT, please read!
> 
> OpenWRT changed the default fq_codel sch->limit from 10240 to 1024,
> without also adjusting q->flows_cnt.  Eric explains below that you must
> also adjust the buckets (q->flows_cnt) for this not to break. (Just
> adjust it to 128)
> 
> Problematic OpenWRT commit in question:
>  http://git.openwrt.org/?p=openwrt.git;a=patch;h=12cd6578084e
>  12cd6578084e ("kernel: revert fq_codel quantum override to prevent it from 
> causing too much cpu load with higher speed (#21326)")
> 
> 
> I also highly recommend you cherry-pick this very recent commit:
>  net-next: 9d18562a2278 ("fq_codel: add batch ability to fq_codel_drop()")
>  https://git.kernel.org/davem/net-next/c/9d18562a227
> 
> This should fix very high CPU usage in-case fq_codel goes into drop mode.
> The problem is that drop mode was considered rare, and implementation
> wise it was chosen to be more expensive (to save cycles on normal mode).
> Unfortunately it is easy to trigger with a UDP flood. Drop mode is
> especially expensive for smaller devices, as it scans a 4K big array,
> thus 64 cache misses for small devices!
> 
> The fix is to allow drop-mode to bulk-drop more packets when entering
> drop-mode (default 64 bulk drop).  That way we don't suddenly
> experience a significantly higher processing cost per packet, but
> instead can amortize this.
> 
> To Eric, should we recommend OpenWRT to adjust default (max) 64 bulk
> drop, given we also recommend bucket size to be 128 ? (thus the amount
> of memory to scan is less, but their CPU is also much smaller).
> 
> --Jesper
> 
> 
> On Thu, 05 May 2016 12:23:27 -0700 Eric Dumazet  
> wrote:
> 
> > On Thu, 2016-05-05 at 19:25 +0300, Roman Yeryomin wrote:  
> > > On 5 May 2016 at 19:12, Eric Dumazet  wrote:
> > > > On Thu, 2016-05-05 at 17:53 +0300, Roman Yeryomin wrote:
> > > >
> > > >>
> > > >> qdisc fq_codel 0: dev eth0 root refcnt 2 limit 1024p flows 1024
> > > >> quantum 1514 target 5.0ms interval 100.0ms ecn
> > > >>  Sent 12306 bytes 128 pkt (dropped 0, overlimits 0 requeues 0)
> > > >>  backlog 0b 0p requeues 0
> > > >>   maxpacket 0 drop_overlimit 0 new_flow_count 0 ecn_mark 0
> > > >>   new_flows_len 0 old_flows_len 0
> > > >
> > > >
> > > > Limit of 1024 packets and 1024 flows is not wise I think.
> > > >
> > > > (If all buckets are in use, each bucket has a virtual queue of 1 packet,
> > > > which is almost the same than having no queue at all)
> > > >
> > > > I suggest to have at least 8 packets per bucket, to let Codel have a
> > > > chance to trigger.
> > > >
> > > > So you could either reduce number of buckets to 128 (if memory is
> > > > tight), or increase limit to 8192.
> > > 
> > > Will try, but what I've posted is default, I didn't change/configure 
> > > that.
> > 
> > fq_codel has a default of 10240 packets and 1024 buckets.
> > 
> > http://lxr.free-electrons.com/source/net/sched/sch_fq_codel.c#L413
> > 
> > If someone changed that in the linux variant you use, he probably should
> > explain the rationale.  

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  Author of http://www.iptv-analyzer.org
  LinkedIn: http://www.linkedin.com/in/brouer

[PATCH v2] net: arc/emac: Move arc_emac_tx_clean() into arc_emac_tx() and disable tx interrupt

2016-05-06 Thread Caesar Wang

From: Shuyu Wei 

Doing tx_clean() inside poll() may scramble the tx ring buffer if
tx() is running. This will cause tx to stop working, which can be
reproduced by simultaneously downloading two large files at high speed.

Moving tx_clean() into tx() will prevent this. And tx interrupt is no
longer needed now.

I picked up Shuyu's patch, which was originally sent at
https://patchwork.kernel.org/patch/8356821/, since it makes sense for the
Rockchip platform.
Note: Many people have reported crashes with the rk3036/rk3188 emac under
heavy download load, and this patch does indeed fix the crash.

The crash log is as follows:
...
[ 2191.996127 ] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.4.0-rc6 #114
[ 2192.002475 ] Hardware name: Rockchip (Device Tree)
[ 2192.007174 ] Backtrace:
[ 2192.009658 ] [] (dump_backtrace) from []
(show_stack+0x18/0x1c)
[ 2192.017220 ]  r7:c051c4f8 r6:ef463180 r5:c05b7000 r4:
[ 2192.022948 ] [] (show_stack) from []
(dump_stack+0x90/0xa0)
[ 2192.030176 ] [] (dump_stack) from []
(bad_page+0xdc/0x12c)
[ 2192.037302 ]  r5:c059a100 r4:c05f430c
[ 2192.040913 ] [] (bad_page) from []
(get_page_from_freelist+0x388/0x95c)
[ 2192.049166 ]  r9:0008 r8:ef463180 r7:c051c4d0 r6:
r5: r4:c051c4e4
[ 2192.056982 ] [] (get_page_from_freelist) from
[] (__alloc_pages_nodemask+0xd8/0x8e8)
[ 2192.066362 ]  r10:c001b068 r9: r8:ee0b02b0 r7:6113
r6:0003 r5:02095220
[ 2192.074254 ]  r4:c05ca1c0
[ 2192.076809 ] [] (__alloc_pages_nodemask) from
[] (__alloc_page_frag+0xb0/0x160)
[ 2192.085757 ]  r10:c001b068 r9: r8:ee0b02b0 r7:6113
r6:02080020 r5:0740
[ 2192.093650 ]  r4:eedbc884
[ 2192.096207 ] [] (__alloc_page_frag) from []
(__netdev_alloc_skb+0xa0/0x104)
[ 2192.104806 ]  r7:6113 r6:eedbc884 r5:ee0b r4:0740
[ 2192.110525 ] [] (__netdev_alloc_skb) from []
(arc_emac_poll+0x318/0x57c)
[ 2192.118865 ]  r9: r8:ee0b02b0 r7:019c r6:ee163780
r5:0670 r4:ee0b
[ 2192.126683 ] [] (arc_emac_poll) from []
(net_rx_action+0x1f0/0x2ec)
[ 2192.134590 ]  r10:c0599df8 r9:c059a100 r8:00073760 r7:012c
r6:0028 r5:c02aa8e8
[ 2192.142483 ]  r4:ee0b04e0
[ 2192.145040 ] [] (net_rx_action) from []
(__do_softirq+0x134/0x258)
[ 2192.152860 ]  r10:c059a080 r9:4003 r8:0003 r7:0100
r6:c0598000 r5:c059a08c
[ 2192.160751 ]  r4:
...

Signed-off-by: Shuyu Wei 
Tested-by: Michael Niewoehner 
Tested-by: Xing Zheng 
Cc: "David S. Miller" 
Cc: Alexander Kochetkov 
Cc: netdev@vger.kernel.org
Signed-off-by: Caesar Wang 
---

 drivers/net/ethernet/arc/emac_main.c | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/arc/emac_main.c 
b/drivers/net/ethernet/arc/emac_main.c
index a3a9392..4f4e25e 100644
--- a/drivers/net/ethernet/arc/emac_main.c
+++ b/drivers/net/ethernet/arc/emac_main.c
@@ -311,12 +311,10 @@ static int arc_emac_poll(struct napi_struct *napi, int 
budget)
struct arc_emac_priv *priv = netdev_priv(ndev);
unsigned int work_done;
 
-   arc_emac_tx_clean(ndev);
-
work_done = arc_emac_rx(ndev, budget);
if (work_done < budget) {
napi_complete(napi);
-   arc_reg_or(priv, R_ENABLE, RXINT_MASK | TXINT_MASK);
+   arc_reg_or(priv, R_ENABLE, RXINT_MASK);
}
 
return work_done;
@@ -345,9 +343,9 @@ static irqreturn_t arc_emac_intr(int irq, void 
*dev_instance)
/* Reset all flags except "MDIO complete" */
arc_reg_set(priv, R_STATUS, status);
 
-   if (status & (RXINT_MASK | TXINT_MASK)) {
+   if (status & RXINT_MASK) {
if (likely(napi_schedule_prep(&priv->napi))) {
-   arc_reg_clr(priv, R_ENABLE, RXINT_MASK | TXINT_MASK);
+   arc_reg_clr(priv, R_ENABLE, RXINT_MASK);
__napi_schedule(&priv->napi);
}
}
@@ -461,7 +459,7 @@ static int arc_emac_open(struct net_device *ndev)
arc_reg_set(priv, R_TX_RING, (unsigned int)priv->txbd_dma);
 
/* Enable interrupts */
-   arc_reg_set(priv, R_ENABLE, RXINT_MASK | TXINT_MASK | ERR_MASK);
+   arc_reg_set(priv, R_ENABLE, RXINT_MASK | ERR_MASK);
 
/* Set CONTROL */
arc_reg_set(priv, R_CTRL,
@@ -594,7 +592,7 @@ static int arc_emac_stop(struct net_device *ndev)
netif_stop_queue(ndev);
 
/* Disable interrupts */
-   arc_reg_clr(priv, R_ENABLE, RXINT_MASK | TXINT_MASK | ERR_MASK);
+   arc_reg_clr(priv, R_ENABLE, RXINT_MASK | ERR_MASK);
 
/* Disable EMAC */
arc_reg_clr(priv, R_CTRL, EN_MASK);
@@ -656,6 +654,8 @@ static int arc_emac_tx(struct sk_buff *skb, struct 
net_device *ndev)
__le32 *info = &priv->txbd[*txbd_curr].info;
dma_addr_t addr;
 
+   arc_emac_tx_clean(ndev);
+
if (skb_padto(skb, ETH_ZLEN))
return NETDEV_TX_OK;
 
-- 
1.9.1

Re: [PATCH net] netfilter: nf_conntrack: Use net_mutex for helper unregistration.

2016-05-06 Thread Pablo Neira Ayuso

Hi Joe,

On Thu, May 05, 2016 at 03:50:37PM -0700, Joe Stringer wrote:
> diff --git a/net/netfilter/nf_conntrack_helper.c 
> b/net/netfilter/nf_conntrack_helper.c
> index 3b40ec575cd5..6860b19be406 100644
> --- a/net/netfilter/nf_conntrack_helper.c
> +++ b/net/netfilter/nf_conntrack_helper.c
> @@ -449,10 +449,10 @@ void nf_conntrack_helper_unregister(struct 
> nf_conntrack_helper *me)
>*/
>   synchronize_rcu();
>  
> - rtnl_lock();
> + mutex_lock(&net_mutex);
>   for_each_net(net)
>   __nf_conntrack_helper_unregister(me, net);
> - rtnl_unlock();
> + mutex_unlock(&net_mutex);

This simple solution works because we have no .exit callbacks in any
of our helpers. Otherwise, the helper code may already be gone by the
time the worker has a chance to run to release the netns.

If so, probably I can append this as comment to this function so we
don't forget. If we ever have .exit callbacks (I don't expect so), we
would need to wait for worker completion.

Re: ixgbe: cannot enable LRO

2016-05-06 Thread Otto Sabart

> On Wed, Apr 27, 2016 at 2:36 AM, Otto Sabart  wrote:
> >
> > Hello everyone,
> > does anybody have a problem with LRO on ixge (on latest 4.6-rc5)?
> > I cannot find a way to enable it.
> >
> > On stable RHEL7.2 kernel everything works fine.
> >
> > I opened a bug report [0].
> >
> > [0] https://bugzilla.kernel.org/show_bug.cgi?id=117291
> >
> >
> > Thanks!
> >
> > Ota
> 

Hello Alex,

> So I am able to turn on LRO without any issues.

Yes, I described the problem badly. The LRO was not possible to turn on
immediately _after the boot_ (I was enabling it in /etc/rc.local).

When I reloaded the ixgbe driver, the LRO was possible to turn on
without problem.

I found out that the problem was caused by network manager. When I
disabled NM, the LRO started to work.

> 
> Do you know if you have done anything that might disable LRO such as
> modified the rx-usecs to a value less than 10 or enabled routing or
> bridging on the device?  Also I think a stacked device might be able
> to block you from enabling LRO unless all the devices stacked on the
> interface can support it.

I did not modify rx-usecs (at least not intentionally).  Its value is
always (with disabled or enabled LRO) equal to 1 (with disabled NM).

$ ethtool -c ixgbe | grep rx-usecs
rx-usecs: 1
rx-usecs-irq: 0
rx-usecs-low: 0
rx-usecs-high: 0

There is no linux bridge or routing enabled on this device. There is
only a VLAN configured. Does it matter?

$ ip l show ixgbe
9: ixgbe:  mtu 1500 qdisc mq state UP mode 
DEFAULT qlen 1000
link/ether 00:1b:21:90:c3:86 brd ff:ff:ff:ff:ff:ff

$ ip l show ixgbe.40
19: ixgbe.40@ixgbe:  mtu 1500 qdisc noqueue 
state UP mode DEFAULT qlen 1000
link/ether 00:1b:21:90:c3:86 brd ff:ff:ff:ff:ff:ff

I updated the bugzilla [0] and I think we can close this as a NOTABUG.

Thanks!

Ota

signature.asc
Description: PGP signature

Re: [PATCHv2 bluetooth-next 07/10] ipv6: introduce neighbour discovery ops

2016-05-06 Thread Alexander Aring


Hi,

On 05/06/2016 11:47 AM, Alexander Aring wrote:
> 
> Hi,
> 
> On 05/04/2016 02:23 PM, Stefan Schmidt wrote:
>> Hello.
>>
>> On 20/04/16 10:19, Alexander Aring wrote:
>>> This patch introduces neighbour discovery ops callback structure. The
>>> structure contains at first receive and transmit handling for NS/NA and
>>> userspace option field functionality.
>>>
>>> These callback offers 6lowpan different handling, such as 802.15.4 short
>>> address handling or RFC6775 (Neighbor Discovery Optimization for IPv6 over
>>> 6LoWPANs).
>>>
>>> Cc: David S. Miller
>>> Cc: Alexey Kuznetsov
>>> Cc: James Morris
>>> Cc: Hideaki YOSHIFUJI
>>> Cc: Patrick McHardy
>>> Signed-off-by: Alexander Aring
>>> ---
>>>   include/linux/netdevice.h |  3 ++
>>>   include/net/ndisc.h   | 96 
>>> +++
>>>   net/ipv6/addrconf.c   |  1 +
>>>   net/ipv6/ndisc.c  | 71 ---
>>>   net/ipv6/route.c  |  2 +-
>>>   5 files changed, 144 insertions(+), 29 deletions(-)
>>>
>>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>>> index 0052c42..bc60033 100644
>>> --- a/include/linux/netdevice.h
>>> +++ b/include/linux/netdevice.h
>>> @@ -1677,6 +1677,9 @@ struct net_device {
>>>   #ifdef CONFIG_NET_L3_MASTER_DEV
>>>   const struct l3mdev_ops*l3mdev_ops;
>>>   #endif
>>> +#if IS_ENABLED(CONFIG_IPV6)
>>> +const struct ndisc_ops *ndisc_ops;
>>> +#endif
>>> const struct header_ops *header_ops;
>>>   diff --git a/include/net/ndisc.h b/include/net/ndisc.h
>>> index aac868e..14ed016 100644
>>> --- a/include/net/ndisc.h
>>> +++ b/include/net/ndisc.h
>>> @@ -110,7 +110,8 @@ struct ndisc_options {
>>> #define NDISC_OPT_SPACE(len) (((len)+2+7)&~7)
>>>   -struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len,
>>> +struct ndisc_options *ndisc_parse_options(const struct net_device *dev,
>>> +  u8 *opt, int opt_len,
>>> struct ndisc_options *ndopts);
>>> /*
>>> @@ -173,6 +174,93 @@ static inline struct neighbour 
>>> *__ipv6_neigh_lookup(struct net_device *dev, cons
>>>   return n;
>>>   }
>>>   +static inline int __ip6_ndisc_is_useropt(struct nd_opt_hdr *opt)
>>
>> Name it __ipv6... instead of __ip6...?
> 
> I had ipv6 before, but then I saw ip6... prefixed functionality in
> ndisc.c and changed it to ip6, but both seem to be used.
> 
> See "ip6_nd_hdr".
> 
>>> +{
>>> +return opt->nd_opt_type == ND_OPT_RDNSS ||
>>> +opt->nd_opt_type == ND_OPT_DNSSL;
>>> +}
>>> +
>>> +#if IS_ENABLED(CONFIG_IPV6)
>>> +struct ndisc_ops {
>>> +int(*is_useropt)(struct nd_opt_hdr *opt);
>>> +void(*send_na)(struct net_device *dev,
>>> +   const struct in6_addr *daddr,
>>> +   const struct in6_addr *solicited_addr,
>>> +   bool router, bool solicited,
>>> +   bool override, bool inc_opt);
>>> +void(*recv_na)(struct sk_buff *skb);
>>> +void(*send_ns)(struct net_device *dev,
>>> +   const struct in6_addr *solicit,
>>> +   const struct in6_addr *daddr,
>>> +   const struct in6_addr *saddr);
>>> +void(*recv_ns)(struct sk_buff *skb);
>>> +};
>>> +
>>> +static inline int ndisc_is_useropt(const struct net_device *dev,
>>> +   struct nd_opt_hdr *opt)
>>> +{
>>> +if (likely(dev->ndisc_ops->is_useropt))
>>> +return dev->ndisc_ops->is_useropt(opt);
>>> +else
>>> +return 0;
>>> +}
>>> +
>>> +static inline void ndisc_send_na(struct net_device *dev,
>>> + const struct in6_addr *daddr,
>>> + const struct in6_addr *solicited_addr,
>>> + bool router, bool solicited, bool override,
>>> + bool inc_opt)
>>> +{
>>> +if (likely(dev->ndisc_ops->send_na))
>>> +dev->ndisc_ops->send_na(dev, daddr, solicited_addr, router,
>>> +solicited, override, inc_opt);
>>> +}
>>> +
>>> +static inline void ndisc_recv_na(struct sk_buff *skb)
>>> +{
>>> +if (likely(skb->dev->ndisc_ops->recv_na))
>>> +skb->dev->ndisc_ops->recv_na(skb);
>>> +}
>>> +
>>> +static inline void ndisc_send_ns(struct net_device *dev,
>>> + const struct in6_addr *solicit,
>>> + const struct in6_addr *daddr,
>>> + const struct in6_addr *saddr)
>>> +{
>>> +if (likely(dev->ndisc_ops->send_ns))
>>> +dev->ndisc_ops->send_ns(dev, solicit, daddr, saddr);
>>> +}
>>> +
>>> +static inline void ndisc_recv_ns(struct sk_buff *skb)
>>> +{
>>> +if (likely(skb->dev->ndisc_ops->recv_ns))
>>> +skb->dev->ndisc_ops->recv_ns(skb);
>>> +}
>>> +#else
>>> +static inline int ndisc_is_useropt(const struct net_device *dev,
>>> +   struct nd_opt_hdr *opt)
>>> +{
>>> +return 0;
>>> +}
>>> +
>>> +static inline void ndisc_send_na(struct net_device *dev,
>>> + const struct in6_addr *daddr,
>>>

Re: [PATCHv2 bluetooth-next 07/10] ipv6: introduce neighbour discovery ops

2016-05-06 Thread Alexander Aring


Hi,

On 05/04/2016 02:23 PM, Stefan Schmidt wrote:
> Hello.
> 
> On 20/04/16 10:19, Alexander Aring wrote:
>> This patch introduces neighbour discovery ops callback structure. The
>> structure contains at first receive and transmit handling for NS/NA and
>> userspace option field functionality.
>>
>> These callback offers 6lowpan different handling, such as 802.15.4 short
>> address handling or RFC6775 (Neighbor Discovery Optimization for IPv6 over
>> 6LoWPANs).
>>
>> Cc: David S. Miller
>> Cc: Alexey Kuznetsov
>> Cc: James Morris
>> Cc: Hideaki YOSHIFUJI
>> Cc: Patrick McHardy
>> Signed-off-by: Alexander Aring
>> ---
>>   include/linux/netdevice.h |  3 ++
>>   include/net/ndisc.h   | 96 
>> +++
>>   net/ipv6/addrconf.c   |  1 +
>>   net/ipv6/ndisc.c  | 71 ---
>>   net/ipv6/route.c  |  2 +-
>>   5 files changed, 144 insertions(+), 29 deletions(-)
>>
>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>> index 0052c42..bc60033 100644
>> --- a/include/linux/netdevice.h
>> +++ b/include/linux/netdevice.h
>> @@ -1677,6 +1677,9 @@ struct net_device {
>>   #ifdef CONFIG_NET_L3_MASTER_DEV
>>   const struct l3mdev_ops*l3mdev_ops;
>>   #endif
>> +#if IS_ENABLED(CONFIG_IPV6)
>> +const struct ndisc_ops *ndisc_ops;
>> +#endif
>> const struct header_ops *header_ops;
>>   diff --git a/include/net/ndisc.h b/include/net/ndisc.h
>> index aac868e..14ed016 100644
>> --- a/include/net/ndisc.h
>> +++ b/include/net/ndisc.h
>> @@ -110,7 +110,8 @@ struct ndisc_options {
>> #define NDISC_OPT_SPACE(len) (((len)+2+7)&~7)
>>   -struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len,
>> +struct ndisc_options *ndisc_parse_options(const struct net_device *dev,
>> +  u8 *opt, int opt_len,
>> struct ndisc_options *ndopts);
>> /*
>> @@ -173,6 +174,93 @@ static inline struct neighbour 
>> *__ipv6_neigh_lookup(struct net_device *dev, cons
>>   return n;
>>   }
>>   +static inline int __ip6_ndisc_is_useropt(struct nd_opt_hdr *opt)
> 
> Name it __ipv6... instead of __ip6...?

I had ipv6 before, but then I saw ip6... prefixed functionality in
ndisc.c and changed it to ip6, but both seem to be used.

See "ip6_nd_hdr".

>> +{
>> +return opt->nd_opt_type == ND_OPT_RDNSS ||
>> +opt->nd_opt_type == ND_OPT_DNSSL;
>> +}
>> +
>> +#if IS_ENABLED(CONFIG_IPV6)
>> +struct ndisc_ops {
>> +int(*is_useropt)(struct nd_opt_hdr *opt);
>> +void(*send_na)(struct net_device *dev,
>> +   const struct in6_addr *daddr,
>> +   const struct in6_addr *solicited_addr,
>> +   bool router, bool solicited,
>> +   bool override, bool inc_opt);
>> +void(*recv_na)(struct sk_buff *skb);
>> +void(*send_ns)(struct net_device *dev,
>> +   const struct in6_addr *solicit,
>> +   const struct in6_addr *daddr,
>> +   const struct in6_addr *saddr);
>> +void(*recv_ns)(struct sk_buff *skb);
>> +};
>> +
>> +static inline int ndisc_is_useropt(const struct net_device *dev,
>> +   struct nd_opt_hdr *opt)
>> +{
>> +if (likely(dev->ndisc_ops->is_useropt))
>> +return dev->ndisc_ops->is_useropt(opt);
>> +else
>> +return 0;
>> +}
>> +
>> +static inline void ndisc_send_na(struct net_device *dev,
>> + const struct in6_addr *daddr,
>> + const struct in6_addr *solicited_addr,
>> + bool router, bool solicited, bool override,
>> + bool inc_opt)
>> +{
>> +if (likely(dev->ndisc_ops->send_na))
>> +dev->ndisc_ops->send_na(dev, daddr, solicited_addr, router,
>> +solicited, override, inc_opt);
>> +}
>> +
>> +static inline void ndisc_recv_na(struct sk_buff *skb)
>> +{
>> +if (likely(skb->dev->ndisc_ops->recv_na))
>> +skb->dev->ndisc_ops->recv_na(skb);
>> +}
>> +
>> +static inline void ndisc_send_ns(struct net_device *dev,
>> + const struct in6_addr *solicit,
>> + const struct in6_addr *daddr,
>> + const struct in6_addr *saddr)
>> +{
>> +if (likely(dev->ndisc_ops->send_ns))
>> +dev->ndisc_ops->send_ns(dev, solicit, daddr, saddr);
>> +}
>> +
>> +static inline void ndisc_recv_ns(struct sk_buff *skb)
>> +{
>> +if (likely(skb->dev->ndisc_ops->recv_ns))
>> +skb->dev->ndisc_ops->recv_ns(skb);
>> +}
>> +#else
>> +static inline int ndisc_is_useropt(const struct net_device *dev,
>> +   struct nd_opt_hdr *opt)
>> +{
>> +return 0;
>> +}
>> +
>> +static inline void ndisc_send_na(struct net_device *dev,
>> + const struct in6_addr *daddr,
>> + const struct in6_addr *solicited_addr,
>> + bool router, bool solicited, bool override,
>> + bool inc_opt) { }
>> +
>> +static inline void ndisc_rec

OpenWRT wrong adjustment of fq_codel defaults (Was: [Codel] fq_codel_drop vs a udp flood)

2016-05-06 Thread Jesper Dangaard Brouer

Hi Felix,

This is an important fix for OpenWRT, please read!

OpenWRT changed the default fq_codel sch->limit from 10240 to 1024,
without also adjusting q->flows_cnt.  Eric explains below that you must
also adjust the buckets (q->flows_cnt) for this not to break. (Just
adjust it to 128)

Problematic OpenWRT commit in question:
 http://git.openwrt.org/?p=openwrt.git;a=patch;h=12cd6578084e
 12cd6578084e ("kernel: revert fq_codel quantum override to prevent it from 
causing too much cpu load with higher speed (#21326)")

I also highly recommend you cherry-pick this very recent commit:
 net-next: 9d18562a2278 ("fq_codel: add batch ability to fq_codel_drop()")
 https://git.kernel.org/davem/net-next/c/9d18562a227

This should fix very high CPU usage in-case fq_codel goes into drop mode.
The problem is that drop mode was considered rare, and implementation
wise it was chosen to be more expensive (to save cycles on normal mode).
Unfortunately it is easy to trigger with a UDP flood. Drop mode is
especially expensive for smaller devices, as it scans a 4K big array,
thus 64 cache misses for small devices!

The fix is to allow drop-mode to bulk-drop more packets when entering
drop-mode (default 64 bulk drop).  That way we don't suddenly
experience a significantly higher processing cost per packet, but
instead can amortize this.

To Eric, should we recommend OpenWRT to adjust default (max) 64 bulk
drop, given we also recommend bucket size to be 128 ? (thus the amount
of memory to scan is less, but their CPU is also much smaller).

--Jesper

On Thu, 05 May 2016 12:23:27 -0700 Eric Dumazet  wrote:

> On Thu, 2016-05-05 at 19:25 +0300, Roman Yeryomin wrote:
> > On 5 May 2016 at 19:12, Eric Dumazet  wrote:  
> > > On Thu, 2016-05-05 at 17:53 +0300, Roman Yeryomin wrote:
> > >  
> > >>
> > >> qdisc fq_codel 0: dev eth0 root refcnt 2 limit 1024p flows 1024
> > >> quantum 1514 target 5.0ms interval 100.0ms ecn
> > >>  Sent 12306 bytes 128 pkt (dropped 0, overlimits 0 requeues 0)
> > >>  backlog 0b 0p requeues 0
> > >>   maxpacket 0 drop_overlimit 0 new_flow_count 0 ecn_mark 0
> > >>   new_flows_len 0 old_flows_len 0  
> > >
> > >
> > > Limit of 1024 packets and 1024 flows is not wise I think.
> > >
> > > (If all buckets are in use, each bucket has a virtual queue of 1 packet,
> > > > which is almost the same as having no queue at all)
> > >
> > > I suggest to have at least 8 packets per bucket, to let Codel have a
> > > chance to trigger.
> > >
> > > So you could either reduce number of buckets to 128 (if memory is
> > > tight), or increase limit to 8192.  
> > 
> > Will try, but what I've posted is default, I didn't change/configure that.  
> 
> fq_codel has a default of 10240 packets and 1024 buckets.
> 
> http://lxr.free-electrons.com/source/net/sched/sch_fq_codel.c#L413
> 
> If someone changed that in the linux variant you use, he probably should
> explain the rationale.

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  Author of http://www.iptv-analyzer.org
  LinkedIn: http://www.linkedin.com/in/brouer

Re: [PATCHv2 bluetooth-next 07/10] ipv6: introduce neighbour discovery ops

2016-05-06 Thread Alexander Aring


Hi,

On 05/03/2016 08:17 PM, Stefan Schmidt wrote:
> Hello.
> 
> On 02/05/16 21:36, Hannes Frederic Sowa wrote:
>> On 20.04.2016 10:19, Alexander Aring wrote:
>>> This patch introduces neighbour discovery ops callback structure. The
>>> structure contains at first receive and transmit handling for NS/NA and
>>> userspace option field functionality.
>>>
>>> These callback offers 6lowpan different handling, such as 802.15.4 short
>>> address handling or RFC6775 (Neighbor Discovery Optimization for IPv6 over
>>> 6LoWPANs).
>>>
>>> Cc: David S. Miller
>>> Cc: Alexey Kuznetsov
>>> Cc: James Morris
>>> Cc: Hideaki YOSHIFUJI
>>> Cc: Patrick McHardy
>>> Signed-off-by: Alexander Aring
>>> ---
>>>   include/linux/netdevice.h |  3 ++
>>>   include/net/ndisc.h   | 96 
>>> +++
>>>   net/ipv6/addrconf.c   |  1 +
>>>   net/ipv6/ndisc.c  | 71 ---
>>>   net/ipv6/route.c  |  2 +-
>>>   5 files changed, 144 insertions(+), 29 deletions(-)
>>>
>>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>>> index 0052c42..bc60033 100644
>>> --- a/include/linux/netdevice.h
>>> +++ b/include/linux/netdevice.h
>>> @@ -1677,6 +1677,9 @@ struct net_device {
>>>   #ifdef CONFIG_NET_L3_MASTER_DEV
>>>   const struct l3mdev_ops*l3mdev_ops;
>>>   #endif
>>> +#if IS_ENABLED(CONFIG_IPV6)
>>> +const struct ndisc_ops *ndisc_ops;
>>> +#endif
>>> const struct header_ops *header_ops;
>>>   diff --git a/include/net/ndisc.h b/include/net/ndisc.h
>>> index aac868e..14ed016 100644
>>> --- a/include/net/ndisc.h
>>> +++ b/include/net/ndisc.h
>>> @@ -110,7 +110,8 @@ struct ndisc_options {
>>> #define NDISC_OPT_SPACE(len) (((len)+2+7)&~7)
>>>   -struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len,
>>> +struct ndisc_options *ndisc_parse_options(const struct net_device *dev,
>>> +  u8 *opt, int opt_len,
>>> struct ndisc_options *ndopts);
>>> /*
>>> @@ -173,6 +174,93 @@ static inline struct neighbour 
>>> *__ipv6_neigh_lookup(struct net_device *dev, cons
>>>   return n;
>>>   }
>>>   +static inline int __ip6_ndisc_is_useropt(struct nd_opt_hdr *opt)
>>> +{
>>> +return opt->nd_opt_type == ND_OPT_RDNSS ||
>>> +opt->nd_opt_type == ND_OPT_DNSSL;
>>> +}
>>> +
>>> +#if IS_ENABLED(CONFIG_IPV6)
>>> +struct ndisc_ops {
>>> +int(*is_useropt)(struct nd_opt_hdr *opt);
>>> +void(*send_na)(struct net_device *dev,
>>> +   const struct in6_addr *daddr,
>>> +   const struct in6_addr *solicited_addr,
>>> +   bool router, bool solicited,
>>> +   bool override, bool inc_opt);
>>> +void(*recv_na)(struct sk_buff *skb);
>>> +void(*send_ns)(struct net_device *dev,
>>> +   const struct in6_addr *solicit,
>>> +   const struct in6_addr *daddr,
>>> +   const struct in6_addr *saddr);
>>> +void(*recv_ns)(struct sk_buff *skb);
>>> +};
>>> +
>>> +static inline int ndisc_is_useropt(const struct net_device *dev,
>>> +   struct nd_opt_hdr *opt)
>>> +{
>>> +if (likely(dev->ndisc_ops->is_useropt))
>>> +return dev->ndisc_ops->is_useropt(opt);
>>> +else
>>> +return 0;
>>> +}
>>> +
>>> +static inline void ndisc_send_na(struct net_device *dev,
>>> + const struct in6_addr *daddr,
>>> + const struct in6_addr *solicited_addr,
>>> + bool router, bool solicited, bool override,
>>> + bool inc_opt)
>>> +{
>>> +if (likely(dev->ndisc_ops->send_na))
>>> +dev->ndisc_ops->send_na(dev, daddr, solicited_addr, router,
>>> +solicited, override, inc_opt);
>>> +}
>>> +
>>> +static inline void ndisc_recv_na(struct sk_buff *skb)
>>> +{
>>> +if (likely(skb->dev->ndisc_ops->recv_na))
>>> +skb->dev->ndisc_ops->recv_na(skb);
>>> +}
>>> +
>>> +static inline void ndisc_send_ns(struct net_device *dev,
>>> + const struct in6_addr *solicit,
>>> + const struct in6_addr *daddr,
>>> + const struct in6_addr *saddr)
>>> +{
>>> +if (likely(dev->ndisc_ops->send_ns))
>>> +dev->ndisc_ops->send_ns(dev, solicit, daddr, saddr);
>>> +}
>>> +
>>> +static inline void ndisc_recv_ns(struct sk_buff *skb)
>>> +{
>>> +if (likely(skb->dev->ndisc_ops->recv_ns))
>>> +skb->dev->ndisc_ops->recv_ns(skb);
>>> +}
>>> +#else
>>> +static inline int ndisc_is_useropt(const struct net_device *dev,
>>> +   struct nd_opt_hdr *opt)
>>> +{
>>> +return 0;
>>> +}
>>> +
>>> +static inline void ndisc_send_na(struct net_device *dev,
>>> + const struct in6_addr *daddr,
>>> + const struct in6_addr *solicited_addr,
>>> + bool router, bool solicited, bool override,
>>> + bool inc_opt) { }
>>> +
>>> +static inline void ndisc_recv_na(struct sk_b

Re: [PATCH v9 net-next 4/7] openvswitch: add layer 3 flow/port support

2016-05-06 Thread Jiri Benc

On Wed,  4 May 2016 16:36:30 +0900, Simon Horman wrote:
> +static int push_eth(struct sk_buff *skb, struct sw_flow_key *key,
> + const struct ovs_action_push_eth *ethh)
> +{
> + int err;
> +
> + /* De-accelerate any hardware accelerated VLAN tag added to a previous
> +  * Ethernet header */
> + err = skb_vlan_deaccel(skb);
> + if (unlikely(err))
> + return err;
> +
> + /* Add the new Ethernet header */
> + if (skb_cow_head(skb, ETH_HLEN) < 0)
> + return -ENOMEM;
> +
> + skb_push(skb, ETH_HLEN);
> + skb_reset_mac_header(skb);
> + skb_reset_mac_len(skb);
> +
> + ether_addr_copy(eth_hdr(skb)->h_source, ethh->addresses.eth_src);
> + ether_addr_copy(eth_hdr(skb)->h_dest, ethh->addresses.eth_dst);
> + eth_hdr(skb)->h_proto = ethh->eth_type;

This doesn't seem right. We know the packet type, it's skb->protocol.
We should fill in that.

In addition, we should check whether mac_len > 0 and in such case,
change skb->protocol to ETH_P_TEB first (and store that value in the
pushed eth header).

Similarly on pop_eth, we need to check skb->protocol and if it is
ETH_P_TEB, call eth_type_trans on the modified frame to set the new
skb->protocol correctly. It's probably not that simple, as we'd need a
version of eth_type_trans that doesn't need a net device.

 Jiri

Re: [PATCHv2 bluetooth-next 02/10] 6lowpan: add 802.15.4 short addr slaac

2016-05-06 Thread Alexander Aring


Hi,

On 05/03/2016 08:16 PM, Stefan Schmidt wrote:
> Hello.
> 
> On 20/04/16 10:19, Alexander Aring wrote:
>> This patch adds the autoconfiguration if a valid 802.15.4 short address
>> is available for 802.15.4 6LoWPAN interfaces.
>>
>> Cc: David S. Miller
>> Cc: Alexey Kuznetsov
>> Cc: James Morris
>> Cc: Hideaki YOSHIFUJI
>> Cc: Patrick McHardy
>> Signed-off-by: Alexander Aring
>> ---
>>   include/net/addrconf.h |  3 +++
>>   net/6lowpan/core.c | 46 ++
>>   net/ipv6/addrconf.c|  5 +++--
>>   3 files changed, 52 insertions(+), 2 deletions(-)
>>
>> diff --git a/include/net/addrconf.h b/include/net/addrconf.h
>> index 730d856..b1774eb 100644
>> --- a/include/net/addrconf.h
>> +++ b/include/net/addrconf.h
>> @@ -94,6 +94,9 @@ int ipv6_rcv_saddr_equal(const struct sock *sk, const 
>> struct sock *sk2,
>>   void addrconf_join_solict(struct net_device *dev, const struct in6_addr 
>> *addr);
>>   void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr 
>> *addr);
>>   +void addrconf_add_linklocal(struct inet6_dev *idev,
>> +const struct in6_addr *addr, u32 flags);
>> +
>>   static inline int addrconf_ifid_eui48(u8 *eui, struct net_device *dev)
>>   {
>>   if (dev->addr_len != ETH_ALEN)
>> diff --git a/net/6lowpan/core.c b/net/6lowpan/core.c
>> index 7a240b3..fbae31e 100644
>> --- a/net/6lowpan/core.c
>> +++ b/net/6lowpan/core.c
>> @@ -14,6 +14,7 @@
>>   #include 
>> #include 
>> +#include 
>> #include "6lowpan_i.h"
>>   @@ -72,16 +73,61 @@ void lowpan_unregister_netdev(struct net_device *dev)
>>   }
>>   EXPORT_SYMBOL(lowpan_unregister_netdev);
>>   +static int addrconf_ifid_802154_6lowpan(u8 *eui, struct net_device *dev)
>> +{
>> +struct wpan_dev *wpan_dev = 
>> lowpan_802154_dev(dev)->wdev->ieee802154_ptr;
>> +
>> +/* Set short_addr autoconfiguration if short_addr is present only */
>> +if (!ieee802154_is_valid_src_short_addr(wpan_dev->short_addr))
>> +return -1;
> 
> -EINVAL instead of -1?
> 

The ipv6 implementation does the same for the "dev->addr" slaac.
I think the reason is that, if this fails for any reason, it is
simply not an error: we just don't add a link-local for short
addr slaac here and don't completely abort the CHANGE/UP of the interface.

IPv6 also uses a "-1" return value only if parsing fails. I would like to see
that for the iphc stuff as well, where we mixed a lot of "-EIO" and "-EINVAL";
anyway, nobody will really care about that return value.

It just means: 0 parsing successful and -1 parsing failed.

- Alex

Re: [PATCH v9 net-next 4/7] openvswitch: add layer 3 flow/port support

2016-05-06 Thread Jiri Benc

On Fri, 6 May 2016 14:57:07 +0900, Simon Horman wrote:
> On Thu, May 05, 2016 at 10:37:08AM -0700, pravin shelar wrote:
> > On transmit side you are using mac_len to detect l3 packet, why not do
> > same while extracting the key?

I agree. The skb should be self-contained, i.e. it should be obvious
whether it has the MAC header set or not just from the skb itself at
any point in the packet processing. Otherwise, I'd expect things like
recirculation to break after push/pop of eth header.

> Unfortunately mac_len can't be relied on here, empirically it has the same
> value (28 in my tests) for both the TEB and layer3 case above.

That's strange, it looks like there's something setting the mac header
unconditionally in ovs code. We should find that place and change it.

The ARPHRD_NONE interfaces don't even set mac_header and mac_len, this
will need to be set by ovs upon getting frame from such interface. 

> Perhaps that could be changed by further enhancements in the tunneling code
> but I think things are symmetric as they stand:
> 
> * On receive skb->protocol can be read to distinguish TEB and layer3 packets
> * On transmit skb->protocol should be set to distinguish TEB and layer3 
> packets

Yes, but you need to act upon this directly after receiving the
frame/just before sending the frame and set up an internal flag that
will be used throughout the code. That way, the packet can be handed
over to different parts of the code, recirculated, etc. without
worries. skb->mac_len is probably a good candidate for such flag.

 Jiri

[patch net 1/3] mlxsw: spectrum: Fix rollback order in LAG join failure

2016-05-06 Thread Jiri Pirko

From: Ido Schimmel 

Make the leave procedure in the error path symmetric to the join
procedure and first remove the port from the collector before
potentially destroying the LAG.

Fixes: 0d65fc13042f ("mlxsw: spectrum: Implement LAG port join/leave")
Signed-off-by: Ido Schimmel 
Signed-off-by: Jiri Pirko 
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c 
b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 4afbc3e..668b2f4 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -2541,11 +2541,11 @@ static int mlxsw_sp_port_lag_join(struct mlxsw_sp_port 
*mlxsw_sp_port,
lag->ref_count++;
return 0;
 
+err_col_port_enable:
+   mlxsw_sp_lag_col_port_remove(mlxsw_sp_port, lag_id);
 err_col_port_add:
if (!lag->ref_count)
mlxsw_sp_lag_destroy(mlxsw_sp, lag_id);
-err_col_port_enable:
-   mlxsw_sp_lag_col_port_remove(mlxsw_sp_port, lag_id);
return err;
 }
 
-- 
2.5.5

[patch net 0/3] mlxsw: Couple of fixes

2016-05-06 Thread Jiri Pirko

From: Jiri Pirko 

Ido Schimmel (2):
  mlxsw: spectrum: Fix rollback order in LAG join failure
  mlxsw: spectrum: Add missing rollback in flood configuration

Jiri Pirko (1):
  mlxsw: spectrum: Fix ordering in mlxsw_sp_fini

 drivers/net/ethernet/mellanox/mlxsw/spectrum.c   | 5 +++--
 drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c | 8 
 2 files changed, 11 insertions(+), 2 deletions(-)

-- 
2.5.5

[patch net 2/3] mlxsw: spectrum: Add missing rollback in flood configuration

2016-05-06 Thread Jiri Pirko

From: Ido Schimmel 

When we fail to set the flooding configuration for the broadcast and
unregistered multicast traffic, we should revert the flooding
configuration of the unknown unicast traffic.

Fixes: 0293038e0c36 ("mlxsw: spectrum: Add support for flood control")
Signed-off-by: Ido Schimmel 
Signed-off-by: Jiri Pirko 
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c 
b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
index e1c74ef..9cd6f47 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
@@ -214,7 +214,15 @@ static int __mlxsw_sp_port_flood_set(struct mlxsw_sp_port 
*mlxsw_sp_port,
mlxsw_reg_sftr_pack(sftr_pl, MLXSW_SP_FLOOD_TABLE_BM, idx_begin,
table_type, range, local_port, set);
err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sftr), sftr_pl);
+   if (err)
+   goto err_flood_bm_set;
+   else
+   goto buffer_out;
 
+err_flood_bm_set:
+   mlxsw_reg_sftr_pack(sftr_pl, MLXSW_SP_FLOOD_TABLE_UC, idx_begin,
+   table_type, range, local_port, !set);
+   mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sftr), sftr_pl);
 buffer_out:
kfree(sftr_pl);
return err;
-- 
2.5.5

[patch net 3/3] mlxsw: spectrum: Fix ordering in mlxsw_sp_fini

2016-05-06 Thread Jiri Pirko

From: Jiri Pirko 

Fixes: 0f433fa0ec ("mlxsw: spectrum_buffers: Implement shared buffer 
configuration")
Signed-off-by: Jiri Pirko 
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c 
b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 668b2f4..749cc27 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -2159,6 +2159,7 @@ static void mlxsw_sp_fini(void *priv)
struct mlxsw_sp *mlxsw_sp = priv;
 
mlxsw_sp_switchdev_fini(mlxsw_sp);
+   mlxsw_sp_buffers_fini(mlxsw_sp);
mlxsw_sp_traps_fini(mlxsw_sp);
mlxsw_sp_event_unregister(mlxsw_sp, MLXSW_TRAP_ID_PUDE);
mlxsw_sp_ports_remove(mlxsw_sp);
-- 
2.5.5

Re: [PATCH v9 net-next 7/7] openvswitch: use ipgre tunnel rather than gretap tunnel

2016-05-06 Thread Jiri Benc

On Fri, 6 May 2016 15:54:02 +0900, Simon Horman wrote:
> -int ovs_netdev_send_raw_tun(struct sk_buff *skb)
> -{
> - if (skb->mac_len)
> - skb->protocol = ntohs(ETH_P_TEB);
> + if (dev->type != ARPHRD_ETHER && skb->mac_len) {
> + skb->protocol = htons(ETH_P_TEB);
> + } else if (dev->type == ARPHRD_ETHER && !skb->mac_len) {
> + kfree_skb(skb);
> + return -EINVAL;
> + }

This was something I was missing in your patches (sorry, did not get to
the full review yet).

You'll also need to enable ARPHRD_NONE and ARPHRD_IPGRE interfaces in
ovs_netdev_link.

 Jiri

Re: [PATCH] cfg80211/nl80211: add wifi tx power mode switching support

2016-05-06 Thread Wei-Ning Huang

On Fri, May 6, 2016 at 12:07 AM, Dan Williams  wrote:
>
> On Thu, 2016-05-05 at 14:44 +0800, Wei-Ning Huang wrote:
> > Recent new hardware has the ability to switch between tablet mode and
> > clamshell mode. To optimize WiFi performance, we want to be able to
> > use
> > different power table between modes. This patch adds a new netlink
> > message type and cfg80211_ops function to allow userspace to trigger
> > a
> > power mode switch for a given wireless interface.
> >
> > Signed-off-by: Wei-Ning Huang 
> > ---
> >  include/net/cfg80211.h   | 11 +++
> >  include/uapi/linux/nl80211.h | 21 +
> >  net/wireless/nl80211.c   | 16 
> >  net/wireless/rdev-ops.h  | 22 ++
> >  net/wireless/trace.h | 20 
> >  5 files changed, 90 insertions(+)
> >
> > diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
> > index 9e1b24c..aa77fa0 100644
> > --- a/include/net/cfg80211.h
> > +++ b/include/net/cfg80211.h
> > @@ -2370,6 +2370,12 @@ struct cfg80211_qos_map {
> >   * @get_tx_power: store the current TX power into the dbm variable;
> >   *   return 0 if successful
> >   *
> > + * @set_tx_power_mode: set the transmit power mode. Some device have
> > the ability
> > + *   to transform between different mode such as clamshell and
> > tablet mode.
> > + *   set_tx_power_mode allows setting of different TX power
> > mode at runtime.
> > + * @get_tx_power_mode: store the current TX power mode into the mode
> > variable;
> > + *   return 0 if successful
> > + *
> >   * @set_wds_peer: set the WDS peer for a WDS interface
> >   *
> >   * @rfkill_poll: polls the hw rfkill line, use cfg80211 reporting
> > @@ -2631,6 +2637,11 @@ struct cfg80211_ops {
> >   int (*get_tx_power)(struct wiphy *wiphy, struct
> > wireless_dev *wdev,
> >   int *dbm);
> >
> > + int (*set_tx_power_mode)(struct wiphy *wiphy,
> > +  enum nl80211_tx_power_mode
> > mode);
> > + int (*get_tx_power_mode)(struct wiphy *wiphy,
> > +  enum nl80211_tx_power_mode
> > *mode);
> > +
> >   int (*set_wds_peer)(struct wiphy *wiphy, struct
> > net_device *dev,
> >   const u8 *addr);
> >
> > diff --git a/include/uapi/linux/nl80211.h
> > b/include/uapi/linux/nl80211.h
> > index 5a30a75..9b1888a 100644
> > --- a/include/uapi/linux/nl80211.h
> > +++ b/include/uapi/linux/nl80211.h
> > @@ -1796,6 +1796,9 @@ enum nl80211_commands {
> >   *   connecting to a PCP, and in %NL80211_CMD_START_AP to start
> >   *   a PCP instead of AP. Relevant for DMG networks only.
> >   *
> > + * @NL80211_ATTR_WIPHY_TX_POWER_MODE: Transmit power mode. See
> > + *  &enum nl80211_tx_power_mode for possible values.
> > + *
> >   * @NUM_NL80211_ATTR: total number of nl80211_attrs available
> >   * @NL80211_ATTR_MAX: highest attribute number currently defined
> >   * @__NL80211_ATTR_AFTER_LAST: internal use
> > @@ -2172,6 +2175,8 @@ enum nl80211_attrs {
> >
> >   NL80211_ATTR_PBSS,
> >
> > + NL80211_ATTR_WIPHY_TX_POWER_MODE,
> > +
> >   /* add attributes here, update the policy in nl80211.c */
> >
> >   __NL80211_ATTR_AFTER_LAST,
> > @@ -3703,6 +3708,22 @@ enum nl80211_tx_power_setting {
> >  };
> >
> >  /**
> > + * enum nl80211_tx_power_mode - TX power mode setting
> > + * @NL80211_TX_POWER_LOW: general low TX power mode
> > + * @NL80211_TX_POWER_MEDIUM: general medium TX power mode
> > + * @NL80211_TX_POWER_HIGH: general high TX power mode
> > + * @NL80211_TX_POWER_CLAMSHELL: clamshell mode TX power mode
> > + * @NL80211_TX_POWER_TABLET: tablet mode TX power mode
> > + */
> > +enum nl80211_tx_power_mode {
> > + NL80211_TX_POWER_LOW,
> > + NL80211_TX_POWER_MEDIUM,
> > + NL80211_TX_POWER_HIGH,
> > + NL80211_TX_POWER_CLAMSHELL,
> > + NL80211_TX_POWER_TABLET,
>

> "clamshell" and "tablet" probably mean many different things to many
> different people with respect to whether or not they should do anything
> with power saving or wifi.  I feel like a more generic interface is
> needed here.
We could probably drop the two CLAMSHELL and TABLET constants, or
describe what they mean
in more detail?

>
> Could this be already done by:
> @NL80211_ATTR_WIPHY_TX_POWER_SETTING = NL80211_TX_POWER_LIMITED
> @NL80211_ATTR_WIPHY_TX_POWER_LEVEL = 
>
> and then the device would be able to change its TX power as it saw fit
> up to that limit set by your application which figures out whether it's
> in clamshell or tablet mode?

We usually want different power settings in different band/channel.
For example, we can have three different power settings
in 2.4Ghz, channels 36-64 & channels 100+ on 5Ghz. In this case, we
can not simply set a fixed number to the power level.
A power table is required to map channel/band to actual power limit.
For most of the driver, changing power table requires loading
a new set of calibration data f

Re: [PATCH net-next] vxlan: if_arp: introduce ARPHRD_VXLANGPE

2016-05-06 Thread Simon Horman

On Thu, May 05, 2016 at 04:56:43PM -0300, Thadeu Lima de Souza Cascardo wrote:
> On Thu, May 05, 2016 at 09:31:41PM +0200, Jiri Benc wrote:
> > On Thu,  5 May 2016 13:36:44 -0300, Thadeu Lima de Souza Cascardo wrote:
> > > Use ARPHRD_VXLANGPE to identify VxLAN GPE interfaces. This is going to be 
> > > used
> > > to allow GPE interfaces to be added as openvswitch ports.
> > 
> > What's wrong with ARPHRD_NONE? I don't think we need a separate type
> > for VXLAN-GPE. Just use ARPHRD_NONE in ovs and things should work, for
> > all ARPHRD_NONE interfaces as a bonus.
> > 
> 
> That's fine for me. I looked quickly at the few devices using ARPHRD_NONE in
> upstream kernel, not sure if there are broken out-of-tree drivers out there. 
> And
> should we care?

It seems unlikely to me that we should.

[GIT PULL nf-next 0/1] Second Round of IPVS Updates for v4.7

2016-05-06 Thread Simon Horman

Hi Pablo,

please consider these enhancements to IPVS. They allow its
DoS mitigation strategy to be effective in conjunction with the SIP persistence
engine.

The following changes since commit cb39ad8b8ef224c544074962780bf763077d6141:

  netfilter: nf_tables: allow set names up to 32 bytes (2016-05-05 16:39:51 
+0200)

are available in the git repository at:

  https://git.kernel.org/pub/scm/linux/kernel/git/horms/ipvs-next.git 
tags/ipvs2-for-v4.7

for you to fetch changes up to 698e2a8dca98e4de32f3f630e6d9cd93753c52e1:

  ipvs: make drop_entry protection effective for SIP-pe (2016-05-06 16:26:23 
+0900)


Marco Angaroni (1):
  ipvs: make drop_entry protection effective for SIP-pe

 net/netfilter/ipvs/ip_vs_conn.c | 22 +++---
 net/netfilter/ipvs/ip_vs_core.c |  8 +++-
 2 files changed, 26 insertions(+), 4 deletions(-)

Marco Angaroni (1):
  ipvs: make drop_entry protection effective for SIP-pe

 net/netfilter/ipvs/ip_vs_conn.c | 22 +++---
 net/netfilter/ipvs/ip_vs_core.c |  8 +++-
 2 files changed, 26 insertions(+), 4 deletions(-)

-- 
2.7.0.rc3.207.g0ac5344

[PATCH nf-next 1/1] ipvs: make drop_entry protection effective for SIP-pe

2016-05-06 Thread Simon Horman

From: Marco Angaroni 

DoS protection policy that deletes connections to avoid out of memory is
currently not effective for SIP-pe plus OPS-mode for two reasons:
  1) connection templates (holding SIP call-id) are always skipped in
 ip_vs_random_dropentry()
  2) in_pkts counter (used by drop_entry algorithm) is not incremented
 for connection templates

This patch addresses such problems with the following changes:
  a) connection templates associated (via their dest) to virtual-services
 configured in OPS mode are included in ip_vs_random_dropentry()
 monitoring. This applies to SIP-pe over UDP (which requires OPS mode),
 but is more general principle: when OPS is controlled by templates
 memory can be used only by templates themselves, since OPS conns are
 deleted after packet is forwarded.
  b) OPS connections, if controlled by a template, cause increment of
 in_pkts counter of their template. This is already happening but only
 in case director is in master-slave mode (see ip_vs_sync_conn()).

Signed-off-by: Marco Angaroni 
Acked-by: Julian Anastasov 
Signed-off-by: Simon Horman 
---
 net/netfilter/ipvs/ip_vs_conn.c | 22 +++---
 net/netfilter/ipvs/ip_vs_core.c |  8 +++-
 2 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 292365ffa4f0..2cb3c626cd43 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -1261,6 +1261,16 @@ static inline int todrop_entry(struct ip_vs_conn *cp)
return 1;
 }
 
+static inline bool ip_vs_conn_ops_mode(struct ip_vs_conn *cp)
+{
+   struct ip_vs_service *svc;
+
+   if (!cp->dest)
+   return false;
+   svc = rcu_dereference(cp->dest->svc);
+   return svc && (svc->flags & IP_VS_SVC_F_ONEPACKET);
+}
+
 /* Called from keventd and must protect itself from softirqs */
 void ip_vs_random_dropentry(struct netns_ipvs *ipvs)
 {
@@ -1275,11 +1285,16 @@ void ip_vs_random_dropentry(struct netns_ipvs *ipvs)
unsigned int hash = prandom_u32() & ip_vs_conn_tab_mask;
 
hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
-   if (cp->flags & IP_VS_CONN_F_TEMPLATE)
-   /* connection template */
-   continue;
if (cp->ipvs != ipvs)
continue;
+   if (cp->flags & IP_VS_CONN_F_TEMPLATE) {
+   if (atomic_read(&cp->n_control) ||
+   !ip_vs_conn_ops_mode(cp))
+   continue;
+   else
+   /* connection template of OPS */
+   goto try_drop;
+   }
if (cp->protocol == IPPROTO_TCP) {
switch(cp->state) {
case IP_VS_TCP_S_SYN_RECV:
@@ -1307,6 +1322,7 @@ void ip_vs_random_dropentry(struct netns_ipvs *ipvs)
continue;
}
} else {
+try_drop:
if (!todrop_entry(cp))
continue;
}
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index f3bac2e9a25a..1207f20d24e4 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -612,7 +612,10 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff 
*skb,
ret = cp->packet_xmit(skb, cp, pd->pp, iph);
/* do not touch skb anymore */
 
-   atomic_inc(&cp->in_pkts);
+   if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control)
+   atomic_inc(&cp->control->in_pkts);
+   else
+   atomic_inc(&cp->in_pkts);
ip_vs_conn_put(cp);
return ret;
}
@@ -1991,6 +1994,9 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, 
struct sk_buff *skb, int
 
if (ipvs->sync_state & IP_VS_STATE_MASTER)
ip_vs_sync_conn(ipvs, cp, pkts);
+   else if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control)
+   /* increment is done inside ip_vs_sync_conn too */
+   atomic_inc(&cp->control->in_pkts);
 
ip_vs_conn_put(cp);
return ret;
-- 
2.7.0.rc3.207.g0ac5344

[net-next 05/11] i40evf: Drop packet split receive routine

2016-05-06 Thread Jeff Kirsher

From: Jesse Brandeburg 

As part of preparation for the rx-refactor, remove the
packet split receive routine and ancillary code.

Some of the split related context set up code stays in
i40e_virtchnl_pf.c in case an older VF driver tries to load
and still wants to use packet split.

Signed-off-by: Jesse Brandeburg 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c |  2 +-
 drivers/net/ethernet/intel/i40evf/i40e_txrx.c  | 21 +---
 drivers/net/ethernet/intel/i40evf/i40e_txrx.h  |  7 --
 drivers/net/ethernet/intel/i40evf/i40evf.h |  2 --
 drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c | 14 ---
 drivers/net/ethernet/intel/i40evf/i40evf_main.c| 28 +-
 .../net/ethernet/intel/i40evf/i40evf_virtchnl.c|  4 
 7 files changed, 3 insertions(+), 75 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c 
b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
index 6b9db79..36aa33a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
@@ -590,7 +590,7 @@ static int i40e_config_vsi_rx_queue(struct i40e_vf *vf, u16 
vsi_id,
}
rx_ctx.hbuff = info->hdr_size >> I40E_RXQ_CTX_HBUFF_SHIFT;
 
-   /* set splitalways mode 10b */
+   /* set split mode 10b */
rx_ctx.dtype = I40E_RX_DTYPE_HEADER_SPLIT;
}
 
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c 
b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
index a37a3f3..61d4a7a 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
@@ -504,22 +504,6 @@ void i40evf_clean_rx_ring(struct i40e_ring *rx_ring)
if (!rx_ring->rx_bi)
return;
 
-   if (ring_is_ps_enabled(rx_ring)) {
-   int bufsz = ALIGN(rx_ring->rx_hdr_len, 256) * rx_ring->count;
-
-   rx_bi = &rx_ring->rx_bi[0];
-   if (rx_bi->hdr_buf) {
-   dma_free_coherent(dev,
- bufsz,
- rx_bi->hdr_buf,
- rx_bi->dma);
-   for (i = 0; i < rx_ring->count; i++) {
-   rx_bi = &rx_ring->rx_bi[i];
-   rx_bi->dma = 0;
-   rx_bi->hdr_buf = NULL;
-   }
-   }
-   }
/* Free all the Rx ring sk_buffs */
for (i = 0; i < rx_ring->count; i++) {
rx_bi = &rx_ring->rx_bi[i];
@@ -1435,10 +1419,7 @@ int i40evf_napi_poll(struct napi_struct *napi, int 
budget)
i40e_for_each_ring(ring, q_vector->rx) {
int cleaned;
 
-   if (ring_is_ps_enabled(ring))
-   cleaned = i40e_clean_rx_irq_ps(ring, budget_per_ring);
-   else
-   cleaned = i40e_clean_rx_irq_1buf(ring, budget_per_ring);
+   cleaned = i40e_clean_rx_irq_1buf(ring, budget_per_ring);
 
work_done += cleaned;
/* if we clean as many as budgeted, we must not be done */
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.h 
b/drivers/net/ethernet/intel/i40evf/i40e_txrx.h
index 3b3f976..f24a97e 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.h
+++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.h
@@ -244,16 +244,9 @@ struct i40e_rx_queue_stats {
 enum i40e_ring_state_t {
__I40E_TX_FDIR_INIT_DONE,
__I40E_TX_XPS_INIT_DONE,
-   __I40E_RX_PS_ENABLED,
__I40E_RX_16BYTE_DESC_ENABLED,
 };
 
-#define ring_is_ps_enabled(ring) \
-   test_bit(__I40E_RX_PS_ENABLED, &(ring)->state)
-#define set_ring_ps_enabled(ring) \
-   set_bit(__I40E_RX_PS_ENABLED, &(ring)->state)
-#define clear_ring_ps_enabled(ring) \
-   clear_bit(__I40E_RX_PS_ENABLED, &(ring)->state)
 #define ring_is_16byte_desc_enabled(ring) \
test_bit(__I40E_RX_16BYTE_DESC_ENABLED, &(ring)->state)
 #define set_ring_16byte_desc_enabled(ring) \
diff --git a/drivers/net/ethernet/intel/i40evf/i40evf.h 
b/drivers/net/ethernet/intel/i40evf/i40evf.h
index 25afabf..83ccc58 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf.h
+++ b/drivers/net/ethernet/intel/i40evf/i40evf.h
@@ -209,8 +209,6 @@ struct i40evf_adapter {
u32 flags;
 #define I40EVF_FLAG_RX_CSUM_ENABLED  BIT(0)
 #define I40EVF_FLAG_RX_1BUF_CAPABLE  BIT(1)
-#define I40EVF_FLAG_RX_PS_CAPABLEBIT(2)
-#define I40EVF_FLAG_RX_PS_ENABLEDBIT(3)
 #define I40EVF_FLAG_IMIR_ENABLED BIT(5)
 #define I40EVF_FLAG_MQ_CAPABLE   BIT(6)
 #define I40EVF_FLAG_NEED_LINK_UPDATE BIT(7)
diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c 
b/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c
index 5a48ee0..e97

[net-next 06/11] i40evf: refactor receive routine

2016-05-06 Thread Jeff Kirsher

From: Jesse Brandeburg 

This is part 2 of the Rx refactor series, just including
changes to i40evf.

This refactor aligns the receive routine with the one in
ixgbe which was highly optimized.  This reduces the code
we have to maintain and allows for a (hopefully) more readable
and maintainable RX hot path.

In order to do this:
- consolidate the receive path into a single function that doesn't
  use packet split but *does* use pages for Rx buffers.
- remove the old _1buf routine
- consolidate several routines into helper functions
- remove VF ethtool control over packet split
- remove priv_flags interface since it is unused

Signed-off-by: Jesse Brandeburg 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40evf/i40e_txrx.c  | 898 +++--
 drivers/net/ethernet/intel/i40evf/i40e_txrx.h  |  37 +-
 drivers/net/ethernet/intel/i40evf/i40evf.h |   5 -
 drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c |  51 --
 drivers/net/ethernet/intel/i40evf/i40evf_main.c|   3 +-
 5 files changed, 481 insertions(+), 513 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c 
b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
index 61d4a7a..fd7dae46 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
@@ -496,7 +496,6 @@ err:
 void i40evf_clean_rx_ring(struct i40e_ring *rx_ring)
 {
struct device *dev = rx_ring->dev;
-   struct i40e_rx_buffer *rx_bi;
unsigned long bi_size;
u16 i;
 
@@ -506,30 +505,20 @@ void i40evf_clean_rx_ring(struct i40e_ring *rx_ring)
 
/* Free all the Rx ring sk_buffs */
for (i = 0; i < rx_ring->count; i++) {
-   rx_bi = &rx_ring->rx_bi[i];
-   if (rx_bi->dma) {
-   dma_unmap_single(dev,
-rx_bi->dma,
-rx_ring->rx_buf_len,
-DMA_FROM_DEVICE);
-   rx_bi->dma = 0;
-   }
+   struct i40e_rx_buffer *rx_bi = &rx_ring->rx_bi[i];
+
if (rx_bi->skb) {
dev_kfree_skb(rx_bi->skb);
rx_bi->skb = NULL;
}
-   if (rx_bi->page) {
-   if (rx_bi->page_dma) {
-   dma_unmap_page(dev,
-  rx_bi->page_dma,
-  PAGE_SIZE,
-  DMA_FROM_DEVICE);
-   rx_bi->page_dma = 0;
-   }
-   __free_page(rx_bi->page);
-   rx_bi->page = NULL;
-   rx_bi->page_offset = 0;
-   }
+   if (!rx_bi->page)
+   continue;
+
+   dma_unmap_page(dev, rx_bi->dma, PAGE_SIZE, DMA_FROM_DEVICE);
+   __free_pages(rx_bi->page, 0);
+
+   rx_bi->page = NULL;
+   rx_bi->page_offset = 0;
}
 
bi_size = sizeof(struct i40e_rx_buffer) * rx_ring->count;
@@ -538,6 +527,7 @@ void i40evf_clean_rx_ring(struct i40e_ring *rx_ring)
/* Zero out the descriptor ring */
memset(rx_ring->desc, 0, rx_ring->size);
 
+   rx_ring->next_to_alloc = 0;
rx_ring->next_to_clean = 0;
rx_ring->next_to_use = 0;
 }
@@ -562,37 +552,6 @@ void i40evf_free_rx_resources(struct i40e_ring *rx_ring)
 }
 
 /**
- * i40evf_alloc_rx_headers - allocate rx header buffers
- * @rx_ring: ring to alloc buffers
- *
- * Allocate rx header buffers for the entire ring. As these are static,
- * this is only called when setting up a new ring.
- **/
-void i40evf_alloc_rx_headers(struct i40e_ring *rx_ring)
-{
-   struct device *dev = rx_ring->dev;
-   struct i40e_rx_buffer *rx_bi;
-   dma_addr_t dma;
-   void *buffer;
-   int buf_size;
-   int i;
-
-   if (rx_ring->rx_bi[0].hdr_buf)
-   return;
-   /* Make sure the buffers don't cross cache line boundaries. */
-   buf_size = ALIGN(rx_ring->rx_hdr_len, 256);
-   buffer = dma_alloc_coherent(dev, buf_size * rx_ring->count,
-   &dma, GFP_KERNEL);
-   if (!buffer)
-   return;
-   for (i = 0; i < rx_ring->count; i++) {
-   rx_bi = &rx_ring->rx_bi[i];
-   rx_bi->dma = dma + (i * buf_size);
-   rx_bi->hdr_buf = buffer + (i * buf_size);
-   }
-}
-
-/**
  * i40evf_setup_rx_descriptors - Allocate Rx descriptors
  * @rx_ring: Rx descriptor ring (for a specific queue) to setup
  *
@@ -613,9 +572,7 @@ int i40evf_setup_rx_descriptors(struct i40e_ring *rx_ring)
u64_stats_init(&rx_ring->syncp);
 
/* Round up to nearest 4K */
-   rx_ring->size = ring_is_16byte_desc_enabled(rx_ring)
-   ? rx_ring->count * sizeof(union i40e_16b

[net-next 08/11] i40evf: Allocate Rx buffers properly

2016-05-06 Thread Jeff Kirsher

From: Mitch Williams 

Allocate the correct number of RX buffers, and don't fiddle with
next_to_use. The common RX code handles all of this. This fixes a memory
leak of one page each time the driver is opened.

Change-Id: Id06eca353086e084921f047acad28c14745684ee
Signed-off-by: Mitch Williams 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40evf/i40evf_main.c | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c 
b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
index 870bad8..b548dbe 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c
+++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
@@ -990,9 +990,7 @@ static void i40evf_configure(struct i40evf_adapter *adapter)
for (i = 0; i < adapter->num_active_queues; i++) {
struct i40e_ring *ring = &adapter->rx_rings[i];
 
-   i40evf_alloc_rx_buffers(ring, ring->count);
-   ring->next_to_use = ring->count - 1;
-   writel(ring->next_to_use, ring->tail);
+   i40evf_alloc_rx_buffers(ring, I40E_DESC_UNUSED(ring));
}
 }
 
@@ -2768,7 +2766,6 @@ static void i40evf_remove(struct pci_dev *pdev)
 
iounmap(hw->hw_addr);
pci_release_regions(pdev);
-
i40evf_free_all_tx_resources(adapter);
i40evf_free_all_rx_resources(adapter);
i40evf_free_queues(adapter);
-- 
2.5.5

[net-next 07/11] i40e/i40evf: Remove unused hardware receive descriptor code

2016-05-06 Thread Jeff Kirsher

From: Jesse Brandeburg 

The hardware supports a 16 byte descriptor for receive, but the
driver was never using it in production.  There was no performance
benefit to the real driver of 16 byte descriptors, so drop a whole
lot of complexity while getting rid of the code.

Also since the previous patch made us use no-split mode all the
time, drop any support in the driver for any other value in dtype
and assume it is always zero (aka no-split).

Hooray for code removal!

Change-ID: I2257e902e4dad84a07b94db6d2e6f4ce69b27bc0
Signed-off-by: Jesse Brandeburg 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e.h |  7 +--
 drivers/net/ethernet/intel/i40e/i40e_debugfs.c | 16 +---
 drivers/net/ethernet/intel/i40e/i40e_main.c| 18 +-
 drivers/net/ethernet/intel/i40e/i40e_txrx.h| 24 ++--
 drivers/net/ethernet/intel/i40evf/i40e_txrx.h  | 24 ++--
 5 files changed, 27 insertions(+), 62 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h 
b/drivers/net/ethernet/intel/i40e/i40e.h
index ebf423b..2a6a5d3 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -122,10 +122,7 @@
 #define XSTRINGIFY(bar) STRINGIFY(bar)
 
 #define I40E_RX_DESC(R, i) \
-   ((ring_is_16byte_desc_enabled(R))   \
-   ? (union i40e_32byte_rx_desc *) \
-   (&(((union i40e_16byte_rx_desc *)((R)->desc))[i])) \
-   : (&(((union i40e_32byte_rx_desc *)((R)->desc))[i])))
+   (&(((union i40e_32byte_rx_desc *)((R)->desc))[i]))
 #define I40E_TX_DESC(R, i) \
(&(((struct i40e_tx_desc *)((R)->desc))[i]))
 #define I40E_TX_CTXTDESC(R, i) \
@@ -327,7 +324,6 @@ struct i40e_pf {
 #ifdef I40E_FCOE
 #define I40E_FLAG_FCOE_ENABLED BIT_ULL(11)
 #endif /* I40E_FCOE */
-#define I40E_FLAG_16BYTE_RX_DESC_ENABLED   BIT_ULL(13)
 #define I40E_FLAG_CLEAN_ADMINQ BIT_ULL(14)
 #define I40E_FLAG_FILTER_SYNC  BIT_ULL(15)
 #define I40E_FLAG_SERVICE_CLIENT_REQUESTED BIT_ULL(16)
@@ -532,7 +528,6 @@ struct i40e_vsi {
 
u16 max_frame;
u16 rx_buf_len;
-   u8  dtype;
 
/* List of q_vectors allocated to this VSI */
struct i40e_q_vector **q_vectors;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c 
b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
index 8ae30f7..e6af8c8 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
@@ -361,7 +361,7 @@ static void i40e_dbg_dump_vsi_seid(struct i40e_pf *pf, int 
seid)
 vsi->work_limit);
dev_info(&pf->pdev->dev,
 "max_frame = %d, rx_buf_len = %d dtype = %d\n",
-vsi->max_frame, vsi->rx_buf_len, vsi->dtype);
+vsi->max_frame, vsi->rx_buf_len, 0);
dev_info(&pf->pdev->dev,
 "num_q_vectors = %i, base_vector = %i\n",
 vsi->num_q_vectors, vsi->base_vector);
@@ -586,13 +586,6 @@ static void i40e_dbg_dump_desc(int cnt, int vsi_seid, int 
ring_id, int desc_n,
 "   d[%03x] = 0x%016llx 0x%016llx\n",
 i, txd->buffer_addr,
 txd->cmd_type_offset_bsz);
-   } else if (sizeof(union i40e_rx_desc) ==
-  sizeof(union i40e_16byte_rx_desc)) {
-   rxd = I40E_RX_DESC(ring, i);
-   dev_info(&pf->pdev->dev,
-"   d[%03x] = 0x%016llx 0x%016llx\n",
-i, rxd->read.pkt_addr,
-rxd->read.hdr_addr);
} else {
rxd = I40E_RX_DESC(ring, i);
dev_info(&pf->pdev->dev,
@@ -614,13 +607,6 @@ static void i40e_dbg_dump_desc(int cnt, int vsi_seid, int 
ring_id, int desc_n,
 "vsi = %02i tx ring = %02i d[%03x] = 0x%016llx 
0x%016llx\n",
 vsi_seid, ring_id, desc_n,
 txd->buffer_addr, txd->cmd_type_offset_bsz);
-   } else if (sizeof(union i40e_rx_desc) ==
-  sizeof(union i40e_16byte_rx_desc)) {
-   rxd = I40E_RX_DESC(ring, desc_n);
-   dev_info(&pf->pdev->dev,
-"vsi = %02i rx ring = %02i d[%03x] = 0x%016llx 
0x%016llx\n",
-vsi_seid, ring_id, desc_n,
-rxd->read.pkt_addr, rxd->read.hdr_addr);
} else {
rxd = I40E_RX_DESC(ring, desc_n);
dev_info(&pf->pdev->dev,
diff --git a/dri

[net-next 02/11] i40e: Drop packet split receive routine

2016-05-06 Thread Jeff Kirsher

From: Jesse Brandeburg 

As part of preparation for the rx-refactor, remove the
packet split receive routine and ancillary code.

Signed-off-by: Jesse Brandeburg 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e.h |   3 -
 drivers/net/ethernet/intel/i40e/i40e_debugfs.c |   4 +-
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c |  19 --
 drivers/net/ethernet/intel/i40e/i40e_main.c|  49 +
 drivers/net/ethernet/intel/i40e/i40e_txrx.c| 245 +
 drivers/net/ethernet/intel/i40e/i40e_txrx.h|   7 -
 6 files changed, 10 insertions(+), 317 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h 
b/drivers/net/ethernet/intel/i40e/i40e.h
index 00c4738..ea6a69a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -101,7 +101,6 @@
 #define I40E_PRIV_FLAGS_LINKPOLL_FLAG  BIT(1)
 #define I40E_PRIV_FLAGS_FD_ATR BIT(2)
 #define I40E_PRIV_FLAGS_VEB_STATS  BIT(3)
-#define I40E_PRIV_FLAGS_PS BIT(4)
 #define I40E_PRIV_FLAGS_HW_ATR_EVICT   BIT(5)
 
 #define I40E_NVM_VERSION_LO_SHIFT  0
@@ -320,8 +319,6 @@ struct i40e_pf {
 #define I40E_FLAG_RX_CSUM_ENABLED  BIT_ULL(1)
 #define I40E_FLAG_MSI_ENABLED  BIT_ULL(2)
 #define I40E_FLAG_MSIX_ENABLED BIT_ULL(3)
-#define I40E_FLAG_RX_1BUF_ENABLED  BIT_ULL(4)
-#define I40E_FLAG_RX_PS_ENABLEDBIT_ULL(5)
 #define I40E_FLAG_RSS_ENABLED  BIT_ULL(6)
 #define I40E_FLAG_VMDQ_ENABLED BIT_ULL(7)
 #define I40E_FLAG_FDIR_REQUIRES_REINIT BIT_ULL(8)
diff --git a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c 
b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
index 83dccf1..f119a74 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
@@ -273,8 +273,8 @@ static void i40e_dbg_dump_vsi_seid(struct i40e_pf *pf, int 
seid)
 rx_ring->rx_buf_len,
 rx_ring->dtype);
dev_info(&pf->pdev->dev,
-"rx_rings[%i]: hsplit = %d, next_to_use = %d, 
next_to_clean = %d, ring_active = %i\n",
-i, ring_is_ps_enabled(rx_ring),
+"rx_rings[%i]: next_to_use = %d, next_to_clean = 
%d, ring_active = %i\n",
+i,
 rx_ring->next_to_use,
 rx_ring->next_to_clean,
 rx_ring->ring_active);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c 
b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index 8e56c43..858e169 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -2829,8 +2829,6 @@ static u32 i40e_get_priv_flags(struct net_device *dev)
I40E_PRIV_FLAGS_FD_ATR : 0;
ret_flags |= pf->flags & I40E_FLAG_VEB_STATS_ENABLED ?
I40E_PRIV_FLAGS_VEB_STATS : 0;
-   ret_flags |= pf->flags & I40E_FLAG_RX_PS_ENABLED ?
-   I40E_PRIV_FLAGS_PS : 0;
ret_flags |= pf->auto_disable_flags & I40E_FLAG_HW_ATR_EVICT_CAPABLE ?
0 : I40E_PRIV_FLAGS_HW_ATR_EVICT;
 
@@ -2851,23 +2849,6 @@ static int i40e_set_priv_flags(struct net_device *dev, 
u32 flags)
 
/* NOTE: MFP is not settable */
 
-   /* allow the user to control the method of receive
-* buffer DMA, whether the packet is split at header
-* boundaries into two separate buffers.  In some cases
-* one routine or the other will perform better.
-*/
-   if ((flags & I40E_PRIV_FLAGS_PS) &&
-   !(pf->flags & I40E_FLAG_RX_PS_ENABLED)) {
-   pf->flags |= I40E_FLAG_RX_PS_ENABLED;
-   pf->flags &= ~I40E_FLAG_RX_1BUF_ENABLED;
-   reset_required = true;
-   } else if (!(flags & I40E_PRIV_FLAGS_PS) &&
-  (pf->flags & I40E_FLAG_RX_PS_ENABLED)) {
-   pf->flags &= ~I40E_FLAG_RX_PS_ENABLED;
-   pf->flags |= I40E_FLAG_RX_1BUF_ENABLED;
-   reset_required = true;
-   }
-
if (flags & I40E_PRIV_FLAGS_LINKPOLL_FLAG)
pf->flags |= I40E_FLAG_LINK_POLLING_ENABLED;
else
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index f6da6b7..84e8d4e 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -2871,18 +2871,9 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring)
}
 
rx_ctx.dtype = vsi->dtype;
-   if (vsi->dtype) {
-   set_ring_ps_enabled(ring);
-   rx_ctx.hsplit_0 = I40E_RX_SPLIT_L2  |
- I40E_RX_SPLIT_IP  |
- I40E_RX_SPLIT_TCP_UDP |
- I40E_RX_SPLIT_SCTP;
-   } else {
-

[net-next 09/11] i40e: Test memory before ethtool alloc succeeds

2016-05-06 Thread Jeff Kirsher

From: Jesse Brandeburg 

When testing on systems with very limited amounts of RAM, a bug was
found where, while changing the number of descriptors using ethtool,
the driver didn't test the limits of system memory before permanently
assuming it would be able to get receive buffer memory.

Work around this issue by pre-allocation of the receive buffer
memory, in the "ghost" ring, which is then used during reinit
using the new ring length.

Change-Id: I92d7a5fb59a6c884b2efdd1ec652845f101c3359
Signed-off-by: Jesse Brandeburg 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 34 +++---
 1 file changed, 31 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c 
b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index 6fd730ac..51a994d 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -1274,6 +1274,13 @@ static int i40e_set_ringparam(struct net_device *netdev,
}
 
for (i = 0; i < vsi->num_queue_pairs; i++) {
+   /* this is to allow wr32 to have something to write to
+* during early allocation of Rx buffers
+*/
+   u32 __iomem faketail = 0;
+   struct i40e_ring *ring;
+   u16 unused;
+
/* clone ring and setup updated count */
rx_rings[i] = *vsi->rx_rings[i];
rx_rings[i].count = new_rx_count;
@@ -1282,12 +1289,22 @@ static int i40e_set_ringparam(struct net_device *netdev,
 */
rx_rings[i].desc = NULL;
rx_rings[i].rx_bi = NULL;
+   rx_rings[i].tail = (u8 __iomem *)&faketail;
err = i40e_setup_rx_descriptors(&rx_rings[i]);
+   if (err)
+   goto rx_unwind;
+
+   /* now allocate the Rx buffers to make sure the OS
+* has enough memory, any failure here means abort
+*/
+   ring = &rx_rings[i];
+   unused = I40E_DESC_UNUSED(ring);
+   err = i40e_alloc_rx_buffers(ring, unused);
+rx_unwind:
if (err) {
-   while (i) {
-   i--;
+   do {
i40e_free_rx_resources(&rx_rings[i]);
-   }
+   } while (i--);
kfree(rx_rings);
rx_rings = NULL;
 
@@ -1313,6 +1330,17 @@ static int i40e_set_ringparam(struct net_device *netdev,
if (rx_rings) {
for (i = 0; i < vsi->num_queue_pairs; i++) {
i40e_free_rx_resources(vsi->rx_rings[i]);
+   /* get the real tail offset */
+   rx_rings[i].tail = vsi->rx_rings[i]->tail;
+   /* this is to fake out the allocation routine
+* into thinking it has to realloc everything
+* but the recycling logic will let us re-use
+* the buffers allocated above
+*/
+   rx_rings[i].next_to_use = 0;
+   rx_rings[i].next_to_clean = 0;
+   rx_rings[i].next_to_alloc = 0;
+   /* do a struct copy */
*vsi->rx_rings[i] = rx_rings[i];
}
kfree(rx_rings);
-- 
2.5.5

[net-next 04/11] i40e: Refactor receive routine

2016-05-06 Thread Jeff Kirsher

From: Jesse Brandeburg 

This is part 1 of the Rx refactor series, just including
changes to i40e.

This refactor aligns the receive routine with the one in
ixgbe which was highly optimized.  This reduces the code
we have to maintain and allows for a (hopefully) more readable
and maintainable RX hot path.

In order to do this:
- consolidate the receive path into a single function that doesn't
  use packet split but *does* use pages for Rx buffers.
- remove the old _1buf routine
- consolidate several routines into helper functions
- remove ethtool control over packet split

Change-ID: I5ca100721de65992aa0114f8b4bac844b84758e0
Signed-off-by: Jesse Brandeburg 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e.h |   1 -
 drivers/net/ethernet/intel/i40e/i40e_debugfs.c |   9 +-
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c |   1 -
 drivers/net/ethernet/intel/i40e/i40e_main.c|  16 +-
 drivers/net/ethernet/intel/i40e/i40e_txrx.c| 770 -
 drivers/net/ethernet/intel/i40e/i40e_txrx.h|  37 +-
 6 files changed, 531 insertions(+), 303 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h 
b/drivers/net/ethernet/intel/i40e/i40e.h
index ea6a69a..ebf423b 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -531,7 +531,6 @@ struct i40e_vsi {
u8  *rss_lut_user;  /* User configured lookup table entries */
 
u16 max_frame;
-   u16 rx_hdr_len;
u16 rx_buf_len;
u8  dtype;
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c 
b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
index c0a01e0..8ae30f7 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c
@@ -268,9 +268,8 @@ static void i40e_dbg_dump_vsi_seid(struct i40e_pf *pf, int 
seid)
 rx_ring->queue_index,
 rx_ring->reg_idx);
dev_info(&pf->pdev->dev,
-"rx_rings[%i]: rx_hdr_len = %d, rx_buf_len = %d\n",
-i, rx_ring->rx_hdr_len,
-rx_ring->rx_buf_len);
+"rx_rings[%i]: rx_buf_len = %d\n",
+i, rx_ring->rx_buf_len);
dev_info(&pf->pdev->dev,
 "rx_rings[%i]: next_to_use = %d, next_to_clean = 
%d, ring_active = %i\n",
 i,
@@ -361,8 +360,8 @@ static void i40e_dbg_dump_vsi_seid(struct i40e_pf *pf, int 
seid)
 "work_limit = %d\n",
 vsi->work_limit);
dev_info(&pf->pdev->dev,
-"max_frame = %d, rx_hdr_len = %d, rx_buf_len = %d dtype = 
%d\n",
-vsi->max_frame, vsi->rx_hdr_len, vsi->rx_buf_len, vsi->dtype);
+"max_frame = %d, rx_buf_len = %d dtype = %d\n",
+vsi->max_frame, vsi->rx_buf_len, vsi->dtype);
dev_info(&pf->pdev->dev,
 "num_q_vectors = %i, base_vector = %i\n",
 vsi->num_q_vectors, vsi->base_vector);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c 
b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index 858e169..6fd730ac 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -235,7 +235,6 @@ static const char 
i40e_priv_flags_strings[][ETH_GSTRING_LEN] = {
"LinkPolling",
"flow-director-atr",
"veb-stats",
-   "packet-split",
"hw-atr-eviction",
 };
 
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 84e8d4e..e466111 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -2855,10 +2855,8 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring)
memset(&rx_ctx, 0, sizeof(rx_ctx));
 
ring->rx_buf_len = vsi->rx_buf_len;
-   ring->rx_hdr_len = vsi->rx_hdr_len;
 
rx_ctx.dbuff = ring->rx_buf_len >> I40E_RXQ_CTX_DBUFF_SHIFT;
-   rx_ctx.hbuff = ring->rx_hdr_len >> I40E_RXQ_CTX_HBUFF_SHIFT;
 
rx_ctx.base = (ring->dma / 128);
rx_ctx.qlen = ring->count;
@@ -2910,7 +2908,7 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring)
ring->tail = hw->hw_addr + I40E_QRX_TAIL(pf_q);
writel(0, ring->tail);
 
-   i40e_alloc_rx_buffers_1buf(ring, I40E_DESC_UNUSED(ring));
+   i40e_alloc_rx_buffers(ring, I40E_DESC_UNUSED(ring));
 
return 0;
 }
@@ -2949,15 +2947,13 @@ static int i40e_vsi_configure_rx(struct i40e_vsi *vsi)
else
vsi->max_frame = I40E_RXBUFFER_2048;
 
-   vsi->rx_hdr_len = 0;
-   vsi->rx_buf_len = vsi->max_frame;
+   vsi->rx_buf_len = I40E_RXBUFFER_2048;
vsi->dtype = I40E_RX_DTYPE_NO_SPLIT;
 
 #ifdef I40E_FCOE
/* setup rx buffer for FCoE */
if ((vsi->type == I40E_VSI_FCOE) &&

[net-next 01/11] i40e/i40evf: Refactor tunnel interpretation

2016-05-06 Thread Jeff Kirsher

From: Jesse Brandeburg 

Refactor the interpretation of a tunnel.  This removes
some code and lets us start using the hardware's parsing.

Signed-off-by: Jesse Brandeburg 
Tested-by: Andrew Bowers 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c   | 13 ++---
 drivers/net/ethernet/intel/i40evf/i40e_txrx.c | 13 ++---
 2 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c 
b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 2765d7e..dab733c 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -1392,7 +1392,7 @@ static inline void i40e_rx_checksum(struct i40e_vsi *vsi,
u16 rx_ptype)
 {
struct i40e_rx_ptype_decoded decoded = decode_rx_desc_ptype(rx_ptype);
-   bool ipv4, ipv6, ipv4_tunnel, ipv6_tunnel;
+   bool ipv4, ipv6, tunnel = false;
 
skb->ip_summed = CHECKSUM_NONE;
 
@@ -1441,14 +1441,13 @@ static inline void i40e_rx_checksum(struct i40e_vsi 
*vsi,
 * doesn't make it a hard requirement so if we have validated the
 * inner checksum report CHECKSUM_UNNECESSARY.
 */
-
-   ipv4_tunnel = (rx_ptype >= I40E_RX_PTYPE_GRENAT4_MAC_PAY3) &&
-(rx_ptype <= I40E_RX_PTYPE_GRENAT4_MACVLAN_IPV6_ICMP_PAY4);
-   ipv6_tunnel = (rx_ptype >= I40E_RX_PTYPE_GRENAT6_MAC_PAY3) &&
-(rx_ptype <= I40E_RX_PTYPE_GRENAT6_MACVLAN_IPV6_ICMP_PAY4);
+   if (decoded.inner_prot & (I40E_RX_PTYPE_INNER_PROT_TCP |
+ I40E_RX_PTYPE_INNER_PROT_UDP |
+ I40E_RX_PTYPE_INNER_PROT_SCTP))
+   tunnel = true;
 
skb->ip_summed = CHECKSUM_UNNECESSARY;
-   skb->csum_level = ipv4_tunnel || ipv6_tunnel;
+   skb->csum_level = tunnel ? 1 : 0;
 
return;
 
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c 
b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
index ede8dfc..a37a3f3 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
@@ -864,7 +864,7 @@ static inline void i40e_rx_checksum(struct i40e_vsi *vsi,
u16 rx_ptype)
 {
struct i40e_rx_ptype_decoded decoded = decode_rx_desc_ptype(rx_ptype);
-   bool ipv4, ipv6, ipv4_tunnel, ipv6_tunnel;
+   bool ipv4, ipv6, tunnel = false;
 
skb->ip_summed = CHECKSUM_NONE;
 
@@ -913,14 +913,13 @@ static inline void i40e_rx_checksum(struct i40e_vsi *vsi,
 * doesn't make it a hard requirement so if we have validated the
 * inner checksum report CHECKSUM_UNNECESSARY.
 */
-
-   ipv4_tunnel = (rx_ptype >= I40E_RX_PTYPE_GRENAT4_MAC_PAY3) &&
-(rx_ptype <= I40E_RX_PTYPE_GRENAT4_MACVLAN_IPV6_ICMP_PAY4);
-   ipv6_tunnel = (rx_ptype >= I40E_RX_PTYPE_GRENAT6_MAC_PAY3) &&
-(rx_ptype <= I40E_RX_PTYPE_GRENAT6_MACVLAN_IPV6_ICMP_PAY4);
+   if (decoded.inner_prot & (I40E_RX_PTYPE_INNER_PROT_TCP |
+ I40E_RX_PTYPE_INNER_PROT_UDP |
+ I40E_RX_PTYPE_INNER_PROT_SCTP))
+   tunnel = true;
 
skb->ip_summed = CHECKSUM_UNNECESSARY;
-   skb->csum_level = ipv4_tunnel || ipv6_tunnel;
+   skb->csum_level = tunnel ? 1 : 0;
 
return;
 
-- 
2.5.5

[net-next 00/11][pull request] 40GbE Intel Wired LAN Driver Updates 2016-05-05

2016-05-06 Thread Jeff Kirsher

This series contains updates to i40e and i40evf.

The theme behind this series is code reduction, yeah!  Jesse provides
most of the changes starting with a refactor of the interpretation of
a tunnel which lets us start using the hardware's parsing.  Removed
the packet split receive routine and ancillary code in preparation
for the Rx-refactor.  The refactor of the receive routine
aligns it with the one in ixgbe, which was highly
optimized.  The hardware supports a 16 byte descriptor for receive,
but the driver was never using it in production.  There was no performance
benefit to the real driver of 16 byte descriptors, so drop a whole lot
of complexity while getting rid of the code.  Fixed a bug where, while
changing the number of descriptors using ethtool, the driver did not
test the limits of the system memory before permanently assuming it
would be able to get receive buffer memory.

Mitch fixes a memory leak of one page each time the driver is opened by
allocating the correct number of receive buffers and not fiddling with
next_to_use in the VF driver.

Arnd Bergmann fixed an indentation issue by adding the appropriate
curly braces in i40e_vc_config_promiscuous_mode_msg().

Julia Lawall fixed an issue found by Coccinelle, where i40e_client_ops
structure can be const since it is never modified.

The following are changes since commit 035cd6ba53eff060760c4f4d11339fcc916a967c:
  MAINTAINERS: Cleanup Intel Wired LAN maintainers list
and are available in the git repository at:
  git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue 40GbE

Arnd Bergmann (1):
  i40e: fix misleading indentation

Jesse Brandeburg (8):
  i40e/i40evf: Refactor tunnel interpretation
  i40e: Drop packet split receive routine
  i40e/i40evf: Remove reference to ring->dtype
  i40e: Refactor receive routine
  i40evf: Drop packet split receive routine
  i40evf: refactor receive routine
  i40e/i40evf: Remove unused hardware receive descriptor code
  i40e: Test memory before ethtool alloc succeeds

Julia Lawall (1):
  i40e: constify i40e_client_ops structure

Mitch Williams (1):
  i40evf: Allocate Rx buffers properly

 drivers/infiniband/hw/i40iw/i40iw_main.c   |   2 +-
 drivers/net/ethernet/intel/i40e/i40e.h |  11 +-
 drivers/net/ethernet/intel/i40e/i40e_client.h  |   2 +-
 drivers/net/ethernet/intel/i40e/i40e_debugfs.c |  31 +-
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c |  54 +-
 drivers/net/ethernet/intel/i40e/i40e_main.c|  73 +-
 drivers/net/ethernet/intel/i40e/i40e_txrx.c| 968 ++---
 drivers/net/ethernet/intel/i40e/i40e_txrx.h|  69 +-
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c |   5 +-
 drivers/net/ethernet/intel/i40evf/i40e_txrx.c  | 930 ++--
 drivers/net/ethernet/intel/i40evf/i40e_txrx.h  |  69 +-
 drivers/net/ethernet/intel/i40evf/i40evf.h |   7 -
 drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c |  65 --
 drivers/net/ethernet/intel/i40evf/i40evf_main.c|  34 +-
 .../net/ethernet/intel/i40evf/i40evf_virtchnl.c|   4 -
 15 files changed, 1062 insertions(+), 1262 deletions(-)

-- 
2.5.5

< 1 2 3 >

101 - 200 of 203 matches

Mail list logo