[PATCH v2 next-next 06/12] fou: Split out {fou,gue}_build_header
Create __fou_build_header and __gue_build_header. These implement the protocol generic parts of building the fou and gue header. fou_build_header and gue_build_header implement the IPv4 specific functions and call the __*_build_header functions. Signed-off-by: Tom Herbert --- include/net/fou.h | 8 net/ipv4/fou.c| 47 +-- 2 files changed, 41 insertions(+), 14 deletions(-) diff --git a/include/net/fou.h b/include/net/fou.h index 19b8a0c..7d2fda2 100644 --- a/include/net/fou.h +++ b/include/net/fou.h @@ -11,9 +11,9 @@ size_t fou_encap_hlen(struct ip_tunnel_encap *e); static size_t gue_encap_hlen(struct ip_tunnel_encap *e); -int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, -u8 *protocol, struct flowi4 *fl4); -int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, -u8 *protocol, struct flowi4 *fl4); +int __fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, + u8 *protocol, __be16 *sport, int type); +int __gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, + u8 *protocol, __be16 *sport, int type); #endif diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index a8b5cbf..971c8c6 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -778,6 +778,22 @@ static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e, *protocol = IPPROTO_UDP; } +int __fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, + u8 *protocol, __be16 *sport, int type) +{ + int err; + + err = iptunnel_handle_offloads(skb, type); + if (err) + return err; + + *sport = e->sport ? 
: udp_flow_src_port(dev_net(skb->dev), + skb, 0, 0, false); + + return 0; +} +EXPORT_SYMBOL(__fou_build_header); + int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, struct flowi4 *fl4) { @@ -786,26 +802,21 @@ int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, __be16 sport; int err; - err = iptunnel_handle_offloads(skb, type); + err = __fou_build_header(skb, e, protocol, &sport, type); if (err) return err; - sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev), - skb, 0, 0, false); fou_build_udp(skb, e, fl4, protocol, sport); return 0; } EXPORT_SYMBOL(fou_build_header); -int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, -u8 *protocol, struct flowi4 *fl4) +int __gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, + u8 *protocol, __be16 *sport, int type) { - int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM : - SKB_GSO_UDP_TUNNEL; struct guehdr *guehdr; size_t hdrlen, optlen = 0; - __be16 sport; void *data; bool need_priv = false; int err; @@ -824,8 +835,8 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, return err; /* Get source port (based on flow hash) before skb_push */ - sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev), - skb, 0, 0, false); + *sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev), + skb, 0, 0, false); hdrlen = sizeof(struct guehdr) + optlen; @@ -870,6 +881,22 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, } + return 0; +} +EXPORT_SYMBOL(__gue_build_header); + +int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, +u8 *protocol, struct flowi4 *fl4) +{ + int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM : + SKB_GSO_UDP_TUNNEL; + __be16 sport; + int err; + + err = __gue_build_header(skb, e, protocol, &sport, type); + if (err) + return err; + fou_build_udp(skb, e, fl4, protocol, sport); return 0; -- 2.8.0.rc2
[PATCH v2 next-next 07/12] fou: Add encap ops for IPv6 tunnels
This patch adds IP tunnel encapsulation operations for IPv6. This includes the infrastructure to add and delete operations. IPv6 variants for fou6_build_header and gue6_build_header are added in a new fou6 module. These encapsulation operations for fou and gue are automatically added when the fou6 module loads. Signed-off-by: Tom Herbert --- include/net/fou.h | 2 +- include/net/ip6_tunnel.h | 14 + net/ipv6/Makefile | 4 +- net/ipv6/fou6.c| 140 + net/ipv6/ip6_tunnel_core.c | 44 ++ 5 files changed, 202 insertions(+), 2 deletions(-) create mode 100644 net/ipv6/fou6.c create mode 100644 net/ipv6/ip6_tunnel_core.c diff --git a/include/net/fou.h b/include/net/fou.h index 7d2fda2..f5cc691 100644 --- a/include/net/fou.h +++ b/include/net/fou.h @@ -9,7 +9,7 @@ #include size_t fou_encap_hlen(struct ip_tunnel_encap *e); -static size_t gue_encap_hlen(struct ip_tunnel_encap *e); +size_t gue_encap_hlen(struct ip_tunnel_encap *e); int __fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, __be16 *sport, int type); diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index fb9e015..1c14c27 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -34,6 +34,20 @@ struct __ip6_tnl_parm { __be32 o_key; }; +struct ip6_tnl_encap_ops { + size_t (*encap_hlen)(struct ip_tunnel_encap *e); + int (*build_header)(struct sk_buff *skb, struct ip_tunnel_encap *e, + u8 *protocol, struct flowi6 *fl6); +}; + +extern const struct ip6_tnl_encap_ops __rcu * + ip6tun_encaps[MAX_IPTUN_ENCAP_OPS]; + +int ip6_tnl_encap_add_ops(const struct ip6_tnl_encap_ops *op, + unsigned int num); +int ip6_tnl_encap_del_ops(const struct ip6_tnl_encap_ops *op, + unsigned int num); + /* IPv6 tunnel */ struct ip6_tnl { struct ip6_tnl __rcu *next; /* next tunnel in list */ diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index 5e9d6bf..5cf4a1f 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -9,7 +9,7 @@ ipv6-objs :=af_inet6.o anycast.o ip6_output.o ip6_input.o 
addrconf.o \ route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \ raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o ping.o \ exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o \ - udp_offload.o + udp_offload.o ip6_tunnel_core.o ipv6-offload :=ip6_offload.o tcpv6_offload.o exthdrs_offload.o @@ -43,6 +43,8 @@ obj-$(CONFIG_IPV6_SIT) += sit.o obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o obj-$(CONFIG_IPV6_GRE) += ip6_gre.o +obj-$(CONFIG_NET_FOU) += fou6.o + obj-y += addrconf_core.o exthdrs_core.o ip6_checksum.o ip6_icmp.o obj-$(CONFIG_INET) += output_core.o protocol.o $(ipv6-offload) diff --git a/net/ipv6/fou6.c b/net/ipv6/fou6.c new file mode 100644 index 000..c972d0b --- /dev/null +++ b/net/ipv6/fou6.c @@ -0,0 +1,140 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void fou6_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e, + struct flowi6 *fl6, u8 *protocol, __be16 sport) +{ + struct udphdr *uh; + + skb_push(skb, sizeof(struct udphdr)); + skb_reset_transport_header(skb); + + uh = udp_hdr(skb); + + uh->dest = e->dport; + uh->source = sport; + uh->len = htons(skb->len); + udp6_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM6), skb, + &fl6->saddr, &fl6->daddr, skb->len); + + *protocol = IPPROTO_UDP; +} + +int fou6_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, + u8 *protocol, struct flowi6 *fl6) +{ + __be16 sport; + int err; + int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM6 ? + SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; + + err = __fou_build_header(skb, e, protocol, &sport, type); + if (err) + return err; + + fou6_build_udp(skb, e, fl6, protocol, sport); + + return 0; +} +EXPORT_SYMBOL(fou6_build_header); + +int gue6_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, + u8 *protocol, struct flowi6 *fl6) +{ + __be16 sport; + int err; + int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM6 ? 
+ SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; + + err = __gue_build_header(skb, e, protocol, &sport, type); + if (err) + return err; + + fou6_build_udp(skb, e, fl6, protocol, sport); + + return 0; +} +EXPORT_SYMBOL(gue6_build_header); + +#ifdef CONFIG_NET_FOU_IP_TUNNELS + +static const struct ip6_tnl_encap_ops fou_ip6tun_ops = { + .encap_hlen = fou
[PATCH v2 next-next 10/12] fou: Support IPv6 in fou
This patch adds receive path support for IPv6 with fou. - Add address family to fou structure for open sockets. This supports AF_INET and AF_INET6. Lookups for fou ports are performed on both the port number and family. - In fou and gue receive adjust tot_len in IPv4 header or payload_len based on address family. - Allow AF_INET6 in FOU_ATTR_AF netlink attribute. Signed-off-by: Tom Herbert --- net/ipv4/fou.c | 47 +++ 1 file changed, 35 insertions(+), 12 deletions(-) diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 971c8c6..75db828 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -21,6 +21,7 @@ struct fou { u8 protocol; u8 flags; __be16 port; + u8 family; u16 type; struct list_head list; struct rcu_head rcu; @@ -47,14 +48,17 @@ static inline struct fou *fou_from_sock(struct sock *sk) return sk->sk_user_data; } -static int fou_recv_pull(struct sk_buff *skb, size_t len) +static int fou_recv_pull(struct sk_buff *skb, struct fou *fou, size_t len) { - struct iphdr *iph = ip_hdr(skb); - /* Remove 'len' bytes from the packet (UDP header and * FOU header if present). 
*/ - iph->tot_len = htons(ntohs(iph->tot_len) - len); + if (fou->family == AF_INET) + ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(skb)->tot_len) - len); + else + ipv6_hdr(skb)->payload_len = + htons(ntohs(ipv6_hdr(skb)->payload_len) - len); + __skb_pull(skb, len); skb_postpull_rcsum(skb, udp_hdr(skb), len); skb_reset_transport_header(skb); @@ -68,7 +72,7 @@ static int fou_udp_recv(struct sock *sk, struct sk_buff *skb) if (!fou) return 1; - if (fou_recv_pull(skb, sizeof(struct udphdr))) + if (fou_recv_pull(skb, fou, sizeof(struct udphdr))) goto drop; return -fou->protocol; @@ -141,7 +145,11 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) hdrlen = sizeof(struct guehdr) + optlen; - ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(skb)->tot_len) - len); + if (fou->family == AF_INET) + ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(skb)->tot_len) - len); + else + ipv6_hdr(skb)->payload_len = + htons(ntohs(ipv6_hdr(skb)->payload_len) - len); /* Pull csum through the guehdr now . This can be used if * there is a remote checksum offload. 
@@ -424,7 +432,8 @@ static int fou_add_to_port_list(struct net *net, struct fou *fou) mutex_lock(&fn->fou_lock); list_for_each_entry(fout, &fn->fou_list, list) { - if (fou->port == fout->port) { + if (fou->port == fout->port && + fou->family == fout->family) { mutex_unlock(&fn->fou_lock); return -EALREADY; } @@ -469,8 +478,9 @@ static int fou_create(struct net *net, struct fou_cfg *cfg, sk = sock->sk; - fou->flags = cfg->flags; fou->port = cfg->udp_config.local_udp_port; + fou->family = cfg->udp_config.family; + fou->flags = cfg->flags; fou->type = cfg->type; fou->sock = sock; @@ -522,12 +532,13 @@ static int fou_destroy(struct net *net, struct fou_cfg *cfg) { struct fou_net *fn = net_generic(net, fou_net_id); __be16 port = cfg->udp_config.local_udp_port; + u8 family = cfg->udp_config.family; int err = -EINVAL; struct fou *fou; mutex_lock(&fn->fou_lock); list_for_each_entry(fou, &fn->fou_list, list) { - if (fou->port == port) { + if (fou->port == port && fou->family == family) { fou_release(fou); err = 0; break; @@ -565,8 +576,15 @@ static int parse_nl_config(struct genl_info *info, if (info->attrs[FOU_ATTR_AF]) { u8 family = nla_get_u8(info->attrs[FOU_ATTR_AF]); - if (family != AF_INET) - return -EINVAL; + switch (family) { + case AF_INET: + break; + case AF_INET6: + cfg->udp_config.ipv6_v6only = 1; + break; + default: + return -EAFNOSUPPORT; + } cfg->udp_config.family = family; } @@ -657,6 +675,7 @@ static int fou_nl_cmd_get_port(struct sk_buff *skb, struct genl_info *info) struct fou_cfg cfg; struct fou *fout; __be16 port; + u8 family; int ret; ret = parse_nl_config(info, &cfg); @@ -666,6 +685,10 @@ static int fou_nl_cmd_get_port(struct sk_buff *skb, struct genl_info *info) if (port == 0) return -EINVAL; + family = cfg.udp_config.family; + if (family != AF_I
[PATCH v2 next-next 09/12] ipv6: Change "final" protocol processing for encapsulation
When performing foo-over-UDP, UDP packets are received and processed by the encapsulation handler, which returns another protocol to process. This may result in processing two (or more) protocols in the loop that are marked as INET6_PROTO_FINAL. The actions taken for hitting a final protocol, in particular the skb_postpull_rcsum, can only be performed once. This patch adds a check of whether a final protocol has been seen. The rules are: - If the final protocol has not been seen any protocol is processed (final and non-final). In the case of a final protocol, the final actions are taken (like the skb_postpull_rcsum) - If a final protocol has been seen (e.g. an encapsulating UDP header) then no further non-final protocols are allowed (e.g. extension headers). For more final protocols the final actions are not taken (e.g. skb_postpull_rcsum). Signed-off-by: Tom Herbert --- net/ipv6/ip6_input.c | 15 ++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 2a0258a..7d98d01 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -216,6 +216,7 @@ static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *sk unsigned int nhoff; int nexthdr; bool raw; + bool have_final = false; /* * Parse extension headers @@ -235,9 +236,21 @@ resubmit: if (ipprot) { int ret; - if (ipprot->flags & INET6_PROTO_FINAL) { + if (have_final) { + if (!(ipprot->flags & INET6_PROTO_FINAL)) { + /* Once we've seen a final protocol don't +* allow encapsulation on any non-final +* ones. This allows foo in UDP encapsulation +* to work. +*/ + goto discard; + } + } else if (ipprot->flags & INET6_PROTO_FINAL) { const struct ipv6hdr *hdr; + /* Only do this once for first final protocol */ + have_final = true; + /* Free reference early: we don't need it any more, and it may hold ip_conntrack module loaded indefinitely. */ -- 2.8.0.rc2
[PATCH v2 next-next 03/12] gre6: Fix flag translations
GRE for IPv6 does not properly translate GRE flags to tunnel flags and vice versa. This patch fixes that. Signed-off-by: Tom Herbert --- net/ipv6/ip6_gre.c | 20 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 47b671a..70a1f72 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -799,8 +799,8 @@ static void ip6gre_tnl_parm_from_user(struct __ip6_tnl_parm *p, p->link = u->link; p->i_key = u->i_key; p->o_key = u->o_key; - p->i_flags = u->i_flags; - p->o_flags = u->o_flags; + p->i_flags = gre_flags_to_tnl_flags(u->i_flags); + p->o_flags = gre_flags_to_tnl_flags(u->o_flags); memcpy(p->name, u->name, sizeof(u->name)); } @@ -817,8 +817,8 @@ static void ip6gre_tnl_parm_to_user(struct ip6_tnl_parm2 *u, u->link = p->link; u->i_key = p->i_key; u->o_key = p->o_key; - u->i_flags = p->i_flags; - u->o_flags = p->o_flags; + u->i_flags = gre_tnl_flags_to_gre_flags(p->i_flags); + u->o_flags = gre_tnl_flags_to_gre_flags(p->o_flags); memcpy(u->name, p->name, sizeof(u->name)); } @@ -1217,10 +1217,12 @@ static void ip6gre_netlink_parms(struct nlattr *data[], parms->link = nla_get_u32(data[IFLA_GRE_LINK]); if (data[IFLA_GRE_IFLAGS]) - parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]); + parms->i_flags = gre_flags_to_tnl_flags( + nla_get_be16(data[IFLA_GRE_IFLAGS])); if (data[IFLA_GRE_OFLAGS]) - parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]); + parms->o_flags = gre_flags_to_tnl_flags( + nla_get_be16(data[IFLA_GRE_OFLAGS])); if (data[IFLA_GRE_IKEY]) parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]); @@ -1412,8 +1414,10 @@ static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev) struct __ip6_tnl_parm *p = &t->parms; if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) || - nla_put_be16(skb, IFLA_GRE_IFLAGS, p->i_flags) || - nla_put_be16(skb, IFLA_GRE_OFLAGS, p->o_flags) || + nla_put_be16(skb, IFLA_GRE_IFLAGS, +gre_tnl_flags_to_gre_flags(p->i_flags)) || + nla_put_be16(skb, IFLA_GRE_OFLAGS, 
+gre_tnl_flags_to_gre_flags(p->o_flags)) || nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) || nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) || nla_put_in6_addr(skb, IFLA_GRE_LOCAL, &p->laddr) || -- 2.8.0.rc2
[PATCH v2 next-next 11/12] ip6_tun: Add infrastructure for doing encapsulation
Add encap_hlen and ip_tunnel_encap structure to ip6_tnl. Add functions for getting encap hlen, setting up encap on a tunnel, performing encapsulation operation. Signed-off-by: Tom Herbert --- include/net/ip6_tunnel.h | 8 +- net/ipv6/ip6_tunnel.c | 4 +++ net/ipv6/ip6_tunnel_core.c | 64 ++ 3 files changed, 75 insertions(+), 1 deletion(-) diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index 1c14c27..1b8db86 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -66,10 +66,16 @@ struct ip6_tnl { __u32 o_seqno; /* The last output seqno */ int hlen; /* tun_hlen + encap_hlen */ int tun_hlen; /* Precalculated header length */ + int encap_hlen; /* Encap header length (FOU,GUE) */ + struct ip_tunnel_encap encap; int mlink; - }; +int ip6_tnl_encap_setup(struct ip6_tnl *t, + struct ip_tunnel_encap *ipencap); +int ip6_tnl_encap(struct sk_buff *skb, struct ip6_tnl *t, + u8 *protocol, struct flowi6 *fl6); + /* Tunnel encapsulation limit destination sub-option */ struct ipv6_tlv_tnl_enc_lim { diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index ade55af..2c096ab 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1013,6 +1013,10 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, unsigned int max_headroom = sizeof(struct ipv6hdr); int err = -1; + err = ip6_tnl_encap(skb, t, &proto, fl6); + if (err) + return err; + /* NBMA tunnel */ if (ipv6_addr_any(&t->parms.raddr)) { struct in6_addr *addr6; diff --git a/net/ipv6/ip6_tunnel_core.c b/net/ipv6/ip6_tunnel_core.c index 5f5b79e..94aa414 100644 --- a/net/ipv6/ip6_tunnel_core.c +++ b/net/ipv6/ip6_tunnel_core.c @@ -42,3 +42,67 @@ int ip6_tnl_encap_del_ops(const struct ip6_tnl_encap_ops *ops, } EXPORT_SYMBOL(ip6_tnl_encap_del_ops); +static int ip6_encap_hlen(struct ip_tunnel_encap *e) +{ + const struct ip6_tnl_encap_ops *ops; + int hlen = -EINVAL; + + if (e->type == TUNNEL_ENCAP_NONE) + return 0; + + if (e->type >= MAX_IPTUN_ENCAP_OPS) + return 
-EINVAL; + + rcu_read_lock(); + ops = rcu_dereference(ip6tun_encaps[e->type]); + if (likely(ops && ops->encap_hlen)) + hlen = ops->encap_hlen(e); + rcu_read_unlock(); + + return hlen; +} + +int ip6_tnl_encap_setup(struct ip6_tnl *t, + struct ip_tunnel_encap *ipencap) +{ + int hlen; + + memset(&t->encap, 0, sizeof(t->encap)); + + hlen = ip6_encap_hlen(ipencap); + if (hlen < 0) + return hlen; + + t->encap.type = ipencap->type; + t->encap.sport = ipencap->sport; + t->encap.dport = ipencap->dport; + t->encap.flags = ipencap->flags; + + t->encap_hlen = hlen; + t->hlen = t->encap_hlen + t->tun_hlen; + + return 0; +} +EXPORT_SYMBOL_GPL(ip6_tnl_encap_setup); + +int ip6_tnl_encap(struct sk_buff *skb, struct ip6_tnl *t, + u8 *protocol, struct flowi6 *fl6) +{ + const struct ip6_tnl_encap_ops *ops; + int ret = -EINVAL; + + if (t->encap.type == TUNNEL_ENCAP_NONE) + return 0; + + if (t->encap.type >= MAX_IPTUN_ENCAP_OPS) + return -EINVAL; + + rcu_read_lock(); + ops = rcu_dereference(ip6tun_encaps[t->encap.type]); + if (likely(ops && ops->build_header)) + ret = ops->build_header(skb, &t->encap, protocol, fl6); + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(ip6_tnl_encap); -- 2.8.0.rc2
[PATCH v2 next-next 00/12] ipv6: Enable GUEoIPv6 and more fixes for v6 tunneling
This patch set: - Fixes GRE6 to translate flags correctly from configuration - Adds support for GSO and GRO for ip6ip6 and ip4ip6 - Add support for FOU and GUE in IPv6 - Support GRE, ip6ip6 and ip4ip6 over FOU/GUE - Fixes ip6_input to deal with UDP encapsulations - Some other minor fixes v2: - Removed a check of GSO types in MPLS - Define GSO type SKB_GSO_IPXIP6 and SKB_GSO_IPXIP4 (based on input from Alexander) - Don't define GSO types specifically for IP6IP6 and IP4IP6, above fix makes that unnecessary - Don't bother clearing encapsulation flag in UDP tunnel segment (another item suggested by Alexander). Tested: Tested a variety of cases, but not the full matrix (which is quite large now). Most of the obvious cases (e.g. GRE) work fine. Still some issues probably with GSO/GRO being effective in all cases. - IPv4/GRE/GUE/IPv6 with RCO 1 TCP_STREAM 6616 Mbps 200 TCP_RR 1244043 tps 141/243/446 90/95/99% latencies 86.61% CPU utilization - IPv6/GRE/GUE/IPv6 with RCO 1 TCP_STREAM 6940 Mbps 200 TCP_RR 1270903 tps 138/236/440 90/95/99% latencies 87.51% CPU utilization - IP6IP6 1 TCP_STREAM 2576 Mbps 200 TCP_RR 498981 tps 388/498/631 90/95/99% latencies 19.75% CPU utilization (1 CPU saturated) - IP6IP6/GUE/IPv6 with RCO 1 TCP_STREAM 1854 Mbps 200 TCP_RR 1233818 tps 143/244/451 90/95/99% latencies 87.57% CPU utilization - IP4IP6 1 TCP_STREAM 200 TCP_RR 763774 tps 250/318/466 90/95/99% latencies 35.25% CPU utilization (1 CPU saturated) - GRE with keyid 200 TCP_RR 744173 tps 258/332/461 90/95/99% latencies 34.59% CPU utilization (1 CPU saturated) Tom Herbert (12): gso: Remove arbitrary checks for unsupported GSO net: define gso types for IPx over IPv4 and IPv6 gre6: Fix flag translations udp: Don't set skb->encapsulation with RCO fou: Call setup_udp_tunnel_sock fou: Split out {fou,gue}_build_header fou: Add encap ops for IPv6 tunnels ipv6: Fix nexthdr for reinjection ipv6: Change "final" protocol processing for encapsulation fou: Support IPv6 in fou ip6_tun: Add 
infrastructure for doing encapsulation ip6_gre: Add support for fou/gue encapsulation drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 5 +- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 4 +- drivers/net/ethernet/intel/i40e/i40e_main.c | 3 +- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 3 +- drivers/net/ethernet/intel/i40evf/i40e_txrx.c | 3 +- drivers/net/ethernet/intel/i40evf/i40evf_main.c | 3 +- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 3 +- drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 3 +- include/linux/netdev_features.h | 12 +- include/linux/netdevice.h | 4 +- include/linux/skbuff.h| 4 +- include/net/fou.h | 10 +- include/net/ip6_tunnel.h | 22 +++- net/core/ethtool.c| 4 +- net/ipv4/af_inet.c| 20 +-- net/ipv4/fou.c| 144 +- net/ipv4/gre_offload.c| 14 --- net/ipv4/ipip.c | 2 +- net/ipv4/tcp_offload.c| 19 --- net/ipv4/udp_offload.c| 19 +-- net/ipv6/Makefile | 4 +- net/ipv6/fou6.c | 140 + net/ipv6/ip6_gre.c| 95 -- net/ipv6/ip6_input.c | 24 +++- net/ipv6/ip6_offload.c| 22 +--- net/ipv6/ip6_tunnel.c | 4 + net/ipv6/ip6_tunnel_core.c| 108 net/ipv6/sit.c| 4 +- net/ipv6/udp_offload.c| 13 -- net/mpls/mpls_gso.c | 9 -- net/netfilter/ipvs/ip_vs_xmit.c | 11 +- 31 files changed, 511 insertions(+), 224 deletions(-) create mode 100644 net/ipv6/fou6.c create mode 100644 net/ipv6/ip6_tunnel_core.c -- 2.8.0.rc2
[PATCH v2 next-next 01/12] gso: Remove arbitrary checks for unsupported GSO
In several gso_segment functions there are checks of gso_type against a seemingly arbitrary list of SKB_GSO_* flags. This seems like an attempt to identify unsupported GSO types, but since the stack is the one that set these GSO types in the first place this seems unnecessary to do. If a combination isn't valid in the first place the stack should not allow setting it. This is a code simplification, especially for adding new GSO types. Signed-off-by: Tom Herbert --- net/ipv4/af_inet.c | 18 -- net/ipv4/gre_offload.c | 14 -- net/ipv4/tcp_offload.c | 19 --- net/ipv4/udp_offload.c | 10 -- net/ipv6/ip6_offload.c | 18 -- net/ipv6/udp_offload.c | 13 - net/mpls/mpls_gso.c| 9 - 7 files changed, 101 deletions(-) diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 2e6e65f..7f08d45 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1205,24 +1205,6 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int ihl; int id; - if (unlikely(skb_shinfo(skb)->gso_type & -~(SKB_GSO_TCPV4 | - SKB_GSO_UDP | - SKB_GSO_DODGY | - SKB_GSO_TCP_ECN | - SKB_GSO_GRE | - SKB_GSO_GRE_CSUM | - SKB_GSO_IPIP | - SKB_GSO_SIT | - SKB_GSO_TCPV6 | - SKB_GSO_UDP_TUNNEL | - SKB_GSO_UDP_TUNNEL_CSUM | - SKB_GSO_TCP_FIXEDID | - SKB_GSO_TUNNEL_REMCSUM | - SKB_GSO_PARTIAL | - 0))) - goto out; - skb_reset_network_header(skb); nhoff = skb_network_header(skb) - skb_mac_header(skb); if (unlikely(!pskb_may_pull(skb, sizeof(*iph diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index e88190a..ecd1e09 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -26,20 +26,6 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, int gre_offset, outer_hlen; bool need_csum, ufo; - if (unlikely(skb_shinfo(skb)->gso_type & - ~(SKB_GSO_TCPV4 | - SKB_GSO_TCPV6 | - SKB_GSO_UDP | - SKB_GSO_DODGY | - SKB_GSO_TCP_ECN | - SKB_GSO_TCP_FIXEDID | - SKB_GSO_GRE | - SKB_GSO_GRE_CSUM | - SKB_GSO_IPIP | - SKB_GSO_SIT | - SKB_GSO_PARTIAL))) - goto out; - if (!skb->encapsulation) goto out; diff 
--git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 02737b6..5c59649 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -83,25 +83,6 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { /* Packet is from an untrusted source, reset gso_segs. */ - int type = skb_shinfo(skb)->gso_type; - - if (unlikely(type & -~(SKB_GSO_TCPV4 | - SKB_GSO_DODGY | - SKB_GSO_TCP_ECN | - SKB_GSO_TCP_FIXEDID | - SKB_GSO_TCPV6 | - SKB_GSO_GRE | - SKB_GSO_GRE_CSUM | - SKB_GSO_IPIP | - SKB_GSO_SIT | - SKB_GSO_UDP_TUNNEL | - SKB_GSO_UDP_TUNNEL_CSUM | - SKB_GSO_TUNNEL_REMCSUM | - 0) || -!(type & (SKB_GSO_TCPV4 | - SKB_GSO_TCPV6 - goto out; skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 097060de..b556ef6 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -209,16 +209,6 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { /* Packet is from an untrusted source, reset gso_segs. */ - int type = skb_shinfo(skb)->gso_type; - - if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY | - SKB_GSO_UDP_TUNNEL | - SKB_GSO_UDP_TUNNEL_CSUM | - SKB_GSO_TUNNEL_REMCSUM | - SKB_GSO_IPIP | - SKB_GSO_GRE | SKB_GSO_G
[PATCH v2 next-next 02/12] net: define gso types for IPx over IPv4 and IPv6
This patch defines two new GSO definitions SKB_GSO_IPXIP4 and SKB_GSO_IPXIP6 along with corresponding NETIF_F_GSO_IPXIP4 and NETIF_F_GSO_IPXIP6. These are used to describe an IP-in-IP tunnel and what the outer protocol is. The inner protocol can be deduced from other GSO types (e.g. SKB_GSO_TCPV4 and SKB_GSO_TCPV6). The GSO types of SKB_GSO_IPIP and SKB_GSO_SIT are removed (these are both instances of SKB_GSO_IPXIP4). SKB_GSO_IPXIP6 will be used when support for GSO with IP encapsulation over IPv6 is added. Signed-off-by: Tom Herbert --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 5 ++--- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 4 ++-- drivers/net/ethernet/intel/i40e/i40e_main.c | 3 +-- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 3 +-- drivers/net/ethernet/intel/i40evf/i40e_txrx.c | 3 +-- drivers/net/ethernet/intel/i40evf/i40evf_main.c | 3 +-- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 3 +-- drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 3 +-- include/linux/netdev_features.h | 12 ++-- include/linux/netdevice.h | 4 ++-- include/linux/skbuff.h| 4 ++-- net/core/ethtool.c| 4 ++-- net/ipv4/af_inet.c| 2 +- net/ipv4/ipip.c | 2 +- net/ipv6/ip6_offload.c| 4 ++-- net/ipv6/sit.c| 4 ++-- net/netfilter/ipvs/ip_vs_xmit.c | 11 ++- 17 files changed, 30 insertions(+), 44 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c index d465bd7..0a5b770 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c @@ -13259,12 +13259,11 @@ static int bnx2x_init_dev(struct bnx2x *bp, struct pci_dev *pdev, NETIF_F_RXHASH | NETIF_F_HW_VLAN_CTAG_TX; if (!chip_is_e1x) { dev->hw_features |= NETIF_F_GSO_GRE | NETIF_F_GSO_UDP_TUNNEL | - NETIF_F_GSO_IPIP | NETIF_F_GSO_SIT; + NETIF_F_GSO_IPXIP4; dev->hw_enc_features = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | NETIF_F_SG | NETIF_F_TSO | NETIF_F_TSO_ECN | NETIF_F_TSO6 | - NETIF_F_GSO_IPIP | - NETIF_F_GSO_SIT | 
+ NETIF_F_GSO_IPXIP4 | NETIF_F_GSO_GRE | NETIF_F_GSO_UDP_TUNNEL; } diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index fd85b6d..e449228 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -6218,7 +6218,7 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) dev->hw_features = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | NETIF_F_SG | NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_GSO_UDP_TUNNEL | NETIF_F_GSO_GRE | - NETIF_F_GSO_IPIP | NETIF_F_GSO_SIT | + NETIF_F_GSO_IPXIP4 | NETIF_F_GSO_UDP_TUNNEL_CSUM | NETIF_F_GSO_GRE_CSUM | NETIF_F_GSO_PARTIAL | NETIF_F_RXHASH | NETIF_F_RXCSUM | NETIF_F_LRO | NETIF_F_GRO; @@ -6228,7 +6228,7 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_GSO_UDP_TUNNEL | NETIF_F_GSO_GRE | NETIF_F_GSO_UDP_TUNNEL_CSUM | NETIF_F_GSO_GRE_CSUM | - NETIF_F_GSO_IPIP | NETIF_F_GSO_SIT | + NETIF_F_GSO_IPXIP4; NETIF_F_GSO_PARTIAL; dev->gso_partial_features = NETIF_F_GSO_UDP_TUNNEL_CSUM | NETIF_F_GSO_GRE_CSUM; diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index f6da6b7..c2a4c10 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -9131,8 +9131,7 @@ static int i40e_config_netdev(struct i40e_vsi *vsi) NETIF_F_TSO6 | NETIF_F_GSO_GRE | NETIF_F_GSO_GRE_CSUM | - NETIF_F_GSO_IPIP | - NETIF_F_GSO_SIT | + NETIF_F_GSO_IPXIP4 | NETIF_F_GSO_UDP_TUNNEL | NETIF_F_GSO_UDP_TUNNEL_CSUM | NETIF_F_GSO_PARTIA
[PATCH v2 next-next 05/12] fou: Call setup_udp_tunnel_sock
Use helper function to set up UDP tunnel related information for a fou socket. Signed-off-by: Tom Herbert --- net/ipv4/fou.c | 50 -- 1 file changed, 16 insertions(+), 34 deletions(-) diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 7ac5ec8..a8b5cbf 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -446,31 +446,13 @@ static void fou_release(struct fou *fou) kfree_rcu(fou, rcu); } -static int fou_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg) -{ - udp_sk(sk)->encap_rcv = fou_udp_recv; - udp_sk(sk)->gro_receive = fou_gro_receive; - udp_sk(sk)->gro_complete = fou_gro_complete; - fou_from_sock(sk)->protocol = cfg->protocol; - - return 0; -} - -static int gue_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg) -{ - udp_sk(sk)->encap_rcv = gue_udp_recv; - udp_sk(sk)->gro_receive = gue_gro_receive; - udp_sk(sk)->gro_complete = gue_gro_complete; - - return 0; -} - static int fou_create(struct net *net, struct fou_cfg *cfg, struct socket **sockp) { struct socket *sock = NULL; struct fou *fou = NULL; struct sock *sk; + struct udp_tunnel_sock_cfg tunnel_cfg; int err; /* Open UDP socket */ @@ -489,33 +471,33 @@ static int fou_create(struct net *net, struct fou_cfg *cfg, fou->flags = cfg->flags; fou->port = cfg->udp_config.local_udp_port; + fou->type = cfg->type; + fou->sock = sock; + + memset(&tunnel_cfg, 0, sizeof(tunnel_cfg)); + tunnel_cfg.encap_type = 1; + tunnel_cfg.sk_user_data = fou; + tunnel_cfg.encap_destroy = NULL; /* Initial for fou type */ switch (cfg->type) { case FOU_ENCAP_DIRECT: - err = fou_encap_init(sk, fou, cfg); - if (err) - goto error; + tunnel_cfg.encap_rcv = fou_udp_recv; + tunnel_cfg.gro_receive = fou_gro_receive; + tunnel_cfg.gro_complete = fou_gro_complete; + fou->protocol = cfg->protocol; break; case FOU_ENCAP_GUE: - err = gue_encap_init(sk, fou, cfg); - if (err) - goto error; + tunnel_cfg.encap_rcv = gue_udp_recv; + tunnel_cfg.gro_receive = gue_gro_receive; + tunnel_cfg.gro_complete = gue_gro_complete; break; 
default: err = -EINVAL; goto error; } - fou->type = cfg->type; - - udp_sk(sk)->encap_type = 1; - udp_encap_enable(); - - sk->sk_user_data = fou; - fou->sock = sock; - - inet_inc_convert_csum(sk); + setup_udp_tunnel_sock(net, sock, &tunnel_cfg); sk->sk_allocation = GFP_ATOMIC; -- 2.8.0.rc2
[PATCH v2 next-next 12/12] ip6_gre: Add support for fou/gue encapsulation
Add netlink and setup for encapsulation Signed-off-by: Tom Herbert --- net/ipv6/ip6_gre.c | 75 ++ 1 file changed, 75 insertions(+) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 70a1f72..ed5ddcc 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1027,6 +1027,8 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags); + tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; + t_hlen = tunnel->hlen + sizeof(struct ipv6hdr); dev->needed_headroom= LL_MAX_HEADER + t_hlen + 4; @@ -1293,15 +1295,57 @@ static void ip6gre_tap_setup(struct net_device *dev) dev->priv_flags &= ~IFF_TX_SKB_SHARING; } +static bool ip6gre_netlink_encap_parms(struct nlattr *data[], + struct ip_tunnel_encap *ipencap) +{ + bool ret = false; + + memset(ipencap, 0, sizeof(*ipencap)); + + if (!data) + return ret; + + if (data[IFLA_GRE_ENCAP_TYPE]) { + ret = true; + ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]); + } + + if (data[IFLA_GRE_ENCAP_FLAGS]) { + ret = true; + ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]); + } + + if (data[IFLA_GRE_ENCAP_SPORT]) { + ret = true; + ipencap->sport = nla_get_be16(data[IFLA_GRE_ENCAP_SPORT]); + } + + if (data[IFLA_GRE_ENCAP_DPORT]) { + ret = true; + ipencap->dport = nla_get_be16(data[IFLA_GRE_ENCAP_DPORT]); + } + + return ret; +} + static int ip6gre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { struct ip6_tnl *nt; struct net *net = dev_net(dev); struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); + struct ip_tunnel_encap ipencap; int err; nt = netdev_priv(dev); + + if (ip6gre_netlink_encap_parms(data, &ipencap)) { + int err = ip6_tnl_encap_setup(nt, &ipencap); + + if (err < 0) + return err; + } + ip6gre_netlink_parms(data, &nt->parms); if (ip6gre_tunnel_find(net, &nt->parms, dev->type)) @@ -1348,10 +1392,18 @@ static int ip6gre_changelink(struct net_device *dev, struct nlattr *tb[], struct net *net = 
nt->net; struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); struct __ip6_tnl_parm p; + struct ip_tunnel_encap ipencap; if (dev == ign->fb_tunnel_dev) return -EINVAL; + if (ip6gre_netlink_encap_parms(data, &ipencap)) { + int err = ip6_tnl_encap_setup(nt, &ipencap); + + if (err < 0) + return err; + } + ip6gre_netlink_parms(data, &p); t = ip6gre_tunnel_locate(net, &p, 0); @@ -1405,6 +1457,14 @@ static size_t ip6gre_get_size(const struct net_device *dev) nla_total_size(4) + /* IFLA_GRE_FLAGS */ nla_total_size(4) + + /* IFLA_GRE_ENCAP_TYPE */ + nla_total_size(2) + + /* IFLA_GRE_ENCAP_FLAGS */ + nla_total_size(2) + + /* IFLA_GRE_ENCAP_SPORT */ + nla_total_size(2) + + /* IFLA_GRE_ENCAP_DPORT */ + nla_total_size(2) + 0; } @@ -1428,6 +1488,17 @@ static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_be32(skb, IFLA_GRE_FLOWINFO, p->flowinfo) || nla_put_u32(skb, IFLA_GRE_FLAGS, p->flags)) goto nla_put_failure; + + if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE, + t->encap.type) || + nla_put_be16(skb, IFLA_GRE_ENCAP_SPORT, +t->encap.sport) || + nla_put_be16(skb, IFLA_GRE_ENCAP_DPORT, +t->encap.dport) || + nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS, + t->encap.flags)) + goto nla_put_failure; + return 0; nla_put_failure: @@ -1446,6 +1517,10 @@ static const struct nla_policy ip6gre_policy[IFLA_GRE_MAX + 1] = { [IFLA_GRE_ENCAP_LIMIT] = { .type = NLA_U8 }, [IFLA_GRE_FLOWINFO]= { .type = NLA_U32 }, [IFLA_GRE_FLAGS] = { .type = NLA_U32 }, + [IFLA_GRE_ENCAP_TYPE] = { .type = NLA_U16 }, + [IFLA_GRE_ENCAP_FLAGS] = { .type = NLA_U16 }, + [IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 }, + [IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 }, }; static struct rtnl_link_ops ip6gre_link_ops __read_mostly = { -- 2.8.0.rc2
[PATCH v2 next-next 04/12] udp: Don't set skb->encapsulation with RCO
When RCO is in effect we want to ensure that the outer checksum is properly offloaded. Don't set skb->encapsulation in this case to ensure that checksum offload is later considered for hw_features instead of hw_enc_features. Signed-off-by: Tom Herbert --- net/ipv4/udp_offload.c | 9 + 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index b556ef6..92a9222 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -94,11 +94,12 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, do { unsigned int len; - if (remcsum) + if (remcsum) { skb->ip_summed = CHECKSUM_NONE; - - /* Set up inner headers if we are offloading inner checksum */ - if (skb->ip_summed == CHECKSUM_PARTIAL) { + } else if (skb->ip_summed == CHECKSUM_PARTIAL) { + /* Set up inner headers if we are offloading inner +* checksum +*/ skb_reset_inner_headers(skb); skb->encapsulation = 1; } -- 2.8.0.rc2
[PATCH v2 next-next 08/12] ipv6: Fix nexthdr for reinjection
In ip6_input_finish, if the protocol handler returns a value greater than zero, the packet needs to be resubmitted using the returned protocol. The returned protocol is being ignored and each time through resubmit nexthdr is taken from an offset in the packet. This patch fixes that so that nexthdr is taken from the return value of the protocol handler. Signed-off-by: Tom Herbert --- net/ipv6/ip6_input.c | 9 ++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 6ed5601..2a0258a 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -222,13 +222,14 @@ static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *sk */ rcu_read_lock(); -resubmit: + idev = ip6_dst_idev(skb_dst(skb)); if (!pskb_pull(skb, skb_transport_offset(skb))) goto discard; nhoff = IP6CB(skb)->nhoff; nexthdr = skb_network_header(skb)[nhoff]; +resubmit: raw = raw6_local_deliver(skb, nexthdr); ipprot = rcu_dereference(inet6_protos[nexthdr]); if (ipprot) { @@ -256,10 +257,12 @@ resubmit: goto discard; ret = ipprot->handler(skb); - if (ret > 0) + if (ret > 0) { + nexthdr = ret; goto resubmit; - else if (ret == 0) + } else if (ret == 0) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDELIVERS); + } } else { if (!raw) { if (xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { -- 2.8.0.rc2
Re: [PATCH net-next] ipv4: tcp: ip_send_unicast_reply() is not BH safe
From: Eric Dumazet Date: Fri, 06 May 2016 09:46:18 -0700 > From: Eric Dumazet > > I forgot that ip_send_unicast_reply() is not BH safe (yet). > > Disabling preemption before calling it was not a good move. > > Fixes: c10d9310edf5 ("tcp: do not assume TCP code is non preemptible") > Signed-off-by: Eric Dumazet > Reported-by: Andres Lagar-Cavilla Applied, thanks.
Re: [PATCH net-next 0/7] bpf: introduce direct packet access
From: Alexei Starovoitov Date: Thu, 5 May 2016 19:49:08 -0700 > This set of patches introduce 'direct packet access' from > cls_bpf and act_bpf programs (which are root only). Series applied, thanks Alexei.
Re: [patch net 0/3] mlxsw: Couple of fixes
From: Jiri Pirko Date: Fri, 6 May 2016 11:17:21 +0200 > From: Jiri Pirko > > Ido Schimmel (2): > mlxsw: spectrum: Fix rollback order in LAG join failure > mlxsw: spectrum: Add missing rollback in flood configuration > > Jiri Pirko (1): > mlxsw: spectrum: Fix ordering in mlxsw_sp_fini What tree is this for? Because on 'net' this makes the build fail. drivers/net/ethernet/mellanox/mlxsw/spectrum.c: In function ‘mlxsw_sp_fini’: drivers/net/ethernet/mellanox/mlxsw/spectrum.c:2162:2: error: implicit declaration of function ‘mlxsw_sp_buffers_fini’ [-Werror=implicit-function-declaration]
Re: [net-next 00/11][pull request] 40GbE Intel Wired LAN Driver Updates 2016-05-05
From: Jeff Kirsher Date: Fri, 6 May 2016 00:03:37 -0700 > This series contains updates to i40e and i40evf. Looks good, pulled, thanks!
Re: [PATCH net-next v2] net: vrf: Create FIB tables on link create
From: David Ahern Date: Wed, 4 May 2016 21:46:12 -0700 > Tables have to exist for VRFs to function. Ensure they exist > when VRF device is created. > > Signed-off-by: David Ahern > --- > v2 > - create table before rt6 allocation per comment from DaveM Yep, this looks better, applied.
Re: [PATCH net 1/1] qede: prevent chip hang when increasing channels
From: Sudarsana Reddy Kalluru Date: Thu, 5 May 2016 00:35:16 -0400 > qede requires qed to provide enough resources to accommodate 16 combined > channels, but that upper-bound isn't actually being enforced by it. > Instead, qed informs qede how many channels can be opened based on > available resources - but that calculation doesn't really take into account > the resources requested by qede; instead it considers other FW/HW available > resources. > > As a result, if a user would increase the number of channels to more than > 16 [e.g., using ethtool] the chip would hang. > > This change increases the resources requested by qede to 64 combined > channels instead of 16; this value is an upper bound on the possible > available channels [due to other FW/HW resources]. > > Signed-off-by: Sudarsana Reddy Kalluru > Signed-off-by: Yuval Mintz Applied.
Re: [PATCH net v3 2/2] udp_offload: Set encapsulation before inner completes.
On Fri, May 6, 2016 at 12:34 PM, David Miller wrote: > From: Jarno Rajahalme > Date: Tue, 3 May 2016 16:10:21 -0700 > >> UDP tunnel segmentation code relies on the inner offsets being set for >> an UDP tunnel GSO packet, but the inner *_complete() functions will >> set the inner offsets only if 'encapsulation' is set before calling >> them. Currently, udp_gro_complete() sets 'encapsulation' only after >> the inner *_complete() functions are done. This causes the inner >> offsets having invalid values after udp_gro_complete() returns, which >> in turn will make it impossible to properly segment the packet in case >> it needs to be forwarded, which would be visible to the user either as >> invalid packets being sent or as packet loss. >> >> This patch fixes this by setting skb's 'encapsulation' in >> udp_gro_complete() before calling into the inner complete functions, >> and by making each possible UDP tunnel gro_complete() callback set the >> inner_mac_header to the beginning of the tunnel payload. >> >> Signed-off-by: Jarno Rajahalme >> --- >> v3: Added setting inner_mac_header from all possible callbacks to cover >> cases where there is no inner mac header. > > Alex and Tom, can you please review this new version since you guys had > so much feedback for v2? > > Thanks. I had reviewed it a day or so ago. It did address the issues I saw with the original patch, and from what I can tell it is fixing the original issue reported. Reviewed-by: Alexander Duyck
Re: [PATCH] net: ipv6: tcp reset, icmp need to consider L3 domain
From: David Ahern Date: Wed, 4 May 2016 21:26:08 -0700 > Responses for packets to unused ports are getting lost with L3 domains. > > IPv4 has ip_send_unicast_reply for sending TCP responses which accounts > for L3 domains; update the IPv6 counterpart tcp_v6_send_response. > For icmp the L3 master check needs to be moved up in icmp6_send > to properly respond to UDP packets to a port with no listener. > > Fixes: ca254490c8df ("net: Add VRF support to IPv6 stack") > Signed-off-by: David Ahern Applied and queued up for -stable, thanks.
Re: [PATCH net v3] vlan: Propagate MAC address to VLANs
On Fri, May 6, 2016 at 12:36 PM, Mike Manning wrote: > On 05/06/2016 06:02 PM, Alexander Duyck wrote: >> On Fri, May 6, 2016 at 6:26 AM, Mike Manning wrote: >>> The MAC address of the physical interface is only copied to the VLAN >>> when it is first created, resulting in an inconsistency after MAC >>> address changes of only newly created VLANs having an up-to-date MAC. >>> >>> The VLANs should continue inheriting the MAC address of the physical >>> interface, unless explicitly changed to be different from this. >>> This allows IPv6 EUI64 addresses for the VLAN to reflect any changes >>> to the MAC of the physical interface and thus for DAD to behave as >>> expected. >>> >>> Signed-off-by: Mike Manning >>> --- >>> include/linux/if_vlan.h |2 ++ >>> net/8021q/vlan.c| 17 +++-- >>> net/8021q/vlan_dev.c| 13 ++--- >>> 3 files changed, 23 insertions(+), 9 deletions(-) >>> >>> --- a/include/linux/if_vlan.h >>> +++ b/include/linux/if_vlan.h >>> @@ -138,6 +138,7 @@ struct netpoll; >>> * @flags: device flags >>> * @real_dev: underlying netdevice >>> * @real_dev_addr: address of underlying netdevice >>> + * @addr_assign_type: address assignment type >>> * @dent: proc dir entry >>> * @vlan_pcpu_stats: ptr to percpu rx stats >>> */ >>> @@ -153,6 +154,7 @@ struct vlan_dev_priv { >>> >>> struct net_device *real_dev; >>> unsigned char real_dev_addr[ETH_ALEN]; >>> + unsigned char addr_assign_type; >>> >>> struct proc_dir_entry *dent; >>> struct vlan_pcpu_stats __percpu *vlan_pcpu_stats; >> >> Please don't start adding new members to structures when it already >> exists in the net_device. If anything you should be able to drop >> read_dev_addr if you do this correctly because you shouldn't need to >> clone the lower dev address to watch for changes. All you will need >> to do is watch NET_ADDR_STOLEN. >> > > Thanks for the detailed review. 
I had initially used the existing type > in net_device, but the problem with this was that it got overwritten to > NET_ADDR_SET in dev_set_mac_address(), which I was reluctant to modify. > It would just be a case of setting the type earlier in that function > (and caching the previous value in case there is an error). > > However, based on your later comment, it seems I should not bother with > the approach I have here, namely that if the VLAN MAC is set to the same > value as that of the lower device MAC, that is to be considered as > resetting it and thus for MAC inheritance to resume. Instead, I will just > make this a 1-shot transition, i.e. the VLAN MAC starts off as inherited, > and if it is set to anything (even the value of the lower device MAC), > inheritance is stopped. I agree this makes for a far simpler changeset. > > I don't think I can remove real_dev_addr, as that is still needed for > the existing functionality in vlan_sync_address() to determine if the sync > should be done, also as a way of caching it for handling in vlan_dev_open(). The thing is that logic isn't really needed anymore though if you are going to be following the lower dev. If you follow the code what it is doing is adding the address via dev_uc_add if the lower address moves away from the VLAN address. With your changes you are updating the VLAN MAC address to the lower value in the NET_ADDR_STOLEN case so you don't need to add or remove an extra unicast address. If the user sets the MAC address you can then use the vlandev->dev_addr as the address you add/remove from the unicast list and you probably don't need to bother with tracking the lower device state anyway. > As a matter of interest, what is the advantage of not updating the VLAN > MAC when it is down? I appreciate that one should not add/delete > secondary unicast addresses in this case, but there is no such > restriction for copying the MAC. Basically you are just wasting cycles messing with it while it is down. 
You don't need to bother with syncing up the addresses until you bring the interface up. At that point you essentially need to do the vlan_sync_address type work anyway because you have to push your address to the lower dev, or you have to pull it up from the lower dev in the case of the stolen address. You don't want to have MAC addresses written to the device for an interface that is down.
Re: [Y2038] [RESEND PATCH 2/3] fs: poll/select/recvmmsg: use timespec64 for timeout events
From: John Stultz Date: Wed, 4 May 2016 17:01:24 -0700 > On Wed, May 4, 2016 at 4:51 PM, Andrew Morton > wrote: >> On Wed, 04 May 2016 23:08:11 +0200 Arnd Bergmann wrote: >> >>> > But I'm less comfortable making the call on this one. It looks >>> > relatively straight forward, but it would be good to have maintainer >>> > acks before I add it to my tree. >>> >>> Agreed. Feel free to add my >>> >>> Reviewed-by: Arnd Bergmann >>> >>> at least (whoever picks it up). >> >> In reply to [1/3] John said >> >> : Looks ok at the first glance. I've queued these up for testing, >> : however I only got #1 and #3 of the set. Are you hoping these two >> : patches will go through tip/timers/core or are you looking for acks so >> : they can go via another tree? >> >> However none of the patches are in linux-next. >> >> John had qualms about [2/3], but it looks like a straightforward >> substitution in areas which will get plenty of testing > > Yea. My main concern is just not stepping on any other maintainers toes. The networking changes look fine to me: Acked-by: David S. Miller
Re: [PATCH net-next] cnic: call cp->stop_hw() in cnic_start_hw() on allocation failure
From: Jon Maxwell Date: Thu, 5 May 2016 09:55:51 +1000 > We recently had a system crash in the cnic module. Vmcore analysis confirmed > that "ip link up" was executed which failed due to an allocation failure > because of memory fragmentation. Further analysis revealed that the cnic irq > vector was still allocated after the "ip link up" that failed. When > "ip link down" was executed it called free_msi_irqs() which crashed the > system > because the cnic irq was still in use. ... > The cnic_start_hw() routine is not handling the allocation failure correctly. > Fix this by checking whether the CNIC_DRV_STATE_HANDLES_IRQ flag is set > indicating > that the hardware has been started in cnic_start_hw(). If it has then call > cp->stop_hw() which frees the cnic irq vector and cnic resources. Otherwise > just maintain the previous behaviour and free cnic resources. > > I reproduced this by injecting an ENOMEM error into cnic_cm_alloc_mem()'s > return > code. > > # ip link set dev enpX down > # ip link set dev enpX up <--- hits allocation failure > # ip link set dev enpX down <--- crashes here > > With this patch I confirmed there was no crash in the reproducer. > > Signed-off-by: Jon Maxwell Applied, thank you.
Re: OpenWRT wrong adjustment of fq_codel defaults (Was: [Codel] fq_codel_drop vs a udp flood)
On Fri, May 6, 2016 at 11:56 AM, Roman Yeryomin wrote: > On 6 May 2016 at 21:43, Roman Yeryomin wrote: >> On 6 May 2016 at 15:47, Jesper Dangaard Brouer wrote: >>> >>> I've created a OpenWRT ticket[1] on this issue, as it seems that someone[2] >>> closed Felix'es OpenWRT email account (bad choice! emails bouncing). >>> Sounds like OpenWRT and the LEDE https://www.lede-project.org/ project >>> is in some kind of conflict. >>> >>> OpenWRT ticket [1] https://dev.openwrt.org/ticket/22349 >>> >>> [2] >>> http://thread.gmane.org/gmane.comp.embedded.openwrt.devel/40298/focus=40335 >> >> OK, so, after porting the patch to 4.1 openwrt kernel and playing a >> bit with fq_codel limits I was able to get 420Mbps UDP like this: >> tc qdisc replace dev wlan0 parent :1 fq_codel flows 16 limit 256 > > Forgot to mention, I've reduced drop_batch_size down to 32 0) Not clear to me if that's the right line, there are 4 wifi queues, and the third one is the BE queue. That is too low a limit, also, for normal use. And: for the purpose of this particular UDP test, flows 16 is ok, but not ideal. 1) What's the tcp number (with a simultaneous ping) with this latest patchset? (I care about tcp performance a lot more than udp floods - surviving a udp flood yes, performance, no) before/after? tc -s qdisc show dev wlan0 during/after results? IF you are doing builds for the archer c7v2, I can join in on this... (?) I did do a test of the ath10k "before", fq_codel *never engaged*, and tcp induced latencies under load, e at 100mbit, cracked 600ms, while staying flat (20ms) at 100mbit. (not the same patches you are testing) on x86. I have got tcp 300Mbit out of an osx box, similar latency, have yet to get anything more on anything I currently have before/after patchsets. I'll go add flooding to the tests, I just finished a series comparing two different speed stations and life was good on that. "before" - fq_codel never engages, we see seconds of latency under load. 
root@apu2:~# tc -s qdisc show dev wlp4s0 qdisc mq 0: root Sent 8570563893 bytes 6326983 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 qdisc fq_codel 0: parent :1 limit 10240p flows 1024 quantum 1514 target 5.0ms interval 100.0ms ecn Sent 2262 bytes 17 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 maxpacket 0 drop_overlimit 0 new_flow_count 0 ecn_mark 0 new_flows_len 0 old_flows_len 0 qdisc fq_codel 0: parent :2 limit 10240p flows 1024 quantum 1514 target 5.0ms interval 100.0ms ecn Sent 220486569 bytes 152058 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 maxpacket 18168 drop_overlimit 0 new_flow_count 1 ecn_mark 0 new_flows_len 0 old_flows_len 1 qdisc fq_codel 0: parent :3 limit 10240p flows 1024 quantum 1514 target 5.0ms interval 100.0ms ecn Sent 8340546509 bytes 6163431 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 maxpacket 68130 drop_overlimit 0 new_flow_count 120050 ecn_mark 0 new_flows_len 1 old_flows_len 3 qdisc fq_codel 0: parent :4 limit 10240p flows 1024 quantum 1514 target 5.0ms interval 100.0ms ecn Sent 9528553 bytes 11477 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 maxpacket 66 drop_overlimit 0 new_flow_count 1 ecn_mark 0 new_flows_len 1 old_flows_len 0 ``` >> This is certainly better than 30Mbps but still more than two times >> less than before (900). The number that I still am not sure we got is that you were sending 900mbit udp and recieving 900mbit on the prior tests? >> TCP also improved a little (550 to ~590). The limit is probably a bit low, also. You might want to try target 20ms as well. >> >> Felix, others, do you want to see the ported patch, maybe I did something >> wrong? >> Doesn't look like it will save ath10k from performance regression. what was tcp "before"? (I'm sorry, such a long thread) >> >>> >>> On Fri, 6 May 2016 11:42:43 +0200 >>> Jesper Dangaard Brouer wrote: >>> Hi Felix, This is an important fix for OpenWRT, please read! 
OpenWRT changed the default fq_codel sch->limit from 10240 to 1024, without also adjusting q->flows_cnt. Eric explains below that you must also adjust the buckets (q->flows_cnt) for this not to break. (Just adjust it to 128) Problematic OpenWRT commit in question: http://git.openwrt.org/?p=openwrt.git;a=patch;h=12cd6578084e 12cd6578084e ("kernel: revert fq_codel quantum override to prevent it from causing too much cpu load with higher speed (#21326)") I also highly recommend you cherry-pick this very recent commit: net-next: 9d18562a2278 ("fq_codel: add batch ability to fq_codel_drop()") https://git.kernel.org/davem/net-next/c/9d18562a227 This should fix very high CPU usage in-case fq_codel goes into drop mode. The problem is that drop mode was considered rare, and implementation wise it was chosen to be more expensive (to save cycles on normal mode). U
Re: [PATCH 2/3] net/mlx5e: make VXLAN support conditional
From: Arnd Bergmann Date: Thu, 05 May 2016 20:09:19 +0200 > For reference, I've tried it out on the MLX4 driver, and it does > seem nicer that way, see below. Is it possible to wind down this conversation and have someone submit whatever final patch everyone agrees to? Thanks.
[PATCH] e1000e: prevent division by zero if TIMINCA is zero
Users report that under VMWare, er32(TIMINCA) returns zero. This causes division by zero at init time as follows: ==>incvalue = er32(TIMINCA) & E1000_TIMINCA_INCVALUE_MASK; for (i = 0; i < E1000_MAX_82574_SYSTIM_REREADS; i++) { /* latch SYSTIMH on read of SYSTIML */ systim_next = (cycle_t)er32(SYSTIML); systim_next |= (cycle_t)er32(SYSTIMH) << 32; time_delta = systim_next - systim; temp = time_delta; > rem = do_div(temp, incvalue); This change makes the kernel survive this, and users report that the NIC does work after this change. Since on real hardware incvalue is never zero, this should not affect the real hardware use-case. Signed-off-by: Denys Vlasenko CC: Jeff Kirsher CC: "Ruinskiy, Dima" CC: intel-wired-...@lists.osuosl.org CC: netdev@vger.kernel.org CC: LKML --- drivers/net/ethernet/intel/e1000e/netdev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index 269087c..0626935 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -4315,7 +4315,8 @@ static cycle_t e1000e_cyclecounter_read(const struct cyclecounter *cc) time_delta = systim_next - systim; temp = time_delta; - rem = do_div(temp, incvalue); + /* VMWare users have seen incvalue of zero, don't div / 0 */ + rem = incvalue ? do_div(temp, incvalue) : (time_delta != 0); systim = systim_next; -- 1.8.1.4
Re: [PATCH net] netfilter: nf_conntrack: Use net_mutex for helper unregistration.
On 6 May 2016 at 04:03, Pablo Neira Ayuso wrote: > Hi Joe, > > On Thu, May 05, 2016 at 03:50:37PM -0700, Joe Stringer wrote: >> diff --git a/net/netfilter/nf_conntrack_helper.c >> b/net/netfilter/nf_conntrack_helper.c >> index 3b40ec575cd5..6860b19be406 100644 >> --- a/net/netfilter/nf_conntrack_helper.c >> +++ b/net/netfilter/nf_conntrack_helper.c >> @@ -449,10 +449,10 @@ void nf_conntrack_helper_unregister(struct >> nf_conntrack_helper *me) >>*/ >> synchronize_rcu(); >> >> - rtnl_lock(); >> + mutex_lock(&net_mutex); >> for_each_net(net) >> __nf_conntrack_helper_unregister(me, net); >> - rtnl_unlock(); >> + mutex_unlock(&net_mutex); > > This simple solution works because we have no .exit callbacks in any > of our helpers. Otherwise, the helper code may be already gone by the time > the worker has a chance to run to release the netns. I'm open to any alternative solutions, but if helper code isn't doing this yet then perhaps this fix is sufficient? > If so, probably I can append this as a comment to this function so we > don't forget. If we ever have .exit callbacks (I don't expect so), we > would need to wait for worker completion. Sounds reasonable to me. I see there's a bunch of other unregister locations like nf_nat_l3proto_clean(), nf_nat_l4proto_clean(), nf_unregister_hook() which might need similar treatment?
Re: [PATCH net-next 1/2] sfc: Support setting rss_cpus to 'cores', 'packages' or 'hyperthreads'
From: Edward Cree Date: Wed, 4 May 2016 18:01:52 +0100 > These settings autoconfigure the number of RSS channels to match the number of > CPUs present. > > Signed-off-by: Edward Cree I can't believe I allowed this 'rss_cpus' thing into the tree to begin with. It's completely wrong and is exactly the kind of thing we are trying to actively avoid in network drivers. If another network driver wants to provide the same facility they will add a module parameter with a slightly different name, a different set of valid choices, and different semantics. Define a proper global, stable, tree-wide mechanism to configure these kinds of things and use that instead. Thanks.
Re: [PATCH net v3] vlan: Propagate MAC address to VLANs
On 05/06/2016 06:02 PM, Alexander Duyck wrote: > On Fri, May 6, 2016 at 6:26 AM, Mike Manning wrote: >> The MAC address of the physical interface is only copied to the VLAN >> when it is first created, resulting in an inconsistency after MAC >> address changes of only newly created VLANs having an up-to-date MAC. >> >> The VLANs should continue inheriting the MAC address of the physical >> interface, unless explicitly changed to be different from this. >> This allows IPv6 EUI64 addresses for the VLAN to reflect any changes >> to the MAC of the physical interface and thus for DAD to behave as >> expected. >> >> Signed-off-by: Mike Manning >> --- >> include/linux/if_vlan.h |2 ++ >> net/8021q/vlan.c| 17 +++-- >> net/8021q/vlan_dev.c| 13 ++--- >> 3 files changed, 23 insertions(+), 9 deletions(-) >> >> --- a/include/linux/if_vlan.h >> +++ b/include/linux/if_vlan.h >> @@ -138,6 +138,7 @@ struct netpoll; >> * @flags: device flags >> * @real_dev: underlying netdevice >> * @real_dev_addr: address of underlying netdevice >> + * @addr_assign_type: address assignment type >> * @dent: proc dir entry >> * @vlan_pcpu_stats: ptr to percpu rx stats >> */ >> @@ -153,6 +154,7 @@ struct vlan_dev_priv { >> >> struct net_device *real_dev; >> unsigned char real_dev_addr[ETH_ALEN]; >> + unsigned char addr_assign_type; >> >> struct proc_dir_entry *dent; >> struct vlan_pcpu_stats __percpu *vlan_pcpu_stats; > > Please don't start adding new members to structures when it already > exists in the net_device. If anything you should be able to drop > read_dev_addr if you do this correctly because you shouldn't need to > clone the lower dev address to watch for changes. All you will need > to do is watch NET_ADDR_STOLEN. > Thanks for the detailed review. I had initially used the existing type in net_device, but the problem with this was that it got overwritten to NET_ADDR_SET in dev_set_mac_address(), which I was reluctant to modify. 
It would just be a case of setting the type earlier in that function (and caching the previous value in case there is an error). However, based on your later comment, it seems I should not bother with the approach I have here, namely that if the VLAN MAC is set to the same value as that of the lower device MAC, that is to be considered as resetting it and thus for MAC inheritance to resume. Instead, I will just make this a 1-shot transition, i.e. the VLAN MAC starts off as inherited, and if it is set to anything (even the value of the lower device MAC), inheritance is stopped. I agree this makes for a far simpler changeset. I don't think I can remove real_dev_addr, as that is still needed for the existing functionality in vlan_sync_address() to determine if the sync should be done, also as a way of caching it for handling in vlan_dev_open(). As a matter of interest, what is the advantage of not updating the VLAN MAC when it is down? I appreciate that one should not add/delete secondary unicast addresses in this case, but there is no such restriction for copying the MAC. 
>> --- a/net/8021q/vlan.c >> +++ b/net/8021q/vlan.c >> @@ -291,6 +291,15 @@ static void vlan_sync_address(struct net >> if (ether_addr_equal(vlan->real_dev_addr, dev->dev_addr)) >> return; >> >> + /* vlan continues to inherit address of parent interface */ >> + if (vlan->addr_assign_type == NET_ADDR_STOLEN) { >> + ether_addr_copy(vlandev->dev_addr, dev->dev_addr); >> + goto out; >> + } >> + >> + if (!(vlandev->flags & IFF_UP)) >> + goto out; >> + >> /* vlan address was different from the old address and is equal to >> * the new address */ >> if (!ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) && >> @@ -303,6 +312,7 @@ static void vlan_sync_address(struct net >> !ether_addr_equal(vlandev->dev_addr, dev->dev_addr)) >> dev_uc_add(dev, vlandev->dev_addr); >> >> +out: >> ether_addr_copy(vlan->real_dev_addr, dev->dev_addr); >> } >> >> @@ -389,13 +399,8 @@ static int vlan_device_event(struct noti >> >> case NETDEV_CHANGEADDR: >> /* Adjust unicast filters on underlying device */ >> - vlan_group_for_each_dev(grp, i, vlandev) { >> - flgs = vlandev->flags; >> - if (!(flgs & IFF_UP)) >> - continue; >> - >> + vlan_group_for_each_dev(grp, i, vlandev) >> vlan_sync_address(dev, vlandev); >> - } >> break; >> >> case NETDEV_CHANGEMTU: > > So all of this is far more complicated than it needs to be. If > NET_ADDR_STOLEN is set you have to follow the lower device MAC > address, otherwise you
Re: [PATCH net v3 2/2] udp_offload: Set encapsulation before inner completes.
From: Jarno Rajahalme Date: Tue, 3 May 2016 16:10:21 -0700 > UDP tunnel segmentation code relies on the inner offsets being set for > an UDP tunnel GSO packet, but the inner *_complete() functions will > set the inner offsets only if 'encapsulation' is set before calling > them. Currently, udp_gro_complete() sets 'encapsulation' only after > the inner *_complete() functions are done. This causes the inner > offsets having invalid values after udp_gro_complete() returns, which > in turn will make it impossible to properly segment the packet in case > it needs to be forwarded, which would be visible to the user either as > invalid packets being sent or as packet loss. > > This patch fixes this by setting skb's 'encapsulation' in > udp_gro_complete() before calling into the inner complete functions, > and by making each possible UDP tunnel gro_complete() callback set the > inner_mac_header to the beginning of the tunnel payload. > > Signed-off-by: Jarno Rajahalme > --- > v3: Added setting inner_mac_header from all possible callbacks to cover > cases where there is no inner mac header. Alex and Tom, can you please review this new version since you guys had so much feedback for v2? Thanks.
Re: OpenWRT wrong adjustment of fq_codel defaults (Was: [Codel] fq_codel_drop vs a udp flood)
On 6 May 2016 at 21:43, Roman Yeryomin wrote: > On 6 May 2016 at 15:47, Jesper Dangaard Brouer wrote: >> >> I've created a OpenWRT ticket[1] on this issue, as it seems that someone[2] >> closed Felix'es OpenWRT email account (bad choice! emails bouncing). >> Sounds like OpenWRT and the LEDE https://www.lede-project.org/ project >> is in some kind of conflict. >> >> OpenWRT ticket [1] https://dev.openwrt.org/ticket/22349 >> >> [2] >> http://thread.gmane.org/gmane.comp.embedded.openwrt.devel/40298/focus=40335 > > OK, so, after porting the patch to 4.1 openwrt kernel and playing a > bit with fq_codel limits I was able to get 420Mbps UDP like this: > tc qdisc replace dev wlan0 parent :1 fq_codel flows 16 limit 256 Forgot to mention, I've reduced drop_batch_size down to 32 > This is certainly better than 30Mbps but still more than two times > less than before (900). > TCP also improved a little (550 to ~590). > > Felix, others, do you want to see the ported patch, maybe I did something > wrong? > Doesn't look like it will save ath10k from performance regression. > >> >> On Fri, 6 May 2016 11:42:43 +0200 >> Jesper Dangaard Brouer wrote: >> >>> Hi Felix, >>> >>> This is an important fix for OpenWRT, please read! >>> >>> OpenWRT changed the default fq_codel sch->limit from 10240 to 1024, >>> without also adjusting q->flows_cnt. Eric explains below that you must >>> also adjust the buckets (q->flows_cnt) for this not to break. 
(Just >>> adjust it to 128) >>> >>> Problematic OpenWRT commit in question: >>> http://git.openwrt.org/?p=openwrt.git;a=patch;h=12cd6578084e >>> 12cd6578084e ("kernel: revert fq_codel quantum override to prevent it from >>> causing too much cpu load with higher speed (#21326)") >>> >>> >>> I also highly recommend you cherry-pick this very recent commit: >>> net-next: 9d18562a2278 ("fq_codel: add batch ability to fq_codel_drop()") >>> https://git.kernel.org/davem/net-next/c/9d18562a227 >>> >>> This should fix very high CPU usage in-case fq_codel goes into drop mode. >>> The problem is that drop mode was considered rare, and implementation >>> wise it was chosen to be more expensive (to save cycles on normal mode). >>> Unfortunately is it easy to trigger with an UDP flood. Drop mode is >>> especially expensive for smaller devices, as it scans a 4K big array, >>> thus 64 cache misses for small devices! >>> >>> The fix is to allow drop-mode to bulk-drop more packets when entering >>> drop-mode (default 64 bulk drop). That way we don't suddenly >>> experience a significantly higher processing cost per packet, but >>> instead can amortize this. >>> >>> To Eric, should we recommend OpenWRT to adjust default (max) 64 bulk >>> drop, given we also recommend bucket size to be 128 ? (thus the amount >>> of memory to scan is less, but their CPU is also much smaller). 
>>> >>> --Jesper >>> >>> >>> On Thu, 05 May 2016 12:23:27 -0700 Eric Dumazet >>> wrote: >>> >>> > On Thu, 2016-05-05 at 19:25 +0300, Roman Yeryomin wrote: >>> > > On 5 May 2016 at 19:12, Eric Dumazet wrote: >>> > > > On Thu, 2016-05-05 at 17:53 +0300, Roman Yeryomin wrote: >>> > > > >>> > > >> >>> > > >> qdisc fq_codel 0: dev eth0 root refcnt 2 limit 1024p flows 1024 >>> > > >> quantum 1514 target 5.0ms interval 100.0ms ecn >>> > > >> Sent 12306 bytes 128 pkt (dropped 0, overlimits 0 requeues 0) >>> > > >> backlog 0b 0p requeues 0 >>> > > >> maxpacket 0 drop_overlimit 0 new_flow_count 0 ecn_mark 0 >>> > > >> new_flows_len 0 old_flows_len 0 >>> > > > >>> > > > >>> > > > Limit of 1024 packets and 1024 flows is not wise I think. >>> > > > >>> > > > (If all buckets are in use, each bucket has a virtual queue of 1 >>> > > > packet, >>> > > > which is almost the same than having no queue at all) >>> > > > >>> > > > I suggest to have at least 8 packets per bucket, to let Codel have a >>> > > > chance to trigger. >>> > > > >>> > > > So you could either reduce number of buckets to 128 (if memory is >>> > > > tight), or increase limit to 8192. >>> > > >>> > > Will try, but what I've posted is default, I didn't change/configure >>> > > that. >>> > >>> > fq_codel has a default of 10240 packets and 1024 buckets. >>> > >>> > http://lxr.free-electrons.com/source/net/sched/sch_fq_codel.c#L413 >>> > >>> > If someone changed that in the linux variant you use, he probably should >>> > explain the rationale. >> >> -- >> Best regards, >> Jesper Dangaard Brouer >> MSc.CS, Principal Kernel Engineer at Red Hat >> Author of http://www.iptv-analyzer.org >> LinkedIn: http://www.linkedin.com/in/brouer
Re: [PATCH] Add support for configuring Infiniband GUIDs
On Fri, 6 May 2016 10:43:25 -0500 Eli Cohen wrote: > Add two NLA's that allow configuration of Infiniband node or port GUIDs > by referencing the IPoIB net device set over the physical function. The > format to be used is as follows: > > ip link set dev ib0 vf 0 node_guid 00:02:c9:03:00:21:6e:70 > ip link set dev ib0 vf 0 port_guid 00:02:c9:03:00:21:6e:78 > > Issue: 702759 > Change-Id: I5ffb54d6de7bfa8650bf5818f484279914991d6e > Signed-off-by: Eli Cohen I am not that familiar with Infiniband, but the documentation seems to use a non-colon form: # ip link set dev ib0 vf 0 node_guid 0002c90300216e70 Seems like ip should follow the lead of ibstat and friends.
Re: [PATCH iproute2 0/2] ip link gre: fix external mode handling
On Wed, 27 Apr 2016 16:11:12 +0200 Jiri Benc wrote: > Fix two bugs with handling of the 'external' keyword for GRE. > > Jiri Benc (2): > ip link gre: create interfaces in external mode correctly > ip link gre: print only relevant info in external mode > > ip/link_gre.c | 43 +-- > 1 file changed, 25 insertions(+), 18 deletions(-) > Applied
Re: OpenWRT wrong adjustment of fq_codel defaults (Was: [Codel] fq_codel_drop vs a udp flood)
On 6 May 2016 at 15:47, Jesper Dangaard Brouer wrote: > > I've created a OpenWRT ticket[1] on this issue, as it seems that someone[2] > closed Felix'es OpenWRT email account (bad choice! emails bouncing). > Sounds like OpenWRT and the LEDE https://www.lede-project.org/ project > is in some kind of conflict. > > OpenWRT ticket [1] https://dev.openwrt.org/ticket/22349 > > [2] > http://thread.gmane.org/gmane.comp.embedded.openwrt.devel/40298/focus=40335 OK, so, after porting the patch to 4.1 openwrt kernel and playing a bit with fq_codel limits I was able to get 420Mbps UDP like this: tc qdisc replace dev wlan0 parent :1 fq_codel flows 16 limit 256 This is certainly better than 30Mbps but still more than two times less than before (900). TCP also improved a little (550 to ~590). Felix, others, do you want to see the ported patch, maybe I did something wrong? Doesn't look like it will save ath10k from performance regression. > > On Fri, 6 May 2016 11:42:43 +0200 > Jesper Dangaard Brouer wrote: > >> Hi Felix, >> >> This is an important fix for OpenWRT, please read! >> >> OpenWRT changed the default fq_codel sch->limit from 10240 to 1024, >> without also adjusting q->flows_cnt. Eric explains below that you must >> also adjust the buckets (q->flows_cnt) for this not to break. (Just >> adjust it to 128) >> >> Problematic OpenWRT commit in question: >> http://git.openwrt.org/?p=openwrt.git;a=patch;h=12cd6578084e >> 12cd6578084e ("kernel: revert fq_codel quantum override to prevent it from >> causing too much cpu load with higher speed (#21326)") >> >> >> I also highly recommend you cherry-pick this very recent commit: >> net-next: 9d18562a2278 ("fq_codel: add batch ability to fq_codel_drop()") >> https://git.kernel.org/davem/net-next/c/9d18562a227 >> >> This should fix very high CPU usage in-case fq_codel goes into drop mode. >> The problem is that drop mode was considered rare, and implementation >> wise it was chosen to be more expensive (to save cycles on normal mode). 
>> Unfortunately is it easy to trigger with an UDP flood. Drop mode is >> especially expensive for smaller devices, as it scans a 4K big array, >> thus 64 cache misses for small devices! >> >> The fix is to allow drop-mode to bulk-drop more packets when entering >> drop-mode (default 64 bulk drop). That way we don't suddenly >> experience a significantly higher processing cost per packet, but >> instead can amortize this. >> >> To Eric, should we recommend OpenWRT to adjust default (max) 64 bulk >> drop, given we also recommend bucket size to be 128 ? (thus the amount >> of memory to scan is less, but their CPU is also much smaller). >> >> --Jesper >> >> >> On Thu, 05 May 2016 12:23:27 -0700 Eric Dumazet >> wrote: >> >> > On Thu, 2016-05-05 at 19:25 +0300, Roman Yeryomin wrote: >> > > On 5 May 2016 at 19:12, Eric Dumazet wrote: >> > > > On Thu, 2016-05-05 at 17:53 +0300, Roman Yeryomin wrote: >> > > > >> > > >> >> > > >> qdisc fq_codel 0: dev eth0 root refcnt 2 limit 1024p flows 1024 >> > > >> quantum 1514 target 5.0ms interval 100.0ms ecn >> > > >> Sent 12306 bytes 128 pkt (dropped 0, overlimits 0 requeues 0) >> > > >> backlog 0b 0p requeues 0 >> > > >> maxpacket 0 drop_overlimit 0 new_flow_count 0 ecn_mark 0 >> > > >> new_flows_len 0 old_flows_len 0 >> > > > >> > > > >> > > > Limit of 1024 packets and 1024 flows is not wise I think. >> > > > >> > > > (If all buckets are in use, each bucket has a virtual queue of 1 >> > > > packet, >> > > > which is almost the same than having no queue at all) >> > > > >> > > > I suggest to have at least 8 packets per bucket, to let Codel have a >> > > > chance to trigger. >> > > > >> > > > So you could either reduce number of buckets to 128 (if memory is >> > > > tight), or increase limit to 8192. >> > > >> > > Will try, but what I've posted is default, I didn't change/configure >> > > that. >> > >> > fq_codel has a default of 10240 packets and 1024 buckets. 
>> > >> > http://lxr.free-electrons.com/source/net/sched/sch_fq_codel.c#L413 >> > >> > If someone changed that in the linux variant you use, he probably should >> > explain the rationale. > > -- > Best regards, > Jesper Dangaard Brouer > MSc.CS, Principal Kernel Engineer at Red Hat > Author of http://www.iptv-analyzer.org > LinkedIn: http://www.linkedin.com/in/brouer
Re: [REGRESSION] asix: Lots of asix_rx_fixup() errors and slow transmissions
On Tue, May 3, 2016 at 2:16 PM, Dean Jenkins wrote: > A good test would be to run "ping -c 1 -s $packet_length $ip_address" inside > a script which has a loop with an increasing payload length $packet_length > with a small delay between ping calls. This will show whether particular > packet sizes trigger the failures. > > Then try with "ping -f -c 200 -s $packet_length $ip_address" to load up the > USB link. I've tried both of these on my x86_64 system. I can send single pings up to 65507 without triggering the issue (after which I get errors sending on the host side as I think I cross a 64k boundary with headers, not the asix errors). Then when I try ping -f -c 200 -s 65507 $ip_address, I don't see any failures. I did it for a count of 2000 as well without any issues. I'll be adding more debug prints in soon. thanks -john
Re: [PATCH] netdev: enc28j60 kernel panic fix.
I kind of thought my patch was at best incomplete. When you state this change silences the bug but does not fix it, what are the implications of systems running this patch? We have some production systems using this patch. They reboot daily, but have been solid. In addition, if we sent you a pi and the ethernet controller and a small but reasonable sum of money for your labor, would you be able to properly fix it? Short of that, do you have any recommendations on quick overviews of the networking stack in the kernel and then documentation on the various flags and such? Thanks. -David Russell APRS World, LLC http://www.aprsworld.com/ On Thu, May 5, 2016 at 3:51 AM, Francois Romieu wrote: > David Russell : >> When connected directly to another system (not via a switch) >> eventually a condition where a NULL pointer dereference occurs in >> enc28j60_hw_tx() and this patch simply checks for that condition and >> returns gracefully without causing a kernel panic. I believe, but >> have not investigated this is caused by a packet collision and am not >> sure if the kernel tracks collisions or counts them as errors, so that >> should probably be added if this is what's happening. I'm also not >> familiar with the linux kernel, so may have fixed this in a less than >> ideal way. > > Is it possible for EIR.EIR_TXERIF and EIR.EIR_TXIF to be set for the > same packet ? 
> > If so the driver is intrinsically racy: > - EIR.EIR_TXIF completes transmission, clears tx_skb and enables queueing > again (see netif_wake_queue in enc28j60_tx_clear) > > - insert start_xmit here: tx_skb is set and enc28j60_hw_tx is scheduled > for late execution (user context work) > > - EIR.EIR_EIR.EIR_TXERIF issues same enc28j60_tx_clear and clears tx_skb > > - enc28j60_hw_tx is run but tx_skb is NULL > >> diff --git a/drivers/net/ethernet/microchip/enc28j60.c >> b/drivers/net/ethernet/microchip/enc28j60.c >> index 86ea17e..36ac65f 100644 >> --- a/drivers/net/ethernet/microchip/enc28j60.c >> +++ b/drivers/net/ethernet/microchip/enc28j60.c >> @@ -1233,6 +1233,9 @@ static void enc28j60_irq_work_handler(struct >> work_struct *work) >> */ >> static void enc28j60_hw_tx(struct enc28j60_net *priv) >> { >> + if (!priv->tx_skb) >> + return; >> + >> if (netif_msg_tx_queued(priv)) >> printk(KERN_DEBUG DRV_NAME >> ": Tx Packet Len:%d\n", priv->tx_skb->len); > > enc28j60_hw_tx isn't the culprit. It's the victim. > > This change silences the bug but it does not fix it at all. > > -- > Ueimor
Re: [PATCH v2] rtlwifi: pci: use dev_kfree_skb_irq instead of kfree_skb in rtl_pci_reset_trx_ring
On Fri, May 6, 2016 at 11:01 AM, Larry Finger wrote: > On 05/06/2016 12:13 PM, Alexander Duyck wrote: >> >> On Fri, May 6, 2016 at 9:33 AM, Wang YanQing wrote: >>> >>> We can't use kfree_skb in irq disable context, because spin_lock_irqsave >>> make sure we are always in irq disable context, use dev_kfree_skb_irq >>> instead of kfree_skb is better than dev_kfree_skb_any. >>> >>> This patch fix below kernel warning: >>> [ 7612.095528] [ cut here ] >>> [ 7612.095546] WARNING: CPU: 3 PID: 4460 at kernel/softirq.c:150 >>> __local_bh_enable_ip+0x58/0x80() >>> [ 7612.095550] Modules linked in: rtl8723be x86_pkg_temp_thermal >>> btcoexist rtl_pci rtlwifi rtl8723_common >>> [ 7612.095567] CPU: 3 PID: 4460 Comm: ifconfig Tainted: GW >>> 4.4.0+ #4 >>> [ 7612.095570] Hardware name: LENOVO 20DFA04FCD/20DFA04FCD, BIOS J5ET48WW >>> (1.19 ) 08/27/2015 >>> [ 7612.095574] da37fc70 c12ce7c5 da37fca0 >>> c104cc59 c19d4454 >>> [ 7612.095584] 0003 116c c19d4784 0096 c10508a8 c10508a8 >>> 0200 c1b42400 >>> [ 7612.095594] f29be780 da37fcb0 c104ccad 0009 da37fcbc >>> c10508a8 f21f08b8 >>> [ 7612.095604] Call Trace: >>> [ 7612.095614] [] dump_stack+0x41/0x5c >>> [ 7612.095620] [] warn_slowpath_common+0x89/0xc0 >>> [ 7612.095628] [] ? __local_bh_enable_ip+0x58/0x80 >>> [ 7612.095634] [] ? __local_bh_enable_ip+0x58/0x80 >>> [ 7612.095640] [] warn_slowpath_null+0x1d/0x20 >>> [ 7612.095646] [] __local_bh_enable_ip+0x58/0x80 >>> [ 7612.095653] [] destroy_conntrack+0x64/0xa0 >>> [ 7612.095660] [] nf_conntrack_destroy+0xf/0x20 >>> [ 7612.095665] [] skb_release_head_state+0x55/0xa0 >>> [ 7612.095670] [] skb_release_all+0xb/0x20 >>> [ 7612.095674] [] __kfree_skb+0xb/0x60 >>> [ 7612.095679] [] kfree_skb+0x30/0x70 >>> [ 7612.095686] [] ? 
rtl_pci_reset_trx_ring+0x22d/0x370 >>> [rtl_pci] >>> [ 7612.095692] [] rtl_pci_reset_trx_ring+0x22d/0x370 [rtl_pci] >>> [ 7612.095698] [] rtl_pci_start+0x19/0x190 [rtl_pci] >>> [ 7612.095705] [] rtl_op_start+0x56/0x90 [rtlwifi] >>> [ 7612.095712] [] drv_start+0x36/0xc0 >>> [ 7612.095717] [] ieee80211_do_open+0x2d3/0x890 >>> [ 7612.095725] [] ? call_netdevice_notifiers_info+0x2e/0x60 >>> [ 7612.095730] [] ieee80211_open+0x4d/0x50 >>> [ 7612.095736] [] __dev_open+0xa3/0x130 >>> [ 7612.095742] [] ? _raw_spin_unlock_bh+0x13/0x20 >>> [ 7612.095748] [] __dev_change_flags+0x89/0x140 >>> [ 7612.095753] [] ? selinux_capable+0xd/0x10 >>> [ 7612.095759] [] dev_change_flags+0x29/0x60 >>> [ 7612.095765] [] devinet_ioctl+0x553/0x670 >>> [ 7612.095772] [] ? _copy_to_user+0x28/0x40 >>> [ 7612.095777] [] inet_ioctl+0x85/0xb0 >>> [ 7612.095783] [] sock_ioctl+0x67/0x260 >>> [ 7612.095788] [] ? sock_fasync+0x80/0x80 >>> [ 7612.095795] [] do_vfs_ioctl+0x6b/0x550 >>> [ 7612.095800] [] ? selinux_file_ioctl+0x102/0x1e0 >>> [ 7612.095807] [] ? timekeeping_suspend+0x294/0x320 >>> [ 7612.095813] [] ? __hrtimer_run_queues+0x14a/0x210 >>> [ 7612.095820] [] ? security_file_ioctl+0x34/0x50 >>> [ 7612.095827] [] SyS_ioctl+0x70/0x80 >>> [ 7612.095832] [] do_fast_syscall_32+0x84/0x120 >>> [ 7612.095839] [] sysenter_past_esp+0x36/0x55 >>> [ 7612.095844] ---[ end trace 97e9c637a20e8348 ]--- >>> >>> Signed-off-by: Wang YanQing >>> Cc: Stable >>> --- >>> Changes: >>> v1-v2: >>> 1: add a Cc to stable. 
>>> >>> drivers/net/wireless/realtek/rtlwifi/pci.c | 2 +- >>> 1 file changed, 1 insertion(+), 1 deletion(-) >>> >>> diff --git a/drivers/net/wireless/realtek/rtlwifi/pci.c >>> b/drivers/net/wireless/realtek/rtlwifi/pci.c >>> index 1ac41b8..99a3a03 100644 >>> --- a/drivers/net/wireless/realtek/rtlwifi/pci.c >>> +++ b/drivers/net/wireless/realtek/rtlwifi/pci.c >>> @@ -1572,7 +1572,7 @@ int rtl_pci_reset_trx_ring(struct ieee80211_hw *hw) >>> true, >>> >>> HW_DESC_TXBUFF_ADDR), >>> skb->len, >>> PCI_DMA_TODEVICE); >>> - kfree_skb(skb); >>> + dev_kfree_skb_irq(skb); >>> ring->idx = (ring->idx + 1) % >>> ring->entries; >>> } >>> ring->idx = 0; >> >> >> Is this always called in IRQ context? You might be better off using >> dev_kfree_skb_any instead if this is something that can be called from >> net_device_ops since that way you avoid having to call into the Tx >> softirq cleanup routine to free the buffers later unless you really >> need it. >> >> - Alex >> > > Alex, > > Six lines below the change is a spin_unlock_irqrestore(), which is always > called. I believe that the patch is correct. Okay. That works then. Thanks. - Alex
Re: [PATCH v2] rtlwifi: pci: use dev_kfree_skb_irq instead of kfree_skb in rtl_pci_reset_trx_ring
On 05/06/2016 12:13 PM, Alexander Duyck wrote: On Fri, May 6, 2016 at 9:33 AM, Wang YanQing wrote: We can't use kfree_skb in irq disable context, because spin_lock_irqsave make sure we are always in irq disable context, use dev_kfree_skb_irq instead of kfree_skb is better than dev_kfree_skb_any. This patch fix below kernel warning: [ 7612.095528] [ cut here ] [ 7612.095546] WARNING: CPU: 3 PID: 4460 at kernel/softirq.c:150 __local_bh_enable_ip+0x58/0x80() [ 7612.095550] Modules linked in: rtl8723be x86_pkg_temp_thermal btcoexist rtl_pci rtlwifi rtl8723_common [ 7612.095567] CPU: 3 PID: 4460 Comm: ifconfig Tainted: GW 4.4.0+ #4 [ 7612.095570] Hardware name: LENOVO 20DFA04FCD/20DFA04FCD, BIOS J5ET48WW (1.19 ) 08/27/2015 [ 7612.095574] da37fc70 c12ce7c5 da37fca0 c104cc59 c19d4454 [ 7612.095584] 0003 116c c19d4784 0096 c10508a8 c10508a8 0200 c1b42400 [ 7612.095594] f29be780 da37fcb0 c104ccad 0009 da37fcbc c10508a8 f21f08b8 [ 7612.095604] Call Trace: [ 7612.095614] [] dump_stack+0x41/0x5c [ 7612.095620] [] warn_slowpath_common+0x89/0xc0 [ 7612.095628] [] ? __local_bh_enable_ip+0x58/0x80 [ 7612.095634] [] ? __local_bh_enable_ip+0x58/0x80 [ 7612.095640] [] warn_slowpath_null+0x1d/0x20 [ 7612.095646] [] __local_bh_enable_ip+0x58/0x80 [ 7612.095653] [] destroy_conntrack+0x64/0xa0 [ 7612.095660] [] nf_conntrack_destroy+0xf/0x20 [ 7612.095665] [] skb_release_head_state+0x55/0xa0 [ 7612.095670] [] skb_release_all+0xb/0x20 [ 7612.095674] [] __kfree_skb+0xb/0x60 [ 7612.095679] [] kfree_skb+0x30/0x70 [ 7612.095686] [] ? rtl_pci_reset_trx_ring+0x22d/0x370 [rtl_pci] [ 7612.095692] [] rtl_pci_reset_trx_ring+0x22d/0x370 [rtl_pci] [ 7612.095698] [] rtl_pci_start+0x19/0x190 [rtl_pci] [ 7612.095705] [] rtl_op_start+0x56/0x90 [rtlwifi] [ 7612.095712] [] drv_start+0x36/0xc0 [ 7612.095717] [] ieee80211_do_open+0x2d3/0x890 [ 7612.095725] [] ? call_netdevice_notifiers_info+0x2e/0x60 [ 7612.095730] [] ieee80211_open+0x4d/0x50 [ 7612.095736] [] __dev_open+0xa3/0x130 [ 7612.095742] [] ? 
_raw_spin_unlock_bh+0x13/0x20 [ 7612.095748] [] __dev_change_flags+0x89/0x140 [ 7612.095753] [] ? selinux_capable+0xd/0x10 [ 7612.095759] [] dev_change_flags+0x29/0x60 [ 7612.095765] [] devinet_ioctl+0x553/0x670 [ 7612.095772] [] ? _copy_to_user+0x28/0x40 [ 7612.095777] [] inet_ioctl+0x85/0xb0 [ 7612.095783] [] sock_ioctl+0x67/0x260 [ 7612.095788] [] ? sock_fasync+0x80/0x80 [ 7612.095795] [] do_vfs_ioctl+0x6b/0x550 [ 7612.095800] [] ? selinux_file_ioctl+0x102/0x1e0 [ 7612.095807] [] ? timekeeping_suspend+0x294/0x320 [ 7612.095813] [] ? __hrtimer_run_queues+0x14a/0x210 [ 7612.095820] [] ? security_file_ioctl+0x34/0x50 [ 7612.095827] [] SyS_ioctl+0x70/0x80 [ 7612.095832] [] do_fast_syscall_32+0x84/0x120 [ 7612.095839] [] sysenter_past_esp+0x36/0x55 [ 7612.095844] ---[ end trace 97e9c637a20e8348 ]--- Signed-off-by: Wang YanQing Cc: Stable --- Changes: v1-v2: 1: add a Cc to stable. drivers/net/wireless/realtek/rtlwifi/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/realtek/rtlwifi/pci.c b/drivers/net/wireless/realtek/rtlwifi/pci.c index 1ac41b8..99a3a03 100644 --- a/drivers/net/wireless/realtek/rtlwifi/pci.c +++ b/drivers/net/wireless/realtek/rtlwifi/pci.c @@ -1572,7 +1572,7 @@ int rtl_pci_reset_trx_ring(struct ieee80211_hw *hw) true, HW_DESC_TXBUFF_ADDR), skb->len, PCI_DMA_TODEVICE); - kfree_skb(skb); + dev_kfree_skb_irq(skb); ring->idx = (ring->idx + 1) % ring->entries; } ring->idx = 0; Is this always called in IRQ context? You might be better off using dev_kfree_skb_any instead if this is something that can be called from net_device_ops since that way you avoid having to call into the Tx softirq cleanup routine to free the buffers later unless you really need it. - Alex Alex, Six lines below the change is a spin_unlock_irqrestore(), which is always called. I believe that the patch is correct. Larry
[PATCH v1 1/1] ISDN: eicon: replace custom hex_asc_lo() / hex_pack_byte()
Instead of custom approach re-use generic helpers to convert byte to hex format. Signed-off-by: Andy Shevchenko --- drivers/isdn/hardware/eicon/message.c | 21 +++-- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/drivers/isdn/hardware/eicon/message.c b/drivers/isdn/hardware/eicon/message.c index d7c2866..1a1d997 100644 --- a/drivers/isdn/hardware/eicon/message.c +++ b/drivers/isdn/hardware/eicon/message.c @@ -1147,8 +1147,6 @@ static byte test_c_ind_mask_bit(PLCI *plci, word b) static void dump_c_ind_mask(PLCI *plci) { - static char hex_digit_table[0x10] = - {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'}; word i, j, k; dword d; char *p; @@ -1165,7 +1163,7 @@ static void dump_c_ind_mask(PLCI *plci) d = plci->c_ind_mask_table[i + j]; for (k = 0; k < 8; k++) { - *(--p) = hex_digit_table[d & 0xf]; + *(--p) = hex_asc_lo(d); d >>= 4; } } @@ -10507,7 +10505,6 @@ static void mixer_set_bchannel_id(PLCI *plci, byte *chi) static void mixer_calculate_coefs(DIVA_CAPI_ADAPTER *a) { - static char hex_digit_table[0x10] = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'}; word n, i, j; char *p; char hex_line[2 * MIXER_MAX_DUMP_CHANNELS + MIXER_MAX_DUMP_CHANNELS / 8 + 4]; @@ -10690,13 +10687,13 @@ static void mixer_calculate_coefs(DIVA_CAPI_ADAPTER *a) n = li_total_channels; if (n > MIXER_MAX_DUMP_CHANNELS) n = MIXER_MAX_DUMP_CHANNELS; + p = hex_line; for (j = 0; j < n; j++) { if ((j & 0x7) == 0) *(p++) = ' '; - *(p++) = hex_digit_table[li_config_table[j].curchnl >> 4]; - *(p++) = hex_digit_table[li_config_table[j].curchnl & 0xf]; + p = hex_byte_pack(p, li_config_table[j].curchnl); } *p = '\0'; dbug(1, dprintf("[%06lx] CURRENT %s", @@ -10706,8 +10703,7 @@ static void mixer_calculate_coefs(DIVA_CAPI_ADAPTER *a) { if ((j & 0x7) == 0) *(p++) = ' '; - *(p++) = hex_digit_table[li_config_table[j].channel >> 4]; - *(p++) = hex_digit_table[li_config_table[j].channel & 0xf]; + p = hex_byte_pack(p, 
li_config_table[j].channel); } *p = '\0'; dbug(1, dprintf("[%06lx] CHANNEL %s", @@ -10717,8 +10713,7 @@ static void mixer_calculate_coefs(DIVA_CAPI_ADAPTER *a) { if ((j & 0x7) == 0) *(p++) = ' '; - *(p++) = hex_digit_table[li_config_table[j].chflags >> 4]; - *(p++) = hex_digit_table[li_config_table[j].chflags & 0xf]; + p = hex_byte_pack(p, li_config_table[j].chflags); } *p = '\0'; dbug(1, dprintf("[%06lx] CHFLAG %s", @@ -10730,8 +10725,7 @@ static void mixer_calculate_coefs(DIVA_CAPI_ADAPTER *a) { if ((j & 0x7) == 0) *(p++) = ' '; - *(p++) = hex_digit_table[li_config_table[i].flag_table[j] >> 4]; - *(p++) = hex_digit_table[li_config_table[i].flag_table[j] & 0xf]; + p = hex_byte_pack(p, li_config_table[i].flag_table[j]); } *p = '\0'; dbug(1, dprintf("[%06lx] FLAG[%02x]%s", @@ -10744,8 +10738,7 @@ static void mixer_calculate_coefs(DIVA_CAPI_ADAPTER *a) { if ((j & 0x7) == 0) *(p++) = ' '; - *(p++) = hex_digit_table[li_config_table[i].coef_table[j] >> 4]; - *(p++) = hex_digit_table[li_config_table[i].coef_table[j] & 0xf]; + p = hex_byte_pack(p, li_config_table[i].coef_table[j]); } *p = '\0'; dbug(1, dprintf("[%06lx] COEF[%02x]%s", -- 2.8.1
Re: [REGRESSION] asix: Lots of asix_rx_fixup() errors and slow transmissions
On Fri, May 6, 2016 at 8:00 AM, Dean Jenkins wrote: > My conclusion is that your USB to Ethernet Adaptor is not running at high > speed (480Mbps) mode which is causing a partial loss (corruption) of > Ethernet frames across the USB link. A USB Protocol Analyser or software > tool usbmon could be used to confirm this scenario. > > Therefore please retest with a working high-speed USB hub or remove the > full-speed USB hub from the test environment and directly connect the USB to > Ethernet Adaptor to the root hub of the USB port. Then repeat the tests to > see whether anything improved. > > In other words, you need to eliminate the dmesg messages saying "not running > at top speed; connect to a high speed hub". The aarch64 system has a quirk that at the moment limits it to the slower full-speed mode, which also exacerbates the issue (basically taking a fairly slow 1.1Mb/s network connection without your patch, to an almost unusable 30Kb/s with it). But that isn't the case on the x86_64 system, which is seeing a very similar problem (though the performance effect isn't nearly as bad, as the error rate in time seems relatively similar on both, and I think my scp transmissions are cpu bound on this atom board :). thanks -john
[PATCH v2] Documentation/networking: more accurate LCO explanation
In a few places the term "ones-complement sum" was used but the actual meaning is "the complement of the ones-complement sum". Also, avoid enclosing long statements with underscore, to ease readability. Signed-off-by: Shmulik Ladkani Acked-by: Edward Cree --- v2: - Fixed one more occurrence where "complement of" was missing - Got rid of unreadable underscore wrapped statements Took the liberty of having the underscore removal as part of this patch. Let me know if you feel this needs a patch split. Documentation/networking/checksum-offloads.txt | 14 +++--- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Documentation/networking/checksum-offloads.txt b/Documentation/networking/checksum-offloads.txt index de2a327766..56e3686124 100644 --- a/Documentation/networking/checksum-offloads.txt +++ b/Documentation/networking/checksum-offloads.txt @@ -69,18 +69,18 @@ LCO: Local Checksum Offload LCO is a technique for efficiently computing the outer checksum of an encapsulated datagram when the inner checksum is due to be offloaded. The ones-complement sum of a correctly checksummed TCP or UDP packet is - equal to the sum of the pseudo header, because everything else gets - 'cancelled out' by the checksum field. This is because the sum was + equal to the complement of the sum of the pseudo header, because everything + else gets 'cancelled out' by the checksum field. This is because the sum was complemented before being written to the checksum field. More generally, this holds in any case where the 'IP-style' ones complement checksum is used, and thus any checksum that TX Checksum Offload supports. That is, if we have set up TX Checksum Offload with a start/offset pair, we - know that _after the device has filled in that checksum_, the ones + know that after the device has filled in that checksum, the ones complement sum from csum_start to the end of the packet will be equal to - _whatever value we put in the checksum field beforehand_.
This allows us - to compute the outer checksum without looking at the payload: we simply - stop summing when we get to csum_start, then add the 16-bit word at - (csum_start + csum_offset). + the complement of whatever value we put in the checksum field beforehand. + This allows us to compute the outer checksum without looking at the payload: + we simply stop summing when we get to csum_start, then add the complement of + the 16-bit word at (csum_start + csum_offset). Then, when the true inner checksum is filled in (either by hardware or by skb_checksum_help()), the outer checksum will become correct by virtue of the arithmetic. -- 2.7.4
Re: [PATCH v2] rtlwifi: pci: use dev_kfree_skb_irq instead of kfree_skb in rtl_pci_reset_trx_ring
On Fri, May 6, 2016 at 9:33 AM, Wang YanQing wrote: > We can't use kfree_skb in irq disable context, because spin_lock_irqsave > make sure we are always in irq disable context, use dev_kfree_skb_irq > instead of kfree_skb is better than dev_kfree_skb_any. > > This patch fix below kernel warning: > [ 7612.095528] [ cut here ] > [ 7612.095546] WARNING: CPU: 3 PID: 4460 at kernel/softirq.c:150 > __local_bh_enable_ip+0x58/0x80() > [ 7612.095550] Modules linked in: rtl8723be x86_pkg_temp_thermal btcoexist > rtl_pci rtlwifi rtl8723_common > [ 7612.095567] CPU: 3 PID: 4460 Comm: ifconfig Tainted: GW > 4.4.0+ #4 > [ 7612.095570] Hardware name: LENOVO 20DFA04FCD/20DFA04FCD, BIOS J5ET48WW > (1.19 ) 08/27/2015 > [ 7612.095574] da37fc70 c12ce7c5 da37fca0 > c104cc59 c19d4454 > [ 7612.095584] 0003 116c c19d4784 0096 c10508a8 c10508a8 > 0200 c1b42400 > [ 7612.095594] f29be780 da37fcb0 c104ccad 0009 da37fcbc > c10508a8 f21f08b8 > [ 7612.095604] Call Trace: > [ 7612.095614] [] dump_stack+0x41/0x5c > [ 7612.095620] [] warn_slowpath_common+0x89/0xc0 > [ 7612.095628] [] ? __local_bh_enable_ip+0x58/0x80 > [ 7612.095634] [] ? __local_bh_enable_ip+0x58/0x80 > [ 7612.095640] [] warn_slowpath_null+0x1d/0x20 > [ 7612.095646] [] __local_bh_enable_ip+0x58/0x80 > [ 7612.095653] [] destroy_conntrack+0x64/0xa0 > [ 7612.095660] [] nf_conntrack_destroy+0xf/0x20 > [ 7612.095665] [] skb_release_head_state+0x55/0xa0 > [ 7612.095670] [] skb_release_all+0xb/0x20 > [ 7612.095674] [] __kfree_skb+0xb/0x60 > [ 7612.095679] [] kfree_skb+0x30/0x70 > [ 7612.095686] [] ? rtl_pci_reset_trx_ring+0x22d/0x370 [rtl_pci] > [ 7612.095692] [] rtl_pci_reset_trx_ring+0x22d/0x370 [rtl_pci] > [ 7612.095698] [] rtl_pci_start+0x19/0x190 [rtl_pci] > [ 7612.095705] [] rtl_op_start+0x56/0x90 [rtlwifi] > [ 7612.095712] [] drv_start+0x36/0xc0 > [ 7612.095717] [] ieee80211_do_open+0x2d3/0x890 > [ 7612.095725] [] ? 
call_netdevice_notifiers_info+0x2e/0x60 > [ 7612.095730] [] ieee80211_open+0x4d/0x50 > [ 7612.095736] [] __dev_open+0xa3/0x130 > [ 7612.095742] [] ? _raw_spin_unlock_bh+0x13/0x20 > [ 7612.095748] [] __dev_change_flags+0x89/0x140 > [ 7612.095753] [] ? selinux_capable+0xd/0x10 > [ 7612.095759] [] dev_change_flags+0x29/0x60 > [ 7612.095765] [] devinet_ioctl+0x553/0x670 > [ 7612.095772] [] ? _copy_to_user+0x28/0x40 > [ 7612.095777] [] inet_ioctl+0x85/0xb0 > [ 7612.095783] [] sock_ioctl+0x67/0x260 > [ 7612.095788] [] ? sock_fasync+0x80/0x80 > [ 7612.095795] [] do_vfs_ioctl+0x6b/0x550 > [ 7612.095800] [] ? selinux_file_ioctl+0x102/0x1e0 > [ 7612.095807] [] ? timekeeping_suspend+0x294/0x320 > [ 7612.095813] [] ? __hrtimer_run_queues+0x14a/0x210 > [ 7612.095820] [] ? security_file_ioctl+0x34/0x50 > [ 7612.095827] [] SyS_ioctl+0x70/0x80 > [ 7612.095832] [] do_fast_syscall_32+0x84/0x120 > [ 7612.095839] [] sysenter_past_esp+0x36/0x55 > [ 7612.095844] ---[ end trace 97e9c637a20e8348 ]--- > > Signed-off-by: Wang YanQing > Cc: Stable > --- > Changes: > v1-v2: > 1: add a Cc to stable. > > drivers/net/wireless/realtek/rtlwifi/pci.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/drivers/net/wireless/realtek/rtlwifi/pci.c > b/drivers/net/wireless/realtek/rtlwifi/pci.c > index 1ac41b8..99a3a03 100644 > --- a/drivers/net/wireless/realtek/rtlwifi/pci.c > +++ b/drivers/net/wireless/realtek/rtlwifi/pci.c > @@ -1572,7 +1572,7 @@ int rtl_pci_reset_trx_ring(struct ieee80211_hw *hw) > true, > HW_DESC_TXBUFF_ADDR), > skb->len, PCI_DMA_TODEVICE); > - kfree_skb(skb); > + dev_kfree_skb_irq(skb); > ring->idx = (ring->idx + 1) % ring->entries; > } > ring->idx = 0; Is this always called in IRQ context? You might be better off using dev_kfree_skb_any instead if this is something that can be called from net_device_ops since that way you avoid having to call into the Tx softirq cleanup routine to free the buffers later unless you really need it. - Alex
Re: [REGRESSION] asix: Lots of asix_rx_fixup() errors and slow transmissions
On Thu, May 5, 2016 at 1:11 AM, Dean Jenkins wrote: > On 05/05/16 00:45, John Stultz wrote: >> >> Just as a sample point, I have managed to reproduce exactly this issue >> on an x86_64 system by simply scp'ing a large file. > > Please tell us the x86_64 kernel version number that you used and which > Linux Distribution it was ? This allows other people a chance to reproduce > your observations. Sorry for being a little slow here, had some other issues I had to chase. On my x86_64 system, its Ubuntu 14.04.4, with a 4.6.0-rc2 kernel. >> [ 417.819276] asix 1-5:1.0 eth1: asix_rx_fixup() Data Header >> synchronisation was lost, remaining 988 > > It is interesting that the reported "remaining" value is 988. Is 988 always > shown ? I mean that do you see any other "remaining" values for the "Data > Header synchronisation was lost" error message ? Yep. Its always the same 988 remaining, on either architecture. >> [ 417.823415] asix 1-5:1.0 eth1: asix_rx_fixup() Bad Header Length >> 0xef830347, offset 4 > > The gap in the timestamps shows 417.823415 - 417.819276 = 0.004139 = 4ms > which is a large gap in terms of USB 2.0 high speed communications. This gap > is expected to be in the 100us range for consecutive URBs. So 4ms is > strange. > > The expectation is that the "Data Header synchronisation was lost" error > message resets the 32-bit header word synchronisation to the start of the > next URB buffer. The "Bad Header Length, offset 4" is the expected outcome > for the next URB because it is unlikely the 32-bit header word is at the > start of URB buffer due to Ethernet frames spanning across URBs. >> >> [ 417.827502] asix 1-5:1.0 eth1: asix_rx_fixup() Bad Header Length >> 0x31e2b348, offset 4 > > Timestamps show the gap to be 4ms which is strange for USB 2.0 high speed, > are you sure high speed mode is being used ? >> Yep, on my x86_64 system, it seems to be. 
[3.101115] usb 1-5: new high-speed USB device number 2 using ehci-pci [3.232309] usb 1-5: New USB device found, idVendor=0b95, idProduct=772b [3.232327] usb 1-5: New USB device strings: Mfr=1, Product=2, SerialNumber=3 [3.232339] usb 1-5: Product: AX88772B [3.232350] usb 1-5: Manufacturer: ASIX Elec. Corp. [3.232360] usb 1-5: SerialNumber: 188298 [4.032206] asix 1-5:1.0 eth1: register 'asix' at usb-:00:04.1-5, ASIX AX88772B USB 2.0 Ethernet, 00:50:b6:18:82:98 > Please can you supply the output of ifconfig for the USB to Ethernet > adaptor, your example above shows eth1 as the device. > > Please show the output of ifconfig eth1 before and after the issue is seen. > This will show us whether the kernel logs any network errors and how many > bytes have been transferred. Before: $ ifconfig eth1 eth1 Link encap:Ethernet HWaddr 00:50:b6:18:82:98 inet addr:192.168.0.12 Bcast:192.168.0.255 Mask:255.255.255.0 inet6 addr: 2601:1c2:1002:83f0:250:b6ff:fe18:8298/64 Scope:Global inet6 addr: fe80::250:b6ff:fe18:8298/64 Scope:Link inet6 addr: 2601:1c2:1002:83f0:b0f0:71a0:6c7e:346b/64 Scope:Global UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1 RX packets:372 errors:0 dropped:0 overruns:0 frame:0 TX packets:385 errors:0 dropped:0 overruns:0 carrier:0 collisions:0 txqueuelen:1000 RX bytes:38523 (38.5 KB) TX bytes:48801 (48.8 KB) After: $ ifconfig eth1 eth1 Link encap:Ethernet HWaddr 00:50:b6:18:82:98 inet addr:192.168.0.12 Bcast:192.168.0.255 Mask:255.255.255.0 inet6 addr: 2601:1c2:1002:83f0:250:b6ff:fe18:8298/64 Scope:Global inet6 addr: fe80::250:b6ff:fe18:8298/64 Scope:Link inet6 addr: 2601:1c2:1002:83f0:b0f0:71a0:6c7e:346b/64 Scope:Global UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1 RX packets:151005 errors:169 dropped:0 overruns:0 frame:0 TX packets:61351 errors:0 dropped:0 overruns:0 carrier:0 collisions:0 txqueuelen:1000 RX bytes:225874384 (225.8 MB) TX bytes:4431098 (4.4 MB) > After the issue is seen, please can you show us the output of "dmesg | grep > asix" so that we 
can see status messages from the ASIX driver that the USB > to Ethernet adaptor is using. In particular we need to check that USB high > speed operation (480Mbps) is being used and not full speed operation > (12Mbps). [2.766525] usbcore: registered new interface driver asix [4.031443] asix 1-5:1.0 eth1: register 'asix' at usb-:00:04.1-5, ASIX AX88772B USB 2.0 Ethernet, 00:50:b6:18:82:98 [ 31.578983] asix 1-5:1.0 eth1: link down [ 33.244743] asix 1-5:1.0 eth1: link up, 100Mbps, full-duplex, lpa 0xCDE1 [ 171.959244] asix 1-5:1.0 eth1: asix_rx_fixup() Data Header synchronisation was lost, remaining 988 [ 171.959530] asix 1-5:1.0 eth1: asix_rx_fixup() Bad Header Length 0x1651c2bf, offset 4 [ 171.959768] asix 1-5:1.0 eth1: asix_rx_fixup() Bad Header Length 0xfcf61092, offset 4 [ 171.960001] asix 1-5:1.0 eth1: asix_rx_fixup() Ba
Re: [PATCH] Add support for configuring Infiniband GUIDs
Hello. On 05/06/2016 06:43 PM, Eli Cohen wrote: Add two NLA's that allow configuration of Infiniband node or port GUIDs by referencing the IPoIB net device set over then physical function. The format to be used is as follows: ip link set dev ib0 vf 0 node_guid 00:02:c9:03:00:21:6e:70 ip link set dev ib0 vf 0 port_guid 00:02:c9:03:00:21:6e:78 Issue: 702759 Change-Id: I5ffb54d6de7bfa8650bf5818f484279914991d6e Signed-off-by: Eli Cohen --- ip/iplink.c | 40 man/man8/ip-link.8.in | 12 +++- 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/ip/iplink.c b/ip/iplink.c index d2e586b6d133..3f885defdfeb 100644 --- a/ip/iplink.c +++ b/ip/iplink.c @@ -237,6 +237,30 @@ struct iplink_req { charbuf[1024]; }; +static int extract_guid(__u64 *guid, char *arg) +{ + __u64 ret; + int g[8]; + int err; + + err = sscanf(arg, "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x", +g, g + 1, g + 2, g + 3, g + 4, g + 5, g + 6, g + 7); + if (err != 8) Strange name for a variable, if sscanf() returns # of fields read... In fact, you don't even need this variable. + return -1; + + ret = ((__u64)(g[0]) << 56) | + ((__u64)(g[1]) << 48) | + ((__u64)(g[2]) << 40) | + ((__u64)(g[3]) << 32) | + ((__u64)(g[4]) << 24) | + ((__u64)(g[5]) << 16) | + ((__u64)(g[6]) << 8) | + ((__u64)(g[7])); + *guid = ret; + + return 0; +} + static int iplink_parse_vf(int vf, int *argcp, char ***argvp, struct iplink_req *req, int dev_index) { [...] MBR, Sergei
Re: [PATCH v2] rtlwifi: pci: use dev_kfree_skb_irq instead of kfree_skb in rtl_pci_reset_trx_ring
On 05/06/2016 11:33 AM, Wang YanQing wrote: We can't use kfree_skb in irq disable context, because spin_lock_irqsave make sure we are always in irq disable context, use dev_kfree_skb_irq instead of kfree_skb is better than dev_kfree_skb_any. This patch fix below kernel warning: [ 7612.095528] [ cut here ] [ 7612.095546] WARNING: CPU: 3 PID: 4460 at kernel/softirq.c:150 __local_bh_enable_ip+0x58/0x80() [ 7612.095550] Modules linked in: rtl8723be x86_pkg_temp_thermal btcoexist rtl_pci rtlwifi rtl8723_common [ 7612.095567] CPU: 3 PID: 4460 Comm: ifconfig Tainted: GW 4.4.0+ #4 [ 7612.095570] Hardware name: LENOVO 20DFA04FCD/20DFA04FCD, BIOS J5ET48WW (1.19 ) 08/27/2015 [ 7612.095574] da37fc70 c12ce7c5 da37fca0 c104cc59 c19d4454 [ 7612.095584] 0003 116c c19d4784 0096 c10508a8 c10508a8 0200 c1b42400 [ 7612.095594] f29be780 da37fcb0 c104ccad 0009 da37fcbc c10508a8 f21f08b8 [ 7612.095604] Call Trace: [ 7612.095614] [] dump_stack+0x41/0x5c [ 7612.095620] [] warn_slowpath_common+0x89/0xc0 [ 7612.095628] [] ? __local_bh_enable_ip+0x58/0x80 [ 7612.095634] [] ? __local_bh_enable_ip+0x58/0x80 [ 7612.095640] [] warn_slowpath_null+0x1d/0x20 [ 7612.095646] [] __local_bh_enable_ip+0x58/0x80 [ 7612.095653] [] destroy_conntrack+0x64/0xa0 [ 7612.095660] [] nf_conntrack_destroy+0xf/0x20 [ 7612.095665] [] skb_release_head_state+0x55/0xa0 [ 7612.095670] [] skb_release_all+0xb/0x20 [ 7612.095674] [] __kfree_skb+0xb/0x60 [ 7612.095679] [] kfree_skb+0x30/0x70 [ 7612.095686] [] ? rtl_pci_reset_trx_ring+0x22d/0x370 [rtl_pci] [ 7612.095692] [] rtl_pci_reset_trx_ring+0x22d/0x370 [rtl_pci] [ 7612.095698] [] rtl_pci_start+0x19/0x190 [rtl_pci] [ 7612.095705] [] rtl_op_start+0x56/0x90 [rtlwifi] [ 7612.095712] [] drv_start+0x36/0xc0 [ 7612.095717] [] ieee80211_do_open+0x2d3/0x890 [ 7612.095725] [] ? call_netdevice_notifiers_info+0x2e/0x60 [ 7612.095730] [] ieee80211_open+0x4d/0x50 [ 7612.095736] [] __dev_open+0xa3/0x130 [ 7612.095742] [] ? 
_raw_spin_unlock_bh+0x13/0x20 [ 7612.095748] [] __dev_change_flags+0x89/0x140 [ 7612.095753] [] ? selinux_capable+0xd/0x10 [ 7612.095759] [] dev_change_flags+0x29/0x60 [ 7612.095765] [] devinet_ioctl+0x553/0x670 [ 7612.095772] [] ? _copy_to_user+0x28/0x40 [ 7612.095777] [] inet_ioctl+0x85/0xb0 [ 7612.095783] [] sock_ioctl+0x67/0x260 [ 7612.095788] [] ? sock_fasync+0x80/0x80 [ 7612.095795] [] do_vfs_ioctl+0x6b/0x550 [ 7612.095800] [] ? selinux_file_ioctl+0x102/0x1e0 [ 7612.095807] [] ? timekeeping_suspend+0x294/0x320 [ 7612.095813] [] ? __hrtimer_run_queues+0x14a/0x210 [ 7612.095820] [] ? security_file_ioctl+0x34/0x50 [ 7612.095827] [] SyS_ioctl+0x70/0x80 [ 7612.095832] [] do_fast_syscall_32+0x84/0x120 [ 7612.095839] [] sysenter_past_esp+0x36/0x55 [ 7612.095844] ---[ end trace 97e9c637a20e8348 ]--- Signed-off-by: Wang YanQing Cc: Stable --- Changes: v1-v2: 1: add a Cc to stable. drivers/net/wireless/realtek/rtlwifi/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/realtek/rtlwifi/pci.c b/drivers/net/wireless/realtek/rtlwifi/pci.c index 1ac41b8..99a3a03 100644 --- a/drivers/net/wireless/realtek/rtlwifi/pci.c +++ b/drivers/net/wireless/realtek/rtlwifi/pci.c @@ -1572,7 +1572,7 @@ int rtl_pci_reset_trx_ring(struct ieee80211_hw *hw) true, HW_DESC_TXBUFF_ADDR), skb->len, PCI_DMA_TODEVICE); - kfree_skb(skb); + dev_kfree_skb_irq(skb); ring->idx = (ring->idx + 1) % ring->entries; } ring->idx = 0; Acked-by: Larry Finger Thanks, Larry
Re: [PATCH v9 net-next 1/2] hv_sock: introduce Hyper-V Sockets
From: Dexuan Cui Date: Wed, 4 May 2016 09:56:57 -0700 > +#define VMBUS_RINGBUFFER_SIZE_HVSOCK_RECV (5 * PAGE_SIZE) > +#define VMBUS_RINGBUFFER_SIZE_HVSOCK_SEND (5 * PAGE_SIZE) > + > +#define HVSOCK_RCV_BUF_SZVMBUS_RINGBUFFER_SIZE_HVSOCK_RECV ... > +struct hvsock_sock { ... > + /* The 'hdr' and 'buf' in the below 'send' and 'recv' definitions must > + * be consecutive: see hvsock_send_data() and hvsock_recv_data(). > + */ > + struct { > + struct vmpipe_proto_header hdr; > + u8 buf[HVSOCK_SND_BUF_SZ]; > + } send; > + > + struct { > + struct vmpipe_proto_header hdr; > + u8 buf[HVSOCK_RCV_BUF_SZ]; > + > + unsigned int data_len; > + unsigned int data_offset; > + } recv; I don't think allocating 5 pages of unswappable memory for every Hyper-V socket created is reasonable.
Re: [PATCH net v3] vlan: Propagate MAC address to VLANs
On Fri, May 6, 2016 at 6:26 AM, Mike Manning wrote: > The MAC address of the physical interface is only copied to the VLAN > when it is first created, resulting in an inconsistency after MAC > address changes of only newly created VLANs having an up-to-date MAC. > > The VLANs should continue inheriting the MAC address of the physical > interface, unless explicitly changed to be different from this. > This allows IPv6 EUI64 addresses for the VLAN to reflect any changes > to the MAC of the physical interface and thus for DAD to behave as > expected. > > Signed-off-by: Mike Manning > --- > include/linux/if_vlan.h |2 ++ > net/8021q/vlan.c| 17 +++-- > net/8021q/vlan_dev.c| 13 ++--- > 3 files changed, 23 insertions(+), 9 deletions(-) > > --- a/include/linux/if_vlan.h > +++ b/include/linux/if_vlan.h > @@ -138,6 +138,7 @@ struct netpoll; > * @flags: device flags > * @real_dev: underlying netdevice > * @real_dev_addr: address of underlying netdevice > + * @addr_assign_type: address assignment type > * @dent: proc dir entry > * @vlan_pcpu_stats: ptr to percpu rx stats > */ > @@ -153,6 +154,7 @@ struct vlan_dev_priv { > > struct net_device *real_dev; > unsigned char real_dev_addr[ETH_ALEN]; > + unsigned char addr_assign_type; > > struct proc_dir_entry *dent; > struct vlan_pcpu_stats __percpu *vlan_pcpu_stats; Please don't start adding new members to structures when it already exists in the net_device. If anything you should be able to drop read_dev_addr if you do this correctly because you shouldn't need to clone the lower dev address to watch for changes. All you will need to do is watch NET_ADDR_STOLEN. 
> --- a/net/8021q/vlan.c > +++ b/net/8021q/vlan.c > @@ -291,6 +291,15 @@ static void vlan_sync_address(struct net > if (ether_addr_equal(vlan->real_dev_addr, dev->dev_addr)) > return; > > + /* vlan continues to inherit address of parent interface */ > + if (vlan->addr_assign_type == NET_ADDR_STOLEN) { > + ether_addr_copy(vlandev->dev_addr, dev->dev_addr); > + goto out; > + } > + > + if (!(vlandev->flags & IFF_UP)) > + goto out; > + > /* vlan address was different from the old address and is equal to > * the new address */ > if (!ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) && > @@ -303,6 +312,7 @@ static void vlan_sync_address(struct net > !ether_addr_equal(vlandev->dev_addr, dev->dev_addr)) > dev_uc_add(dev, vlandev->dev_addr); > > +out: > ether_addr_copy(vlan->real_dev_addr, dev->dev_addr); > } > > @@ -389,13 +399,8 @@ static int vlan_device_event(struct noti > > case NETDEV_CHANGEADDR: > /* Adjust unicast filters on underlying device */ > - vlan_group_for_each_dev(grp, i, vlandev) { > - flgs = vlandev->flags; > - if (!(flgs & IFF_UP)) > - continue; > - > + vlan_group_for_each_dev(grp, i, vlandev) > vlan_sync_address(dev, vlandev); > - } > break; > > case NETDEV_CHANGEMTU: So all of this is far more complicated than it needs to be. If NET_ADDR_STOLEN is set you have to follow the lower device MAC address, otherwise you maintain your own address and have to hold a reference to it on the lower device. You should also be able to maintain the current logic of not updating a down interface on an address change. You don't need to update a stolen MAC address until the open routine is called for the interface. 
> --- a/net/8021q/vlan_dev.c > +++ b/net/8021q/vlan_dev.c > @@ -315,17 +315,21 @@ static int vlan_dev_stop(struct net_devi > > static int vlan_dev_set_mac_address(struct net_device *dev, void *p) > { > - struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; > + struct vlan_dev_priv *vlan = vlan_dev_priv(dev); > + struct net_device *real_dev = vlan->real_dev; > struct sockaddr *addr = p; > + bool is_real_addr; > int err; > > if (!is_valid_ether_addr(addr->sa_data)) > return -EADDRNOTAVAIL; > > + is_real_addr = ether_addr_equal(addr->sa_data, real_dev->dev_addr); > + > if (!(dev->flags & IFF_UP)) > goto out; > > - if (!ether_addr_equal(addr->sa_data, real_dev->dev_addr)) { > + if (!is_real_addr) { > err = dev_uc_add(real_dev, addr->sa_data); > if (err < 0) > return err; > @@ -336,6 +340,7 @@ static int vlan_dev_set_mac_address(stru > > out: > ether_addr_copy(dev->dev_addr, addr->sa_data); > + vlan->addr_assign_type = is_real_addr ? NET_ADDR_STOLEN : > NET_ADDR_SET;
Re: [PATCHv2 net] bridge: fix igmp / mld query parsing
From: Linus Lüssing Date: Wed, 4 May 2016 17:25:02 +0200 > With the newly introduced helper functions the skb pulling is hidden > in the checksumming function - and undone before returning to the > caller. > > The IGMP and MLD query parsing functions in the bridge still > assumed that the skb is pointing to the beginning of the IGMP/MLD > message while it is now kept at the beginning of the IPv4/6 header. > > If there is a querier somewhere else, then this either causes > the multicast snooping to stay disabled even though it could be > enabled. Or, if we have the querier enabled too, then this can > create unnecessary IGMP / MLD query messages on the link. > > Fixing this by taking the offset between IP and IGMP/MLD header into > account, too. > > Fixes: 9afd85c9e455 ("net: Export IGMP/MLD message validation code") > Reported-by: Simon Wunderlich > Signed-off-by: Linus Lüssing Applied and queued up for -stable, thanks.
Re: [REGRESSION] asix: Lots of asix_rx_fixup() errors and slow transmissions
On 06/05/16 16:27, Andrew Lunn wrote: In other words, the full-speed hub is restricting the USB to Ethernet Adaptor to a 12Mbps (half-duplex) bandwidth to support Ethernet 100Mbps (full-duplex) traffic. That is not going to work very well because Ethernet frames (perhaps partial Ethernet frames) need to be discarded within the USB link. If that really is true, the design is broken. I would expect the adaptor to reliably transfer whole frames over USB, and drop whole frames from its receive queue when the USB is congested. TCP is also going to see the USB bottleneck as just like any bottleneck in the network and back off. So TCP streams should not cause major congestion on the USB link. The host's USB host controller polls the USB to Ethernet adaptor for more data. The USB to Ethernet adaptor cannot predict when the next poll request comes. The AX88772B can span Ethernet frames across multiple poll requests. This means it is possible to get a partial Ethernet frame received in the USB host controller on one poll and it is assumed that the next poll (sometime in the near future) will get the remaining part of the Ethernet frame. However, the USB to Ethernet adaptor does not contain an infinitely sized RX Ethernet buffer for the incoming Ethernet frames. I believe the USB to Ethernet adaptor is just a pipe and does not directly implement flow control for Ethernet frames so the RX buffer is going to overflow causing loss of whole Ethernet frames. I suspect the IP stack in the host computer implements flow control for Ethernet frames. Because the AX88772B can span Ethernet frames across multiple poll requests there is a risk that the designers of the device could have implemented a solution to discard the remaining part of the Ethernet frame before the next poll arrives due to the RX buffer overflowing. I don't know the algorithm used in the AX88772B but there will be loss of data due to the mismatch in bandwidths.
I agree that dropping whole Ethernet frames would be preferable to dropping partial Ethernet frames which would corrupt the data stream. My suspicion is that the URB buffers contain discontinuities in the data stream because of lost data due to insufficient bandwidth on the USB link. Going over a 12Mbps USB link should be no different to hitting an old Ethernet hub which can only do 10/Half. Not exactly, because USB is a transport link which is agnostic to the type of data that is flowing. It is up to the layers above USB to manage the data content. In other words, the USB speed needs to be higher than the Ethernet speed to avoid mismatches in bandwidth. Therefore please retest with a working high-speed USB hub or remove the full-speed USB hub from the test environment and directly connect the USB to Ethernet Adaptor to the root hub of the USB port. Then repeat the tests to see whether anything improved. In other words, you need to eliminate the dmesg messages saying "not running at top speed; connect to a high speed hub". I would also suggest testing with the Ethernet at 10/half. You should be able to use Ethtool to set that up. Your USB and Ethernet bandwidth become more equal. If you still see errors, it suggests a protocol implementation error somewhere. I agree with the suggestion but I hope USB high speed (480Mbps) operation was the intended environment rather than the useless USB full speed (12Mbps) operation. Let's hope that not using the USB hub improves things. Regards, Dean Andrew -- Dean Jenkins Embedded Software Engineer Linux Transportation Solutions Mentor Embedded Software Division Mentor Graphics (UK) Ltd.
Re: [PATCH] Documentation/networking: more accurate LCO explanation
From: Alexander Duyck Date: Fri, 6 May 2016 09:29:56 -0700 > I don't really see the point of using an underscore before and after > that statement. If it was only one or two words it might work for > emphasis but the statement is large enough that starting it with an > underscore just makes it harder to read. Agreed.
[PATCH v2] rtlwifi: pci: use dev_kfree_skb_irq instead of kfree_skb in rtl_pci_reset_trx_ring
We can't use kfree_skb in irq disable context, because spin_lock_irqsave make sure we are always in irq disable context, use dev_kfree_skb_irq instead of kfree_skb is better than dev_kfree_skb_any. This patch fix below kernel warning: [ 7612.095528] [ cut here ] [ 7612.095546] WARNING: CPU: 3 PID: 4460 at kernel/softirq.c:150 __local_bh_enable_ip+0x58/0x80() [ 7612.095550] Modules linked in: rtl8723be x86_pkg_temp_thermal btcoexist rtl_pci rtlwifi rtl8723_common [ 7612.095567] CPU: 3 PID: 4460 Comm: ifconfig Tainted: GW 4.4.0+ #4 [ 7612.095570] Hardware name: LENOVO 20DFA04FCD/20DFA04FCD, BIOS J5ET48WW (1.19 ) 08/27/2015 [ 7612.095574] da37fc70 c12ce7c5 da37fca0 c104cc59 c19d4454 [ 7612.095584] 0003 116c c19d4784 0096 c10508a8 c10508a8 0200 c1b42400 [ 7612.095594] f29be780 da37fcb0 c104ccad 0009 da37fcbc c10508a8 f21f08b8 [ 7612.095604] Call Trace: [ 7612.095614] [] dump_stack+0x41/0x5c [ 7612.095620] [] warn_slowpath_common+0x89/0xc0 [ 7612.095628] [] ? __local_bh_enable_ip+0x58/0x80 [ 7612.095634] [] ? __local_bh_enable_ip+0x58/0x80 [ 7612.095640] [] warn_slowpath_null+0x1d/0x20 [ 7612.095646] [] __local_bh_enable_ip+0x58/0x80 [ 7612.095653] [] destroy_conntrack+0x64/0xa0 [ 7612.095660] [] nf_conntrack_destroy+0xf/0x20 [ 7612.095665] [] skb_release_head_state+0x55/0xa0 [ 7612.095670] [] skb_release_all+0xb/0x20 [ 7612.095674] [] __kfree_skb+0xb/0x60 [ 7612.095679] [] kfree_skb+0x30/0x70 [ 7612.095686] [] ? rtl_pci_reset_trx_ring+0x22d/0x370 [rtl_pci] [ 7612.095692] [] rtl_pci_reset_trx_ring+0x22d/0x370 [rtl_pci] [ 7612.095698] [] rtl_pci_start+0x19/0x190 [rtl_pci] [ 7612.095705] [] rtl_op_start+0x56/0x90 [rtlwifi] [ 7612.095712] [] drv_start+0x36/0xc0 [ 7612.095717] [] ieee80211_do_open+0x2d3/0x890 [ 7612.095725] [] ? call_netdevice_notifiers_info+0x2e/0x60 [ 7612.095730] [] ieee80211_open+0x4d/0x50 [ 7612.095736] [] __dev_open+0xa3/0x130 [ 7612.095742] [] ? _raw_spin_unlock_bh+0x13/0x20 [ 7612.095748] [] __dev_change_flags+0x89/0x140 [ 7612.095753] [] ? 
selinux_capable+0xd/0x10 [ 7612.095759] [] dev_change_flags+0x29/0x60 [ 7612.095765] [] devinet_ioctl+0x553/0x670 [ 7612.095772] [] ? _copy_to_user+0x28/0x40 [ 7612.095777] [] inet_ioctl+0x85/0xb0 [ 7612.095783] [] sock_ioctl+0x67/0x260 [ 7612.095788] [] ? sock_fasync+0x80/0x80 [ 7612.095795] [] do_vfs_ioctl+0x6b/0x550 [ 7612.095800] [] ? selinux_file_ioctl+0x102/0x1e0 [ 7612.095807] [] ? timekeeping_suspend+0x294/0x320 [ 7612.095813] [] ? __hrtimer_run_queues+0x14a/0x210 [ 7612.095820] [] ? security_file_ioctl+0x34/0x50 [ 7612.095827] [] SyS_ioctl+0x70/0x80 [ 7612.095832] [] do_fast_syscall_32+0x84/0x120 [ 7612.095839] [] sysenter_past_esp+0x36/0x55 [ 7612.095844] ---[ end trace 97e9c637a20e8348 ]--- Signed-off-by: Wang YanQing Cc: Stable --- Changes: v1-v2: 1: add a Cc to stable. drivers/net/wireless/realtek/rtlwifi/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/realtek/rtlwifi/pci.c b/drivers/net/wireless/realtek/rtlwifi/pci.c index 1ac41b8..99a3a03 100644 --- a/drivers/net/wireless/realtek/rtlwifi/pci.c +++ b/drivers/net/wireless/realtek/rtlwifi/pci.c @@ -1572,7 +1572,7 @@ int rtl_pci_reset_trx_ring(struct ieee80211_hw *hw) true, HW_DESC_TXBUFF_ADDR), skb->len, PCI_DMA_TODEVICE); - kfree_skb(skb); + dev_kfree_skb_irq(skb); ring->idx = (ring->idx + 1) % ring->entries; } ring->idx = 0; -- 1.8.5.6.2.g3d8a54e.dirty
[PATCH net-next] ipv4: tcp: ip_send_unicast_reply() is not BH safe
From: Eric Dumazet I forgot that ip_send_unicast_reply() is not BH safe (yet). Disabling preemption before calling it was not a good move. Fixes: c10d9310edf5 ("tcp: do not assume TCP code is non preemptible") Signed-off-by: Eric Dumazet Reported-by: Andres Lagar-Cavilla --- net/ipv4/tcp_ipv4.c |8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a7ab9472d64560d86ea24ac1b6e1a7800f89989d..8219d0d8dc8370d0d3e6fc4cd17b4925617968ab 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -692,7 +692,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) offsetof(struct inet_timewait_sock, tw_bound_dev_if)); arg.tos = ip_hdr(skb)->tos; - preempt_disable(); + local_bh_disable(); ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, @@ -700,7 +700,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); __TCP_INC_STATS(net, TCP_MIB_OUTRSTS); - preempt_enable(); + local_bh_enable(); #ifdef CONFIG_TCP_MD5SIG out: @@ -776,14 +776,14 @@ static void tcp_v4_send_ack(struct net *net, if (oif) arg.bound_dev_if = oif; arg.tos = tos; - preempt_disable(); + local_bh_disable(); ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); __TCP_INC_STATS(net, TCP_MIB_OUTSEGS); - preempt_enable(); + local_bh_enable(); } static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
Re: [PATCH] Documentation/networking: more accurate LCO explanation
On Fri, May 6, 2016 at 8:57 AM, Shmulik Ladkani wrote: > In few places the term "ones-complement sum" was used but the actual > meaning is "the complement of the ones-complement sum". Looks like there might still be a few minor corrections needed. Comments inline below. > > Signed-off-by: Shmulik Ladkani > --- > > I assume readers interpret the term "ones-complement sum" as the sum > using one's complement arithmentic, without the final bitwise > complement of sum's result. > Hence I added "the complement of" where applicable. > > Documentation/networking/checksum-offloads.txt | 10 +- > 1 file changed, 5 insertions(+), 5 deletions(-) > > diff --git a/Documentation/networking/checksum-offloads.txt > b/Documentation/networking/checksum-offloads.txt > index de2a327766..9567200e1f 100644 > --- a/Documentation/networking/checksum-offloads.txt > +++ b/Documentation/networking/checksum-offloads.txt > @@ -69,17 +69,17 @@ LCO: Local Checksum Offload > LCO is a technique for efficiently computing the outer checksum of an > encapsulated datagram when the inner checksum is due to be offloaded. > The ones-complement sum of a correctly checksummed TCP or UDP packet is > - equal to the sum of the pseudo header, because everything else gets > - 'cancelled out' by the checksum field. This is because the sum was > + equal to the complement of the sum of the pseudo header, because everything > + else gets 'cancelled out' by the checksum field. This is because the sum > was > complemented before being written to the checksum field. > More generally, this holds in any case where the 'IP-style' ones complement > checksum is used, and thus any checksum that TX Checksum Offload supports. > That is, if we have set up TX Checksum Offload with a start/offset pair, we > know that _after the device has filled in that checksum_, the ones > complement sum from csum_start to the end of the packet will be equal to > - _whatever value we put in the checksum field beforehand_. 
This allows us > - to compute the outer checksum without looking at the payload: we simply > - stop summing when we get to csum_start, then add the 16-bit word at > + the complement of _whatever value we put in the checksum field beforehand_. I don't really see the point of using an underscore before and after that statement. If it was only one or two words it might work for emphasis but the statement is large enough that starting it with an underscore just makes it harder to read. > + This allows us to compute the outer checksum without looking at the payload: > + we simply stop summing when we get to csum_start, then add the 16-bit word > at > (csum_start + csum_offset). You don't add the 16-bit word; you add the complement of the 16-bit word.
RE: [Intel-wired-lan] NULL dereference on v4.1.x while enabling VF
Hey William, My validation hasn't been able to recreate the dereference on v4.1.x, v4.5.x or net_next. Where exactly did you place the two line script in your rc scripts? Our validation was able to run it as soon as ~14 seconds after the first boot message logged in dmesg. Is this anywhere close to where you were executing it? Likewise he has attempted running at both run level 3 and 5 in case that changed any of the times of how soon the rc scripts are executed. So the more detail you can give of setup the more it might help us recreate what you are seeing. Thanks, -Don Skidmore > -Original Message- > From: William Dauchy [mailto:wdau...@gmail.com] > Sent: Tuesday, May 03, 2016 5:33 AM > To: Skidmore, Donald C > Cc: NETDEV ; intel-wired-...@lists.osuosl.org; > Alex Duyck > Subject: Re: [Intel-wired-lan] NULL dereference on v4.1.x while enabling VF > > Hello Don, > > Thank you for your reply. > > On Mon, May 2, 2016 at 11:33 PM, Skidmore, Donald C > wrote: > > Thanks for reporting the dereference. Could you provide a little more > detail on how you created this issue? Are you just running the two > commands (ip, sriov_numvfs) in some rc script and if you put a few second > sleep in front of it you don't see the failure? > > Your understanding is correct; a rc script is run with ip and echo in numvfs > commands. I tried to reduce it to the minimum. If I put a sleep > 20 in front of it, it does not crash. I also forgot to add I did not had the > issue > in 3.14.x with the same script. > > Best, > -- > William
Re: [PATCH] Documentation/networking: more accurate LCO explanation
On 06/05/16 16:57, Shmulik Ladkani wrote: > In few places the term "ones-complement sum" was used but the actual > meaning is "the complement of the ones-complement sum". > > Signed-off-by: Shmulik Ladkani Acked-by: Edward Cree
Re: [PATCH] rtlwifi: pci: use dev_kfree_skb_irq instead of kfree_skb in rtl_pci_reset_trx_ring
On 05/05/2016 12:19 PM, Wang YanQing wrote: We can't use kfree_skb in irq disable context, because spin_lock_irqsave make sure we are always in irq disable context, use dev_kfree_skb_irq instead of kfree_skb is better than dev_kfree_skb_any. This patch fix below kernel warning: [ 7612.095528] [ cut here ] [ 7612.095546] WARNING: CPU: 3 PID: 4460 at kernel/softirq.c:150 __local_bh_enable_ip+0x58/0x80() [ 7612.095550] Modules linked in: rtl8723be x86_pkg_temp_thermal btcoexist rtl_pci rtlwifi rtl8723_common [ 7612.095567] CPU: 3 PID: 4460 Comm: ifconfig Tainted: GW 4.4.0+ #4 [ 7612.095570] Hardware name: LENOVO 20DFA04FCD/20DFA04FCD, BIOS J5ET48WW (1.19 ) 08/27/2015 [ 7612.095574] da37fc70 c12ce7c5 da37fca0 c104cc59 c19d4454 [ 7612.095584] 0003 116c c19d4784 0096 c10508a8 c10508a8 0200 c1b42400 [ 7612.095594] f29be780 da37fcb0 c104ccad 0009 da37fcbc c10508a8 f21f08b8 [ 7612.095604] Call Trace: [ 7612.095614] [] dump_stack+0x41/0x5c [ 7612.095620] [] warn_slowpath_common+0x89/0xc0 [ 7612.095628] [] ? __local_bh_enable_ip+0x58/0x80 [ 7612.095634] [] ? __local_bh_enable_ip+0x58/0x80 [ 7612.095640] [] warn_slowpath_null+0x1d/0x20 [ 7612.095646] [] __local_bh_enable_ip+0x58/0x80 [ 7612.095653] [] destroy_conntrack+0x64/0xa0 [ 7612.095660] [] nf_conntrack_destroy+0xf/0x20 [ 7612.095665] [] skb_release_head_state+0x55/0xa0 [ 7612.095670] [] skb_release_all+0xb/0x20 [ 7612.095674] [] __kfree_skb+0xb/0x60 [ 7612.095679] [] kfree_skb+0x30/0x70 [ 7612.095686] [] ? rtl_pci_reset_trx_ring+0x22d/0x370 [rtl_pci] [ 7612.095692] [] rtl_pci_reset_trx_ring+0x22d/0x370 [rtl_pci] [ 7612.095698] [] rtl_pci_start+0x19/0x190 [rtl_pci] [ 7612.095705] [] rtl_op_start+0x56/0x90 [rtlwifi] [ 7612.095712] [] drv_start+0x36/0xc0 [ 7612.095717] [] ieee80211_do_open+0x2d3/0x890 [ 7612.095725] [] ? call_netdevice_notifiers_info+0x2e/0x60 [ 7612.095730] [] ieee80211_open+0x4d/0x50 [ 7612.095736] [] __dev_open+0xa3/0x130 [ 7612.095742] [] ? 
_raw_spin_unlock_bh+0x13/0x20 [ 7612.095748] [] __dev_change_flags+0x89/0x140 [ 7612.095753] [] ? selinux_capable+0xd/0x10 [ 7612.095759] [] dev_change_flags+0x29/0x60 [ 7612.095765] [] devinet_ioctl+0x553/0x670 [ 7612.095772] [] ? _copy_to_user+0x28/0x40 [ 7612.095777] [] inet_ioctl+0x85/0xb0 [ 7612.095783] [] sock_ioctl+0x67/0x260 [ 7612.095788] [] ? sock_fasync+0x80/0x80 [ 7612.095795] [] do_vfs_ioctl+0x6b/0x550 [ 7612.095800] [] ? selinux_file_ioctl+0x102/0x1e0 [ 7612.095807] [] ? timekeeping_suspend+0x294/0x320 [ 7612.095813] [] ? __hrtimer_run_queues+0x14a/0x210 [ 7612.095820] [] ? security_file_ioctl+0x34/0x50 [ 7612.095827] [] SyS_ioctl+0x70/0x80 [ 7612.095832] [] do_fast_syscall_32+0x84/0x120 [ 7612.095839] [] sysenter_past_esp+0x36/0x55 [ 7612.095844] ---[ end trace 97e9c637a20e8348 ]--- Signed-off-by: Wang YanQing --- drivers/net/wireless/realtek/rtlwifi/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/realtek/rtlwifi/pci.c b/drivers/net/wireless/realtek/rtlwifi/pci.c index 1ac41b8..99a3a03 100644 --- a/drivers/net/wireless/realtek/rtlwifi/pci.c +++ b/drivers/net/wireless/realtek/rtlwifi/pci.c @@ -1572,7 +1572,7 @@ int rtl_pci_reset_trx_ring(struct ieee80211_hw *hw) true, HW_DESC_TXBUFF_ADDR), skb->len, PCI_DMA_TODEVICE); - kfree_skb(skb); + dev_kfree_skb_irq(skb); ring->idx = (ring->idx + 1) % ring->entries; } ring->idx = 0; After testing, this patch is OK other than needing a Cc to stable. Please fix that and resubmit V2. Larry
[PATCH] Documentation/networking: more accurate LCO explanation
In a few places the term "ones-complement sum" was used but the actual meaning is "the complement of the ones-complement sum". Signed-off-by: Shmulik Ladkani --- I assume readers interpret the term "ones-complement sum" as the sum using one's complement arithmetic, without the final bitwise complement of sum's result. Hence I added "the complement of" where applicable. Documentation/networking/checksum-offloads.txt | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Documentation/networking/checksum-offloads.txt b/Documentation/networking/checksum-offloads.txt index de2a327766..9567200e1f 100644 --- a/Documentation/networking/checksum-offloads.txt +++ b/Documentation/networking/checksum-offloads.txt @@ -69,17 +69,17 @@ LCO: Local Checksum Offload LCO is a technique for efficiently computing the outer checksum of an encapsulated datagram when the inner checksum is due to be offloaded. The ones-complement sum of a correctly checksummed TCP or UDP packet is - equal to the sum of the pseudo header, because everything else gets - 'cancelled out' by the checksum field. This is because the sum was + equal to the complement of the sum of the pseudo header, because everything + else gets 'cancelled out' by the checksum field. This is because the sum was complemented before being written to the checksum field. More generally, this holds in any case where the 'IP-style' ones complement checksum is used, and thus any checksum that TX Checksum Offload supports. That is, if we have set up TX Checksum Offload with a start/offset pair, we know that _after the device has filled in that checksum_, the ones complement sum from csum_start to the end of the packet will be equal to - _whatever value we put in the checksum field beforehand_. 
This allows us - to compute the outer checksum without looking at the payload: we simply - stop summing when we get to csum_start, then add the 16-bit word at + the complement of _whatever value we put in the checksum field beforehand_. + This allows us to compute the outer checksum without looking at the payload: + we simply stop summing when we get to csum_start, then add the 16-bit word at (csum_start + csum_offset). Then, when the true inner checksum is filled in (either by hardware or by skb_checksum_help()), the outer checksum will become correct by virtue of -- 2.7.4
[PATCH net-next] fq_codel: add memory limitation per queue
From: Eric Dumazet On small embedded routers, one wants to control maximal amount of memory used by fq_codel, instead of controlling number of packets or bytes, since GRO/TSO make these not practical. Assuming skb->truesize is accurate, we have to keep track of skb->truesize sum for skbs in queue. This patch adds a new TCA_FQ_CODEL_MEMORY_LIMIT attribute. I chose a default value of 32 MBytes, which looks reasonable even for heavy duty usages. (Prior fq_codel users should not be hurt when they upgrade their kernels) Two fields are added to tc_fq_codel_qd_stats to report : - Current memory usage - Number of drops caused by memory limits # tc qd replace dev eth1 root est 1sec 4sec fq_codel memory_limit 4M .. # tc -s -d qd sh dev eth1 qdisc fq_codel 8008: root refcnt 257 limit 10240p flows 1024 quantum 1514 target 5.0ms interval 100.0ms memory_limit 4Mb ecn Sent 2083566791363 bytes 1376214889 pkt (dropped 4994406, overlimits 0 requeues 21705223) rate 9841Mbit 812549pps backlog 3906120b 376p requeues 21705223 maxpacket 68130 drop_overlimit 4994406 new_flow_count 28855414 ecn_mark 0 memory_used 4190048 drop_overmemory 4994406 new_flows_len 1 old_flows_len 177 Signed-off-by: Eric Dumazet Cc: Jesper Dangaard Brouer Cc: Dave Täht Cc: Sebastian Möller --- include/uapi/linux/pkt_sched.h |3 +++ net/sched/sch_fq_codel.c | 27 --- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index a11afecd4482..2382eed50278 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -719,6 +719,7 @@ enum { TCA_FQ_CODEL_QUANTUM, TCA_FQ_CODEL_CE_THRESHOLD, TCA_FQ_CODEL_DROP_BATCH_SIZE, + TCA_FQ_CODEL_MEMORY_LIMIT, __TCA_FQ_CODEL_MAX }; @@ -743,6 +744,8 @@ struct tc_fq_codel_qd_stats { __u32 new_flows_len; /* count of flows in new list */ __u32 old_flows_len; /* count of flows in old list */ __u32 ce_mark;/* packets above ce_threshold */ + __u32 memory_usage; /* in bytes */ + __u32 
drop_overmemory; }; struct tc_fq_codel_cl_stats { diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index e7b42b0d5145..bb8bd9314629 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -60,8 +60,11 @@ struct fq_codel_sched_data { u32 perturbation; /* hash perturbation */ u32 quantum;/* psched_mtu(qdisc_dev(sch)); */ u32 drop_batch_size; + u32 memory_limit; struct codel_params cparams; struct codel_stats cstats; + u32 memory_usage; + u32 drop_overmemory; u32 drop_overlimit; u32 new_flow_count; @@ -143,6 +146,7 @@ static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets) unsigned int maxbacklog = 0, idx = 0, i, len; struct fq_codel_flow *flow; unsigned int threshold; + unsigned int mem = 0; /* Queue is full! Find the fat flow and drop packet(s) from it. * This might sound expensive, but with 1024 flows, we scan @@ -167,11 +171,13 @@ static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets) do { skb = dequeue_head(flow); len += qdisc_pkt_len(skb); + mem += skb->truesize; kfree_skb(skb); } while (++i < max_packets && len < threshold); flow->dropped += i; q->backlogs[idx] -= len; + q->memory_usage -= mem; sch->qstats.drops += i; sch->qstats.backlog -= len; sch->q.qlen -= i; @@ -193,6 +199,7 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) unsigned int idx, prev_backlog, prev_qlen; struct fq_codel_flow *flow; int uninitialized_var(ret); + bool memory_limited; idx = fq_codel_classify(skb, sch, &ret); if (idx == 0) { @@ -215,7 +222,9 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) flow->deficit = q->quantum; flow->dropped = 0; } - if (++sch->q.qlen <= sch->limit) + q->memory_usage += skb->truesize; + memory_limited = q->memory_usage > q->memory_limit; + if (++sch->q.qlen <= sch->limit && !memory_limited) return NET_XMIT_SUCCESS; prev_backlog = sch->qstats.backlog; @@ -229,7 +238,8 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) 
ret = fq_codel_drop(sch, q->drop_batch_size); q->drop_overlimit += prev_qlen - sch->q.qlen; - + if (memory_limited) + q->drop_overmemory += prev_qlen - sch->q.qlen; /* As we dropped packet(s), better let upper stack know this */ qdisc_tree_reduce_backlog(sch, prev_qlen - sch->q.qlen,
[PATCH] Add support for configuring Infiniband GUIDs
Add two NLA's that allow configuration of Infiniband node or port GUIDs by referencing the IPoIB net device set over then physical function. The format to be used is as follows: ip link set dev ib0 vf 0 node_guid 00:02:c9:03:00:21:6e:70 ip link set dev ib0 vf 0 port_guid 00:02:c9:03:00:21:6e:78 Issue: 702759 Change-Id: I5ffb54d6de7bfa8650bf5818f484279914991d6e Signed-off-by: Eli Cohen --- ip/iplink.c | 40 man/man8/ip-link.8.in | 12 +++- 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/ip/iplink.c b/ip/iplink.c index d2e586b6d133..3f885defdfeb 100644 --- a/ip/iplink.c +++ b/ip/iplink.c @@ -237,6 +237,30 @@ struct iplink_req { charbuf[1024]; }; +static int extract_guid(__u64 *guid, char *arg) +{ + __u64 ret; + int g[8]; + int err; + + err = sscanf(arg, "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x", +g, g + 1, g + 2, g + 3, g + 4, g + 5, g + 6, g + 7); + if (err != 8) + return -1; + + ret = ((__u64)(g[0]) << 56) | + ((__u64)(g[1]) << 48) | + ((__u64)(g[2]) << 40) | + ((__u64)(g[3]) << 32) | + ((__u64)(g[4]) << 24) | + ((__u64)(g[5]) << 16) | + ((__u64)(g[6]) << 8) | + ((__u64)(g[7])); + *guid = ret; + + return 0; +} + static int iplink_parse_vf(int vf, int *argcp, char ***argvp, struct iplink_req *req, int dev_index) { @@ -383,6 +407,22 @@ static int iplink_parse_vf(int vf, int *argcp, char ***argvp, invarg("Invalid \"state\" value\n", *argv); ivl.vf = vf; addattr_l(&req->n, sizeof(*req), IFLA_VF_LINK_STATE, &ivl, sizeof(ivl)); + } else if (matches(*argv, "node_guid") == 0) { + struct ifla_vf_guid ivg; + + NEXT_ARG(); + ivg.vf = vf; + if (extract_guid(&ivg.guid, *argv)) + return -1; + addattr_l(&req->n, sizeof(*req), IFLA_VF_IB_NODE_GUID, &ivg, sizeof(ivg)); + } else if (matches(*argv, "port_guid") == 0) { + struct ifla_vf_guid ivg; + + NEXT_ARG(); + ivg.vf = vf; + if (extract_guid(&ivg.guid, *argv)) + return -1; + addattr_l(&req->n, sizeof(*req), IFLA_VF_IB_PORT_GUID, &ivg, sizeof(ivg)); } else { /* rewind arg */ PREV_ARG(); diff --git 
a/man/man8/ip-link.8.in b/man/man8/ip-link.8.in index 805511423ef2..e143a5ec8a9a 100644 --- a/man/man8/ip-link.8.in +++ b/man/man8/ip-link.8.in @@ -143,7 +143,11 @@ ip-link \- network device configuration .br .RB "[ " state " { " auto " | " enable " | " disable " } ]" .br -.RB "[ " trust " { " on " | " off " } ] ]" +.RB "[ " trust " { " on " | " off " } ]" +.br +.RB "[ " node_guid " eui64 ]" +.br +.RB "[ " port_guid " eui64 ] ]" .br .in -9 .RB "[ " master @@ -1033,6 +1037,12 @@ sent by the VF. .BI trust " on|off" - trust the specified VF user. This enables that VF user can set a specific feature which may impact security and/or performance. (e.g. VF multicast promiscuous mode) +.sp +.BI node_guid " eui64" +- configure node GUID for the VF. +.sp +.BI port_guid " eui64" +- configure port GUID for the VF. .in -8 .TP -- 2.8.1
Re: [PATCH iproute2] geneve: fix IPv6 remote address reporting
On Fri, May 06, 2016 at 04:14:11PM +0100, Edward Cree wrote: > On 06/05/16 15:43, Phil Sutter wrote: > > On Fri, May 06, 2016 at 03:28:25PM +0100, Edward Cree wrote: > >> Since we can only configure unicast, we probably want to be able to > >> display unicast, rather than multicast. > > Furthermore, the kernel even rejects multicast peer addresses. > Yes, but a future kernel might not, and iproute2 is meant to be forward- > compatible. Sorry, but I fail to see how this might break forward compatibility. Quite the contrary, suppose geneve in future supported multicast peers, current iproute2 would fail to recognize its existence. What am I missing here? > > Why do you then propose a dubious fix to a dubious check instead of > > getting rid of it in the first place? > Because John Linville clearly had some reason for putting a check there, > and he probably knows better than me. Chesterton's fence. A valid point, indeed. In my opinion the same applies to your patch as well, as instead of removing the fence you're moving it to the other lane. :) Cheers, Phil
Re: [REGRESSION] asix: Lots of asix_rx_fixup() errors and slow transmissions
> In other words, the full-speed hub is restricting the USB to > Ethernet Adaptor to a 12Mbps (half-duplex) bandwidth to support > Ethernet 100Mbps (full-duplex) traffic. That is not going to work > very well because Ethernet frames (perhaps partial Ethernet frames) > need to be discarded within the USB link. If that really is true, the design is broken. I would expect the adaptor to reliably transfer whole frames over USB, and drop whole frames from its receive queue when the USB is congested. TCP is also going to see the USB bottleneck as just like any bottleneck in the network and back off. So TCP streams should not cause major congestion on the USB link. Going over a 12Mbps USB link should be no different to hitting an old Ethernet hub which can only do 10/Half. > Therefore please retest with a working high-speed USB hub or remove > the full-speed USB hub from the test environment and directly > connect the USB to Ethernet Adaptor to the root hub of the USB port. > Then repeat the tests to see whether anything improved. > > In other words, you need to eliminate the dmesg messages saying "not > running at top speed; connect to a high speed hub". I would also suggest testing with the Ethernet at 10/half. You should be able to use Ethtool to set that up. Your USB and Ethernet bandwidth become more equal. If you still see errors, it suggests a protocol implementation error somewhere. Andrew
Re: [PATCH iproute2] geneve: fix IPv6 remote address reporting
On 06/05/16 15:43, Phil Sutter wrote: > On Fri, May 06, 2016 at 03:28:25PM +0100, Edward Cree wrote: >> Since we can only configure unicast, we probably want to be able to >> display unicast, rather than multicast. > Furthermore, the kernel even rejects multicast peer addresses. Yes, but a future kernel might not, and iproute2 is meant to be forward- compatible. > Why do you then propose a dubious fix to a dubious check instead of > getting rid of it in the first place? Because John Linville clearly had some reason for putting a check there, and he probably knows better than me. Chesterton's fence. -Ed
Re: [RFC PATCH net-next 14/20] net: dsa: mv88e6xxx: factorize VLAN Ethertype
Hi Andrew, Andrew Lunn writes: >> @@ -55,6 +58,7 @@ static const struct mv88e6xxx_info mv88e6131_table[] = { >> .num_databases = 256, >> .num_ports = 10, >> .flags = MV88E6XXX_FLAG_ATU | >> +MV88E6XXX_FLAG_CORE_TAG_TYPE | >> MV88E6XXX_FLAG_PPU | >> MV88E6XXX_FLAG_VLANTABLE | >> MV88E6XXX_FLAG_VTU, > > Rather than repeating these flags again and again, could you add one > #define containing the flags, and then use that to initialise .flags. Hum OK, I wasn't sure, but looking at the final mv88e6xxx_info table, I can see that models from the same family all have the same set of flags, even if they don't have the same number of ports or databases. I'll add one MV88E6XXX_FLAGS_ per family. Thanks, Vivien
Re: [REGRESSION] asix: Lots of asix_rx_fixup() errors and slow transmissions
On 05/05/16 13:19, Guodong Xu wrote: Hi, Dean I am not sure why do you insist 'not full speed'. Actually, the tests I run on ARM-64bit is at USB full speed mode. I pasted my log here: http://paste.ubuntu.com/16236442/ , which includes the information you requested above, ifconfig, dmesg. The interval between two consecutive errors varies from 10 to 40ms. Your log from http://paste.ubuntu.com/16236442/ shows high speed for device 3 is not being used: [3.586968] usb 1-1: new full-speed USB device number 2 using dwc2 [3.792091] usb 1-1: not running at top speed; connect to a high speed hub [3.800477] hub 1-1:1.0: USB hub found [3.803658] hub 1-1:1.0: 3 ports detected [4.086636] usb 1-1.2: new full-speed USB device number 3 using dwc2 [4.202209] usb 1-1.2: not running at top speed; connect to a high speed hub [8.851236] asix 1-1.2:1.0 eth0: register 'asix' at usb-f72c.usb-1.2, ASIX AX88772B USB 2.0 Ethernet, 00:0e:c6:fa:bf:fd Hopefully, you know USB 2.0 high speed (480Mbps) is faster than full speed (12Mbps) mode. Therefore, your USB to Ethernet Adaptor is not running in its optimal "normal" high speed operation and there is a USB hub in the way that is not running at USB high speed mode. This is an abnormal configuration and potentially explains some of your failure observations. Running at full-speed (12Mbps) mode would explain why the timestamps has gaps of ms rather than us gaps (for 480Mbps). In other words, the full-speed hub is restricting the USB to Ethernet Adaptor to a 12Mbps (half-duplex) bandwidth to support Ethernet 100Mbps (full-duplex) traffic. That is not going to work very well because Ethernet frames (perhaps partial Ethernet frames) need to be discarded within the USB link. 
Your ifconfig output from http://paste.ubuntu.com/16236442/ shows 249 errors eth0 Link encap:Ethernet HWaddr 00:0e:c6:fa:bf:fd inet addr:192.168.1.11 Bcast:192.168.1.255 Mask:255.255.255.0 UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1 RX packets:865 errors:249 dropped:0 overruns:0 frame:0 TX packets:880 errors:0 dropped:0 overruns:0 carrier:0 collisions:0 txqueuelen:1000 RX bytes:1228273 (1.1 MiB) TX bytes:68955 (67.3 KiB) Before the test RX packets:28 errors:0 dropped:0 overruns:0 frame:0 After the test RX packets:865 errors:249 dropped:0 overruns:0 frame:0 Good test packets = 865 - 28 = 837 Detected bad Ethernet frames = 249 Bad to good ratio is 249:837 = 1:3.36 so 1 detected bad Ethernet frame per 3.36 good Ethernet frames Your ifconfig output from http://paste.ubuntu.com/16236764/ shows 1282 errors eth0 Link encap:Ethernet HWaddr 00:0e:c6:fa:bf:fd inet addr:192.168.1.11 Bcast:192.168.1.255 Mask:255.255.255.0 UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1 RX packets:55 errors:1282 dropped:0 overruns:0 frame:0 TX packets:64 errors:0 dropped:0 overruns:0 carrier:0 collisions:0 txqueuelen:1000 RX bytes:14287 (13.9 KiB) TX bytes:7639 (7.4 KiB) Before the test RX packets:19 errors:0 dropped:0 overruns:0 frame:0 After the test RX packets:55 errors:1282 dropped:0 overruns:0 frame:0 Good test packets = 55 - 19 = 36 Detected bad Ethernet frames = 1282 Bad to good ratio is 1282:36 = 1:0.028 so 1 detected bad Ethernet frame per 0.028 good Ethernet frames This suggests a very high error rate. It is interesting that the reported "remaining" value is 988. Is 988 always shown ? I mean that do you see any other "remaining" values for the "Data Header synchronisation was lost" error message ? Yes and No. When doing iperf test in TCP mode, always 988. I have never seen other "remaining" value. But, 1. I tried "ping -f -s 1400 [my.arm.64bit.board.ip]", but this cannot trigger the error. 2. 
Tried iperf in UDP mode, I saw "Data Header synchronisation was lost" remaining value is 984 (again, seemingly always in several tries). Log is pasted here. http://paste.ubuntu.com/16236764/ In http://paste.ubuntu.com/16236764/ you see very many [ 41.938370] asix 1-1.2:1.0 eth0: asix_rx_fixup() Bad Header Length 0x11400040, offset 4 but only a few [ 42.214607] asix 1-1.2:1.0 eth0: asix_rx_fixup() Data Header synchronisation was lost, remaining 984 This suggests that the "Bad Header Length" and "Data Header synchronisation was lost" error messages are not related to consecutive URBs. The expectation is that a "Data Header synchronisation was lost" error message is immediately followed by a "Bad Header Length" message with a timestamp much less than 1ms (for high speed USB). This is because an Ethernet frame that spans URBs needs low latency so should be sent quickly in consecutive URBs. The Bad Header Length error messages with offset 4 indicates that 32-bit header word was not found in the expected location at the start of the URB buffer. [ 41.938370] asix 1-1.2:1.0 eth0: asix_rx_fixup() Bad Header Le
Re: [PATCH iproute2] geneve: fix IPv6 remote address reporting
On Fri, May 06, 2016 at 03:28:25PM +0100, Edward Cree wrote: > Since we can only configure unicast, we probably want to be able to > display unicast, rather than multicast. Furthermore, the kernel even rejects multicast peer addresses. > I'm assuming this is what was intended, but tbh I don't know why we > need to check for multicast on the display side at all, rather than > just displaying whatever the kernel gives us. Why do you then propose a dubious fix to a dubious check instead of getting rid of it in the first place? Reminds me a bit of this here (no offense intended): http://geekandpoke.typepad.com/geekandpoke/2011/07/good-coders.html Cheers, Phil
[PATCH iproute2] geneve: fix IPv6 remote address reporting
Since we can only configure unicast, we probably want to be able to display unicast, rather than multicast. Fixes: 906ac5437ab8 ("geneve: add support for IPv6 link partners") Signed-off-by: Edward Cree --- I'm assuming this is what was intended, but tbh I don't know why we need to check for multicast on the display side at all, rather than just displaying whatever the kernel gives us. ip/iplink_geneve.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ip/iplink_geneve.c b/ip/iplink_geneve.c index 84d948f..65af6b3 100644 --- a/ip/iplink_geneve.c +++ b/ip/iplink_geneve.c @@ -204,7 +204,7 @@ static void geneve_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[]) memcpy(&addr, RTA_DATA(tb[IFLA_GENEVE_REMOTE6]), sizeof(struct in6_addr)); if (memcmp(&addr, &in6addr_any, sizeof(addr)) != 0) { - if (IN6_IS_ADDR_MULTICAST(&addr)) + if (!IN6_IS_ADDR_MULTICAST(&addr)) fprintf(f, "remote %s ", format_host(AF_INET6, sizeof(struct in6_addr), &addr)); } -- 2.4.3
[PATCH v4 1/2] soc: qcom: smd: Introduce compile stubs
Introduce compile stubs for the SMD API, allowing consumers to be compile tested. Acked-by: Andy Gross Signed-off-by: Bjorn Andersson --- Changes since v3: - None Changes since v2: - Introduce this patch, to allow compile testing of QRTR_SMD include/linux/soc/qcom/smd.h | 28 +++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/include/linux/soc/qcom/smd.h b/include/linux/soc/qcom/smd.h index d0cb6d189a0a..46a984f5e3a3 100644 --- a/include/linux/soc/qcom/smd.h +++ b/include/linux/soc/qcom/smd.h @@ -45,13 +45,39 @@ struct qcom_smd_driver { int (*callback)(struct qcom_smd_device *, const void *, size_t); }; +#if IS_ENABLED(CONFIG_QCOM_SMD) + int qcom_smd_driver_register(struct qcom_smd_driver *drv); void qcom_smd_driver_unregister(struct qcom_smd_driver *drv); +int qcom_smd_send(struct qcom_smd_channel *channel, const void *data, int len); + +#else + +static inline int qcom_smd_driver_register(struct qcom_smd_driver *drv) +{ + return -ENXIO; +} + +static inline void qcom_smd_driver_unregister(struct qcom_smd_driver *drv) +{ + /* This shouldn't be possible */ + WARN_ON(1); +} + +static inline int qcom_smd_send(struct qcom_smd_channel *channel, + const void *data, int len) +{ + /* This shouldn't be possible */ + WARN_ON(1); + return -ENXIO; +} + +#endif + #define module_qcom_smd_driver(__smd_driver) \ module_driver(__smd_driver, qcom_smd_driver_register, \ qcom_smd_driver_unregister) -int qcom_smd_send(struct qcom_smd_channel *channel, const void *data, int len); #endif -- 2.5.0
[PATCH v4 2/2] net: Add Qualcomm IPC router
From: Courtney Cavin Add an implementation of Qualcomm's IPC router protocol, used to communicate with service providing remote processors. Signed-off-by: Courtney Cavin Signed-off-by: Bjorn Andersson [bjorn: Cope with 0 being a valid node id and implement RTM_NEWADDR] Signed-off-by: Bjorn Andersson --- Changes since v3: - Made it possible to compile qrtr as module Changes since v2: - Altered Kconfig dependency for QRTR_SMD to be compile testable Changes since v1: - Made node 0 (normally the Qualcomm modem) a valid node - Implemented RTM_NEWADDR for specifying the local node id include/linux/socket.h|4 +- include/uapi/linux/qrtr.h | 12 + net/Kconfig |1 + net/Makefile |1 + net/qrtr/Kconfig | 24 ++ net/qrtr/Makefile |2 + net/qrtr/qrtr.c | 1007 + net/qrtr/qrtr.h | 31 ++ net/qrtr/smd.c| 117 ++ 9 files changed, 1198 insertions(+), 1 deletion(-) create mode 100644 include/uapi/linux/qrtr.h create mode 100644 net/qrtr/Kconfig create mode 100644 net/qrtr/Makefile create mode 100644 net/qrtr/qrtr.c create mode 100644 net/qrtr/qrtr.h create mode 100644 net/qrtr/smd.c diff --git a/include/linux/socket.h b/include/linux/socket.h index 73bf6c6a833b..b5cc5a6d7011 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -201,8 +201,9 @@ struct ucred { #define AF_NFC 39 /* NFC sockets */ #define AF_VSOCK 40 /* vSockets */ #define AF_KCM 41 /* Kernel Connection Multiplexor*/ +#define AF_QIPCRTR 42 /* Qualcomm IPC Router */ -#define AF_MAX 42 /* For now.. */ +#define AF_MAX 43 /* For now.. */ /* Protocol families, same as address families. */ #define PF_UNSPEC AF_UNSPEC @@ -249,6 +250,7 @@ struct ucred { #define PF_NFC AF_NFC #define PF_VSOCK AF_VSOCK #define PF_KCM AF_KCM +#define PF_QIPCRTR AF_QIPCRTR #define PF_MAX AF_MAX /* Maximum queue length specifiable by listen. 
*/ diff --git a/include/uapi/linux/qrtr.h b/include/uapi/linux/qrtr.h new file mode 100644 index ..66c0748d26e2 --- /dev/null +++ b/include/uapi/linux/qrtr.h @@ -0,0 +1,12 @@ +#ifndef _LINUX_QRTR_H +#define _LINUX_QRTR_H + +#include + +struct sockaddr_qrtr { + __kernel_sa_family_t sq_family; + __u32 sq_node; + __u32 sq_port; +}; + +#endif /* _LINUX_QRTR_H */ diff --git a/net/Kconfig b/net/Kconfig index a8934d8c8fda..b841c42e5c9b 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -236,6 +236,7 @@ source "net/mpls/Kconfig" source "net/hsr/Kconfig" source "net/switchdev/Kconfig" source "net/l3mdev/Kconfig" +source "net/qrtr/Kconfig" config RPS bool diff --git a/net/Makefile b/net/Makefile index 81d14119eab5..bdd14553a774 100644 --- a/net/Makefile +++ b/net/Makefile @@ -78,3 +78,4 @@ endif ifneq ($(CONFIG_NET_L3_MASTER_DEV),) obj-y += l3mdev/ endif +obj-$(CONFIG_QRTR) += qrtr/ diff --git a/net/qrtr/Kconfig b/net/qrtr/Kconfig new file mode 100644 index ..673fd1f86ebe --- /dev/null +++ b/net/qrtr/Kconfig @@ -0,0 +1,24 @@ +# Qualcomm IPC Router configuration +# + +config QRTR + tristate "Qualcomm IPC Router support" + depends on ARCH_QCOM || COMPILE_TEST + ---help--- + Say Y if you intend to use Qualcomm IPC router protocol. The + protocol is used to communicate with services provided by other + hardware blocks in the system. + + In order to do service lookups, a userspace daemon is required to + maintain a service listing. + +if QRTR + +config QRTR_SMD + tristate "SMD IPC Router channels" + depends on QCOM_SMD || COMPILE_TEST + ---help--- + Say Y here to support SMD based ipcrouter channels. SMD is the + most common transport for IPC Router. 
+ +endif # QRTR diff --git a/net/qrtr/Makefile b/net/qrtr/Makefile new file mode 100644 index ..6c00dc623b7e --- /dev/null +++ b/net/qrtr/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_QRTR) := qrtr.o +obj-$(CONFIG_QRTR_SMD) += smd.o diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c new file mode 100644 index ..c985ecbe9bd6 --- /dev/null +++ b/net/qrtr/qrtr.c @@ -0,0 +1,1007 @@ +/* + * Copyright (c) 2015, Sony Mobile Communications Inc. + * Copyright (c) 2013, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ +#include
[PATCH net v3] vlan: Propagate MAC address to VLANs
The MAC address of the physical interface is only copied to the VLAN when it is first created, resulting in an inconsistency after MAC address changes of only newly created VLANs having an up-to-date MAC. The VLANs should continue inheriting the MAC address of the physical interface, unless explicitly changed to be different from this. This allows IPv6 EUI64 addresses for the VLAN to reflect any changes to the MAC of the physical interface and thus for DAD to behave as expected. Signed-off-by: Mike Manning --- include/linux/if_vlan.h |2 ++ net/8021q/vlan.c| 17 +++-- net/8021q/vlan_dev.c| 13 ++--- 3 files changed, 23 insertions(+), 9 deletions(-) --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -138,6 +138,7 @@ struct netpoll; * @flags: device flags * @real_dev: underlying netdevice * @real_dev_addr: address of underlying netdevice + * @addr_assign_type: address assignment type * @dent: proc dir entry * @vlan_pcpu_stats: ptr to percpu rx stats */ @@ -153,6 +154,7 @@ struct vlan_dev_priv { struct net_device *real_dev; unsigned char real_dev_addr[ETH_ALEN]; + unsigned char addr_assign_type; struct proc_dir_entry *dent; struct vlan_pcpu_stats __percpu *vlan_pcpu_stats; --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -291,6 +291,15 @@ static void vlan_sync_address(struct net if (ether_addr_equal(vlan->real_dev_addr, dev->dev_addr)) return; + /* vlan continues to inherit address of parent interface */ + if (vlan->addr_assign_type == NET_ADDR_STOLEN) { + ether_addr_copy(vlandev->dev_addr, dev->dev_addr); + goto out; + } + + if (!(vlandev->flags & IFF_UP)) + goto out; + /* vlan address was different from the old address and is equal to * the new address */ if (!ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) && @@ -303,6 +312,7 @@ static void vlan_sync_address(struct net !ether_addr_equal(vlandev->dev_addr, dev->dev_addr)) dev_uc_add(dev, vlandev->dev_addr); +out: ether_addr_copy(vlan->real_dev_addr, dev->dev_addr); } @@ -389,13 +399,8 @@ static int 
vlan_device_event(struct noti case NETDEV_CHANGEADDR: /* Adjust unicast filters on underlying device */ - vlan_group_for_each_dev(grp, i, vlandev) { - flgs = vlandev->flags; - if (!(flgs & IFF_UP)) - continue; - + vlan_group_for_each_dev(grp, i, vlandev) vlan_sync_address(dev, vlandev); - } break; case NETDEV_CHANGEMTU: --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -315,17 +315,21 @@ static int vlan_dev_stop(struct net_devi static int vlan_dev_set_mac_address(struct net_device *dev, void *p) { - struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; + struct vlan_dev_priv *vlan = vlan_dev_priv(dev); + struct net_device *real_dev = vlan->real_dev; struct sockaddr *addr = p; + bool is_real_addr; int err; if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; + is_real_addr = ether_addr_equal(addr->sa_data, real_dev->dev_addr); + if (!(dev->flags & IFF_UP)) goto out; - if (!ether_addr_equal(addr->sa_data, real_dev->dev_addr)) { + if (!is_real_addr) { err = dev_uc_add(real_dev, addr->sa_data); if (err < 0) return err; @@ -336,6 +340,7 @@ static int vlan_dev_set_mac_address(stru out: ether_addr_copy(dev->dev_addr, addr->sa_data); + vlan->addr_assign_type = is_real_addr ? NET_ADDR_STOLEN : NET_ADDR_SET; return 0; } @@ -558,8 +563,10 @@ static int vlan_dev_init(struct net_devi /* ipv6 shared card related stuff */ dev->dev_id = real_dev->dev_id; - if (is_zero_ether_addr(dev->dev_addr)) + if (is_zero_ether_addr(dev->dev_addr)) { eth_hw_addr_inherit(dev, real_dev); + vlan_dev_priv(dev)->addr_assign_type = NET_ADDR_STOLEN; + } if (is_zero_ether_addr(dev->broadcast)) memcpy(dev->broadcast, real_dev->broadcast, dev->addr_len); -- 1.7.10.4
[PATCH net] macvtap: segmented packet is consumed
From: Eric Dumazet If GSO packet is segmented and its segments are properly queued, we call consume_skb() instead of kfree_skb() to be drop monitor friendly. Fixes: 3e4f8b7873709 ("macvtap: Perform GSO on forwarding path.") Signed-off-by: Eric Dumazet Cc: Vlad Yasevich --- drivers/net/macvtap.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 95394edd1ed5..9a35aa462314 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -373,7 +373,7 @@ static rx_handler_result_t macvtap_handle_frame(struct sk_buff **pskb) goto wake_up; } - kfree_skb(skb); + consume_skb(skb); while (segs) { struct sk_buff *nskb = segs->next;
Re: OpenWRT wrong adjustment of fq_codel defaults (Was: [Codel] fq_codel_drop vs a udp flood)
I've created a OpenWRT ticket[1] on this issue, as it seems that someone[2] closed Felix'es OpenWRT email account (bad choice! emails bouncing). Sounds like OpenWRT and the LEDE https://www.lede-project.org/ project is in some kind of conflict. OpenWRT ticket [1] https://dev.openwrt.org/ticket/22349 [2] http://thread.gmane.org/gmane.comp.embedded.openwrt.devel/40298/focus=40335 On Fri, 6 May 2016 11:42:43 +0200 Jesper Dangaard Brouer wrote: > Hi Felix, > > This is an important fix for OpenWRT, please read! > > OpenWRT changed the default fq_codel sch->limit from 10240 to 1024, > without also adjusting q->flows_cnt. Eric explains below that you must > also adjust the buckets (q->flows_cnt) for this not to break. (Just > adjust it to 128) > > Problematic OpenWRT commit in question: > http://git.openwrt.org/?p=openwrt.git;a=patch;h=12cd6578084e > 12cd6578084e ("kernel: revert fq_codel quantum override to prevent it from > causing too much cpu load with higher speed (#21326)") > > > I also highly recommend you cherry-pick this very recent commit: > net-next: 9d18562a2278 ("fq_codel: add batch ability to fq_codel_drop()") > https://git.kernel.org/davem/net-next/c/9d18562a227 > > This should fix very high CPU usage in-case fq_codel goes into drop mode. > The problem is that drop mode was considered rare, and implementation > wise it was chosen to be more expensive (to save cycles on normal mode). > Unfortunately is it easy to trigger with an UDP flood. Drop mode is > especially expensive for smaller devices, as it scans a 4K big array, > thus 64 cache misses for small devices! > > The fix is to allow drop-mode to bulk-drop more packets when entering > drop-mode (default 64 bulk drop). That way we don't suddenly > experience a significantly higher processing cost per packet, but > instead can amortize this. > > To Eric, should we recommend OpenWRT to adjust default (max) 64 bulk > drop, given we also recommend bucket size to be 128 ? 
(thus the amount > of memory to scan is less, but their CPU is also much smaller). > > --Jesper > > > On Thu, 05 May 2016 12:23:27 -0700 Eric Dumazet > wrote: > > > On Thu, 2016-05-05 at 19:25 +0300, Roman Yeryomin wrote: > > > On 5 May 2016 at 19:12, Eric Dumazet wrote: > > > > On Thu, 2016-05-05 at 17:53 +0300, Roman Yeryomin wrote: > > > > > > > >> > > > >> qdisc fq_codel 0: dev eth0 root refcnt 2 limit 1024p flows 1024 > > > >> quantum 1514 target 5.0ms interval 100.0ms ecn > > > >> Sent 12306 bytes 128 pkt (dropped 0, overlimits 0 requeues 0) > > > >> backlog 0b 0p requeues 0 > > > >> maxpacket 0 drop_overlimit 0 new_flow_count 0 ecn_mark 0 > > > >> new_flows_len 0 old_flows_len 0 > > > > > > > > > > > > Limit of 1024 packets and 1024 flows is not wise I think. > > > > > > > > (If all buckets are in use, each bucket has a virtual queue of 1 packet, > > > > which is almost the same than having no queue at all) > > > > > > > > I suggest to have at least 8 packets per bucket, to let Codel have a > > > > chance to trigger. > > > > > > > > So you could either reduce number of buckets to 128 (if memory is > > > > tight), or increase limit to 8192. > > > > > > Will try, but what I've posted is default, I didn't change/configure > > > that. > > > > fq_codel has a default of 10240 packets and 1024 buckets. > > > > http://lxr.free-electrons.com/source/net/sched/sch_fq_codel.c#L413 > > > > If someone changed that in the linux variant you use, he probably should > > explain the rationale. -- Best regards, Jesper Dangaard Brouer MSc.CS, Principal Kernel Engineer at Red Hat Author of http://www.iptv-analyzer.org LinkedIn: http://www.linkedin.com/in/brouer
[PATCH v2] net: arc/emac: Move arc_emac_tx_clean() into arc_emac_tx() and disable tx interrupt
From: Shuyu Wei Doing tx_clean() inside poll() may scramble the tx ring buffer if tx() is running. This will cause tx to stop working, which can be reproduced by simultaneously downloading two large files at high speed. Moving tx_clean() into tx() will prevent this. And tx interrupt is no longer needed now. Picked the Shuyu's patch up, the patch is sent on https://patchwork.kernel.org/patch/8356821/, since that make sense for rockchip platform. Note: Many people feedback the cransh problems with rk3036/rk3188 emac when download the heavy loading and this patch is indeed can fix the crash. The crash log as the followings: ... [ 2191.996127 ] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.4.0-rc6 #114 [ 2192.002475 ] Hardware name: Rockchip (Device Tree) [ 2192.007174 ] Backtrace: [ 2192.009658 ] [] (dump_backtrace) from [] (show_stack+0x18/0x1c) [ 2192.017220 ] r7:c051c4f8 r6:ef463180 r5:c05b7000 r4: [ 2192.022948 ] [] (show_stack) from [] (dump_stack+0x90/0xa0) [ 2192.030176 ] [] (dump_stack) from [] (bad_page+0xdc/0x12c) [ 2192.037302 ] r5:c059a100 r4:c05f430c [ 2192.040913 ] [] (bad_page) from [] (get_page_from_freelist+0x388/0x95c) [ 2192.049166 ] r9:0008 r8:ef463180 r7:c051c4d0 r6: r5: r4:c051c4e4 [ 2192.056982 ] [] (get_page_from_freelist) from [] (__alloc_pages_nodemask+0xd8/0x8e8) [ 2192.066362 ] r10:c001b068 r9: r8:ee0b02b0 r7:6113 r6:0003 r5:02095220 [ 2192.074254 ] r4:c05ca1c0 [ 2192.076809 ] [] (__alloc_pages_nodemask) from [] (__alloc_page_frag+0xb0/0x160) [ 2192.085757 ] r10:c001b068 r9: r8:ee0b02b0 r7:6113 r6:02080020 r5:0740 [ 2192.093650 ] r4:eedbc884 [ 2192.096207 ] [] (__alloc_page_frag) from [] (__netdev_alloc_skb+0xa0/0x104) [ 2192.104806 ] r7:6113 r6:eedbc884 r5:ee0b r4:0740 [ 2192.110525 ] [] (__netdev_alloc_skb) from [] (arc_emac_poll+0x318/0x57c) [ 2192.118865 ] r9: r8:ee0b02b0 r7:019c r6:ee163780 r5:0670 r4:ee0b [ 2192.126683 ] [] (arc_emac_poll) from [] (net_rx_action+0x1f0/0x2ec) [ 2192.134590 ] r10:c0599df8 r9:c059a100 r8:00073760 r7:012c 
r6:0028 r5:c02aa8e8 [ 2192.142483 ] r4:ee0b04e0 [ 2192.145040 ] [] (net_rx_action) from [] (__do_softirq+0x134/0x258) [ 2192.152860 ] r10:c059a080 r9:4003 r8:0003 r7:0100 r6:c0598000 r5:c059a08c [ 2192.160751 ] r4: ... Signed-off-by: Shuyu Wei Tested-by: Michael Niewoehner Tested-by: Xing Zheng Cc: "David S. Miller" Cc: Alexander Kochetkov Cc: netdev@vger.kernel.org Signed-off-by: Caesar Wang --- drivers/net/ethernet/arc/emac_main.c | 14 +++--- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/arc/emac_main.c b/drivers/net/ethernet/arc/emac_main.c index a3a9392..4f4e25e 100644 --- a/drivers/net/ethernet/arc/emac_main.c +++ b/drivers/net/ethernet/arc/emac_main.c @@ -311,12 +311,10 @@ static int arc_emac_poll(struct napi_struct *napi, int budget) struct arc_emac_priv *priv = netdev_priv(ndev); unsigned int work_done; - arc_emac_tx_clean(ndev); - work_done = arc_emac_rx(ndev, budget); if (work_done < budget) { napi_complete(napi); - arc_reg_or(priv, R_ENABLE, RXINT_MASK | TXINT_MASK); + arc_reg_or(priv, R_ENABLE, RXINT_MASK); } return work_done; @@ -345,9 +343,9 @@ static irqreturn_t arc_emac_intr(int irq, void *dev_instance) /* Reset all flags except "MDIO complete" */ arc_reg_set(priv, R_STATUS, status); - if (status & (RXINT_MASK | TXINT_MASK)) { + if (status & RXINT_MASK) { if (likely(napi_schedule_prep(&priv->napi))) { - arc_reg_clr(priv, R_ENABLE, RXINT_MASK | TXINT_MASK); + arc_reg_clr(priv, R_ENABLE, RXINT_MASK); __napi_schedule(&priv->napi); } } @@ -461,7 +459,7 @@ static int arc_emac_open(struct net_device *ndev) arc_reg_set(priv, R_TX_RING, (unsigned int)priv->txbd_dma); /* Enable interrupts */ - arc_reg_set(priv, R_ENABLE, RXINT_MASK | TXINT_MASK | ERR_MASK); + arc_reg_set(priv, R_ENABLE, RXINT_MASK | ERR_MASK); /* Set CONTROL */ arc_reg_set(priv, R_CTRL, @@ -594,7 +592,7 @@ static int arc_emac_stop(struct net_device *ndev) netif_stop_queue(ndev); /* Disable interrupts */ - arc_reg_clr(priv, R_ENABLE, RXINT_MASK | TXINT_MASK 
| ERR_MASK); + arc_reg_clr(priv, R_ENABLE, RXINT_MASK | ERR_MASK); /* Disable EMAC */ arc_reg_clr(priv, R_CTRL, EN_MASK); @@ -656,6 +654,8 @@ static int arc_emac_tx(struct sk_buff *skb, struct net_device *ndev) __le32 *info = &priv->txbd[*txbd_curr].info; dma_addr_t addr; + arc_emac_tx_clean(ndev); + if (skb_padto(skb, ETH_ZLEN)) return NETDEV_TX_OK; -- 1.9.1
Re: [PATCH net] netfilter: nf_conntrack: Use net_mutex for helper unregistration.
Hi Joe, On Thu, May 05, 2016 at 03:50:37PM -0700, Joe Stringer wrote: > diff --git a/net/netfilter/nf_conntrack_helper.c > b/net/netfilter/nf_conntrack_helper.c > index 3b40ec575cd5..6860b19be406 100644 > --- a/net/netfilter/nf_conntrack_helper.c > +++ b/net/netfilter/nf_conntrack_helper.c > @@ -449,10 +449,10 @@ void nf_conntrack_helper_unregister(struct > nf_conntrack_helper *me) >*/ > synchronize_rcu(); > > - rtnl_lock(); > + mutex_lock(&net_mutex); > for_each_net(net) > __nf_conntrack_helper_unregister(me, net); > - rtnl_unlock(); > + mutex_unlock(&net_mutex); This simple solution works because we have no .exit callbacks in any of our helpers. Otherwise, the helper code may already be gone by the time the worker has a chance to run to release the netns. If so, I can probably append this as a comment to this function so we don't forget. If we ever have .exit callbacks (I don't expect so), we would need to wait for worker completion.
Re: ixgbe: cannot enable LRO
> On Wed, Apr 27, 2016 at 2:36 AM, Otto Sabart wrote: > > > > Hello everyone, > > does anybody have a problem with LRO on ixge (on latest 4.6-rc5)? > > I cannot find a way to enable it. > > > > On stable RHEL7.2 kernel everything works fine. > > > > I opened a bug report [0]. > > > > [0] https://bugzilla.kernel.org/show_bug.cgi?id=117291 > > > > > > Thanks! > > > > Ota > Hello Alex, > So I am able to turn on LRO without any issues. Yes, I described the problem badly. The LRO was not possible to turn on immediately _after the boot_ (I was enabling it in /etc/rc.local). When I reloaded the ixgbe driver, the LRO was possible to turn on without problem. I found out that the problem was caused by network manager. When I disabled NM, the LRO started to work. > > Do you know if you have done anything that might disable LRO such as > modified the rx-usecs to a value less than 10 or enabled routing or > bridging on the device? Also I think a stacked device might be able > to block you from enabling LRO unless all the devices stacked on the > interface can support it. I did not modify rx-usecs (at least not intentionally). Its value is always (with disabled or enabled LRO) equal to 1 (with disabled NM). $ ethtool -c ixgbe | grep rx-usecs rx-usecs: 1 rx-usecs-irq: 0 rx-usecs-low: 0 rx-usecs-high: 0 There is no linux bridge or routing enabled on this device. There is only a VLAN configured. Does it matter? $ ip l show ixgbe 9: ixgbe: mtu 1500 qdisc mq state UP mode DEFAULT qlen 1000 link/ether 00:1b:21:90:c3:86 brd ff:ff:ff:ff:ff:ff $ ip l show ixgbe.40 19: ixgbe.40@ixgbe: mtu 1500 qdisc noqueue state UP mode DEFAULT qlen 1000 link/ether 00:1b:21:90:c3:86 brd ff:ff:ff:ff:ff:ff I updated the bugzilla [0] and I think we can close this as a NOTABUG. Thanks! Ota signature.asc Description: PGP signature
Re: [PATCHv2 bluetooth-next 07/10] ipv6: introduce neighbour discovery ops
Hi, On 05/06/2016 11:47 AM, Alexander Aring wrote: > > Hi, > > On 05/04/2016 02:23 PM, Stefan Schmidt wrote: >> Hello. >> >> On 20/04/16 10:19, Alexander Aring wrote: >>> This patch introduces neighbour discovery ops callback structure. The >>> structure contains at first receive and transmit handling for NS/NA and >>> userspace option field functionality. >>> >>> These callback offers 6lowpan different handling, such as 802.15.4 short >>> address handling or RFC6775 (Neighbor Discovery Optimization for IPv6 over >>> 6LoWPANs). >>> >>> Cc: David S. Miller >>> Cc: Alexey Kuznetsov >>> Cc: James Morris >>> Cc: Hideaki YOSHIFUJI >>> Cc: Patrick McHardy >>> Signed-off-by: Alexander Aring >>> --- >>> include/linux/netdevice.h | 3 ++ >>> include/net/ndisc.h | 96 >>> +++ >>> net/ipv6/addrconf.c | 1 + >>> net/ipv6/ndisc.c | 71 --- >>> net/ipv6/route.c | 2 +- >>> 5 files changed, 144 insertions(+), 29 deletions(-) >>> >>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h >>> index 0052c42..bc60033 100644 >>> --- a/include/linux/netdevice.h >>> +++ b/include/linux/netdevice.h >>> @@ -1677,6 +1677,9 @@ struct net_device { >>> #ifdef CONFIG_NET_L3_MASTER_DEV >>> const struct l3mdev_ops*l3mdev_ops; >>> #endif >>> +#if IS_ENABLED(CONFIG_IPV6) >>> +const struct ndisc_ops *ndisc_ops; >>> +#endif >>> const struct header_ops *header_ops; >>> diff --git a/include/net/ndisc.h b/include/net/ndisc.h >>> index aac868e..14ed016 100644 >>> --- a/include/net/ndisc.h >>> +++ b/include/net/ndisc.h >>> @@ -110,7 +110,8 @@ struct ndisc_options { >>> #define NDISC_OPT_SPACE(len) (((len)+2+7)&~7) >>> -struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, >>> +struct ndisc_options *ndisc_parse_options(const struct net_device *dev, >>> + u8 *opt, int opt_len, >>> struct ndisc_options *ndopts); >>> /* >>> @@ -173,6 +174,93 @@ static inline struct neighbour >>> *__ipv6_neigh_lookup(struct net_device *dev, cons >>> return n; >>> } >>> +static inline int 
__ip6_ndisc_is_useropt(struct nd_opt_hdr *opt) >> >> Name it __ipv6... instead of __ip6...? > > I had ipv6 before, but then I saw ip6... prefixes functionality in > ndisc.c and changed it to ip6, but both seems to be used. > > See "ip6_nd_hdr". > >>> +{ >>> +return opt->nd_opt_type == ND_OPT_RDNSS || >>> +opt->nd_opt_type == ND_OPT_DNSSL; >>> +} >>> + >>> +#if IS_ENABLED(CONFIG_IPV6) >>> +struct ndisc_ops { >>> +int(*is_useropt)(struct nd_opt_hdr *opt); >>> +void(*send_na)(struct net_device *dev, >>> + const struct in6_addr *daddr, >>> + const struct in6_addr *solicited_addr, >>> + bool router, bool solicited, >>> + bool override, bool inc_opt); >>> +void(*recv_na)(struct sk_buff *skb); >>> +void(*send_ns)(struct net_device *dev, >>> + const struct in6_addr *solicit, >>> + const struct in6_addr *daddr, >>> + const struct in6_addr *saddr); >>> +void(*recv_ns)(struct sk_buff *skb); >>> +}; >>> + >>> +static inline int ndisc_is_useropt(const struct net_device *dev, >>> + struct nd_opt_hdr *opt) >>> +{ >>> +if (likely(dev->ndisc_ops->is_useropt)) >>> +return dev->ndisc_ops->is_useropt(opt); >>> +else >>> +return 0; >>> +} >>> + >>> +static inline void ndisc_send_na(struct net_device *dev, >>> + const struct in6_addr *daddr, >>> + const struct in6_addr *solicited_addr, >>> + bool router, bool solicited, bool override, >>> + bool inc_opt) >>> +{ >>> +if (likely(dev->ndisc_ops->send_na)) >>> +dev->ndisc_ops->send_na(dev, daddr, solicited_addr, router, >>> +solicited, override, inc_opt); >>> +} >>> + >>> +static inline void ndisc_recv_na(struct sk_buff *skb) >>> +{ >>> +if (likely(skb->dev->ndisc_ops->recv_na)) >>> +skb->dev->ndisc_ops->recv_na(skb); >>> +} >>> + >>> +static inline void ndisc_send_ns(struct net_device *dev, >>> + const struct in6_addr *solicit, >>> + const struct in6_addr *daddr, >>> + const struct in6_addr *saddr) >>> +{ >>> +if (likely(dev->ndisc_ops->send_ns)) >>> +dev->ndisc_ops->send_ns(dev, solicit, daddr, saddr); >>> +} >>> + >>> +static inline void 
ndisc_recv_ns(struct sk_buff *skb) >>> +{ >>> +if (likely(skb->dev->ndisc_ops->recv_ns)) >>> +skb->dev->ndisc_ops->recv_ns(skb); >>> +} >>> +#else >>> +static inline int ndisc_is_useropt(const struct net_device *dev, >>> + struct nd_opt_hdr *opt) >>> +{ >>> +return 0; >>> +} >>> + >>> +static inline void ndisc_send_na(struct net_device *dev, >>> + const struct in6_addr *daddr, >>>
Re: [PATCHv2 bluetooth-next 07/10] ipv6: introduce neighbour discovery ops
Hi, On 05/04/2016 02:23 PM, Stefan Schmidt wrote: > Hello. > > On 20/04/16 10:19, Alexander Aring wrote: >> This patch introduces neighbour discovery ops callback structure. The >> structure contains at first receive and transmit handling for NS/NA and >> userspace option field functionality. >> >> These callback offers 6lowpan different handling, such as 802.15.4 short >> address handling or RFC6775 (Neighbor Discovery Optimization for IPv6 over >> 6LoWPANs). >> >> Cc: David S. Miller >> Cc: Alexey Kuznetsov >> Cc: James Morris >> Cc: Hideaki YOSHIFUJI >> Cc: Patrick McHardy >> Signed-off-by: Alexander Aring >> --- >> include/linux/netdevice.h | 3 ++ >> include/net/ndisc.h | 96 >> +++ >> net/ipv6/addrconf.c | 1 + >> net/ipv6/ndisc.c | 71 --- >> net/ipv6/route.c | 2 +- >> 5 files changed, 144 insertions(+), 29 deletions(-) >> >> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h >> index 0052c42..bc60033 100644 >> --- a/include/linux/netdevice.h >> +++ b/include/linux/netdevice.h >> @@ -1677,6 +1677,9 @@ struct net_device { >> #ifdef CONFIG_NET_L3_MASTER_DEV >> const struct l3mdev_ops*l3mdev_ops; >> #endif >> +#if IS_ENABLED(CONFIG_IPV6) >> +const struct ndisc_ops *ndisc_ops; >> +#endif >> const struct header_ops *header_ops; >> diff --git a/include/net/ndisc.h b/include/net/ndisc.h >> index aac868e..14ed016 100644 >> --- a/include/net/ndisc.h >> +++ b/include/net/ndisc.h >> @@ -110,7 +110,8 @@ struct ndisc_options { >> #define NDISC_OPT_SPACE(len) (((len)+2+7)&~7) >> -struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, >> +struct ndisc_options *ndisc_parse_options(const struct net_device *dev, >> + u8 *opt, int opt_len, >> struct ndisc_options *ndopts); >> /* >> @@ -173,6 +174,93 @@ static inline struct neighbour >> *__ipv6_neigh_lookup(struct net_device *dev, cons >> return n; >> } >> +static inline int __ip6_ndisc_is_useropt(struct nd_opt_hdr *opt) > > Name it __ipv6... instead of __ip6...? I had ipv6 before, but then I saw ip6... 
prefixes functionality in ndisc.c and changed it to ip6, but both seems to be used. See "ip6_nd_hdr". >> +{ >> +return opt->nd_opt_type == ND_OPT_RDNSS || >> +opt->nd_opt_type == ND_OPT_DNSSL; >> +} >> + >> +#if IS_ENABLED(CONFIG_IPV6) >> +struct ndisc_ops { >> +int(*is_useropt)(struct nd_opt_hdr *opt); >> +void(*send_na)(struct net_device *dev, >> + const struct in6_addr *daddr, >> + const struct in6_addr *solicited_addr, >> + bool router, bool solicited, >> + bool override, bool inc_opt); >> +void(*recv_na)(struct sk_buff *skb); >> +void(*send_ns)(struct net_device *dev, >> + const struct in6_addr *solicit, >> + const struct in6_addr *daddr, >> + const struct in6_addr *saddr); >> +void(*recv_ns)(struct sk_buff *skb); >> +}; >> + >> +static inline int ndisc_is_useropt(const struct net_device *dev, >> + struct nd_opt_hdr *opt) >> +{ >> +if (likely(dev->ndisc_ops->is_useropt)) >> +return dev->ndisc_ops->is_useropt(opt); >> +else >> +return 0; >> +} >> + >> +static inline void ndisc_send_na(struct net_device *dev, >> + const struct in6_addr *daddr, >> + const struct in6_addr *solicited_addr, >> + bool router, bool solicited, bool override, >> + bool inc_opt) >> +{ >> +if (likely(dev->ndisc_ops->send_na)) >> +dev->ndisc_ops->send_na(dev, daddr, solicited_addr, router, >> +solicited, override, inc_opt); >> +} >> + >> +static inline void ndisc_recv_na(struct sk_buff *skb) >> +{ >> +if (likely(skb->dev->ndisc_ops->recv_na)) >> +skb->dev->ndisc_ops->recv_na(skb); >> +} >> + >> +static inline void ndisc_send_ns(struct net_device *dev, >> + const struct in6_addr *solicit, >> + const struct in6_addr *daddr, >> + const struct in6_addr *saddr) >> +{ >> +if (likely(dev->ndisc_ops->send_ns)) >> +dev->ndisc_ops->send_ns(dev, solicit, daddr, saddr); >> +} >> + >> +static inline void ndisc_recv_ns(struct sk_buff *skb) >> +{ >> +if (likely(skb->dev->ndisc_ops->recv_ns)) >> +skb->dev->ndisc_ops->recv_ns(skb); >> +} >> +#else >> +static inline int ndisc_is_useropt(const struct 
net_device *dev, >> + struct nd_opt_hdr *opt) >> +{ >> +return 0; >> +} >> + >> +static inline void ndisc_send_na(struct net_device *dev, >> + const struct in6_addr *daddr, >> + const struct in6_addr *solicited_addr, >> + bool router, bool solicited, bool override, >> + bool inc_opt) { } >> + >> +static inline void ndisc_rec
OpenWRT wrong adjustment of fq_codel defaults (Was: [Codel] fq_codel_drop vs a udp flood)
Hi Felix, This is an important fix for OpenWRT, please read! OpenWRT changed the default fq_codel sch->limit from 10240 to 1024, without also adjusting q->flows_cnt. Eric explains below that you must also adjust the buckets (q->flows_cnt) for this not to break. (Just adjust it to 128) Problematic OpenWRT commit in question: http://git.openwrt.org/?p=openwrt.git;a=patch;h=12cd6578084e 12cd6578084e ("kernel: revert fq_codel quantum override to prevent it from causing too much cpu load with higher speed (#21326)") I also highly recommend you cherry-pick this very recent commit: net-next: 9d18562a2278 ("fq_codel: add batch ability to fq_codel_drop()") https://git.kernel.org/davem/net-next/c/9d18562a227 This should fix very high CPU usage in case fq_codel goes into drop mode. The problem is that drop mode was considered rare, and implementation wise it was chosen to be more expensive (to save cycles on normal mode). Unfortunately it is easy to trigger with a UDP flood. Drop mode is especially expensive for smaller devices, as it scans a 4K big array, thus 64 cache misses for small devices! The fix is to allow drop-mode to bulk-drop more packets when entering drop-mode (default 64 bulk drop). That way we don't suddenly experience a significantly higher processing cost per packet, but instead can amortize this. To Eric, should we recommend OpenWRT to adjust default (max) 64 bulk drop, given we also recommend bucket size to be 128 ? (thus the amount of memory to scan is less, but their CPU is also much smaller).
--Jesper On Thu, 05 May 2016 12:23:27 -0700 Eric Dumazet wrote: > On Thu, 2016-05-05 at 19:25 +0300, Roman Yeryomin wrote: > > On 5 May 2016 at 19:12, Eric Dumazet wrote: > > > On Thu, 2016-05-05 at 17:53 +0300, Roman Yeryomin wrote: > > > > > >> > > >> qdisc fq_codel 0: dev eth0 root refcnt 2 limit 1024p flows 1024 > > >> quantum 1514 target 5.0ms interval 100.0ms ecn > > >> Sent 12306 bytes 128 pkt (dropped 0, overlimits 0 requeues 0) > > >> backlog 0b 0p requeues 0 > > >> maxpacket 0 drop_overlimit 0 new_flow_count 0 ecn_mark 0 > > >> new_flows_len 0 old_flows_len 0 > > > > > > > > > Limit of 1024 packets and 1024 flows is not wise I think. > > > > > > (If all buckets are in use, each bucket has a virtual queue of 1 packet, > > > which is almost the same than having no queue at all) > > > > > > I suggest to have at least 8 packets per bucket, to let Codel have a > > > chance to trigger. > > > > > > So you could either reduce number of buckets to 128 (if memory is > > > tight), or increase limit to 8192. > > > > Will try, but what I've posted is default, I didn't change/configure that. > > fq_codel has a default of 10240 packets and 1024 buckets. > > http://lxr.free-electrons.com/source/net/sched/sch_fq_codel.c#L413 > > If someone changed that in the linux variant you use, he probably should > explain the rationale. -- Best regards, Jesper Dangaard Brouer MSc.CS, Principal Kernel Engineer at Red Hat Author of http://www.iptv-analyzer.org LinkedIn: http://www.linkedin.com/in/brouer
Re: [PATCHv2 bluetooth-next 07/10] ipv6: introduce neighbour discovery ops
Hi, On 05/03/2016 08:17 PM, Stefan Schmidt wrote: > Hello. > > On 02/05/16 21:36, Hannes Frederic Sowa wrote: >> On 20.04.2016 10:19, Alexander Aring wrote: >>> This patch introduces neighbour discovery ops callback structure. The >>> structure contains at first receive and transmit handling for NS/NA and >>> userspace option field functionality. >>> >>> These callback offers 6lowpan different handling, such as 802.15.4 short >>> address handling or RFC6775 (Neighbor Discovery Optimization for IPv6 over >>> 6LoWPANs). >>> >>> Cc: David S. Miller >>> Cc: Alexey Kuznetsov >>> Cc: James Morris >>> Cc: Hideaki YOSHIFUJI >>> Cc: Patrick McHardy >>> Signed-off-by: Alexander Aring >>> --- >>> include/linux/netdevice.h | 3 ++ >>> include/net/ndisc.h | 96 >>> +++ >>> net/ipv6/addrconf.c | 1 + >>> net/ipv6/ndisc.c | 71 --- >>> net/ipv6/route.c | 2 +- >>> 5 files changed, 144 insertions(+), 29 deletions(-) >>> >>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h >>> index 0052c42..bc60033 100644 >>> --- a/include/linux/netdevice.h >>> +++ b/include/linux/netdevice.h >>> @@ -1677,6 +1677,9 @@ struct net_device { >>> #ifdef CONFIG_NET_L3_MASTER_DEV >>> const struct l3mdev_ops*l3mdev_ops; >>> #endif >>> +#if IS_ENABLED(CONFIG_IPV6) >>> +const struct ndisc_ops *ndisc_ops; >>> +#endif >>> const struct header_ops *header_ops; >>> diff --git a/include/net/ndisc.h b/include/net/ndisc.h >>> index aac868e..14ed016 100644 >>> --- a/include/net/ndisc.h >>> +++ b/include/net/ndisc.h >>> @@ -110,7 +110,8 @@ struct ndisc_options { >>> #define NDISC_OPT_SPACE(len) (((len)+2+7)&~7) >>> -struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, >>> +struct ndisc_options *ndisc_parse_options(const struct net_device *dev, >>> + u8 *opt, int opt_len, >>> struct ndisc_options *ndopts); >>> /* >>> @@ -173,6 +174,93 @@ static inline struct neighbour >>> *__ipv6_neigh_lookup(struct net_device *dev, cons >>> return n; >>> } >>> +static inline int __ip6_ndisc_is_useropt(struct 
nd_opt_hdr *opt) >>> +{ >>> +return opt->nd_opt_type == ND_OPT_RDNSS || >>> +opt->nd_opt_type == ND_OPT_DNSSL; >>> +} >>> + >>> +#if IS_ENABLED(CONFIG_IPV6) >>> +struct ndisc_ops { >>> +int(*is_useropt)(struct nd_opt_hdr *opt); >>> +void(*send_na)(struct net_device *dev, >>> + const struct in6_addr *daddr, >>> + const struct in6_addr *solicited_addr, >>> + bool router, bool solicited, >>> + bool override, bool inc_opt); >>> +void(*recv_na)(struct sk_buff *skb); >>> +void(*send_ns)(struct net_device *dev, >>> + const struct in6_addr *solicit, >>> + const struct in6_addr *daddr, >>> + const struct in6_addr *saddr); >>> +void(*recv_ns)(struct sk_buff *skb); >>> +}; >>> + >>> +static inline int ndisc_is_useropt(const struct net_device *dev, >>> + struct nd_opt_hdr *opt) >>> +{ >>> +if (likely(dev->ndisc_ops->is_useropt)) >>> +return dev->ndisc_ops->is_useropt(opt); >>> +else >>> +return 0; >>> +} >>> + >>> +static inline void ndisc_send_na(struct net_device *dev, >>> + const struct in6_addr *daddr, >>> + const struct in6_addr *solicited_addr, >>> + bool router, bool solicited, bool override, >>> + bool inc_opt) >>> +{ >>> +if (likely(dev->ndisc_ops->send_na)) >>> +dev->ndisc_ops->send_na(dev, daddr, solicited_addr, router, >>> +solicited, override, inc_opt); >>> +} >>> + >>> +static inline void ndisc_recv_na(struct sk_buff *skb) >>> +{ >>> +if (likely(skb->dev->ndisc_ops->recv_na)) >>> +skb->dev->ndisc_ops->recv_na(skb); >>> +} >>> + >>> +static inline void ndisc_send_ns(struct net_device *dev, >>> + const struct in6_addr *solicit, >>> + const struct in6_addr *daddr, >>> + const struct in6_addr *saddr) >>> +{ >>> +if (likely(dev->ndisc_ops->send_ns)) >>> +dev->ndisc_ops->send_ns(dev, solicit, daddr, saddr); >>> +} >>> + >>> +static inline void ndisc_recv_ns(struct sk_buff *skb) >>> +{ >>> +if (likely(skb->dev->ndisc_ops->recv_ns)) >>> +skb->dev->ndisc_ops->recv_ns(skb); >>> +} >>> +#else >>> +static inline int ndisc_is_useropt(const struct net_device *dev, >>> + struct 
nd_opt_hdr *opt) >>> +{ >>> +return 0; >>> +} >>> + >>> +static inline void ndisc_send_na(struct net_device *dev, >>> + const struct in6_addr *daddr, >>> + const struct in6_addr *solicited_addr, >>> + bool router, bool solicited, bool override, >>> + bool inc_opt) { } >>> + >>> +static inline void ndisc_recv_na(struct sk_b
Re: [PATCH v9 net-next 4/7] openvswitch: add layer 3 flow/port support
On Wed, 4 May 2016 16:36:30 +0900, Simon Horman wrote: > +static int push_eth(struct sk_buff *skb, struct sw_flow_key *key, > + const struct ovs_action_push_eth *ethh) > +{ > + int err; > + > + /* De-accelerate any hardware accelerated VLAN tag added to a previous > + * Ethernet header */ > + err = skb_vlan_deaccel(skb); > + if (unlikely(err)) > + return err; > + > + /* Add the new Ethernet header */ > + if (skb_cow_head(skb, ETH_HLEN) < 0) > + return -ENOMEM; > + > + skb_push(skb, ETH_HLEN); > + skb_reset_mac_header(skb); > + skb_reset_mac_len(skb); > + > + ether_addr_copy(eth_hdr(skb)->h_source, ethh->addresses.eth_src); > + ether_addr_copy(eth_hdr(skb)->h_dest, ethh->addresses.eth_dst); > + eth_hdr(skb)->h_proto = ethh->eth_type; This doesn't seem right. We know the packet type, it's skb->protocol. We should fill in that. In addition, we should check whether mac_len > 0 and in such case, change skb->protocol to ETH_P_TEB first (and store that value in the pushed eth header). Similarly on pop_eth, we need to check skb->protocol and if it is ETH_P_TEB, call eth_type_trans on the modified frame to set the new skb->protocol correctly. It's probably not that simple, as we'd need a version of eth_type_trans that doesn't need a net device. Jiri
Re: [PATCHv2 bluetooth-next 02/10] 6lowpan: add 802.15.4 short addr slaac
Hi, On 05/03/2016 08:16 PM, Stefan Schmidt wrote: > Hello. > > On 20/04/16 10:19, Alexander Aring wrote: >> This patch adds the autoconfiguration if a valid 802.15.4 short address >> is available for 802.15.4 6LoWPAN interfaces. >> >> Cc: David S. Miller >> Cc: Alexey Kuznetsov >> Cc: James Morris >> Cc: Hideaki YOSHIFUJI >> Cc: Patrick McHardy >> Signed-off-by: Alexander Aring >> --- >> include/net/addrconf.h | 3 +++ >> net/6lowpan/core.c | 46 ++ >> net/ipv6/addrconf.c| 5 +++-- >> 3 files changed, 52 insertions(+), 2 deletions(-) >> >> diff --git a/include/net/addrconf.h b/include/net/addrconf.h >> index 730d856..b1774eb 100644 >> --- a/include/net/addrconf.h >> +++ b/include/net/addrconf.h >> @@ -94,6 +94,9 @@ int ipv6_rcv_saddr_equal(const struct sock *sk, const >> struct sock *sk2, >> void addrconf_join_solict(struct net_device *dev, const struct in6_addr >> *addr); >> void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr >> *addr); >> +void addrconf_add_linklocal(struct inet6_dev *idev, >> +const struct in6_addr *addr, u32 flags); >> + >> static inline int addrconf_ifid_eui48(u8 *eui, struct net_device *dev) >> { >> if (dev->addr_len != ETH_ALEN) >> diff --git a/net/6lowpan/core.c b/net/6lowpan/core.c >> index 7a240b3..fbae31e 100644 >> --- a/net/6lowpan/core.c >> +++ b/net/6lowpan/core.c >> @@ -14,6 +14,7 @@ >> #include >> #include >> +#include >> #include "6lowpan_i.h" >> @@ -72,16 +73,61 @@ void lowpan_unregister_netdev(struct net_device *dev) >> } >> EXPORT_SYMBOL(lowpan_unregister_netdev); >> +static int addrconf_ifid_802154_6lowpan(u8 *eui, struct net_device *dev) >> +{ >> +struct wpan_dev *wpan_dev = >> lowpan_802154_dev(dev)->wdev->ieee802154_ptr; >> + >> +/* Set short_addr autoconfiguration if short_addr is present only */ >> +if (!ieee802154_is_valid_src_short_addr(wpan_dev->short_addr)) >> +return -1; > > -EINVAL instead of -1? > The ipv6 implementation do that for the "dev->addr" slaac the same here. 
I think the reason is that if this fails for any reason, it is simply not an error: we just don't add a link-local address for short addr slaac here and don't completely abort the CHANGE/UP of the interface. IPv6 also uses a "-1" return value only if parsing fails. I would like to see that for the iphc stuff as well, where we mix a lot of "-EIO" and "-EINVAL"; anyway, nobody will really care about that return value. It just means: 0 parsing successful and -1 parsing failed. - Alex
Re: [PATCH v9 net-next 4/7] openvswitch: add layer 3 flow/port support
On Fri, 6 May 2016 14:57:07 +0900, Simon Horman wrote: > On Thu, May 05, 2016 at 10:37:08AM -0700, pravin shelar wrote: > > On transmit side you are using mac_len to detect l3 packet, why not do > > same while extracting the key? I agree. The skb should be self-contained, i.e. it should be obvious whether it has the MAC header set or not just from the skb itself at any point in the packet processing. Otherwise, I'd expect things like recirculation to break after push/pop of eth header. > Unfortunately mac_len can't be relied on here, empirically it has the same > value (28 in my tests) for both the TEB and layer3 case above. That's strange, it looks like there's something setting the mac header unconditionally in ovs code. We should find that place and change it. The ARPHRD_NONE interfaces don't even set mac_header and mac_len, this will need to be set by ovs upon getting frame from such interface. > Perhaps that could be changed by further enhancements in the tunneling code > but I think things are symmetric as they stand: > > * On receive skb->protocol can be read to distinguish TEB and layer3 packets > * On transmit skb->protocol should be set to distinguish TEB and layer3 > packets Yes, but you need to act upon this directly after receiving the frame/just before sending the frame and set up an internal flag that will be used throughout the code. That way, the packet can be handed over to different parts of the code, recirculated, etc. without worries. skb->mac_len is probably a good candidate for such flag. Jiri
[patch net 1/3] mlxsw: spectrum: Fix rollback order in LAG join failure
From: Ido Schimmel Make the leave procedure in the error path symmetric to the join procedure and first remove the port from the collector before potentially destroying the LAG. Fixes: 0d65fc13042f ("mlxsw: spectrum: Implement LAG port join/leave") Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 4afbc3e..668b2f4 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -2541,11 +2541,11 @@ static int mlxsw_sp_port_lag_join(struct mlxsw_sp_port *mlxsw_sp_port, lag->ref_count++; return 0; +err_col_port_enable: + mlxsw_sp_lag_col_port_remove(mlxsw_sp_port, lag_id); err_col_port_add: if (!lag->ref_count) mlxsw_sp_lag_destroy(mlxsw_sp, lag_id); -err_col_port_enable: - mlxsw_sp_lag_col_port_remove(mlxsw_sp_port, lag_id); return err; } -- 2.5.5
[patch net 0/3] mlxsw: Couple of fixes
From: Jiri Pirko Ido Schimmel (2): mlxsw: spectrum: Fix rollback order in LAG join failure mlxsw: spectrum: Add missing rollback in flood configuration Jiri Pirko (1): mlxsw: spectrum: Fix ordering in mlxsw_sp_fini drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 5 +++-- drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c | 8 2 files changed, 11 insertions(+), 2 deletions(-) -- 2.5.5
[patch net 2/3] mlxsw: spectrum: Add missing rollback in flood configuration
From: Ido Schimmel When we fail to set the flooding configuration for the broadcast and unregistered multicast traffic, we should revert the flooding configuration of the unknown unicast traffic. Fixes: 0293038e0c36 ("mlxsw: spectrum: Add support for flood control") Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko --- drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c | 8 1 file changed, 8 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index e1c74ef..9cd6f47 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -214,7 +214,15 @@ static int __mlxsw_sp_port_flood_set(struct mlxsw_sp_port *mlxsw_sp_port, mlxsw_reg_sftr_pack(sftr_pl, MLXSW_SP_FLOOD_TABLE_BM, idx_begin, table_type, range, local_port, set); err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sftr), sftr_pl); + if (err) + goto err_flood_bm_set; + else + goto buffer_out; +err_flood_bm_set: + mlxsw_reg_sftr_pack(sftr_pl, MLXSW_SP_FLOOD_TABLE_UC, idx_begin, + table_type, range, local_port, !set); + mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sftr), sftr_pl); buffer_out: kfree(sftr_pl); return err; -- 2.5.5
[patch net 3/3] mlxsw: spectrum: Fix ordering in mlxsw_sp_fini
From: Jiri Pirko Fixes: 0f433fa0ec ("mlxsw: spectrum_buffers: Implement shared buffer configuration") Signed-off-by: Jiri Pirko --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 668b2f4..749cc27 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -2159,6 +2159,7 @@ static void mlxsw_sp_fini(void *priv) struct mlxsw_sp *mlxsw_sp = priv; mlxsw_sp_switchdev_fini(mlxsw_sp); + mlxsw_sp_buffers_fini(mlxsw_sp); mlxsw_sp_traps_fini(mlxsw_sp); mlxsw_sp_event_unregister(mlxsw_sp, MLXSW_TRAP_ID_PUDE); mlxsw_sp_ports_remove(mlxsw_sp); -- 2.5.5
Re: [PATCH v9 net-next 7/7] openvswitch: use ipgre tunnel rather than gretap tunnel
On Fri, 6 May 2016 15:54:02 +0900, Simon Horman wrote: > -int ovs_netdev_send_raw_tun(struct sk_buff *skb) > -{ > - if (skb->mac_len) > - skb->protocol = ntohs(ETH_P_TEB); > + if (dev->type != ARPHRD_ETHER && skb->mac_len) { > + skb->protocol = htons(ETH_P_TEB); > + } else if (dev->type == ARPHRD_ETHER && !skb->mac_len) { > + kfree_skb(skb); > + return -EINVAL; > + } This was something I was missing in your patches (sorry, did not get to the full review yet). You'll also need to enable ARPHRD_NONE and ARPHRD_IPGRE interfaces in ovs_netdev_link. Jiri
Re: [PATCH] cfg80211/nl80211: add wifi tx power mode switching support
On Fri, May 6, 2016 at 12:07 AM, Dan Williams wrote: > > On Thu, 2016-05-05 at 14:44 +0800, Wei-Ning Huang wrote: > > Recent new hardware has the ability to switch between tablet mode and > > clamshell mode. To optimize WiFi performance, we want to be able to > > use > > different power table between modes. This patch adds a new netlink > > message type and cfg80211_ops function to allow userspace to trigger > > a > > power mode switch for a given wireless interface. > > > > Signed-off-by: Wei-Ning Huang > > --- > > include/net/cfg80211.h | 11 +++ > > include/uapi/linux/nl80211.h | 21 + > > net/wireless/nl80211.c | 16 > > net/wireless/rdev-ops.h | 22 ++ > > net/wireless/trace.h | 20 > > 5 files changed, 90 insertions(+) > > > > diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h > > index 9e1b24c..aa77fa0 100644 > > --- a/include/net/cfg80211.h > > +++ b/include/net/cfg80211.h > > @@ -2370,6 +2370,12 @@ struct cfg80211_qos_map { > > * @get_tx_power: store the current TX power into the dbm variable; > > * return 0 if successful > > * > > + * @set_tx_power_mode: set the transmit power mode. Some device have > > the ability > > + * to transform between different mode such as clamshell and > > tablet mode. > > + * set_tx_power_mode allows setting of different TX power > > mode at runtime. 
> > + * @get_tx_power_mode: store the current TX power mode into the mode > > variable; > > + * return 0 if successful > > + * > > * @set_wds_peer: set the WDS peer for a WDS interface > > * > > * @rfkill_poll: polls the hw rfkill line, use cfg80211 reporting > > @@ -2631,6 +2637,11 @@ struct cfg80211_ops { > > int (*get_tx_power)(struct wiphy *wiphy, struct > > wireless_dev *wdev, > > int *dbm); > > > > + int (*set_tx_power_mode)(struct wiphy *wiphy, > > + enum nl80211_tx_power_mode > > mode); > > + int (*get_tx_power_mode)(struct wiphy *wiphy, > > + enum nl80211_tx_power_mode > > *mode); > > + > > int (*set_wds_peer)(struct wiphy *wiphy, struct > > net_device *dev, > > const u8 *addr); > > > > diff --git a/include/uapi/linux/nl80211.h > > b/include/uapi/linux/nl80211.h > > index 5a30a75..9b1888a 100644 > > --- a/include/uapi/linux/nl80211.h > > +++ b/include/uapi/linux/nl80211.h > > @@ -1796,6 +1796,9 @@ enum nl80211_commands { > > * connecting to a PCP, and in %NL80211_CMD_START_AP to start > > * a PCP instead of AP. Relevant for DMG networks only. > > * > > + * @NL80211_ATTR_WIPHY_TX_POWER_MODE: Transmit power mode. See > > + * &enum nl80211_tx_power_mode for possible values. 
> > + * > > * @NUM_NL80211_ATTR: total number of nl80211_attrs available > > * @NL80211_ATTR_MAX: highest attribute number currently defined > > * @__NL80211_ATTR_AFTER_LAST: internal use > > @@ -2172,6 +2175,8 @@ enum nl80211_attrs { > > > > NL80211_ATTR_PBSS, > > > > + NL80211_ATTR_WIPHY_TX_POWER_MODE, > > + > > /* add attributes here, update the policy in nl80211.c */ > > > > __NL80211_ATTR_AFTER_LAST, > > @@ -3703,6 +3708,22 @@ enum nl80211_tx_power_setting { > > }; > > > > /** > > + * enum nl80211_tx_power_mode - TX power mode setting > > + * @NL80211_TX_POWER_LOW: general low TX power mode > > + * @NL80211_TX_POWER_MEDIUM: general medium TX power mode > > + * @NL80211_TX_POWER_HIGH: general high TX power mode > > + * @NL80211_TX_POWER_CLAMSHELL: clamshell mode TX power mode > > + * @NL80211_TX_POWER_TABLET: tablet mode TX power mode > > + */ > > +enum nl80211_tx_power_mode { > > + NL80211_TX_POWER_LOW, > > + NL80211_TX_POWER_MEDIUM, > > + NL80211_TX_POWER_HIGH, > > + NL80211_TX_POWER_CLAMSHELL, > > + NL80211_TX_POWER_TABLET, > > "clamshell" and "tablet" probably mean many different things to many > different people with respect to whether or not they should do anything > with power saving or wifi. I feel like a more generic interface is > needed here. We could probably drop the two CLAMSHELL and TABLET constants, or describe what they mean in more detail? > > Could this be already done by: > @NL80211_ATTR_WIPHY_TX_POWER_SETTING = NL80211_TX_POWER_LIMITED > @NL80211_ATTR_WIPHY_TX_POWER_LEVEL = > > and then the device would be able to change its TX power as it saw fit > up to that limit set by your application which figures out whether it's > in clamshell or tablet mode? We usually want different power settings for different bands/channels. For example, we can have three different power settings in 2.4Ghz, channels 36-64 & channels 100+ on 5Ghz. In this case, we cannot simply set a fixed number to the power level. 
A power table is required to map channel/band to the actual power limit. For most drivers, changing the power table requires loading a new set of calibration data f
Re: [PATCH net-next] vxlan: if_arp: introduce ARPHRD_VXLANGPE
On Thu, May 05, 2016 at 04:56:43PM -0300, Thadeu Lima de Souza Cascardo wrote: > On Thu, May 05, 2016 at 09:31:41PM +0200, Jiri Benc wrote: > > On Thu, 5 May 2016 13:36:44 -0300, Thadeu Lima de Souza Cascardo wrote: > > > Use ARPHRD_VXLANGPE to identify VxLAN GPE interfaces. This is going to be > > > used > > > to allow GPE interfaces to be added as openvswitch ports. > > > > What's wrong with ARPHRD_NONE? I don't think we need a separate type > > for VXLAN-GPE. Just use ARPHRD_NONE in ovs and things should work, for > > all ARPHRD_NONE interfaces as a bonus. > > > > That's fine for me. I looked quickly at the few devices using ARPHRD_NONE in > upstream kernel, not sure if there are broken out-of-tree drivers out there. > And > should we care? It seems unlikely to me that we should.
[GIT PULL nf-next 0/1] Second Round of IPVS Updates for v4.7
Hi Pablo, please consider these enhancements to the IPVS. They allow its DoS mitigation strategy to be effective in conjunction with the SIP persistence engine. The following changes since commit cb39ad8b8ef224c544074962780bf763077d6141: netfilter: nf_tables: allow set names up to 32 bytes (2016-05-05 16:39:51 +0200) are available in the git repository at: https://git.kernel.org/pub/scm/linux/kernel/git/horms/ipvs-next.git tags/ipvs2-for-v4.7 for you to fetch changes up to 698e2a8dca98e4de32f3f630e6d9cd93753c52e1: ipvs: make drop_entry protection effective for SIP-pe (2016-05-06 16:26:23 +0900) Marco Angaroni (1): ipvs: make drop_entry protection effective for SIP-pe net/netfilter/ipvs/ip_vs_conn.c | 22 +++--- net/netfilter/ipvs/ip_vs_core.c | 8 +++- 2 files changed, 26 insertions(+), 4 deletions(-) Marco Angaroni (1): ipvs: make drop_entry protection effective for SIP-pe net/netfilter/ipvs/ip_vs_conn.c | 22 +++--- net/netfilter/ipvs/ip_vs_core.c | 8 +++- 2 files changed, 26 insertions(+), 4 deletions(-) -- 2.7.0.rc3.207.g0ac5344
[PATCH nf-next 1/1] ipvs: make drop_entry protection effective for SIP-pe
From: Marco Angaroni DoS protection policy that deletes connections to avoid out of memory is currently not effective for SIP-pe plus OPS-mode for two reasons: 1) connection templates (holding SIP call-id) are always skipped in ip_vs_random_dropentry() 2) in_pkts counter (used by drop_entry algorithm) is not incremented for connection templates This patch addresses such problems with the following changes: a) connection templates associated (via their dest) to virtual-services configured in OPS mode are included in ip_vs_random_dropentry() monitoring. This applies to SIP-pe over UDP (which requires OPS mode), but is more general principle: when OPS is controlled by templates memory can be used only by templates themselves, since OPS conns are deleted after packet is forwarded. b) OPS connections, if controlled by a template, cause increment of in_pkts counter of their template. This is already happening but only in case director is in master-slave mode (see ip_vs_sync_conn()). Signed-off-by: Marco Angaroni Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_conn.c | 22 +++--- net/netfilter/ipvs/ip_vs_core.c | 8 +++- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 292365ffa4f0..2cb3c626cd43 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -1261,6 +1261,16 @@ static inline int todrop_entry(struct ip_vs_conn *cp) return 1; } +static inline bool ip_vs_conn_ops_mode(struct ip_vs_conn *cp) +{ + struct ip_vs_service *svc; + + if (!cp->dest) + return false; + svc = rcu_dereference(cp->dest->svc); + return svc && (svc->flags & IP_VS_SVC_F_ONEPACKET); +} + /* Called from keventd and must protect itself from softirqs */ void ip_vs_random_dropentry(struct netns_ipvs *ipvs) { @@ -1275,11 +1285,16 @@ void ip_vs_random_dropentry(struct netns_ipvs *ipvs) unsigned int hash = prandom_u32() & ip_vs_conn_tab_mask; 
hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { - if (cp->flags & IP_VS_CONN_F_TEMPLATE) - /* connection template */ - continue; if (cp->ipvs != ipvs) continue; + if (cp->flags & IP_VS_CONN_F_TEMPLATE) { + if (atomic_read(&cp->n_control) || + !ip_vs_conn_ops_mode(cp)) + continue; + else + /* connection template of OPS */ + goto try_drop; + } if (cp->protocol == IPPROTO_TCP) { switch(cp->state) { case IP_VS_TCP_S_SYN_RECV: @@ -1307,6 +1322,7 @@ void ip_vs_random_dropentry(struct netns_ipvs *ipvs) continue; } } else { +try_drop: if (!todrop_entry(cp)) continue; } diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index f3bac2e9a25a..1207f20d24e4 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -612,7 +612,10 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, ret = cp->packet_xmit(skb, cp, pd->pp, iph); /* do not touch skb anymore */ - atomic_inc(&cp->in_pkts); + if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control) + atomic_inc(&cp->control->in_pkts); + else + atomic_inc(&cp->in_pkts); ip_vs_conn_put(cp); return ret; } @@ -1991,6 +1994,9 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int if (ipvs->sync_state & IP_VS_STATE_MASTER) ip_vs_sync_conn(ipvs, cp, pkts); + else if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control) + /* increment is done inside ip_vs_sync_conn too */ + atomic_inc(&cp->control->in_pkts); ip_vs_conn_put(cp); return ret; -- 2.7.0.rc3.207.g0ac5344
[net-next 05/11] i40evf: Drop packet split receive routine
From: Jesse Brandeburg As part of preparation for the rx-refactor, remove the packet split receive routine and ancillary code. Some of the split related context set up code stays in i40e_virtchnl_pf.c in case an older VF driver tries to load and still wants to use packet split. Signed-off-by: Jesse Brandeburg Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 2 +- drivers/net/ethernet/intel/i40evf/i40e_txrx.c | 21 +--- drivers/net/ethernet/intel/i40evf/i40e_txrx.h | 7 -- drivers/net/ethernet/intel/i40evf/i40evf.h | 2 -- drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c | 14 --- drivers/net/ethernet/intel/i40evf/i40evf_main.c| 28 +- .../net/ethernet/intel/i40evf/i40evf_virtchnl.c| 4 7 files changed, 3 insertions(+), 75 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 6b9db79..36aa33a 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -590,7 +590,7 @@ static int i40e_config_vsi_rx_queue(struct i40e_vf *vf, u16 vsi_id, } rx_ctx.hbuff = info->hdr_size >> I40E_RXQ_CTX_HBUFF_SHIFT; - /* set splitalways mode 10b */ + /* set split mode 10b */ rx_ctx.dtype = I40E_RX_DTYPE_HEADER_SPLIT; } diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c index a37a3f3..61d4a7a 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c @@ -504,22 +504,6 @@ void i40evf_clean_rx_ring(struct i40e_ring *rx_ring) if (!rx_ring->rx_bi) return; - if (ring_is_ps_enabled(rx_ring)) { - int bufsz = ALIGN(rx_ring->rx_hdr_len, 256) * rx_ring->count; - - rx_bi = &rx_ring->rx_bi[0]; - if (rx_bi->hdr_buf) { - dma_free_coherent(dev, - bufsz, - rx_bi->hdr_buf, - rx_bi->dma); - for (i = 0; i < rx_ring->count; i++) { - rx_bi = &rx_ring->rx_bi[i]; - rx_bi->dma = 0; - rx_bi->hdr_buf = NULL; - 
} - } - } /* Free all the Rx ring sk_buffs */ for (i = 0; i < rx_ring->count; i++) { rx_bi = &rx_ring->rx_bi[i]; @@ -1435,10 +1419,7 @@ int i40evf_napi_poll(struct napi_struct *napi, int budget) i40e_for_each_ring(ring, q_vector->rx) { int cleaned; - if (ring_is_ps_enabled(ring)) - cleaned = i40e_clean_rx_irq_ps(ring, budget_per_ring); - else - cleaned = i40e_clean_rx_irq_1buf(ring, budget_per_ring); + cleaned = i40e_clean_rx_irq_1buf(ring, budget_per_ring); work_done += cleaned; /* if we clean as many as budgeted, we must not be done */ diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.h b/drivers/net/ethernet/intel/i40evf/i40e_txrx.h index 3b3f976..f24a97e 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.h +++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.h @@ -244,16 +244,9 @@ struct i40e_rx_queue_stats { enum i40e_ring_state_t { __I40E_TX_FDIR_INIT_DONE, __I40E_TX_XPS_INIT_DONE, - __I40E_RX_PS_ENABLED, __I40E_RX_16BYTE_DESC_ENABLED, }; -#define ring_is_ps_enabled(ring) \ - test_bit(__I40E_RX_PS_ENABLED, &(ring)->state) -#define set_ring_ps_enabled(ring) \ - set_bit(__I40E_RX_PS_ENABLED, &(ring)->state) -#define clear_ring_ps_enabled(ring) \ - clear_bit(__I40E_RX_PS_ENABLED, &(ring)->state) #define ring_is_16byte_desc_enabled(ring) \ test_bit(__I40E_RX_16BYTE_DESC_ENABLED, &(ring)->state) #define set_ring_16byte_desc_enabled(ring) \ diff --git a/drivers/net/ethernet/intel/i40evf/i40evf.h b/drivers/net/ethernet/intel/i40evf/i40evf.h index 25afabf..83ccc58 100644 --- a/drivers/net/ethernet/intel/i40evf/i40evf.h +++ b/drivers/net/ethernet/intel/i40evf/i40evf.h @@ -209,8 +209,6 @@ struct i40evf_adapter { u32 flags; #define I40EVF_FLAG_RX_CSUM_ENABLED BIT(0) #define I40EVF_FLAG_RX_1BUF_CAPABLE BIT(1) -#define I40EVF_FLAG_RX_PS_CAPABLEBIT(2) -#define I40EVF_FLAG_RX_PS_ENABLEDBIT(3) #define I40EVF_FLAG_IMIR_ENABLED BIT(5) #define I40EVF_FLAG_MQ_CAPABLE BIT(6) #define I40EVF_FLAG_NEED_LINK_UPDATE BIT(7) diff --git 
a/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c b/drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c index 5a48ee0..e97
[net-next 06/11] i40evf: refactor receive routine
From: Jesse Brandeburg This is part 2 of the Rx refactor series, just including changes to i40evf. This refactor aligns the receive routine with the one in ixgbe which was highly optimized. This reduces the code we have to maintain and allows for (hopefully) more readable and maintainable RX hot path. In order to do this: - consolidate the receive path into a single function that doesn't use packet split but *does* use pages for Rx buffers. - remove the old _1buf routine - consolidate several routines into helper functions - remove VF ethtool control over packet split - remove priv_flags interface since it is unused Signed-off-by: Jesse Brandeburg Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40evf/i40e_txrx.c | 898 +++-- drivers/net/ethernet/intel/i40evf/i40e_txrx.h | 37 +- drivers/net/ethernet/intel/i40evf/i40evf.h | 5 - drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c | 51 -- drivers/net/ethernet/intel/i40evf/i40evf_main.c| 3 +- 5 files changed, 481 insertions(+), 513 deletions(-) diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c index 61d4a7a..fd7dae46 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c @@ -496,7 +496,6 @@ err: void i40evf_clean_rx_ring(struct i40e_ring *rx_ring) { struct device *dev = rx_ring->dev; - struct i40e_rx_buffer *rx_bi; unsigned long bi_size; u16 i; @@ -506,30 +505,20 @@ void i40evf_clean_rx_ring(struct i40e_ring *rx_ring) /* Free all the Rx ring sk_buffs */ for (i = 0; i < rx_ring->count; i++) { - rx_bi = &rx_ring->rx_bi[i]; - if (rx_bi->dma) { - dma_unmap_single(dev, -rx_bi->dma, -rx_ring->rx_buf_len, -DMA_FROM_DEVICE); - rx_bi->dma = 0; - } + struct i40e_rx_buffer *rx_bi = &rx_ring->rx_bi[i]; + if (rx_bi->skb) { dev_kfree_skb(rx_bi->skb); rx_bi->skb = NULL; } - if (rx_bi->page) { - if (rx_bi->page_dma) { - dma_unmap_page(dev, - rx_bi->page_dma, - PAGE_SIZE, - DMA_FROM_DEVICE); 
- rx_bi->page_dma = 0; - } - __free_page(rx_bi->page); - rx_bi->page = NULL; - rx_bi->page_offset = 0; - } + if (!rx_bi->page) + continue; + + dma_unmap_page(dev, rx_bi->dma, PAGE_SIZE, DMA_FROM_DEVICE); + __free_pages(rx_bi->page, 0); + + rx_bi->page = NULL; + rx_bi->page_offset = 0; } bi_size = sizeof(struct i40e_rx_buffer) * rx_ring->count; @@ -538,6 +527,7 @@ void i40evf_clean_rx_ring(struct i40e_ring *rx_ring) /* Zero out the descriptor ring */ memset(rx_ring->desc, 0, rx_ring->size); + rx_ring->next_to_alloc = 0; rx_ring->next_to_clean = 0; rx_ring->next_to_use = 0; } @@ -562,37 +552,6 @@ void i40evf_free_rx_resources(struct i40e_ring *rx_ring) } /** - * i40evf_alloc_rx_headers - allocate rx header buffers - * @rx_ring: ring to alloc buffers - * - * Allocate rx header buffers for the entire ring. As these are static, - * this is only called when setting up a new ring. - **/ -void i40evf_alloc_rx_headers(struct i40e_ring *rx_ring) -{ - struct device *dev = rx_ring->dev; - struct i40e_rx_buffer *rx_bi; - dma_addr_t dma; - void *buffer; - int buf_size; - int i; - - if (rx_ring->rx_bi[0].hdr_buf) - return; - /* Make sure the buffers don't cross cache line boundaries. */ - buf_size = ALIGN(rx_ring->rx_hdr_len, 256); - buffer = dma_alloc_coherent(dev, buf_size * rx_ring->count, - &dma, GFP_KERNEL); - if (!buffer) - return; - for (i = 0; i < rx_ring->count; i++) { - rx_bi = &rx_ring->rx_bi[i]; - rx_bi->dma = dma + (i * buf_size); - rx_bi->hdr_buf = buffer + (i * buf_size); - } -} - -/** * i40evf_setup_rx_descriptors - Allocate Rx descriptors * @rx_ring: Rx descriptor ring (for a specific queue) to setup * @@ -613,9 +572,7 @@ int i40evf_setup_rx_descriptors(struct i40e_ring *rx_ring) u64_stats_init(&rx_ring->syncp); /* Round up to nearest 4K */ - rx_ring->size = ring_is_16byte_desc_enabled(rx_ring) - ? rx_ring->count * sizeof(union i40e_16b
[net-next 08/11] i40evf: Allocate Rx buffers properly
From: Mitch Williams Allocate the correct number of RX buffers, and don't fiddle with next_to_use. The common RX code handles all of this. This fixes a memory leak of one page each time the driver is opened. Change-Id: Id06eca353086e084921f047acad28c14745684ee Signed-off-by: Mitch Williams Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40evf/i40evf_main.c | 5 + 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c b/drivers/net/ethernet/intel/i40evf/i40evf_main.c index 870bad8..b548dbe 100644 --- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c +++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c @@ -990,9 +990,7 @@ static void i40evf_configure(struct i40evf_adapter *adapter) for (i = 0; i < adapter->num_active_queues; i++) { struct i40e_ring *ring = &adapter->rx_rings[i]; - i40evf_alloc_rx_buffers(ring, ring->count); - ring->next_to_use = ring->count - 1; - writel(ring->next_to_use, ring->tail); + i40evf_alloc_rx_buffers(ring, I40E_DESC_UNUSED(ring)); } } @@ -2768,7 +2766,6 @@ static void i40evf_remove(struct pci_dev *pdev) iounmap(hw->hw_addr); pci_release_regions(pdev); - i40evf_free_all_tx_resources(adapter); i40evf_free_all_rx_resources(adapter); i40evf_free_queues(adapter); -- 2.5.5
[net-next 07/11] i40e/i40evf: Remove unused hardware receive descriptor code
From: Jesse Brandeburg The hardware supports a 16 byte descriptor for receive, but the driver was never using it in production. There was no performance benefit to the real driver of 16 byte descriptors, so drop a whole lot of complexity while getting rid of the code. Also since the previous patch made us use no-split mode all the time, drop any support in the driver for any other value in dtype and assume it is always zero (aka no-split). Hooray for code removal! Change-ID: I2257e902e4dad84a07b94db6d2e6f4ce69b27bc0 Signed-off-by: Jesse Brandeburg Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e.h | 7 +-- drivers/net/ethernet/intel/i40e/i40e_debugfs.c | 16 +--- drivers/net/ethernet/intel/i40e/i40e_main.c| 18 +- drivers/net/ethernet/intel/i40e/i40e_txrx.h| 24 ++-- drivers/net/ethernet/intel/i40evf/i40e_txrx.h | 24 ++-- 5 files changed, 27 insertions(+), 62 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h index ebf423b..2a6a5d3 100644 --- a/drivers/net/ethernet/intel/i40e/i40e.h +++ b/drivers/net/ethernet/intel/i40e/i40e.h @@ -122,10 +122,7 @@ #define XSTRINGIFY(bar) STRINGIFY(bar) #define I40E_RX_DESC(R, i) \ - ((ring_is_16byte_desc_enabled(R)) \ - ? 
(union i40e_32byte_rx_desc *) \ - (&(((union i40e_16byte_rx_desc *)((R)->desc))[i])) \ - : (&(((union i40e_32byte_rx_desc *)((R)->desc))[i]))) + (&(((union i40e_32byte_rx_desc *)((R)->desc))[i])) #define I40E_TX_DESC(R, i) \ (&(((struct i40e_tx_desc *)((R)->desc))[i])) #define I40E_TX_CTXTDESC(R, i) \ @@ -327,7 +324,6 @@ struct i40e_pf { #ifdef I40E_FCOE #define I40E_FLAG_FCOE_ENABLED BIT_ULL(11) #endif /* I40E_FCOE */ -#define I40E_FLAG_16BYTE_RX_DESC_ENABLED BIT_ULL(13) #define I40E_FLAG_CLEAN_ADMINQ BIT_ULL(14) #define I40E_FLAG_FILTER_SYNC BIT_ULL(15) #define I40E_FLAG_SERVICE_CLIENT_REQUESTED BIT_ULL(16) @@ -532,7 +528,6 @@ struct i40e_vsi { u16 max_frame; u16 rx_buf_len; - u8 dtype; /* List of q_vectors allocated to this VSI */ struct i40e_q_vector **q_vectors; diff --git a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c index 8ae30f7..e6af8c8 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c +++ b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c @@ -361,7 +361,7 @@ static void i40e_dbg_dump_vsi_seid(struct i40e_pf *pf, int seid) vsi->work_limit); dev_info(&pf->pdev->dev, "max_frame = %d, rx_buf_len = %d dtype = %d\n", -vsi->max_frame, vsi->rx_buf_len, vsi->dtype); +vsi->max_frame, vsi->rx_buf_len, 0); dev_info(&pf->pdev->dev, "num_q_vectors = %i, base_vector = %i\n", vsi->num_q_vectors, vsi->base_vector); @@ -586,13 +586,6 @@ static void i40e_dbg_dump_desc(int cnt, int vsi_seid, int ring_id, int desc_n, " d[%03x] = 0x%016llx 0x%016llx\n", i, txd->buffer_addr, txd->cmd_type_offset_bsz); - } else if (sizeof(union i40e_rx_desc) == - sizeof(union i40e_16byte_rx_desc)) { - rxd = I40E_RX_DESC(ring, i); - dev_info(&pf->pdev->dev, -" d[%03x] = 0x%016llx 0x%016llx\n", -i, rxd->read.pkt_addr, -rxd->read.hdr_addr); } else { rxd = I40E_RX_DESC(ring, i); dev_info(&pf->pdev->dev, @@ -614,13 +607,6 @@ static void i40e_dbg_dump_desc(int cnt, int vsi_seid, int ring_id, int desc_n, "vsi = %02i tx ring = %02i d[%03x] = 
0x%016llx 0x%016llx\n", vsi_seid, ring_id, desc_n, txd->buffer_addr, txd->cmd_type_offset_bsz); - } else if (sizeof(union i40e_rx_desc) == - sizeof(union i40e_16byte_rx_desc)) { - rxd = I40E_RX_DESC(ring, desc_n); - dev_info(&pf->pdev->dev, -"vsi = %02i rx ring = %02i d[%03x] = 0x%016llx 0x%016llx\n", -vsi_seid, ring_id, desc_n, -rxd->read.pkt_addr, rxd->read.hdr_addr); } else { rxd = I40E_RX_DESC(ring, desc_n); dev_info(&pf->pdev->dev, diff --git a/dri
[net-next 02/11] i40e: Drop packet split receive routine
From: Jesse Brandeburg As part of preparation for the rx-refactor, remove the packet split receive routine and ancillary code. Signed-off-by: Jesse Brandeburg Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e.h | 3 - drivers/net/ethernet/intel/i40e/i40e_debugfs.c | 4 +- drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 19 -- drivers/net/ethernet/intel/i40e/i40e_main.c| 49 + drivers/net/ethernet/intel/i40e/i40e_txrx.c| 245 + drivers/net/ethernet/intel/i40e/i40e_txrx.h| 7 - 6 files changed, 10 insertions(+), 317 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h index 00c4738..ea6a69a 100644 --- a/drivers/net/ethernet/intel/i40e/i40e.h +++ b/drivers/net/ethernet/intel/i40e/i40e.h @@ -101,7 +101,6 @@ #define I40E_PRIV_FLAGS_LINKPOLL_FLAG BIT(1) #define I40E_PRIV_FLAGS_FD_ATR BIT(2) #define I40E_PRIV_FLAGS_VEB_STATS BIT(3) -#define I40E_PRIV_FLAGS_PS BIT(4) #define I40E_PRIV_FLAGS_HW_ATR_EVICT BIT(5) #define I40E_NVM_VERSION_LO_SHIFT 0 @@ -320,8 +319,6 @@ struct i40e_pf { #define I40E_FLAG_RX_CSUM_ENABLED BIT_ULL(1) #define I40E_FLAG_MSI_ENABLED BIT_ULL(2) #define I40E_FLAG_MSIX_ENABLED BIT_ULL(3) -#define I40E_FLAG_RX_1BUF_ENABLED BIT_ULL(4) -#define I40E_FLAG_RX_PS_ENABLEDBIT_ULL(5) #define I40E_FLAG_RSS_ENABLED BIT_ULL(6) #define I40E_FLAG_VMDQ_ENABLED BIT_ULL(7) #define I40E_FLAG_FDIR_REQUIRES_REINIT BIT_ULL(8) diff --git a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c index 83dccf1..f119a74 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c +++ b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c @@ -273,8 +273,8 @@ static void i40e_dbg_dump_vsi_seid(struct i40e_pf *pf, int seid) rx_ring->rx_buf_len, rx_ring->dtype); dev_info(&pf->pdev->dev, -"rx_rings[%i]: hsplit = %d, next_to_use = %d, next_to_clean = %d, ring_active = %i\n", -i, ring_is_ps_enabled(rx_ring), +"rx_rings[%i]: next_to_use = %d, next_to_clean = %d, 
ring_active = %i\n", +i, rx_ring->next_to_use, rx_ring->next_to_clean, rx_ring->ring_active); diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c index 8e56c43..858e169 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c @@ -2829,8 +2829,6 @@ static u32 i40e_get_priv_flags(struct net_device *dev) I40E_PRIV_FLAGS_FD_ATR : 0; ret_flags |= pf->flags & I40E_FLAG_VEB_STATS_ENABLED ? I40E_PRIV_FLAGS_VEB_STATS : 0; - ret_flags |= pf->flags & I40E_FLAG_RX_PS_ENABLED ? - I40E_PRIV_FLAGS_PS : 0; ret_flags |= pf->auto_disable_flags & I40E_FLAG_HW_ATR_EVICT_CAPABLE ? 0 : I40E_PRIV_FLAGS_HW_ATR_EVICT; @@ -2851,23 +2849,6 @@ static int i40e_set_priv_flags(struct net_device *dev, u32 flags) /* NOTE: MFP is not settable */ - /* allow the user to control the method of receive -* buffer DMA, whether the packet is split at header -* boundaries into two separate buffers. In some cases -* one routine or the other will perform better. 
-*/ - if ((flags & I40E_PRIV_FLAGS_PS) && - !(pf->flags & I40E_FLAG_RX_PS_ENABLED)) { - pf->flags |= I40E_FLAG_RX_PS_ENABLED; - pf->flags &= ~I40E_FLAG_RX_1BUF_ENABLED; - reset_required = true; - } else if (!(flags & I40E_PRIV_FLAGS_PS) && - (pf->flags & I40E_FLAG_RX_PS_ENABLED)) { - pf->flags &= ~I40E_FLAG_RX_PS_ENABLED; - pf->flags |= I40E_FLAG_RX_1BUF_ENABLED; - reset_required = true; - } - if (flags & I40E_PRIV_FLAGS_LINKPOLL_FLAG) pf->flags |= I40E_FLAG_LINK_POLLING_ENABLED; else diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index f6da6b7..84e8d4e 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -2871,18 +2871,9 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring) } rx_ctx.dtype = vsi->dtype; - if (vsi->dtype) { - set_ring_ps_enabled(ring); - rx_ctx.hsplit_0 = I40E_RX_SPLIT_L2 | - I40E_RX_SPLIT_IP | - I40E_RX_SPLIT_TCP_UDP | - I40E_RX_SPLIT_SCTP; - } else { -
[net-next 09/11] i40e: Test memory before ethtool alloc succeeds
From: Jesse Brandeburg When testing on systems with very limited amounts of RAM, a bug was found where, while changing the number of descriptors using ethtool, the driver didn't test the limits of system memory before permanently assuming it would be able to get receive buffer memory. Work around this issue by pre-allocation of the receive buffer memory, in the "ghost" ring, which is then used during reinit using the new ring length. Change-Id: I92d7a5fb59a6c884b2efdd1ec652845f101c3359 Signed-off-by: Jesse Brandeburg Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 34 +++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c index 6fd730ac..51a994d 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c @@ -1274,6 +1274,13 @@ static int i40e_set_ringparam(struct net_device *netdev, } for (i = 0; i < vsi->num_queue_pairs; i++) { + /* this is to allow wr32 to have something to write to +* during early allocation of Rx buffers +*/ + u32 __iomem faketail = 0; + struct i40e_ring *ring; + u16 unused; + /* clone ring and setup updated count */ rx_rings[i] = *vsi->rx_rings[i]; rx_rings[i].count = new_rx_count; @@ -1282,12 +1289,22 @@ static int i40e_set_ringparam(struct net_device *netdev, */ rx_rings[i].desc = NULL; rx_rings[i].rx_bi = NULL; + rx_rings[i].tail = (u8 __iomem *)&faketail; err = i40e_setup_rx_descriptors(&rx_rings[i]); + if (err) + goto rx_unwind; + + /* now allocate the Rx buffers to make sure the OS +* has enough memory, any failure here means abort +*/ + ring = &rx_rings[i]; + unused = I40E_DESC_UNUSED(ring); + err = i40e_alloc_rx_buffers(ring, unused); +rx_unwind: if (err) { - while (i) { - i--; + do { i40e_free_rx_resources(&rx_rings[i]); - } + } while (i--); kfree(rx_rings); rx_rings = NULL; @@ -1313,6 +1330,17 @@ static int 
i40e_set_ringparam(struct net_device *netdev, if (rx_rings) { for (i = 0; i < vsi->num_queue_pairs; i++) { i40e_free_rx_resources(vsi->rx_rings[i]); + /* get the real tail offset */ + rx_rings[i].tail = vsi->rx_rings[i]->tail; + /* this is to fake out the allocation routine +* into thinking it has to realloc everything +* but the recycling logic will let us re-use +* the buffers allocated above +*/ + rx_rings[i].next_to_use = 0; + rx_rings[i].next_to_clean = 0; + rx_rings[i].next_to_alloc = 0; + /* do a struct copy */ *vsi->rx_rings[i] = rx_rings[i]; } kfree(rx_rings); -- 2.5.5
[net-next 04/11] i40e: Refactor receive routine
From: Jesse Brandeburg This is part 1 of the Rx refactor series, just including changes to i40e. This refactor aligns the receive routine with the one in ixgbe which was highly optimized. This reduces the code we have to maintain and allows for (hopefully) more readable and maintainable RX hot path. In order to do this: - consolidate the receive path into a single function that doesn't use packet split but *does* use pages for Rx buffers. - remove the old _1buf routine - consolidate several routines into helper functions - remove ethtool control over packet split Change-ID: I5ca100721de65992aa0114f8b4bac844b84758e0 Signed-off-by: Jesse Brandeburg Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e.h | 1 - drivers/net/ethernet/intel/i40e/i40e_debugfs.c | 9 +- drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 1 - drivers/net/ethernet/intel/i40e/i40e_main.c| 16 +- drivers/net/ethernet/intel/i40e/i40e_txrx.c| 770 - drivers/net/ethernet/intel/i40e/i40e_txrx.h| 37 +- 6 files changed, 531 insertions(+), 303 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h index ea6a69a..ebf423b 100644 --- a/drivers/net/ethernet/intel/i40e/i40e.h +++ b/drivers/net/ethernet/intel/i40e/i40e.h @@ -531,7 +531,6 @@ struct i40e_vsi { u8 *rss_lut_user; /* User configured lookup table entries */ u16 max_frame; - u16 rx_hdr_len; u16 rx_buf_len; u8 dtype; diff --git a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c index c0a01e0..8ae30f7 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c +++ b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c @@ -268,9 +268,8 @@ static void i40e_dbg_dump_vsi_seid(struct i40e_pf *pf, int seid) rx_ring->queue_index, rx_ring->reg_idx); dev_info(&pf->pdev->dev, -"rx_rings[%i]: rx_hdr_len = %d, rx_buf_len = %d\n", -i, rx_ring->rx_hdr_len, -rx_ring->rx_buf_len); +"rx_rings[%i]: rx_buf_len = %d\n", +i, rx_ring->rx_buf_len); 
dev_info(&pf->pdev->dev, "rx_rings[%i]: next_to_use = %d, next_to_clean = %d, ring_active = %i\n", i, @@ -361,8 +360,8 @@ static void i40e_dbg_dump_vsi_seid(struct i40e_pf *pf, int seid) "work_limit = %d\n", vsi->work_limit); dev_info(&pf->pdev->dev, -"max_frame = %d, rx_hdr_len = %d, rx_buf_len = %d dtype = %d\n", -vsi->max_frame, vsi->rx_hdr_len, vsi->rx_buf_len, vsi->dtype); +"max_frame = %d, rx_buf_len = %d dtype = %d\n", +vsi->max_frame, vsi->rx_buf_len, vsi->dtype); dev_info(&pf->pdev->dev, "num_q_vectors = %i, base_vector = %i\n", vsi->num_q_vectors, vsi->base_vector); diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c index 858e169..6fd730ac 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c @@ -235,7 +235,6 @@ static const char i40e_priv_flags_strings[][ETH_GSTRING_LEN] = { "LinkPolling", "flow-director-atr", "veb-stats", - "packet-split", "hw-atr-eviction", }; diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 84e8d4e..e466111 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -2855,10 +2855,8 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring) memset(&rx_ctx, 0, sizeof(rx_ctx)); ring->rx_buf_len = vsi->rx_buf_len; - ring->rx_hdr_len = vsi->rx_hdr_len; rx_ctx.dbuff = ring->rx_buf_len >> I40E_RXQ_CTX_DBUFF_SHIFT; - rx_ctx.hbuff = ring->rx_hdr_len >> I40E_RXQ_CTX_HBUFF_SHIFT; rx_ctx.base = (ring->dma / 128); rx_ctx.qlen = ring->count; @@ -2910,7 +2908,7 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring) ring->tail = hw->hw_addr + I40E_QRX_TAIL(pf_q); writel(0, ring->tail); - i40e_alloc_rx_buffers_1buf(ring, I40E_DESC_UNUSED(ring)); + i40e_alloc_rx_buffers(ring, I40E_DESC_UNUSED(ring)); return 0; } @@ -2949,15 +2947,13 @@ static int i40e_vsi_configure_rx(struct i40e_vsi *vsi) else vsi->max_frame = 
I40E_RXBUFFER_2048; - vsi->rx_hdr_len = 0; - vsi->rx_buf_len = vsi->max_frame; + vsi->rx_buf_len = I40E_RXBUFFER_2048; vsi->dtype = I40E_RX_DTYPE_NO_SPLIT; #ifdef I40E_FCOE /* setup rx buffer for FCoE */ if ((vsi->type == I40E_VSI_FCOE) &&
[net-next 01/11] i40e/i40evf: Refactor tunnel interpretation
From: Jesse Brandeburg Refactor the interpretation of a tunnel. This removes some code and lets us start using the hardware's parsing. Signed-off-by: Jesse Brandeburg Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 13 ++--- drivers/net/ethernet/intel/i40evf/i40e_txrx.c | 13 ++--- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 2765d7e..dab733c 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -1392,7 +1392,7 @@ static inline void i40e_rx_checksum(struct i40e_vsi *vsi, u16 rx_ptype) { struct i40e_rx_ptype_decoded decoded = decode_rx_desc_ptype(rx_ptype); - bool ipv4, ipv6, ipv4_tunnel, ipv6_tunnel; + bool ipv4, ipv6, tunnel = false; skb->ip_summed = CHECKSUM_NONE; @@ -1441,14 +1441,13 @@ static inline void i40e_rx_checksum(struct i40e_vsi *vsi, * doesn't make it a hard requirement so if we have validated the * inner checksum report CHECKSUM_UNNECESSARY. */ - - ipv4_tunnel = (rx_ptype >= I40E_RX_PTYPE_GRENAT4_MAC_PAY3) && -(rx_ptype <= I40E_RX_PTYPE_GRENAT4_MACVLAN_IPV6_ICMP_PAY4); - ipv6_tunnel = (rx_ptype >= I40E_RX_PTYPE_GRENAT6_MAC_PAY3) && -(rx_ptype <= I40E_RX_PTYPE_GRENAT6_MACVLAN_IPV6_ICMP_PAY4); + if (decoded.inner_prot & (I40E_RX_PTYPE_INNER_PROT_TCP | + I40E_RX_PTYPE_INNER_PROT_UDP | + I40E_RX_PTYPE_INNER_PROT_SCTP)) + tunnel = true; skb->ip_summed = CHECKSUM_UNNECESSARY; - skb->csum_level = ipv4_tunnel || ipv6_tunnel; + skb->csum_level = tunnel ? 
1 : 0; return; diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c index ede8dfc..a37a3f3 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c @@ -864,7 +864,7 @@ static inline void i40e_rx_checksum(struct i40e_vsi *vsi, u16 rx_ptype) { struct i40e_rx_ptype_decoded decoded = decode_rx_desc_ptype(rx_ptype); - bool ipv4, ipv6, ipv4_tunnel, ipv6_tunnel; + bool ipv4, ipv6, tunnel = false; skb->ip_summed = CHECKSUM_NONE; @@ -913,14 +913,13 @@ static inline void i40e_rx_checksum(struct i40e_vsi *vsi, * doesn't make it a hard requirement so if we have validated the * inner checksum report CHECKSUM_UNNECESSARY. */ - - ipv4_tunnel = (rx_ptype >= I40E_RX_PTYPE_GRENAT4_MAC_PAY3) && -(rx_ptype <= I40E_RX_PTYPE_GRENAT4_MACVLAN_IPV6_ICMP_PAY4); - ipv6_tunnel = (rx_ptype >= I40E_RX_PTYPE_GRENAT6_MAC_PAY3) && -(rx_ptype <= I40E_RX_PTYPE_GRENAT6_MACVLAN_IPV6_ICMP_PAY4); + if (decoded.inner_prot & (I40E_RX_PTYPE_INNER_PROT_TCP | + I40E_RX_PTYPE_INNER_PROT_UDP | + I40E_RX_PTYPE_INNER_PROT_SCTP)) + tunnel = true; skb->ip_summed = CHECKSUM_UNNECESSARY; - skb->csum_level = ipv4_tunnel || ipv6_tunnel; + skb->csum_level = tunnel ? 1 : 0; return; -- 2.5.5
[net-next 00/11][pull request] 40GbE Intel Wired LAN Driver Updates 2016-05-05
This series contains updates to i40e and i40evf. The theme behind this series is code reduction, yeah! Jesse provides most of the changes, starting with a refactor of the interpretation of a tunnel, which lets us start using the hardware's parsing. Removed the packet split receive routine and ancillary code in preparation for the Rx-refactor. The refactor of the receive routine aligns it with the one in ixgbe, which was highly optimized. The hardware supports a 16 byte descriptor for receive, but the driver was never using it in production. There was no performance benefit to the real driver of 16 byte descriptors, so drop a whole lot of complexity while getting rid of the code. Fixed a bug where, while changing the number of descriptors using ethtool, the driver did not test the limits of the system memory before permanently assuming it would be able to get receive buffer memory. Mitch fixes a memory leak of one page each time the driver is opened by allocating the correct number of receive buffers and not fiddling with next_to_use in the VF driver. Arnd Bergmann fixed an indentation issue by adding the appropriate curly braces in i40e_vc_config_promiscuous_mode_msg(). Julia Lawall fixed an issue found by Coccinelle, where the i40e_client_ops structure can be const since it is never modified. 
The following are changes since commit 035cd6ba53eff060760c4f4d11339fcc916a967c: MAINTAINERS: Cleanup Intel Wired LAN maintainers list and are available in the git repository at: git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue 40GbE Arnd Bergmann (1): i40e: fix misleading indentation Jesse Brandeburg (8): i40e/i40evf: Refactor tunnel interpretation i40e: Drop packet split receive routine i40e/i40evf: Remove reference to ring->dtype i40e: Refactor receive routine i40evf: Drop packet split receive routine i40evf: refactor receive routine i40e/i40evf: Remove unused hardware receive descriptor code i40e: Test memory before ethtool alloc succeeds Julia Lawall (1): i40e: constify i40e_client_ops structure Mitch Williams (1): i40evf: Allocate Rx buffers properly drivers/infiniband/hw/i40iw/i40iw_main.c | 2 +- drivers/net/ethernet/intel/i40e/i40e.h | 11 +- drivers/net/ethernet/intel/i40e/i40e_client.h | 2 +- drivers/net/ethernet/intel/i40e/i40e_debugfs.c | 31 +- drivers/net/ethernet/intel/i40e/i40e_ethtool.c | 54 +- drivers/net/ethernet/intel/i40e/i40e_main.c| 73 +- drivers/net/ethernet/intel/i40e/i40e_txrx.c| 968 ++--- drivers/net/ethernet/intel/i40e/i40e_txrx.h| 69 +- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 5 +- drivers/net/ethernet/intel/i40evf/i40e_txrx.c | 930 ++-- drivers/net/ethernet/intel/i40evf/i40e_txrx.h | 69 +- drivers/net/ethernet/intel/i40evf/i40evf.h | 7 - drivers/net/ethernet/intel/i40evf/i40evf_ethtool.c | 65 -- drivers/net/ethernet/intel/i40evf/i40evf_main.c| 34 +- .../net/ethernet/intel/i40evf/i40evf_virtchnl.c| 4 - 15 files changed, 1062 insertions(+), 1262 deletions(-) -- 2.5.5