Re: [RFC PATCH net-next v2] ipv6: implement consistent hashing for equal-cost multipath routing

2016-11-29 Thread David Lebrun
On 11/30/2016 04:52 AM, Hannes Frederic Sowa wrote:
> In the worst case this causes 2GB (order 19) allocations (x == 31) to
> happen in GFP_ATOMIC (due to write lock) context and could cause update
> failures to the routing table due to fragmentation. Are you sure the
> upper limit of 31 is reasonable? I would very much prefer an upper limit
> of below or equal 25 for x to stay within the bounds of the slab
> allocators (which is still a lot and probably causes errors!).
> Unfortunately because of the nature of the sysctl you can't really
> create its own cache for it. :/
> 

Agreed. I think that even something like 16 would be more than
sufficient; that would enable 65K slices, which is way more than enough
to have sufficient balancing with a reasonable amount of nexthops (I
wonder whether there are actual deployments with more than 32 nexthops
for a route).

> Also by design, one day this should all be RCU and having that much data
> outstanding worries me a bit during routing table mutation.
> 
> I am a fan of consistent hashing but I am not so sure if it belongs into
> a generic ECMP implementation or into its own ipvs or netfilter module
> where you specifically know how much memory to burn for it.
> 

The complexity of the consistent hashing code might warrant something
like that, but I am not sure of the implications.

> Also please convert the sysctl to a netlink attribute if you pursue this
> because if I change the sysctl while my quagga is hammering the routing
> table I would like to know which nodes allocate what amount of memory.

Yes, that was the idea.

Thanks for the feedback

David



signature.asc
Description: OpenPGP digital signature


[PATCH iproute2 V3 2/3] tc/cls_flower: Classify packet in ip tunnels

2016-11-29 Thread Amir Vadai
Introduce classifying by metadata extracted by the tunnel device.
Outer header fields - source/dest ip and tunnel id, are extracted from
the metadata when classifying.

For example, the following will add a filter on the ingress Qdisc of shared
vxlan device named 'vxlan0'. To forward packets with outer src ip
11.11.0.2, dst ip 11.11.0.1 and tunnel id 11. The packets will be
forwarded to tap device 'vnet0':

$ tc filter add dev vxlan0 protocol ip parent : \
flower \
  enc_src_ip 11.11.0.2 \
  enc_dst_ip 11.11.0.1 \
  enc_key_id 11 \
  dst_ip 11.11.11.1 \
action mirred egress redirect dev vnet0

Signed-off-by: Amir Vadai 
---
 man/man8/tc-flower.8 | 17 ++-
 tc/f_flower.c| 84 +---
 2 files changed, 96 insertions(+), 5 deletions(-)

diff --git a/man/man8/tc-flower.8 b/man/man8/tc-flower.8
index 74f76647753b..0e0b0cf4bb72 100644
--- a/man/man8/tc-flower.8
+++ b/man/man8/tc-flower.8
@@ -36,7 +36,11 @@ flower \- flow based traffic control filter
 .BR dst_ip " | " src_ip " } { "
 .IR ipv4_address " | " ipv6_address " } | { "
 .BR dst_port " | " src_port " } "
-.IR port_number " }"
+.IR port_number " } | "
+.B enc_key_id
+.IR KEY-ID " | {"
+.BR enc_dst_ip " | " enc_src_ip " } { "
+.IR ipv4_address " | " ipv6_address " } | "
 .SH DESCRIPTION
 The
 .B flower
@@ -121,6 +125,17 @@ which has to be specified in beforehand.
 Match on layer 4 protocol source or destination port number. Only available for
 .BR ip_proto " values " udp " and " tcp ,
 which has to be specified in beforehand.
+.TP
+.BI enc_key_id " NUMBER"
+.TQ
+.BI enc_dst_ip " ADDRESS"
+.TQ
+.BI enc_src_ip " ADDRESS"
+Match on IP tunnel metadata. Key id
+.I NUMBER
+is a 32 bit tunnel key id (e.g. VNI for VXLAN tunnel).
+.I ADDRESS
+must be a valid IPv4 or IPv6 address.
 .SH NOTES
 As stated above where applicable, matches of a certain layer implicitly depend
 on the matches of the next lower layer. Precisely, layer one and two matches (
diff --git a/tc/f_flower.c b/tc/f_flower.c
index 2d31d1aa832d..173cfc20f90b 100644
--- a/tc/f_flower.c
+++ b/tc/f_flower.c
@@ -41,7 +41,10 @@ static void explain(void)
fprintf(stderr, "   dst_ip [ IPV4-ADDR | IPV6-ADDR 
] |\n");
fprintf(stderr, "   src_ip [ IPV4-ADDR | IPV6-ADDR 
] |\n");
fprintf(stderr, "   dst_port PORT-NUMBER |\n");
-   fprintf(stderr, "   src_port PORT-NUMBER }\n");
+   fprintf(stderr, "   src_port PORT-NUMBER |\n");
+   fprintf(stderr, "   enc_dst_ip [ IPV4-ADDR | 
IPV6-ADDR ] |\n");
+   fprintf(stderr, "   enc_src_ip [ IPV4-ADDR | 
IPV6-ADDR ] |\n");
+   fprintf(stderr, "   enc_key_id [ KEY-ID ] }\n");
fprintf(stderr, "   FILTERID := X:Y:Z\n");
fprintf(stderr, "   ACTION-SPEC := ... look at individual 
actions\n");
fprintf(stderr, "\n");
@@ -121,8 +124,9 @@ static int flower_parse_ip_addr(char *str, __be16 eth_type,
family = AF_INET;
} else if (eth_type == htons(ETH_P_IPV6)) {
family = AF_INET6;
+   } else if (!eth_type) {
+   family = AF_UNSPEC;
} else {
-   fprintf(stderr, "Illegal \"eth_type\" for ip address\n");
return -1;
}
 
@@ -130,8 +134,10 @@ static int flower_parse_ip_addr(char *str, __be16 eth_type,
if (ret)
return -1;
 
-   if (addr.family != family)
+   if (family && (addr.family != family)) {
+   fprintf(stderr, "Illegal \"eth_type\" for ip address\n");
return -1;
+   }
 
addattr_l(n, MAX_MSG, addr.family == AF_INET ? addr4_type : addr6_type,
  addr.data, addr.bytelen);
@@ -181,6 +187,18 @@ static int flower_parse_port(char *str, __u8 ip_port,
return 0;
 }
 
+static int flower_parse_key_id(const char *str, int type, struct nlmsghdr *n)
+{
+   int ret;
+   __be32 key_id;
+
+   ret = get_be32(_id, str, 10);
+   if (!ret)
+   addattr32(n, MAX_MSG, type, key_id);
+
+   return ret;
+}
+
 static int flower_parse_opt(struct filter_util *qu, char *handle,
int argc, char **argv, struct nlmsghdr *n)
 {
@@ -339,6 +357,38 @@ static int flower_parse_opt(struct filter_util *qu, char 
*handle,
fprintf(stderr, "Illegal \"src_port\"\n");
return -1;
}
+   } else if (matches(*argv, "enc_dst_ip") == 0) {
+   NEXT_ARG();
+   ret = flower_parse_ip_addr(*argv, 0,
+  TCA_FLOWER_KEY_ENC_IPV4_DST,
+  
TCA_FLOWER_KEY_ENC_IPV4_DST_MASK,
+  

[PATCH iproute2 V3 3/3] tc/act_tunnel: Introduce ip tunnel action

2016-11-29 Thread Amir Vadai
This action could be used before redirecting packets to a shared tunnel
device, or when redirecting packets arriving from such a device.

The 'unset' action is optional. It is used to explicitly unset the
metadata created by the tunnel device during decap. If not used, the
metadata will be released automatically by the kernel.
The 'set' operation, will set the metadata with the specified values for
the encap.

For example, the following flower filter will forward all ICMP packets
destined to 11.11.11.2 through the shared vxlan device 'vxlan0'. Before
redirecting, a metadata for the vxlan tunnel is created using the
tunnel_key action and it's arguments:

$ tc filter add dev net0 protocol ip parent : \
flower \
  ip_proto 1 \
  dst_ip 11.11.11.2 \
action tunnel_key set \
  src_ip 11.11.0.1 \
  dst_ip 11.11.0.2 \
  id 11 \
action mirred egress redirect dev vxlan0

Signed-off-by: Amir Vadai 
---
 include/linux/tc_act/tc_tunnel_key.h |  42 ++
 man/man8/tc-tunnel_key.8 | 113 +++
 tc/Makefile  |   1 +
 tc/m_tunnel_key.c| 258 +++
 4 files changed, 414 insertions(+)
 create mode 100644 include/linux/tc_act/tc_tunnel_key.h
 create mode 100644 man/man8/tc-tunnel_key.8
 create mode 100644 tc/m_tunnel_key.c

diff --git a/include/linux/tc_act/tc_tunnel_key.h 
b/include/linux/tc_act/tc_tunnel_key.h
new file mode 100644
index ..f9ddf5369a45
--- /dev/null
+++ b/include/linux/tc_act/tc_tunnel_key.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2016, Amir Vadai 
+ * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef __LINUX_TC_TUNNEL_KEY_H
+#define __LINUX_TC_TUNNEL_KEY_H
+
+#include 
+
+#define TCA_ACT_TUNNEL_KEY 17
+
+#define TCA_TUNNEL_KEY_ACT_SET 1
+#define TCA_TUNNEL_KEY_ACT_RELEASE  2
+
+struct tc_tunnel_key {
+   tc_gen;
+   int t_action;
+};
+
+enum {
+   TCA_TUNNEL_KEY_UNSPEC,
+   TCA_TUNNEL_KEY_TM,
+   TCA_TUNNEL_KEY_PARMS,
+   TCA_TUNNEL_KEY_ENC_IPV4_SRC,/* be32 */
+   TCA_TUNNEL_KEY_ENC_IPV4_DST,/* be32 */
+   TCA_TUNNEL_KEY_ENC_IPV6_SRC,/* struct in6_addr */
+   TCA_TUNNEL_KEY_ENC_IPV6_DST,/* struct in6_addr */
+   TCA_TUNNEL_KEY_ENC_KEY_ID,  /* be64 */
+   TCA_TUNNEL_KEY_PAD,
+   __TCA_TUNNEL_KEY_MAX,
+};
+
+#define TCA_TUNNEL_KEY_MAX (__TCA_TUNNEL_KEY_MAX - 1)
+
+#endif
+
diff --git a/man/man8/tc-tunnel_key.8 b/man/man8/tc-tunnel_key.8
new file mode 100644
index ..d0c333d27158
--- /dev/null
+++ b/man/man8/tc-tunnel_key.8
@@ -0,0 +1,113 @@
+.TH "Tunnel metadata manipulation action in tc" 8 "10 Nov 2016" "iproute2" 
"Linux"
+
+.SH NAME
+tunnel_key - Tunnel metadata manipulation
+.SH SYNOPSIS
+.in +8
+.ti -8
+.BR tc " ... " "action tunnel_key" " { " unset " | "
+.IR SET " }"
+
+.ti -8
+.IR SET " := "
+.BR set " " src_ip
+.IR ADDRESS
+.BR dst_ip
+.IR ADDRESS
+.BI id " KEY_ID"
+
+.SH DESCRIPTION
+The
+.B tunnel_key
+action combined with a shared IP tunnel device, allows to perform IP tunnel en-
+or decapsulation on a packet, reflected by
+the operation modes
+.IR UNSET " and " SET .
+The
+.I UNSET
+mode is optional - even without using it, the metadata information will be
+released automatically when packet processing will be finished.
+.IR UNSET
+function could be used in cases when traffic is forwarded between two tunnels,
+where the metadata from the first tunnel will be used for encapsulation done by
+the second tunnel.
+It must be used for offloaded filters, such that hardware drivers can
+realize they need to program the HW to do decapsulation.
+.IR SET
+mode requires the source and destination ip
+.I ADDRESS
+and the tunnel key id
+.I KEY_ID
+which will be used by the ip tunnel shared device to create the tunnel header. 
The
+.B tunnel_key
+action is useful only in combination with a
+.B mirred redirect
+action to a shared IP tunnel device which will use the metadata (for
+.I SET
+) and unset the metadata created by it (for
+.I UNSET
+).
+
+.SH OPTIONS
+.TP
+.B unset
+Decapsulation mode, no further arguments allowed. This function is not
+mandatory and might be used only in some specific use cases.
+.TP
+.B set
+Encapsulation mode. Requires
+.B id
+,
+.B src_ip
+and
+.B dst_ip
+options.
+.RS
+.TP
+.B id
+Tunnel ID (for example VNI in VXLAN tunnel)
+.TP
+.B src_ip
+Outer header source IP address (IPv4 or IPv6)
+.TP
+.B dst_ip
+Outer header destination IP address (IPv4 or IPv6)
+.RE
+.SH EXAMPLES
+The following example encapsulates incoming ICMP packets on eth0 into a vxlan
+tunnel by setting metadata to VNI 11, source IP 11.11.0.1 and destination IP
+11.11.0.2 by 

[PATCH iproute2 V3 0/3] tc: Support for ip tunnel metadata set/unset/classify

2016-11-29 Thread Amir Vadai
Hi,

This short series adds support for matching and setting metadata for ip tunnel
shared device using the TC system, introduced in kernel 4.9 [1].

Applied and tested on top of commit f3f339e9590a ("cleanup debris from revert")

Example usage:

$ tc filter add dev vxlan0 protocol ip parent : \
flower \
  enc_src_ip 11.11.0.2 \
  enc_dst_ip 11.11.0.1 \
  enc_key_id 11 \
  dst_ip 11.11.11.1 \
action mirred egress redirect dev vnet0

$ tc filter add dev net0 protocol ip parent : \
flower \
  ip_proto 1 \
  dst_ip 11.11.11.2 \
action tunnel_key set \
  src_ip 11.11.0.1 \
  dst_ip 11.11.0.2 \
  id 11 \
action mirred egress redirect dev vxlan0

[1] - d1ba24feb466 ("Merge branch 'act_tunnel_key'")

Thanks,
Amir

Changes from V2:
- Use const where needed
- Don't lose return value
- Introduce rta_getattr_be16() and rta_getattr_be32()

Changes from V1:
- Updated Patch 2/2 ("tc/act_tunnel: Introduce ip tunnel action") commit log
and the man page tc-tunnel_key to reflect the fact that 'unset' 
operation is
not mandatory.
And describe when it might be needed.
- Rename the 'release' operation to 'unset'

Amir Vadai (3):
  libnetlink: Introduce rta_getattr_be*()
  tc/cls_flower: Classify packet in ip tunnels
  tc/act_tunnel: Introduce ip tunnel action

 bridge/fdb.c |   4 +-
 include/libnetlink.h |   9 ++
 include/linux/tc_act/tc_tunnel_key.h |  42 ++
 ip/iplink_geneve.c   |   2 +-
 ip/iplink_vxlan.c|   2 +-
 man/man8/tc-flower.8 |  17 ++-
 man/man8/tc-tunnel_key.8 | 113 +++
 tc/Makefile  |   1 +
 tc/f_flower.c|  84 +++-
 tc/m_tunnel_key.c| 258 +++
 10 files changed, 523 insertions(+), 9 deletions(-)
 create mode 100644 include/linux/tc_act/tc_tunnel_key.h
 create mode 100644 man/man8/tc-tunnel_key.8
 create mode 100644 tc/m_tunnel_key.c

-- 
2.10.2



[PATCH iproute2 V3 1/3] libnetlink: Introduce rta_getattr_be*()

2016-11-29 Thread Amir Vadai
Add the utility functions rta_getattr_be16() and rta_getattr_be32(), and
change existing code to use it.

Signed-off-by: Amir Vadai 
---
 bridge/fdb.c | 4 ++--
 include/libnetlink.h | 9 +
 ip/iplink_geneve.c   | 2 +-
 ip/iplink_vxlan.c| 2 +-
 4 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/bridge/fdb.c b/bridge/fdb.c
index 90f4b154c5dc..a91521776e99 100644
--- a/bridge/fdb.c
+++ b/bridge/fdb.c
@@ -168,10 +168,10 @@ int print_fdb(const struct sockaddr_nl *who, struct 
nlmsghdr *n, void *arg)
if (tb[NDA_PORT]) {
if (jw_global)
jsonw_uint_field(jw_global, "port",
-ntohs(rta_getattr_u16(tb[NDA_PORT])));
+rta_getattr_be16(tb[NDA_PORT]));
else
fprintf(fp, "port %d ",
-   ntohs(rta_getattr_u16(tb[NDA_PORT])));
+   rta_getattr_be16(tb[NDA_PORT]));
}
 
if (tb[NDA_VNI]) {
diff --git a/include/libnetlink.h b/include/libnetlink.h
index 483509ca9635..751ebf186dd4 100644
--- a/include/libnetlink.h
+++ b/include/libnetlink.h
@@ -10,6 +10,7 @@
 #include 
 #include 
 #include 
+#include 
 
 struct rtnl_handle {
int fd;
@@ -140,10 +141,18 @@ static inline __u16 rta_getattr_u16(const struct rtattr 
*rta)
 {
return *(__u16 *)RTA_DATA(rta);
 }
+static inline __be16 rta_getattr_be16(const struct rtattr *rta)
+{
+   return ntohs(rta_getattr_u16(rta));
+}
 static inline __u32 rta_getattr_u32(const struct rtattr *rta)
 {
return *(__u32 *)RTA_DATA(rta);
 }
+static inline __be32 rta_getattr_be32(const struct rtattr *rta)
+{
+   return ntohl(rta_getattr_u32(rta));
+}
 static inline __u64 rta_getattr_u64(const struct rtattr *rta)
 {
__u64 tmp;
diff --git a/ip/iplink_geneve.c b/ip/iplink_geneve.c
index 3bfba91c644c..1e6669d07d60 100644
--- a/ip/iplink_geneve.c
+++ b/ip/iplink_geneve.c
@@ -234,7 +234,7 @@ static void geneve_print_opt(struct link_util *lu, FILE *f, 
struct rtattr *tb[])
 
if (tb[IFLA_GENEVE_PORT])
fprintf(f, "dstport %u ",
-   ntohs(rta_getattr_u16(tb[IFLA_GENEVE_PORT])));
+   rta_getattr_be16(tb[IFLA_GENEVE_PORT]));
 
if (tb[IFLA_GENEVE_COLLECT_METADATA])
fputs("external ", f);
diff --git a/ip/iplink_vxlan.c b/ip/iplink_vxlan.c
index 93af979a1e97..6d02bb47b2f0 100644
--- a/ip/iplink_vxlan.c
+++ b/ip/iplink_vxlan.c
@@ -413,7 +413,7 @@ static void vxlan_print_opt(struct link_util *lu, FILE *f, 
struct rtattr *tb[])
 
if (tb[IFLA_VXLAN_PORT])
fprintf(f, "dstport %u ",
-   ntohs(rta_getattr_u16(tb[IFLA_VXLAN_PORT])));
+   rta_getattr_be16(tb[IFLA_VXLAN_PORT]));
 
if (tb[IFLA_VXLAN_LEARNING] &&
!rta_getattr_u8(tb[IFLA_VXLAN_LEARNING]))
-- 
2.10.2



Re: [PATCH iproute2 V2 1/2] tc/cls_flower: Classify packet in ip tunnels

2016-11-29 Thread Amir Vadai
On Wed, Nov 30, 2016 at 9:17 AM, Amir Vadai  wrote:
> On Tue, Nov 29, 2016 at 07:26:58PM -0800, Stephen Hemminger wrote:
(Sending again since I just discovered that Google Inbox is adding an
HTML part...)

>> The overall design is fine, just a couple nits with the code.
>>
>> > diff --git a/tc/f_flower.c b/tc/f_flower.c
>> > index 2d31d1aa832d..1cf0750b5b83 100644
>> > --- a/tc/f_flower.c
>> > +++ b/tc/f_flower.c
>>
>> >
>> > +static int flower_parse_key_id(char *str, int type, struct nlmsghdr *n)
>>
>> str is not modified, therefore use: const char *str
> ack
>
>>
>> > +{
>> > +   int ret;
>> > +   __be32 key_id;
>> > +
>> > +   ret = get_be32(_id, str, 10);
>> > +   if (ret)
>> > +   return -1;
>>
>> Traditionally netlink attributes are in host order, why was flower
>> chosen to be different?
> I don't know, maybe Jiri (cc'ed) can explain, but it is all over the
> flower code.
Now the right Jiri (Pirko) is CC'ed

>
>>
>> > +
>> > +   addattr32(n, MAX_MSG, type, key_id);
>> > +
>> > +   return 0;
>>
>>
>> Why lose the return value here?  Why not:
>>
>>   ret = get_be32(_id, str, 10);
>>   if (!ret)
>>   addattr32(n, MAX_MSG, type, key_id);
> The get_*() function can return only -1 or 0. But you are right, and it
> is better the way you suggested.  Changing accordingly in V3.
>
>>
>> > +}
>> > +
>> >  static int flower_parse_opt(struct filter_util *qu, char *handle,
>> > int argc, char **argv, struct nlmsghdr *n)
>> >  {
>> > @@ -339,6 +359,38 @@ static int flower_parse_opt(struct filter_util *qu, 
>> > char *handle,
>> > fprintf(stderr, "Illegal \"src_port\"\n");
>> > return -1;
>> > }
>> > +   } else if (matches(*argv, "enc_dst_ip") == 0) {
>> > +   NEXT_ARG();
>> > +   ret = flower_parse_ip_addr(*argv, 0,
>> > +  TCA_FLOWER_KEY_ENC_IPV4_DST,
>> > +  
>> > TCA_FLOWER_KEY_ENC_IPV4_DST_MASK,
>> > +  TCA_FLOWER_KEY_ENC_IPV6_DST,
>> > +  
>> > TCA_FLOWER_KEY_ENC_IPV6_DST_MASK,
>> > +  n);
>> > +   if (ret < 0) {
>> > +   fprintf(stderr, "Illegal \"enc_dst_ip\"\n");
>> > +   return -1;
>> > +   }
>> > +   } else if (matches(*argv, "enc_src_ip") == 0) {
>> > +   NEXT_ARG();
>> > +   ret = flower_parse_ip_addr(*argv, 0,
>> > +  TCA_FLOWER_KEY_ENC_IPV4_SRC,
>> > +  
>> > TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,
>> > +  TCA_FLOWER_KEY_ENC_IPV6_SRC,
>> > +  
>> > TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,
>> > +  n);
>> > +   if (ret < 0) {
>> > +   fprintf(stderr, "Illegal \"enc_src_ip\"\n");
>> > +   return -1;
>> > +   }
>> > +   } else if (matches(*argv, "enc_key_id") == 0) {
>> > +   NEXT_ARG();
>> > +   ret = flower_parse_key_id(*argv,
>> > + TCA_FLOWER_KEY_ENC_KEY_ID, 
>> > n);
>> > +   if (ret < 0) {
>> > +   fprintf(stderr, "Illegal \"enc_key_id\"\n");
>> > +   return -1;
>> > +   }
>> > } else if (matches(*argv, "action") == 0) {
>> > NEXT_ARG();
>> > ret = parse_action(, , TCA_FLOWER_ACT, n);
>> > @@ -509,6 +561,14 @@ static void flower_print_port(FILE *f, char *name, 
>> > __u8 ip_proto,
>> > fprintf(f, "\n  %s %d", name, ntohs(rta_getattr_u16(attr)));
>> >  }
>> >
>> > +static void flower_print_key_id(FILE *f, char *name,
>> > +   struct rtattr *attr)
>>
>> const char *name?
> ack
>
>>
>>
>> > +{
>> > +   if (!attr)
>> > +   return;
>> > +   fprintf(f, "\n  %s %d", name, ntohl(rta_getattr_u32(attr)));
>> > +}
>> > +
>>
>> Why short circuit, just change the order:
>>
>>   if (attr)
>>   fprintf(f, "\n  %s %s", name, ntohl(rta_getattr_u32(attr));
>>
>> You might also want to introduce rta_getattr_be32()
> ack
>
>>
>> Please change, retest and resubmit both patches.
> ack
>
> Thanks for reviewing,
> Amir


RE: [patch net / RFC] net: fec: increase frame size limitation to actually available buffer

2016-11-29 Thread Andy Duan
From: Nikita Yushchenko  Sent: Wednesday, 
November 30, 2016 2:36 PM
 >To: Andy Duan ; David S. Miller
 >; Troy Kisky ;
 >Andrew Lunn ; Eric Nelson ; Philippe
 >Reynes ; Johannes Berg ;
 >netdev@vger.kernel.org
 >Cc: Chris Healy ; Fabio Estevam
 >; linux-ker...@vger.kernel.org
 >Subject: Re: [patch net / RFC] net: fec: increase frame size limitation to
 >actually available buffer
 >
 >> But I think it is not necessary since the driver don't support jumbo frame.
 >
 >Hardcoded 1522 raises two separate issues.
 >
 >(1) When DSA is in use, frames processed by FEC chip contain DSA tag and
 >thus can be larger than hardcoded limit of 1522. This issue is not 
 >FEC-specific,
 >any driver that hardcodes maximum frame size to 1522 (many
 >do) will have this issue if used with DSA.
 >
 >Clean solution for this must take into account that difference between MTU
 >and max frame size is no longer known at compile time. Actually this is the
 >case even without DSA, due to VLANs: max frame size is (MTU + 18) without
 >VLANs, but (MTU + 22) with VLANs. However currently drivers tend to ignore
 >this and hardcode 22.  With DSA, 22 is not enough, need to add switch-
 >specific tag size to that.
 >
 >Not yet sure how to handle this. DSA-specific API to find out tag size could 
 >be
 >added, but generic solution should handle all cases of dynamic difference
 >between MTU and max frame size, not only DSA.
 >
 >
 >(2) There is some demand to use larger frames for optimization purposes.
 >
 >FEC register fields that limit frame size are 14-bit, thus allowing frames up 
 >to
 >(4k-1). I'm about to prepare a larger patch:
 >- add ndo_change_mtu handler, allowing MTU up to (4k - overhead),
 >- set MAX_FL / TRUNC_FL based on configured MTU,
 >- if necessary, do buffer reallocation with larger buffers.
 >
 >Is this suitable for upstreaming?
 >Is there any policy related to handling larger frames?

Of course, welcome to upstream the jumbo frame patches, but hope to also add 
the transmit jumbo frame, not only receive path, which is helpful for testing 
with two boards connection.
And, some notes need you to care:
- the maximum jumbo frame should consider the fifo size. Different chip has 
different fifo size. Like i.MX53 tx and rx share one fifo, i.mx6q/dl/sl have 
separate 4k fifo for tx and rx, i.mx6sx/i.mx7x have separate 8k fifo for tx and 
rx.
- rx fifo watermark to generate pause frame in busy loading system to avoid 
fifo overrun. In general, a few pause frames bring better performance, while a mass 
of pause frames causes worse performance.


Regards,
Andy


Re: [PATCH v3 net-next 3/3] openvswitch: Fix skb->protocol for vlan frames.

2016-11-29 Thread Pravin Shelar
On Tue, Nov 29, 2016 at 3:30 PM, Jarno Rajahalme  wrote:
> Do not always set skb->protocol to be the ethertype of the L3 header.
> For a packet with non-accelerated VLAN tags skb->protocol needs to be
> the ethertype of the outermost non-accelerated VLAN ethertype.
>
> Any VLAN offloading is undone on the OVS netlink interface, and any
> VLAN tags added by userspace are non-accelerated, as are double tagged
> VLAN packets.
>
> Fixes: 018c1dda5f ("openvswitch: 802.1AD Flow handling, actions, vlan 
> parsing, netlink attributes")
> Fixes: 5108bbaddc ("openvswitch: add processing of L3 packets")
> Signed-off-by: Jarno Rajahalme 

Looks much better now. Thanks for fixing it. I have one minor comment.

Acked-by: Pravin B Shelar 

> ---
> v3: Set skb->protocol properly also for double tagged packets, as suggested
> by Pravin.  This patch is no longer needed for the MTU test failure, as
> the new patch 2/3 addresses that.
>
>  net/openvswitch/datapath.c |  1 -
>  net/openvswitch/flow.c | 62 
> +++---
>  2 files changed, 31 insertions(+), 32 deletions(-)
>
> diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
> index 08aa926..e2fe2e5 100644
> --- a/net/openvswitch/flow.c
> +++ b/net/openvswitch/flow.c
> @@ -354,6 +354,7 @@ static int parse_vlan(struct sk_buff *skb, struct 
> sw_flow_key *key)
> res = parse_vlan_tag(skb, >eth.vlan);
> if (res <= 0)
> return res;
> +   skb->protocol = key->eth.vlan.tpid;
> }
>
> /* Parse inner vlan tag. */
> @@ -361,6 +362,11 @@ static int parse_vlan(struct sk_buff *skb, struct 
> sw_flow_key *key)
> if (res <= 0)
> return res;
>
> +   /* If the outer vlan tag was accelerated, skb->protocol should
> +* refelect the inner vlan type. */
> +   if (!eth_type_vlan(skb->protocol))

Since you would be spinning another version, can you change this
condition to directly check for skb-vlan-tag rather than indirectly
checking for the vlan accelerated case?


Re: [WIP] net+mlx4: auto doorbell

2016-11-29 Thread Alexei Starovoitov
On Mon, Nov 28, 2016 at 10:58 PM, Eric Dumazet  wrote:
>  {
> @@ -496,8 +531,13 @@ static bool mlx4_en_process_tx_cq(struct net_device *dev,
> wmb();
>
> /* we want to dirty this cache line once */
> -   ACCESS_ONCE(ring->last_nr_txbb) = last_nr_txbb;
> -   ACCESS_ONCE(ring->cons) = ring_cons + txbbs_skipped;
> +   WRITE_ONCE(ring->last_nr_txbb, last_nr_txbb);
> +   ring_cons += txbbs_skipped;
> +   WRITE_ONCE(ring->cons, ring_cons);
> +   WRITE_ONCE(ring->ncons, ring_cons + last_nr_txbb);
> +
> +   if (dev->doorbell_opt)
> +   mlx4_en_xmit_doorbell(dev, ring);
...
> +   /* Doorbell avoidance : We can omit doorbell if we know a TX 
> completion
> +* will happen shortly.
> +*/
> +   if (send_doorbell &&
> +   dev->doorbell_opt &&
> +   (s32)(READ_ONCE(ring->prod_bell) - READ_ONCE(ring->ncons)) > 0)
> +   send_doorbell = false;

this is awesome idea!
I'm missing though why you need all these read_once/write_once.
It's a tx ring that is already protected by tx queue lock
or completion is happening without grabbing the lock?
Then how do we make sure that we don't race here
and indeed above prod_bell - ncons > 0 condition is safe?
Good description of algorithm would certainly help ;)


Re: [patch net / RFC] net: fec: increase frame size limitation to actually available buffer

2016-11-29 Thread Toshiaki Makita
On 2016/11/30 15:36, Nikita Yushchenko wrote:
>> But I think it is not necessary since the driver don't support jumbo frame.
> 
> Hardcoded 1522 raises two separate issues.
> 
> (1) When DSA is in use, frames processed by FEC chip contain DSA tag and
> thus can be larger than hardcoded limit of 1522. This issue is not
> FEC-specific, any driver that hardcodes maximum frame size to 1522 (many
> do) will have this issue if used with DSA.
> 
> Clean solution for this must take into account that difference between
> MTU and max frame size is no longer known at compile time. Actually this
> is the case even without DSA, due to VLANs: max frame size is (MTU + 18)
> without VLANs, but (MTU + 22) with VLANs. However currently drivers tend
> to ignore this and hardcode 22.  With DSA, 22 is not enough, need to add
> switch-specific tag size to that.
> 
> Not yet sure how to handle this. DSA-specific API to find out tag size
> could be added, but generic solution should handle all cases of dynamic
> difference between MTU and max frame size, not only DSA.

BTW I'm trying to introduce envelope frames to solve this kind of problems.
http://marc.info/?t=14749669155=1=2
http://marc.info/?t=14749669153=1=2
http://marc.info/?t=14749669152=1=2
http://marc.info/?t=14749669154=1=2
http://marc.info/?t=14749669151=1=2

It needs jumbo frame support of NICs though.

Regards,
Toshiaki Makita




Re: [PATCH v3 net-next 2/3] openvswitch: Use is_skb_forwardable() for length check.

2016-11-29 Thread Pravin Shelar
On Tue, Nov 29, 2016 at 3:30 PM, Jarno Rajahalme  wrote:
> Use is_skb_forwardable() instead of an explicit length check.  This
> gets around the apparent MTU check failure in OVS test cases when
> skb->protocol is not properly set in case of non-accelerated VLAN
> skbs.
>
> Suggested-by: Pravin Shelar 
> Fixes: 5108bbaddc ("openvswitch: add processing of L3 packets")
> Signed-off-by: Jarno Rajahalme 
> ---
> v3: New patch suggested by Pravin.
>
> net/openvswitch/vport.c | 38 ++
>  1 file changed, 14 insertions(+), 24 deletions(-)
>
> diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
> index b6c8524..076b39f 100644
> --- a/net/openvswitch/vport.c
> +++ b/net/openvswitch/vport.c

>  void ovs_vport_send(struct vport *vport, struct sk_buff *skb, u8 mac_proto)
>  {
> -   int mtu = vport->dev->mtu;
> -
> switch (vport->dev->type) {
> case ARPHRD_NONE:
> if (mac_proto == MAC_PROTO_ETHERNET) {
> @@ -504,11 +485,20 @@ void ovs_vport_send(struct vport *vport, struct sk_buff 
> *skb, u8 mac_proto)
> goto drop;
> }
>
> -   if (unlikely(packet_length(skb, vport->dev) > mtu &&
> -!skb_is_gso(skb))) {
> -   net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n",
> -vport->dev->name,
> -packet_length(skb, vport->dev), mtu);
> +   if (unlikely(!is_skb_forwardable(vport->dev, skb))) {

This would easy to read if you inverse the if condition here.

> +   /* Log only if the device is up. */
> +   if (vport->dev->flags & IFF_UP) {

since is_skb_forwardable() is checking for IFF_UP we can remove same
check from internal-device send() op.

> +   unsigned int length = skb->len
> +   - vport->dev->hard_header_len;
> +
> +   if (!skb_vlan_tag_present(skb)
> +   && eth_type_vlan(skb->protocol))
> +   length -= VLAN_HLEN;
> +
> +   net_warn_ratelimited("%s: dropped over-mtu packet %d 
> > %d\n",
> +vport->dev->name, length,
> +vport->dev->mtu);
> +   }
> vport->dev->stats.tx_errors++;
> goto drop;
> }
> --
> 2.1.4
>


Re: [PATCH iproute2 V2 1/2] tc/cls_flower: Classify packet in ip tunnels

2016-11-29 Thread Amir Vadai
On Tue, Nov 29, 2016 at 07:26:58PM -0800, Stephen Hemminger wrote:
> The overall design is fine, just a couple nits with the code.
> 
> > diff --git a/tc/f_flower.c b/tc/f_flower.c
> > index 2d31d1aa832d..1cf0750b5b83 100644
> > --- a/tc/f_flower.c
> > +++ b/tc/f_flower.c
> 
> >  
> > +static int flower_parse_key_id(char *str, int type, struct nlmsghdr *n)
> 
> str is not modified, therefore use: const char *str
ack

> 
> > +{
> > +   int ret;
> > +   __be32 key_id;
> > +
> > +   ret = get_be32(_id, str, 10);
> > +   if (ret)
> > +   return -1;
> 
> Traditionally netlink attributes are in host order, why was flower
> chosen to be different?
I don't know, maybe Jiri (cc'ed) can explain, but it is all over the
flower code.

> 
> > +
> > +   addattr32(n, MAX_MSG, type, key_id);
> > +
> > +   return 0;
> 
> 
> Why lose the return value here?  Why not:
> 
>   ret = get_be32(_id, str, 10);
>   if (!ret)
>   addattr32(n, MAX_MSG, type, key_id);
The get_*() function can return only -1 or 0. But you are right, and it
is better the way you suggested.  Changing accordingly in V3.

> 
> > +}
> > +
> >  static int flower_parse_opt(struct filter_util *qu, char *handle,
> > int argc, char **argv, struct nlmsghdr *n)
> >  {
> > @@ -339,6 +359,38 @@ static int flower_parse_opt(struct filter_util *qu, 
> > char *handle,
> > fprintf(stderr, "Illegal \"src_port\"\n");
> > return -1;
> > }
> > +   } else if (matches(*argv, "enc_dst_ip") == 0) {
> > +   NEXT_ARG();
> > +   ret = flower_parse_ip_addr(*argv, 0,
> > +  TCA_FLOWER_KEY_ENC_IPV4_DST,
> > +  
> > TCA_FLOWER_KEY_ENC_IPV4_DST_MASK,
> > +  TCA_FLOWER_KEY_ENC_IPV6_DST,
> > +  
> > TCA_FLOWER_KEY_ENC_IPV6_DST_MASK,
> > +  n);
> > +   if (ret < 0) {
> > +   fprintf(stderr, "Illegal \"enc_dst_ip\"\n");
> > +   return -1;
> > +   }
> > +   } else if (matches(*argv, "enc_src_ip") == 0) {
> > +   NEXT_ARG();
> > +   ret = flower_parse_ip_addr(*argv, 0,
> > +  TCA_FLOWER_KEY_ENC_IPV4_SRC,
> > +  
> > TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,
> > +  TCA_FLOWER_KEY_ENC_IPV6_SRC,
> > +  
> > TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,
> > +  n);
> > +   if (ret < 0) {
> > +   fprintf(stderr, "Illegal \"enc_src_ip\"\n");
> > +   return -1;
> > +   }
> > +   } else if (matches(*argv, "enc_key_id") == 0) {
> > +   NEXT_ARG();
> > +   ret = flower_parse_key_id(*argv,
> > + TCA_FLOWER_KEY_ENC_KEY_ID, n);
> > +   if (ret < 0) {
> > +   fprintf(stderr, "Illegal \"enc_key_id\"\n");
> > +   return -1;
> > +   }
> > } else if (matches(*argv, "action") == 0) {
> > NEXT_ARG();
> > ret = parse_action(, , TCA_FLOWER_ACT, n);
> > @@ -509,6 +561,14 @@ static void flower_print_port(FILE *f, char *name, 
> > __u8 ip_proto,
> > fprintf(f, "\n  %s %d", name, ntohs(rta_getattr_u16(attr)));
> >  }
> >  
> > +static void flower_print_key_id(FILE *f, char *name,
> > +   struct rtattr *attr)
> 
> const char *name?
ack

> 
> 
> > +{
> > +   if (!attr)
> > +   return;
> > +   fprintf(f, "\n  %s %d", name, ntohl(rta_getattr_u32(attr)));
> > +}
> > +
> 
> Why short circuit, just change the order:
> 
>   if (attr)
>   fprintf(f, "\n  %s %s", name, ntohl(rta_getattr_u32(attr));
> 
> You might also want to introduce rta_getattr_be32()
ack

> 
> Please change, retest and resubmit both patches.
ack

Thanks for reviewing,
Amir


Re: [PATCH net-next v3 3/4] bpf: BPF for lightweight tunnel infrastructure

2016-11-29 Thread Alexei Starovoitov
On Wed, Nov 30, 2016 at 07:48:51AM +0100, Thomas Graf wrote:
> On 11/29/16 at 04:15pm, Alexei Starovoitov wrote:
> > On Tue, Nov 29, 2016 at 02:21:22PM +0100, Thomas Graf wrote:
> > ...
> > > +#define LWT_BPF_MAX_HEADROOM 128
> > 
> > why 128?
> > btw I'm thinking for XDP to use 256, so metadata can be stored in there.
> 
> It's an arbitrary limit to catch obvious misconfiguration. I'm absolutely
> fine with bumping it to 256.
> 
> > > +static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
> > > +struct dst_entry *dst, bool can_redirect)
> > > +{
> > > + int ret;
> > > +
> > > + /* Preempt disable is needed to protect per-cpu redirect_info between
> > > +  * BPF prog and skb_do_redirect(). The call_rcu in bpf_prog_put() and
> > > +  * access to maps strictly require a rcu_read_lock() for protection,
> > > +  * mixing with BH RCU lock doesn't work.
> > > +  */
> > > + preempt_disable();
> > > + rcu_read_lock();
> > > + bpf_compute_data_end(skb);
> > > + ret = BPF_PROG_RUN(lwt->prog, skb);
> > > + rcu_read_unlock();
> > > +
> > > + switch (ret) {
> > > + case BPF_OK:
> > > + break;
> > > +
> > > + case BPF_REDIRECT:
> > > + if (!can_redirect) {
> > > + WARN_ONCE(1, "Illegal redirect return code in prog 
> > > %s\n",
> > > +   lwt->name ? : "");
> > > + ret = BPF_OK;
> > > + } else {
> > > + ret = skb_do_redirect(skb);
> > 
> > I think this assumes that program did bpf_skb_push and L2 header is present.
> > Would it make sense to check that mac_header < network_header here to make
> > sure that it actually happened? I think the cost of single 'if' isn't much.
> > Also skb_do_redirect() can redirect to l3 tunnels like ipip ;)
> > so program shouldn't be doing bpf_skb_push in such case...
> 
> We are currently guaranteeing mac_header <= network_header given that
> bpf_skb_push() is calling skb_reset_mac_header() unconditionally.
> 
> Even if a program were to push an L2 header and then redirect to an l3
> tunnel, __bpf_redirect_no_mac will pull it off again and correct the
> mac_header offset.

yes. that part is fine.

> Should we check in __bpf_redirect_common() whether mac_header <
> network_header then or add it to lwt-bpf conditional on
> dev_is_mac_header_xmit()?

may be only extra 'if' in lwt-bpf is all we need?
I'm still missing what will happen if we 'forget' to do
bpf_skb_push() inside the lwt-bpf program, but still do redirect
in lwt_xmit stage to l2 netdev...



Re: [PATCH net-next v3 4/4] bpf: Add tests and samples for LWT-BPF

2016-11-29 Thread Thomas Graf
On 11/29/16 at 04:17pm, Alexei Starovoitov wrote:
> On Tue, Nov 29, 2016 at 02:21:23PM +0100, Thomas Graf wrote:
> > Adds a series of test to verify the functionality of attaching
> > BPF programs at LWT hooks.
> > 
> > Also adds a sample which collects a histogram of packet sizes which
> > pass through an LWT hook.
> > 
> > $ ./lwt_len_hist.sh
> > Starting netserver with host 'IN(6)ADDR_ANY' port '12865' and family 
> > AF_UNSPEC
> > MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 
> > 192.168.253.2 () port 0 AF_INET : demo
> > Recv   SendSend
> > Socket Socket  Message  Elapsed
> > Size   SizeSize Time Throughput
> > bytes  bytes   bytessecs.10^6bits/sec
> > 
> >  87380  16384  1638410.0039857.69
> 
> Nice!
> 
> > +   ret = bpf_redirect(ifindex, 0);
> > +   if (ret < 0) {
> > +   printk("bpf_redirect() failed: %d\n", ret);
> > +   return BPF_DROP;
> > +   }
> 
> this 'if' looks a bit weird. You're passing 0 as flags,
> so this helper will always succeed.
> Other sample code often does 'return bpf_redirect(...)'
> due to this reasoning.

Right, the if branch is absolutely useless. I will remove it.


Re: [PATCH] netns: avoid disabling irq for netns id

2016-11-29 Thread Cong Wang
On Tue, Nov 29, 2016 at 2:14 PM, Paul Moore  wrote:
> On Tue, Nov 29, 2016 at 5:11 PM, Paul Moore  wrote:
>> From: Paul Moore 
>>
>> Bring back commit bc51dddf98c9 ("netns: avoid disabling irq for netns
>> id") now that we've fixed some audit multicast issues that caused
>> problems with original attempt.  Additional information, and history,
>> can be found in the links below:
>>
>>  * https://github.com/linux-audit/audit-kernel/issues/22
>>  * https://github.com/linux-audit/audit-kernel/issues/23
>>
>> Signed-off-by: Cong Wang 
>> Signed-off-by: Paul Moore 
>> ---
>>  net/core/net_namespace.c |   35 +++
>>  1 file changed, 15 insertions(+), 20 deletions(-)
>
> Cong Wang, I added your sign-off to the patch since you were the
> original author, if you would prefer I leave it off or want it changed
> just let me know.

Thanks for not forgetting it. I am fine with signed-off-by.


Re: [PATCH net-next v3 3/4] bpf: BPF for lightweight tunnel infrastructure

2016-11-29 Thread Thomas Graf
On 11/29/16 at 04:15pm, Alexei Starovoitov wrote:
> On Tue, Nov 29, 2016 at 02:21:22PM +0100, Thomas Graf wrote:
> ...
> > +#define LWT_BPF_MAX_HEADROOM 128
> 
> why 128?
> btw I'm thinking for XDP to use 256, so metadata can be stored in there.

It's an arbitrary limit to catch obvious misconfiguration. I'm absolutely
fine with bumping it to 256.

> > +static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
> > +  struct dst_entry *dst, bool can_redirect)
> > +{
> > +   int ret;
> > +
> > +   /* Preempt disable is needed to protect per-cpu redirect_info between
> > +* BPF prog and skb_do_redirect(). The call_rcu in bpf_prog_put() and
> > +* access to maps strictly require a rcu_read_lock() for protection,
> > +* mixing with BH RCU lock doesn't work.
> > +*/
> > +   preempt_disable();
> > +   rcu_read_lock();
> > +   bpf_compute_data_end(skb);
> > +   ret = BPF_PROG_RUN(lwt->prog, skb);
> > +   rcu_read_unlock();
> > +
> > +   switch (ret) {
> > +   case BPF_OK:
> > +   break;
> > +
> > +   case BPF_REDIRECT:
> > +   if (!can_redirect) {
> > +   WARN_ONCE(1, "Illegal redirect return code in prog 
> > %s\n",
> > + lwt->name ? : "");
> > +   ret = BPF_OK;
> > +   } else {
> > +   ret = skb_do_redirect(skb);
> 
> I think this assumes that program did bpf_skb_push and L2 header is present.
> Would it make sense to check that mac_header < network_header here to make
> sure that it actually happened? I think the cost of single 'if' isn't much.
> Also skb_do_redirect() can redirect to l3 tunnels like ipip ;)
> so program shouldn't be doing bpf_skb_push in such case...

We are currently guaranteeing mac_header <= network_header given that
bpf_skb_push() is calling skb_reset_mac_header() unconditionally.

Even if a program were to push an L2 header and then redirect to an l3
tunnel, __bpf_redirect_no_mac will pull it off again and correct the
mac_header offset.

Should we check in __bpf_redirect_common() whether mac_header <
network_header then or add it to lwt-bpf conditional on
dev_is_mac_header_xmit()?

> May be rename bpf_skb_push to bpf_skb_push_l2 ?
> since it's doing skb_reset_mac_header(skb); at the end of it?
> Or it's probably better to use 'flags' argument to tell whether
> bpf_skb_push() should set mac_header or not ?

I added the flags for this purpose but left it unused for now. This
will allow to add a flag later to extend the l3 header prior to
pushing an l2 header, hence the current helper name. But even then,
we would always reset the mac header as well to ensure we never
leave an skb with mac_header > network_header. Are you seeing a
scenario where we would want to omit the mac header reset?

> Then this bit:
> 
> > +   case BPF_OK:
> > +   /* If the L3 header was expanded, headroom might be too
> > +* small for L2 header now, expand as needed.
> > +*/
> > +   ret = xmit_check_hhlen(skb);
> 
> will work fine as well...
> which probably needs "mac_header wasn't set" check? or it's fine?

Right. The check is not strictly required right now but is a safety net
to ensure that the skb passed to dst_neigh_output() which will add the
l2 header in the non-redirect case to always have sufficient headroom.
It will currently be a NOP as we are not allowing to extend the l3 header
in bpf_skb_push() yet. I left this in there to ensure that we are not
missing to add this later.


Re: [patch net / RFC] net: fec: increase frame size limitation to actually available buffer

2016-11-29 Thread Nikita Yushchenko
> But I think it is not necessary since the driver don't support jumbo frame.

Hardcoded 1522 raises two separate issues.

(1) When DSA is in use, frames processed by FEC chip contain DSA tag and
thus can be larger than hardcoded limit of 1522. This issue is not
FEC-specific, any driver that hardcodes maximum frame size to 1522 (many
do) will have this issue if used with DSA.

Clean solution for this must take into account that difference between
MTU and max frame size is no longer known at compile time. Actually this
is the case even without DSA, due to VLANs: max frame size is (MTU + 18)
without VLANs, but (MTU + 22) with VLANs. However currently drivers tend
to ignore this and hardcode 22.  With DSA, 22 is not enough, need to add
switch-specific tag size to that.

Not yet sure how to handle this. DSA-specific API to find out tag size
could be added, but generic solution should handle all cases of dynamic
difference between MTU and max frame size, not only DSA.


(2) There is some demand to use larger frames for optimization purposes.

FEC register fields that limit frame size are 14-bit, thus allowing
frames up to (4k-1). I'm about to prepare a larger patch:
- add ndo_change_mtu handler, allowing MTU up to (4k - overhead),
- set MAX_FL / TRUNC_FL based on configured MTU,
- if necessary, do buffer reallocation with larger buffers.

Is this suitable for upstreaming?
Is there any policy related to handling larger frames?


Re: [PATCH net-next v5 2/3] bpf: Add new cgroup attach type to enable sock modifications

2016-11-29 Thread Alexei Starovoitov
On Tue, Nov 29, 2016 at 06:07:18PM -0700, David Ahern wrote:
> On 11/29/16 5:59 PM, Alexei Starovoitov wrote:
> > On Tue, Nov 29, 2016 at 05:43:08PM -0700, David Ahern wrote:
> >> On 11/29/16 1:01 PM, Alexei Starovoitov wrote:
> >>> Could you also expose sk_protcol and sk_type as read only fields?
> >>
> >> Those are bitfields in struct sock, so can't use offsetof or sizeof. Any 
> >> existing use cases that try to load a bitfield in a bpf that I can look at?
> > 
> > pkt_type, vlan are also bitfields in skb. Please see convert_skb_access()
> > There is a bit of ugliness due to __BIG_ENDIAN_BITFIELD though..
> > 
> 
> Given the added complexity I'd prefer to defer this second use case to a 
> follow on patch set. This one introduces the infra for sockets and I don't 
> see anything needing to change with it to add the read of 3 more sock 
> elements. Agree?

I don't see a complexity. It was straightforward for skb bitfields,
but if there is some unforeseen issue, it's better to tackle it now
otherwise the feature may never come and this 'infra for sockets' will
stay as 'infra for vrf only' and I'm struggling to convince myself that it's ok.
So I'll try to tweak this patch to add these 3 fields...



Re: [PATCH net-next v3 3/4] bpf: BPF for lightweight tunnel infrastructure

2016-11-29 Thread Alexei Starovoitov
On Tue, Nov 29, 2016 at 06:52:36PM -0800, John Fastabend wrote:
> On 16-11-29 04:15 PM, Alexei Starovoitov wrote:
> > On Tue, Nov 29, 2016 at 02:21:22PM +0100, Thomas Graf wrote:
> >> Registers new BPF program types which correspond to the LWT hooks:
> >>   - BPF_PROG_TYPE_LWT_IN   => dst_input()
> >>   - BPF_PROG_TYPE_LWT_OUT  => dst_output()
> >>   - BPF_PROG_TYPE_LWT_XMIT => lwtunnel_xmit()
> >>
> >> The separate program types are required to differentiate between the
> >> capabilities each LWT hook allows:
> >>
> >>  * Programs attached to dst_input() or dst_output() are restricted and
> >>may only read the data of an skb. This prevent modification and
> >>possible invalidation of already validated packet headers on receive
> >>and the construction of illegal headers while the IP headers are
> >>still being assembled.
> >>
> >>  * Programs attached to lwtunnel_xmit() are allowed to modify packet
> >>content as well as prepending an L2 header via a newly introduced
> >>helper bpf_skb_push(). This is safe as lwtunnel_xmit() is invoked
> >>after the IP header has been assembled completely.
> >>
> >> All BPF programs receive an skb with L3 headers attached and may return
> >> one of the following error codes:
> >>
> >>  BPF_OK - Continue routing as per nexthop
> >>  BPF_DROP - Drop skb and return EPERM
> >>  BPF_REDIRECT - Redirect skb to device as per redirect() helper.
> >> (Only valid in lwtunnel_xmit() context)
> >>
> >> The return codes are binary compatible with their TC_ACT_
> >> relatives to ease compatibility.
> >>
> >> Signed-off-by: Thomas Graf 
> > ...
> >> +#define LWT_BPF_MAX_HEADROOM 128
> > 
> > why 128?
> > btw I'm thinking for XDP to use 256, so metadata can be stored in there.
> > 
> 
> hopefully not too off-topic but for XDP I would like to see this get

definitely off-topic. lwt->headroom is existing concept. Too late
to do anything about it.

> passed down with the program. It would be more generic and drivers could
> configure the headroom on demand and more importantly verify that a
> program pushing data is not going to fail at runtime.

For xdp I think it will be problematic, since we'd have to check for
that value at prog array access to make sure tailcalls are not broken.
Mix and match won't be possible.
So what does 'configure the headroom on demand' buys us?
Isn't it much easier to tell all drivers "always reserve this much" ?
We burn the page anyway.
If it's configurable per driver, then we'd need an api
to retrieve it. Yet the program author doesn't care what the value is.
If program needs to do udp encap, it will try do it. No matter what.
If xdp_adjust_head() helper fails, the program will likely decide
to drop the packet. In some cases it may decide to punt to stack
for further processing, but for high performance dataplane code
it's highly unlikely.
If it's configurable to something that is not cache line boundary
hw dma performance may suffer and so on.
So I see only cons in such 'configurable headroom' and propose
to have fixed 256 bytes headroom for XDP
which is enough for any sensible encap and metadata.



Re: The ubufs->refcount maybe be subtracted twice when tun_get_user failed

2016-11-29 Thread Jason Wang



On 2016年11月30日 10:53, Jason Wang wrote:



On 2016年11月29日 21:27, wangyunjian wrote:
Sorry, I didn't describe it clearly. In fact, the second subtraction 
happens in the function handle_tx,

when tun_get_user fails and zcopy_used is true. Following the steps:


I get your meaning. Thanks for the reporting. Will post patches (since 
macvtap has similar issue).


Posted, appreciate if you can have a test on them.

Thanks


Re: [net-next PATCH v3 6/6] virtio_net: xdp, add slowpath case for non contiguous buffers

2016-11-29 Thread Alexei Starovoitov
On Tue, Nov 29, 2016 at 06:50:18PM -0800, John Fastabend wrote:
> On 16-11-29 04:37 PM, Alexei Starovoitov wrote:
> > On Tue, Nov 29, 2016 at 12:11:33PM -0800, John Fastabend wrote:
> >> virtio_net XDP support expects receive buffers to be contiguous.
> >> If this is not the case we enable a slowpath to allow connectivity
> >> to continue but at a significant performance overhead associated with
> >> linearizing data. To make it painfully aware to users that XDP is
> >> running in a degraded mode we throw an xdp buffer error.
> >>
> >> To linearize packets we allocate a page and copy the segments of
> >> the data, including the header, into it. After this the page can be
> >> handled by XDP code flow as normal.
> >>
> >> Then depending on the return code the page is either freed or sent
> >> to the XDP xmit path. There is no attempt to optimize this path.
> >>
> >> Signed-off-by: John Fastabend 
> > ...
> >> +/* The conditions to enable XDP should preclude the underlying device from
> >> + * sending packets across multiple buffers (num_buf > 1). However per spec
> >> + * it does not appear to be illegal to do so but rather just against 
> >> convention.
> >> + * So in order to avoid making a system unresponsive the packets are 
> >> pushed
> >> + * into a page and the XDP program is run. This will be extremely slow 
> >> and we
> >> + * push a warning to the user to fix this as soon as possible. Fixing 
> >> this may
> >> + * require resolving the underlying hardware to determine why multiple 
> >> buffers
> >> + * are being received or simply loading the XDP program in the ingress 
> >> stack
> >> + * after the skb is built because there is no advantage to running it here
> >> + * anymore.
> >> + */
> > ...
> >>if (num_buf > 1) {
> >>bpf_warn_invalid_xdp_buffer();
>   ^^
> 
> Here is the warn once call. I made it a defined bpf warning so that we
> can easily identify if needed and it can be used by other drivers as
> well.

ahh. I thought it has - in front of it :)
and you're deleting it in this patch.

> >> -  goto err_xdp;
> >> +
> >> +  /* linearize data for XDP */
> >> +  xdp_page = xdp_linearize_page(rq, num_buf,
> >> +page, offset, );
> >> +  if (!xdp_page)
> >> +  goto err_xdp;
> > 
> > in case when we're 'lucky' the performance will silently be bad.
> 
> Were you specifically worried about the alloc failing here and no
> indication? I was thinking a statistic counter could be added as a
> follow on series to catch this and other such cases in non XDP paths
> if needed.
> 
> > Can we do warn_once here? so at least something in dmesg points out
> > that performance is not as expected. Am I reading it correctly that
> > you had to do a special kernel hack to trigger this situation and
> > in all normal cases it's not the case?
> > 
> 
> Correct the only way to produce this with upstream qemu and Linux is to
> write a kernel hack to build these types of buffers. AFAIK I caught all
> the cases where it could happen otherwise in the setup xdp prog call and
> required user to configure device driver so they can not happen e.g.
> disable LRO, set MTU < PAGE_SIZE, etc.
> 
> If I missed anything, I don't see it now, I would call it a bug and fix
> it. Again AFAIK everything should be covered though. As Micheal pointed
> out earlier although unexpected its not strictly against virtio spec to
> build such packets so we should handle it at least in a degraded mode
> even though it is not expected in any qemu cases.

Ok. Makes sense. The above wasn't clear in the commit log.
Thanks



[PATCH net 2/2] macvtap: handle ubuf refcount correctly when meet erros

2016-11-29 Thread Jason Wang
We trigger uarg->callback() immediately after we decide to do datacopy
even if the caller wants to do zerocopy. This causes the callback
(vhost_net_zerocopy_callback) to decrease the refcount. But when we meet
an error afterwards, the error handling in vhost handle_tx() will try
to decrease it again. This is wrong; fix it by delaying the
uarg->callback() call until we're sure there are no errors.

Signed-off-by: Jason Wang 
---
The patch is needed for -stable.
---
 drivers/net/macvtap.c | 11 +--
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index bceca28..7869b06 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -742,13 +742,8 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, 
struct msghdr *m,
 
if (zerocopy)
err = zerocopy_sg_from_iter(skb, from);
-   else {
+   else
err = skb_copy_datagram_from_iter(skb, 0, from, len);
-   if (!err && m && m->msg_control) {
-   struct ubuf_info *uarg = m->msg_control;
-   uarg->callback(uarg, false);
-   }
-   }
 
if (err)
goto err_kfree;
@@ -779,7 +774,11 @@ static ssize_t macvtap_get_user(struct macvtap_queue *q, 
struct msghdr *m,
skb_shinfo(skb)->destructor_arg = m->msg_control;
skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
+   } else if (m && m->msg_control) {
+   struct ubuf_info *uarg = m->msg_control;
+   uarg->callback(uarg, false);
}
+
if (vlan) {
skb->dev = vlan->dev;
dev_queue_xmit(skb);
-- 
2.7.4



[PATCH net 1/2] tun: handle ubuf refcount correctly when meet erros

2016-11-29 Thread Jason Wang
We trigger uarg->callback() immediately after we decide to do datacopy
even if the caller wants to do zerocopy. This causes the callback
(vhost_net_zerocopy_callback) to decrease the refcount. But when we meet
an error afterwards, the error handling in vhost handle_tx() will try
to decrease it again. This is wrong; fix it by delaying the
uarg->callback() call until we're sure there are no errors.

Reported-by: wangyunjian 
Signed-off-by: Jason Wang 
---
The patch is needed for -stable.
---
 drivers/net/tun.c | 10 --
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 8093e39..db6acec 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1246,13 +1246,8 @@ static ssize_t tun_get_user(struct tun_struct *tun, 
struct tun_file *tfile,
 
if (zerocopy)
err = zerocopy_sg_from_iter(skb, from);
-   else {
+   else
err = skb_copy_datagram_from_iter(skb, 0, from, len);
-   if (!err && msg_control) {
-   struct ubuf_info *uarg = msg_control;
-   uarg->callback(uarg, false);
-   }
-   }
 
if (err) {
this_cpu_inc(tun->pcpu_stats->rx_dropped);
@@ -1298,6 +1293,9 @@ static ssize_t tun_get_user(struct tun_struct *tun, 
struct tun_file *tfile,
skb_shinfo(skb)->destructor_arg = msg_control;
skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
+   } else if (msg_control) {
+   struct ubuf_info *uarg = msg_control;
+   uarg->callback(uarg, false);
}
 
skb_reset_network_header(skb);
-- 
2.7.4



Re: netlink: GPF in sock_sndtimeo

2016-11-29 Thread Richard Guy Briggs
On 2016-11-29 15:13, Cong Wang wrote:
> On Tue, Nov 29, 2016 at 8:48 AM, Richard Guy Briggs  wrote:
> > On 2016-11-26 17:11, Cong Wang wrote:
> >> It is racy on audit_sock, especially on the netns exit path.
> >
> > I think that is the only place it is racy.  The other places audit_sock
> > is set is when the socket failure has just triggered a reset.
> >
> > Is there a notifier callback for failed or reaped sockets?
> 
> Is NETLINK_URELEASE event what you are looking for?

Possibly, yes.  Thanks, I'll have a look.

- RGB

--
Richard Guy Briggs 
Kernel Security Engineering, Base Operating Systems, Red Hat
Remote, Ottawa, Canada
Voice: +1.647.777.2635, Internal: (81) 32635


Re: [PATCH] net: macb: Write only necessary bits in NCR in macb reset

2016-11-29 Thread Harini Katakam
Hi David,

On Wed, Nov 30, 2016 at 5:35 AM, David Miller  wrote:
> From: Harini Katakam 
> Date: Mon, 28 Nov 2016 14:53:49 +0530
>
>> In macb_reset_hw, use read-modify-write to disable RX and TX.
>> This way existing settings and reserved bits won't be disturbed.
>> Use the same method for clearing statistics as well.
>>
>> Signed-off-by: Harini Katakam 
>
> This doesn't make much sense to me.
>
> Consider the two callers of this function.
>
> macb_init_hw() is going to do a non-masking write to the NCR
> register:
>
> /* Enable TX and RX */
> macb_writel(bp, NCR, MACB_BIT(RE) | MACB_BIT(TE) | MACB_BIT(MPE));
>
> So obviously no other writable fields matter at all for programming
> the chip properly, otherwise macb_init_hw() would "or" in the bits
> after a read of NCR.  But that's not what this code does, it
> writes "RE | TE | MPE" directly.
>
> And the other caller is macb_close() which is shutting down the
> chip so can zero out all the other bits and it can't possibly
> matter, also due to the assertion above about macb_init_hw()
> showing that only the RE, TE, and MPE bits matter for proper
> functioning of the chip.
>
> You haven't shown a issue caused by the way the code works now, so
> this patch isn't fixing a bug.  In fact, the "bit preserving" would
> even be misleading to someone reading the code.  They will ask
> themselves what bits need to be preserved, and as shown above none of
> them need to be.
>
> I'm not applying this, sorry.
>

Thanks for the detailed analysis.
I noticed an issue with respect to management port enable bit
when working on the patch series for macb mdio driver for
multi MAC - multi PHY access via same mdio bus.
MPE bit of the MAC (whose phy management register
is used) was being reset. But that is a separate
series still in review.
You are right, there is no existing bug that this
patch addresses. I will come back to this later if
required.

Regards,
Harini


Re: [PATCH net 00/16] net: fix fixed-link phydev leaks

2016-11-29 Thread David Miller
From: Johan Hovold 
Date: Mon, 28 Nov 2016 19:24:53 +0100

> This series fixes failures to deregister and free fixed-link phydevs
> that have been registered using the of_phy_register_fixed_link()
> interface.
> 
> All but two drivers currently fail to do this and this series fixes most
> of them with the exception of a staging driver and the stmmac drivers
> which will be fixed by follow-on patches.
> 
> Included are also a couple of fixes for related of-node leaks.
> 
> Note that all patches except the of_mdio one have been compile-tested
> only.
> 
> Also note that the series is against net due to dependencies not yet in
> net-next.

Series applied, thanks Johan.


Re: [RFC PATCH net-next v2] ipv6: implement consistent hashing for equal-cost multipath routing

2016-11-29 Thread Tom Herbert
On Tue, Nov 29, 2016 at 9:15 AM, David Lebrun  wrote:
> When multiple nexthops are available for a given route, the routing engine
> chooses a nexthop by computing the flow hash through get_hash_from_flowi6
> and by taking that value modulo the number of nexthops. The resulting value
> indexes the nexthop to select. This method causes issues when a new nexthop
> is added or one is removed (e.g. link failure). In that case, the number
> of nexthops changes and potentially all the flows get re-routed to another
> nexthop.
>
This is a lot of code to make ECMP work better. Can you be more
specific as to what the "issues" are? Assuming this is just the
transient packet reorder that happens in one link flap I am wondering
if this complexity is justified.

Tom

> This patch implements a consistent hash method to select the nexthop in
> case of ECMP. The idea is to generate K slices (or intervals) for each
> route with multiple nexthops. The nexthops are randomly assigned to those
> slices, in a uniform manner. The number K is configurable through a sysctl
> net.ipv6.route.ecmp_slices and is always an exponent of 2. To select the
> nexthop, the algorithm takes the flow hash and computes an index which is
> the flow hash modulo K. As K = 2^x, the modulo can be computed using a
> simple binary AND operation (idx = hash & (K - 1)). The resulting index
> references the selected nexthop. The lookup time complexity is thus O(1).
>
> When a nexthop is added, it steals K/N slices from the other nexthops,
> where N is the new number of nexthops. The slices are stolen randomly and
> uniformly from the other nexthops. When a nexthop is removed, the orphan
> slices are randomly reassigned to the other nexthops.
>
> The number of slices for a route also fixes the maximum number of nexthops
> possible for that route.
>
> Signed-off-by: David Lebrun 
> ---
>  include/net/ip6_fib.h|  25 +++
>  include/net/netns/ipv6.h |   1 +
>  net/ipv6/ip6_fib.c   | 174 
> ++-
>  net/ipv6/route.c |  58 
>  4 files changed, 229 insertions(+), 29 deletions(-)
>
> diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
> index a74e2aa..29594cc 100644
> --- a/include/net/ip6_fib.h
> +++ b/include/net/ip6_fib.h
> @@ -93,6 +93,30 @@ struct rt6key {
>
>  struct fib6_table;
>
> +/* Multipath nexthop.
> + * @nh is the actual nexthop
> + * @nslices is the number of slices assigned to this nexthop
> + */
> +struct rt6_multi_nh {
> +   struct rt6_info *nh;
> +   unsigned intnslices;
> +};
> +
> +/* Multipath routes map.
> + * @nhs is an array of available nexthops
> + * @size is the number of elements in @nhs
> + * @nslices is the number of slices and the number of elements in @index
> + * and is always in the form 2^x to prevent using a division for
> + * the computation of the modulo.
> + * @index is an array mapping a slice index to a nexthop index in @nhs
> + */
> +struct rt6_multi_map {
> +   struct rt6_multi_nh *nhs;
> +   unsigned intsize;
> +   unsigned intnslices;
> +   __u8*index;
> +};
> +
>  struct rt6_info {
> struct dst_entrydst;
>
> @@ -113,6 +137,7 @@ struct rt6_info {
>  */
> struct list_headrt6i_siblings;
> unsigned intrt6i_nsiblings;
> +   struct rt6_multi_map*rt6i_nh_map;
>
> atomic_trt6i_ref;
>
> diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
> index de7745e..4967846 100644
> --- a/include/net/netns/ipv6.h
> +++ b/include/net/netns/ipv6.h
> @@ -36,6 +36,7 @@ struct netns_sysctl_ipv6 {
> int idgen_retries;
> int idgen_delay;
> int flowlabel_state_ranges;
> +   int ip6_rt_ecmp_slices;
>  };
>
>  struct netns_ipv6 {
> diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
> index ef54852..b6b1895 100644
> --- a/net/ipv6/ip6_fib.c
> +++ b/net/ipv6/ip6_fib.c
> @@ -727,6 +727,166 @@ static void fib6_purge_rt(struct rt6_info *rt, struct 
> fib6_node *fn,
> }
>  }
>
> +static void fib6_mp_free(struct rt6_info *rt)
> +{
> +   struct rt6_multi_map *nh_map = rt->rt6i_nh_map;
> +   struct rt6_info *sibling;
> +
> +   if (nh_map) {
> +   list_for_each_entry(sibling, >rt6i_siblings,
> +   rt6i_siblings) {
> +   sibling->rt6i_nh_map = NULL;
> +   }
> +
> +   rt->rt6i_nh_map = NULL;
> +
> +   kfree(nh_map->nhs);
> +   kfree(nh_map->index);
> +   kfree(nh_map);
> +   }
> +}
> +
> +static int fib6_mp_extend(struct net *net, struct rt6_info *sref,
> + struct rt6_info *rt)
> +{
> +   struct rt6_multi_map *nh_map = sref->rt6i_nh_map;
> +   struct rt6_multi_nh *tmp_nhs, *old_nhs;
> 

Re: [RFC PATCH net-next v2] ipv6: implement consistent hashing for equal-cost multipath routing

2016-11-29 Thread Hannes Frederic Sowa
Hi,

On Tue, Nov 29, 2016, at 18:15, David Lebrun wrote:
> When multiple nexthops are available for a given route, the routing
> engine
> chooses a nexthop by computing the flow hash through get_hash_from_flowi6
> and by taking that value modulo the number of nexthops. The resulting
> value
> indexes the nexthop to select. This method causes issues when a new
> nexthop
> is added or one is removed (e.g. link failure). In that case, the number
> of nexthops changes and potentially all the flows get re-routed to
> another
> nexthop.
> 
> This patch implements a consistent hash method to select the nexthop in
> case of ECMP. The idea is to generate K slices (or intervals) for each
> route with multiple nexthops. The nexthops are randomly assigned to those
> slices, in a uniform manner. The number K is configurable through a
> sysctl
> net.ipv6.route.ecmp_slices and is always an exponent of 2. To select the
> nexthop, the algorithm takes the flow hash and computes an index which is
> the flow hash modulo K. As K = 2^x, the modulo can be computed using a
> simple binary AND operation (idx = hash & (K - 1)). The resulting index
> references the selected nexthop. The lookup time complexity is thus O(1).
> 
> When a nexthop is added, it steals K/N slices from the other nexthops,
> where N is the new number of nexthops. The slices are stolen randomly and
> uniformly from the other nexthops. When a nexthop is removed, the orphan
> slices are randomly reassigned to the other nexthops.
> 
> The number of slices for a route also fixes the maximum number of
> nexthops
> possible for that route.

In the worst case this causes 2GB (order 19) allocations (x == 31) to
happen in GFP_ATOMIC (due to write lock) context and could cause update
failures to the routing table due to fragmentation. Are you sure the
upper limit of 31 is reasonable? I would very much prefer an upper limit
of below or equal 25 for x to stay within the bounds of the slab
allocators (which is still a lot and probably causes errors!).
Unfortunately because of the nature of the sysctl you can't really
create its own cache for it. :/

Also by design, one day this should all be RCU and having that much data
outstanding worries me a bit during routing table mutation.

I am a fan of consistent hashing but I am not so sure if it belongs into
a generic ECMP implementation or into its own ipvs or netfilter module
where you specifically know how much memory to burn for it.

Also please convert the sysctl to a netlink attribute if you pursue this
because if I change the sysctl while my quagga is hammering the routing
table I would like to know which nodes allocate what amount of memory.

Bye,
Hannes


Re: [PATCH net-next 5/5] udp: add recvmmsg implementation

2016-11-29 Thread Hannes Frederic Sowa
Hello,

On Wed, Nov 30, 2016, at 01:22, David Miller wrote:
> From: Hannes Frederic Sowa 
> Date: Fri, 25 Nov 2016 18:09:00 +0100
> 
> > During review we discussed on how to handle major errors in the kernel:
> > 
> > The old code and the new code still can report back success even though
> > the kernel got back an EFAULT while copying from kernel space to user
> > space (due to bad pointers).
> > 
> > I favor that we drop all packets (also the already received batches) in
> > this case and let the code report -EFAULT and increase sk_drops for all
> > dropped packets from the queue.
> > 
> > Currently sk_err is set so the next syscall would get an -EFAULT, which
> > seems very bad and can also be overwritten by incoming icmp packets, so
> > we never get a notification that we actually had a bad pointer somewhere
> > in the mmsghdr. Also delivering -EFAULT on the follow-up syscalls really
> > will make people confused that use strace.
> > 
> > If people would like to know the amount of packets dropped we can make
> > sk_drops readable by an getsockopt.
> > 
> > Thoughts?
> > 
> > Unfortunately the interface doesn't allow for better error handling.
> 
> I think this is a major problem.
> 
> If, as a side effect of batch dequeueing the SKBs from the socket,
> you cannot stop properly mid-transfer if an error occurs, well then
> you simply cannot batch like that.
> 
> You have to stop the exact byte where an error occurs mid-stream,
> return the successful amount of bytes transferred, and then return
> the error on the next recvmmsg call.
> 
> There is no other sane error reporting strategy.

Actually I think there is no sane error handling strategy at all.

SIGSEGV and EFAULT should be delivered reliable in my opinion and all
the details become very difficult suddenly.

E.g. if we recvmmsg with -EFAULT and we try to deliver the fault on the
following socket call, I am pretty certain most programs don't bother
with close() return values, so the application might simply ignore it.
Also -EFAULT is not in our repository for error codes to return.

In case of UDP we can simply drop the packets and I would be okay with
that (in some cases we actually guarantee correctly ordered packets,
even for UDP, so we can't simply queue those packets back).

Also I would very much prefer ptrace/gdb to show me the syscall where
the memory management fault happened and not the next one.

> If I get 4 frames, and the kernel can successfully copy the first
> three and get an -EFAULT on the 4th.  Dammit you better tell the
> application this so it can properly process the first 3 packets and
> then determine how it is going to error out and recover for the 4th
> one.
> 
> If we need to add prioritized sk_err stuff, or another value like
> "sk_app_err" to handle the ICMP vs. -EFAULT issue, so be it.
> 
> I know what you guys are thinking, in that you can't figure out a
> way to avoid the transactional overhead if it is necessary to
> "put back" some SKBs if one of them in the batch gets a fault.

I prefer correctness over performance all the time. :)

> That's too bad, we need a proper implementation and proper error
> reporting.  Those performance numbers are useless if we effectively
> lose error notifications.

We have those problems right now and besides deprecating the syscalls I
have no idea how to fix this reliably and would probably need a lot of
changes (besides the sk_app_err solution, which I don't really favor at
all).

The syscall should have been designed in a way that the struct mmsghdr
-> msg_len would be ssize_t, so we could return error codes per fragment
and test before starting the batch that we have proper memory, so we
don't fail in the management code. :(

Bye,
Hannes


Re: [PATCH iproute2 V2 1/2] tc/cls_flower: Classify packet in ip tunnels

2016-11-29 Thread Stephen Hemminger
The overall design is fine, just a couple nits with the code.

> diff --git a/tc/f_flower.c b/tc/f_flower.c
> index 2d31d1aa832d..1cf0750b5b83 100644
> --- a/tc/f_flower.c
> +++ b/tc/f_flower.c

>  
> +static int flower_parse_key_id(char *str, int type, struct nlmsghdr *n)

str is not modified, therefore use: const char *str

> +{
> + int ret;
> + __be32 key_id;
> +
> + ret = get_be32(&key_id, str, 10);
> + if (ret)
> + return -1;

Traditionally netlink attributes are in host order, why was flower
chosen to be different?

> +
> + addattr32(n, MAX_MSG, type, key_id);
> +
> + return 0;


Why lose the return value here?  Why not:

ret = get_be32(&key_id, str, 10);
if (!ret)
addattr32(n, MAX_MSG, type, key_id);

> +}
> +
>  static int flower_parse_opt(struct filter_util *qu, char *handle,
>   int argc, char **argv, struct nlmsghdr *n)
>  {
> @@ -339,6 +359,38 @@ static int flower_parse_opt(struct filter_util *qu, char 
> *handle,
>   fprintf(stderr, "Illegal \"src_port\"\n");
>   return -1;
>   }
> + } else if (matches(*argv, "enc_dst_ip") == 0) {
> + NEXT_ARG();
> + ret = flower_parse_ip_addr(*argv, 0,
> +TCA_FLOWER_KEY_ENC_IPV4_DST,
> +
> TCA_FLOWER_KEY_ENC_IPV4_DST_MASK,
> +TCA_FLOWER_KEY_ENC_IPV6_DST,
> +
> TCA_FLOWER_KEY_ENC_IPV6_DST_MASK,
> +n);
> + if (ret < 0) {
> + fprintf(stderr, "Illegal \"enc_dst_ip\"\n");
> + return -1;
> + }
> + } else if (matches(*argv, "enc_src_ip") == 0) {
> + NEXT_ARG();
> + ret = flower_parse_ip_addr(*argv, 0,
> +TCA_FLOWER_KEY_ENC_IPV4_SRC,
> +
> TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,
> +TCA_FLOWER_KEY_ENC_IPV6_SRC,
> +
> TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,
> +n);
> + if (ret < 0) {
> + fprintf(stderr, "Illegal \"enc_src_ip\"\n");
> + return -1;
> + }
> + } else if (matches(*argv, "enc_key_id") == 0) {
> + NEXT_ARG();
> + ret = flower_parse_key_id(*argv,
> +   TCA_FLOWER_KEY_ENC_KEY_ID, n);
> + if (ret < 0) {
> + fprintf(stderr, "Illegal \"enc_key_id\"\n");
> + return -1;
> + }
>   } else if (matches(*argv, "action") == 0) {
>   NEXT_ARG();
>   ret = parse_action(, , TCA_FLOWER_ACT, n);
> @@ -509,6 +561,14 @@ static void flower_print_port(FILE *f, char *name, __u8 
> ip_proto,
>   fprintf(f, "\n  %s %d", name, ntohs(rta_getattr_u16(attr)));
>  }
>  
> +static void flower_print_key_id(FILE *f, char *name,
> + struct rtattr *attr)

const char *name?


> +{
> + if (!attr)
> + return;
> + fprintf(f, "\n  %s %d", name, ntohl(rta_getattr_u32(attr)));
> +}
> +

Why short circuit, just change the order:

if (attr)
fprintf(f, "\n  %s %s", name, ntohl(rta_getattr_u32(attr));

You might also want to introduce rta_getattr_be32()

Please change, retest and resubmit both patches.


Re: [PATCH iproute2 1/2] devlink: Add usage help for eswitch subcommand

2016-11-29 Thread Stephen Hemminger
On Sun, 27 Nov 2016 13:21:02 +0200
Roi Dayan  wrote:

> Add missing usage help for devlink dev eswitch subcommand.
> 
> Signed-off-by: Roi Dayan 
> Reviewed-by: Or Gerlitz 

Ok. Applied both, will show in next merge


[PATCH v4 3/5] net: asix: Fix AX88772x resume failures

2016-11-29 Thread ASIX_Allan [Office]
The change fixes AX88772x resume failure by
- Restore incorrect AX88772A PHY registers when resetting
- Need to stop MAC operation when suspending
- Need to restart MII when restoring PHY

Signed-off-by: Allan Chou 
Signed-off-by: Robert Foss 
Tested-by: Robert Foss 
Tested-by: Jon Hunter 
Tested-by: Allan Chou 

---
drivers/net/usb/asix_devices.c | 47
+-
 1 file changed, 46 insertions(+), 1 deletion(-)

diff --git a/drivers/net/usb/asix_devices.c b/drivers/net/usb/asix_devices.c
index ebeb730..083dc2e 100644
--- a/drivers/net/usb/asix_devices.c
+++ b/drivers/net/usb/asix_devices.c
@@ -35,6 +35,15 @@
 
 #definePHY_MODE_RTL8211CL  0x000C
 
+#define AX88772A_PHY14H0x14
+#define AX88772A_PHY14H_DEFAULT 0x442C
+
+#define AX88772A_PHY15H0x15
+#define AX88772A_PHY15H_DEFAULT 0x03C8
+
+#define AX88772A_PHY16H0x16
+#define AX88772A_PHY16H_DEFAULT 0x4044
+
 struct ax88172_int_data {
__le16 res1;
u8 link;
@@ -424,7 +433,7 @@ static int ax88772a_hw_reset(struct usbnet *dev, int
in_pm)  {
struct asix_data *data = (struct asix_data *)&dev->data;
int ret, embd_phy;
-   u16 rx_ctl;
+   u16 rx_ctl, phy14h, phy15h, phy16h;
u8 chipcode = 0;
 
ret = asix_write_gpio(dev, AX_GPIO_RSE, 5, in_pm); @@ -482,6 +491,32
@@ static int ax88772a_hw_reset(struct usbnet *dev, int in_pm)
   ret);
goto out;
}
+   } else if ((chipcode & AX_CHIPCODE_MASK) == AX_AX88772A_CHIPCODE) {
+   /* Check if the PHY registers have default settings */
+   phy14h = asix_mdio_read_nopm(dev->net, dev->mii.phy_id,
+AX88772A_PHY14H);
+   phy15h = asix_mdio_read_nopm(dev->net, dev->mii.phy_id,
+AX88772A_PHY15H);
+   phy16h = asix_mdio_read_nopm(dev->net, dev->mii.phy_id,
+AX88772A_PHY16H);
+
+   netdev_dbg(dev->net,
+  "772a_hw_reset: MR20=0x%x MR21=0x%x MR22=0x%x\n",
+  phy14h, phy15h, phy16h);
+
+   /* Restore PHY registers default setting if not */
+   if (phy14h != AX88772A_PHY14H_DEFAULT)
+   asix_mdio_write_nopm(dev->net, dev->mii.phy_id,
+AX88772A_PHY14H,
+AX88772A_PHY14H_DEFAULT);
+   if (phy15h != AX88772A_PHY15H_DEFAULT)
+   asix_mdio_write_nopm(dev->net, dev->mii.phy_id,
+AX88772A_PHY15H,
+AX88772A_PHY15H_DEFAULT);
+   if (phy16h != AX88772A_PHY16H_DEFAULT)
+   asix_mdio_write_nopm(dev->net, dev->mii.phy_id,
+AX88772A_PHY16H,
+AX88772A_PHY16H_DEFAULT);
}
 
ret = asix_write_cmd(dev, AX_CMD_WRITE_IPG0, @@ -543,6 +578,15 @@
static const struct net_device_ops ax88772_netdev_ops = {  static void
ax88772_suspend(struct usbnet *dev)  {
struct asix_common_private *priv = dev->driver_priv;
+   u16 medium;
+
+   /* Stop MAC operation */
+   medium = asix_read_medium_status(dev, 1);
+   medium &= ~AX_MEDIUM_RE;
+   asix_write_medium_mode(dev, medium, 1);
+
+   netdev_dbg(dev->net, "ax88772_suspend: medium=0x%04x\n",
+  asix_read_medium_status(dev, 1));
 
/* Preserve BMCR for restoring */
priv->presvd_phy_bmcr =
@@ -577,6 +621,7 @@ static void ax88772_restore_phy(struct usbnet *dev)
asix_mdio_write_nopm(dev->net, dev->mii.phy_id, MII_BMCR,
 priv->presvd_phy_bmcr);
 
+   mii_nway_restart(>mii);
priv->presvd_phy_advertise = 0;
priv->presvd_phy_bmcr = 0;
}
--
2.7.4




Re: The ubufs->refcount maybe be subtracted twice when tun_get_user failed

2016-11-29 Thread Jason Wang



On 2016年11月29日 21:27, wangyunjian wrote:

Sorry, I didn't describe it clearly. In fact, the second subtraction happens in 
the function handle_tx,
when tun_get_user fails and zcopy_used is true. Following the steps:


I get your meaning. Thanks for the reporting. Will post patches (since 
macvtap has similar issue).





static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
   void *msg_control, struct iov_iter *from,
   int noblock) {
...

if (zerocopy)
  err = zerocopy_sg_from_iter(skb, from);
else {
  err = skb_copy_datagram_from_iter(skb, 0, from, len);
  if (!err && msg_control) {
   struct ubuf_info *uarg = msg_control;
   uarg->callback(uarg, false);   
--> step 1, the ubufs->refcount is subtracted frist.
  }
}

if (err) {
  this_cpu_inc(tun->pcpu_stats->rx_dropped);
  kfree_skb(skb);
  return -EFAULT;
}

err = virtio_net_hdr_to_skb(skb, , tun_is_little_endian(tun));
if (err) {
  this_cpu_inc(tun->pcpu_stats->rx_frame_errors);
  kfree_skb(skb);
  return -EINVAL; 
-->step 2, return err.
}
}

static void handle_tx(struct vhost_net *net)
{
...
/* TODO: Check specific error and bomb out unless ENOBUFS? */
err = sock->ops->sendmsg(sock, , len);
if (unlikely(err < 0)) {
if (zcopy_used) {
vhost_net_ubuf_put(ubufs);
--> step 3, the ubufs->refcount will be subtracted twice, when sendmsg 
execution err.
nvq->upend_idx = ((unsigned)nvq->upend_idx - 1)
% UIO_MAXIOV;
}
vhost_discard_vq_desc(vq, 1);
break;
}
...
}

-Original Message-




Re: [PATCH net-next v3 3/4] bpf: BPF for lightweight tunnel infrastructure

2016-11-29 Thread John Fastabend
On 16-11-29 04:15 PM, Alexei Starovoitov wrote:
> On Tue, Nov 29, 2016 at 02:21:22PM +0100, Thomas Graf wrote:
>> Registers new BPF program types which correspond to the LWT hooks:
>>   - BPF_PROG_TYPE_LWT_IN   => dst_input()
>>   - BPF_PROG_TYPE_LWT_OUT  => dst_output()
>>   - BPF_PROG_TYPE_LWT_XMIT => lwtunnel_xmit()
>>
>> The separate program types are required to differentiate between the
>> capabilities each LWT hook allows:
>>
>>  * Programs attached to dst_input() or dst_output() are restricted and
>>may only read the data of an skb. This prevent modification and
>>possible invalidation of already validated packet headers on receive
>>and the construction of illegal headers while the IP headers are
>>still being assembled.
>>
>>  * Programs attached to lwtunnel_xmit() are allowed to modify packet
>>content as well as prepending an L2 header via a newly introduced
>>helper bpf_skb_push(). This is safe as lwtunnel_xmit() is invoked
>>after the IP header has been assembled completely.
>>
>> All BPF programs receive an skb with L3 headers attached and may return
>> one of the following error codes:
>>
>>  BPF_OK - Continue routing as per nexthop
>>  BPF_DROP - Drop skb and return EPERM
>>  BPF_REDIRECT - Redirect skb to device as per redirect() helper.
>> (Only valid in lwtunnel_xmit() context)
>>
>> The return codes are binary compatible with their TC_ACT_
>> relatives to ease compatibility.
>>
>> Signed-off-by: Thomas Graf 
> ...
>> +#define LWT_BPF_MAX_HEADROOM 128
> 
> why 128?
> btw I'm thinking for XDP to use 256, so metadata can be stored in there.
> 

hopefully not too off-topic but for XDP I would like to see this get
passed down with the program. It would be more generic and drivers could
configure the headroom on demand and more importantly verify that a
program pushing data is not going to fail at runtime.

.John


Re: [net-next PATCH v3 6/6] virtio_net: xdp, add slowpath case for non contiguous buffers

2016-11-29 Thread John Fastabend
On 16-11-29 04:37 PM, Alexei Starovoitov wrote:
> On Tue, Nov 29, 2016 at 12:11:33PM -0800, John Fastabend wrote:
>> virtio_net XDP support expects receive buffers to be contiguous.
>> If this is not the case we enable a slowpath to allow connectivity
>> to continue but at a significant performance overhead associated with
>> linearizing data. To make it painfully aware to users that XDP is
>> running in a degraded mode we throw an xdp buffer error.
>>
>> To linearize packets we allocate a page and copy the segments of
>> the data, including the header, into it. After this the page can be
>> handled by XDP code flow as normal.
>>
>> Then depending on the return code the page is either freed or sent
>> to the XDP xmit path. There is no attempt to optimize this path.
>>
>> Signed-off-by: John Fastabend 
> ...
>> +/* The conditions to enable XDP should preclude the underlying device from
>> + * sending packets across multiple buffers (num_buf > 1). However per spec
>> + * it does not appear to be illegal to do so but rather just against 
>> convention.
>> + * So in order to avoid making a system unresponsive the packets are pushed
>> + * into a page and the XDP program is run. This will be extremely slow and 
>> we
>> + * push a warning to the user to fix this as soon as possible. Fixing this 
>> may
>> + * require resolving the underlying hardware to determine why multiple 
>> buffers
>> + * are being received or simply loading the XDP program in the ingress stack
>> + * after the skb is built because there is no advantage to running it here
>> + * anymore.
>> + */
> ...
>>  if (num_buf > 1) {
>>  bpf_warn_invalid_xdp_buffer();
^^

Here is the warn once call. I made it a defined bpf warning so that we
can easily identify if needed and it can be used by other drivers as
well.

>> -goto err_xdp;
>> +
>> +/* linearize data for XDP */
>> +xdp_page = xdp_linearize_page(rq, num_buf,
>> +  page, offset, );
>> +if (!xdp_page)
>> +goto err_xdp;
> 
> in case when we're 'lucky' the performance will silently be bad.

Were you specifically worried about the alloc failing here and no
indication? I was thinking a statistic counter could be added as a
follow on series to catch this and other such cases in non XDP paths
if needed.

> Can we do warn_once here? so at least something in dmesg points out
> that performance is not as expected. Am I reading it correctly that
> you had to do a special kernel hack to trigger this situation and
> in all normal cases it's not the case?
> 

Correct the only way to produce this with upstream qemu and Linux is to
write a kernel hack to build these types of buffers. AFAIK I caught all
the cases where it could happen otherwise in the setup xdp prog call and
required user to configure device driver so they can not happen e.g.
disable LRO, set MTU < PAGE_SIZE, etc.

If I missed anything, I don't see it now, I would call it a bug and fix
it. Again AFAIK everything should be covered though. As Michael pointed
out earlier, although unexpected, it's not strictly against virtio spec to
build such packets so we should handle it at least in a degraded mode
even though it is not expected in any qemu cases.

.John



Re: [net-next] neigh: remove duplicate check for same neigh

2016-11-29 Thread David Ahern
On 11/29/16 6:27 PM, 张胜举 wrote:
> This has less changes, but I preferred to add a new function to do the
> filter logic, this is the same as your code  @rtnl_dump_ifinfo().

I'd prefer it to stay in line as it is now. You can save the cycles of 
filtering on already viewed entries without an overhaul of this code.



Re: [PATCH net-next] hv_netvsc: remove excessive logging on MTU change

2016-11-29 Thread David Miller
From: Vitaly Kuznetsov 
Date: Mon, 28 Nov 2016 18:25:44 +0100

> When we change MTU or the number of channels on a netvsc device we get the
> following logged:
> 
>  hv_netvsc bf5edba8...: net device safe to remove
>  hv_netvsc: hv_netvsc channel opened successfully
>  hv_netvsc bf5edba8...: Send section size: 6144, Section count:2560
>  hv_netvsc bf5edba8...: Device MAC 00:15:5d:1e:91:12 link state up
> 
> This information is useful as debug at most.
> 
> Signed-off-by: Vitaly Kuznetsov 

Applied, thanks.


Re: [patch net-next 0/4] mlxsw: couple of enhancements and fixes

2016-11-29 Thread David Miller
From: Jiri Pirko 
Date: Mon, 28 Nov 2016 18:01:22 +0100

> Couple of enhancements and fixes from Ido.

Series applied, thanks Jiri.


Re: [PATCH 1/1] GSO: Reload iph after pskb_may_pull

2016-11-29 Thread David Miller
From: Arnaldo Carvalho de Melo 
Date: Mon, 28 Nov 2016 12:36:58 -0300

> As it may get stale and lead to use after free.
> 
> Acked-by: Eric Dumazet 
> Cc: Alexander Duyck 
> Cc: Andrey Konovalov 
> Fixes: cbc53e08a793 ("GSO: Add GSO type for fixed IPv4 ID")
> Signed-off-by: Arnaldo Carvalho de Melo 

Applied and queued up for -stable, thanks Arnaldo!


Re: [patch net] sched: cls_flower: remove from hashtable only in case skip sw flag is not set

2016-11-29 Thread David Miller
From: Jiri Pirko 
Date: Mon, 28 Nov 2016 15:40:13 +0100

> From: Jiri Pirko 
> 
> Be symmetric to hashtable insert and remove filter from hashtable only
> in case skip sw flag is not set.
> 
> Fixes: e69985c67c33 ("net/sched: cls_flower: Introduce support in SKIP SW 
> flag")
> Signed-off-by: Jiri Pirko 

Applied, thanks Jiri.


Re: [PATCH net] net/dccp: fix use-after-free in dccp_invalid_packet

2016-11-29 Thread David Miller
From: Eric Dumazet 
Date: Mon, 28 Nov 2016 06:26:49 -0800

> From: Eric Dumazet 
> 
> pskb_may_pull() can reallocate skb->head, we need to reload dh pointer
> in dccp_invalid_packet() or risk use after free.
> 
> Bug found by Andrey Konovalov using syzkaller.
> 
> Signed-off-by: Eric Dumazet 
> Reported-by: Andrey Konovalov 

Applied and queued up for -stable, thanks Eric.


Re: [PATCH net-next v2] mlxsw: switchib: add MLXSW_PCI dependency

2016-11-29 Thread David Miller
From: Arnd Bergmann 
Date: Mon, 28 Nov 2016 15:26:04 +0100

> The newly added switchib driver fails to link if MLXSW_PCI=m:
> 
> drivers/net/ethernet/mellanox/mlxsw/mlxsw_switchib.o: In 
> function^Cmlxsw_sib_module_exit':
> switchib.c:(.exit.text+0x8): undefined reference to 
> `mlxsw_pci_driver_unregister'
> switchib.c:(.exit.text+0x10): undefined reference to 
> `mlxsw_pci_driver_unregister'
> drivers/net/ethernet/mellanox/mlxsw/mlxsw_switchib.o: In function 
> `mlxsw_sib_module_init':
> switchib.c:(.init.text+0x28): undefined reference to 
> `mlxsw_pci_driver_register'
> switchib.c:(.init.text+0x38): undefined reference to 
> `mlxsw_pci_driver_register'
> switchib.c:(.init.text+0x48): undefined reference to 
> `mlxsw_pci_driver_unregister'
> 
> The other two such sub-drivers have a dependency, so add the same one
> here. In theory we could allow this driver if MLXSW_PCI is disabled,
> but it's probably not worth it.
> 
> Fixes: d1ba52638456 ("mlxsw: switchib: Introduce SwitchIB and SwitchIB 
> silicon driver")
> Reviewed-by: Jiri Pirko 
> Signed-off-by: Arnd Bergmann 
> ---
> v2: add Fixes tag

Applied, thanks Arnd.


Re: [PATCH v2] net: phy: Fix use after free in phy_detach()

2016-11-29 Thread David Miller
From: Geert Uytterhoeven 
Date: Mon, 28 Nov 2016 15:18:31 +0100

> If device_release_driver(>mdio.dev) is called, it releases all
> resources belonging to the PHY device. Hence the subsequent call to
> phy_led_triggers_unregister() will access already freed memory when
> unregistering the LEDs.
> 
> Move the call to phy_led_triggers_unregister() before the possible call
> to device_release_driver() to fix this.
> 
> Fixes: 2e0bc452f4721520 ("net: phy: leds: add support for led triggers on phy 
> link state change")
> Signed-off-by: Geert Uytterhoeven 
> ---
> This is v2 of "[RFC] net: phy: Fix double free in phy_detach()".
> 
> v2:
>   - Dropped RFC,
>   - Reworded, as commit a7dac9f9c1695d74 ("phy: fix error case of
> phy_led_triggers_(un)register") fixed the double free, and thus the
> warning I was seeing during "poweroff" on sh73a0/kzm9g,
>   - Verified use after free using CONFIG_DEBUG_DEVRES, log_devres = 1,
> and additional debug code printing the address of
> phy->phy_led_triggers. Adding poisoning of freed memory to
> devres_log() will cause a crash.

Applied, thanks.


Re: [PATCH v2 1/1] net: macb: ensure ordering write to re-enable RX smoothly

2016-11-29 Thread David Miller
From: Zumeng Chen 
Date: Mon, 28 Nov 2016 21:55:00 +0800

> When a hardware issue happened as described by inline comments, the register
> write pattern looks like the following:
> 
> 
> + wmb();
> 
> 
> There might be a memory barrier between these two write operations, so add wmb
> to ensure an flip from 0 to 1 for NCR.
> 
> Signed-off-by: Zumeng Chen 
> ---
> 
> V2 changes:
> 
> Add the same wmb for at91ether as well based on reviewer's suggestion.

Applied, thanks.


RE: [net-next] neigh: remove duplicate check for same neigh

2016-11-29 Thread 张胜举
> -Original Message-
> From: David Ahern [mailto:d...@cumulusnetworks.com]
> Sent: Wednesday, November 30, 2016 12:23 AM
> To: Zhang Shengju ;
> netdev@vger.kernel.org
> Subject: Re: [net-next] neigh: remove duplicate check for same neigh
> 
> On 11/29/16 1:22 AM, Zhang Shengju wrote:
> > @@ -2285,20 +2295,15 @@ static int neigh_dump_table(struct neigh_table
> *tbl, struct sk_buff *skb,
> > rcu_read_lock_bh();
> > nht = rcu_dereference_bh(tbl->nht);
> >
> > -   for (h = s_h; h < (1 << nht->hash_shift); h++) {
> > -   if (h > s_h)
> > -   s_idx = 0;
> > +   for (h = s_h; h < (1 << nht->hash_shift); h++, s_idx = 0) {
> > for (n = rcu_dereference_bh(nht->hash_buckets[h]), idx = 0;
> >  n != NULL;
> > -n = rcu_dereference_bh(n->next)) {
> > -   if (!net_eq(dev_net(n->dev), net))
> > -   continue;
> > -   if (neigh_ifindex_filtered(n->dev, filter_idx))
> > +n = rcu_dereference_bh(n->next), idx++) {
> 
> incrementing idx here ...
> 
> > +   if (idx < s_idx || !net_eq(dev_net(n->dev), net))
> > continue;
> > -   if (neigh_master_filtered(n->dev,
filter_master_idx))
> > +   if (neigh_dump_filtered(n->dev, filter_idx,
> > +   filter_master_idx))
> > continue;
> > -   if (idx < s_idx)
> > -   goto next;
> > if (neigh_fill_info(skb, n, NETLINK_CB(cb-
> >skb).portid,
> > cb->nlh->nlmsg_seq,
> > RTM_NEWNEIGH,
> 
> ... causes a missed entry when an error happens here
> 
> idx is only incremented when a neigh has been successfully added to the
skb.

If an error happens, we just goto 'out' and exit this loop, this will skip
the 'idx++'.  
So no missed entry, right?

> 
> 
> Your intention is to save a few cycles by making idx an absolute index
within
> the hash bucket and jumping to the next entry to be dumped without
> evaluating any filters per entry.
> 
> So why not keep it simple and do this:

This has less changes, but I preferred to add a new function to do the
filter logic, this is the same as your code  @rtnl_dump_ifinfo().

And for the 'idx', it should be increased when a neigh entry is filtered out or
is successfully added to the skb. Isn't it straight-forward to put it in for
loop?

> 
> diff --git a/net/core/neighbour.c b/net/core/neighbour.c index
> 2ae929f9bd06..2e49a85e696a 100644
> --- a/net/core/neighbour.c
> +++ b/net/core/neighbour.c
> @@ -2291,14 +2291,12 @@ static int neigh_dump_table(struct neigh_table
> *tbl, struct sk_buff *skb,
> for (n = rcu_dereference_bh(nht->hash_buckets[h]), idx =
0;
>  n != NULL;
>  n = rcu_dereference_bh(n->next)) {
> -   if (!net_eq(dev_net(n->dev), net))
> -   continue;
> -   if (neigh_ifindex_filtered(n->dev, filter_idx))
> -   continue;
> -   if (neigh_master_filtered(n->dev,
filter_master_idx))
> -   continue;
> if (idx < s_idx)
> goto next;
> +   if (!net_eq(dev_net(n->dev), net) ||
> +   neigh_ifindex_filtered(n->dev, filter_idx) ||
> +   neigh_master_filtered(n->dev,
filter_master_idx))
> +   goto next;
> if (neigh_fill_info(skb, n,
NETLINK_CB(cb->skb).portid,
> cb->nlh->nlmsg_seq,
> RTM_NEWNEIGH,





Re: [PATCH net-next v3 0/4] Fix OdroidC2 Gigabit Tx link issue

2016-11-29 Thread Florian Fainelli
On 11/29/2016 05:13 PM, David Miller wrote:
> From: Florian Fainelli 
> Date: Tue, 29 Nov 2016 16:43:20 -0800
> 
>> On 11/29/2016 04:38 PM, David Miller wrote:
>>> From: Jerome Brunet 
>>> Date: Mon, 28 Nov 2016 10:46:45 +0100
>>>
 This patchset fixes an issue with the OdroidC2 board (DWMAC + RTL8211F).
 The platform seems to enter LPI on the Rx path too often while performing
 relatively high TX transfer. This eventually break the link (both Tx and
 Rx), and require to bring the interface down and up again to get the Rx
 path working again.

 The root cause of this issue is not fully understood yet but disabling EEE
 advertisement on the PHY prevent this feature to be negotiated.
 With this change, the link is stable and reliable, with the expected
 throughput performance.

 The patchset adds options in the generic phy driver to disable EEE
 advertisement, through device tree. The way it is done is very similar
 to the handling of the max-speed property.
>>>
>>> Patches 1-3 applied to net-next, thanks.
>>
>> Meh, there was a v4 submitted shortly after, and I objected to the whole
>> idea of using that kind of Device Tree properties to disable EEE, we can
>> send reverts though..
> 
> Sorry, I lost this in all the discussion, I can revert.

Yeah, I can understand why, these freaking PHYs tend to generate a lot
of noise and discussion...

> 
> Just send me a revert of the entire merge commit
> a152c91889556df17ca6d8ea134fb2cb4ac9f893 with a short
> description of why and I'll apply it.

OK, I will talk with Jerome first to make sure that we are in agreement
with the solution to deploy to fix the OdroidC2 problem first.

Thanks!
-- 
Florian


Re: [PATCH net-next v3 0/4] Fix OdroidC2 Gigabit Tx link issue

2016-11-29 Thread David Miller
From: Florian Fainelli 
Date: Tue, 29 Nov 2016 16:43:20 -0800

> On 11/29/2016 04:38 PM, David Miller wrote:
>> From: Jerome Brunet 
>> Date: Mon, 28 Nov 2016 10:46:45 +0100
>> 
>>> This patchset fixes an issue with the OdroidC2 board (DWMAC + RTL8211F).
>>> The platform seems to enter LPI on the Rx path too often while performing
>>> relatively high TX transfer. This eventually break the link (both Tx and
>>> Rx), and require to bring the interface down and up again to get the Rx
>>> path working again.
>>>
>>> The root cause of this issue is not fully understood yet but disabling EEE
>>> advertisement on the PHY prevent this feature to be negotiated.
>>> With this change, the link is stable and reliable, with the expected
>>> throughput performance.
>>>
>>> The patchset adds options in the generic phy driver to disable EEE
>>> advertisement, through device tree. The way it is done is very similar
>>> to the handling of the max-speed property.
>> 
>> Patches 1-3 applied to net-next, thanks.
> 
> Meh, there was a v4 submitted shortly after, and I objected to the whole
> idea of using that kind of Device Tree properties to disable EEE, we can
> send reverts though..

Sorry, I lost this in all the discussion, I can revert.

Just send me a revert of the entire merge commit
a152c91889556df17ca6d8ea134fb2cb4ac9f893 with a short
description of why and I'll apply it.

Thanks.


Re: [PATCH net-next v5 2/3] bpf: Add new cgroup attach type to enable sock modifications

2016-11-29 Thread David Ahern
On 11/29/16 5:59 PM, Alexei Starovoitov wrote:
> On Tue, Nov 29, 2016 at 05:43:08PM -0700, David Ahern wrote:
>> On 11/29/16 1:01 PM, Alexei Starovoitov wrote:
>>> Could you also expose sk_protcol and sk_type as read only fields?
>>
>> Those are bitfields in struct sock, so can't use offsetof or sizeof. Any 
>> existing use cases that try to load a bitfield in a bpf that I can look at?
> 
> pkt_type, vlan are also bitfields in skb. Please see convert_skb_access()
> There is a bit of ugliness due to __BIG_ENDIAN_BITFIELD though..
> 

Given the added complexity I'd prefer to defer this second use case to a follow 
on patch set. This one introduces the infra for sockets and I don't see 
anything needing to change with it to add the read of 3 more sock elements. 
Agree?


Re: [PATCH net v2 1/1] net: macb: fix the RX queue reset in macb_rx()

2016-11-29 Thread David Miller
From: Cyrille Pitchen 
Date: Mon, 28 Nov 2016 14:40:55 +0100

> On macb only (not gem), when a RX queue corruption was detected from
> macb_rx(), the RX queue was reset: during this process the RX ring
> buffer descriptor was initialized by macb_init_rx_ring() but we forgot
> to also set bp->rx_tail to 0.
> 
> Indeed, when processing the received frames, bp->rx_tail provides the
> macb driver with the index in the RX ring buffer of the next buffer to
> process. So when the whole ring buffer is reset we must also reset
> bp->rx_tail so the driver is synchronized again with the hardware.
> 
> Since macb_init_rx_ring() is called from many locations, currently from
> macb_rx() and macb_init_rings(), we'd rather add the "bp->rx_tail = 0;"
> line inside macb_init_rx_ring() than add the very same line after each
> call of this function.
> 
> Without this fix, the rx queue is not reset properly to recover from
> queue corruption and connection drop may occur.
> 
> Signed-off-by: Cyrille Pitchen 
> Fixes: 9ba723b081a2 ("net: macb: remove BUG_ON() and reset the queue to 
> handle RX errors")
> Acked-by: Nicolas Ferre 

Applied and queued up for -stable, thanks.


Re: [PATCH net-next v5 2/3] bpf: Add new cgroup attach type to enable sock modifications

2016-11-29 Thread Alexei Starovoitov
On Tue, Nov 29, 2016 at 05:43:08PM -0700, David Ahern wrote:
> On 11/29/16 1:01 PM, Alexei Starovoitov wrote:
> > Could you also expose sk_protcol and sk_type as read only fields?
> 
> Those are bitfields in struct sock, so can't use offsetof or sizeof. Any 
> existing use cases that try to load a bitfield in a bpf that I can look at?

pkt_type, vlan are also bitfields in skb. Please see convert_skb_access()
There is a bit of ugliness due to __BIG_ENDIAN_BITFIELD though..


Re: [PATCH] stmmac: fix comments, make debug output consistent

2016-11-29 Thread David Miller
From: Pavel Machek 
Date: Mon, 28 Nov 2016 12:55:59 +0100

> Fix comments, add some new, and make debugfs output consistent.
> 
> Signed-off-by: Pavel Machek 

Applied to net-next, thanks.


Re: [PATCH net-next v2] bpf: cgroup: fix documentation of __cgroup_bpf_update()

2016-11-29 Thread David Miller
From: Daniel Mack 
Date: Mon, 28 Nov 2016 14:11:04 +0100

> There's a 'not' missing in one paragraph. Add it.
> 
> Signed-off-by: Daniel Mack 
> Reported-by: Rami Rosen 
> Fixes: 3007098494be ("cgroup: add support for eBPF programs")

Applied, thanks.


[PATCH net-next 1/1] driver: ipvlan: Remove useless member mtu_adj of struct ipvl_dev

2016-11-29 Thread fgao
From: Gao Feng 

The mtu_adj member is initialized to zero when the memory is allocated,
and there is no other assignment to mtu_adj afterwards. It is only read
in ipvlan_adjust_mtu, always as zero.
So it is a useless member of struct ipvl_dev; remove it.

Signed-off-by: Gao Feng 
---
 drivers/net/ipvlan/ipvlan.h  | 1 -
 drivers/net/ipvlan/ipvlan_main.c | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h
index 7e0732f..05a62d2 100644
--- a/drivers/net/ipvlan/ipvlan.h
+++ b/drivers/net/ipvlan/ipvlan.h
@@ -73,7 +73,6 @@ struct ipvl_dev {
DECLARE_BITMAP(mac_filters, IPVLAN_MAC_FILTER_SIZE);
netdev_features_t   sfeatures;
u32 msg_enable;
-   u16 mtu_adj;
 };
 
 struct ipvl_addr {
diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index ab90b22..c6aa667 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -32,7 +32,7 @@
 
 static void ipvlan_adjust_mtu(struct ipvl_dev *ipvlan, struct net_device *dev)
 {
-   ipvlan->dev->mtu = dev->mtu - ipvlan->mtu_adj;
+   ipvlan->dev->mtu = dev->mtu;
 }
 
 static int ipvlan_register_nf_hook(void)
-- 
1.9.1




Re: Crash due to mutex genl_lock called from RCU context

2016-11-29 Thread David Miller
From: Herbert Xu 
Date: Mon, 28 Nov 2016 19:22:12 +0800

> netlink: Call cb->done from a worker thread
> 
> The cb->done interface expects to be called in process context.
> This was broken by the netlink RCU conversion.  This patch fixes
> it by adding a worker struct to make the cb->done call where
> necessary.
> 
> Fixes: 21e4902aea80 ("netlink: Lockless lookup with RCU grace...")
> Reported-by: Subash Abhinov Kasiviswanathan 
> Signed-off-by: Herbert Xu 

Applied and queued up for -stable, thanks Herbert.


Re: [PATCH net V2] net/sched: pedit: make sure that offset is valid

2016-11-29 Thread David Miller
From: Amir Vadai 
Date: Mon, 28 Nov 2016 12:56:40 +0200

> Add a validation function to make sure offset is valid:
> 1. Not below skb head (could happen when offset is negative).
> 2. Validate both 'offset' and 'at'.
> 
> Signed-off-by: Amir Vadai 

Applied and queued up for -stable, thanks.


Re: [PATCH net-next v3 0/4] Fix OdroidC2 Gigabit Tx link issue

2016-11-29 Thread Florian Fainelli
On 11/29/2016 04:38 PM, David Miller wrote:
> From: Jerome Brunet 
> Date: Mon, 28 Nov 2016 10:46:45 +0100
> 
>> This patchset fixes an issue with the OdroidC2 board (DWMAC + RTL8211F).
>> The platform seems to enter LPI on the Rx path too often while performing
>> relatively high TX transfer. This eventually break the link (both Tx and
>> Rx), and require to bring the interface down and up again to get the Rx
>> path working again.
>>
>> The root cause of this issue is not fully understood yet but disabling EEE
>> advertisement on the PHY prevent this feature to be negotiated.
>> With this change, the link is stable and reliable, with the expected
>> throughput performance.
>>
>> The patchset adds options in the generic phy driver to disable EEE
>> advertisement, through device tree. The way it is done is very similar
>> to the handling of the max-speed property.
> 
> Patches 1-3 applied to net-next, thanks.

Meh, there was a v4 submitted shortly after, and I objected to the whole
idea of using that kind of Device Tree properties to disable EEE, we can
send reverts though..
-- 
Florian


Re: [PATCH net-next v5 2/3] bpf: Add new cgroup attach type to enable sock modifications

2016-11-29 Thread David Ahern
On 11/29/16 1:01 PM, Alexei Starovoitov wrote:
> Could you also expose sk_protcol and sk_type as read only fields?

Those are bitfields in struct sock, so can't use offsetof or sizeof. Any 
existing use cases that try to load a bitfield in a bpf that I can look at?


Re: [PATCH] cpsw: ethtool: add support for nway reset

2016-11-29 Thread David Miller
From: yegorsli...@googlemail.com
Date: Mon, 28 Nov 2016 10:47:52 +0100

> From: Yegor Yefremov 
> 
> This patch adds support for ethtool's '-r' command. Restarting
> N-WAY negotiation can be useful to activate newly changed EEE
> settings etc.
> 
> Signed-off-by: Yegor Yefremov 

This doesn't apply cleanly to net-next.


Re: [PATCH net-next v3 0/4] Fix OdroidC2 Gigabit Tx link issue

2016-11-29 Thread David Miller
From: Jerome Brunet 
Date: Mon, 28 Nov 2016 10:46:45 +0100

> This patchset fixes an issue with the OdroidC2 board (DWMAC + RTL8211F).
> The platform seems to enter LPI on the Rx path too often while performing
> relatively high TX transfer. This eventually break the link (both Tx and
> Rx), and require to bring the interface down and up again to get the Rx
> path working again.
> 
> The root cause of this issue is not fully understood yet but disabling EEE
> advertisement on the PHY prevent this feature to be negotiated.
> With this change, the link is stable and reliable, with the expected
> throughput performance.
> 
> The patchset adds options in the generic phy driver to disable EEE
> advertisement, through device tree. The way it is done is very similar
> to the handling of the max-speed property.

Patches 1-3 applied to net-next, thanks.


Re: [net-next PATCH v3 6/6] virtio_net: xdp, add slowpath case for non contiguous buffers

2016-11-29 Thread Alexei Starovoitov
On Tue, Nov 29, 2016 at 12:11:33PM -0800, John Fastabend wrote:
> virtio_net XDP support expects receive buffers to be contiguous.
> If this is not the case we enable a slowpath to allow connectivity
> to continue but at a significant performance overhead associated with
> linearizing data. To make users painfully aware that XDP is
> running in a degraded mode we throw an xdp buffer error.
> 
> To linearize packets we allocate a page and copy the segments of
> the data, including the header, into it. After this the page can be
> handled by XDP code flow as normal.
> 
> Then depending on the return code the page is either freed or sent
> to the XDP xmit path. There is no attempt to optimize this path.
> 
> Signed-off-by: John Fastabend 
...
> +/* The conditions to enable XDP should preclude the underlying device from
> + * sending packets across multiple buffers (num_buf > 1). However per spec
> + * it does not appear to be illegal to do so but rather just against 
> convention.
> + * So in order to avoid making a system unresponsive the packets are pushed
> + * into a page and the XDP program is run. This will be extremely slow and we
> + * push a warning to the user to fix this as soon as possible. Fixing this 
> may
> + * require resolving the underlying hardware to determine why multiple 
> buffers
> + * are being received or simply loading the XDP program in the ingress stack
> + * after the skb is built because there is no advantage to running it here
> + * anymore.
> + */
...
>   if (num_buf > 1) {
>   bpf_warn_invalid_xdp_buffer();
> - goto err_xdp;
> +
> + /* linearize data for XDP */
> + xdp_page = xdp_linearize_page(rq, num_buf,
> +   page, offset, );
> + if (!xdp_page)
> + goto err_xdp;

in case when we're 'lucky' the performance will silently be bad.
Can we do warn_once here? so at least something in dmesg points out
that performance is not as expected. Am I reading it correctly that
you had to do a special kernel hack to trigger this situation and
in all normal cases it's not the case?



Re: bpf debug info

2016-11-29 Thread Alexei Starovoitov
On Tue, Nov 29, 2016 at 09:23:10PM +0100, Daniel Borkmann wrote:
> On 11/29/2016 07:51 PM, Alexei Starovoitov wrote:
> >On Tue, Nov 29, 2016 at 03:38:18PM +, Jakub Kicinski wrote:
> [...]
> So next step is to improve verifier messages to be more human friendly.
> The step after is to introduce BPF_COMMENT pseudo instruction
> that will be ignored by the interpreter yet it will contain the text
> of original source code. Then llvm-objdump step won't be necessary.
> The bpf loader will load both instructions and pieces of C sources.
> Then verifier errors should be even easier to read and humans
> can easily understand the purpose of the program.
> >>>
> >>>So the BPF_COMMENT pseudo insn will get stripped away from the insn array
> >>>after verification step, so we don't need to hold/account for this mem? I
> >>>assume in it's ->imm member it will just hold offset into text blob?
> >>
> >>Associating any form of opaque data with programs always makes me
> >>worried about opening a side channel of communication with a specialized
> >>user space implementations/compilers.  But I guess if the BPF_COMMENTs
> >>are stripped in the verifier as Daniel assumes drivers and JITs will
> >>never see it.
> >
> >yes. the idea that it's a comment. It can contain any text,
> >not only C code, but any other language.
> >It's definitely going to be stripped before JITs and kernel will
> >not make any safety or translation decisions based on such comment.
> >
> >>Just to clarify, however - is there any reason why pushing the source
> >>code into the kernel is necessary?  Or is it just for convenience?
> >>Provided the user space loader has access to the debug info it should
> >>have no problems matching the verifier output to code lines?
> >
> >correct. just for convenience. The user space has to keep .o around,
> >since it can crash, would have to reload and so on.
> >Only for some script that ssh-es into servers and wants to see
> >what is being loaded, it might help to dump full asm and these comments
> >along with prog_digest that Daniel is working on in parallel.
> 
> Which would mean we'd need to keep it around somewhere (prog aux data?)
> in post-verification time (so potentially drivers/JITs could see it, too,
> just not inside insn stream). Some API glue code could probably blind
> this information for the JITing time to stop incentive of playing side
> channel games (e.g. core code could encrypt the pointer value and only
> core kernel knows how to access that data, no modules, no out-of-tree
> code). The other thing I'm wondering is, when we strip this info anyway
> from the insn stream to keep it in aux data (so it can later be reconstructed
> on a dump), then perhaps that is best done before prog loading time? It
> would then allow to keep complexity with stripping that insns out of the
> verifier. If semantics are that these comments are acting as a hole/gap
> (in a similar sense of what we have with cBPF today), then it can never
> become a jmp target and loaders could strip it out already (instead of
> teaching DFS, etc about it), and prepare a meta data structure in bpf_attr
> for bpf(2), and verifier works based on that one. What makes this problematic
> however is when you have rewrites in the kernel (ctx access, constant
> blinding, etc), but perhaps they could just adjust the offsets from that
> meta data thing as well?

yes. all correct.
if we keep comment==nop instructions as part of the program, it's not great,
since we'll be wasting performance for no strong reason.
If we remove them from instruction stream after the verifier then they
can only be useful in verifier messages and that's not much better
than existing 'llvm-objdump -S file.o' approach.
Hmm.

> >Alternatively instead of doing BPF_COMMENT we can load the whole .o
> >as-is into bpffs as a blob. Later (based on digest) the kernel can
> >dump such .o back for user space to run objdump on. It all can be
> >done without kernel involvement. Like tc command can copy .o and so on.
> >But not everything is using tc.
> 
> That means kernel must ensure/verify that loaded insns also come from
> that claimed object file; not sure if easily possible w/o parsing elf.
> It could work if the kernel loads everything based on the content of
> the object file itself,

it will only check that program section of file contain valid insns.
the user may still cheat with junk in dwarf section of such elf.
Sounds more and more that this should really be solved by user space
and correlation to be done via prog_digest.



Re: [PATCH net-next 2/2] tcp: allow to turn tcp timestamp randomization off

2016-11-29 Thread Florian Westphal
Eric Dumazet  wrote:
> On Tue, 2016-11-29 at 16:45 +0100, Florian Westphal wrote:
> > Eric says: "By looking at tcpdump, and TS val of xmit packets of multiple
> > flows, we can deduct the relative qdisc delays (think of fq pacing).
> > This should work even if we have one flow per remote peer."
> > 
> > Having random per flow (or host) offsets doesn't allow that anymore so add
> > a way to turn this off.
> > 
> > Suggested-by: Eric Dumazet 
> > Signed-off-by: Florian Westphal 
> > ---
> 
> Excellent, thanks !
> 
> Acked-by: Eric Dumazet 

Thanks for the ack, I missed connect() side though so this doesn't work
for outgoing connections, sorry :-/

I will send a v2.


Re: [PATCH net-next 5/5] udp: add recvmmsg implementation

2016-11-29 Thread David Miller
From: Hannes Frederic Sowa 
Date: Fri, 25 Nov 2016 18:09:00 +0100

> During review we discussed on how to handle major errors in the kernel:
> 
> The old code and the new code still can report back success even though
> the kernel got back an EFAULT while copying from kernel space to user
> space (due to bad pointers).
> 
> I favor that we drop all packets (also the already received batches) in
> this case and let the code report -EFAULT and increase sk_drops for all
> dropped packets from the queue.
> 
> Currently sk_err is set so the next syscall would get an -EFAULT, which
> seems very bad and can also be overwritten by incoming icmp packets, so
> we never get a notification that we actually had a bad pointer somewhere
> in the mmsghdr. Also delivering -EFAULT on the follow-up syscalls really
> will make people confused that use strace.
> 
> If people would like to know the amount of packets dropped we can make
> sk_drops readable by an getsockopt.
> 
> Thoughts?
> 
> Unfortunately the interface doesn't allow for better error handling.

I think this is a major problem.

If, as a side effect of batch dequeueing the SKBs from the socket,
you cannot stop properly mid-transfer if an error occurs, well then
you simply cannot batch like that.

You have to stop the exact byte where an error occurs mid-stream,
return the successful amount of bytes transferred, and then return
the error on the next recvmmsg call.

There is no other sane error reporting strategy.

If I get 4 frames, and the kernel can successfully copy the first
three and get an -EFAULT on the 4th.  Dammit you better tell the
application this so it can properly process the first 3 packets and
then determine how it is going to error out and recover for the 4th
one.

If we need to add prioritized sk_err stuff, or another value like
"sk_app_err" to handle the ICMP vs. -EFAULT issue, so be it.

I know what you guys are thinking, in that you can't figure out a
way to avoid the transactional overhead if it is necessary to
"put back" some SKBs if one of them in the batch gets a fault.

That's too bad, we need a proper implementation and proper error
reporting.  Those performance numbers are useless if we effectively
lose error notifications.


Re: [PATCH] net: ethernet: ti: cpsw: fix ASSERT_RTNL() warning during resume

2016-11-29 Thread Ivan Khoronzhuk
On Tue, Nov 29, 2016 at 04:27:03PM -0600, Grygorii Strashko wrote:
> netif_set_real_num_tx/rx_queues() are required to be called with rtnl_lock
> taken, otherwise ASSERT_RTNL() warning will be triggered - which happens
> now during System resume from suspend:
> cpsw_resume()
> |- cpsw_ndo_open()
>   |- netif_set_real_num_tx/rx_queues()
>  |- ASSERT_RTNL();
> 
> Hence, fix it by surrounding cpsw_ndo_open() by rtnl_lock/unlock() calls.
> 
> Cc: Dave Gerlach 
> Cc: Ivan Khoronzhuk 
> Fixes: commit e05107e6b747 ("net: ethernet: ti: cpsw: add multi queue 
> support")
> Signed-off-by: Grygorii Strashko 
Reviewed-by: Ivan Khoronzhuk 

> ---
>  drivers/net/ethernet/ti/cpsw.c | 4 
>  1 file changed, 4 insertions(+)
> 
> diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
> index ae1ec6a..fd6c03b 100644
> --- a/drivers/net/ethernet/ti/cpsw.c
> +++ b/drivers/net/ethernet/ti/cpsw.c
> @@ -2944,6 +2944,8 @@ static int cpsw_resume(struct device *dev)
>   /* Select default pin state */
>   pinctrl_pm_select_default_state(dev);
>  
> + /* shut up ASSERT_RTNL() warning in netif_set_real_num_tx/rx_queues */
> + rtnl_lock();
>   if (cpsw->data.dual_emac) {
>   int i;
>  
> @@ -2955,6 +2957,8 @@ static int cpsw_resume(struct device *dev)
>   if (netif_running(ndev))
>   cpsw_ndo_open(ndev);
>   }
> + rtnl_unlock();
> +
>   return 0;
>  }
>  #endif
> -- 
> 2.10.1
> 


Re: [PATCH net-next v3 4/4] bpf: Add tests and samples for LWT-BPF

2016-11-29 Thread Alexei Starovoitov
On Tue, Nov 29, 2016 at 02:21:23PM +0100, Thomas Graf wrote:
> Adds a series of test to verify the functionality of attaching
> BPF programs at LWT hooks.
> 
> Also adds a sample which collects a histogram of packet sizes which
> pass through an LWT hook.
> 
> $ ./lwt_len_hist.sh
> Starting netserver with host 'IN(6)ADDR_ANY' port '12865' and family AF_UNSPEC
> MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 
> 192.168.253.2 () port 0 AF_INET : demo
> Recv   SendSend
> Socket Socket  Message  Elapsed
> Size   SizeSize Time Throughput
> bytes  bytes   bytessecs.10^6bits/sec
> 
>  87380  16384  1638410.0039857.69

Nice!

> + ret = bpf_redirect(ifindex, 0);
> + if (ret < 0) {
> + printk("bpf_redirect() failed: %d\n", ret);
> + return BPF_DROP;
> + }

this 'if' looks a bit weird. You're passing 0 as flags,
so this helper will always succeed.
Other sample code often does 'return bpf_redirect(...)'
due to this reasoning.



Re: [PATCH net-next v3 3/4] bpf: BPF for lightweight tunnel infrastructure

2016-11-29 Thread Alexei Starovoitov
On Tue, Nov 29, 2016 at 02:21:22PM +0100, Thomas Graf wrote:
> Registers new BPF program types which correspond to the LWT hooks:
>   - BPF_PROG_TYPE_LWT_IN   => dst_input()
>   - BPF_PROG_TYPE_LWT_OUT  => dst_output()
>   - BPF_PROG_TYPE_LWT_XMIT => lwtunnel_xmit()
> 
> The separate program types are required to differentiate between the
> capabilities each LWT hook allows:
> 
>  * Programs attached to dst_input() or dst_output() are restricted and
>may only read the data of an skb. This prevents modification and
>possible invalidation of already validated packet headers on receive
>and the construction of illegal headers while the IP headers are
>still being assembled.
> 
>  * Programs attached to lwtunnel_xmit() are allowed to modify packet
>content as well as prepending an L2 header via a newly introduced
>helper bpf_skb_push(). This is safe as lwtunnel_xmit() is invoked
>after the IP header has been assembled completely.
> 
> All BPF programs receive an skb with L3 headers attached and may return
> one of the following error codes:
> 
>  BPF_OK - Continue routing as per nexthop
>  BPF_DROP - Drop skb and return EPERM
>  BPF_REDIRECT - Redirect skb to device as per redirect() helper.
> (Only valid in lwtunnel_xmit() context)
> 
> The return codes are binary compatible with their TC_ACT_
> relatives to ease compatibility.
> 
> Signed-off-by: Thomas Graf 
...
> +#define LWT_BPF_MAX_HEADROOM 128

why 128?
btw I'm thinking for XDP to use 256, so metadata can be stored in there.

> +static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
> +struct dst_entry *dst, bool can_redirect)
> +{
> + int ret;
> +
> + /* Preempt disable is needed to protect per-cpu redirect_info between
> +  * BPF prog and skb_do_redirect(). The call_rcu in bpf_prog_put() and
> +  * access to maps strictly require a rcu_read_lock() for protection,
> +  * mixing with BH RCU lock doesn't work.
> +  */
> + preempt_disable();
> + rcu_read_lock();
> + bpf_compute_data_end(skb);
> + ret = BPF_PROG_RUN(lwt->prog, skb);
> + rcu_read_unlock();
> +
> + switch (ret) {
> + case BPF_OK:
> + break;
> +
> + case BPF_REDIRECT:
> + if (!can_redirect) {
> + WARN_ONCE(1, "Illegal redirect return code in prog 
> %s\n",
> +   lwt->name ? : "");
> + ret = BPF_OK;
> + } else {
> + ret = skb_do_redirect(skb);

I think this assumes that the program did bpf_skb_push and the L2 header is present.
Would it make sense to check that mac_header < network_header here to make
sure that it actually happened? I think the cost of a single 'if' isn't much.
Also skb_do_redirect() can redirect to l3 tunnels like ipip ;)
so program shouldn't be doing bpf_skb_push in such case...
May be rename bpf_skb_push to bpf_skb_push_l2 ?
since it's doing skb_reset_mac_header(skb); at the end of it?
Or it's probably better to use 'flags' argument to tell whether
bpf_skb_push() should set mac_header or not ? Then this bit:

> + case BPF_OK:
> + /* If the L3 header was expanded, headroom might be too
> +  * small for L2 header now, expand as needed.
> +  */
> + ret = xmit_check_hhlen(skb);

will work fine as well...
which probably needs "mac_header wasn't set" check? or it's fine?

All bpf bits look great. Thanks!



Re: [PATCH] net: macb: Write only necessary bits in NCR in macb reset

2016-11-29 Thread David Miller
From: Harini Katakam 
Date: Mon, 28 Nov 2016 14:53:49 +0530

> In macb_reset_hw, use read-modify-write to disable RX and TX.
> This way existing settings and reserved bits won't be disturbed.
> Use the same method for clearing statistics as well.
> 
> Signed-off-by: Harini Katakam 

This doesn't make much sense to me.

Consider the two callers of this function.

macb_init_hw() is going to do a non-masking write to the NCR
register:

/* Enable TX and RX */
macb_writel(bp, NCR, MACB_BIT(RE) | MACB_BIT(TE) | MACB_BIT(MPE));

So obviously no other writable fields matter at all for programming
the chip properly, otherwise macb_init_hw() would "or" in the bits
after a read of NCR.  But that's not what this code does, it
writes "RE | TE | MPE" directly.

And the other caller is macb_close() which is shutting down the
chip so can zero out all the other bits and it can't possibly
matter, also due to the assertion above about macb_init_hw()
showing that only the RE, TE, and MPE bits matter for proper
functioning of the chip.

You haven't shown an issue caused by the way the code works now, so
this patch isn't fixing a bug.  In fact, the "bit preserving" would
even be misleading to someone reading the code.  They will ask
themselves what bits need to be preserved, and as shown above none of
them need to be.

I'm not applying this, sorry.



Re: [PATCH] net: stmmac: enable tx queue 0 for gmac4 IPs synthesized with multiple TX queues

2016-11-29 Thread David Miller
From: Niklas Cassel 
Date: Thu, 24 Nov 2016 15:36:33 +0100

> From: Niklas Cassel 
> 
> The dwmac4 IP can synthesized with 1-8 number of tx queues.
> On an IP synthesized with DWC_EQOS_NUM_TXQ > 1, all txqueues are disabled
> by default. For these IPs, the bitfield TXQEN is R/W.
> 
> Always enable tx queue 0. The write will have no effect on IPs synthesized
> with DWC_EQOS_NUM_TXQ == 1.
> 
> The driver does still not utilize more than one tx queue in the IP.
> 
> Signed-off-by: Niklas Cassel 

Applied to net-next, thanks.


RE: [PATCH net-netx] net: lan78xx: add LAN7801 MAC-only support

2016-11-29 Thread Woojung.Huh
> > It seems phy_unregister_fixup() will be needed for module network driver.
> > phy_fixup_list keeps the list even after unloading module.
> > Do you know any update is waiting for submission? If not, I'll make patch.
> 
> Oh, yes, that's a good point, we need such a thing, so far fixups have
> been exclusively used by code that is built-in, but there really is not
> a reason for that. Please go ahead and cook a patch for this, thanks!

OK. Will do it.

Thanks.
- Woojung


Re: [PATCH] net: arc_emac: add dependencies on associated arches and compile test

2016-11-29 Thread David Miller
From: Peter Robinson 
Date: Mon, 28 Nov 2016 07:12:37 +

> Add dependencies on the architectures that support these devices and
> add compile test to ensure ongoing code build coverage.
> 
> Signed-off-by: Peter Robinson 

Applied.


Re: [PATCH net-netx] net: lan78xx: add LAN7801 MAC-only support

2016-11-29 Thread Florian Fainelli
On 11/29/2016 03:55 PM, woojung@microchip.com wrote:
>> There are two ways to get these settings propagated to the PHY driver:
>>
>> - using a board fixup which is going to be invoked during
>> drv->config_init() time
>>
>> - specifying a phydev->dev_flags and reading it from the PHY driver to
>> act upon and configure the PHY based on that value, there are only
>> 32-bits available though, and you need to make sure they are not
>> conflicting with other potential users in tree
>>
>> My preference would go with 1, since you could just register it in your
>> PHY driver and re-use the code you are proposing to include here.
> 
> Florian,
> 
> It seems phy_unregister_fixup() will be needed for module network driver.
> phy_fixup_list keeps the list even after unloading module.
> Do you know any update is waiting for submission? If not, I'll make patch.

Oh, yes, that's a good point, we need such a thing, so far fixups have
been exclusively used by code that is built-in, but there really is not
a reason for that. Please go ahead and cook a patch for this, thanks!
-- 
Florian


RE: [PATCH net-netx] net: lan78xx: add LAN7801 MAC-only support

2016-11-29 Thread Woojung.Huh
> There are two ways to get these settings propagated to the PHY driver:
> 
> - using a board fixup which is going to be invoked during
> drv->config_init() time
> 
> - specifying a phydev->dev_flags and reading it from the PHY driver to
> act upon and configure the PHY based on that value, there are only
> 32-bits available though, and you need to make sure they are not
> conflicting with other potential users in tree
> 
> My preference would go with 1, since you could just register it in your
> PHY driver and re-use the code you are proposing to include here.

Florian,

It seems phy_unregister_fixup() will be needed for modular network drivers.
phy_fixup_list keeps the fixups registered even after the module is unloaded.
Do you know if any update is waiting for submission? If not, I'll make a patch.

Thanks.
Woojung


Re: [PATCH v2 net-next 2/2] openvswitch: Fix skb->protocol for vlan frames.

2016-11-29 Thread Jarno Rajahalme

> On Nov 28, 2016, at 11:21 PM, Pravin Shelar  wrote:
> 
> On Mon, Nov 28, 2016 at 6:41 PM, Jarno Rajahalme  wrote:
>> Do not set skb->protocol to be the ethertype of the L3 header, unless
>> the packet only has the L3 header.  For a non-hardware offloaded VLAN
>> frame skb->protocol needs to be one of the VLAN ethertypes.
>> 
>> Any VLAN offloading is undone on the OVS netlink interface.  Also any
>> VLAN tags added by userspace are non-offloaded.
>> 
>> Incorrect skb->protocol value on a full-size non-offloaded VLAN skb
>> causes packet drop due to failing MTU check, as the VLAN header should
>> not be counted in when considering MTU in ovs_vport_send().
>> 
> I think we should move to is_skb_forwardable() type of packet length
> check in vport-send and get rid of skb-protocol checks altogether.
> 

I added a new patch to do this, thanks for the suggestion.

>> Fixes: 5108bbaddc ("openvswitch: add processing of L3 packets")
>> Signed-off-by: Jarno Rajahalme 
>> ---
>> v2: Set skb->protocol when an ETH_P_TEB frame is received via ARPHRD_NONE
>>interface.
>> 
>> net/openvswitch/datapath.c |  1 -
>> net/openvswitch/flow.c | 30 ++
>> 2 files changed, 22 insertions(+), 9 deletions(-)
> ...
> ...
>> @@ -531,15 +538,22 @@ static int key_extract(struct sk_buff *skb, struct 
>> sw_flow_key *key)
>>if (unlikely(parse_vlan(skb, key)))
>>return -ENOMEM;
>> 
>> -   skb->protocol = parse_ethertype(skb);
>> -   if (unlikely(skb->protocol == htons(0)))
>> +   key->eth.type = parse_ethertype(skb);
>> +   if (unlikely(key->eth.type == htons(0)))
>>return -ENOMEM;
>> 
>> +   if (skb->protocol == htons(ETH_P_TEB)) {
>> +   if (key->eth.vlan.tci & htons(VLAN_TAG_PRESENT)
>> +   && !skb_vlan_tag_present(skb))
>> +   skb->protocol = key->eth.vlan.tpid;
>> +   else
>> +   skb->protocol = key->eth.type;
>> +   }
>> +
> 
> I am not sure if this work in case of nested vlans.
> Can we move skb-protocol assignment to parse_vlan() to avoid checking
> for non-accelerated vlan case again here?

I did this for the v3 that I sent out a moment ago.

Thanks for the review,

  Jarno




Re: [PATCH net-next 1/7] net/mlx5e: Implement Fragmented Work Queue (WQ)

2016-11-29 Thread Eric Dumazet
On Wed, 2016-11-30 at 00:19 +0200, Saeed Mahameed wrote:
> From: Tariq Toukan 
> 
> Add new type of struct mlx5_frag_buf which is used to allocate fragmented
> buffers rather than contiguous, and make the Completion Queues (CQs) use
> it as they are big (default of 2MB per CQ in Striding RQ).
> 
> This fixes the failures of type:
> "mlx5e_open_locked: mlx5e_open_channels failed, -12"
> due to dma_zalloc_coherent insufficient contiguous coherent memory to
> satisfy the driver's request when the user tries to setup more or larger
> rings.
> 
> Signed-off-by: Tariq Toukan 
> Reported-by: Sebastian Ott 
> Signed-off-by: Saeed Mahameed 
> ---
>  drivers/net/ethernet/mellanox/mlx5/core/alloc.c   | 66 
> +++
>  drivers/net/ethernet/mellanox/mlx5/core/en.h  |  2 +-
>  drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 10 ++--
>  drivers/net/ethernet/mellanox/mlx5/core/wq.c  | 26 ++---
>  drivers/net/ethernet/mellanox/mlx5/core/wq.h  | 18 +--
>  include/linux/mlx5/driver.h   | 11 
>  6 files changed, 116 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/alloc.c 
> b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c
> index 2c6e3c7..bc8357d 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/alloc.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c
> @@ -106,6 +106,63 @@ void mlx5_buf_free(struct mlx5_core_dev *dev, struct 
> mlx5_buf *buf)
>  }
>  EXPORT_SYMBOL_GPL(mlx5_buf_free);
>  
> +int mlx5_frag_buf_alloc_node(struct mlx5_core_dev *dev, int size,
> +  struct mlx5_frag_buf *buf, int node)
> +{
> + int i;
> +
> + buf->size = size;
> + buf->npages = 1 << get_order(size);
> + buf->page_shift = PAGE_SHIFT;
> + buf->frags = kcalloc(buf->npages, sizeof(struct mlx5_buf_list),
> +  GFP_KERNEL);
> + if (!buf->frags)
> + goto err_out;
> +
> + for (i = 0; i < buf->npages; i++) {
> + struct mlx5_buf_list *frag = >frags[i];
> + int frag_sz = min_t(int, size, PAGE_SIZE);
> +
> + frag->buf = mlx5_dma_zalloc_coherent_node(dev, frag_sz,
> +   >map, node);
> + if (!frag->buf)
> + goto err_free_buf;
> + if (frag->map & ((1 << buf->page_shift) - 1)) {
> + dma_free_coherent(>pdev->dev, frag_sz,
> +   buf->frags[i].buf, buf->frags[i].map);

There is a bug if this happens with i = 0

> + mlx5_core_warn(dev, "unexpected map alignment: 0x%p, 
> page_shift=%d\n",
> +(void *)frag->map, buf->page_shift);
> + goto err_free_buf;
> + }
> + size -= frag_sz;
> + }
> +
> + return 0;
> +
> +err_free_buf:
> + while (--i)

Because this loop will be done about 2^32 times.

> + dma_free_coherent(>pdev->dev, PAGE_SIZE, buf->frags[i].buf,
> +   buf->frags[i].map);
> + kfree(buf->frags);
> +err_out:
> + return -ENOMEM;
> +}






[PATCH v3 net-next 2/3] openvswitch: Use is_skb_forwardable() for length check.

2016-11-29 Thread Jarno Rajahalme
Use is_skb_forwardable() instead of an explicit length check.  This
gets around the apparent MTU check failure in OVS test cases when
skb->protocol is not properly set in case of non-accelerated VLAN
skbs.

Suggested-by: Pravin Shelar 
Fixes: 5108bbaddc ("openvswitch: add processing of L3 packets")
Signed-off-by: Jarno Rajahalme 
---
v3: New patch suggested by Pravin.

net/openvswitch/vport.c | 38 ++
 1 file changed, 14 insertions(+), 24 deletions(-)

diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index b6c8524..076b39f 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -464,27 +464,8 @@ int ovs_vport_receive(struct vport *vport, struct sk_buff 
*skb,
return 0;
 }
 
-static unsigned int packet_length(const struct sk_buff *skb,
- struct net_device *dev)
-{
-   unsigned int length = skb->len - dev->hard_header_len;
-
-   if (!skb_vlan_tag_present(skb) &&
-   eth_type_vlan(skb->protocol))
-   length -= VLAN_HLEN;
-
-   /* Don't subtract for multiple VLAN tags. Most (all?) drivers allow
-* (ETH_LEN + VLAN_HLEN) in addition to the mtu value, but almost none
-* account for 802.1ad. e.g. is_skb_forwardable().
-*/
-
-   return length;
-}
-
 void ovs_vport_send(struct vport *vport, struct sk_buff *skb, u8 mac_proto)
 {
-   int mtu = vport->dev->mtu;
-
switch (vport->dev->type) {
case ARPHRD_NONE:
if (mac_proto == MAC_PROTO_ETHERNET) {
@@ -504,11 +485,20 @@ void ovs_vport_send(struct vport *vport, struct sk_buff 
*skb, u8 mac_proto)
goto drop;
}
 
-   if (unlikely(packet_length(skb, vport->dev) > mtu &&
-!skb_is_gso(skb))) {
-   net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n",
-vport->dev->name,
-packet_length(skb, vport->dev), mtu);
+   if (unlikely(!is_skb_forwardable(vport->dev, skb))) {
+   /* Log only if the device is up. */
+   if (vport->dev->flags & IFF_UP) {
+   unsigned int length = skb->len
+   - vport->dev->hard_header_len;
+
+   if (!skb_vlan_tag_present(skb)
+   && eth_type_vlan(skb->protocol))
+   length -= VLAN_HLEN;
+
+   net_warn_ratelimited("%s: dropped over-mtu packet %d > 
%d\n",
+vport->dev->name, length,
+vport->dev->mtu);
+   }
vport->dev->stats.tx_errors++;
goto drop;
}
-- 
2.1.4



[PATCH v3 net-next 1/3] openvswitch: Add a missing break statement.

2016-11-29 Thread Jarno Rajahalme
Add a break statement to prevent fall-through from
OVS_KEY_ATTR_ETHERNET to OVS_KEY_ATTR_TUNNEL.  Without the break
actions setting ethernet addresses fail to validate with log messages
complaining about invalid tunnel attributes.

Fixes: 0a6410fbde ("openvswitch: netlink: support L3 packets")
Signed-off-by: Jarno Rajahalme 
Acked-by: Pravin B Shelar 
Acked-by: Jiri Benc 
---
v3: No change.

 net/openvswitch/flow_netlink.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index d19044f..c87d359 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -2195,6 +2195,7 @@ static int validate_set(const struct nlattr *a,
case OVS_KEY_ATTR_ETHERNET:
if (mac_proto != MAC_PROTO_ETHERNET)
return -EINVAL;
+   break;
 
case OVS_KEY_ATTR_TUNNEL:
if (masked)
-- 
2.1.4



[PATCH v3 net-next 3/3] openvswitch: Fix skb->protocol for vlan frames.

2016-11-29 Thread Jarno Rajahalme
Do not always set skb->protocol to be the ethertype of the L3 header.
For a packet with non-accelerated VLAN tags skb->protocol needs to be
the ethertype of the outermost non-accelerated VLAN ethertype.

Any VLAN offloading is undone on the OVS netlink interface, and any
VLAN tags added by userspace are non-accelerated, as are double tagged
VLAN packets.

Fixes: 018c1dda5f ("openvswitch: 802.1AD Flow handling, actions, vlan parsing, 
netlink attributes")
Fixes: 5108bbaddc ("openvswitch: add processing of L3 packets")
Signed-off-by: Jarno Rajahalme 
---
v3: Set skb->protocol properly also for double tagged packets, as suggested
by Pravin.  This patch is no longer needed for the MTU test failure, as
the new patch 2/3 addresses that.

 net/openvswitch/datapath.c |  1 -
 net/openvswitch/flow.c | 62 +++---
 2 files changed, 31 insertions(+), 32 deletions(-)

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 2d4c4d3..9c62b63 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -606,7 +606,6 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, 
struct genl_info *info)
rcu_assign_pointer(flow->sf_acts, acts);
packet->priority = flow->key.phy.priority;
packet->mark = flow->key.phy.skb_mark;
-   packet->protocol = flow->key.eth.type;
 
rcu_read_lock();
dp = get_dp_rcu(net, ovs_header->dp_ifindex);
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 08aa926..e2fe2e5 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -354,6 +354,7 @@ static int parse_vlan(struct sk_buff *skb, struct 
sw_flow_key *key)
res = parse_vlan_tag(skb, >eth.vlan);
if (res <= 0)
return res;
+   skb->protocol = key->eth.vlan.tpid;
}
 
/* Parse inner vlan tag. */
@@ -361,6 +362,11 @@ static int parse_vlan(struct sk_buff *skb, struct 
sw_flow_key *key)
if (res <= 0)
return res;
 
+   /* If the outer vlan tag was accelerated, skb->protocol should
+* refelect the inner vlan type. */
+   if (!eth_type_vlan(skb->protocol))
+   skb->protocol = key->eth.cvlan.tpid;
+
return 0;
 }
 
@@ -477,12 +483,18 @@ static int parse_icmpv6(struct sk_buff *skb, struct 
sw_flow_key *key,
 }
 
 /**
- * key_extract - extracts a flow key from an Ethernet frame.
+ * key_extract - extracts a flow key from a packet with or without an
+ * Ethernet header.
  * @skb: sk_buff that contains the frame, with skb->data pointing to the
- * Ethernet header
+ * beginning of the packet.
  * @key: output flow key
  *
- * The caller must ensure that skb->len >= ETH_HLEN.
+ * 'key->mac_proto' must be initialized to indicate the frame type.  For an L3
+ * frame 'key->mac_proto' must equal 'MAC_PROTO_NONE', and the caller must
+ * ensure that 'skb->protocol' is set to the ethertype of the L3 header.
+ * Otherwise the presence of an Ethernet header is assumed and the caller must
+ * ensure that skb->len >= ETH_HLEN and that 'skb->protocol' is initialized to
+ * zero.
  *
  * Returns 0 if successful, otherwise a negative errno value.
  *
@@ -498,8 +510,9 @@ static int parse_icmpv6(struct sk_buff *skb, struct 
sw_flow_key *key,
  *  of a correct length, otherwise the same as skb->network_header.
  *  For other key->eth.type values it is left untouched.
  *
- *- skb->protocol: the type of the data starting at skb->network_header.
- *  Equals to key->eth.type.
+ *- skb->protocol: For non-accelerated VLAN, one of the VLAN ether types,
+ *  otherwise the same as key->eth.type, the ether type of the payload
+ *  starting at skb->network_header.
  */
 static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)
 {
@@ -531,15 +544,16 @@ static int key_extract(struct sk_buff *skb, struct 
sw_flow_key *key)
if (unlikely(parse_vlan(skb, key)))
return -ENOMEM;
 
-   skb->protocol = parse_ethertype(skb);
-   if (unlikely(skb->protocol == htons(0)))
+   key->eth.type = parse_ethertype(skb);
+   if (unlikely(key->eth.type == htons(0)))
return -ENOMEM;
 
skb_reset_network_header(skb);
__skb_push(skb, skb->data - skb_mac_header(skb));
}
skb_reset_mac_len(skb);
-   key->eth.type = skb->protocol;
+   if (!eth_type_vlan(skb->protocol))
+   skb->protocol = key->eth.type;
 
/* Network layer. */
if (key->eth.type == htons(ETH_P_IP)) {
@@ -800,29 +814,15 @@ int ovs_flow_key_extract_userspace(struct net *net, const 
struct nlattr *attr,
if (err)
return err;
 
-   if (ovs_key_mac_proto(key) == MAC_PROTO_NONE) {
-   /* key_extract assumes that skb->protocol is set-up for
-* layer 3 packets which is the 

Re: bnx2 breaks Dell R815 BMC IPMI since 4.8

2016-11-29 Thread Gavin Shan
On Tue, Nov 29, 2016 at 07:57:51AM +0100, Brice Goglin wrote:
>Hello
>
>My Dell PowerEdge R815 doesn't have IPMI anymore when I boot a 4.8
>kernel, the BMC doesn't even ping anymore. Its Ethernet devices are 4 of
>those:
>
>01:00.0 Ethernet controller: Broadcom Corporation NetXtreme II BCM5709 Gigabit 
>Ethernet (rev 20)
>   DeviceName: Embedded NIC 1  
>   Subsystem: Dell NetXtreme II BCM5709 Gigabit Ethernet
>   Control: I/O- Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- 
> Stepping- SERR- FastB2B- DisINTx+
>   Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast >TAbort-  SERR-Latency: 0, Cache Line Size: 64 bytes
>   Interrupt: pin A routed to IRQ 42
>   Region 0: Memory at e600 (64-bit, non-prefetchable) [size=32M]
>   Capabilities: 
>   Kernel driver in use: bnx2
>   Kernel modules: bnx2
>
>The only change in bnx2 between 4.7 and 4.8 appears to be this one:
>
>commit 3e1be7ad2d38c6bd6aeef96df9bd0a7822f4e51c
>Author: Baoquan He 
>Date:   Fri Sep 9 22:43:12 2016 +0800
>
>bnx2: Reset device during driver initialization
>
>Could you patch actually break the BMC? What do I need to further debug
>this issue?
>

Brice, could you please confirm NCSI is enabled on BMC? It seems NCSI
AEN packets aren't handled by NCSI stack on BMC side when resetting NIC
on host side. Those AEN packets usually help to reconfigure the (active)
NCSI channel and bring it back to service after the reset.

Thanks,
Gavin



RE: [PATCH net-netx] net: lan78xx: add LAN7801 MAC-only support

2016-11-29 Thread Woojung.Huh
> There are two ways to get these settings propagated to the PHY driver:
> 
> - using a board fixup which is going to be invoked during
> drv->config_init() time
> 
> - specifying a phydev->dev_flags and reading it from the PHY driver to
> act upon and configure the PHY based on that value, there are only
> 32-bits available though, and you need to make sure they are not
> conflicting with other potential users in tree
> 
> My preference would go with 1, since you could just register it in your
> PHY driver and re-use the code you are proposing to include here.

fixup looks good to me too. Will submit revision soon.

Thanks.
- Woojung


Re: [PATCH net] fib_trie: Avoid expensive update of suffix length when not required

2016-11-29 Thread Alexander Duyck
On Tue, Nov 29, 2016 at 9:46 AM, Robert Shearman  wrote:
> With certain distributions of routes it can take a long time to add
> and delete further routes at scale. For example, with a route for
> 10.37.96.0/20 present it takes 47s to add ~200k contiguous /24 routes
> from 8.0.0.0/24 through to 11.138.207.0/24. Perf shows the bottleneck
> is update_suffix:
>
>   40.39%  [kernel]  [k] update_suffix
>8.02%  libnl-3.so.200.19.0   [.] nl_hash_table_lookup

Well yeah, it will be expensive when you have over 512K entries in a
single node.  I'm assuming that is the case based on the fact that
200K routes will double up in the trie node due to broadcast and the
route ending up in adjacent key vectors.

> With these changes, the time is reduced to 4s for the same scale and
> distribution of routes.
>
> The issue is that update_suffix does an O(n) walk on the children of a
> node and the with a dense distribtion of routes the number of children
> in a node tends towards the number of nodes in the tree.

Other than the performance I was just wondering if you did any other
validation to confirm that nothing else differs.  Basically it would
just be a matter of verifying that /proc/net/fib_trie is the same
before and after your patch.

> In the add case it isn't necessary to walk all the other children to
> update the largest suffix length of the node (which is what
> update_suffix is doing) since we already know what the largest suffix
> length of all the other children is and we only need to update it if
> the new node/leaf has a larger suffix. However, it is currently called
> from resize which is called on any update to rebalance the trie.
>
> Therefore improve the scaling by moving the responsibility of
> recalculating the node's largest suffix length out of the resize
> function into its callers. For fib_insert_node this can be taken care
> of by extending put_child to not only update the largest suffix length
> in the parent node, but to propagate it all the way up the trie as
> required, using a new function, node_push_suffix, based on
> leaf_push_suffix, but renamed to reflect its purpose and made safe if
> the node has no parent.
>
> No changes are required to inflate/halve since the maximum suffix
> length of the sub-trie starting from the node's parent isn't changed,
> and the suffix lengths of the halved/inflated nodes are updated
> through the creation of the new nodes and put_child called to add the
> children to them.
>
> In both fib_table_flush and fib_table_flush_external, given that a
> walk of the entire trie is being done there's a choice of optimising
> either for the case of a small number of leafs being flushed by
> updating the suffix on a change and propagating it up, or optimising
> for large numbers of leafs being flushed by deferring the updating of
> the largest suffix length to the walk up to the parent node. I opted
> for the latter to keep the algorithm linear in complexity in the case
> of flushing all leafs and because it is close to the status quo.
>
> Fixes: 5405afd1a306 ("fib_trie: Add tracking value for suffix length")
> Signed-off-by: Robert Shearman 

So I am not entirely convinced this is a regression, I was wondering
what your numbers looked like before the patch you are saying this
fixes?  I recall I fixed a number of issues leading up to this so I am
just curious.

My thought is this seems more like a performance optimization than a
fix.  If that is the case this probably belongs in net-next.

It seems to me we might be able to simplify update_suffix rather than
going through all this change.  That might be something that is more
acceptable for net.  Have you looked at simply adding code so that
there is a break inside update_suffix if (slen == tn->slen)?  We don't
have to call it for if a leaf is added so it might make sense to add
that check.

> ---
>  net/ipv4/fib_trie.c | 86 
> ++---
>  1 file changed, 62 insertions(+), 24 deletions(-)
>
> diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
> index 026f309c51e9..701cae8af44a 100644
> --- a/net/ipv4/fib_trie.c
> +++ b/net/ipv4/fib_trie.c
> @@ -421,8 +421,22 @@ static inline int tnode_full(struct key_vector *tn, 
> struct key_vector *n)
> return n && ((n->pos + n->bits) == tn->pos) && IS_TNODE(n);
>  }
>
> +static void node_push_suffix(struct key_vector *tn, struct key_vector *l)
> +{
> +   while (tn->slen < l->slen) {
> +   tn->slen = l->slen;
> +   tn = node_parent(tn);
> +   if (!tn)
> +   break;

I don't think the NULL check is necessary, at least it wasn't with the old code.

> +   }
> +}
> +
>  /* Add a child at position i overwriting the old value.
>   * Update the value of full_children and empty_children.
> + *
> + * The suffix length of the parent node and the rest of the tree is
> + * updated (if required) 

Re: netlink: GPF in sock_sndtimeo

2016-11-29 Thread Cong Wang
On Tue, Nov 29, 2016 at 8:48 AM, Richard Guy Briggs  wrote:
> On 2016-11-26 17:11, Cong Wang wrote:
>> It is racy on audit_sock, especially on the netns exit path.
>
> I think that is the only place it is racy.  The other places audit_sock
> is set is when the socket failure has just triggered a reset.
>
> Is there a notifier callback for failed or reaped sockets?

Is NETLINK_URELEASE event what you are looking for?


[PATCH for-next 1/6] IB/hns: Fix the bug when destroy qp

2016-11-29 Thread Salil Mehta
From: "Wei Hu (Xavier)" 

If send queue is still working when qp is in reset state by modify qp
in destroy qp function, hardware will hang and not work in hip06
SoC. In current codes, RoCE driver check hardware pointer of sending and
hardware pointer of processing to ensure that hardware has processed all
the dbs of this qp. But while the environment of wire becomes not good,
The checking time maybe too long.

In order to solve this problem, RoCE driver created a workqueue at probe
function. If there is a timeout when checking the status of qp, driver
initialize work entry and push it into the workqueue, Work function will
finish checking and release the related resources later.

Signed-off-by: Wei Hu (Xavier) 
Signed-off-by: Lijun Ou 
Signed-off-by: Dongdong Huang(Donald) 
Signed-off-by: Salil Mehta 
---
 drivers/infiniband/hw/hns/hns_roce_common.h |   40 +++
 drivers/infiniband/hw/hns/hns_roce_hw_v1.c  |  435 +--
 drivers/infiniband/hw/hns/hns_roce_hw_v1.h  |   23 ++
 3 files changed, 402 insertions(+), 96 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_common.h 
b/drivers/infiniband/hw/hns/hns_roce_common.h
index 0dcb620..a055632 100644
--- a/drivers/infiniband/hw/hns/hns_roce_common.h
+++ b/drivers/infiniband/hw/hns/hns_roce_common.h
@@ -57,6 +57,32 @@
 #define roce_set_bit(origin, shift, val) \
roce_set_field((origin), (1ul << (shift)), (shift), (val))
 
+/*
+ * roce_hw_index_cmp_lt - Compare two hardware index values in hisilicon
+ *SOC, check if a is less than b.
+ * @a: hardware index value
+ * @b: hardware index value
+ * @bits: the number of bits of a and b, range: 0~31.
+ *
+ * Hardware index increases continuously till max value, and then restart
+ * from zero, again and again. Because the bits of reg field is often
+ * limited, the reg field can only hold the low bits of the hardware index
+ * in hisilicon SOC.
+ * In some scenes we need to compare two values(a,b) getted from two reg
+ * fields in this driver, for example:
+ * If a equals 0xfffe, b equals 0x1 and bits equals 16, we think b has
+ * incresed from 0x to 0x1 and a is less than b.
+ * If a equals 0xfffe, b equals 0x0xf001 and bits equals 16, we think a
+ * is bigger than b.
+ *
+ * Return true on a less than b, otherwise false.
+ */
+#define roce_hw_index_mask(bits)   ((1ul << (bits)) - 1)
+#define roce_hw_index_shift(bits)  (32 - (bits))
+#define roce_hw_index_cmp_lt(a, b, bits) \
+   ((int)a) - (b)) & roce_hw_index_mask(bits)) << \
+   roce_hw_index_shift(bits)) < 0)
+
 #define ROCEE_GLB_CFG_ROCEE_DB_SQ_MODE_S 3
 #define ROCEE_GLB_CFG_ROCEE_DB_OTH_MODE_S 4
 
@@ -245,10 +271,22 @@
 #define ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_M   \
(((1UL << 28) - 1) << ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_S)
 
+#define ROCEE_SDB_PTR_CMP_BITS 28
+
 #define ROCEE_SDB_INV_CNT_SDB_INV_CNT_S 0
 #define ROCEE_SDB_INV_CNT_SDB_INV_CNT_M   \
(((1UL << 16) - 1) << ROCEE_SDB_INV_CNT_SDB_INV_CNT_S)
 
+#define ROCEE_SDB_RETRY_CNT_SDB_RETRY_CT_S 0
+#define ROCEE_SDB_RETRY_CNT_SDB_RETRY_CT_M \
+   (((1UL << 16) - 1) << ROCEE_SDB_RETRY_CNT_SDB_RETRY_CT_S)
+
+#define ROCEE_SDB_CNT_CMP_BITS 16
+
+#define ROCEE_TSP_BP_ST_QH_FIFO_ENTRY_S20
+
+#define ROCEE_CNT_CLR_CE_CNT_CLR_CE_S 0
+
 /*ROCEE_REG DEFINITION/
 #define ROCEE_VENDOR_ID_REG0x0
 #define ROCEE_VENDOR_PART_ID_REG   0x4
@@ -317,6 +355,8 @@
 #define ROCEE_SDB_ISSUE_PTR_REG0x758
 #define ROCEE_SDB_SEND_PTR_REG 0x75C
 #define ROCEE_SDB_INV_CNT_REG  0x9A4
+#define ROCEE_SDB_RETRY_CNT_REG0x9AC
+#define ROCEE_TSP_BP_ST_REG0x9EC
 #define ROCEE_ECC_UCERR_ALM0_REG   0xB34
 #define ROCEE_ECC_CERR_ALM0_REG0xB40
 
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c 
b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
index 125ab90..aee1d01 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
@@ -948,6 +948,38 @@ int hns_roce_v1_reset(struct hns_roce_dev *hr_dev, bool 
dereset)
return ret;
 }
 
+static int hns_roce_des_qp_init(struct hns_roce_dev *hr_dev)
+{
+   struct device *dev = _dev->pdev->dev;
+   struct hns_roce_v1_priv *priv;
+   struct hns_roce_des_qp *des_qp;
+
+   priv = (struct hns_roce_v1_priv *)hr_dev->hw->priv;
+   des_qp = >des_qp;
+
+   des_qp->requeue_flag = 1;
+   des_qp->qp_wq = create_singlethread_workqueue("hns_roce_destroy_qp");
+   if (!des_qp->qp_wq) {
+   dev_err(dev, "Create destroy qp workqueue failed!\n");
+   return -ENOMEM;
+   }
+
+   return 0;
+}
+
+static void hns_roce_des_qp_free(struct hns_roce_dev *hr_dev)
+{
+   struct 

[PATCH for-next 6/6] IB/hns: Fix the IB device name

2016-11-29 Thread Salil Mehta
From: Lijun Ou 

This patch mainly fixes the name of the IB device in order
to match with libhns.

Signed-off-by: Lijun Ou 
Signed-off-by: Salil Mehta 
---
 drivers/infiniband/hw/hns/hns_roce_main.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c 
b/drivers/infiniband/hw/hns/hns_roce_main.c
index 28a8f24..eddb053 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -433,7 +433,7 @@ static int hns_roce_register_device(struct hns_roce_dev 
*hr_dev)
spin_lock_init(>lock);
 
ib_dev = _dev->ib_dev;
-   strlcpy(ib_dev->name, "hisi_%d", IB_DEVICE_NAME_MAX);
+   strlcpy(ib_dev->name, "hns_%d", IB_DEVICE_NAME_MAX);
 
ib_dev->owner   = THIS_MODULE;
ib_dev->node_type   = RDMA_NODE_IB_CA;
-- 
1.7.9.5




Re: qed, qedi patchset submission

2016-11-29 Thread Arun Easi
Hi David,

As we are preparing to post V3 of the qed (drivers/net/) and qedi 
(drivers/scsi/), I would like to get your opinion on the "qedi" part. As 
qedi (the iSCSI driver) is dependent on the accompanying "qed" changes, 
qedi changes have to follow or be together with qed changes. Martin is ok 
in having you taking the qedi part as well (see below discussion) or wait 
until qed is in, and then take qedi through SCSI tree.

Hi David, Martin,

So far, we have been posting qedi changes split into functional blocks, 
for review, but was not bisectable. With Martin ok to our request to 
squash all patches while committing to tree, we were wondering if we 
should post the qedi patches squashed, with all the Reviewed-by added, or 
continue to post as before?

Regards,
-Arun

On Mon, 14 Nov 2016, 3:47pm, Martin K. Petersen wrote:

> > "Arun" == Arun Easi  writes:
> 
> Arun,
> 
> Arun> Do you have any preference or thoughts on how the "qed" patches be
> Arun> approached? Just as a reference, our rdma driver "qedr" went
> Arun> through something similar[1], and eventually "qed" patches were
> Arun> taken by David in the net tree and "qedr", in the rdma tree
> Arun> (obviously) by Doug L.
> 
> DaveM can queue the whole series or I can hold the SCSI pieces for
> rc1. Either way works for me.
> 
> However, I would like to do a review of the driver first. I will try to
> get it done this week.
> 
> 


[PATCH for-next 5/6] IB/hns: Fix the bug when free cq

2016-11-29 Thread Salil Mehta
From: Shaobo Xu 

If the resources of cq are freed while executing the user case, hardware
cannot be notified in hip06 SoC. Then hardware will hold on when it
writes the cq buffer which has been released.

In order to solve this problem, RoCE driver checks the CQE counter, and
ensure that the outstanding CQE have been written. Then the cq buffer
can be released.

Signed-off-by: Shaobo Xu 
Reviewed-by: Wei Hu (Xavier) 
Signed-off-by: Salil Mehta 
---
 drivers/infiniband/hw/hns/hns_roce_common.h |2 +
 drivers/infiniband/hw/hns/hns_roce_cq.c |   27 --
 drivers/infiniband/hw/hns/hns_roce_device.h |8 
 drivers/infiniband/hw/hns/hns_roce_hw_v1.c  |   53 +++
 4 files changed, 79 insertions(+), 11 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_common.h 
b/drivers/infiniband/hw/hns/hns_roce_common.h
index a055632..4af403e 100644
--- a/drivers/infiniband/hw/hns/hns_roce_common.h
+++ b/drivers/infiniband/hw/hns/hns_roce_common.h
@@ -354,6 +354,8 @@
 
 #define ROCEE_SDB_ISSUE_PTR_REG0x758
 #define ROCEE_SDB_SEND_PTR_REG 0x75C
+#define ROCEE_CAEP_CQE_WCMD_EMPTY  0x850
+#define ROCEE_SCAEP_WR_CQE_CNT 0x8D0
 #define ROCEE_SDB_INV_CNT_REG  0x9A4
 #define ROCEE_SDB_RETRY_CNT_REG0x9AC
 #define ROCEE_TSP_BP_ST_REG0x9EC
diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c 
b/drivers/infiniband/hw/hns/hns_roce_cq.c
index c9f6c3d..ff9a6a3 100644
--- a/drivers/infiniband/hw/hns/hns_roce_cq.c
+++ b/drivers/infiniband/hw/hns/hns_roce_cq.c
@@ -179,8 +179,7 @@ static int hns_roce_hw2sw_cq(struct hns_roce_dev *dev,
 HNS_ROCE_CMD_TIMEOUT_MSECS);
 }
 
-static void hns_roce_free_cq(struct hns_roce_dev *hr_dev,
-struct hns_roce_cq *hr_cq)
+void hns_roce_free_cq(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq)
 {
struct hns_roce_cq_table *cq_table = _dev->cq_table;
struct device *dev = _dev->pdev->dev;
@@ -392,19 +391,25 @@ int hns_roce_ib_destroy_cq(struct ib_cq *ib_cq)
 {
struct hns_roce_dev *hr_dev = to_hr_dev(ib_cq->device);
struct hns_roce_cq *hr_cq = to_hr_cq(ib_cq);
+   int ret = 0;
 
-   hns_roce_free_cq(hr_dev, hr_cq);
-   hns_roce_mtt_cleanup(hr_dev, _cq->hr_buf.hr_mtt);
+   if (hr_dev->hw->destroy_cq) {
+   ret = hr_dev->hw->destroy_cq(ib_cq);
+   } else {
+   hns_roce_free_cq(hr_dev, hr_cq);
+   hns_roce_mtt_cleanup(hr_dev, _cq->hr_buf.hr_mtt);
 
-   if (ib_cq->uobject)
-   ib_umem_release(hr_cq->umem);
-   else
-   /* Free the buff of stored cq */
-   hns_roce_ib_free_cq_buf(hr_dev, _cq->hr_buf, ib_cq->cqe);
+   if (ib_cq->uobject)
+   ib_umem_release(hr_cq->umem);
+   else
+   /* Free the buff of stored cq */
+   hns_roce_ib_free_cq_buf(hr_dev, _cq->hr_buf,
+   ib_cq->cqe);
 
-   kfree(hr_cq);
+   kfree(hr_cq);
+   }
 
-   return 0;
+   return ret;
 }
 
 void hns_roce_cq_completion(struct hns_roce_dev *hr_dev, u32 cqn)
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h 
b/drivers/infiniband/hw/hns/hns_roce_device.h
index 1050829..d4f0fce 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -56,6 +56,12 @@
 #define HNS_ROCE_MAX_INNER_MTPT_NUM0x7
 #define HNS_ROCE_MAX_MTPT_PBL_NUM  0x10
 
+#define HNS_ROCE_EACH_FREE_CQ_WAIT_MSECS   20
+#define HNS_ROCE_MAX_FREE_CQ_WAIT_CNT  \
+   (5000 / HNS_ROCE_EACH_FREE_CQ_WAIT_MSECS)
+#define HNS_ROCE_CQE_WCMD_EMPTY_BIT0x2
+#define HNS_ROCE_MIN_CQE_CNT   16
+
 #define HNS_ROCE_MAX_IRQ_NUM   34
 
 #define HNS_ROCE_COMP_VEC_NUM  32
@@ -528,6 +534,7 @@ struct hns_roce_hw {
int (*req_notify_cq)(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
int (*poll_cq)(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
int (*dereg_mr)(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr);
+   int (*destroy_cq)(struct ib_cq *ibcq);
void*priv;
 };
 
@@ -734,6 +741,7 @@ struct ib_cq *hns_roce_ib_create_cq(struct ib_device 
*ib_dev,
struct ib_udata *udata);
 
 int hns_roce_ib_destroy_cq(struct ib_cq *ib_cq);
+void hns_roce_free_cq(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq);
 
 void hns_roce_cq_completion(struct hns_roce_dev *hr_dev, u32 cqn);
 void hns_roce_cq_event(struct hns_roce_dev *hr_dev, u32 cqn, int event_type);
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c 
b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c

[PATCH for-next 3/6] IB/hns: Fix the bug of setting port mtu

2016-11-29 Thread Salil Mehta
From: "Wei Hu (Xavier)" 

In hns_roce driver, we need not call iboe_get_mtu to reduce
IB headers from effective IBoE MTU because hr_dev->caps.max_mtu
has already been reduced.

Signed-off-by: Wei Hu (Xavier) 
Signed-off-by: Salil Mehta 
---
 drivers/infiniband/hw/hns/hns_roce_main.c |   16 ++--
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c 
b/drivers/infiniband/hw/hns/hns_roce_main.c
index 0cedec0..5e620f9 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -72,18 +72,6 @@ static void hns_roce_set_mac(struct hns_roce_dev *hr_dev, u8 
port, u8 *addr)
hr_dev->hw->set_mac(hr_dev, phy_port, addr);
 }
 
-static void hns_roce_set_mtu(struct hns_roce_dev *hr_dev, u8 port, int mtu)
-{
-   u8 phy_port = hr_dev->iboe.phy_port[port];
-   enum ib_mtu tmp;
-
-   tmp = iboe_get_mtu(mtu);
-   if (!tmp)
-   tmp = IB_MTU_256;
-
-   hr_dev->hw->set_mtu(hr_dev, phy_port, tmp);
-}
-
 static int hns_roce_add_gid(struct ib_device *device, u8 port_num,
unsigned int index, const union ib_gid *gid,
const struct ib_gid_attr *attr, void **context)
@@ -188,8 +176,8 @@ static int hns_roce_setup_mtu_mac(struct hns_roce_dev 
*hr_dev)
u8 i;
 
for (i = 0; i < hr_dev->caps.num_ports; i++) {
-   hns_roce_set_mtu(hr_dev, i,
-ib_mtu_enum_to_int(hr_dev->caps.max_mtu));
+   hr_dev->hw->set_mtu(hr_dev, hr_dev->iboe.phy_port[i],
+   hr_dev->caps.max_mtu);
hns_roce_set_mac(hr_dev, i, hr_dev->iboe.netdevs[i]->dev_addr);
}
 
-- 
1.7.9.5




[PATCH for-next 2/6] IB/hns: Fix the bug when free mr

2016-11-29 Thread Salil Mehta
From: Shaobo Xu 

If the resources of mr are freed while executing the user case, hardware
cannot be notified in hip06 SoC. Then hardware will hold on when it
reads the payload by the PA which has been released.

In order to solve this problem, RoCE driver creates 8 reserved loopback
QPs to ensure zero wqe when free mr. When the mac address is reset, in
order to avoid loopback failure, we need to release the reserved loopback
QPs and recreate them.

Signed-off-by: Shaobo Xu 
Reviewed-by: Wei Hu (Xavier) 
Signed-off-by: Salil Mehta 
---
 drivers/infiniband/hw/hns/hns_roce_cmd.h|5 -
 drivers/infiniband/hw/hns/hns_roce_device.h |   10 +
 drivers/infiniband/hw/hns/hns_roce_hw_v1.c  |  485 +++
 drivers/infiniband/hw/hns/hns_roce_hw_v1.h  |   34 ++
 drivers/infiniband/hw/hns/hns_roce_main.c   |5 +-
 drivers/infiniband/hw/hns/hns_roce_mr.c |   21 +-
 6 files changed, 545 insertions(+), 15 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_cmd.h 
b/drivers/infiniband/hw/hns/hns_roce_cmd.h
index ed14ad3..f5a9ee2 100644
--- a/drivers/infiniband/hw/hns/hns_roce_cmd.h
+++ b/drivers/infiniband/hw/hns/hns_roce_cmd.h
@@ -58,11 +58,6 @@ enum {
HNS_ROCE_CMD_QUERY_QP   = 0x22,
 };
 
-struct hns_roce_cmd_mailbox {
-   void   *buf;
-   dma_addr_t  dma;
-};
-
 int hns_roce_cmd_mbox(struct hns_roce_dev *hr_dev, u64 in_param, u64 out_param,
  unsigned long in_modifier, u8 op_modifier, u16 op,
  unsigned long timeout);
diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h 
b/drivers/infiniband/hw/hns/hns_roce_device.h
index e48464d..1050829 100644
--- a/drivers/infiniband/hw/hns/hns_roce_device.h
+++ b/drivers/infiniband/hw/hns/hns_roce_device.h
@@ -388,6 +388,11 @@ struct hns_roce_cmdq {
u8  toggle;
 };
 
+struct hns_roce_cmd_mailbox {
+   void   *buf;
+   dma_addr_t  dma;
+};
+
 struct hns_roce_dev;
 
 struct hns_roce_qp {
@@ -522,6 +527,7 @@ struct hns_roce_hw {
 struct ib_recv_wr **bad_recv_wr);
int (*req_notify_cq)(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
int (*poll_cq)(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
+   int (*dereg_mr)(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr);
void*priv;
 };
 
@@ -688,6 +694,10 @@ struct ib_mr *hns_roce_reg_user_mr(struct ib_pd *pd, u64 
start, u64 length,
   u64 virt_addr, int access_flags,
   struct ib_udata *udata);
 int hns_roce_dereg_mr(struct ib_mr *ibmr);
+int hns_roce_hw2sw_mpt(struct hns_roce_dev *hr_dev,
+  struct hns_roce_cmd_mailbox *mailbox,
+  unsigned long mpt_index);
+unsigned long key_to_hw_index(u32 key);
 
 void hns_roce_buf_free(struct hns_roce_dev *hr_dev, u32 size,
   struct hns_roce_buf *buf);
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c 
b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
index aee1d01..f67a3bf 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
@@ -295,6 +295,8 @@ int hns_roce_v1_post_send(struct ib_qp *ibqp, struct 
ib_send_wr *wr,
roce_set_field(sq_db.u32_4, SQ_DOORBELL_U32_4_SQ_HEAD_M,
   SQ_DOORBELL_U32_4_SQ_HEAD_S,
  (qp->sq.head & ((qp->sq.wqe_cnt << 1) - 1)));
+   roce_set_field(sq_db.u32_4, SQ_DOORBELL_U32_4_SL_M,
+  SQ_DOORBELL_U32_4_SL_S, qp->sl);
roce_set_field(sq_db.u32_4, SQ_DOORBELL_U32_4_PORT_M,
   SQ_DOORBELL_U32_4_PORT_S, qp->phy_port);
roce_set_field(sq_db.u32_8, SQ_DOORBELL_U32_8_QPN_M,
@@ -622,6 +624,213 @@ static int hns_roce_db_ext_init(struct hns_roce_dev 
*hr_dev, u32 sdb_ext_mod,
return ret;
 }
 
+static struct hns_roce_qp *hns_roce_v1_create_lp_qp(struct hns_roce_dev 
*hr_dev,
+   struct ib_pd *pd)
+{
+   struct device *dev = _dev->pdev->dev;
+   struct ib_qp_init_attr init_attr;
+   struct ib_qp *qp;
+
+   memset(_attr, 0, sizeof(struct ib_qp_init_attr));
+   init_attr.qp_type   = IB_QPT_RC;
+   init_attr.sq_sig_type   = IB_SIGNAL_ALL_WR;
+   init_attr.cap.max_recv_wr   = HNS_ROCE_MIN_WQE_NUM;
+   init_attr.cap.max_send_wr   = HNS_ROCE_MIN_WQE_NUM;
+
+   qp = hns_roce_create_qp(pd, _attr, NULL);
+   if (IS_ERR(qp)) {
+   dev_err(dev, "Create loop qp for mr free failed!");
+   return NULL;
+   }
+
+   return to_hr_qp(qp);
+}
+
+static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev)
+{
+   struct hns_roce_caps *caps = 

[PATCH for-next 4/6] IB/hns: Delete the redundant memset operation

2016-11-29 Thread Salil Mehta
From: "Wei Hu (Xavier)" 

This patch deletes the redundant memset operation, because the memory
allocated by ib_alloc_device has already been zeroed.

Signed-off-by: Wei Hu (Xavier) 
Signed-off-by: Salil Mehta 
---
 drivers/infiniband/hw/hns/hns_roce_main.c |3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c 
b/drivers/infiniband/hw/hns/hns_roce_main.c
index 5e620f9..28a8f24 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -843,9 +843,6 @@ static int hns_roce_probe(struct platform_device *pdev)
if (!hr_dev)
return -ENOMEM;
 
-   memset((u8 *)hr_dev + sizeof(struct ib_device), 0,
-   sizeof(struct hns_roce_dev) - sizeof(struct ib_device));
-
hr_dev->pdev = pdev;
platform_set_drvdata(pdev, hr_dev);
 
-- 
1.7.9.5




[PATCH for-next 0/6] IB/hns: Bug Fixes for HNS RoCE Driver

2016-11-29 Thread Salil Mehta
This patch-set contains bug fixes for the HNS RoCE driver.

Lijun Ou (1):
  IB/hns: Fix the IB device name

Shaobo Xu (2):
  IB/hns: Fix the bug when free mr
  IB/hns: Fix the bug when free cq

Wei Hu (Xavier) (3):
  IB/hns: Fix the bug when destroy qp
  IB/hns: Fix the bug of setting port mtu
  IB/hns: Delete the redundant memset operation

 drivers/infiniband/hw/hns/hns_roce_cmd.h|5 -
 drivers/infiniband/hw/hns/hns_roce_common.h |   42 ++
 drivers/infiniband/hw/hns/hns_roce_cq.c |   27 +-
 drivers/infiniband/hw/hns/hns_roce_device.h |   18 +
 drivers/infiniband/hw/hns/hns_roce_hw_v1.c  |  967 ---
 drivers/infiniband/hw/hns/hns_roce_hw_v1.h  |   57 ++
 drivers/infiniband/hw/hns/hns_roce_main.c   |   26 +-
 drivers/infiniband/hw/hns/hns_roce_mr.c |   21 +-
 8 files changed, 1026 insertions(+), 137 deletions(-)

-- 
1.7.9.5




[PATCH] ethtool: mark mask values as ULL values

2016-11-29 Thread Jacob Keller
If compiling with signed checks enabled, there will be warnings
generated by the ETHTOOL_RX_FLOW_SPEC_RING(_VF) masks. These are
currently marked as signed constants, which will generate warnings when
masking with unsigned values. Avoid this by marking them explicitly as
unsigned values.

Signed-off-by: Jacob Keller 
---
 include/uapi/linux/ethtool.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index ed650eef9e54..a716112b2b01 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -891,8 +891,8 @@ struct ethtool_rx_flow_spec {
  * space for this at this time. If a future patch consumes the next
  * byte it should be aware of this possiblity.
  */
-#define ETHTOOL_RX_FLOW_SPEC_RING  0x00000000FFFFFFFFLL
-#define ETHTOOL_RX_FLOW_SPEC_RING_VF   0x000000FF00000000LL
+#define ETHTOOL_RX_FLOW_SPEC_RING  0x00000000FFFFFFFFULL
+#define ETHTOOL_RX_FLOW_SPEC_RING_VF   0x000000FF00000000ULL
 #define ETHTOOL_RX_FLOW_SPEC_RING_VF_OFF 32
 static inline __u64 ethtool_get_flow_spec_ring(__u64 ring_cookie)
 {
-- 
2.11.0.rc2.152.g4d04e67



Re: 4.9-rc7: (forcedeth?) BUG: sleeping function called from invalid context at kernel/irq/manage.c:110

2016-11-29 Thread Eric Dumazet
On Tue, 2016-11-29 at 12:06 -0800, Linus Torvalds wrote:
> On Nov 29, 2016 11:58 AM, "Eric Dumazet" 
> wrote:
> >
> > nv_do_nic_poll() is simply buggy and needs a fix.
> >
> > synchronize_irq() can sleep.
> 
> Yes, but why did it start showing up now? None of this has changed as
> far as I can see?
> 
> Is it just timing and the transmit queue being busy? Perhaps due to
> the sheer size of messages? Meelis does seem to have a ton of
> debugging enabled..
> 
> Linus
> 
> 
Looks like bad luck really or forced_irqthreads ? 


These commits contain what is wanted.

02cea3958664723a5d2236f0f0058de97c7e4693 
genirq: Provide disable_hardirq()



18258f7239a61d8929b8e0c7b6d46c446459074c
genirq: Provide synchronize_hardirq()




[PATCH] net: ethernet: ti: cpsw: fix ASSERT_RTNL() warning during resume

2016-11-29 Thread Grygorii Strashko
netif_set_real_num_tx/rx_queues() are required to be called with rtnl_lock
taken, otherwise ASSERT_RTNL() warning will be triggered - which happens
now during System resume from suspend:
cpsw_resume()
|- cpsw_ndo_open()
  |- netif_set_real_num_tx/rx_queues()
 |- ASSERT_RTNL();

Hence, fix it by surrounding cpsw_ndo_open() by rtnl_lock/unlock() calls.

Cc: Dave Gerlach 
Cc: Ivan Khoronzhuk 
Fixes: commit e05107e6b747 ("net: ethernet: ti: cpsw: add multi queue support")
Signed-off-by: Grygorii Strashko 
---
 drivers/net/ethernet/ti/cpsw.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index ae1ec6a..fd6c03b 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -2944,6 +2944,8 @@ static int cpsw_resume(struct device *dev)
/* Select default pin state */
pinctrl_pm_select_default_state(dev);
 
+   /* shut up ASSERT_RTNL() warning in netif_set_real_num_tx/rx_queues */
+   rtnl_lock();
if (cpsw->data.dual_emac) {
int i;
 
@@ -2955,6 +2957,8 @@ static int cpsw_resume(struct device *dev)
if (netif_running(ndev))
cpsw_ndo_open(ndev);
}
+   rtnl_unlock();
+
return 0;
 }
 #endif
-- 
2.10.1



[PATCH net-next 4/7] net/mlx5e: Remove redundant hashtable lookup in configure flower

2016-11-29 Thread Saeed Mahameed
From: Roi Dayan 

We will never find a flow with the same cookie as cls_flower always
allocates a new flow and the cookie is the allocated memory address.

Fixes: e3a2b7ed018e ("net/mlx5e: Support offload cls_flower with drop action")
Signed-off-by: Roi Dayan 
Reviewed-by: Hadar Hen Zion 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 26 +++--
 1 file changed, 7 insertions(+), 19 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 4d06fab..dd6d954 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -915,25 +915,17 @@ int mlx5e_configure_flower(struct mlx5e_priv *priv, 
__be16 protocol,
u32 flow_tag, action;
struct mlx5e_tc_flow *flow;
struct mlx5_flow_spec *spec;
-   struct mlx5_flow_handle *old = NULL;
-   struct mlx5_esw_flow_attr *old_attr = NULL;
struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
 
if (esw && esw->mode == SRIOV_OFFLOADS)
fdb_flow = true;
 
-   flow = rhashtable_lookup_fast(&tc->ht, &f->cookie,
- tc->ht_params);
-   if (flow) {
-   old = flow->rule;
-   old_attr = flow->attr;
-   } else {
-   if (fdb_flow)
-   flow = kzalloc(sizeof(*flow) + sizeof(struct 
mlx5_esw_flow_attr),
-  GFP_KERNEL);
-   else
-   flow = kzalloc(sizeof(*flow), GFP_KERNEL);
-   }
+   if (fdb_flow)
+   flow = kzalloc(sizeof(*flow) +
+  sizeof(struct mlx5_esw_flow_attr),
+  GFP_KERNEL);
+   else
+   flow = kzalloc(sizeof(*flow), GFP_KERNEL);
 
spec = mlx5_vzalloc(sizeof(*spec));
if (!spec || !flow) {
@@ -970,17 +962,13 @@ int mlx5e_configure_flower(struct mlx5e_priv *priv, 
__be16 protocol,
if (err)
goto err_del_rule;
 
-   if (old)
-   mlx5e_tc_del_flow(priv, old, old_attr);
-
goto out;
 
 err_del_rule:
mlx5_del_flow_rules(flow->rule);
 
 err_free:
-   if (!old)
-   kfree(flow);
+   kfree(flow);
 out:
kvfree(spec);
return err;
-- 
2.7.4



[PATCH net-next 2/7] net/mlx5e: Move function mlx5e_create_umr_mkey

2016-11-29 Thread Saeed Mahameed
From: Tariq Toukan 

In next patch we are going to create a UMR MKey per RQ, we need
mlx5e_create_umr_mkey declared before mlx5e_create_rq.

Signed-off-by: Tariq Toukan 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 74 +++
 1 file changed, 37 insertions(+), 37 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index ba25cd3..49ca30b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -471,6 +471,43 @@ static void mlx5e_rq_free_mpwqe_info(struct mlx5e_rq *rq)
kfree(rq->mpwqe.info);
 }
 
+static int mlx5e_create_umr_mkey(struct mlx5e_priv *priv)
+{
+   struct mlx5_core_dev *mdev = priv->mdev;
+   u64 npages = MLX5E_REQUIRED_MTTS(priv->profile->max_nch(mdev),
+
BIT(MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE_MPW));
+   int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
+   void *mkc;
+   u32 *in;
+   int err;
+
+   in = mlx5_vzalloc(inlen);
+   if (!in)
+   return -ENOMEM;
+
+   mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
+
+   npages = min_t(u32, ALIGN(U16_MAX, 4) * 2, npages);
+
+   MLX5_SET(mkc, mkc, free, 1);
+   MLX5_SET(mkc, mkc, umr_en, 1);
+   MLX5_SET(mkc, mkc, lw, 1);
+   MLX5_SET(mkc, mkc, lr, 1);
+   MLX5_SET(mkc, mkc, access_mode, MLX5_MKC_ACCESS_MODE_MTT);
+
+   MLX5_SET(mkc, mkc, qpn, 0xff);
+   MLX5_SET(mkc, mkc, pd, mdev->mlx5e_res.pdn);
+   MLX5_SET64(mkc, mkc, len, npages << PAGE_SHIFT);
+   MLX5_SET(mkc, mkc, translations_octword_size,
+MLX5_MTT_OCTW(npages));
+   MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
+
+   err = mlx5_core_create_mkey(mdev, &priv->umr_mkey, in, inlen);
+
+   kvfree(in);
+   return err;
+}
+
 static int mlx5e_create_rq(struct mlx5e_channel *c,
   struct mlx5e_rq_param *param,
   struct mlx5e_rq *rq)
@@ -3625,43 +3662,6 @@ static void mlx5e_destroy_q_counter(struct mlx5e_priv 
*priv)
mlx5_core_dealloc_q_counter(priv->mdev, priv->q_counter);
 }
 
-static int mlx5e_create_umr_mkey(struct mlx5e_priv *priv)
-{
-   struct mlx5_core_dev *mdev = priv->mdev;
-   u64 npages = MLX5E_REQUIRED_MTTS(priv->profile->max_nch(mdev),
-
BIT(MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE_MPW));
-   int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
-   void *mkc;
-   u32 *in;
-   int err;
-
-   in = mlx5_vzalloc(inlen);
-   if (!in)
-   return -ENOMEM;
-
-   mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
-
-   npages = min_t(u32, ALIGN(U16_MAX, 4) * 2, npages);
-
-   MLX5_SET(mkc, mkc, free, 1);
-   MLX5_SET(mkc, mkc, umr_en, 1);
-   MLX5_SET(mkc, mkc, lw, 1);
-   MLX5_SET(mkc, mkc, lr, 1);
-   MLX5_SET(mkc, mkc, access_mode, MLX5_MKC_ACCESS_MODE_MTT);
-
-   MLX5_SET(mkc, mkc, qpn, 0xff);
-   MLX5_SET(mkc, mkc, pd, mdev->mlx5e_res.pdn);
-   MLX5_SET64(mkc, mkc, len, npages << PAGE_SHIFT);
-   MLX5_SET(mkc, mkc, translations_octword_size,
-MLX5_MTT_OCTW(npages));
-   MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
-
-   err = mlx5_core_create_mkey(mdev, &priv->umr_mkey, in, inlen);
-
-   kvfree(in);
-   return err;
-}
-
 static void mlx5e_nic_init(struct mlx5_core_dev *mdev,
   struct net_device *netdev,
   const struct mlx5e_profile *profile,
-- 
2.7.4



[PATCH net-next 3/7] net/mlx5e: Create UMR MKey per RQ

2016-11-29 Thread Saeed Mahameed
From: Tariq Toukan 

In Striding RQ implementation, we used a single UMR
(User-Mode Memory Registration) memory key for all RQs.
When the product of RQs number*size gets high, we hit a
limitation of u16 field size in FW.

Here we move to using a UMR memory key per RQ, so we can
scale to any number of rings, with the maximum buffer
size in each.

Signed-off-by: Tariq Toukan 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h   | 12 ++---
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   | 12 +
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 53 --
 3 files changed, 35 insertions(+), 42 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index f16f7fb..63dd639 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -77,9 +77,9 @@
 MLX5_MPWRQ_WQE_PAGE_ORDER)
 
 #define MLX5_MTT_OCTW(npages) (ALIGN(npages, 8) / 2)
-#define MLX5E_REQUIRED_MTTS(rqs, wqes)\
-   (rqs * wqes * ALIGN(MLX5_MPWRQ_PAGES_PER_WQE, 8))
-#define MLX5E_VALID_NUM_MTTS(num_mtts) (MLX5_MTT_OCTW(num_mtts) <= U16_MAX)
+#define MLX5E_REQUIRED_MTTS(wqes)  \
+   (wqes * ALIGN(MLX5_MPWRQ_PAGES_PER_WQE, 8))
+#define MLX5E_VALID_NUM_MTTS(num_mtts) (MLX5_MTT_OCTW(num_mtts) - 1 <= U16_MAX)
 
 #define MLX5_UMR_ALIGN (2048)
 #define MLX5_MPWRQ_SMALL_PACKET_THRESHOLD  (128)
@@ -347,7 +347,6 @@ struct mlx5e_rq {
struct {
struct mlx5e_mpw_info *info;
void  *mtt_no_align;
-   u32mtt_offset;
} mpwqe;
};
struct {
@@ -382,6 +381,7 @@ struct mlx5e_rq {
u32rqn;
struct mlx5e_channel  *channel;
struct mlx5e_priv *priv;
+   struct mlx5_core_mkey  umr_mkey;
 } cacheline_aligned_in_smp;
 
 struct mlx5e_umr_dma_info {
@@ -689,7 +689,6 @@ struct mlx5e_priv {
 
unsigned long  state;
struct mutex   state_lock; /* Protects Interface state */
-   struct mlx5_core_mkey  umr_mkey;
struct mlx5e_rqdrop_rq;
 
struct mlx5e_channel **channel;
@@ -838,8 +837,7 @@ static inline void mlx5e_cq_arm(struct mlx5e_cq *cq)
 
 static inline u32 mlx5e_get_wqe_mtt_offset(struct mlx5e_rq *rq, u16 wqe_ix)
 {
-   return rq->mpwqe.mtt_offset +
-   wqe_ix * ALIGN(MLX5_MPWRQ_PAGES_PER_WQE, 8);
+   return wqe_ix * ALIGN(MLX5_MPWRQ_PAGES_PER_WQE, 8);
 }
 
 static inline int mlx5e_get_max_num_channels(struct mlx5_core_dev *mdev)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
index aa963d7..352462a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
@@ -499,8 +499,7 @@ static int mlx5e_set_ringparam(struct net_device *dev,
return -EINVAL;
}
 
-   num_mtts = MLX5E_REQUIRED_MTTS(priv->params.num_channels,
-  rx_pending_wqes);
+   num_mtts = MLX5E_REQUIRED_MTTS(rx_pending_wqes);
if (priv->params.rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ &&
!MLX5E_VALID_NUM_MTTS(num_mtts)) {
netdev_info(dev, "%s: rx_pending (%d) request can't be 
satisfied, try to reduce.\n",
@@ -565,7 +564,6 @@ static int mlx5e_set_channels(struct net_device *dev,
unsigned int count = ch->combined_count;
bool arfs_enabled;
bool was_opened;
-   u32 num_mtts;
int err = 0;
 
if (!count) {
@@ -584,14 +582,6 @@ static int mlx5e_set_channels(struct net_device *dev,
return -EINVAL;
}
 
-   num_mtts = MLX5E_REQUIRED_MTTS(count, BIT(priv->params.log_rq_size));
-   if (priv->params.rq_wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ &&
-   !MLX5E_VALID_NUM_MTTS(num_mtts)) {
-   netdev_info(dev, "%s: rx count (%d) request can't be satisfied, 
try to reduce.\n",
-   __func__, count);
-   return -EINVAL;
-   }
-
if (priv->params.num_channels == count)
return 0;
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 49ca30b..84a4adb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -471,24 +471,25 @@ static void mlx5e_rq_free_mpwqe_info(struct mlx5e_rq *rq)
kfree(rq->mpwqe.info);
 }
 
-static int mlx5e_create_umr_mkey(struct mlx5e_priv *priv)
+static int mlx5e_create_umr_mkey(struct mlx5e_priv *priv,
+u64 npages, u8 

[PATCH net-next 7/7] net/mlx5e: Remove flow encap entry in the correct place

2016-11-29 Thread Saeed Mahameed
From: Roi Dayan 

Handling flow encap entry should be inside tc del flow
and is only relevant for offloaded eswitch TC rules.

Fixes: 11a457e9b6c1 ("net/mlx5e: Add basic TC tunnel set action for SRIOV 
offloads")
Signed-off-by: Roi Dayan 
Reviewed-by: Or Gerlitz 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 43 +
 1 file changed, 22 insertions(+), 21 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 3875c1c..f07ef8c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -142,6 +142,24 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
return mlx5_eswitch_add_offloaded_rule(esw, spec, attr);
 }
 
+static void mlx5e_detach_encap(struct mlx5e_priv *priv,
+  struct mlx5e_tc_flow *flow) {
+   struct list_head *next = flow->encap.next;
+
+   list_del(&flow->encap);
+   if (list_empty(next)) {
+   struct mlx5_encap_entry *e;
+
+   e = list_entry(next, struct mlx5_encap_entry, flows);
+   if (e->n) {
+   mlx5_encap_dealloc(priv->mdev, e->encap_id);
+   neigh_release(e->n);
+   }
+   hlist_del_rcu(&e->encap_hlist);
+   kfree(e);
+   }
+}
+
 static void mlx5e_tc_del_flow(struct mlx5e_priv *priv,
  struct mlx5e_tc_flow *flow)
 {
@@ -152,8 +170,11 @@ static void mlx5e_tc_del_flow(struct mlx5e_priv *priv,
 
mlx5_del_flow_rules(flow->rule);
 
-   if (esw && esw->mode == SRIOV_OFFLOADS)
+   if (esw && esw->mode == SRIOV_OFFLOADS) {
mlx5_eswitch_del_vlan_action(esw, flow->attr);
+   if (flow->attr->action & MLX5_FLOW_CONTEXT_ACTION_ENCAP)
+   mlx5e_detach_encap(priv, flow);
+   }
 
mlx5_fc_destroy(priv->mdev, counter);
 
@@ -973,24 +994,6 @@ int mlx5e_configure_flower(struct mlx5e_priv *priv, __be16 
protocol,
return err;
 }
 
-static void mlx5e_detach_encap(struct mlx5e_priv *priv,
-  struct mlx5e_tc_flow *flow) {
-   struct list_head *next = flow->encap.next;
-
-   list_del(&flow->encap);
-   if (list_empty(next)) {
-   struct mlx5_encap_entry *e;
-
-   e = list_entry(next, struct mlx5_encap_entry, flows);
-   if (e->n) {
-   mlx5_encap_dealloc(priv->mdev, e->encap_id);
-   neigh_release(e->n);
-   }
-   hlist_del_rcu(&e->encap_hlist);
-   kfree(e);
-   }
-}
-
 int mlx5e_delete_flower(struct mlx5e_priv *priv,
struct tc_cls_flower_offload *f)
 {
@@ -1006,8 +1009,6 @@ int mlx5e_delete_flower(struct mlx5e_priv *priv,
 
mlx5e_tc_del_flow(priv, flow);
 
-   if (flow->attr->action & MLX5_FLOW_CONTEXT_ACTION_ENCAP)
-   mlx5e_detach_encap(priv, flow);
 
kfree(flow);
 
-- 
2.7.4



[PATCH net-next 6/7] net/mlx5e: Refactor tc del flow to accept mlx5e_tc_flow instance

2016-11-29 Thread Saeed Mahameed
From: Roi Dayan 

Change the function that deletes offloaded TC rule to get
struct mlx5e_tc_flow instance which contains both the flow
handle and flow attributes. This is a cleanup needed for
downstream patches, it doesn't change any functionality.

Signed-off-by: Roi Dayan 
Reviewed-by: Or Gerlitz 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 13 ++---
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 4d71445..3875c1c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -143,18 +143,17 @@ mlx5e_tc_add_fdb_flow(struct mlx5e_priv *priv,
 }
 
 static void mlx5e_tc_del_flow(struct mlx5e_priv *priv,
- struct mlx5_flow_handle *rule,
- struct mlx5_esw_flow_attr *attr)
+ struct mlx5e_tc_flow *flow)
 {
struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
struct mlx5_fc *counter = NULL;
 
-   counter = mlx5_flow_rule_counter(rule);
+   counter = mlx5_flow_rule_counter(flow->rule);
 
-   mlx5_del_flow_rules(rule);
+   mlx5_del_flow_rules(flow->rule);
 
if (esw && esw->mode == SRIOV_OFFLOADS)
-   mlx5_eswitch_del_vlan_action(esw, attr);
+   mlx5_eswitch_del_vlan_action(esw, flow->attr);
 
mlx5_fc_destroy(priv->mdev, counter);
 
@@ -1005,7 +1004,7 @@ int mlx5e_delete_flower(struct mlx5e_priv *priv,
 
rhashtable_remove_fast(&tc->ht, &flow->node, tc->ht_params);
 
-   mlx5e_tc_del_flow(priv, flow->rule, flow->attr);
+   mlx5e_tc_del_flow(priv, flow);
 
if (flow->attr->action & MLX5_FLOW_CONTEXT_ACTION_ENCAP)
mlx5e_detach_encap(priv, flow);
@@ -1065,7 +1064,7 @@ static void _mlx5e_tc_del_flow(void *ptr, void *arg)
struct mlx5e_tc_flow *flow = ptr;
struct mlx5e_priv *priv = arg;
 
-   mlx5e_tc_del_flow(priv, flow->rule, flow->attr);
+   mlx5e_tc_del_flow(priv, flow);
kfree(flow);
 }
 
-- 
2.7.4



[PATCH net-next 0/7] Mellanox 100G mlx5 updates 2016-11-29

2016-11-29 Thread Saeed Mahameed
Hi Dave,

The following series from Tariq and Roi, provides some critical fixes
and updates for the mlx5e driver.

>From Tariq: 
 - Fix driver coherent memory huge allocation issues by fragmenting
   completion queues, in a way that is transparent to the netdev driver by
   providing a new buffer type "mlx5_frag_buf" with the same access API.
 - Create UMR MKey per RQ to have better scalability.

>From Roi:
 - Some fixes for the encap-decap support and tc flower added lately to the
   mlx5e driver.

This series was generated against commit:
31ac1c19455f ("geneve: fix ip_hdr_len reserved for geneve6 tunnel.")

Thanks,
Saeed.

Roi Dayan (4):
  net/mlx5e: Remove redundant hashtable lookup in configure flower
  net/mlx5e: Correct cleanup order when deleting offloaded TC rules
  net/mlx5e: Refactor tc del flow to accept mlx5e_tc_flow instance
  net/mlx5e: Remove flow encap entry in the correct place

Tariq Toukan (3):
  net/mlx5e: Implement Fragmented Work Queue (WQ)
  net/mlx5e: Move function mlx5e_create_umr_mkey
  net/mlx5e: Create UMR MKey per RQ

 drivers/net/ethernet/mellanox/mlx5/core/alloc.c|  66 +++
 drivers/net/ethernet/mellanox/mlx5/core/en.h   |  14 +--
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   |  12 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 121 +++--
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c|  82 ++
 drivers/net/ethernet/mellanox/mlx5/core/wq.c   |  26 +++--
 drivers/net/ethernet/mellanox/mlx5/core/wq.h   |  18 ++-
 include/linux/mlx5/driver.h|  11 ++
 8 files changed, 215 insertions(+), 135 deletions(-)

-- 
2.7.4



[PATCH net-next 1/7] net/mlx5e: Implement Fragmented Work Queue (WQ)

2016-11-29 Thread Saeed Mahameed
From: Tariq Toukan 

Add new type of struct mlx5_frag_buf which is used to allocate fragmented
buffers rather than contiguous, and make the Completion Queues (CQs) use
it as they are big (default of 2MB per CQ in Striding RQ).

This fixes the failures of type:
"mlx5e_open_locked: mlx5e_open_channels failed, -12"
due to dma_zalloc_coherent insufficient contiguous coherent memory to
satisfy the driver's request when the user tries to setup more or larger
rings.

Signed-off-by: Tariq Toukan 
Reported-by: Sebastian Ott 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/alloc.c   | 66 +++
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 10 ++--
 drivers/net/ethernet/mellanox/mlx5/core/wq.c  | 26 ++---
 drivers/net/ethernet/mellanox/mlx5/core/wq.h  | 18 +--
 include/linux/mlx5/driver.h   | 11 
 6 files changed, 116 insertions(+), 17 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/alloc.c 
b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c
index 2c6e3c7..bc8357d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/alloc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c
@@ -106,6 +106,63 @@ void mlx5_buf_free(struct mlx5_core_dev *dev, struct 
mlx5_buf *buf)
 }
 EXPORT_SYMBOL_GPL(mlx5_buf_free);
 
+int mlx5_frag_buf_alloc_node(struct mlx5_core_dev *dev, int size,
+struct mlx5_frag_buf *buf, int node)
+{
+   int i;
+
+   buf->size = size;
+   buf->npages = 1 << get_order(size);
+   buf->page_shift = PAGE_SHIFT;
+   buf->frags = kcalloc(buf->npages, sizeof(struct mlx5_buf_list),
+GFP_KERNEL);
+   if (!buf->frags)
+   goto err_out;
+
+   for (i = 0; i < buf->npages; i++) {
+   struct mlx5_buf_list *frag = >frags[i];
+   int frag_sz = min_t(int, size, PAGE_SIZE);
+
+   frag->buf = mlx5_dma_zalloc_coherent_node(dev, frag_sz,
+ &frag->map, node);
+   if (!frag->buf)
+   goto err_free_buf;
+   if (frag->map & ((1 << buf->page_shift) - 1)) {
+   dma_free_coherent(&dev->pdev->dev, frag_sz,
+ buf->frags[i].buf, buf->frags[i].map);
+   mlx5_core_warn(dev, "unexpected map alignment: 0x%p, 
page_shift=%d\n",
+  (void *)frag->map, buf->page_shift);
+   goto err_free_buf;
+   }
+   size -= frag_sz;
+   }
+
+   return 0;
+
+err_free_buf:
+   while (--i)
dma_free_coherent(&dev->pdev->dev, PAGE_SIZE, buf->frags[i].buf,
+ buf->frags[i].map);
+   kfree(buf->frags);
+err_out:
+   return -ENOMEM;
+}
+
+void mlx5_frag_buf_free(struct mlx5_core_dev *dev, struct mlx5_frag_buf *buf)
+{
+   int size = buf->size;
+   int i;
+
+   for (i = 0; i < buf->npages; i++) {
+   int frag_sz = min_t(int, size, PAGE_SIZE);
+
+   dma_free_coherent(&dev->pdev->dev, frag_sz, buf->frags[i].buf,
+ buf->frags[i].map);
+   size -= frag_sz;
+   }
+   kfree(buf->frags);
+}
+
 static struct mlx5_db_pgdir *mlx5_alloc_db_pgdir(struct mlx5_core_dev *dev,
 int node)
 {
@@ -230,3 +287,12 @@ void mlx5_fill_page_array(struct mlx5_buf *buf, __be64 
*pas)
}
 }
 EXPORT_SYMBOL_GPL(mlx5_fill_page_array);
+
+void mlx5_fill_page_frag_array(struct mlx5_frag_buf *buf, __be64 *pas)
+{
+   int i;
+
+   for (i = 0; i < buf->npages; i++)
+   pas[i] = cpu_to_be64(buf->frags[i].map);
+}
+EXPORT_SYMBOL_GPL(mlx5_fill_page_frag_array);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 442dbc3..f16f7fb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -286,7 +286,7 @@ struct mlx5e_cq {
u16decmprs_wqe_counter;
 
/* control */
-   struct mlx5_wq_ctrlwq_ctrl;
+   struct mlx5_frag_wq_ctrl   wq_ctrl;
 } cacheline_aligned_in_smp;
 
 struct mlx5e_rq;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 6b492ca..ba25cd3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1201,7 +1201,7 @@ static int mlx5e_create_cq(struct mlx5e_channel *c,
 
 static void mlx5e_destroy_cq(struct mlx5e_cq *cq)
 {
-   mlx5_wq_destroy(&cq->wq_ctrl);
+   mlx5_cqwq_destroy(&cq->wq_ctrl);
 }
 
 static int mlx5e_enable_cq(struct mlx5e_cq *cq, struct 

[PATCH net-next 5/7] net/mlx5e: Correct cleanup order when deleting offloaded TC rules

2016-11-29 Thread Saeed Mahameed
From: Roi Dayan 

According to the reverse unwinding principle, on delete time we should
first handle deletion of the steering rule and later handle the vlan
deletion from the eswitch.

Fixes: 8b32580df1cb ("net/mlx5e: Add TC vlan action for SRIOV offloads")
Signed-off-by: Roi Dayan 
Reviewed-by: Or Gerlitz 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index dd6d954..4d71445 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -151,11 +151,11 @@ static void mlx5e_tc_del_flow(struct mlx5e_priv *priv,
 
counter = mlx5_flow_rule_counter(rule);
 
+   mlx5_del_flow_rules(rule);
+
if (esw && esw->mode == SRIOV_OFFLOADS)
mlx5_eswitch_del_vlan_action(esw, attr);
 
-   mlx5_del_flow_rules(rule);
-
mlx5_fc_destroy(priv->mdev, counter);
 
if (!mlx5e_tc_num_filters(priv) && (priv->fs.tc.t)) {
-- 
2.7.4



  1   2   3   4   >