Re: [PATCH v3 net-next 07/12] rhashtable: add schedule points

2018-03-30 Thread Herbert Xu
On Fri, Mar 30, 2018 at 05:53:04PM -0700, Eric Dumazet wrote:
> Rehashing and destroying a large hash table takes a lot of time,
> and happens in process context. It is safe to add cond_resched()
> in rhashtable_rehash_table() and rhashtable_free_and_destroy().
> 
> Signed-off-by: Eric Dumazet 

Acked-by: Herbert Xu 

Thanks,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


[PATCH iproute2-next 1/1] tc: jsonify sample action

2018-03-30 Thread Roman Mashak
Signed-off-by: Roman Mashak 
---
 tc/m_sample.c | 22 +-
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/tc/m_sample.c b/tc/m_sample.c
index 1e18c5154fe6..39a99246a8ea 100644
--- a/tc/m_sample.c
+++ b/tc/m_sample.c
@@ -149,23 +149,27 @@ static int print_sample(struct action_util *au, FILE *f, 
struct rtattr *arg)
 
if (!tb[TCA_SAMPLE_PARMS] || !tb[TCA_SAMPLE_RATE] ||
!tb[TCA_SAMPLE_PSAMPLE_GROUP]) {
-   fprintf(f, "[NULL sample parameters]");
+   print_string(PRINT_FP, NULL, "%s", "[NULL sample parameters]");
return -1;
}
p = RTA_DATA(tb[TCA_SAMPLE_PARMS]);
 
-   fprintf(f, "sample rate 1/%d group %d",
-   rta_getattr_u32(tb[TCA_SAMPLE_RATE]),
-   rta_getattr_u32(tb[TCA_SAMPLE_PSAMPLE_GROUP]));
+   print_string(PRINT_ANY, "kind", "%s ", "sample");
+   print_uint(PRINT_ANY, "rate", "rate 1/%u ",
+  rta_getattr_u32(tb[TCA_SAMPLE_RATE]));
+   print_uint(PRINT_ANY, "group", "group %u",
+  rta_getattr_u32(tb[TCA_SAMPLE_PSAMPLE_GROUP]));
 
if (tb[TCA_SAMPLE_TRUNC_SIZE])
-   fprintf(f, " trunc_size %d",
-   rta_getattr_u32(tb[TCA_SAMPLE_TRUNC_SIZE]));
+   print_uint(PRINT_ANY, "trunc_size", " trunc_size %u",
+  rta_getattr_u32(tb[TCA_SAMPLE_TRUNC_SIZE]));
 
print_action_control(f, " ", p->action, "");
 
-   fprintf(f, "\n\tindex %d ref %d bind %d", p->index, p->refcnt,
-   p->bindcnt);
+   print_string(PRINT_FP, NULL, "%s", _SL_);
+   print_uint(PRINT_ANY, "index", "\t index %u", p->index);
+   print_int(PRINT_ANY, "ref", " ref %d", p->refcnt);
+   print_int(PRINT_ANY, "bind", " bind %d", p->bindcnt);
 
if (show_stats) {
if (tb[TCA_SAMPLE_TM]) {
@@ -174,7 +178,7 @@ static int print_sample(struct action_util *au, FILE *f, 
struct rtattr *arg)
print_tm(f, tm);
}
}
-   fprintf(f, "\n");
+   print_string(PRINT_FP, NULL, "%s", _SL_);
return 0;
 }
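
For reference, the JSON object these print_* calls emit for one sample
action would look roughly as follows (illustrative only; the key names
come from the calls above, the values are hypothetical):

  {"kind": "sample", "rate": 10, "group": 5, "trunc_size": 64,
   "index": 1, "ref": 1, "bind": 0}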
 
-- 
2.7.4



Re: [PATCH v2 net-next 07/12] rhashtable: add schedule points

2018-03-30 Thread Herbert Xu
On Fri, Mar 30, 2018 at 01:42:31PM -0700, Eric Dumazet wrote:
> Rehashing and destroying a large hash table takes a lot of time,
> and happens in process context. It is safe to add cond_resched()
> in rhashtable_rehash_table() and rhashtable_free_and_destroy().
> 
> Signed-off-by: Eric Dumazet 

Acked-by: Herbert Xu 

Thanks,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


[PATCH iproute2-next 1/1] tc: support oneline mode in action generic printer functions

2018-03-30 Thread Roman Mashak
Signed-off-by: Roman Mashak 
---
 tc/m_action.c | 12 
 1 file changed, 8 insertions(+), 4 deletions(-)
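
For context: _SL_ is the separator string iproute2's front ends set
according to oneline mode, so text printed through it stays on a single
line under -oneline. A paraphrased sketch of that convention (simplified
from tc.c/ip.c, not verbatim):

	/* set once at startup, consulted by all printers */
	extern int oneline;
	const char *_SL_;
	...
	_SL_ = oneline ? "\\" : "\n";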

diff --git a/tc/m_action.c b/tc/m_action.c
index 8891659ae15a..2f85d353279a 100644
--- a/tc/m_action.c
+++ b/tc/m_action.c
@@ -301,19 +301,21 @@ static int tc_print_one_action(FILE *f, struct rtattr 
*arg)
return err;
 
if (show_stats && tb[TCA_ACT_STATS]) {
-   print_string(PRINT_FP, NULL, "\tAction statistics:\n", NULL);
+   print_string(PRINT_FP, NULL, "\tAction statistics:", NULL);
+   print_string(PRINT_FP, NULL, "%s", _SL_);
open_json_object("stats");
print_tcstats2_attr(f, tb[TCA_ACT_STATS], "\t", NULL);
close_json_object();
-   print_string(PRINT_FP, NULL, "\n", NULL);
+   print_string(PRINT_FP, NULL, "%s", _SL_);
}
if (tb[TCA_ACT_COOKIE]) {
int strsz = RTA_PAYLOAD(tb[TCA_ACT_COOKIE]);
char b1[strsz * 2 + 1];
 
-   print_string(PRINT_ANY, "cookie", "\tcookie %s\n",
+   print_string(PRINT_ANY, "cookie", "\tcookie %s",
 hexstring_n2a(RTA_DATA(tb[TCA_ACT_COOKIE]),
   strsz, b1, sizeof(b1)));
+   print_string(PRINT_FP, NULL, "%s", _SL_);
}
 
return 0;
@@ -369,8 +371,9 @@ tc_print_action(FILE *f, const struct rtattr *arg, unsigned 
short tot_acts)
for (i = 0; i <= tot_acts; i++) {
if (tb[i]) {
open_json_object(NULL);
+   print_string(PRINT_FP, NULL, "%s", _SL_);
print_uint(PRINT_ANY, "order",
-  "\n\taction order %u: ", i);
+  "\taction order %u: ", i);
if (tc_print_one_action(f, tb[i]) < 0) {
print_string(PRINT_FP, NULL,
 "Error printing action\n", NULL);
@@ -410,6 +413,7 @@ int print_action(const struct sockaddr_nl *who,
open_json_object(NULL);
print_uint(PRINT_ANY, "total acts", "total acts %u",
   tot_acts ? *tot_acts : 0);
+   print_string(PRINT_FP, NULL, "%s", _SL_);
close_json_object();
if (tb[TCA_ACT_TAB] == NULL) {
if (n->nlmsg_type != RTM_GETACTION)
-- 
2.7.4



[PATCH v3 net-next 02/12] inet: frags: change inet_frags_init_net() return value

2018-03-30 Thread Eric Dumazet
We will soon initialize one rhashtable per struct netns_frags
in inet_frags_init_net().

This patch changes the return value to eventually propagate an
error.

Signed-off-by: Eric Dumazet 
---
 include/net/inet_frag.h |  3 ++-
 net/ieee802154/6lowpan/reassembly.c | 11 ---
 net/ipv4/ip_fragment.c  | 12 +---
 net/ipv6/netfilter/nf_conntrack_reasm.c | 12 +---
 net/ipv6/reassembly.c   | 11 +--
 5 files changed, 37 insertions(+), 12 deletions(-)

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 
351f0c3cdcd9df16078a40f07963fb605eeaa882..b1d62176f3b4fcf100bd263e8eae0db656a3d9b6
 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -104,9 +104,10 @@ struct inet_frags {
 int inet_frags_init(struct inet_frags *);
 void inet_frags_fini(struct inet_frags *);
 
-static inline void inet_frags_init_net(struct netns_frags *nf)
+static inline int inet_frags_init_net(struct netns_frags *nf)
 {
atomic_set(&nf->mem, 0);
+   return 0;
 }
 void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f);
 
diff --git a/net/ieee802154/6lowpan/reassembly.c 
b/net/ieee802154/6lowpan/reassembly.c
index 
85bf86ad6b1801066a4252af18b5b511070a9e08..2aaab4bba42961647a4d3d1c0b8497917d5065ce
 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -581,14 +581,19 @@ static int __net_init lowpan_frags_init_net(struct net 
*net)
 {
struct netns_ieee802154_lowpan *ieee802154_lowpan =
net_ieee802154_lowpan(net);
+   int res;
 
ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH;
ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT;
 
-   inet_frags_init_net(&ieee802154_lowpan->frags);
-
-   return lowpan_frags_ns_sysctl_register(net);
+   res = inet_frags_init_net(&ieee802154_lowpan->frags);
+   if (res < 0)
+   return res;
+   res = lowpan_frags_ns_sysctl_register(net);
+   if (res < 0)
+   inet_frags_exit_net(&ieee802154_lowpan->frags, &lowpan_frags);
+   return res;
 }
 
 static void __net_exit lowpan_frags_exit_net(struct net *net)
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 
bbf1b94942c0ed53d1ddb87e4ee63833c08f2684..e0b39d4ecbd411ff4bb72d1ed973e45bd6da9ef1
 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -846,6 +846,8 @@ static void __init ip4_frags_ctl_register(void)
 
 static int __net_init ipv4_frags_init_net(struct net *net)
 {
+   int res;
+
/* Fragment cache limits.
 *
 * The fragment memory accounting code, (tries to) account for
@@ -871,9 +873,13 @@ static int __net_init ipv4_frags_init_net(struct net *net)
 
net->ipv4.frags.max_dist = 64;
 
-   inet_frags_init_net(&net->ipv4.frags);
-
-   return ip4_frags_ns_ctl_register(net);
+   res = inet_frags_init_net(&net->ipv4.frags);
+   if (res < 0)
+   return res;
+   res = ip4_frags_ns_ctl_register(net);
+   if (res < 0)
+   inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
+   return res;
 }
 
 static void __net_exit ipv4_frags_exit_net(struct net *net)
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c 
b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 
b84ce3e6d728a5b8af65b91faf42ec640ff03910..6ff41569134ae36809a8b42d8e46d50d19ffde53
 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -629,12 +629,18 @@ EXPORT_SYMBOL_GPL(nf_ct_frag6_gather);
 
 static int nf_ct_net_init(struct net *net)
 {
+   int res;
+
net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT;
-   inet_frags_init_net(&net->nf_frag.frags);
-
-   return nf_ct_frag6_sysctl_register(net);
+   res = inet_frags_init_net(&net->nf_frag.frags);
+   if (res < 0)
+   return res;
+   res = nf_ct_frag6_sysctl_register(net);
+   if (res < 0)
+   inet_frags_exit_net(&net->nf_frag.frags, &nf_frags);
+   return res;
 }
 
 static void nf_ct_net_exit(struct net *net)
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 
08a139f14d0f6fa8ca326088cce1144411e09bf5..a8f7a5f0251a7af0b14cc6de5006b924d9d05672
 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -711,13 +711,20 @@ static void ip6_frags_sysctl_unregister(void)
 
 static int __net_init ipv6_frags_init_net(struct net *net)
 {
+   int res;
+
net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT;
 
-   inet_frags_init_net(&net->ipv6.frags);
+   res = inet_frags_init_net(&net->ipv6.frags);
+   if (res < 0)
+   return res;
 
-   return ip6_frags_ns_sysctl_register(net);
+  

[PATCH v3 net-next 08/12] inet: frags: use rhashtables for reassembly units

2018-03-30 Thread Eric Dumazet
Some applications still rely on IP fragmentation, and to be fair, the
Linux reassembly unit is not working under any serious load.

It uses static hash tables of 1024 buckets, and up to 128 items per bucket (!!!)

A work queue is supposed to garbage-collect items when the host is under
memory pressure, and to do a hash rebuild, changing the seed used in hash
computations.

This work queue blocks softirqs for up to 25 ms when doing a hash rebuild,
occurring every 5 seconds if the host is under fire.

Then there is the problem of sharing this hash table across all netns.

It is time to switch to rhashtables, and allocate one of them per netns
to speed up netns dismantle, since this is a critical metric these days.

Lookup is now using RCU. A followup patch will even remove
the refcount hold/release left from the prior implementation and save
a couple of atomic operations.

Before this patch, 16 cpus (16 RX queue NIC) could not handle more
than 1 Mpps frags DDOS.

After the patch, I reach 9 Mpps without any tuning, and can use up to 2GB
of storage for the fragments (the exact number depends on frags being
evicted after timeout).

$ grep FRAG /proc/net/sockstat
FRAG: inuse 1966916 memory 2140004608

A followup patch will change the limits for 64bit arches.
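
For readers unfamiliar with the rhashtable API, a minimal sketch of the
per-netns pattern this patch introduces (abridged, not the diff below;
see the actual patch for the real params and lookup path):

	static const struct rhashtable_params ip4_rhash_params = {
		.head_offset	= offsetof(struct inet_frag_queue, node),
		.key_offset	= offsetof(struct inet_frag_queue, key),
		.key_len	= sizeof(struct frag_v4_compare_key),
		.automatic_shrinking = true,
	};

	/* RCU-protected lookup, one table per netns_frags */
	static struct inet_frag_queue *find_queue(struct netns_frags *nf,
						  const void *key)
	{
		struct inet_frag_queue *fq;

		rcu_read_lock();
		fq = rhashtable_lookup(&nf->rhashtable, key, ip4_rhash_params);
		if (fq && !refcount_inc_not_zero(&fq->refcnt))
			fq = NULL;
		rcu_read_unlock();
		return fq;
	}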

Signed-off-by: Eric Dumazet 
Cc: Kirill Tkhai 
Cc: Herbert Xu 
Cc: Florian Westphal 
Cc: Jesper Dangaard Brouer 
Cc: Alexander Aring 
Cc: Stefan Schmidt 
---
 Documentation/networking/ip-sysctl.txt  |   7 +-
 include/net/inet_frag.h |  81 +++---
 include/net/ipv6.h  |  16 +-
 net/ieee802154/6lowpan/6lowpan_i.h  |  26 +-
 net/ieee802154/6lowpan/reassembly.c |  93 +++
 net/ipv4/inet_fragment.c| 354 +---
 net/ipv4/ip_fragment.c  | 112 
 net/ipv6/netfilter/nf_conntrack_reasm.c |  51 +---
 net/ipv6/reassembly.c   | 110 
 9 files changed, 271 insertions(+), 579 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt 
b/Documentation/networking/ip-sysctl.txt
index 
33f35f049ad57ad6c06ed6e089966e346d72d108..6f2a3670e44b6662ce53c16cb7ca1e4f61274c15
 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -134,13 +134,10 @@ min_adv_mss - INTEGER
 IP Fragmentation:
 
 ipfrag_high_thresh - INTEGER
-   Maximum memory used to reassemble IP fragments. When
-   ipfrag_high_thresh bytes of memory is allocated for this purpose,
-   the fragment handler will toss packets until ipfrag_low_thresh
-   is reached. This also serves as a maximum limit to namespaces
-   different from the initial one.
+   Maximum memory used to reassemble IP fragments.
 
 ipfrag_low_thresh - INTEGER
+   (Obsolete since linux-4.17)
Maximum memory used to reassemble IP fragments before the kernel
begins to remove incomplete fragment queues to free up resources.
The kernel still accepts new fragments for defragmentation.
diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 
69e531ed81894393e07cac9e953825fcb55ef42a..3fec0d3a0d0186e98afb951784e1fe7329ba6d77
 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -2,7 +2,11 @@
 #ifndef __NET_FRAG_H__
 #define __NET_FRAG_H__
 
+#include 
+
 struct netns_frags {
+   struct rhashtable   rhashtable ____cacheline_aligned_in_smp;
+
/* Keep atomic mem on separate cachelines in structs that include it */
atomic_t                mem ____cacheline_aligned_in_smp;
/* sysctls */
@@ -26,12 +30,30 @@ enum {
INET_FRAG_COMPLETE  = BIT(2),
 };
 
+struct frag_v4_compare_key {
+   __be32  saddr;
+   __be32  daddr;
+   u32 user;
+   u32 vif;
+   __be16  id;
+   u16 protocol;
+};
+
+struct frag_v6_compare_key {
+   struct in6_addr saddr;
+   struct in6_addr daddr;
+   u32 user;
+   __be32  id;
+   u32 iif;
+};
+
 /**
  * struct inet_frag_queue - fragment queue
  *
- * @lock: spinlock protecting the queue
+ * @node: rhash node
+ * @key: keys identifying this frag.
  * @timer: queue expiration timer
- * @list: hash bucket list
+ * @lock: spinlock protecting this frag
  * @refcnt: reference count of the queue
  * @fragments: received fragments head
  * @fragments_tail: received fragments tail
@@ -41,12 +63,16 @@ enum {
  * @flags: fragment queue flags
  * @max_size: maximum received fragment size
  * @net: namespace that this frag belongs to
- * @list_evictor: list of queues to forcefully evict (e.g. due to low memory)
+ * @rcu: rcu head for freeing deferral
  */
 struct inet_frag_queue {
-   spinlock_t  lock;
+   struct rhash_head   node;
+   union {
+   struct 

[PATCH v3 net-next 09/12] inet: frags: remove some helpers

2018-03-30 Thread Eric Dumazet
Remove sum_frag_mem_limit(), ip_frag_mem() & ip6_frag_mem().

Also, since we use rhashtable, we can bring back the number of fragments
in "grep FRAG /proc/net/sockstat /proc/net/sockstat6" that was
removed in commit 434d305405ab ("inet: frag: don't account number
of fragment queues").

Signed-off-by: Eric Dumazet 
---
 include/net/inet_frag.h | 5 -
 include/net/ip.h| 1 -
 include/net/ipv6.h  | 7 ---
 net/ipv4/ip_fragment.c  | 5 -
 net/ipv4/proc.c | 6 +++---
 net/ipv6/proc.c | 5 +++--
 6 files changed, 6 insertions(+), 23 deletions(-)

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 
3fec0d3a0d0186e98afb951784e1fe7329ba6d77..4b5449df0aadf1f75144c98317bf5305ec91d88b
 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -141,11 +141,6 @@ static inline void add_frag_mem_limit(struct netns_frags 
*nf, int i)
atomic_add(i, &nf->mem);
 }
 
-static inline int sum_frag_mem_limit(struct netns_frags *nf)
-{
-   return atomic_read(&nf->mem);
-}
-
 /* RFC 3168 support :
 * We want to check ECN values of all fragments, to detect invalid 
combinations.
  * In ipq->ecn, we store the OR value of each ip4_frag_ecn() fragment value.
diff --git a/include/net/ip.h b/include/net/ip.h
index 
36f8f7811093c37de06194dc7410b7596f8bf9fa..ecffd843e7b896a83416847fdaa452be6223f3dc
 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -588,7 +588,6 @@ static inline struct sk_buff *ip_check_defrag(struct net 
*net, struct sk_buff *s
return skb;
 }
 #endif
-int ip_frag_mem(struct net *net);
 
 /*
  * Functions provided by ip_forward.c
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 
6fa9a2bc589665dfa9ce84813f33e5e86e12fd74..37455e84034779fab96c231fa069957a6dcf2b42
 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -379,13 +379,6 @@ static inline bool ipv6_accept_ra(struct inet6_dev *idev)
idev->cnf.accept_ra;
 }
 
-#if IS_ENABLED(CONFIG_IPV6)
-static inline int ip6_frag_mem(struct net *net)
-{
-   return sum_frag_mem_limit(&net->ipv6.frags);
-}
-#endif
-
 #define IPV6_FRAG_HIGH_THRESH  (4 * 1024*1024) /* 4194304 */
 #define IPV6_FRAG_LOW_THRESH   (3 * 1024*1024) /* 3145728 */
 #define IPV6_FRAG_TIMEOUT  (60 * HZ)   /* 60 seconds */
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 
4021820db6f291b255cc53aeca91dd74aef29934..44f4fa306e224a6f76183b1c04935f01ceb4fe2b
 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -83,11 +83,6 @@ static u8 ip4_frag_ecn(u8 tos)
 
 static struct inet_frags ip4_frags;
 
-int ip_frag_mem(struct net *net)
-{
-   return sum_frag_mem_limit(&net->ipv4.frags);
-}
-
 static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 struct net_device *dev);
 
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 
adfb75340275d240487574257c10feb295df44fe..aacfce0d7d82cf59269a69ef4d6ac8d9955b0bdc
 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -54,7 +54,6 @@
 static int sockstat_seq_show(struct seq_file *seq, void *v)
 {
struct net *net = seq->private;
-   unsigned int frag_mem;
int orphans, sockets;
 
orphans = percpu_counter_sum_positive(&tcp_orphan_count);
@@ -72,8 +71,9 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
   sock_prot_inuse_get(net, &udplite_prot));
seq_printf(seq, "RAW: inuse %d\n",
   sock_prot_inuse_get(net, &raw_prot));
-   frag_mem = ip_frag_mem(net);
-   seq_printf(seq,  "FRAG: inuse %u memory %u\n", !!frag_mem, frag_mem);
+   seq_printf(seq,  "FRAG: inuse %u memory %u\n",
+  atomic_read(&net->ipv4.frags.rhashtable.nelems),
+  frag_mem_limit(&net->ipv4.frags));
return 0;
 }
 
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index 
6e57028d2e9160be264d07f9312658fcb677a568..8befeb91e0712ecc4d05c4c0a6ecca1808dcbcac
 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -38,7 +38,6 @@
 static int sockstat6_seq_show(struct seq_file *seq, void *v)
 {
struct net *net = seq->private;
-   unsigned int frag_mem = ip6_frag_mem(net);
 
seq_printf(seq, "TCP6: inuse %d\n",
   sock_prot_inuse_get(net, &tcpv6_prot));
@@ -48,7 +47,9 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v)
sock_prot_inuse_get(net, &udplitev6_prot));
seq_printf(seq, "RAW6: inuse %d\n",
   sock_prot_inuse_get(net, &rawv6_prot));
-   seq_printf(seq, "FRAG6: inuse %u memory %u\n", !!frag_mem, frag_mem);
+   seq_printf(seq, "FRAG6: inuse %u memory %u\n",
+  atomic_read(&net->ipv6.frags.rhashtable.nelems),
+  frag_mem_limit(&net->ipv6.frags));
return 0;
 }
 
-- 
2.17.0.rc1.321.gba9d0f2565-goog



[PATCH v3 net-next 04/12] inet: frags: refactor ipv6_frag_init()

2018-03-30 Thread Eric Dumazet
We want to call inet_frags_init() earlier.

This is a prereq to "inet: frags: use rhashtables for reassembly units"

Signed-off-by: Eric Dumazet 
---
 net/ipv6/reassembly.c | 27 +++
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 
4855de6f673a4753526679ca29dcdaebecb5777f..f0071b113a92fcff15ac57610170c12b17cb59ba
 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -742,18 +742,6 @@ int __init ipv6_frag_init(void)
 {
int ret;
 
-   ret = inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT);
-   if (ret)
-   goto out;
-
-   ret = ip6_frags_sysctl_register();
-   if (ret)
-   goto err_sysctl;
-
-   ret = register_pernet_subsys(&ip6_frags_ops);
-   if (ret)
-   goto err_pernet;
-
ip6_frags.hashfn = ip6_hashfn;
ip6_frags.constructor = ip6_frag_init;
ip6_frags.destructor = NULL;
@@ -762,8 +750,21 @@ int __init ipv6_frag_init(void)
ip6_frags.frag_expire = ip6_frag_expire;
ip6_frags.frags_cache_name = ip6_frag_cache_name;
ret = inet_frags_init(&ip6_frags);
+   if (ret)
+   goto out;
+
+   ret = inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT);
+   if (ret)
+   goto err_protocol;
+
+   ret = ip6_frags_sysctl_register();
+   if (ret)
+   goto err_sysctl;
+
+   ret = register_pernet_subsys(&ip6_frags_ops);
if (ret)
goto err_pernet;
+
 out:
return ret;
 
@@ -771,6 +772,8 @@ int __init ipv6_frag_init(void)
ip6_frags_sysctl_unregister();
 err_sysctl:
inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT);
+err_protocol:
+   inet_frags_fini(&ip6_frags);
goto out;
 }
 
-- 
2.17.0.rc1.321.gba9d0f2565-goog



[PATCH v3 net-next 05/12] inet: frags: refactor lowpan_net_frag_init()

2018-03-30 Thread Eric Dumazet
We want to call lowpan_net_frag_init() earlier.
Similar to commit "inet: frags: refactor ipv6_frag_init()"

This is a prereq to "inet: frags: use rhashtables for reassembly units"

Signed-off-by: Eric Dumazet 
---
 net/ieee802154/6lowpan/reassembly.c | 20 +++-
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/net/ieee802154/6lowpan/reassembly.c 
b/net/ieee802154/6lowpan/reassembly.c
index 
6badc05b7baedac2051a1aaea15f9e9b180c..ddada12a044de293f904a1dc7a5ff398d089d101
 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -615,14 +615,6 @@ int __init lowpan_net_frag_init(void)
 {
int ret;
 
-   ret = lowpan_frags_sysctl_register();
-   if (ret)
-   return ret;
-
-   ret = register_pernet_subsys(&lowpan_frags_ops);
-   if (ret)
-   goto err_pernet;
-
lowpan_frags.hashfn = lowpan_hashfn;
lowpan_frags.constructor = lowpan_frag_init;
lowpan_frags.destructor = NULL;
@@ -631,12 +623,22 @@ int __init lowpan_net_frag_init(void)
lowpan_frags.frag_expire = lowpan_frag_expire;
lowpan_frags.frags_cache_name = lowpan_frags_cache_name;
ret = inet_frags_init(&lowpan_frags);
+   if (ret)
+   goto out;
+
+   ret = lowpan_frags_sysctl_register();
+   if (ret)
+   goto err_sysctl;
+
+   ret = register_pernet_subsys(&lowpan_frags_ops);
if (ret)
goto err_pernet;
-
+out:
return ret;
 err_pernet:
lowpan_frags_sysctl_unregister();
+err_sysctl:
+   inet_frags_fini(&lowpan_frags);
return ret;
 }
 
-- 
2.17.0.rc1.321.gba9d0f2565-goog



[PATCH v3 net-next 00/12] inet: frags: bring rhashtables to IP defrag

2018-03-30 Thread Eric Dumazet
IP defrag processing is one of the remaining problematic layers in Linux.

It uses static hash tables of 1024 buckets, and up to 128 items per bucket.

A work queue is supposed to garbage-collect items when the host is under
memory pressure, and to do a hash rebuild, changing the seed used in hash
computations.

This work queue blocks softirqs for up to 25 ms when doing a hash rebuild,
occurring every 5 seconds if the host is under fire.

Then there is the problem of sharing this hash table across all netns.

It is time to switch to rhashtables, and allocate one of them per netns
to speed up netns dismantle, since this is a critical metric these days.

Lookup is now using RCU, and 64bit hosts can now provision whatever amount
of memory is needed to handle the expected workloads.

v2: Addressed Herbert's and Kirill's feedback
  (Use rhashtable_free_and_destroy(), and split the big patch into small units)

v3: Removed the extra add_frag_mem_limit(...) from inet_frag_create()
Removed the refcount_inc_not_zero() call from inet_frags_free_cb(),
as we can exploit del_timer() return value.
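
For clarity, a simplified sketch of that del_timer() trick as used in the
destroy callback (abridged; see patch 08/12 for the real code):

	static void inet_frags_free_cb(void *ptr, void *arg)
	{
		struct inet_frag_queue *fq = ptr;

		/* Only the caller that deactivates the still-pending timer
		 * proceeds; it then owns the timer's reference on fq, so no
		 * refcount_inc_not_zero() is needed.
		 */
		if (!del_timer(&fq->timer))
			return;

		spin_lock_bh(&fq->lock);
		if (!(fq->flags & INET_FRAG_COMPLETE)) {
			fq->flags |= INET_FRAG_COMPLETE;
			refcount_dec(&fq->refcnt);
		}
		spin_unlock_bh(&fq->lock);

		inet_frag_put(fq);
	}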

Eric Dumazet (12):
  ipv6: frag: remove unused field
  inet: frags: change inet_frags_init_net() return value
  inet: frags: add a pointer to struct netns_frags
  inet: frags: refactor ipv6_frag_init()
  inet: frags: refactor lowpan_net_frag_init()
  inet: frags: refactor ipfrag_init()
  rhashtable: add schedule points
  inet: frags: use rhashtables for reassembly units
  inet: frags: remove some helpers
  inet: frags: get rid of inet_frag_evicting()
  inet: frags: remove inet_frag_maybe_warn_overflow()
  inet: frags: break the 2GB limit for frags storage

 Documentation/networking/ip-sysctl.txt  |  11 +-
 include/net/inet_frag.h | 123 
 include/net/ip.h|   1 -
 include/net/ipv6.h  |  27 +-
 lib/rhashtable.c|   2 +
 net/ieee802154/6lowpan/6lowpan_i.h  |  26 +-
 net/ieee802154/6lowpan/reassembly.c | 150 +-
 net/ipv4/inet_fragment.c| 362 +---
 net/ipv4/ip_fragment.c  | 214 +++---
 net/ipv4/proc.c |   6 +-
 net/ipv6/netfilter/nf_conntrack_reasm.c |  90 +++---
 net/ipv6/proc.c |   5 +-
 net/ipv6/reassembly.c   | 181 ++--
 13 files changed, 442 insertions(+), 756 deletions(-)

-- 
2.17.0.rc1.321.gba9d0f2565-goog



[PATCH v3 net-next 10/12] inet: frags: get rid of inet_frag_evicting()

2018-03-30 Thread Eric Dumazet
This refactors ip_expire(), since one indentation level is removed.

Note: in the future, we should try hard to avoid the skb_clone()
since this is a serious performance cost.
Under DDOS, the ICMP message won't be sent because of rate limits.

The fact that ip6_expire_frag_queue() does not use skb_clone() is
disturbing too. Presumably IPv6 has the same
issue as the one we fixed in commit ec4fbd64751d
("inet: frag: release spinlock before calling icmp_send()").

Signed-off-by: Eric Dumazet 
---
 include/net/inet_frag.h |  5 
 net/ipv4/ip_fragment.c  | 61 -
 net/ipv6/reassembly.c   |  4 ---
 3 files changed, 30 insertions(+), 40 deletions(-)

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 
4b5449df0aadf1f75144c98317bf5305ec91d88b..0e8e159d88f7f77254fae5a49f1c7ba07b967e11
 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -119,11 +119,6 @@ static inline void inet_frag_put(struct inet_frag_queue *q)
inet_frag_destroy(q);
 }
 
-static inline bool inet_frag_evicting(struct inet_frag_queue *q)
-{
-   return false;
-}
-
 /* Memory Tracking Functions. */
 
 static inline int frag_mem_limit(struct netns_frags *nf)
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 
44f4fa306e224a6f76183b1c04935f01ceb4fe2b..b844f517b75bd6a52538e9f7687e039e22c93bc7
 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -143,8 +143,11 @@ static bool frag_expire_skip_icmp(u32 user)
 static void ip_expire(struct timer_list *t)
 {
struct inet_frag_queue *frag = from_timer(frag, t, timer);
-   struct ipq *qp;
+   struct sk_buff *clone, *head;
+   const struct iphdr *iph;
struct net *net;
+   struct ipq *qp;
+   int err;
 
qp = container_of(frag, struct ipq, q);
net = container_of(qp->q.net, struct net, ipv4.frags);
@@ -158,45 +161,41 @@ static void ip_expire(struct timer_list *t)
ipq_kill(qp);
__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
 
-   if (!inet_frag_evicting(&qp->q)) {
-   struct sk_buff *clone, *head = qp->q.fragments;
-   const struct iphdr *iph;
-   int err;
+   head = qp->q.fragments;
 
-   __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
+   __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
 
-   if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments)
-   goto out;
+   if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !head)
+   goto out;
 
-   head->dev = dev_get_by_index_rcu(net, qp->iif);
-   if (!head->dev)
-   goto out;
+   head->dev = dev_get_by_index_rcu(net, qp->iif);
+   if (!head->dev)
+   goto out;
 
 
-   /* skb has no dst, perform route lookup again */
-   iph = ip_hdr(head);
-   err = ip_route_input_noref(head, iph->daddr, iph->saddr,
+   /* skb has no dst, perform route lookup again */
+   iph = ip_hdr(head);
+   err = ip_route_input_noref(head, iph->daddr, iph->saddr,
   iph->tos, head->dev);
-   if (err)
-   goto out;
+   if (err)
+   goto out;
 
-   /* Only an end host needs to send an ICMP
-* "Fragment Reassembly Timeout" message, per RFC792.
-*/
-   if (frag_expire_skip_icmp(qp->q.key.v4.user) &&
-   (skb_rtable(head)->rt_type != RTN_LOCAL))
-   goto out;
+   /* Only an end host needs to send an ICMP
+* "Fragment Reassembly Timeout" message, per RFC792.
+*/
+   if (frag_expire_skip_icmp(qp->q.key.v4.user) &&
+   (skb_rtable(head)->rt_type != RTN_LOCAL))
+   goto out;
 
-   clone = skb_clone(head, GFP_ATOMIC);
+   clone = skb_clone(head, GFP_ATOMIC);
 
-   /* Send an ICMP "Fragment Reassembly Timeout" message. */
-   if (clone) {
-   spin_unlock(&qp->q.lock);
-   icmp_send(clone, ICMP_TIME_EXCEEDED,
- ICMP_EXC_FRAGTIME, 0);
-   consume_skb(clone);
-   goto out_rcu_unlock;
-   }
+   /* Send an ICMP "Fragment Reassembly Timeout" message. */
+   if (clone) {
+   spin_unlock(&qp->q.lock);
+   icmp_send(clone, ICMP_TIME_EXCEEDED,
+ ICMP_EXC_FRAGTIME, 0);
+   consume_skb(clone);
+   goto out_rcu_unlock;
}
 out:
spin_unlock(&qp->q.lock);
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 
3fc853e4492abb109062d662296c0b470763042a..70acad126d044a0f6a1efc63f307805fcf7b1df9
 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -106,10 +106,6 @@ void ip6_expire_frag_queue(struct net *net, struct 

[PATCH v3 net-next 11/12] inet: frags: remove inet_frag_maybe_warn_overflow()

2018-03-30 Thread Eric Dumazet
This function is obsolete after the rhashtable addition to inet defrag.

Signed-off-by: Eric Dumazet 
---
 include/net/inet_frag.h |  2 --
 net/ieee802154/6lowpan/reassembly.c |  5 ++---
 net/ipv4/inet_fragment.c| 11 ---
 net/ipv4/ip_fragment.c  |  5 ++---
 net/ipv6/netfilter/nf_conntrack_reasm.c |  5 ++---
 net/ipv6/reassembly.c   |  5 ++---
 6 files changed, 8 insertions(+), 25 deletions(-)

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 
0e8e159d88f7f77254fae5a49f1c7ba07b967e11..95e353e3305b43253084d972e32538138bcc5454
 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -110,8 +110,6 @@ void inet_frags_exit_net(struct netns_frags *nf);
 void inet_frag_kill(struct inet_frag_queue *q);
 void inet_frag_destroy(struct inet_frag_queue *q);
 struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key);
-void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
-  const char *prefix);
 
 static inline void inet_frag_put(struct inet_frag_queue *q)
 {
diff --git a/net/ieee802154/6lowpan/reassembly.c 
b/net/ieee802154/6lowpan/reassembly.c
index 
dd743c287bc229b1ba354e834af7bec34dcb8643..7eaa0617e277b829b801aee4e75f0ec61b2daf41
 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -84,10 +84,9 @@ fq_find(struct net *net, const struct lowpan_802154_cb *cb,
struct inet_frag_queue *q;
 
q = inet_frag_find(&ieee802154_lowpan->frags, &key);
-   if (IS_ERR_OR_NULL(q)) {
-   inet_frag_maybe_warn_overflow(q, pr_fmt());
+   if (!q)
return NULL;
-   }
+
return container_of(q, struct lowpan_frag_queue, q);
 }
 
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 
ebb8f411e0db16478e861105b1926e97fbf07b06..c9e35b81d0931df8429a33e8d03e719b87da0747
 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -218,14 +218,3 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags 
*nf, void *key)
return inet_frag_create(nf, key);
 }
 EXPORT_SYMBOL(inet_frag_find);
-
-void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
-  const char *prefix)
-{
-   static const char msg[] = "inet_frag_find: Fragment hash bucket"
-   " list length grew over limit. Dropping fragment.\n";
-
-   if (PTR_ERR(q) == -ENOBUFS)
-   net_dbg_ratelimited("%s%s", prefix, msg);
-}
-EXPORT_SYMBOL(inet_frag_maybe_warn_overflow);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 
b844f517b75bd6a52538e9f7687e039e22c93bc7..b0366224f314ae521d8c1f8fe04c53e419292b4c
 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -221,10 +221,9 @@ static struct ipq *ip_find(struct net *net, struct iphdr 
*iph,
struct inet_frag_queue *q;
 
q = inet_frag_find(&net->ipv4.frags, &key);
-   if (IS_ERR_OR_NULL(q)) {
-   inet_frag_maybe_warn_overflow(q, pr_fmt());
+   if (!q)
return NULL;
-   }
+
return container_of(q, struct ipq, q);
 }
 
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c 
b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 
0ad3df551d9884ba30f2d40658ee81a61720e947..d866412b8f6c432f04c0968f08f820fdc561262b
 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -178,10 +178,9 @@ static struct frag_queue *fq_find(struct net *net, __be32 
id, u32 user,
struct inet_frag_queue *q;
 
q = inet_frag_find(&net->nf_frag.frags, &key);
-   if (IS_ERR_OR_NULL(q)) {
-   inet_frag_maybe_warn_overflow(q, pr_fmt());
+   if (!q)
return NULL;
-   }
+
return container_of(q, struct frag_queue, q);
 }
 
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 
70acad126d044a0f6a1efc63f307805fcf7b1df9..2a77fda5e3bca1b6ce8c24df11e741653a15c665
 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -155,10 +155,9 @@ fq_find(struct net *net, __be32 id, const struct ipv6hdr 
*hdr, int iif)
key.iif = 0;
 
q = inet_frag_find(&net->ipv6.frags, &key);
-   if (IS_ERR_OR_NULL(q)) {
-   inet_frag_maybe_warn_overflow(q, pr_fmt());
+   if (!q)
return NULL;
-   }
+
return container_of(q, struct frag_queue, q);
 }
 
-- 
2.17.0.rc1.321.gba9d0f2565-goog



[PATCH v3 net-next 06/12] inet: frags: refactor ipfrag_init()

2018-03-30 Thread Eric Dumazet
We need to call inet_frags_init() before register_pernet_subsys(),
as a prereq for the following patch ("inet: frags: use rhashtables for
reassembly units").

Signed-off-by: Eric Dumazet 
---
 net/ipv4/ip_fragment.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 
cd2b4c9419fc1552d367b572926e314b11cb6c00..1a3bc85d6f5ea8f36b8f3d221cad632906b317a2
 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -896,8 +896,6 @@ static struct pernet_operations ip4_frags_ops = {
 
 void __init ipfrag_init(void)
 {
-   ip4_frags_ctl_register();
-   register_pernet_subsys(&ip4_frags_ops);
ip4_frags.hashfn = ip4_hashfn;
ip4_frags.constructor = ip4_frag_init;
ip4_frags.destructor = ip4_frag_free;
@@ -907,4 +905,6 @@ void __init ipfrag_init(void)
ip4_frags.frags_cache_name = ip_frag_cache_name;
if (inet_frags_init(&ip4_frags))
panic("IP: failed to allocate ip4_frags cache\n");
+   ip4_frags_ctl_register();
+   register_pernet_subsys(&ip4_frags_ops);
 }
-- 
2.17.0.rc1.321.gba9d0f2565-goog



[PATCH v3 net-next 12/12] inet: frags: break the 2GB limit for frags storage

2018-03-30 Thread Eric Dumazet
Some users are willing to provision huge amounts of memory to be able
to perform reassembly reasonably well under pressure.

Current memory tracking is using one atomic_t and integers.

Switch to atomic_long_t so that 64bit arches can use more than 2GB,
without any cost for 32bit arches.

Note that this patch avoids an overflow error if high_thresh was set
to ~2GB, since this test in inet_frag_alloc() was never true:

if (... || frag_mem_limit(nf) > nf->high_thresh)
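
A standalone illustration of the failure mode (userspace C, not kernel
code; assumes two's-complement wrap, as atomic_add() effectively does):

	#include <limits.h>
	#include <stdio.h>

	int main(void)
	{
		int high_thresh = INT_MAX;	/* a ~2GB limit */
		unsigned int mem = (unsigned int)INT_MAX - 2048;

		mem += 4096;	/* next frag's truesize: wraps past INT_MAX */
		printf("mem as int: %d, over limit? %s\n", (int)mem,
		       (int)mem > high_thresh ? "yes" : "no");
		return 0;	/* prints a negative value and "no" */
	}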

Tested:

$ echo 160 >/proc/sys/net/ipv4/ipfrag_high_thresh

<frag DDOS>

$ grep FRAG /proc/net/sockstat
FRAG: inuse 14705885 memory 1602880

$ nstat -n ; sleep 1 ; nstat | grep Reas
IpReasmReqds                    3317150            0.0
IpReasmFails                    3317112            0.0

Signed-off-by: Eric Dumazet 
---
 Documentation/networking/ip-sysctl.txt  |  4 ++--
 include/net/inet_frag.h | 20 ++--
 net/ieee802154/6lowpan/reassembly.c | 10 +-
 net/ipv4/ip_fragment.c  | 10 +-
 net/ipv4/proc.c |  2 +-
 net/ipv6/netfilter/nf_conntrack_reasm.c | 10 +-
 net/ipv6/proc.c |  2 +-
 net/ipv6/reassembly.c   |  6 +++---
 8 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt 
b/Documentation/networking/ip-sysctl.txt
index 
6f2a3670e44b6662ce53c16cb7ca1e4f61274c15..5dc1a040a2f1db610873de26c2d79bc57ac5a1a2
 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -133,10 +133,10 @@ min_adv_mss - INTEGER
 
 IP Fragmentation:
 
-ipfrag_high_thresh - INTEGER
+ipfrag_high_thresh - LONG INTEGER
Maximum memory used to reassemble IP fragments.
 
-ipfrag_low_thresh - INTEGER
+ipfrag_low_thresh - LONG INTEGER
(Obsolete since linux-4.17)
Maximum memory used to reassemble IP fragments before the kernel
begins to remove incomplete fragment queues to free up resources.
diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 
95e353e3305b43253084d972e32538138bcc5454..a52e7273e7a59bc8ce47b21d29235a740add8db0
 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -8,11 +8,11 @@ struct netns_frags {
struct rhashtable   rhashtable ____cacheline_aligned_in_smp;

/* Keep atomic mem on separate cachelines in structs that include it */
-   atomic_t                mem ____cacheline_aligned_in_smp;
+   atomic_long_t           mem ____cacheline_aligned_in_smp;
/* sysctls */
+   long                    high_thresh;
+   long                    low_thresh;
int timeout;
-   int high_thresh;
-   int low_thresh;
int max_dist;
struct inet_frags   *f;
 };
@@ -102,7 +102,7 @@ void inet_frags_fini(struct inet_frags *);
 
 static inline int inet_frags_init_net(struct netns_frags *nf)
 {
-   atomic_set(&nf->mem, 0);
+   atomic_long_set(&nf->mem, 0);
return rhashtable_init(&nf->rhashtable, &nf->f->rhash_params);
 }
 void inet_frags_exit_net(struct netns_frags *nf);
@@ -119,19 +119,19 @@ static inline void inet_frag_put(struct inet_frag_queue 
*q)
 
 /* Memory Tracking Functions. */
 
-static inline int frag_mem_limit(struct netns_frags *nf)
+static inline long frag_mem_limit(const struct netns_frags *nf)
 {
-   return atomic_read(&nf->mem);
+   return atomic_long_read(&nf->mem);
 }
 
-static inline void sub_frag_mem_limit(struct netns_frags *nf, int i)
+static inline void sub_frag_mem_limit(struct netns_frags *nf, long val)
 {
-   atomic_sub(i, &nf->mem);
+   atomic_long_sub(val, &nf->mem);
 }
 
-static inline void add_frag_mem_limit(struct netns_frags *nf, int i)
+static inline void add_frag_mem_limit(struct netns_frags *nf, long val)
 {
-   atomic_add(i, &nf->mem);
+   atomic_long_add(val, &nf->mem);
 }
 
 /* RFC 3168 support :
diff --git a/net/ieee802154/6lowpan/reassembly.c 
b/net/ieee802154/6lowpan/reassembly.c
index 
7eaa0617e277b829b801aee4e75f0ec61b2daf41..1f0857937ad187b48ff2af5e9c8570cf2b133fd2
 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -411,23 +411,23 @@ int lowpan_frag_rcv(struct sk_buff *skb, u8 frag_type)
 }
 
 #ifdef CONFIG_SYSCTL
-static int zero;
+static long zero;
 
 static struct ctl_table lowpan_frags_ns_ctl_table[] = {
{
.procname   = "6lowpanfrag_high_thresh",
.data   = &init_net.ieee802154_lowpan.frags.high_thresh,
-   .maxlen = sizeof(int),
+   .maxlen = sizeof(unsigned long),
.mode   = 0644,
-   .proc_handler   = proc_dointvec_minmax,
+   .proc_handler   = proc_doulongvec_minmax,
.extra1 = &init_net.ieee802154_lowpan.frags.low_thresh
},
{
.procname   = 

[PATCH v3 net-next 03/12] inet: frags: add a pointer to struct netns_frags

2018-03-30 Thread Eric Dumazet
In order to simplify the API, add a pointer to struct inet_frags.
This will allow us to make things less complex.

These functions no longer have a struct inet_frags parameter:

inet_frag_destroy(struct inet_frag_queue *q  /*, struct inet_frags *f */)
inet_frag_put(struct inet_frag_queue *q /*, struct inet_frags *f */)
inet_frag_kill(struct inet_frag_queue *q /*, struct inet_frags *f */)
inet_frags_exit_net(struct netns_frags *nf /*, struct inet_frags *f */)
ip6_expire_frag_queue(struct net *net, struct frag_queue *fq)
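
A minimal sketch of the mechanism (abridged, not the diff below): the ops
pointer is recorded once per netns, then recovered from the queue itself.

	void inet_frag_kill(struct inet_frag_queue *q)
	{
		struct inet_frags *f = q->net->f;	/* was a parameter */

		/* ... f->qsize, f->destructor, etc. used as before ... */
	}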

Signed-off-by: Eric Dumazet 
---
 include/net/inet_frag.h | 11 ++-
 include/net/ipv6.h  |  3 +--
 net/ieee802154/6lowpan/reassembly.c | 13 +++--
 net/ipv4/inet_fragment.c| 17 ++---
 net/ipv4/ip_fragment.c  |  9 +
 net/ipv6/netfilter/nf_conntrack_reasm.c | 16 +---
 net/ipv6/reassembly.c   | 20 ++--
 7 files changed, 48 insertions(+), 41 deletions(-)

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 
b1d62176f3b4fcf100bd263e8eae0db656a3d9b6..69e531ed81894393e07cac9e953825fcb55ef42a
 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -10,6 +10,7 @@ struct netns_frags {
int high_thresh;
int low_thresh;
int max_dist;
+   struct inet_frags   *f;
 };
 
 /**
@@ -109,20 +110,20 @@ static inline int inet_frags_init_net(struct netns_frags 
*nf)
atomic_set(&nf->mem, 0);
return 0;
 }
-void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f);
+void inet_frags_exit_net(struct netns_frags *nf);
 
-void inet_frag_kill(struct inet_frag_queue *q, struct inet_frags *f);
-void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f);
+void inet_frag_kill(struct inet_frag_queue *q);
+void inet_frag_destroy(struct inet_frag_queue *q);
 struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
struct inet_frags *f, void *key, unsigned int hash);
 
 void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
   const char *prefix);
 
-static inline void inet_frag_put(struct inet_frag_queue *q, struct inet_frags 
*f)
+static inline void inet_frag_put(struct inet_frag_queue *q)
 {
if (refcount_dec_and_test(&q->refcnt))
-   inet_frag_destroy(q, f);
+   inet_frag_destroy(q);
 }
 
 static inline bool inet_frag_evicting(struct inet_frag_queue *q)
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 
5c18836672e9d1c560cdce15f5b34928c337abfd..57b7fe43d2ab8e0ef3d663b7a5ee201affd5ca1f
 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -607,8 +607,7 @@ struct frag_queue {
u8  ecn;
 };
 
-void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq,
-  struct inet_frags *frags);
+void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq);
 
 static inline bool ipv6_addr_any(const struct in6_addr *a)
 {
diff --git a/net/ieee802154/6lowpan/reassembly.c 
b/net/ieee802154/6lowpan/reassembly.c
index 
2aaab4bba42961647a4d3d1c0b8497917d5065ce..6badc05b7baedac2051a1aaea15f9e9b180c
 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -94,10 +94,10 @@ static void lowpan_frag_expire(struct timer_list *t)
if (fq->q.flags & INET_FRAG_COMPLETE)
goto out;
 
-   inet_frag_kill(&fq->q, &lowpan_frags);
+   inet_frag_kill(&fq->q);
 out:
spin_unlock(&fq->q.lock);
-   inet_frag_put(&fq->q, &lowpan_frags);
+   inet_frag_put(&fq->q);
 }
 
 static inline struct lowpan_frag_queue *
@@ -230,7 +230,7 @@ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, 
struct sk_buff *prev,
struct sk_buff *fp, *head = fq->q.fragments;
int sum_truesize;
 
-   inet_frag_kill(&fq->q, &lowpan_frags);
+   inet_frag_kill(&fq->q);
 
/* Make the one we just received the head. */
if (prev) {
@@ -438,7 +438,7 @@ int lowpan_frag_rcv(struct sk_buff *skb, u8 frag_type)
ret = lowpan_frag_queue(fq, skb, frag_type);
spin_unlock(>q.lock);
 
-   inet_frag_put(&fq->q, &lowpan_frags);
+   inet_frag_put(&fq->q);
return ret;
}
 
@@ -586,13 +586,14 @@ static int __net_init lowpan_frags_init_net(struct net 
*net)
ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH;
ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT;
+   ieee802154_lowpan->frags.f = &lowpan_frags;
 
res = inet_frags_init_net(&ieee802154_lowpan->frags);
if (res < 0)
return res;
res = lowpan_frags_ns_sysctl_register(net);
if (res < 0)
-   inet_frags_exit_net(&ieee802154_lowpan->frags, &lowpan_frags);
+   inet_frags_exit_net(&ieee802154_lowpan->frags);
   

[PATCH v3 net-next 07/12] rhashtable: add schedule points

2018-03-30 Thread Eric Dumazet
Rehashing and destroying a large hash table takes a lot of time,
and happens in process context. It is safe to add cond_resched()
in rhashtable_rehash_table() and rhashtable_free_and_destroy().

Signed-off-by: Eric Dumazet 
---
 lib/rhashtable.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 
47de025b624520f75e521bef46dc9b28baa6a1a0..2b2b79974b614a94e5325e8c2271804cb27069aa
 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -333,6 +333,7 @@ static int rhashtable_rehash_table(struct rhashtable *ht)
err = rhashtable_rehash_chain(ht, old_hash);
if (err)
return err;
+   cond_resched();
}
 
/* Publish the new table pointer. */
@@ -1112,6 +1113,7 @@ void rhashtable_free_and_destroy(struct rhashtable *ht,
for (i = 0; i < tbl->size; i++) {
struct rhash_head *pos, *next;
 
+   cond_resched();
for (pos = rht_dereference(*rht_bucket(tbl, i), ht),
 next = !rht_is_a_nulls(pos) ?
rht_dereference(pos->next, ht) : NULL;
-- 
2.17.0.rc1.321.gba9d0f2565-goog



[PATCH v3 net-next 01/12] ipv6: frag: remove unused field

2018-03-30 Thread Eric Dumazet
csum field in struct frag_queue is not used, remove it.

Signed-off-by: Eric Dumazet 
---
 include/net/ipv6.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 
50a6f0ddb8780f6c9169f4ae0b3b35af2d66cd4b..5c18836672e9d1c560cdce15f5b34928c337abfd
 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -603,7 +603,6 @@ struct frag_queue {
struct in6_addr daddr;
 
int iif;
-   unsigned int    csum;
__u16   nhoffset;
u8  ecn;
 };
-- 
2.17.0.rc1.321.gba9d0f2565-goog



[net-next 02/15] net/mlx5: Eliminate query xsrq dead code

2018-03-30 Thread Saeed Mahameed
1. This function is not used anywhere in the mlx5 driver.
2. It has a memcpy statement that makes no sense and produces a build
warning with gcc8:

drivers/net/ethernet/mellanox/mlx5/core/transobj.c: In function 
'mlx5_core_query_xsrq':
drivers/net/ethernet/mellanox/mlx5/core/transobj.c:347:3: error: 'memcpy' 
source argument is the same as destination [-Werror=restrict]
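
A minimal standalone reproducer of this class of warning (hypothetical
example, not mlx5 code):

	#include <string.h>

	char ctx[128];

	void query_copy(void)
	{
		void *dst = ctx + 32;
		void *src = ctx + 32;	/* aliases dst exactly */

		/* gcc8 (-O2): 'memcpy' source argument is the same as
		 * destination [-Wrestrict]
		 */
		memcpy(dst, src, 64);
	}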

Fixes: 01949d0109ee ("net/mlx5_core: Enable XRCs and SRQs when using ISSI > 0")
Reported-by: Arnd Bergmann 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/transobj.c | 21 -
 include/linux/mlx5/transobj.h  |  1 -
 2 files changed, 22 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c 
b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c
index c64957b5ef47..dae1c5c5d27c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c
@@ -354,27 +354,6 @@ int mlx5_core_destroy_xsrq(struct mlx5_core_dev *dev, u32 
xsrqn)
return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
 }
 
-int mlx5_core_query_xsrq(struct mlx5_core_dev *dev, u32 xsrqn, u32 *out)
-{
-   u32 in[MLX5_ST_SZ_DW(query_xrc_srq_in)] = {0};
-   void *srqc;
-   void *xrc_srqc;
-   int err;
-
-   MLX5_SET(query_xrc_srq_in, in, opcode,   MLX5_CMD_OP_QUERY_XRC_SRQ);
-   MLX5_SET(query_xrc_srq_in, in, xrc_srqn, xsrqn);
-   err = mlx5_cmd_exec(dev, in, sizeof(in), out,
-   MLX5_ST_SZ_BYTES(query_xrc_srq_out));
-   if (!err) {
-   xrc_srqc = MLX5_ADDR_OF(query_xrc_srq_out, out,
-   xrc_srq_context_entry);
-   srqc = MLX5_ADDR_OF(query_srq_out, out, srq_context_entry);
-   memcpy(srqc, xrc_srqc, MLX5_ST_SZ_BYTES(srqc));
-   }
-
-   return err;
-}
-
 int mlx5_core_arm_xsrq(struct mlx5_core_dev *dev, u32 xsrqn, u16 lwm)
 {
u32 in[MLX5_ST_SZ_DW(arm_xrc_srq_in)]   = {0};
diff --git a/include/linux/mlx5/transobj.h b/include/linux/mlx5/transobj.h
index 80d7aa8b2831..83a33a1873a6 100644
--- a/include/linux/mlx5/transobj.h
+++ b/include/linux/mlx5/transobj.h
@@ -67,7 +67,6 @@ int mlx5_core_arm_rmp(struct mlx5_core_dev *dev, u32 rmpn, 
u16 lwm);
 int mlx5_core_create_xsrq(struct mlx5_core_dev *dev, u32 *in, int inlen,
  u32 *rmpn);
 int mlx5_core_destroy_xsrq(struct mlx5_core_dev *dev, u32 rmpn);
-int mlx5_core_query_xsrq(struct mlx5_core_dev *dev, u32 rmpn, u32 *out);
 int mlx5_core_arm_xsrq(struct mlx5_core_dev *dev, u32 rmpn, u16 lwm);
 
 int mlx5_core_create_rqt(struct mlx5_core_dev *dev, u32 *in, int inlen,
-- 
2.14.3



[net-next 06/15] net/mlx5e: Derive Striding RQ size from MTU

2018-03-30 Thread Saeed Mahameed
From: Tariq Toukan 

In Striding RQ, each WQE serves multiple packets
(hence called Multi-Packet WQE, MPWQE).
The size of a MPWQE is constant (currently 256KB).

Upon a ringparam set operation, we calculate the number of
MPWQEs per RQ. For this, first it is needed to determine the
number of packets that can reside within a single MPWQE.
In this patch we use the actual MTU size instead of ETH_DATA_LEN
for this calculation.

This implies that a change in MTU might require a change
in Striding RQ ring size.

In addition, this obsoletes some WQEs-to-packets translation
functions and helps delete ~60 LOC.
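
As a back-of-the-envelope sketch (standalone C, not driver code; the
256KB WQE size comes from the text above, the MTU is an example):

	#include <stdio.h>

	int main(void)
	{
		const unsigned int wqe_sz = 256 * 1024;	/* constant MPWQE size */
		unsigned int mtu = 9000;		/* example jumbo MTU */
		unsigned int stride = 1;

		while (stride < mtu)	/* strides are power-of-two sized */
			stride <<= 1;
		printf("MTU %u -> stride %u -> %u packets per MPWQE\n",
		       mtu, stride, wqe_sz / stride);
		return 0;
	}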

Signed-off-by: Tariq Toukan 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h   | 63 ++---
 .../net/ethernet/mellanox/mlx5/core/en_ethtool.c   | 79 ++
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 71 ---
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c   |  2 +-
 .../net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c  |  2 +-
 .../ethernet/mellanox/mlx5/core/ipoib/ipoib_vlan.c |  2 +-
 6 files changed, 80 insertions(+), 139 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 823876bfd6ab..1f89e2194b61 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -63,18 +63,6 @@
 #define MLX5E_MAX_DSCP  64
 #define MLX5E_MAX_NUM_TC   8
 
-#define MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE0x6
-#define MLX5E_PARAMS_DEFAULT_LOG_SQ_SIZE0xa
-#define MLX5E_PARAMS_MAXIMUM_LOG_SQ_SIZE0xd
-
-#define MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE0x1
-#define MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE0xa
-#define MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE0xd
-
-#define MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE_MPW0x2
-#define MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE_MPW0x3
-#define MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE_MPW0x6
-
 #define MLX5_RX_HEADROOM NET_SKB_PAD
 #define MLX5_SKB_FRAG_SZ(len)  (SKB_DATA_ALIGN(len) +  \
 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
@@ -95,9 +83,27 @@
 #define MLX5_MPWRQ_PAGES_PER_WQE   BIT(MLX5_MPWRQ_WQE_PAGE_ORDER)
 
 #define MLX5_MTT_OCTW(npages) (ALIGN(npages, 8) / 2)
-#define MLX5E_REQUIRED_MTTS(wqes)  \
-   (wqes * ALIGN(MLX5_MPWRQ_PAGES_PER_WQE, 8))
-#define MLX5E_VALID_NUM_MTTS(num_mtts) (MLX5_MTT_OCTW(num_mtts) - 1 <= U16_MAX)
+#define MLX5E_REQUIRED_WQE_MTTS
(ALIGN(MLX5_MPWRQ_PAGES_PER_WQE, 8))
+#define MLX5E_REQUIRED_MTTS(wqes)  (wqes * MLX5E_REQUIRED_WQE_MTTS)
+#define MLX5E_MAX_RQ_NUM_MTTS  \
+   ((1 << 16) * 2) /* So that MLX5_MTT_OCTW(num_mtts) fits into u16 */
+#define MLX5E_ORDER2_MAX_PACKET_MTU (order_base_2(10 * 1024))
+#define MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE_MPW   \
+   (ilog2(MLX5E_MAX_RQ_NUM_MTTS / MLX5E_REQUIRED_WQE_MTTS))
+#define MLX5E_LOG_MAX_RQ_NUM_PACKETS_MPW \
+   (MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE_MPW + \
+(MLX5_MPWRQ_LOG_WQE_SZ - MLX5E_ORDER2_MAX_PACKET_MTU))
+
+#define MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE0x6
+#define MLX5E_PARAMS_DEFAULT_LOG_SQ_SIZE0xa
+#define MLX5E_PARAMS_MAXIMUM_LOG_SQ_SIZE0xd
+
+#define MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE0x1
+#define MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE0xa
+#define MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE min_t(u8, 0xd,\
+  MLX5E_LOG_MAX_RQ_NUM_PACKETS_MPW)
+
+#define MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE_MPW0x2
 
 #define MLX5_UMR_ALIGN (2048)
 #define MLX5_MPWRQ_SMALL_PACKET_THRESHOLD  (256)
@@ -155,26 +161,6 @@ static inline u16 mlx5_min_rx_wqes(int wq_type, u32 
wq_size)
}
 }
 
-static inline int mlx5_min_log_rq_size(int wq_type)
-{
-   switch (wq_type) {
-   case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
-   return MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE_MPW;
-   default:
-   return MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE;
-   }
-}
-
-static inline int mlx5_max_log_rq_size(int wq_type)
-{
-   switch (wq_type) {
-   case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
-   return MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE_MPW;
-   default:
-   return MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE;
-   }
-}
-
 static inline int mlx5e_get_max_num_channels(struct mlx5_core_dev *mdev)
 {
return is_kdump_kernel() ?
@@ -233,7 +219,7 @@ enum mlx5e_priv_flag {
 struct mlx5e_params {
u8  log_sq_size;
u8  rq_wq_type;
-   u8  log_rq_size;
+   u8  log_rq_mtu_frames;
u16 num_channels;
u8  num_tc;
bool rx_cqe_compress_def;
@@ -849,11 +835,6 @@ void mlx5e_dealloc_rx_wqe(struct mlx5e_rq *rq, u16 ix);
 void mlx5e_dealloc_rx_mpwqe(struct 

[net-next 10/15] net/mlx5e: Use linear SKB in Striding RQ

2018-03-30 Thread Saeed Mahameed
From: Tariq Toukan 

Current Striding RQ HW feature utilizes the RX buffers so that
there is no wasted room between the strides. This maximises
the memory utilization.
This prevents the use of build_skb() (which requires headroom
and tailroom), and demands to memcpy the packets headers into
the skb linear part.

In this patch, whenever a set of conditions holds, we apply
an RQ configuration that allows combining the use of linear SKB
on top of a Striding RQ.

To use build_skb() with Striding RQ, the following must hold:
1. packet does not cross a page boundary.
2. there is enough headroom and tailroom surrounding the packet.

We can satisfy 1 and 2 by configuring:
stride size = MTU + headroom + tailroom.

This is possible only when:
a. (MTU + headroom + tailroom) does not exceed PAGE_SIZE.
b. HW LRO is turned off.

Using linear SKB has many advantages:
- Saves a memcpy of the headers.
- No page-boundary checks in datapath.
- No filler CQEs.
- Significantly smaller CQ.
- SKB data continuously resides in linear part, and not split to
  small amount (linear part) and large amount (fragment).
  This saves datapath cycles in driver and improves utilization
  of SKB fragments in GRO.
- The fragments of a resulting GRO SKB follow the IP forwarding
  assumption of equal-size fragments.

Some implementation details:
HW writes the packets to the beginning of a stride,
i.e. does not keep headroom. To overcome this we make sure we can
extend backwards and use the last bytes of stride i-1.
Extra care is needed for stride 0 as it has no preceding stride.
We make sure headroom bytes are available by shifting the buffer
pointer passed to HW by headroom bytes.

This configuration now becomes default, whenever capable.
Of course, this implies turning LRO off.
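
A sketch of the resulting eligibility check (illustrative names, not the
driver's; head/tailroom values are examples):

	#include <stdbool.h>

	#define HEADROOM 64	/* example NET_SKB_PAD-like value */
	#define TAILROOM 320	/* example skb_shared_info footprint */
	#define PAGE_SZ  4096

	static bool mpwqe_linear_skb_possible(unsigned int mtu, bool hw_lro)
	{
		unsigned int stride = HEADROOM + mtu + TAILROOM;

		/* one packet per stride, fully inside one page, no HW LRO */
		return !hw_lro && stride <= PAGE_SZ;
	}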

Performance testing:
ConnectX-5, single core, single RX ring, default MTU.

UDP packet rate, early drop in TC layer:

------------------------------------------
| pkt size | before    | after     | ratio |
------------------------------------------
| 1500byte | 4.65 Mpps | 5.96 Mpps | 1.28x |
|  500byte | 5.23 Mpps | 5.97 Mpps | 1.14x |
|   64byte | 5.94 Mpps | 5.96 Mpps | 1.00x |
------------------------------------------

TCP streams: ~20% gain

Signed-off-by: Tariq Toukan 
Signed-off-by: Saeed Mahameed 

Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |  10 +++
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |  76 +---
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c   | 102 --
 include/linux/mlx5/device.h   |   3 +
 include/linux/mlx5/mlx5_ifc.h |   7 +-
 5 files changed, 153 insertions(+), 45 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index c1d3a29388bd..d26dd4bc89f4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -473,6 +473,9 @@ struct mlx5e_page_cache {
 
 struct mlx5e_rq;
 typedef void (*mlx5e_fp_handle_rx_cqe)(struct mlx5e_rq*, struct mlx5_cqe64*);
+typedef struct sk_buff *
+(*mlx5e_fp_skb_from_cqe_mpwrq)(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
+  u16 cqe_bcnt, u32 head_offset, u32 page_idx);
 typedef bool (*mlx5e_fp_post_rx_wqes)(struct mlx5e_rq *rq);
 typedef void (*mlx5e_fp_dealloc_wqe)(struct mlx5e_rq*, u16);
 
@@ -491,6 +494,7 @@ struct mlx5e_rq {
} wqe;
struct {
struct mlx5e_mpw_info *info;
+   mlx5e_fp_skb_from_cqe_mpwrq skb_from_cqe_mpwrq;
u16num_strides;
u8 log_stride_sz;
bool   umr_in_progress;
@@ -834,6 +838,12 @@ bool mlx5e_post_rx_mpwqes(struct mlx5e_rq *rq);
 void mlx5e_dealloc_rx_wqe(struct mlx5e_rq *rq, u16 ix);
 void mlx5e_dealloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix);
 void mlx5e_free_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi);
+struct sk_buff *
+mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
+   u16 cqe_bcnt, u32 head_offset, u32 page_idx);
+struct sk_buff *
+mlx5e_skb_from_cqe_mpwrq_nonlinear(struct mlx5e_rq *rq, struct mlx5e_mpw_info 
*wi,
+  u16 cqe_bcnt, u32 head_offset, u32 page_idx);
 
 void mlx5e_update_stats(struct mlx5e_priv *priv);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 42dc350c5ab1..bba2fa0aa15f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -91,9 +91,14 @@ bool mlx5e_check_fragmented_striding_rq_cap(struct 
mlx5_core_dev *mdev)
 
 static u32 mlx5e_mpwqe_get_linear_frag_sz(struct 

[net-next 12/15] net/mlx5e: Support XDP over Striding RQ

2018-03-30 Thread Saeed Mahameed
From: Tariq Toukan 

Add XDP support over Striding RQ.
Now that linear SKB is supported over Striding RQ,
we can support XDP by setting stride size to PAGE_SIZE
and headroom to XDP_PACKET_HEADROOM.

Upon a MPWQE free, do not release pages that are being
XDP xmit, they will be released upon completions.
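
A simplified model of that bookkeeping (illustrative C, not the driver
code in the diff below):

	#include <stdbool.h>

	#define PAGES_PER_WQE 16

	struct mpw_info {
		unsigned long xdp_xmit_bitmap;	/* bit i: page i owned by XDP */
	};

	static bool page_in_xdp_xmit(const struct mpw_info *wi, int i)
	{
		return wi->xdp_xmit_bitmap & (1UL << i);
	}

	static void free_mpwqe_pages(struct mpw_info *wi,
				     void (*release)(int page_idx))
	{
		int i;

		for (i = 0; i < PAGES_PER_WQE; i++)
			if (!page_in_xdp_xmit(wi, i))
				release(i);	/* XDP pages freed on completion */
	}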

Striding RQ is capable of a higher packet-rate than
conventional RQ.
A performance gain is expected for all cases that had
a HW packet-rate bottleneck. This is the case whenever
using many flows that distribute to many cores.

Performance testing:
ConnectX-5, 24 rings, default MTU.
CQE compression ON (to reduce completions BW in PCI).

XDP_DROP packet rate:
---------------------------------------------------
| pkt size | XDP rate   | 100GbE linerate | pct% |
---------------------------------------------------
|   64byte | 126.2 Mpps |      148.0 Mpps |  85% |
|  128byte |  80.0 Mpps |       84.8 Mpps |  94% |
|  256byte |  42.7 Mpps |       42.7 Mpps | 100% |
|  512byte |  23.4 Mpps |       23.4 Mpps | 100% |
---------------------------------------------------

Signed-off-by: Tariq Toukan 
Signed-off-by: Saeed Mahameed 

Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |  1 +
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |  3 ++-
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c   | 30 +--
 3 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index a6ca54393bb6..7997d7c159db 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -457,6 +457,7 @@ struct mlx5e_mpw_info {
struct mlx5e_umr_dma_info umr;
u16 consumed_strides;
u16 skbs_frags[MLX5_MPWRQ_PAGES_PER_WQE];
+   DECLARE_BITMAP(xdp_xmit_bitmap, MLX5_MPWRQ_PAGES_PER_WQE);
 };
 
 /* a single cache unit is capable to serve one napi call (for non-striding rq)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index bba2fa0aa15f..b03a2327356a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -200,7 +200,8 @@ bool mlx5e_striding_rq_possible(struct mlx5_core_dev *mdev,
struct mlx5e_params *params)
 {
return mlx5e_check_fragmented_striding_rq_cap(mdev) &&
-   !params->xdp_prog && !MLX5_IPSEC_DEV(mdev);
+   !MLX5_IPSEC_DEV(mdev) &&
+   !(params->xdp_prog && !mlx5e_rx_mpwqe_is_linear_skb(mdev, params));
 }
 
 void mlx5e_set_rq_type(struct mlx5_core_dev *mdev, struct mlx5e_params *params)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index a827571deb85..1da79cab1838 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -349,13 +349,16 @@ mlx5e_copy_skb_header_mpwqe(struct device *pdev,
 
 void mlx5e_free_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi)
 {
+   const bool no_xdp_xmit =
+   bitmap_empty(wi->xdp_xmit_bitmap, MLX5_MPWRQ_PAGES_PER_WQE);
int pg_strides = mlx5e_mpwqe_strides_per_page(rq);
-   struct mlx5e_dma_info *dma_info = &wi->umr.dma_info[0];
+   struct mlx5e_dma_info *dma_info = wi->umr.dma_info;
int i;
 
-   for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++, dma_info++) {
-   page_ref_sub(dma_info->page, pg_strides - wi->skbs_frags[i]);
-   mlx5e_page_release(rq, dma_info, true);
+   for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) {
+   page_ref_sub(dma_info[i].page, pg_strides - wi->skbs_frags[i]);
+   if (no_xdp_xmit || !test_bit(i, wi->xdp_xmit_bitmap))
+   mlx5e_page_release(rq, &dma_info[i], true);
}
 }
 
@@ -404,6 +407,7 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
}
 
memset(wi->skbs_frags, 0, sizeof(*wi->skbs_frags) * 
MLX5_MPWRQ_PAGES_PER_WQE);
+   bitmap_zero(wi->xdp_xmit_bitmap, MLX5_MPWRQ_PAGES_PER_WQE);
wi->consumed_strides = 0;
 
rq->mpwqe.umr_in_progress = true;
@@ -1028,18 +1032,30 @@ mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, 
struct mlx5e_mpw_info *wi,
 {
	struct mlx5e_dma_info *di = &wi->umr.dma_info[page_idx];
u16 rx_headroom = rq->buff.headroom;
+   u32 cqe_bcnt32 = cqe_bcnt;
struct sk_buff *skb;
void *va, *data;
u32 frag_size;
+   bool consumed;
 
va = page_address(di->page) + head_offset;
data   = va + rx_headroom;
-   frag_size  = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt);
+   frag_size  = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt32);
 
dma_sync_single_range_for_cpu(rq->pdev, 

[net-next 09/15] net/mlx5e: Use inline MTTs in UMR WQEs

2018-03-30 Thread Saeed Mahameed
From: Tariq Toukan 

When modifying the page mapping of a HW memory region
(via a UMR post), post the new values inlined in WQE,
instead of using a data pointer.

This is a micro-optimization, inline UMR WQEs of different
rings scale better in HW.

In addition, this obsoletes a few control flows and helps
delete ~50 LOC.
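
A rough before/after sketch (illustrative, not the exact driver code):

	/* pointer-based UMR (before): the device fetches the MTTs via a
	 * data segment pointing at a separately allocated, aligned and
	 * DMA-mapped buffer.
	 */
	wqe->data.addr       = cpu_to_be64(wi->umr.mtt_addr);
	wqe->data.byte_count = cpu_to_be32(mtt_sz);

	/* inline UMR (after): the MTTs are copied into the WQE itself, so
	 * the extra buffer, its mapping and the MLX5_UMR_ALIGN handling
	 * all go away.
	 */
	memcpy(wqe->inline_mtts, mtts, n_pages * sizeof(*mtts));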

Signed-off-by: Tariq Toukan 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h  | 16 ++---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 82 +--
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c   | 28 
 3 files changed, 38 insertions(+), 88 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 1f89e2194b61..c1d3a29388bd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -105,7 +105,6 @@
 
 #define MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE_MPW0x2
 
-#define MLX5_UMR_ALIGN (2048)
 #define MLX5_MPWRQ_SMALL_PACKET_THRESHOLD  (256)
 
 #define MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ (64 * 1024)
@@ -130,8 +129,13 @@
 #define MLX5E_UPDATE_STATS_INTERVAL200 /* msecs */
 #define MLX5E_SQ_RECOVER_MIN_INTERVAL  500 /* msecs */
 
-#define MLX5E_ICOSQ_MAX_WQEBBS \
-   (DIV_ROUND_UP(sizeof(struct mlx5e_umr_wqe), MLX5_SEND_WQE_BB))
+#define MLX5E_UMR_WQE_INLINE_SZ \
+   (sizeof(struct mlx5e_umr_wqe) + \
+ALIGN(MLX5_MPWRQ_PAGES_PER_WQE * sizeof(struct mlx5_mtt), \
+  MLX5_UMR_MTT_ALIGNMENT))
+#define MLX5E_UMR_WQEBBS \
+   (DIV_ROUND_UP(MLX5E_UMR_WQE_INLINE_SZ, MLX5_SEND_WQE_BB))
+#define MLX5E_ICOSQ_MAX_WQEBBS MLX5E_UMR_WQEBBS
 
 #define MLX5E_XDP_MIN_INLINE (ETH_HLEN + VLAN_HLEN)
 #define MLX5E_XDP_TX_DS_COUNT \
@@ -183,7 +187,7 @@ struct mlx5e_umr_wqe {
struct mlx5_wqe_ctrl_seg   ctrl;
struct mlx5_wqe_umr_ctrl_seg   uctrl;
struct mlx5_mkey_seg   mkc;
-   struct mlx5_wqe_data_seg   data;
+   struct mlx5_mtt            inline_mtts[0];
 };
 
 extern const char mlx5e_self_tests[][ETH_GSTRING_LEN];
@@ -421,7 +425,6 @@ struct mlx5e_icosq {
void __iomem  *uar_map;
u32sqn;
u16edge;
-   __be32 mkey_be;
unsigned long  state;
 
/* control path */
@@ -446,8 +449,6 @@ struct mlx5e_wqe_frag_info {
 };
 
 struct mlx5e_umr_dma_info {
-   __be64*mtt;
-   dma_addr_t mtt_addr;
struct mlx5e_dma_info  dma_info[MLX5_MPWRQ_PAGES_PER_WQE];
struct mlx5e_umr_wqe   wqe;
 };
@@ -490,7 +491,6 @@ struct mlx5e_rq {
} wqe;
struct {
struct mlx5e_mpw_info *info;
-   void  *mtt_no_align;
u16num_strides;
u8 log_stride_sz;
bool   umr_in_progress;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index e627b81cebe9..42dc350c5ab1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -73,9 +73,20 @@ struct mlx5e_channel_param {
 
 bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev)
 {
-   return MLX5_CAP_GEN(mdev, striding_rq) &&
+   bool striding_rq_umr = MLX5_CAP_GEN(mdev, striding_rq) &&
MLX5_CAP_GEN(mdev, umr_ptr_rlky) &&
MLX5_CAP_ETH(mdev, reg_umr_sq);
+   u16 max_wqe_sz_cap = MLX5_CAP_GEN(mdev, max_wqe_sz_sq);
+   bool inline_umr = MLX5E_UMR_WQE_INLINE_SZ <= max_wqe_sz_cap;
+
+   if (!striding_rq_umr)
+   return false;
+   if (!inline_umr) {
+   mlx5_core_warn(mdev, "Cannot support Striding RQ: UMR WQE size (%d) exceeds maximum supported (%d).\n",
+  (int)MLX5E_UMR_WQE_INLINE_SZ, max_wqe_sz_cap);
+   return false;
+   }
+   return true;
 }
 
 static u32 mlx5e_mpwqe_get_linear_frag_sz(struct mlx5e_params *params)
@@ -258,16 +269,6 @@ static void mlx5e_disable_async_events(struct mlx5e_priv 
*priv)
synchronize_irq(pci_irq_vector(priv->mdev->pdev, MLX5_EQ_VEC_ASYNC));
 }
 
-static inline int mlx5e_get_wqe_mtt_sz(void)
-{
-   /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes.
-* To avoid copying garbage after the mtt array, we allocate
-* a little more.
-*/
-   return ALIGN(MLX5_MPWRQ_PAGES_PER_WQE * sizeof(__be64),
-MLX5_UMR_MTT_ALIGNMENT);
-}
-
 static inline void mlx5e_build_umr_wqe(struct mlx5e_rq *rq,
   struct mlx5e_icosq *sq,
   

[pull request][net-next 00/15] Mellanox, mlx5 updates 2018-03-30

2018-03-30 Thread Saeed Mahameed
Hi Dave,

This series contains updates to mlx5 core and mlx5e netdev drivers.
The main highlight of this series is the RX optimizations for striding RQ path,
introduced by Tariq.

For more information please see tag log below.

Please pull and let me know if there's any problem.

Thanks,
Saeed.

---

The following changes since commit c0b6edef0bf0e33c12eaf80c676ff09def011518:

  tc-testing: Add newline when writing test case files (2018-03-30 14:22:51 
-0400)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux.git 
tags/mlx5-updates-2018-03-30

for you to fetch changes up to ab966d7e4ff988a48b3ad72e7abf903aa840afd1:

  net/mlx5e: RX, Recycle buffer of UMR WQEs (2018-03-30 16:55:07 -0700)


mlx5-updates-2018-03-30

This series contains updates to mlx5 core and mlx5e netdev drivers.
The main highlight of this series is the RX optimizations for striding RQ path,
introduced by Tariq.

The first four patches are trivial misc cleanups:
 - Spelling mistake fix
 - Dead code removal
 - Warning message improvements

RX optimizations for striding RQ:

1) RX refactoring, cleanups and micro optimizations
   - MTU calculation simplifications that obsolete some WQEs-to-packets
     translation functions and help delete ~60 LOC.
   - Do not busy-wait on a pending UMR completion.
   - post the new values of UMR WQE inline, instead of using a data pointer.
   - use pre-initialized structures to save calculations in datapath.

2) Use linear SKB in Striding RQ via "build_skb" (using a linear SKB has many
advantages):
- Saves a memcpy of the headers.
- No page-boundary checks in datapath.
- No filler CQEs.
- Significantly smaller CQ.
- SKB data continuously resides in linear part, and not split to
  small amount (linear part) and large amount (fragment).
  This saves datapath cycles in driver and improves utilization
  of SKB fragments in GRO.
- The fragments of a resulting GRO SKB follow the IP forwarding
  assumption of equal-size fragments.

implementation details:
HW writes the packets to the beginning of a stride,
i.e. does not keep headroom. To overcome this we make sure we can
extend backwards and use the last bytes of stride i-1.
Extra care is needed for stride 0 as it has no preceding stride.
We make sure headroom bytes are available by shifting the buffer
pointer passed to HW by headroom bytes.
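
A minimal sketch of the resulting linear-SKB build (names such as
head_offset, headroom and frag_size are illustrative, not the verbatim
driver code):

	void *va = page_address(di->page) + head_offset;
	struct sk_buff *skb = build_skb(va, frag_size);	/* no header memcpy */

	if (likely(skb)) {
		skb_reserve(skb, headroom);	/* HW wrote the packet at va + headroom */
		skb_put(skb, cqe_bcnt);		/* byte count from the CQE */
	}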

This configuration now becomes default, whenever capable.
Of course, this implies turning LRO off.

Performance testing:
ConnectX-5, single core, single RX ring, default MTU.

UDP packet rate, early drop in TC layer:


--------------------------------------------
| pkt size | before    | after     | ratio |
--------------------------------------------
| 1500byte | 4.65 Mpps | 5.96 Mpps | 1.28x |
|  500byte | 5.23 Mpps | 5.97 Mpps | 1.14x |
|   64byte | 5.94 Mpps | 5.96 Mpps | 1.00x |
--------------------------------------------


TCP streams: ~20% gain

3) Support XDP over Striding RQ:
Now that linear SKB is supported over Striding RQ,
we can support XDP by setting stride size to PAGE_SIZE
and headroom to XDP_PACKET_HEADROOM.

Striding RQ is capable of a higher packet-rate than
conventional RQ.
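
Since each stride is now a linear PAGE_SIZE buffer with XDP headroom,
the program can run directly on the stride; roughly (illustrative, not
the driver's exact code):

	struct xdp_buff xdp;

	xdp.data_hard_start = va;                       /* stride start */
	xdp.data            = va + XDP_PACKET_HEADROOM;
	xdp.data_end        = xdp.data + cqe_bcnt;
	act = bpf_prog_run_xdp(prog, &xdp);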

Performance testing:
ConnectX-5, 24 rings, default MTU.
CQE compression ON (to reduce completions BW in PCI).

XDP_DROP packet rate:
--------------------------------------------------
| pkt size | XDP rate   | 100GbE linerate | pct% |
--------------------------------------------------
|   64byte | 126.2 Mpps |      148.0 Mpps |  85% |
|  128byte |  80.0 Mpps |       84.8 Mpps |  94% |
|  256byte |  42.7 Mpps |       42.7 Mpps | 100% |
|  512byte |  23.4 Mpps |       23.4 Mpps | 100% |
--------------------------------------------------

4) Remove mlx5 page_ref bulking in Striding RQ and use page_ref_inc only when 
needed.
   Without this bulking, we have:
- no atomic ops on WQE allocation or free
- one atomic op per SKB
- In the default MTU configuration (1500, stride size is 2K),
  the non-bulking method executes 2 atomic ops as before
- For larger MTUs with stride size of 4K, non-bulking method
  executes only a single op.
- For XDP (stride size of 4K, no SKBs), non-bulking has no atomic ops per 
packet at all.

Performance testing:
ConnectX-5, Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz.

Single core packet rate (64 bytes).

Early drop in TC: no degradation.

XDP_DROP:
before: 14,270,188 pps
after:  20,503,603 pps, 43% improvement.

Thanks,
saeed.


Alaa Hleihel (1):
  net/mlx5: Change teardown with force mode failure message to warning

Saeed Mahameed (2):
  net/mlx5e: Use eq ptr from cq
  net/mlx5: Eliminate query xsrq dead code


[net-next 04/15] net/mlx5e: IPoIB, Fix spelling mistake

2018-03-30 Thread Saeed Mahameed
From: Talat Batheesh 

Fix spelling mistake in debug message text.
"dettaching" -> "detaching"

Signed-off-by: Talat Batheesh 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c 
b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
index a35608faf8d2..4899de74e252 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
@@ -540,7 +540,7 @@ static int mlx5i_detach_mcast(struct net_device *netdev, 
struct ib_device *hca,
 
err = mlx5_core_detach_mcg(mdev, gid, ipriv->qp.qpn);
if (err)
-   mlx5_core_dbg(mdev, "failed dettaching QPN 0x%x, MGID %pI6\n",
+   mlx5_core_dbg(mdev, "failed detaching QPN 0x%x, MGID %pI6\n",
  ipriv->qp.qpn, gid->raw);
 
return err;
-- 
2.14.3



[net-next 01/15] net/mlx5e: Use eq ptr from cq

2018-03-30 Thread Saeed Mahameed
Instead of looking for the EQ of the CQ, remove that redundant code and
use the eq pointer stored in the cq struct.

Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 14 ++
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 1b48dec67abf..2aff4db9bdaa 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3728,21 +3728,11 @@ static netdev_features_t mlx5e_features_check(struct 
sk_buff *skb,
 static bool mlx5e_tx_timeout_eq_recover(struct net_device *dev,
struct mlx5e_txqsq *sq)
 {
-   struct mlx5e_priv *priv = netdev_priv(dev);
-   struct mlx5_core_dev *mdev = priv->mdev;
-   int irqn_not_used, eqn;
-   struct mlx5_eq *eq;
+   struct mlx5_eq *eq = sq->cq.mcq.eq;
u32 eqe_count;
 
-   if (mlx5_vector2eqn(mdev, sq->cq.mcq.vector, &eqn, &irqn_not_used))
-   return false;
-
-   eq = mlx5_eqn2eq(mdev, eqn);
-   if (IS_ERR(eq))
-   return false;
-
netdev_err(dev, "EQ 0x%x: Cons = 0x%x, irqn = 0x%x\n",
-  eqn, eq->cons_index, eq->irqn);
+  eq->eqn, eq->cons_index, eq->irqn);
 
eqe_count = mlx5_eq_poll_irq_disabled(eq);
if (!eqe_count)
-- 
2.14.3



[net-next 03/15] net/mlx5: Change teardown with force mode failure message to warning

2018-03-30 Thread Saeed Mahameed
From: Alaa Hleihel 

With ConnectX-4, we expect the force teardown to fail in case that
DC was enabled, therefore change the message from error to warning.

Signed-off-by: Alaa Hleihel 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/fw.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c 
b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
index d7bb10ab2173..70066975f1b5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
@@ -245,7 +245,7 @@ int mlx5_cmd_force_teardown_hca(struct mlx5_core_dev *dev)
 
force_state = MLX5_GET(teardown_hca_out, out, force_state);
if (force_state == MLX5_TEARDOWN_HCA_OUT_FORCE_STATE_FAIL) {
-   mlx5_core_err(dev, "teardown with force mode failed\n");
+   mlx5_core_warn(dev, "teardown with force mode failed, doing normal teardown\n");
return -EIO;
}
 
-- 
2.14.3



[net-next 07/15] net/mlx5e: Code movements in RX UMR WQE post

2018-03-30 Thread Saeed Mahameed
From: Tariq Toukan 

Gather the whole process of posting a UMR WQE into one function,
in preparation for a downstream patch that inlines
the WQE data.
No functional change here.

Signed-off-by: Tariq Toukan 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 107 ++--
 1 file changed, 45 insertions(+), 62 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index c0d528f2131b..8aa94d3cff59 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -347,39 +347,44 @@ mlx5e_copy_skb_header_mpwqe(struct device *pdev,
}
 }
 
-static inline void mlx5e_post_umr_wqe(struct mlx5e_rq *rq, u16 ix)
+void mlx5e_free_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi)
 {
-   struct mlx5e_mpw_info *wi = &rq->mpwqe.info[ix];
-   struct mlx5e_icosq *sq = &rq->channel->icosq;
-   struct mlx5_wq_cyc *wq = &sq->wq;
-   struct mlx5e_umr_wqe *wqe;
-   u8 num_wqebbs = DIV_ROUND_UP(sizeof(*wqe), MLX5_SEND_WQE_BB);
-   u16 pi;
+   int pg_strides = mlx5e_mpwqe_strides_per_page(rq);
+   struct mlx5e_dma_info *dma_info = &wi->umr.dma_info[0];
+   int i;
 
-   /* fill sq edge with nops to avoid wqe wrap around */
-   while ((pi = (sq->pc & wq->sz_m1)) > sq->edge) {
-   sq->db.ico_wqe[pi].opcode = MLX5_OPCODE_NOP;
-   mlx5e_post_nop(wq, sq->sqn, &sq->pc);
+   for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++, dma_info++) {
+   page_ref_sub(dma_info->page, pg_strides - wi->skbs_frags[i]);
+   mlx5e_page_release(rq, dma_info, true);
}
+}
 
-   wqe = mlx5_wq_cyc_get_wqe(wq, pi);
-   memcpy(wqe, &wi->umr.wqe, sizeof(*wqe));
-   wqe->ctrl.opmod_idx_opcode =
-   cpu_to_be32((sq->pc << MLX5_WQE_CTRL_WQE_INDEX_SHIFT) |
-   MLX5_OPCODE_UMR);
+static void mlx5e_post_rx_mpwqe(struct mlx5e_rq *rq)
+{
+   struct mlx5_wq_ll *wq = &rq->wq;
+   struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(wq, wq->head);
 
-   sq->db.ico_wqe[pi].opcode = MLX5_OPCODE_UMR;
-   sq->pc += num_wqebbs;
-   mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, &wqe->ctrl);
+   rq->mpwqe.umr_in_progress = false;
+
+   mlx5_wq_ll_push(wq, be16_to_cpu(wqe->next.next_wqe_index));
+
+   /* ensure wqes are visible to device before updating doorbell record */
+   dma_wmb();
+
+   mlx5_wq_ll_update_db_record(wq);
 }
 
-static int mlx5e_alloc_rx_umr_mpwqe(struct mlx5e_rq *rq,
-   u16 ix)
+static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 {
	struct mlx5e_mpw_info *wi = &rq->mpwqe.info[ix];
int pg_strides = mlx5e_mpwqe_strides_per_page(rq);
	struct mlx5e_dma_info *dma_info = &wi->umr.dma_info[0];
+   struct mlx5e_icosq *sq = &rq->channel->icosq;
+   struct mlx5_wq_cyc *wq = &sq->wq;
+   struct mlx5e_umr_wqe *wqe;
+   u8 num_wqebbs = DIV_ROUND_UP(sizeof(*wqe), MLX5_SEND_WQE_BB);
int err;
+   u16 pi;
int i;
 
for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++, dma_info++) {
@@ -393,6 +398,24 @@ static int mlx5e_alloc_rx_umr_mpwqe(struct mlx5e_rq *rq,
memset(wi->skbs_frags, 0, sizeof(*wi->skbs_frags) * 
MLX5_MPWRQ_PAGES_PER_WQE);
wi->consumed_strides = 0;
 
+   rq->mpwqe.umr_in_progress = true;
+
+   /* fill sq edge with nops to avoid wqe wrap around */
+   while ((pi = (sq->pc & wq->sz_m1)) > sq->edge) {
+   sq->db.ico_wqe[pi].opcode = MLX5_OPCODE_NOP;
+   mlx5e_post_nop(wq, sq->sqn, &sq->pc);
+   }
+
+   wqe = mlx5_wq_cyc_get_wqe(wq, pi);
+   memcpy(wqe, &wi->umr.wqe, sizeof(*wqe));
+   wqe->ctrl.opmod_idx_opcode =
+   cpu_to_be32((sq->pc << MLX5_WQE_CTRL_WQE_INDEX_SHIFT) |
+   MLX5_OPCODE_UMR);
+
+   sq->db.ico_wqe[pi].opcode = MLX5_OPCODE_UMR;
+   sq->pc += num_wqebbs;
+   mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, &wqe->ctrl);
+
return 0;
 
 err_unmap:
@@ -401,51 +424,11 @@ static int mlx5e_alloc_rx_umr_mpwqe(struct mlx5e_rq *rq,
page_ref_sub(dma_info->page, pg_strides);
mlx5e_page_release(rq, dma_info, true);
}
+   rq->stats.buff_alloc_err++;
 
return err;
 }
 
-void mlx5e_free_rx_mpwqe(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi)
-{
-   int pg_strides = mlx5e_mpwqe_strides_per_page(rq);
-   struct mlx5e_dma_info *dma_info = &wi->umr.dma_info[0];
-   int i;
-
-   for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++, dma_info++) {
-   page_ref_sub(dma_info->page, pg_strides - wi->skbs_frags[i]);
-   mlx5e_page_release(rq, dma_info, true);
-   }
-}
-
-static void mlx5e_post_rx_mpwqe(struct mlx5e_rq *rq)
-{
-   struct mlx5_wq_ll *wq = &rq->wq;
-   struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(wq, wq->head);
-
-

[net-next 08/15] net/mlx5e: Do not busy-wait for UMR completion in Striding RQ

2018-03-30 Thread Saeed Mahameed
From: Tariq Toukan 

Do not busy-wait on a pending UMR completion. Under high HW load,
busy-waiting on a delayed completion would fully utilize the CPU core
and mistakenly indicate a SW bottleneck.
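
The change itself is a one-line return-value flip; its effect on the
NAPI poll loop, roughly sketched (illustrative):

	bool mlx5e_post_rx_mpwqes(struct mlx5e_rq *rq)	/* NAPI poll path */
	{
		...
		if (!rq->mpwqe.umr_in_progress)
			mlx5e_alloc_rx_mpwqe(rq, wq->head);

		return false;	/* was true: reported "still busy", so the
				 * poll loop spun until the UMR completed */
	}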

Signed-off-by: Tariq Toukan 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 8aa94d3cff59..8eb9e7e89b09 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -527,7 +527,7 @@ bool mlx5e_post_rx_mpwqes(struct mlx5e_rq *rq)
if (!rq->mpwqe.umr_in_progress)
mlx5e_alloc_rx_mpwqe(rq, wq->head);
 
-   return true;
+   return false;
 }
 
 static void mlx5e_lro_update_tcp_hdr(struct mlx5_cqe64 *cqe, struct tcphdr 
*tcp)
-- 
2.14.3



[net-next 15/15] net/mlx5e: RX, Recycle buffer of UMR WQEs

2018-03-30 Thread Saeed Mahameed
From: Tariq Toukan 

Upon a new UMR post, check if the WQE buffer contains
a previous UMR WQE. If so, modify the dynamic fields
instead of overwriting the whole WQE. This saves a memcpy.

In current setting, after 2 WQ cycles (12 UMR posts),
this will always be the case.

No degradation sensed.

Signed-off-by: Tariq Toukan 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 682f9ff9da34..176645762e49 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -365,6 +365,11 @@ static void mlx5e_post_rx_mpwqe(struct mlx5e_rq *rq)
mlx5_wq_ll_update_db_record(wq);
 }
 
+static inline u16 mlx5e_icosq_wrap_cnt(struct mlx5e_icosq *sq)
+{
+   return sq->pc >> MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE;
+}
+
 static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
 {
	struct mlx5e_mpw_info *wi = &rq->mpwqe.info[ix];
@@ -372,7 +377,6 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 ix)
	struct mlx5e_icosq *sq = &rq->channel->icosq;
	struct mlx5_wq_cyc *wq = &sq->wq;
struct mlx5e_umr_wqe *umr_wqe;
-   int cpy = offsetof(struct mlx5e_umr_wqe, inline_mtts);
u16 xlt_offset = ix << (MLX5E_LOG_ALIGNED_MPWQE_PPW - 1);
int err;
u16 pi;
@@ -385,7 +389,10 @@ static int mlx5e_alloc_rx_mpwqe(struct mlx5e_rq *rq, u16 
ix)
}
 
umr_wqe = mlx5_wq_cyc_get_wqe(wq, pi);
-   memcpy(umr_wqe, &rq->mpwqe.umr_wqe, cpy);
+   if (unlikely(mlx5e_icosq_wrap_cnt(sq) < 2))
+   memcpy(umr_wqe, &rq->mpwqe.umr_wqe,
+  offsetof(struct mlx5e_umr_wqe, inline_mtts));
+
for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++, dma_info++) {
err = mlx5e_page_alloc_mapped(rq, dma_info);
if (unlikely(err))
-- 
2.14.3



[net-next 14/15] net/mlx5e: Keep single pre-initialized UMR WQE per RQ

2018-03-30 Thread Saeed Mahameed
From: Tariq Toukan 

All UMR WQEs of an RQ share many common fields. We use
pre-initialized structures to save calculations in datapath.
One field (xlt_offset) was the only reason we saved a pre-initialized
copy per WQE index.
Here we remove its initialization (move its calculation to datapath),
and reduce the number of copies to one-per-RQ.

A very small datapath calculation is added; it occurs once per MPWQE
(i.e. once every 256KB), but reduces memory consumption and gives
better cache utilization.
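
The added per-MPWQE calculation, roughly sketched (illustrative; field
names follow the mlx5 UMR WQE layout):

	/* derive the only index-dependent field at post time */
	u16 xlt_offset = ix << (MLX5E_LOG_ALIGNED_MPWQE_PPW - 1);

	umr_wqe->uctrl.xlt_offset = cpu_to_be16(xlt_offset);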

Performance testing:
Tested packet rate, no degradation sensed.

Signed-off-by: Tariq Toukan 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h  |  8 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 20 
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c   |  4 +++-
 3 files changed, 13 insertions(+), 19 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 5853de4e4fc7..30cad07be2b5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -84,6 +84,7 @@
 
 #define MLX5_MTT_OCTW(npages) (ALIGN(npages, 8) / 2)
 #define MLX5E_REQUIRED_WQE_MTTS		(ALIGN(MLX5_MPWRQ_PAGES_PER_WQE, 8))
+#define MLX5E_LOG_ALIGNED_MPWQE_PPW(ilog2(MLX5E_REQUIRED_WQE_MTTS))
 #define MLX5E_REQUIRED_MTTS(wqes)  (wqes * MLX5E_REQUIRED_WQE_MTTS)
 #define MLX5E_MAX_RQ_NUM_MTTS  \
((1 << 16) * 2) /* So that MLX5_MTT_OCTW(num_mtts) fits into u16 */
@@ -450,7 +451,6 @@ struct mlx5e_wqe_frag_info {
 
 struct mlx5e_umr_dma_info {
struct mlx5e_dma_info  dma_info[MLX5_MPWRQ_PAGES_PER_WQE];
-   struct mlx5e_umr_wqe   wqe;
 };
 
 struct mlx5e_mpw_info {
@@ -496,6 +496,7 @@ struct mlx5e_rq {
};
} wqe;
struct {
+   struct mlx5e_umr_wqe   umr_wqe;
struct mlx5e_mpw_info *info;
mlx5e_fp_skb_from_cqe_mpwrq skb_from_cqe_mpwrq;
u16num_strides;
@@ -978,11 +979,6 @@ static inline void mlx5e_cq_arm(struct mlx5e_cq *cq)
mlx5_cq_arm(mcq, MLX5_CQ_DB_REQ_NOT, mcq->uar->map, cq->wq.cc);
 }
 
-static inline u32 mlx5e_get_wqe_mtt_offset(struct mlx5e_rq *rq, u16 wqe_ix)
-{
-   return wqe_ix * ALIGN(MLX5_MPWRQ_PAGES_PER_WQE, 8);
-}
-
 extern const struct ethtool_ops mlx5e_ethtool_ops;
 #ifdef CONFIG_MLX5_CORE_EN_DCB
 extern const struct dcbnl_rtnl_ops mlx5e_dcbnl_ops;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index b03a2327356a..0339609cfa56 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -306,13 +306,11 @@ static void mlx5e_disable_async_events(struct mlx5e_priv 
*priv)
 
 static inline void mlx5e_build_umr_wqe(struct mlx5e_rq *rq,
   struct mlx5e_icosq *sq,
-  struct mlx5e_umr_wqe *wqe,
-  u16 ix)
+  struct mlx5e_umr_wqe *wqe)
 {
	struct mlx5_wqe_ctrl_seg  *cseg = &wqe->ctrl;
	struct mlx5_wqe_umr_ctrl_seg *ucseg = &wqe->uctrl;
u8 ds_cnt = DIV_ROUND_UP(MLX5E_UMR_WQE_INLINE_SZ, MLX5_SEND_WQE_DS);
-   u32 umr_wqe_mtt_offset = mlx5e_get_wqe_mtt_offset(rq, ix);
 
cseg->qpn_ds= cpu_to_be32((sq->sqn << MLX5_WQE_CTRL_QPN_SHIFT) |
  ds_cnt);
@@ -322,8 +320,6 @@ static inline void mlx5e_build_umr_wqe(struct mlx5e_rq *rq,
ucseg->flags = MLX5_UMR_TRANSLATION_OFFSET_EN | MLX5_UMR_INLINE;
ucseg->xlt_octowords =
cpu_to_be16(MLX5_MTT_OCTW(MLX5_MPWRQ_PAGES_PER_WQE));
-   ucseg->bsf_octowords =
-   cpu_to_be16(MLX5_MTT_OCTW(umr_wqe_mtt_offset));
ucseg->mkey_mask = cpu_to_be64(MLX5_MKEY_MASK_FREE);
 }
 
@@ -331,18 +327,13 @@ static int mlx5e_rq_alloc_mpwqe_info(struct mlx5e_rq *rq,
 struct mlx5e_channel *c)
 {
	int wq_sz = mlx5_wq_ll_get_size(&rq->wq);
-   int i;
 
rq->mpwqe.info = kzalloc_node(wq_sz * sizeof(*rq->mpwqe.info),
  GFP_KERNEL, cpu_to_node(c->cpu));
if (!rq->mpwqe.info)
return -ENOMEM;
 
-   for (i = 0; i < wq_sz; i++) {
-   struct mlx5e_mpw_info *wi = &rq->mpwqe.info[i];
-
-   mlx5e_build_umr_wqe(rq, &c->icosq, &wi->umr.wqe, i);
-   }
+   mlx5e_build_umr_wqe(rq, &c->icosq, &rq->mpwqe.umr_wqe);
 
return 0;
 }
@@ -388,6 +379,11 @@ static int mlx5e_create_rq_umr_mkey(struct mlx5_core_dev 
*mdev, struct mlx5e_rq
	return mlx5e_create_umr_mkey(mdev, num_mtts, PAGE_SHIFT, &rq->umr_mkey);
 }
 
+static inline u64 mlx5e_get_mpwqe_offset(struct mlx5e_rq *rq, u16 

[net-next 11/15] net/mlx5e: Refactor RQ XDP_TX indication

2018-03-30 Thread Saeed Mahameed
From: Tariq Toukan 

Make the xdp_xmit indication available for Striding RQ
by taking it out of the type-specific union.
This refactor is a preparation for a downstream patch that
adds XDP support over Striding RQ.
In addition, use a bitmap instead of a boolean for possible
future flags.

Signed-off-by: Tariq Toukan 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h| 6 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 8 +++-
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index d26dd4bc89f4..a6ca54393bb6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -479,6 +479,10 @@ typedef struct sk_buff *
 typedef bool (*mlx5e_fp_post_rx_wqes)(struct mlx5e_rq *rq);
 typedef void (*mlx5e_fp_dealloc_wqe)(struct mlx5e_rq*, u16);
 
+enum mlx5e_rq_flag {
+   MLX5E_RQ_FLAG_XDP_XMIT = BIT(0),
+};
+
 struct mlx5e_rq {
/* data path */
struct mlx5_wq_ll  wq;
@@ -489,7 +493,6 @@ struct mlx5e_rq {
u32 frag_sz;/* max possible skb frag_sz */
union {
bool page_reuse;
-   bool xdp_xmit;
};
} wqe;
struct {
@@ -528,6 +531,7 @@ struct mlx5e_rq {
struct bpf_prog   *xdp_prog;
unsigned int   hw_mtu;
struct mlx5e_xdpsq xdpsq;
+   DECLARE_BITMAP(flags, 8);
 
/* control */
struct mlx5_wq_ctrlwq_ctrl;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 07db8a58d0a2..a827571deb85 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -788,7 +788,7 @@ static inline bool mlx5e_xmit_xdp_frame(struct mlx5e_rq *rq,
/* move page to reference to sq responsibility,
 * and mark so it's not put back in page-cache.
 */
-   rq->wqe.xdp_xmit = true;
+   __set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags); /* non-atomic */
sq->db.di[pi] = *di;
sq->pc++;
 
@@ -913,9 +913,8 @@ void mlx5e_handle_rx_cqe(struct mlx5e_rq *rq, struct 
mlx5_cqe64 *cqe)
skb = skb_from_cqe(rq, cqe, wi, cqe_bcnt);
if (!skb) {
/* probably for XDP */
-   if (rq->wqe.xdp_xmit) {
+   if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) {
wi->di.page = NULL;
-   rq->wqe.xdp_xmit = false;
			/* do not return page to cache, it will be returned on XDP_TX completion */
goto wq_ll_pop;
}
@@ -955,9 +954,8 @@ void mlx5e_handle_rx_cqe_rep(struct mlx5e_rq *rq, struct 
mlx5_cqe64 *cqe)
 
skb = skb_from_cqe(rq, cqe, wi, cqe_bcnt);
if (!skb) {
-   if (rq->wqe.xdp_xmit) {
+   if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) {
wi->di.page = NULL;
-   rq->wqe.xdp_xmit = false;
			/* do not return page to cache, it will be returned on XDP_TX completion */
goto wq_ll_pop;
}
-- 
2.14.3



[net-next 05/15] net/mlx5e: Save MTU in channels params

2018-03-30 Thread Saeed Mahameed
From: Tariq Toukan 

Knowing the MTU is required for RQ creation flow.
By our design, channels creation flow is totally isolated
from priv/netdev, and can be completed with access to
channels params and mdev.
Adding the MTU to the channels params helps preserve that.
In addition, we save it in the RQ to make its access faster in
datapath checks.

Signed-off-by: Tariq Toukan 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h   | 10 ++--
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 58 --
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c   |  3 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c|  3 +-
 .../net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c  | 23 +
 5 files changed, 52 insertions(+), 45 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 353ac6daa3dc..823876bfd6ab 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -57,8 +57,8 @@
 
 #define MLX5E_ETH_HARD_MTU (ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN)
 
-#define MLX5E_HW2SW_MTU(priv, hwmtu) ((hwmtu) - ((priv)->hard_mtu))
-#define MLX5E_SW2HW_MTU(priv, swmtu) ((swmtu) + ((priv)->hard_mtu))
+#define MLX5E_HW2SW_MTU(params, hwmtu) ((hwmtu) - ((params)->hard_mtu))
+#define MLX5E_SW2HW_MTU(params, swmtu) ((swmtu) + ((params)->hard_mtu))
 
 #define MLX5E_MAX_DSCP  64
 #define MLX5E_MAX_NUM_TC   8
@@ -251,6 +251,8 @@ struct mlx5e_params {
u32 lro_timeout;
u32 pflags;
struct bpf_prog *xdp_prog;
+   unsigned int sw_mtu;
+   int hard_mtu;
 };
 
 #ifdef CONFIG_MLX5_CORE_EN_DCB
@@ -534,6 +536,7 @@ struct mlx5e_rq {
 
/* XDP */
struct bpf_prog   *xdp_prog;
+   unsigned int   hw_mtu;
struct mlx5e_xdpsq xdpsq;
 
/* control */
@@ -767,7 +770,6 @@ struct mlx5e_priv {
struct mlx5e_tir   inner_indir_tir[MLX5E_NUM_INDIR_TIRS];
struct mlx5e_tir   direct_tir[MLX5E_MAX_NUM_CHANNELS];
u32tx_rates[MLX5E_MAX_NUM_SQS];
-   inthard_mtu;
 
struct mlx5e_flow_steering fs;
struct mlx5e_vxlan_db  vxlan;
@@ -1111,7 +1113,7 @@ void mlx5e_detach_netdev(struct mlx5e_priv *priv);
 void mlx5e_destroy_netdev(struct mlx5e_priv *priv);
 void mlx5e_build_nic_params(struct mlx5_core_dev *mdev,
struct mlx5e_params *params,
-   u16 max_channels);
+   u16 max_channels, u16 mtu);
 u8 mlx5e_params_calculate_tx_min_inline(struct mlx5_core_dev *mdev);
 void mlx5e_rx_dim_work(struct work_struct *work);
 #endif /* __MLX5_EN_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 2aff4db9bdaa..af345323b2ce 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -419,6 +419,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
rq->channel = c;
rq->ix  = c->ix;
rq->mdev= mdev;
+   rq->hw_mtu  = MLX5E_SW2HW_MTU(params, params->sw_mtu);
 
rq->xdp_prog = params->xdp_prog ? bpf_prog_inc(params->xdp_prog) : NULL;
if (IS_ERR(rq->xdp_prog)) {
@@ -494,7 +495,7 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 
byte_count = params->lro_en  ?
params->lro_wqe_sz :
-   MLX5E_SW2HW_MTU(c->priv, c->netdev->mtu);
+   MLX5E_SW2HW_MTU(params, params->sw_mtu);
 #ifdef CONFIG_MLX5_EN_IPSEC
if (MLX5_IPSEC_DEV(mdev))
byte_count += MLX5E_METADATA_ETHER_LEN;
@@ -2498,10 +2499,10 @@ static void mlx5e_build_inner_indir_tir_ctx(struct 
mlx5e_priv *priv,
	mlx5e_build_indir_tir_ctx_hash(&priv->channels.params, tt, tirc, true);
 }
 
-static int mlx5e_set_mtu(struct mlx5e_priv *priv, u16 mtu)
+static int mlx5e_set_mtu(struct mlx5_core_dev *mdev,
+struct mlx5e_params *params, u16 mtu)
 {
-   struct mlx5_core_dev *mdev = priv->mdev;
-   u16 hw_mtu = MLX5E_SW2HW_MTU(priv, mtu);
+   u16 hw_mtu = MLX5E_SW2HW_MTU(params, mtu);
int err;
 
err = mlx5_set_port_mtu(mdev, hw_mtu, 1);
@@ -2513,9 +2514,9 @@ static int mlx5e_set_mtu(struct mlx5e_priv *priv, u16 mtu)
return 0;
 }
 
-static void mlx5e_query_mtu(struct mlx5e_priv *priv, u16 *mtu)
+static void mlx5e_query_mtu(struct mlx5_core_dev *mdev,
+   struct mlx5e_params *params, u16 *mtu)
 {
-   struct mlx5_core_dev *mdev = priv->mdev;
u16 hw_mtu = 0;
int err;
 
@@ -2523,25 +2524,27 @@ static void mlx5e_query_mtu(struct mlx5e_priv *priv, 
u16 *mtu)
if (err || !hw_mtu) /* fallback to port oper mtu */

[net-next 13/15] net/mlx5e: Remove page_ref bulking in Striding RQ

2018-03-30 Thread Saeed Mahameed
From: Tariq Toukan 

When many packets reside on the same page, the bulking of
page_ref modifications reduces the total number of atomic
operations executed.

Besides the necessary 2 operations on page alloc/free, we
have the following extra ops per page:
- one on WQE allocation (bump refcnt to maximum possible),
- zero ops for SKBs,
- one on WQE free,
a constant of two operations in total, no matter how many
packets/SKBs actually populate the page.

Without this bulking, we have:
- no ops on WQE allocation or free,
- one op per SKB,

Comparing the two methods when PAGE_SIZE is 4K:
- As mentioned above, bulking method always executes 2 operations,
  not more, but not less.
- In the default MTU configuration (1500, stride size is 2K),
  the non-bulking method executes 2 ops as well.
- For larger MTUs with stride size of 4K, non-bulking method
  executes only a single op.
- For XDP (stride size of 4K, no SKBs), non-bulking method
  executes no ops at all!

Hence, to optimize the flows with linear SKB and XDP over Striding RQ,
we here remove the page_ref bulking method.
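
Compressed into a sketch (illustrative; strides_per_page stands for the
per-page stride count):

	/* bulking (removed): two atomic ops per page, always */
	page_ref_add(page, strides_per_page);			/* WQE alloc */
	wi->skbs_frags[i]++;					/* per SKB: not atomic */
	page_ref_sub(page, strides_per_page - wi->skbs_frags[i]);	/* WQE free */

	/* non-bulking (this patch): one atomic op per SKB, none otherwise */
	page_ref_inc(page);					/* per SKB fragment */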

Performance testing:
ConnectX-5, Intel(R) Xeon(R) CPU E5-2680 v3 @ 2.50GHz.

Single core packet rate (64 bytes).

Early drop in TC: no degradation.

XDP_DROP:
before: 14,270,188 pps
after:  20,503,603 pps, 43% improvement.

Signed-off-by: Tariq Toukan 
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h|  1 -
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 47 +
 2 files changed, 16 insertions(+), 32 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 7997d7c159db..5853de4e4fc7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -456,7 +456,6 @@ struct mlx5e_umr_dma_info {
 struct mlx5e_mpw_info {
struct mlx5e_umr_dma_info umr;
u16 consumed_strides;
-   u16 skbs_frags[MLX5_MPWRQ_PAGES_PER_WQE];
DECLARE_BITMAP(xdp_xmit_bitmap, MLX5_MPWRQ_PAGES_PER_WQE);
 };
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 1da79cab1838..9bb47a6d40f1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -296,37 +296,28 @@ void mlx5e_dealloc_rx_wqe(struct mlx5e_rq *rq, u16 ix)
mlx5e_free_rx_wqe(rq, wi);
 }
 
-static inline int mlx5e_mpwqe_strides_per_page(struct mlx5e_rq *rq)
-{
-   return rq->mpwqe.num_strides >> MLX5_MPWRQ_WQE_PAGE_ORDER;
-}
-
 static inline void mlx5e_add_skb_frag_mpwqe(struct mlx5e_rq *rq,
struct sk_buff *skb,
-   struct mlx5e_mpw_info *wi,
-   u32 page_idx, u32 frag_offset,
-   u32 len)
+   struct mlx5e_dma_info *di,
+   u32 frag_offset, u32 len)
 {
unsigned int truesize = ALIGN(len, BIT(rq->mpwqe.log_stride_sz));
 
dma_sync_single_for_cpu(rq->pdev,
-   wi->umr.dma_info[page_idx].addr + frag_offset,
+   di->addr + frag_offset,
len, DMA_FROM_DEVICE);
-   wi->skbs_frags[page_idx]++;
+   page_ref_inc(di->page);
skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
-   wi->umr.dma_info[page_idx].page, frag_offset,
-   len, truesize);
+   di->page, frag_offset, len, truesize);
 }
 
 static inline void
 mlx5e_copy_skb_header_mpwqe(struct device *pdev,
struct sk_buff *skb,
-   struct mlx5e_mpw_info *wi,
-   u32 page_idx, u32 offset,
-   u32 headlen)
+   struct mlx5e_dma_info *dma_info,
+   u32 offset, u32 headlen)
 {
u16 headlen_pg = min_t(u32, headlen, PAGE_SIZE - offset);
-   struct mlx5e_dma_info *dma_info = &wi->umr.dma_info[page_idx];
unsigned int len;
 
 /* Aligning len to sizeof(long) optimizes memcpy performance */
@@ -351,15 +342,12 @@ void mlx5e_free_rx_mpwqe(struct mlx5e_rq *rq, struct 
mlx5e_mpw_info *wi)
 {
const bool no_xdp_xmit =
bitmap_empty(wi->xdp_xmit_bitmap, MLX5_MPWRQ_PAGES_PER_WQE);
-   int pg_strides = mlx5e_mpwqe_strides_per_page(rq);
struct mlx5e_dma_info *dma_info = wi->umr.dma_info;
int i;
 
-   for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) {
-   page_ref_sub(dma_info[i].page, pg_strides - wi->skbs_frags[i]);
+   for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++)
if (no_xdp_xmit || !test_bit(i, wi->xdp_xmit_bitmap))
  

Re: [net-next V7 PATCH 14/16] mlx5: use page_pool for xdp_return_frame call

2018-03-30 Thread Saeed Mahameed
On Thu, 2018-03-29 at 19:02 +0200, Jesper Dangaard Brouer wrote:
> This patch shows how it is possible to have both the driver local
> page
> cache, which uses elevated refcnt for "catching"/avoiding SKB
> put_page returns the page through the page allocator.  And at the
> same time, have pages getting returned to the page_pool from
> ndp_xdp_xmit DMA completion.
> 
> The performance improvement for XDP_REDIRECT in this patch is really
> good.  Especially considering that (currently) the xdp_return_frame
> API and page_pool_put_page() does per frame operations of both
> rhashtable ID-lookup and locked return into (page_pool) ptr_ring.
> (It is the plan to remove these per frame operation in a followup
> patchset).
> 
> The benchmark performed was RX on mlx5 and XDP_REDIRECT out ixgbe,
> with xdp_redirect_map (using devmap) . And the target/maximum
> capability of ixgbe is 13Mpps (on this HW setup).
> 
> Before this patch for mlx5, XDP redirected frames were returned via
> the page allocator.  The single flow performance was 6Mpps, and if I
> started two flows the collective performance dropped to 4Mpps, because
> we
> hit the page allocator lock (further negative scaling occurs).
> 
> Two test scenarios need to be covered, for xdp_return_frame API,
> which
> is DMA-TX completion running on same-CPU or cross-CPU free/return.
> Results were same-CPU=10Mpps, and cross-CPU=12Mpps.  This is very
> close to our 13Mpps max target.
> 
> The reason max target isn't reached in cross-CPU test, is likely due
> to RX-ring DMA unmap/map overhead (which doesn't occur in ixgbe to
> ixgbe testing).  It is also planned to remove this unnecessary DMA
> unmap in a later patchset
> 
> V2: Adjustments requested by Tariq
>  - Changed page_pool_create return codes not return NULL, only
>ERR_PTR, as this simplifies err handling in drivers.
>  - Save a branch in mlx5e_page_release
>  - Correct page_pool size calc for
> MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ
> 
> V5: Updated patch desc
> 
> Signed-off-by: Jesper Dangaard Brouer 
> Reviewed-by: Tariq Toukan 

Acked-by: Saeed Mahameed 


[next-queue PATCH] igb: Fix the transmission mode of queue 0 for Qav mode

2018-03-30 Thread Vinicius Costa Gomes
When Qav mode is enabled, queue 0 should be kept on Stream Reservation
mode. From the i210 datasheet, section 8.12.19:

"Note: Queue0 QueueMode must be set to 1b when TransmitMode is set to
Qav." ("QueueMode 1b" represents the Stream Reservation mode)

The solution is to give queue 0 all the credits it might need, so
it has priority over queue 1.

A situation where this can happen is when cbs is "installed" only on
queue 1, leaving queue 0 alone. For example:

$ tc qdisc replace dev enp2s0 handle 100: parent root mqprio num_tc 3 \
   map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 queues 1@0 1@1 2@2 hw 0

$ tc qdisc replace dev enp2s0 parent 100:2 cbs locredit -1470 \
   hicredit 30 sendslope -98 idleslope 2 offload 1

Signed-off-by: Vinicius Costa Gomes 
---
 drivers/net/ethernet/intel/igb/igb_main.c | 17 -
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c 
b/drivers/net/ethernet/intel/igb/igb_main.c
index c1c0bc30a16d..cce7ada89255 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -1700,7 +1700,22 @@ static void igb_configure_cbs(struct igb_adapter 
*adapter, int queue,
WARN_ON(hw->mac.type != e1000_i210);
WARN_ON(queue < 0 || queue > 1);
 
-   if (enable) {
+   if (enable || queue == 0) {
+   /* i210 does not allow the queue 0 to be in the Strict
+* Priority mode while the Qav mode is enabled, so,
+* instead of disabling strict priority mode, we give
+* queue 0 the maximum of credits possible.
+*
+* See section 8.12.19 of the i210 datasheet, "Note:
+* Queue0 QueueMode must be set to 1b when
+* TransmitMode is set to Qav."
+*/
+   if (queue == 0 && !enable) {
+   /* max "linkspeed" idleslope in kbps */
+   idleslope = 100;
+   hicredit = ETH_FRAME_LEN;
+   }
+
set_tx_desc_fetch_prio(hw, queue, TX_QUEUE_PRIO_HIGH);
set_queue_mode(hw, queue, QUEUE_MODE_STREAM_RESERVATION);
 
-- 
2.16.3



Re: [net-next V7 PATCH 10/16] mlx5: register a memory model when XDP is enabled

2018-03-30 Thread Saeed Mahameed
On Thu, 2018-03-29 at 19:01 +0200, Jesper Dangaard Brouer wrote:
> Now all the users of ndo_xdp_xmit have been converted to use
> xdp_return_frame.
> This enable a different memory model, thus activating another code
> path
> in the xdp_return_frame API.
> 
> V2: Fixed issues pointed out by Tariq.
> 
> Signed-off-by: Jesper Dangaard Brouer 
> Reviewed-by: Tariq Toukan 
> 

Acked-by: Saeed Mahameed 

Re: [net-next V7 PATCH 01/16] mlx5: basic XDP_REDIRECT forward support

2018-03-30 Thread Saeed Mahameed
On Thu, 2018-03-29 at 19:01 +0200, Jesper Dangaard Brouer wrote:
> This implements basic XDP redirect support in mlx5 driver.
> 
> Notice that the ndo_xdp_xmit() is NOT implemented, because that API
> need some changes that this patchset is working towards.
> 
> The main purpose of this patch is have different drivers doing
> XDP_REDIRECT to show how different memory models behave in a cross
> driver world.
> 
> Update(pre-RFCv2 Tariq): Need to DMA unmap page before
> xdp_do_redirect,
> as the return API does not exist yet to keep this mapped.
> 
> Update(pre-RFCv3 Saeed): Don't mix XDP_TX and XDP_REDIRECT flushing,
> introduce xdpsq.db.redirect_flush boolean.
> 
> Signed-off-by: Jesper Dangaard Brouer 
> Reviewed-by: Tariq Toukan 

Acked-by: Saeed Mahameed 

Re: [PATCH bpf-next 01/10] bpf: btf: Introduce BPF Type Format (BTF)

2018-03-30 Thread Martin KaFai Lau
On Sat, Mar 31, 2018 at 01:22:53AM +0200, Daniel Borkmann wrote:
> On 03/30/2018 08:26 PM, Martin KaFai Lau wrote:
> [...]
> > +static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t)
> > +{
> > +   struct btf *btf = env->btf;
> > +
> > +   /* < 2 because +1 for btf_void which is always in btf->types[0].
> > +* btf_void is not accounted in btf->nr_types because btf_void
> > +* does not come from the BTF file.
> > +*/
> > +   if (btf->types_size - btf->nr_types < 2) {
> > +   /* Expand 'types' array */
> > +
> > +   struct btf_type **new_types;
> > +   u32 expand_by, new_size;
> > +
> > +   if (btf->types_size == BTF_MAX_NR_TYPES) {
> > +   btf_verifier_log(env, "Exceeded max num of types");
> > +   return -E2BIG;
> > +   }
> > +
> > +   expand_by = max_t(u32, btf->types_size >> 2, 16);
> > +   new_size = min_t(u32, BTF_MAX_NR_TYPES,
> > +btf->types_size + expand_by);
> > +
> > +   new_types = kvzalloc(new_size * sizeof(*new_types),
> > +GFP_KERNEL | __GFP_NOWARN);
> > +   if (!new_types)
> > +   return -ENOMEM;
> > +
> > +   if (btf->nr_types == 0)
> > +   new_types[0] = &btf_void;
> > +   else
> > +   memcpy(new_types, btf->types,
> > +  sizeof(*btf->types) * (btf->nr_types + 1));
> > +
> > +   kfree(btf->types);
> > +   btf->types = new_types;
> 
> Haven't read through the whole series yet, but this type of pattern pops up
> immediately in several locations throughout multiple patches in this series.
> 
> Here, you'll free kv*alloc() backed memory into the wrong backend allocator,
> thus if it's vmalloc() backed, it cannot go into kmalloc() backed memory via
> kfree(), thus please audit the whole series on this.
Thanks. I will make the change in the next spin.

> 
> > +   btf->types_size = new_size;
> > +   }
> > +
> > +   btf->types[++(btf->nr_types)] = t;
> > +
> > +   return 0;
> > +}
> > +
> > +static void btf_free(struct btf *btf)
> > +{
> > +   kfree(btf->types);
> > +   kfree(btf->data);
> > +   kfree(btf);
> > +}
> > +
> > +static void btf_verifier_env_free(struct btf_verifier_env *env)
> > +{
> > +   kfree(env);
> > +}
> > +
> [...]
> > +   data = kvmalloc(btf_data_size, GFP_KERNEL | __GFP_NOWARN);
> > +   if (!data) {
> > +   err = -ENOMEM;
> > +   goto errout;
> > +   }
> > +
> > +   btf->data = data;
> > +   btf->data_size = btf_data_size;
> > +
> > +   if (copy_from_user(data, btf_data, btf_data_size)) {
> > +   err = -EFAULT;
> > +   goto errout;
> > +   }
> > +
> > +   env->btf = btf;
> > +
> > +   err = btf_parse_hdr(env);
> > +   if (err)
> > +   goto errout;
> > +
> > +   err = btf_parse_str_sec(env);
> > +   if (err)
> > +   goto errout;
> > +
> > +   err = btf_parse_type_sec(env);
> > +   if (err)
> > +   goto errout;
> > +
> > +   if (!err && log->level && bpf_verifier_log_full(log)) {
> > +   err = -ENOSPC;
> > +   goto errout;
> > +   }
> > +
> > +   if (!err) {
> > +   btf_verifier_env_free(env);
> > +   return btf;
> > +   }
> > +
> > +errout:
> > +   btf_verifier_env_free(env);
> > +   if (btf)
> > +   btf_free(btf);
> > +   return ERR_PTR(err);
> > +}


Re: [PATCH bpf-next 01/10] bpf: btf: Introduce BPF Type Format (BTF)

2018-03-30 Thread Daniel Borkmann
On 03/30/2018 08:26 PM, Martin KaFai Lau wrote:
[...]
> +static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t)
> +{
> + struct btf *btf = env->btf;
> +
> + /* < 2 because +1 for btf_void which is always in btf->types[0].
> +  * btf_void is not accounted in btf->nr_types because btf_void
> +  * does not come from the BTF file.
> +  */
> + if (btf->types_size - btf->nr_types < 2) {
> + /* Expand 'types' array */
> +
> + struct btf_type **new_types;
> + u32 expand_by, new_size;
> +
> + if (btf->types_size == BTF_MAX_NR_TYPES) {
> + btf_verifier_log(env, "Exceeded max num of types");
> + return -E2BIG;
> + }
> +
> + expand_by = max_t(u32, btf->types_size >> 2, 16);
> + new_size = min_t(u32, BTF_MAX_NR_TYPES,
> +  btf->types_size + expand_by);
> +
> + new_types = kvzalloc(new_size * sizeof(*new_types),
> +  GFP_KERNEL | __GFP_NOWARN);
> + if (!new_types)
> + return -ENOMEM;
> +
> + if (btf->nr_types == 0)
> + new_types[0] = &btf_void;
> + else
> + memcpy(new_types, btf->types,
> +sizeof(*btf->types) * (btf->nr_types + 1));
> +
> + kfree(btf->types);
> + btf->types = new_types;

Haven't read through the whole series yet, but this type of pattern pops up
immediately in several locations throughout multiple patches in this series.

Here, you'll free kv*alloc() backed memory into the wrong backend allocator,
thus if it's vmalloc() backed, it cannot go into kmalloc() backed memory via
kfree(), thus please audit the whole series on this.
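
A sketch of the requested change (assuming kvfree() is the intended
counterpart here, since it handles both kmalloc()- and vmalloc()-backed
memory):

	new_types = kvzalloc(new_size * sizeof(*new_types),
			     GFP_KERNEL | __GFP_NOWARN);
	...
	kvfree(btf->types);	/* not kfree(): may be vmalloc()-backed */
	btf->types = new_types;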

> + btf->types_size = new_size;
> + }
> +
> + btf->types[++(btf->nr_types)] = t;
> +
> + return 0;
> +}
> +
> +static void btf_free(struct btf *btf)
> +{
> + kfree(btf->types);
> + kfree(btf->data);
> + kfree(btf);
> +}
> +
> +static void btf_verifier_env_free(struct btf_verifier_env *env)
> +{
> + kfree(env);
> +}
> +
[...]
> + data = kvmalloc(btf_data_size, GFP_KERNEL | __GFP_NOWARN);
> + if (!data) {
> + err = -ENOMEM;
> + goto errout;
> + }
> +
> + btf->data = data;
> + btf->data_size = btf_data_size;
> +
> + if (copy_from_user(data, btf_data, btf_data_size)) {
> + err = -EFAULT;
> + goto errout;
> + }
> +
> + env->btf = btf;
> +
> + err = btf_parse_hdr(env);
> + if (err)
> + goto errout;
> +
> + err = btf_parse_str_sec(env);
> + if (err)
> + goto errout;
> +
> + err = btf_parse_type_sec(env);
> + if (err)
> + goto errout;
> +
> + if (!err && log->level && bpf_verifier_log_full(log)) {
> + err = -ENOSPC;
> + goto errout;
> + }
> +
> + if (!err) {
> + btf_verifier_env_free(env);
> + return btf;
> + }
> +
> +errout:
> + btf_verifier_env_free(env);
> + if (btf)
> + btf_free(btf);
> + return ERR_PTR(err);
> +}


Re: [PATCH net-next RFC 0/5] ipv6: sr: introduce seg6local End.BPF action

2018-03-30 Thread Alexei Starovoitov
On Fri, Mar 23, 2018 at 10:15:59AM +, Mathieu Xhonneux wrote:
> As of Linux 4.14, it is possible to define advanced local processing for
> IPv6 packets with a Segment Routing Header through the seg6local LWT
> infrastructure. This LWT implements the network programming principles
> defined in the IETF “SRv6 Network Programming” draft.
> 
> The implemented operations are generic, and it would be very interesting to
> be able to implement user-specific seg6local actions, without having to
> modify the kernel directly. To do so, this patchset adds an End.BPF action
> to seg6local, powered by some specific Segment Routing-related helpers,
> which provide SR functionalities that can be applied on the packet. This
> BPF hook would then allow to implement specific actions at native kernel
> speed such as OAM features, advanced SR SDN policies, SRv6 actions like
> Segment Routing Header (SRH) encapsulation depending on the content of
> the packet, etc ... 
> 
> This patchset is divided in 5 patches, whose main features are :
> 
> - A new seg6local action End.BPF with the corresponding new BPF program
>   type BPF_PROG_TYPE_LWT_SEG6LOCAL. Such attached BPF program can be
>   passed to the LWT seg6local through netlink, the same way as the LWT
>   BPF hook operates.
> - 3 new BPF helpers for the seg6local BPF hook, allowing to edit/grow/
>   shrink a SRH and apply on a packet some of the generic SRv6 actions.
> - 1 new BPF helper for the LWT BPF IN hook, allowing to add a SRH through
>   encapsulation (via IPv6 encapsulation or inlining if the packet contains
>   already an IPv6 header).
> 
> As this patchset adds a new LWT BPF hook, I took into account the result of
> the discussions when the LWT BPF infrastructure got merged. Hence, the
> seg6local BPF hook doesn’t allow write access to skb->data directly, only
> the SRH can be modified through specific helpers, which ensures that the
> integrity of the packet is maintained.
> More details are available in the related patches messages.
> 
> The performances of this BPF hook have been assessed with the BPF JIT
> enabled on a Intel Xeon X3440 processors with 4 cores and 8 threads
> clocked at 2.53 GHz. No throughput losses are noted with the seg6local
> BPF hook when the BPF program does nothing (440kpps). Adding a 8-bytes
> TLV (1 call each to bpf_lwt_seg6_adjust_srh and bpf_lwt_seg6_store_bytes)
> drops the throughput to 410kpps, and inlining a SRH via
> bpf_lwt_seg6_action drops the throughput to 420kpps.
> All throughputs are stable.
> 
> Any comments on the patchset are welcome.

I've looked through the patches and everything looks very good.
Feel free to resubmit without RFC tag.

In patch 2 I was a bit concerned that:
+   struct seg6_bpf_srh_state *srh_state = (struct seg6_bpf_srh_state *)
> +  &skb->cb;
would not collide with other users of skb->cb, but it seems the way
the hook is placed such usage should always be valid.
Would be good to add a comment describing the situation.

Looks like the somewhat odd 'End.BPF' name comes from similar names in the SRv6 draft.
Do you plan to disclose such an End.BPF action in the draft as well?

Thanks



Re: [PATCH v2 net-next 08/12] inet: frags: use rhashtables for reassembly units

2018-03-30 Thread Eric Dumazet


On 03/30/2018 03:44 PM, Kirill Tkhai wrote:
> Hi, Eric,
> 
> thanks for more small patches in v2. One comment below.
> 

>> -
>> -struct inet_frag_bucket {
>> -struct hlist_head   chain;
>> -spinlock_t  chain_lock;
>> +struct netns_frags  *net;
>> +struct rcu_head rcu;
> 
> inet_frag_destroy() calls call_rcu() after frags are destroyed.
> It looks like we may place this rcu in union with fragments and
> fragments_tail and to save some memory.

No, because I am planning to free skbs in the rcu callback very shortly.

As I mentioned, I believe we could use pure RCU lookups, without grabbing a
reference and then releasing it after the packet is processed/added to the queue.

This is not urgent, I wanted to get the first patches for review before doing
this final step, as I do not expect more than 5 % improvement from this 
optimization.
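
A sketch of the pure-RCU lookup being considered (illustrative; it
assumes the queue is freed via call_rcu() as noted above):

	rcu_read_lock();
	q = rhashtable_lookup(&nf->rhashtable, &key, nf->rhash_params);
	if (q)
		ret = ip_frag_queue(q, skb);	/* no refcount hold/put */
	rcu_read_unlock();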

Also note that the ipq structure is 192 bytes, nicely using 3 cache lines.

Trying to save 16 bytes will likely reduce performance,  unless we force padding
to reach 192 bytes again :)



[PATCH net 1/1] net/mlx5e: Set EQE based as default TX interrupt moderation mode

2018-03-30 Thread Saeed Mahameed
From: Tal Gilboa 

The default TX moderation mode was mistakenly set to CQE based. The
intention was to add a control ability in order to improve some specific
use-cases. In general, we prefer to use EQE based moderation as it gives
much better numbers for the common cases.

CQE based causes a degradation in the common case since it resets the
moderation timer on CQE generation. This causes an issue when TSO is
well utilized (large TSO sessions). The timer is set to 16us so traffic
of ~64KB TSO sessions per second would mean timer reset (CQE per TSO
session -> long time between CQEs). In this case we quickly reach the
tcp_limit_output_bytes (256KB by default) and cause a halt in TX traffic.

By setting EQE based moderation we make sure timer would expire after
16us regardless of the packet rate.
This fixes up to 40% packet rate and up to 23% bandwidth degradations.

Fixes: 0088cbbc4b66 ("net/mlx5e: Enable CQE based moderation on TX CQ")
Signed-off-by: Tal Gilboa 
Signed-off-by: Saeed Mahameed 
---

Hi Dave,

Sorry for the late net submission but this patch is very important since
it addresses a very serious TX performance degradation.

For -stable 4.15.

 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 9b4827d36e3e..1560ec0e1779 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3980,7 +3980,7 @@ void mlx5e_build_nic_params(struct mlx5_core_dev *mdev,
struct mlx5e_params *params,
u16 max_channels)
 {
-   u8 cq_period_mode = 0;
+   u8 rx_cq_period_mode;
u32 link_speed = 0;
u32 pci_bw = 0;
 
@@ -4016,12 +4016,12 @@ void mlx5e_build_nic_params(struct mlx5_core_dev *mdev,
params->lro_timeout = mlx5e_choose_lro_timeout(mdev, 
MLX5E_DEFAULT_LRO_TIMEOUT);
 
/* CQ moderation params */
-   cq_period_mode = MLX5_CAP_GEN(mdev, cq_period_start_from_cqe) ?
+   rx_cq_period_mode = MLX5_CAP_GEN(mdev, cq_period_start_from_cqe) ?
MLX5_CQ_PERIOD_MODE_START_FROM_CQE :
MLX5_CQ_PERIOD_MODE_START_FROM_EQE;
params->rx_dim_enabled = MLX5_CAP_GEN(mdev, cq_moderation);
-   mlx5e_set_rx_cq_mode_params(params, cq_period_mode);
-   mlx5e_set_tx_cq_mode_params(params, cq_period_mode);
+   mlx5e_set_rx_cq_mode_params(params, rx_cq_period_mode);
+   mlx5e_set_tx_cq_mode_params(params, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
 
/* TX inline */
params->tx_max_inline = mlx5e_get_max_inline_cap(mdev);
-- 
2.14.3



Re: [PATCH v2 net-next 08/12] inet: frags: use rhashtables for reassembly units

2018-03-30 Thread Kirill Tkhai
Hi, Eric,

thanks for more small patches in v2. One comment below.

On 30.03.2018 23:42, Eric Dumazet wrote:
> Some applications still rely on IP fragmentation, and to be fair linux
> reassembly unit is not working under any serious load.
> 
> It uses static hash tables of 1024 buckets, and up to 128 items per bucket 
> (!!!)
> 
> A work queue is supposed to garbage collect items when host is under memory
> pressure, and doing a hash rebuild, changing seed used in hash computations.
> 
> This work queue blocks softirqs for up to 25 ms when doing a hash rebuild,
> occurring every 5 seconds if host is under fire.
> 
> Then there is the problem of sharing this hash table for all netns.
> 
> It is time to switch to rhashtables, and allocate one of them per netns
> to speedup netns dismantle, since this is a critical metric these days.
> 
> Lookup is now using RCU. A followup patch will even remove
> the refcount hold/release left from prior implementation and save
> a couple of atomic operations.
> 
> Before this patch, 16 cpus (16 RX queue NIC) could not handle more
> than 1 Mpps frags DDOS.
> 
> After the patch, I reach 7 Mpps without any tuning, and can use up to 2GB
> of storage for the fragments.
> 
> $ grep FRAG /proc/net/sockstat
> FRAG: inuse 1966916 memory 2140004608
> 
> A followup patch will change the limits for 64bit arches.
> 
> Signed-off-by: Eric Dumazet 
> Cc: Florian Westphal 
> Cc: Nikolay Aleksandrov 
> Cc: Jesper Dangaard Brouer 
> Cc: Alexander Aring 
> Cc: Stefan Schmidt 
> ---
>  Documentation/networking/ip-sysctl.txt  |   7 +-
>  include/net/inet_frag.h |  81 +++---
>  include/net/ipv6.h  |  16 +-
>  net/ieee802154/6lowpan/6lowpan_i.h  |  26 +-
>  net/ieee802154/6lowpan/reassembly.c |  93 +++
>  net/ipv4/inet_fragment.c| 352 +---
>  net/ipv4/ip_fragment.c  | 112 
>  net/ipv6/netfilter/nf_conntrack_reasm.c |  51 +---
>  net/ipv6/reassembly.c   | 110 
>  9 files changed, 269 insertions(+), 579 deletions(-)
> 
> diff --git a/Documentation/networking/ip-sysctl.txt 
> b/Documentation/networking/ip-sysctl.txt
> index 
> 33f35f049ad57ad6c06ed6e089966e346d72d108..6f2a3670e44b6662ce53c16cb7ca1e4f61274c15
>  100644
> --- a/Documentation/networking/ip-sysctl.txt
> +++ b/Documentation/networking/ip-sysctl.txt
> @@ -134,13 +134,10 @@ min_adv_mss - INTEGER
>  IP Fragmentation:
>  
>  ipfrag_high_thresh - INTEGER
> - Maximum memory used to reassemble IP fragments. When
> - ipfrag_high_thresh bytes of memory is allocated for this purpose,
> - the fragment handler will toss packets until ipfrag_low_thresh
> - is reached. This also serves as a maximum limit to namespaces
> - different from the initial one.
> + Maximum memory used to reassemble IP fragments.
>  
>  ipfrag_low_thresh - INTEGER
> + (Obsolete since linux-4.17)
>   Maximum memory used to reassemble IP fragments before the kernel
>   begins to remove incomplete fragment queues to free up resources.
>   The kernel still accepts new fragments for defragmentation.
> diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
> index 
> 69e531ed81894393e07cac9e953825fcb55ef42a..3fec0d3a0d0186e98afb951784e1fe7329ba6d77
>  100644
> --- a/include/net/inet_frag.h
> +++ b/include/net/inet_frag.h
> @@ -2,7 +2,11 @@
>  #ifndef __NET_FRAG_H__
>  #define __NET_FRAG_H__
>  
> +#include 
> +
>  struct netns_frags {
> + struct rhashtable   rhashtable cacheline_aligned_in_smp;
> +
>   /* Keep atomic mem on separate cachelines in structs that include it */
>   atomic_tmem cacheline_aligned_in_smp;
>   /* sysctls */
> @@ -26,12 +30,30 @@ enum {
>   INET_FRAG_COMPLETE  = BIT(2),
>  };
>  
> +struct frag_v4_compare_key {
> + __be32  saddr;
> + __be32  daddr;
> + u32 user;
> + u32 vif;
> + __be16  id;
> + u16 protocol;
> +};
> +
> +struct frag_v6_compare_key {
> + struct in6_addr saddr;
> + struct in6_addr daddr;
> + u32 user;
> + __be32  id;
> + u32 iif;
> +};
> +
>  /**
>   * struct inet_frag_queue - fragment queue
>   *
> - * @lock: spinlock protecting the queue
> + * @node: rhash node
> + * @key: keys identifying this frag.
>   * @timer: queue expiration timer
> - * @list: hash bucket list
> + * @lock: spinlock protecting this frag
>   * @refcnt: reference count of the queue
>   * @fragments: received fragments head
>   * @fragments_tail: received fragments tail
> @@ -41,12 +63,16 @@ enum {
>   * @flags: fragment queue flags
>   * @max_size: maximum received fragment size
>   * @net: namespace that this frag belongs to
> - * @list_evictor: list of queues to forcefully evict (e.g. due 

Re: [PATCH v2 net-next 08/12] inet: frags: use rhashtables for reassembly units

2018-03-30 Thread Eric Dumazet


On 03/30/2018 01:42 PM, Eric Dumazet wrote:
> Some applications still rely on IP fragmentation, and to be fair linux
> reassembly unit is not working under any serious load.

...

> -
>  static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
>  struct inet_frags *f,
>  void *arg)
>  {
>   struct inet_frag_queue *q;
>  
> - if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) {
> - inet_frag_schedule_worker(f);
> + if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh)
>   return NULL;
> - }
>  
>   q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
>   if (!q)
> @@ -374,59 +167,53 @@ static struct inet_frag_queue *inet_frag_alloc(struct 
> netns_frags *nf,
>  
>   timer_setup(>timer, f->frag_expire, 0);
>   spin_lock_init(>lock);
> - refcount_set(>refcnt, 1);
> + refcount_set(>refcnt, 3);
>  
>   return q;
>  }
>  
>  static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
> - struct inet_frags *f,
>   void *arg)
>  {
> + struct inet_frags *f = nf->f;
>   struct inet_frag_queue *q;
> + int err;
>  
>   q = inet_frag_alloc(nf, f, arg);
>   if (!q)
>   return NULL;
>  
> - return inet_frag_intern(nf, q, f, arg);
> + mod_timer(>timer, jiffies + nf->timeout);
> +
> + err = rhashtable_insert_fast(>rhashtable, >node,
> +  f->rhash_params);
> + add_frag_mem_limit(nf, f->qsize);

When adding back inet_frag_alloc() to address Kirill's feedback,
it looks like I forgot to remove this add_frag_mem_limit() :/



Re: [PATCH net-next 0/9] devlink: Add support for region access

2018-03-30 Thread David Ahern
On 3/30/18 1:39 PM, Alex Vesker wrote:
> 
> 
> On 3/30/2018 7:57 PM, David Ahern wrote:
>> On 3/30/18 8:34 AM, Andrew Lunn wrote:
>>>>> And it seems to want contiguous pages. How well does that work after
>>>>> the system has been running for a while and memory is fragmented?
>>>> The allocation can be changed, there is no real need for contiguous
>>>> pages.
>>>> It is important to note that the amount of snapshots is limited by the
>>>> driver;
>>>> this can be based on the dump size or expected frequency of collection.
>>>> I also prefer not to pre-allocate this memory.
>>> The driver code also asks for a 1MB contiguous chunk of memory!  You
>>> really should think about this API, how can you avoid double memory
>>> allocations. And can kvmalloc be used. But then you get into the
>>> problem for DMA'ing the memory from the device...
>>>
>>> This API also does not scale. 1MB is actually quite small. I'm sure
>>> there is firmware running on CPUs with a lot more than 1MB of RAM.
>>> How well does with API work with 64MB? Say i wanted to snapshot my
>>> GPU? Or the MC/BMC?
>>>
>> That and the drivers control the number of snapshots. The user should be
>> able to control the number of snapshots, and an option to remove all
>> snapshots to free up that memory.
> 
> There is an option to free up this memory, using a delete command.
> The reason I added the option to control the number of snapshots from
> the driver side only is because the driver knows the size of the snapshots
> and when/why they will be taken.
> For example in our mlx4 driver the snapshots are taken on rare failures,
> the snapshot is quite large, and from past analyses the first dump is
> usually the important one; this means that 8 is more than enough in my
> case.
> If a user wants more than that he can always monitor notifications, read
> the snapshot and delete it once backed up; there is no reason to keep
> all of this data in the kernel.
> 
> 

I was thinking of less, i.e., a user says to keep only 1 or 2 snapshots,
or disables snapshots altogether.


Re: [PATCH] usb: plusb: Add support for PL-27A1

2018-03-30 Thread Daniel Kučera
Hello Roman,

it would be at least polite to mention where you got the code from:

https://lkml.org/lkml/2016/2/21/14

-- 

S pozdravom / Best regards
Daniel Kucera.


[PATCH v3 bpf-next 4/9] selftests/bpf: Selftest for sys_bind hooks

2018-03-30 Thread Alexei Starovoitov
From: Andrey Ignatov 

Add selftest to work with bpf_sock_addr context from
`BPF_PROG_TYPE_CGROUP_SOCK_ADDR` programs.

Try to bind(2) on IP:port and apply:
* loads to make sure context can be read correctly, including narrow
  loads (byte, half) for IP and full-size loads (word) for all fields;
* stores to those fields allowed by verifier.

All combinations of IPv4/IPv6 and TCP/UDP are tested.

Both scenarios are tested:
* valid programs can be loaded and attached;
* invalid programs can be neither loaded nor attached.

The test passes when the expected data can be read from the context in
the BPF program, and, after the call to bind(2), the socket is bound to
the IP:port pair that was written to the context by the BPF program.

Example:
  # ./test_sock_addr
  Attached bind4 program.
  Test case #1 (IPv4/TCP):
  Requested: bind(192.168.1.254, 4040) ..
 Actual: bind(127.0.0.1, )
  Test case #2 (IPv4/UDP):
  Requested: bind(192.168.1.254, 4040) ..
 Actual: bind(127.0.0.1, )
  Attached bind6 program.
  Test case #3 (IPv6/TCP):
  Requested: bind(face:b00c:1234:5678::abcd, 6060) ..
 Actual: bind(::1, )
  Test case #4 (IPv6/UDP):
  Requested: bind(face:b00c:1234:5678::abcd, 6060) ..
 Actual: bind(::1, )
  ### SUCCESS

Signed-off-by: Andrey Ignatov 
Signed-off-by: Alexei Starovoitov 
---
 tools/include/uapi/linux/bpf.h   |  23 ++
 tools/lib/bpf/libbpf.c   |   6 +
 tools/testing/selftests/bpf/Makefile |   3 +-
 tools/testing/selftests/bpf/test_sock_addr.c | 486 +++
 4 files changed, 517 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/test_sock_addr.c

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index e1c1fed63396..f2120c5c0578 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -136,6 +136,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_CGROUP_DEVICE,
BPF_PROG_TYPE_SK_MSG,
BPF_PROG_TYPE_RAW_TRACEPOINT,
+   BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
 };
 
 enum bpf_attach_type {
@@ -147,6 +148,8 @@ enum bpf_attach_type {
BPF_SK_SKB_STREAM_VERDICT,
BPF_CGROUP_DEVICE,
BPF_SK_MSG_VERDICT,
+   BPF_CGROUP_INET4_BIND,
+   BPF_CGROUP_INET6_BIND,
__MAX_BPF_ATTACH_TYPE
 };
 
@@ -1009,6 +1012,26 @@ struct bpf_map_info {
__u64 netns_ino;
 } __attribute__((aligned(8)));
 
+/* User bpf_sock_addr struct to access socket fields and sockaddr struct passed
+ * by user and intended to be used by socket (e.g. to bind to, depends on
+ * attach type).
+ */
+struct bpf_sock_addr {
+   __u32 user_family;  /* Allows 4-byte read, but no write. */
+   __u32 user_ip4; /* Allows 1,2,4-byte read and 4-byte write.
+* Stored in network byte order.
+*/
+   __u32 user_ip6[4];  /* Allows 1,2,4-byte read and 4-byte write.
+* Stored in network byte order.
+*/
+   __u32 user_port;/* Allows 4-byte read and write.
+* Stored in network byte order
+*/
+   __u32 family;   /* Allows 4-byte read, but no write */
+   __u32 type; /* Allows 4-byte read, but no write */
+   __u32 protocol; /* Allows 4-byte read, but no write */
+};
+
 /* User bpf_sock_ops struct to access socket values and specify request ops
  * and their replies.
  * Some of this fields are in network (bigendian) byte order and may need
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 48e3e743ebf7..d7ce8818982c 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -1859,6 +1859,9 @@ static void bpf_program__set_expected_attach_type(struct 
bpf_program *prog,
 
 #define BPF_PROG_SEC(string, ptype) BPF_PROG_SEC_FULL(string, ptype, 0)
 
+#define BPF_SA_PROG_SEC(string, ptype) \
+   BPF_PROG_SEC_FULL(string, BPF_PROG_TYPE_CGROUP_SOCK_ADDR, ptype)
+
 static const struct {
const char *sec;
size_t len;
@@ -1882,10 +1885,13 @@ static const struct {
BPF_PROG_SEC("sockops", BPF_PROG_TYPE_SOCK_OPS),
BPF_PROG_SEC("sk_skb",  BPF_PROG_TYPE_SK_SKB),
BPF_PROG_SEC("sk_msg",  BPF_PROG_TYPE_SK_MSG),
+   BPF_SA_PROG_SEC("cgroup/bind4", BPF_CGROUP_INET4_BIND),
+   BPF_SA_PROG_SEC("cgroup/bind6", BPF_CGROUP_INET6_BIND),
 };
 
 #undef BPF_PROG_SEC
 #undef BPF_PROG_SEC_FULL
+#undef BPF_SA_PROG_SEC
 
 static int bpf_program__identify_section(struct bpf_program *prog)
 {
diff --git a/tools/testing/selftests/bpf/Makefile 
b/tools/testing/selftests/bpf/Makefile
index f35fb02bdf56..f4717c910874 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -23,7 +23,7 @@ urandom_read: 

[PATCH v3 bpf-next 0/9] bpf: introduce cgroup-bpf bind, connect, post-bind hooks

2018-03-30 Thread Alexei Starovoitov
v2->v3:
- rebase due to conflicts
- fix ipv6=m build

v1->v2:
- support expected_attach_type at prog load time so that the prog (incl.
  context accesses and calls to helpers) can be validated with regard to
  the specific attach point it is supposed to be attached to.
  Later, at attach time, the attach type is checked and must be the same
  as at load time if it was provided
- reworked hooks to rely on expected_attach_type, and reduced number of new
  prog types from 6 to just 1: BPF_PROG_TYPE_CGROUP_SOCK_ADDR
- reused BPF_PROG_TYPE_CGROUP_SOCK for sys_bind post-hooks
- add selftests for post-sys_bind hook

For our container management we've been using a complicated and fragile
setup consisting of an LD_PRELOAD wrapper intercepting bind and connect
calls from all containerized applications. Unfortunately it doesn't work
for apps that don't use glibc, and changing all applications that run in
the datacenter is not possible due to 3rd party code and libraries
(despite being open source code) and the sheer amount of legacy code that
has to be rewritten (we're rewriting what we can in parallel).

These applications are written without containers in mind and have
builtin assumptions about network services, e.g. an application X
expects to connect to localhost:special_port and find service Y there.
To move application X and service Y into two different containers, the
LD_PRELOAD approach is used to help one service connect to another
without rewriting them.
Moving these two applications into different L2 (netns) or L3 (vrf)
network isolation scopes doesn't help to solve the problem, since
applications need to see each other like they were running on
the host without containers.
So if app X and app Y were to run in different netns, something
would need to punch a connectivity hole in those namespaces.
That would be a real layering violation (with corresponding
network debugging pains), since clean l2, l3 abstraction would
suddenly support something that breaks through the layers.

Instead we used LD_PRELOAD (and now bpf programs) at bind/connect
time to help applications discover and connect to each other.
All applications are running in init_netns and there are no vrfs.
After bind/connect the normal fib/neighbor core networking
logic works as it should always do and the whole system is
clean from network point of view and can be debugged with
standard tools.

We also considered resurrecting Hannes's afnetns work,
but all hierarchical namespace abstractions don't work due
to these builtin networking assumptions inside the apps.
To run an application inside cgroup container that was not written
with containers in mind we have to make an illusion of running
in non-containerized environment.
In some cases we remember the port and container id in the post-bind hook
in a bpf map and when some other task in a different container is trying
to connect to a service we need to know where this service is running.
It can be remote and can be local. Both client and service may or may not
be written with containers in mind and this sockaddr rewrite is providing
connectivity and load balancing feature.

BPF+cgroup looks to be the best solution for this problem.
Hence we introduce 3 hooks:
- at entry into sys_bind and sys_connect
  to let bpf prog look and modify 'struct sockaddr' provided
  by user space and fail bind/connect when appropriate
- post sys_bind after port is allocated

The approach works great and has zero overhead for anyone who doesn't
use it and very low overhead when deployed.

A different use case for this feature is a low-overhead firewall
that doesn't need to inspect all packets and works at bind/connect time.
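For illustration only, a minimal bind hook could look like the sketch
below (section name and byte-order helpers as in the selftests; the
addresses are made up):

	SEC("cgroup/bind4")
	int bind_v4_prog(struct bpf_sock_addr *ctx)
	{
		/* rewrite binds for 192.168.1.254:4040 to 127.0.0.1:4444;
		 * user_ip4/user_port are in network byte order */
		if (ctx->user_ip4 == bpf_htonl(0xc0a801fe) &&
		    ctx->user_port == bpf_htons(4040)) {
			ctx->user_ip4 = bpf_htonl(0x7f000001);
			ctx->user_port = bpf_htons(4444);
		}
		return 1;	/* 1 lets the syscall proceed */
	}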

Andrey Ignatov (9):
  bpf: Check attach type at prog load time
  libbpf: Support expected_attach_type at prog load
  bpf: Hooks for sys_bind
  selftests/bpf: Selftest for sys_bind hooks
  net: Introduce __inet_bind() and __inet6_bind
  bpf: Hooks for sys_connect
  selftests/bpf: Selftest for sys_connect hooks
  bpf: Post-hooks for sys_bind
  selftests/bpf: Selftest for sys_bind post-hooks.

 include/linux/bpf-cgroup.h|  68 ++-
 include/linux/bpf.h   |   5 +-
 include/linux/bpf_types.h |   1 +
 include/linux/filter.h|  11 +
 include/net/addrconf.h|   7 +
 include/net/inet_common.h |   2 +
 include/net/ipv6.h|   2 +
 include/net/sock.h|   3 +
 include/net/udp.h |   1 +
 include/uapi/linux/bpf.h  |  51 ++-
 kernel/bpf/cgroup.c   |  39 +-
 kernel/bpf/syscall.c  | 102 -
 kernel/bpf/verifier.c |   7 +-
 kernel/trace/bpf_trace.c  |  27 +-
 net/core/filter.c | 442 +--
 net/ipv4/af_inet.c|  71 +++-
 net/ipv4/tcp_ipv4.c  

[PATCH v3 bpf-next 9/9] selftests/bpf: Selftest for sys_bind post-hooks.

2018-03-30 Thread Alexei Starovoitov
From: Andrey Ignatov 

Add selftest for attach types `BPF_CGROUP_INET4_POST_BIND` and
`BPF_CGROUP_INET6_POST_BIND`.

The main things tested are:
* prog load behaves as expected (valid/invalid accesses in prog);
* prog attach behaves as expected (load- vs attach-time attach types);
* `BPF_CGROUP_INET_SOCK_CREATE` can be attached in a backward compatible
  way;
* post-hooks return expected result and errno.

Example:
  # ./test_sock
  Test case: bind4 load with invalid access: src_ip6 .. [PASS]
  Test case: bind4 load with invalid access: mark .. [PASS]
  Test case: bind6 load with invalid access: src_ip4 .. [PASS]
  Test case: sock_create load with invalid access: src_port .. [PASS]
  Test case: sock_create load w/o expected_attach_type (compat mode) ..
  [PASS]
  Test case: sock_create load w/ expected_attach_type .. [PASS]
  Test case: attach type mismatch bind4 vs bind6 .. [PASS]
  Test case: attach type mismatch bind6 vs bind4 .. [PASS]
  Test case: attach type mismatch default vs bind4 .. [PASS]
  Test case: attach type mismatch bind6 vs sock_create .. [PASS]
  Test case: bind4 reject all .. [PASS]
  Test case: bind6 reject all .. [PASS]
  Test case: bind6 deny specific IP & port .. [PASS]
  Test case: bind4 allow specific IP & port .. [PASS]
  Test case: bind4 allow all .. [PASS]
  Test case: bind6 allow all .. [PASS]
  Summary: 16 PASSED, 0 FAILED

Signed-off-by: Andrey Ignatov 
Signed-off-by: Alexei Starovoitov 
---
 tools/include/uapi/linux/bpf.h  |  11 +
 tools/testing/selftests/bpf/Makefile|   4 +-
 tools/testing/selftests/bpf/test_sock.c | 479 
 3 files changed, 493 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/test_sock.c

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 71051d01e8dd..9d07465023a2 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -152,6 +152,8 @@ enum bpf_attach_type {
BPF_CGROUP_INET6_BIND,
BPF_CGROUP_INET4_CONNECT,
BPF_CGROUP_INET6_CONNECT,
+   BPF_CGROUP_INET4_POST_BIND,
+   BPF_CGROUP_INET6_POST_BIND,
__MAX_BPF_ATTACH_TYPE
 };
 
@@ -947,6 +949,15 @@ struct bpf_sock {
__u32 protocol;
__u32 mark;
__u32 priority;
+   __u32 src_ip4;  /* Allows 1,2,4-byte read.
+* Stored in network byte order.
+*/
+   __u32 src_ip6[4];   /* Allows 1,2,4-byte read.
+* Stored in network byte order.
+*/
+   __u32 src_port; /* Allows 4-byte read.
+* Stored in host byte order
+*/
 };
 
 #define XDP_PACKET_HEADROOM 256
diff --git a/tools/testing/selftests/bpf/Makefile 
b/tools/testing/selftests/bpf/Makefile
index c64d4ebc77ff..0a315ddabbf4 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -23,7 +23,8 @@ urandom_read: urandom_read.c
 
 # Order correspond to 'make run_tests' order
 TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map 
test_progs \
-   test_align test_verifier_log test_dev_cgroup test_tcpbpf_user 
test_sock_addr
+   test_align test_verifier_log test_dev_cgroup test_tcpbpf_user \
+   test_sock test_sock_addr
 
 TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o 
test_obj_id.o \
test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o 
sockmap_parse_prog.o \
@@ -52,6 +53,7 @@ $(TEST_GEN_PROGS): $(BPFOBJ)
 $(TEST_GEN_PROGS_EXTENDED): $(OUTPUT)/libbpf.a
 
 $(OUTPUT)/test_dev_cgroup: cgroup_helpers.c
+$(OUTPUT)/test_sock: cgroup_helpers.c
 $(OUTPUT)/test_sock_addr: cgroup_helpers.c
 
 .PHONY: force
diff --git a/tools/testing/selftests/bpf/test_sock.c 
b/tools/testing/selftests/bpf/test_sock.c
new file mode 100644
index ..73bb20cfb9b7
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_sock.c
@@ -0,0 +1,479 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Facebook
+
+#include 
+#include 
+
+#include 
+#include 
+#include 
+
+#include 
+
+#include 
+
+#include "cgroup_helpers.h"
+
+#ifndef ARRAY_SIZE
+# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#endif
+
+#define CG_PATH"/foo"
+#define MAX_INSNS  512
+
+char bpf_log_buf[BPF_LOG_BUF_SIZE];
+
+struct sock_test {
+   const char *descr;
+   /* BPF prog properties */
+   struct bpf_insn insns[MAX_INSNS];
+   enum bpf_attach_type expected_attach_type;
+   enum bpf_attach_type attach_type;
+   /* Socket properties */
+   int domain;
+   int type;
+   /* Endpoint to bind() to */
+   const char *ip;
+   unsigned short port;
+   /* Expected test result */
+   enum {
+   LOAD_REJECT,
+   ATTACH_REJECT,
+   BIND_REJECT,
+  

[PATCH v3 bpf-next 3/9] bpf: Hooks for sys_bind

2018-03-30 Thread Alexei Starovoitov
From: Andrey Ignatov 

== The problem ==

There is a use-case where all processes inside a cgroup should use one
single IP address on a host that has multiple IPs configured. Those
processes should use the IP for both ingress and egress, for TCP and UDP
traffic. So TCP/UDP servers should be bound to that IP to accept
incoming connections on it, and TCP/UDP clients should make outgoing
connections from that IP. It should not require changing application
code since it's often not possible.

Currently it's solved by intercepting glibc wrappers around syscalls
such as `bind(2)` and `connect(2)`. It's done by a shared library that
is preloaded for every process in a cgroup so that whenever TCP/UDP
server calls `bind(2)`, the library replaces IP in sockaddr before
passing arguments to syscall. When application calls `connect(2)` the
library transparently binds the local end of connection to that IP
(`bind(2)` with `IP_BIND_ADDRESS_NO_PORT` to avoid performance penalty).

Shared library approach is fragile though, e.g.:
* some applications clear env vars (incl. `LD_PRELOAD`);
* `/etc/ld.so.preload` doesn't help since some applications are linked
  with option `-z nodefaultlib`;
* other applications don't use glibc and there is nothing to intercept.

== The solution ==

The patch provides a much more reliable in-kernel solution for the 1st
part of the problem: binding TCP/UDP servers to a desired IP. It does not
depend on the application environment or implementation details (whether
glibc is used or not).

It adds new eBPF program type `BPF_PROG_TYPE_CGROUP_SOCK_ADDR` and
attach types `BPF_CGROUP_INET4_BIND` and `BPF_CGROUP_INET6_BIND`
(similar to already existing `BPF_CGROUP_INET_SOCK_CREATE`).

The new program type is intended to be used with sockets (`struct sock`)
in a cgroup and provided by user `struct sockaddr`. Pointers to both of
them are parts of the context passed to programs of newly added types.

The new attach types provide hooks in the `bind(2)` system call for both
IPv4 and IPv6 so that one can write a program to override the IP address
and port a user program tries to bind to, and apply such a program to a
whole cgroup.
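For example, attaching a loaded program to a cgroup would presumably
look like this (a sketch only; error handling omitted):

	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.target_fd = cgroup_fd;	/* fd of the cgroup directory */
	attr.attach_bpf_fd = prog_fd;	/* prog loaded with a matching
					 * expected_attach_type */
	attr.attach_type = BPF_CGROUP_INET4_BIND;

	err = syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr));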

== Implementation notes ==

[1]
Separate attach types for `AF_INET` and `AF_INET6` are added
intentionally to prevent reading/writing to offsets that don't make
sense for the corresponding socket family. E.g. if the user passes a
`sockaddr_in` it doesn't make sense to read from / write to the
`user_ip6[]` context fields.

[2]
The write access to `struct bpf_sock_addr_kern` is implemented using
special field as an additional "register".

There are just two registers available in `sock_addr_convert_ctx_access`:
`src` with the value to write and `dst` with the pointer to the context,
which can't be changed so as not to break later instructions. But the
fields allowed to be written to are not available directly; to access
them, the address of the corresponding pointer has to be loaded first. To
get an additional register, the first one not used by `src` or `dst` is
taken, its content is saved to `bpf_sock_addr_kern.tmp_reg`, then the
register is used to load the address of the pointer field, and finally
the register's content is restored from the temporary field after the
`src` value has been written (see the sketch below).
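Schematically, the emitted sequence looks something like this
(illustrative pseudo-instructions, not the exact macros from the patch):

	/* tmp = first register that is neither src_reg nor dst_reg */
	save   tmp  ->  sa_kern->tmp_reg     /* spill tmp into the context */
	load   tmp  <-  sa_kern->uaddr       /* tmp = pointer to sockaddr  */
	store  src  ->  tmp + field_offset   /* the actual field write     */
	load   tmp  <-  sa_kern->tmp_reg     /* restore tmp's old content  */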

Signed-off-by: Andrey Ignatov 
Signed-off-by: Alexei Starovoitov 
---
 include/linux/bpf-cgroup.h |  21 
 include/linux/bpf_types.h  |   1 +
 include/linux/filter.h |  10 ++
 include/uapi/linux/bpf.h   |  23 +
 kernel/bpf/cgroup.c|  36 +++
 kernel/bpf/syscall.c   |  36 +--
 kernel/bpf/verifier.c  |   1 +
 net/core/filter.c  | 232 +
 net/ipv4/af_inet.c |   7 ++
 net/ipv6/af_inet6.c|   7 ++
 10 files changed, 366 insertions(+), 8 deletions(-)

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 8a4566691c8f..67dc4a6471ad 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -6,6 +6,7 @@
 #include 
 
 struct sock;
+struct sockaddr;
 struct cgroup;
 struct sk_buff;
 struct bpf_sock_ops_kern;
@@ -63,6 +64,10 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 int __cgroup_bpf_run_filter_sk(struct sock *sk,
   enum bpf_attach_type type);
 
+int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
+ struct sockaddr *uaddr,
+ enum bpf_attach_type type);
+
 int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 struct bpf_sock_ops_kern *sock_ops,
 enum bpf_attach_type type);
@@ -103,6 +108,20 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 
major, u32 minor,
__ret; \
 })
 
+#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, type)
   \
+({\
+  

[PATCH v3 bpf-next 1/9] bpf: Check attach type at prog load time

2018-03-30 Thread Alexei Starovoitov
From: Andrey Ignatov 

== The problem ==

There are use-cases when a program of some type can be attached to
multiple attach points and those attach points must have different
permissions to access context or to call helpers.

E.g. context structure may have fields for both IPv4 and IPv6 but it
doesn't make sense to read from / write to IPv6 field when attach point
is somewhere in IPv4 stack.

The same applies to BPF helpers: it may make sense to call some helper
from one attach point, but not from another, for the same prog type.

== The solution ==

Introduce an `expected_attach_type` field in `struct bpf_attr` for the
`BPF_PROG_LOAD` command. If the scenario described in "The problem" section
is the case for some prog type, the field will be checked twice:

1) At load time the prog type is checked to see if the attach type for it
   must be known to validate program permissions correctly. The prog will
   be rejected with EINVAL if that's the case and `expected_attach_type`
   is not specified or has an invalid value.

2) At attach time `attach_type` is compared with `expected_attach_type`,
   if the prog type requires one, and, if they differ, the attach will
   be rejected with EINVAL (see the sketch below).
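The attach-time part boils down to roughly this check (a sketch; the
exact placement in the BPF_PROG_ATTACH path may differ):

	/* for prog types that declare an attach type at load time,
	 * the type given at attach time must match what was recorded */
	if (prog->expected_attach_type != attr->attach_type)
		return -EINVAL;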

The `expected_attach_type` is now available as part of `struct bpf_prog`
in both `bpf_verifier_ops->is_valid_access()` and
`bpf_verifier_ops->get_func_proto()` and can be used to check context
accesses and calls to helpers respectively.

Initially the idea was discussed by Alexei Starovoitov  and
Daniel Borkmann  here:
https://marc.info/?l=linux-netdev=152107378717201=2

Signed-off-by: Andrey Ignatov 
Signed-off-by: Alexei Starovoitov 
---
 include/linux/bpf.h  |  5 -
 include/linux/filter.h   |  1 +
 include/uapi/linux/bpf.h |  5 +
 kernel/bpf/cgroup.c  |  3 ++-
 kernel/bpf/syscall.c | 31 ++-
 kernel/bpf/verifier.c|  6 +++---
 kernel/trace/bpf_trace.c | 27 ++-
 net/core/filter.c| 39 +--
 8 files changed, 88 insertions(+), 29 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 819229c80eca..95a7abd0ee92 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -208,12 +208,15 @@ struct bpf_prog_ops {
 
 struct bpf_verifier_ops {
/* return eBPF function prototype for verification */
-   const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id 
func_id);
+   const struct bpf_func_proto *
+   (*get_func_proto)(enum bpf_func_id func_id,
+ const struct bpf_prog *prog);
 
/* return true if 'size' wide access at offset 'off' within bpf_context
 * with 'type' (read or write) is allowed
 */
bool (*is_valid_access)(int off, int size, enum bpf_access_type type,
+   const struct bpf_prog *prog,
struct bpf_insn_access_aux *info);
int (*gen_prologue)(struct bpf_insn *insn, bool direct_write,
const struct bpf_prog *prog);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 897ff3d95968..13c044e4832d 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -469,6 +469,7 @@ struct bpf_prog {
is_func:1,  /* program is a bpf function */
kprobe_override:1; /* Do we override a kprobe? 
*/
enum bpf_prog_type  type;   /* Type of BPF program */
+   enum bpf_attach_typeexpected_attach_type; /* For some prog types */
u32 len;/* Number of filter blocks */
u32 jited_len;  /* Size of jited insns in bytes 
*/
u8  tag[BPF_TAG_SIZE];
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 1878201c2d77..102718624d1e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -296,6 +296,11 @@ union bpf_attr {
__u32   prog_flags;
charprog_name[BPF_OBJ_NAME_LEN];
__u32   prog_ifindex;   /* ifindex of netdev to prep 
for */
+   /* For some prog types expected attach type must be known at
+* load time to verify attach type specific parts of prog
+* (context accesses, allowed helpers, etc).
+*/
+   __u32   expected_attach_type;
};
 
struct { /* anonymous struct used by BPF_OBJ_* commands */
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index c1c0b60d3f2f..8730b24ed540 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -545,7 +545,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 
major, u32 minor,
 EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission);
 
 static const struct bpf_func_proto *
-cgroup_dev_func_proto(enum bpf_func_id func_id)

[PATCH v3 bpf-next 2/9] libbpf: Support expected_attach_type at prog load

2018-03-30 Thread Alexei Starovoitov
From: Andrey Ignatov 

Support setting `expected_attach_type` at prog load time in both
`bpf/bpf.h` and `bpf/libbpf.h`.

Since both headers already have APIs to load programs, new functions are
added so as not to break backward compatibility for the existing ones:
* `bpf_load_program_xattr()` is added to `bpf/bpf.h`;
* `bpf_prog_load_xattr()` is added to `bpf/libbpf.h`.

Both new functions accept structures, `struct bpf_load_program_attr` and
`struct bpf_prog_load_attr` respectively, where new fields can be
added in the future w/o changing the API.

The standard `_xattr` suffix is used to name the new API functions.

Since `bpf_load_program_name()` is not used as heavily as
`bpf_load_program()`, it was removed in favor of the more generic
`bpf_load_program_xattr()`.
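Usage would presumably look like this (a sketch; field names as added
by this patch, insns/insns_cnt/bpf_log_buf assumed to exist):

	struct bpf_load_program_attr load_attr;

	memset(&load_attr, 0, sizeof(load_attr));
	load_attr.prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
	load_attr.expected_attach_type = BPF_CGROUP_INET4_BIND;
	load_attr.name = "bind4";
	load_attr.insns = insns;
	load_attr.insns_cnt = insns_cnt;
	load_attr.license = "GPL";

	prog_fd = bpf_load_program_xattr(&load_attr, bpf_log_buf,
					 sizeof(bpf_log_buf));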

Signed-off-by: Andrey Ignatov 
Signed-off-by: Alexei Starovoitov 
---
 tools/include/uapi/linux/bpf.h |   5 ++
 tools/lib/bpf/bpf.c|  44 +++--
 tools/lib/bpf/bpf.h|  17 +--
 tools/lib/bpf/libbpf.c | 105 +++--
 tools/lib/bpf/libbpf.h |   8 
 5 files changed, 133 insertions(+), 46 deletions(-)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 58060bec999d..e1c1fed63396 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -296,6 +296,11 @@ union bpf_attr {
__u32   prog_flags;
charprog_name[BPF_OBJ_NAME_LEN];
__u32   prog_ifindex;   /* ifindex of netdev to prep 
for */
+   /* For some prog types expected attach type must be known at
+* load time to verify attach type specific parts of prog
+* (context accesses, allowed helpers, etc).
+*/
+   __u32   expected_attach_type;
};
 
struct { /* anonymous struct used by BPF_OBJ_* commands */
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index e0500055f1a6..acbb3f8b3bec 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -146,26 +146,30 @@ int bpf_create_map_in_map(enum bpf_map_type map_type, 
const char *name,
  -1);
 }
 
-int bpf_load_program_name(enum bpf_prog_type type, const char *name,
- const struct bpf_insn *insns,
- size_t insns_cnt, const char *license,
- __u32 kern_version, char *log_buf,
- size_t log_buf_sz)
+int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr,
+  char *log_buf, size_t log_buf_sz)
 {
-   int fd;
union bpf_attr attr;
-   __u32 name_len = name ? strlen(name) : 0;
+   __u32 name_len;
+   int fd;
+
+   if (!load_attr)
+   return -EINVAL;
+
+   name_len = load_attr->name ? strlen(load_attr->name) : 0;
 
bzero(, sizeof(attr));
-   attr.prog_type = type;
-   attr.insn_cnt = (__u32)insns_cnt;
-   attr.insns = ptr_to_u64(insns);
-   attr.license = ptr_to_u64(license);
+   attr.prog_type = load_attr->prog_type;
+   attr.expected_attach_type = load_attr->expected_attach_type;
+   attr.insn_cnt = (__u32)load_attr->insns_cnt;
+   attr.insns = ptr_to_u64(load_attr->insns);
+   attr.license = ptr_to_u64(load_attr->license);
attr.log_buf = ptr_to_u64(NULL);
attr.log_size = 0;
attr.log_level = 0;
-   attr.kern_version = kern_version;
-   memcpy(attr.prog_name, name, min(name_len, BPF_OBJ_NAME_LEN - 1));
+   attr.kern_version = load_attr->kern_version;
+   memcpy(attr.prog_name, load_attr->name,
+  min(name_len, BPF_OBJ_NAME_LEN - 1));
 
fd = sys_bpf(BPF_PROG_LOAD, , sizeof(attr));
if (fd >= 0 || !log_buf || !log_buf_sz)
@@ -184,8 +188,18 @@ int bpf_load_program(enum bpf_prog_type type, const struct 
bpf_insn *insns,
 __u32 kern_version, char *log_buf,
 size_t log_buf_sz)
 {
-   return bpf_load_program_name(type, NULL, insns, insns_cnt, license,
-kern_version, log_buf, log_buf_sz);
+   struct bpf_load_program_attr load_attr;
+
+   memset(_attr, 0, sizeof(struct bpf_load_program_attr));
+   load_attr.prog_type = type;
+   load_attr.expected_attach_type = 0;
+   load_attr.name = NULL;
+   load_attr.insns = insns;
+   load_attr.insns_cnt = insns_cnt;
+   load_attr.license = license;
+   load_attr.kern_version = kern_version;
+
+   return bpf_load_program_xattr(_attr, log_buf, log_buf_sz);
 }
 
 int bpf_verify_program(enum bpf_prog_type type, const struct bpf_insn *insns,
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index ee59342c6f42..39f6a0d64a3b 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -41,13 +41,20 @@ int bpf_create_map_in_map(enum bpf_map_type 

[PATCH v3 bpf-next 7/9] selftests/bpf: Selftest for sys_connect hooks

2018-03-30 Thread Alexei Starovoitov
From: Andrey Ignatov 

Add selftest for BPF_CGROUP_INET4_CONNECT and BPF_CGROUP_INET6_CONNECT
attach types.

Try to connect(2) to specified IP:port and test that:
* remote IP:port pair is overridden;
* local end of connection is bound to specified IP.

All combinations of IPv4/IPv6 and TCP/UDP are tested.

Example:
  # tcpdump -pn -i lo -w connect.pcap 2>/dev/null &
  [1] 478
  # strace -qqf -e connect -o connect.trace ./test_sock_addr.sh
  Wait for testing IPv4/IPv6 to become available ... OK
  Load bind4 with invalid type (can pollute stderr) ... REJECTED
  Load bind4 with valid type ... OK
  Attach bind4 with invalid type ... REJECTED
  Attach bind4 with valid type ... OK
  Load connect4 with invalid type (can pollute stderr) libbpf: load bpf \
program failed: Permission denied
  libbpf: -- BEGIN DUMP LOG ---
  libbpf:
  0: (b7) r2 = 23569
  1: (63) *(u32 *)(r1 +24) = r2
  2: (b7) r2 = 16777343
  3: (63) *(u32 *)(r1 +4) = r2
  invalid bpf_context access off=4 size=4
  [ 1518.404609] random: crng init done

  libbpf: -- END LOG --
  libbpf: failed to load program 'cgroup/connect4'
  libbpf: failed to load object './connect4_prog.o'
  ... REJECTED
  Load connect4 with valid type ... OK
  Attach connect4 with invalid type ... REJECTED
  Attach connect4 with valid type ... OK
  Test case #1 (IPv4/TCP):
  Requested: bind(192.168.1.254, 4040) ..
 Actual: bind(127.0.0.1, )
  Requested: connect(192.168.1.254, 4040) from (*, *) ..
 Actual: connect(127.0.0.1, ) from (127.0.0.4, 56068)
  Test case #2 (IPv4/UDP):
  Requested: bind(192.168.1.254, 4040) ..
 Actual: bind(127.0.0.1, )
  Requested: connect(192.168.1.254, 4040) from (*, *) ..
 Actual: connect(127.0.0.1, ) from (127.0.0.4, 56447)
  Load bind6 with invalid type (can pollute stderr) ... REJECTED
  Load bind6 with valid type ... OK
  Attach bind6 with invalid type ... REJECTED
  Attach bind6 with valid type ... OK
  Load connect6 with invalid type (can pollute stderr) libbpf: load bpf \
program failed: Permission denied
  libbpf: -- BEGIN DUMP LOG ---
  libbpf:
  0: (b7) r6 = 0
  1: (63) *(u32 *)(r1 +12) = r6
  invalid bpf_context access off=12 size=4

  libbpf: -- END LOG --
  libbpf: failed to load program 'cgroup/connect6'
  libbpf: failed to load object './connect6_prog.o'
  ... REJECTED
  Load connect6 with valid type ... OK
  Attach connect6 with invalid type ... REJECTED
  Attach connect6 with valid type ... OK
  Test case #3 (IPv6/TCP):
  Requested: bind(face:b00c:1234:5678::abcd, 6060) ..
 Actual: bind(::1, )
  Requested: connect(face:b00c:1234:5678::abcd, 6060) from (*, *)
 Actual: connect(::1, ) from (::6, 37458)
  Test case #4 (IPv6/UDP):
  Requested: bind(face:b00c:1234:5678::abcd, 6060) ..
 Actual: bind(::1, )
  Requested: connect(face:b00c:1234:5678::abcd, 6060) from (*, *)
 Actual: connect(::1, ) from (::6, 39315)
  ### SUCCESS
  # egrep 'connect\(.*AF_INET' connect.trace | \
  > egrep -vw 'htons\(1025\)' | fold -b -s -w 72
  502   connect(7, {sa_family=AF_INET, sin_port=htons(4040),
  sin_addr=inet_addr("192.168.1.254")}, 128) = 0
  502   connect(8, {sa_family=AF_INET, sin_port=htons(4040),
  sin_addr=inet_addr("192.168.1.254")}, 128) = 0
  502   connect(9, {sa_family=AF_INET6, sin6_port=htons(6060),
  inet_pton(AF_INET6, "face:b00c:1234:5678::abcd", _addr),
  sin6_flowinfo=0, sin6_scope_id=0}, 128) = 0
  502   connect(10, {sa_family=AF_INET6, sin6_port=htons(6060),
  inet_pton(AF_INET6, "face:b00c:1234:5678::abcd", _addr),
  sin6_flowinfo=0, sin6_scope_id=0}, 128) = 0
  # fg
  tcpdump -pn -i lo -w connect.pcap 2> /dev/null
  # tcpdump -r connect.pcap -n tcp | cut -c 1-72
  reading from file connect.pcap, link-type EN10MB (Ethernet)
  17:57:40.383533 IP 127.0.0.4.56068 > 127.0.0.1.: Flags [S], seq 1333
  17:57:40.383566 IP 127.0.0.1. > 127.0.0.4.56068: Flags [S.], seq 112
  17:57:40.383589 IP 127.0.0.4.56068 > 127.0.0.1.: Flags [.], ack 1, w
  17:57:40.384578 IP 127.0.0.1. > 127.0.0.4.56068: Flags [R.], seq 1,
  17:57:40.403327 IP6 ::6.37458 > ::1.: Flags [S], seq 406513443, win
  17:57:40.403357 IP6 ::1. > ::6.37458: Flags [S.], seq 2448389240, ac
  17:57:40.403376 IP6 ::6.37458 > ::1.: Flags [.], ack 1, win 342, opt
  17:57:40.404263 IP6 ::1. > ::6.37458: Flags [R.], seq 1, ack 1, win

Signed-off-by: Andrey Ignatov 
Signed-off-by: Alexei Starovoitov 
---
 tools/include/uapi/linux/bpf.h|  12 ++-
 tools/lib/bpf/libbpf.c|   2 +
 tools/testing/selftests/bpf/Makefile  |   5 +-
 tools/testing/selftests/bpf/bpf_helpers.h |   2 +
 tools/testing/selftests/bpf/connect4_prog.c   |  45 +++
 tools/testing/selftests/bpf/connect6_prog.c   |  61 +++
 tools/testing/selftests/bpf/test_sock_addr.c  | 104 

[PATCH v3 bpf-next 5/9] net: Introduce __inet_bind() and __inet6_bind

2018-03-30 Thread Alexei Starovoitov
From: Andrey Ignatov 

Refactor the `bind()` code to make it ready to be called from the BPF
helper function `bpf_bind()` (will be added soon). The implementations of
`inet_bind()` and `inet6_bind()` are separated into `__inet_bind()` and
`__inet6_bind()` respectively. These functions can be used from both
`sk_prot->bind` and `bpf_bind()` contexts.

New functions have two additional arguments.

`force_bind_address_no_port` forces binding to the IP only w/o checking
the `inet_sock.bind_address_no_port` field. It'll allow binding the local
end of a connection to a desired IP in `bpf_bind()` w/o changing the
`bind_address_no_port` field of a socket. It's useful since `bpf_bind()`
can return an error, and we'd then need to restore the original value of
`bind_address_no_port` if we had changed it before calling the helper.

`with_lock` specifies whether to lock the socket when working with `struct
sk` or not. The argument is set to `true` for `sk_prot->bind`, i.e. the
old behavior is preserved. But it will be set to `false` for the
`bpf_bind()` use-case. The reason is that all call sites where `bpf_bind()`
will be called already hold the socket lock.
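So a BPF helper running with the socket already locked would presumably
call (sketch only; addr assumed to be a filled-in sockaddr_in):

	/* bind to the IP only, ignore bind_address_no_port, and skip
	 * lock_sock() since the caller already holds the socket lock */
	err = __inet_bind(sk, (struct sockaddr *)&addr, sizeof(addr),
			  true /* force_bind_address_no_port */,
			  false /* with_lock */);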

Signed-off-by: Andrey Ignatov 
Acked-by: Alexei Starovoitov 
Signed-off-by: Alexei Starovoitov 
---
 include/net/inet_common.h |  2 ++
 include/net/ipv6.h|  2 ++
 net/ipv4/af_inet.c| 39 ---
 net/ipv6/af_inet6.c   | 37 -
 4 files changed, 52 insertions(+), 28 deletions(-)

diff --git a/include/net/inet_common.h b/include/net/inet_common.h
index 500f81375200..384b90c62c0b 100644
--- a/include/net/inet_common.h
+++ b/include/net/inet_common.h
@@ -32,6 +32,8 @@ int inet_shutdown(struct socket *sock, int how);
 int inet_listen(struct socket *sock, int backlog);
 void inet_sock_destruct(struct sock *sk);
 int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
+int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
+   bool force_bind_address_no_port, bool with_lock);
 int inet_getname(struct socket *sock, struct sockaddr *uaddr,
 int peer);
 int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 50a6f0ddb878..2e5fedc56e59 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -1066,6 +1066,8 @@ void ipv6_local_error(struct sock *sk, int err, struct 
flowi6 *fl6, u32 info);
 void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu);
 
 int inet6_release(struct socket *sock);
+int __inet6_bind(struct sock *sock, struct sockaddr *uaddr, int addr_len,
+bool force_bind_address_no_port, bool with_lock);
 int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
 int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
  int peer);
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 2dec266507dc..e203a39d6988 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -432,30 +432,37 @@ EXPORT_SYMBOL(inet_release);
 
 int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 {
-   struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
struct sock *sk = sock->sk;
-   struct inet_sock *inet = inet_sk(sk);
-   struct net *net = sock_net(sk);
-   unsigned short snum;
-   int chk_addr_ret;
-   u32 tb_id = RT_TABLE_LOCAL;
int err;
 
/* If the socket has its own bind function then use it. (RAW) */
if (sk->sk_prot->bind) {
-   err = sk->sk_prot->bind(sk, uaddr, addr_len);
-   goto out;
+   return sk->sk_prot->bind(sk, uaddr, addr_len);
}
-   err = -EINVAL;
if (addr_len < sizeof(struct sockaddr_in))
-   goto out;
+   return -EINVAL;
 
/* BPF prog is run before any checks are done so that if the prog
 * changes context in a wrong way it will be caught.
 */
err = BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr);
if (err)
-   goto out;
+   return err;
+
+   return __inet_bind(sk, uaddr, addr_len, false, true);
+}
+EXPORT_SYMBOL(inet_bind);
+
+int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
+   bool force_bind_address_no_port, bool with_lock)
+{
+   struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
+   struct inet_sock *inet = inet_sk(sk);
+   struct net *net = sock_net(sk);
+   unsigned short snum;
+   int chk_addr_ret;
+   u32 tb_id = RT_TABLE_LOCAL;
+   int err;
 
if (addr->sin_family != AF_INET) {
/* Compatibility games : accept AF_UNSPEC (mapped to AF_INET)
@@ -499,7 +506,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, 
int addr_len)
 *  would be illegal to use them (multicast/broadcast) in
 *  which case the sending 

[PATCH v3 bpf-next 8/9] bpf: Post-hooks for sys_bind

2018-03-30 Thread Alexei Starovoitov
From: Andrey Ignatov 

"Post-hooks" are hooks that are called right before returning from
sys_bind. At this time IP and port are already allocated and no further
changes to `struct sock` can happen before returning from sys_bind but
BPF program has a chance to inspect the socket and change sys_bind
result.

Specifically it can e.g. inspect what port was allocated and if it
doesn't satisfy some policy, BPF program can force sys_bind to fail and
return EPERM to user.

Another example of usage is recording the IP:port pair in some map to
use it in later calls to sys_connect. E.g. if some TCP server inside a
cgroup was bound to some IP:port_n, it can be recorded in a map. And
later when some TCP client inside the same cgroup is trying to connect to
127.0.0.1:port_n, the BPF hook for sys_connect can override the
destination and connect the application to IP:port_n instead of
127.0.0.1:port_n. That helps force all applications inside a cgroup to
use the desired IP without breaking those applications if they e.g. use
localhost to communicate with each other.

== Implementation details ==

Post-hooks are implemented as two new attach types
`BPF_CGROUP_INET4_POST_BIND` and `BPF_CGROUP_INET6_POST_BIND` for
existing prog type `BPF_PROG_TYPE_CGROUP_SOCK`.

Separate attach types for IPv4 and IPv6 are introduced to avoid access
to IPv6 field in `struct sock` from `inet_bind()` and to IPv4 field from
`inet6_bind()` since those fields might not make sense in such cases.

Signed-off-by: Andrey Ignatov 
Signed-off-by: Alexei Starovoitov 
---
 include/linux/bpf-cgroup.h |  16 +--
 include/uapi/linux/bpf.h   |  11 +
 kernel/bpf/syscall.c   |  43 +
 net/core/filter.c  | 116 +++--
 net/ipv4/af_inet.c |  18 ---
 net/ipv6/af_inet6.c|  21 +---
 6 files changed, 195 insertions(+), 30 deletions(-)

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index c6ab295e6dcb..30d15e64b993 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -98,16 +98,24 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 
major, u32 minor,
__ret; \
 })
 
-#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \
+#define BPF_CGROUP_RUN_SK_PROG(sk, type)  \
 ({\
int __ret = 0; \
if (cgroup_bpf_enabled) {  \
-   __ret = __cgroup_bpf_run_filter_sk(sk, \
-BPF_CGROUP_INET_SOCK_CREATE); \
+   __ret = __cgroup_bpf_run_filter_sk(sk, type);  \
}  \
__ret; \
 })
 
+#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \
+   BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET_SOCK_CREATE)
+
+#define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk)
   \
+   BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET4_POST_BIND)
+
+#define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk)
   \
+   BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET6_POST_BIND)
+
 #define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, type)
   \
 ({\
int __ret = 0; \
@@ -183,6 +191,8 @@ static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { 
return 0; }
 #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) ({ 0; })
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 77afaf1ba556..c5ec89732a8d 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -152,6 +152,8 @@ enum bpf_attach_type {
BPF_CGROUP_INET6_BIND,
BPF_CGROUP_INET4_CONNECT,
BPF_CGROUP_INET6_CONNECT,
+   BPF_CGROUP_INET4_POST_BIND,
+   BPF_CGROUP_INET6_POST_BIND,
__MAX_BPF_ATTACH_TYPE
 };
 
@@ -948,6 +950,15 @@ struct bpf_sock {
__u32 protocol;
__u32 mark;
__u32 priority;
+   __u32 src_ip4;  /* Allows 1,2,4-byte read.
+  

[PATCH v3 bpf-next 6/9] bpf: Hooks for sys_connect

2018-03-30 Thread Alexei Starovoitov
From: Andrey Ignatov 

== The problem ==

See description of the problem in the initial patch of this patch set.

== The solution ==

The patch provides a much more reliable in-kernel solution for the 2nd
part of the problem: making an outgoing connection from a desired IP.

It adds new attach types `BPF_CGROUP_INET4_CONNECT` and
`BPF_CGROUP_INET6_CONNECT` for program type
`BPF_PROG_TYPE_CGROUP_SOCK_ADDR` that can be used to override both
source and destination of a connection at connect(2) time.

The local end of a connection can be bound to a desired IP using the
newly introduced BPF helper `bpf_bind()`. It only allows binding to an IP
though, and doesn't support binding to a port, i.e. it leverages the
`IP_BIND_ADDRESS_NO_PORT` socket option. There are two reasons for this:
* looking for a free port is expensive and can affect performance
  significantly;
* there is no use-case for the port.

As for the remote end (the `struct sockaddr *` passed by the user), both
parts of it can be overridden: remote IP and remote port. It's useful if
an application inside a cgroup wants to connect to another application
inside the same cgroup or to itself, but knows nothing about the IP
assigned to the cgroup.

Support is added for IPv4 and IPv6, for TCP and UDP.

IPv4 and IPv6 have separate attach types for the same reason as the
sys_bind hooks, i.e. to prevent reading from / writing to e.g. user_ip6
fields when the user passes a sockaddr_in, since that would be
out-of-bounds.
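Putting it together, a connect hook could look like this sketch (close
to the selftest in the next patch; the addresses are made up):

	SEC("cgroup/connect4")
	int connect_v4_prog(struct bpf_sock_addr *ctx)
	{
		struct sockaddr_in sa = {};

		/* pin the source IP to 127.0.0.4; port selection is
		 * left to the kernel (IP_BIND_ADDRESS_NO_PORT) */
		sa.sin_family = AF_INET;
		sa.sin_addr.s_addr = bpf_htonl(0x7f000004);
		if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)) != 0)
			return 0;

		/* redirect the destination to the real service */
		ctx->user_ip4 = bpf_htonl(0x7f000001);	/* 127.0.0.1 */
		ctx->user_port = bpf_htons(4444);
		return 1;
	}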

== Implementation notes ==

The patch introduces a new field in `struct proto`: `pre_connect`, a
pointer to a function with the same signature as `connect` that is called
before it. The reason is that in some cases BPF hooks should be called
way before control is passed to `sk->sk_prot->connect`. Specifically,
`inet_dgram_connect` autobinds the socket before calling
`sk->sk_prot->connect`, and there is no way to call `bpf_bind()` from
hooks in e.g. `ip4_datagram_connect` or `ip6_datagram_connect` since
it'd cause a double bind. On the other hand, `proto.pre_connect` provides
a flexible way to add BPF hooks for connect only for the protos that need
them, and to call them at the desired time before `connect`. Since
`bpf_bind()` is allowed to bind only to an IP, and the autobind in
`inet_dgram_connect` binds only a port, there is no chance of a double
bind.

bpf_bind() sets `force_bind_address_no_port` to bind to the IP only,
regardless of the value of the `bind_address_no_port` socket field.

bpf_bind() sets `with_lock` to `false` when calling __inet_bind()
and __inet6_bind() since all call sites where bpf_bind() is called
already hold the socket lock.

Signed-off-by: Andrey Ignatov 
Signed-off-by: Alexei Starovoitov 
---
 include/linux/bpf-cgroup.h | 31 +
 include/net/addrconf.h |  7 ++
 include/net/sock.h |  3 +++
 include/net/udp.h  |  1 +
 include/uapi/linux/bpf.h   | 12 +-
 kernel/bpf/syscall.c   |  8 +++
 net/core/filter.c  | 57 ++
 net/ipv4/af_inet.c | 13 +++
 net/ipv4/tcp_ipv4.c| 16 +
 net/ipv4/udp.c | 14 
 net/ipv6/af_inet6.c|  5 
 net/ipv6/tcp_ipv6.c| 16 +
 net/ipv6/udp.c | 20 
 13 files changed, 202 insertions(+), 1 deletion(-)

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 67dc4a6471ad..c6ab295e6dcb 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -116,12 +116,38 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 
major, u32 minor,
__ret; \
 })
 
+#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type)  \
+({\
+   int __ret = 0; \
+   if (cgroup_bpf_enabled) {  \
+   lock_sock(sk); \
+   __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type);\
+   release_sock(sk);  \
+   }  \
+   __ret; \
+})
+
 #define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) \
BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_BIND)
 
 #define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) \
BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_BIND)
 
+#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (cgroup_bpf_enabled && \
+   sk->sk_prot->pre_connect)
+
+#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr)  \
+   BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_CONNECT)
+
+#define 

Re: [PATCH net v4 0/3] ipv6: udp6: set dst cache for a connected sk if current not valid

2018-03-30 Thread Martin KaFai Lau
On Fri, Mar 30, 2018 at 08:53:06PM +0300, Alexey Kodanev wrote:
> A new RTF_CACHE route can be created with the socket's dst cache
> update between the below calls in udpv6_sendmsg(), when datagram
> sending results to ICMPV6_PKT_TOOBIG error:
> 
>dst = ip6_sk_dst_lookup_flow(...)
>...
> release_dst:
> if (dst) {
> if (connected) {
> ip6_dst_store(sk, dst)
> 
> Therefore, the new socket's dst cache reset to the old one on
> "release_dst:".
> 
> The first two patches prepare the code to store dst cache
> with ip6_sk_dst_lookup_flow():
> 
>   * the first patch adds ip6_dst_store_flow() function with
> commonly used source and destination address checks using
> the flow information.
> 
>   * the second patch adds new argument to ip6_sk_dst_lookup_flow()
> and ability to store dst in the socket's cache. Also, the two
> users of the function are updated without enabling the new
> behavior: pingv6_sendmsg() and udpv6_sendmsg().
> 
> The last patch contains the actual fix that removes sk dst cache
> update in the end of udpv6_sendmsg(), and allows to do it in
> ip6_sk_dst_lookup_flow().
> 
> v4: * fix the error in the build of ip_dst_store_flow() reported by
>   kbuild test robot due to missing checks for CONFIG_IPV6: add
>   new function to ip6_output.c instead of ip6_route.h
Thanks for the patches!

Instead of ip6_output.c, would net/ipv6/route.c be a better
place for the new ip6_sk_dst_store_flow()?

Others LGTM.  

> * add 'const' to struct flowi6 in ip6_dst_store_flow()
> * minor commit messages fixes
> 
> v3: * instead of moving ip6_dst_store() above udp_v6_send_skb(),
>   update socket's dst cache inside ip6_sk_dst_lookup_flow()
>   if the current one is invalid
> * the issue not reproduced in 4.1, but starting from 4.2. Add
>   one more 'Fixes:' commit that creates new RTF_CACHE route.
>   Though, it is also mentioned in the first one
> 
> 
> Alexey Kodanev (3):
>   ipv6: add a wrapper for ip6_dst_store() with flowi6 checks
>   ipv6: allow to cache dst for a connected sk in ip6_sk_dst_lookup_flow()
>   ipv6: udp6: set dst cache for a connected sk if current not valid
> 
>  include/net/ipv6.h|  6 --
>  net/ipv6/datagram.c   |  9 +
>  net/ipv6/ip6_output.c | 32 +---
>  net/ipv6/ping.c   |  2 +-
>  net/ipv6/udp.c| 21 ++---
>  5 files changed, 37 insertions(+), 33 deletions(-)
> 
> -- 
> 1.8.3.1
> 


[PATCH net-next 00/12] rxrpc: Fixes and more traces

2018-03-30 Thread David Howells

Here are some patches that add some more tracepoints to AF_RXRPC and fix
some issues therein:

 (1) Fix the use of VERSION packets to keep firewall routes open.

 (2) Fix the incorrect current time usage in a tracepoint.

 (3) Fix Tx ring annotation corruption.

 (4) Fix accidental conversion of call-level abort into connection-level
 abort.

 (5) Fix calculation of resend time.

 (6) Remove a couple of unused variables.

 (7) Fix a bunch of checker warnings and an error.  Note that not all
     warnings can be quashed as the checker doesn't seem to handle
     seqlocks correctly.

 (8) Fix a potential race between call destruction and socket/net
 destruction.

 (9) Add a tracepoint to track rxrpc_local refcounting.

(10) Fix an apparent leak of rxrpc_local objects.

(11) Add a tracepoint to track rxrpc_peer refcounting.

(12) Fix a leak of rxrpc_peer objects.

The patches are tagged here:

git://git.kernel.org/pub/scm/linux/kernel/git/dhowells/linux-fs.git
rxrpc-next-20180330

and can also be found on this branch:


http://git.kernel.org/cgit/linux/kernel/git/dhowells/linux-fs.git/log/?h=rxrpc-next

David
---
David Howells (10):
  rxrpc: Fix firewall route keepalive
  rxrpc: Fix a bit of time confusion
  rxrpc: Fix Tx ring annotation after initial Tx failure
  rxrpc: Don't treat call aborts as conn aborts
  rxrpc: Fix checker warnings and errors
  rxrpc: Fix potential call vs socket/net destruction race
  rxrpc: Add a tracepoint to track rxrpc_local refcounting
  rxrpc: Fix apparent leak of rxrpc_local objects
  rxrpc: Add a tracepoint to track rxrpc_peer refcounting
  rxrpc: Fix leak of rxrpc_peer objects

Marc Dionne (1):
  rxrpc: Fix resend event time calculation

Sebastian Andrzej Siewior (1):
  rxrpc: remove unused static variables


 include/trace/events/rxrpc.h |   85 
 net/rxrpc/af_rxrpc.c |6 +++
 net/rxrpc/ar-internal.h  |   68 +++--
 net/rxrpc/call_accept.c  |9 +++-
 net/rxrpc/call_event.c   |4 +-
 net/rxrpc/call_object.c  |   17 ++-
 net/rxrpc/conn_client.c  |3 +
 net/rxrpc/conn_event.c   |3 +
 net/rxrpc/conn_object.c  |   10 
 net/rxrpc/conn_service.c |1 
 net/rxrpc/input.c|   17 +--
 net/rxrpc/local_object.c |   65 +++-
 net/rxrpc/net_ns.c   |   24 ++
 net/rxrpc/output.c   |   59 +
 net/rxrpc/peer_event.c   |   98 ++
 net/rxrpc/peer_object.c  |   93 +++-
 net/rxrpc/proc.c |6 +++
 net/rxrpc/rxkad.c|2 +
 net/rxrpc/security.c |3 -
 net/rxrpc/sendmsg.c  |7 +++
 20 files changed, 509 insertions(+), 71 deletions(-)



[PATCH net-next 01/12] rxrpc: Fix firewall route keepalive

2018-03-30 Thread David Howells
Fix the firewall route keepalive part of AF_RXRPC, which currently
functions incorrectly by replying to VERSION REPLY packets from the server
with VERSION REQUEST packets.

Instead, send VERSION REPLY packets to the peers of service connections to
act as keep-alives 20s after the latest packet was transmitted to that
peer.

Also, just discard VERSION REPLY packets rather than replying to them.
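
For readers following the diff: the keepalive state added to struct
rxrpc_net below is a ring of per-second buckets that a rearming worker
walks with a cursor.  A rough sketch of the bucketing idea (illustrative
only, using the new fields; the real cursor/base arithmetic lives in
peer_event.c):

	/* File the peer into the bucket that falls due
	 * RXRPC_KEEPALIVE_TIME seconds after its last transmission;
	 * the worker pings whatever it finds in the bucket it reaches.
	 */
	time64_t due = peer->last_tx_at + RXRPC_KEEPALIVE_TIME;
	unsigned int slot = due % ARRAY_SIZE(rxnet->peer_keepalive);

	hlist_add_head(&peer->keepalive_link, &rxnet->peer_keepalive[slot]);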

Signed-off-by: David Howells 
---

 net/rxrpc/af_rxrpc.c|4 ++
 net/rxrpc/ar-internal.h |   14 ++-
 net/rxrpc/conn_event.c  |3 +
 net/rxrpc/input.c   |2 +
 net/rxrpc/net_ns.c  |   21 ++
 net/rxrpc/output.c  |   59 -
 net/rxrpc/peer_event.c  |   96 +++
 net/rxrpc/peer_object.c |7 +++
 net/rxrpc/rxkad.c   |2 +
 9 files changed, 204 insertions(+), 4 deletions(-)

diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index ec5ec68be1aa..0b3026b8fa40 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -762,6 +762,7 @@ static __poll_t rxrpc_poll(struct file *file, struct socket 
*sock,
 static int rxrpc_create(struct net *net, struct socket *sock, int protocol,
int kern)
 {
+   struct rxrpc_net *rxnet;
struct rxrpc_sock *rx;
struct sock *sk;
 
@@ -801,6 +802,9 @@ static int rxrpc_create(struct net *net, struct socket 
*sock, int protocol,
	rwlock_init(&rx->call_lock);
	memset(&rx->srx, 0, sizeof(rx->srx));
 
+	rxnet = rxrpc_net(sock_net(&rx->sk));
+	timer_reduce(&rxnet->peer_keepalive_timer, jiffies + 1);
+
_leave(" = 0 [%p]", rx);
return 0;
 }
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 21cf164b6d85..8a348e0a9d95 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -97,8 +97,16 @@ struct rxrpc_net {
	struct list_head	local_endpoints;
	struct mutex		local_mutex;	/* Lock for ->local_endpoints */
 
-   spinlock_t  peer_hash_lock; /* Lock for ->peer_hash */
DECLARE_HASHTABLE   (peer_hash, 10);
+   spinlock_t  peer_hash_lock; /* Lock for ->peer_hash */
+
+#define RXRPC_KEEPALIVE_TIME 20 /* NAT keepalive time in seconds */
+   u8  peer_keepalive_cursor;
+   ktime_t peer_keepalive_base;
+   struct hlist_head   peer_keepalive[RXRPC_KEEPALIVE_TIME + 1];
+   struct hlist_head   peer_keepalive_new;
+   struct timer_list   peer_keepalive_timer;
+   struct work_struct  peer_keepalive_work;
 };
 
 /*
@@ -285,6 +293,8 @@ struct rxrpc_peer {
	struct hlist_head	error_targets;	/* targets for net error distribution */
	struct work_struct	error_distributor;
	struct rb_root		service_conns;	/* Service connections */
+	struct hlist_node	keepalive_link;	/* Link in net->peer_keepalive[] */
+	time64_t		last_tx_at;	/* Last time packet sent here */
	seqlock_t		service_conn_lock;
	spinlock_t		lock;		/* access lock */
	unsigned int		if_mtu;		/* interface MTU for this peer */
@@ -1026,6 +1036,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *, bool, 
rxrpc_serial_t *);
 int rxrpc_send_abort_packet(struct rxrpc_call *);
 int rxrpc_send_data_packet(struct rxrpc_call *, struct sk_buff *, bool);
 void rxrpc_reject_packets(struct rxrpc_local *);
+void rxrpc_send_keepalive(struct rxrpc_peer *);
 
 /*
  * peer_event.c
@@ -1034,6 +1045,7 @@ void rxrpc_error_report(struct sock *);
 void rxrpc_peer_error_distributor(struct work_struct *);
 void rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace,
rxrpc_serial_t, rxrpc_serial_t, ktime_t, ktime_t);
+void rxrpc_peer_keepalive_worker(struct work_struct *);
 
 /*
  * peer_object.c
diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c
index d2ec3fd593e8..c717152070df 100644
--- a/net/rxrpc/conn_event.c
+++ b/net/rxrpc/conn_event.c
@@ -136,6 +136,7 @@ static void rxrpc_conn_retransmit_call(struct 
rxrpc_connection *conn,
}
 
	kernel_sendmsg(conn->params.local->socket, &msg, iov, ioc, len);
+   conn->params.peer->last_tx_at = ktime_get_real();
_leave("");
return;
 }
@@ -239,6 +240,8 @@ static int rxrpc_abort_connection(struct rxrpc_connection 
*conn,
return -EAGAIN;
}
 
+   conn->params.peer->last_tx_at = ktime_get_real();
+
_leave(" = 0");
return 0;
 }
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index 2a868fdab0ae..d4f2509e018b 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -1183,6 +1183,8 @@ void rxrpc_data_ready(struct sock *udp_sk)
 
switch (sp->hdr.type) {
case RXRPC_PACKET_TYPE_VERSION:
+   if (!(sp->hdr.flags & RXRPC_CLIENT_INITIATED))
+   goto discard;
 

[PATCH net-next 02/12] rxrpc: Fix a bit of time confusion

2018-03-30 Thread David Howells
The rxrpc_reduce_call_timer() function should be passed the 'current time'
in jiffies, not the current ktime time.  It's confusing in rxrpc_resend
because that has to deal with both.  Pass the correct current time in.

Note that this only affects the trace produced and not the functioning of
the code.

Fixes: a158bdd3247b ("rxrpc: Fix call timeouts")
Signed-off-by: David Howells 
---

 net/rxrpc/call_event.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index 6a62e42e1d8d..3dee89c7a06e 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -238,7 +238,7 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned 
long now_j)
 * retransmitting data.
 */
if (!retrans) {
-   rxrpc_reduce_call_timer(call, resend_at, now,
+   rxrpc_reduce_call_timer(call, resend_at, now_j,
rxrpc_timer_set_for_resend);
	spin_unlock_bh(&call->lock);
ack_ts = ktime_sub(now, call->acks_latest_ts);



[PATCH net-next 03/12] rxrpc: Fix Tx ring annotation after initial Tx failure

2018-03-30 Thread David Howells
rxrpc calls have a ring of packets that are awaiting ACK or retransmission
and a parallel ring of annotations that tracks the state of those packets.
If the initial transmission of a packet on the underlying UDP socket fails
then the packet annotation is marked for resend - but the setting of this
mark accidentally erases the last-packet mark also stored in the same
annotation slot.  If this happens, a call won't switch out of the Tx phase
when all the packets have been transmitted.

Fix this by retaining the last-packet mark and only altering the packet
state.
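
The fix is a standard read-modify-write that keeps one flag while
replacing the state bits; equivalently, using the annotation names from
ar-internal.h:

	u8 anno = call->rxtx_annotations[ix];

	/* Preserve LAST, replace the rest of the annotation with RETRANS. */
	call->rxtx_annotations[ix] =
		(anno & RXRPC_TX_ANNO_LAST) | RXRPC_TX_ANNO_RETRANS;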

Fixes: 248f219cb8bc ("rxrpc: Rewrite the data and ack handling code")
Signed-off-by: David Howells 
---

 net/rxrpc/sendmsg.c |4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
index 8503f279b467..783c777fc6e7 100644
--- a/net/rxrpc/sendmsg.c
+++ b/net/rxrpc/sendmsg.c
@@ -130,7 +130,9 @@ static inline void rxrpc_instant_resend(struct rxrpc_call 
*call, int ix)
	spin_lock_bh(&call->lock);
 
if (call->state < RXRPC_CALL_COMPLETE) {
-   call->rxtx_annotations[ix] = RXRPC_TX_ANNO_RETRANS;
+   call->rxtx_annotations[ix] =
+   (call->rxtx_annotations[ix] & RXRPC_TX_ANNO_LAST) |
+   RXRPC_TX_ANNO_RETRANS;
		if (!test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events))
rxrpc_queue_call(call);
}



[PATCH net-next 04/12] rxrpc: Don't treat call aborts as conn aborts

2018-03-30 Thread David Howells
If a call-level abort is received for the previous call to complete on a
connection channel, then that abort is queued for the connection processor
to handle.  Unfortunately, the connection processor then assumes without
checking that the abort is connection-level (ie. callNumber is 0) and
distributes it over all active calls on that connection, thereby
incorrectly aborting them.

Fix this by discarding aborts aimed at a completed call.

Further, discard all packets aimed at a call that's complete if there's
currently an active call on a channel, since the DATA packets associated
with the new call automatically terminate the old call.

Fixes: 18bfeba50dfd ("rxrpc: Perform terminal call ACK/ABORT retransmission 
from conn processor")
Reported-by: Marc Dionne 
Signed-off-by: David Howells 
---

 net/rxrpc/input.c |   15 +--
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index d4f2509e018b..21800e6f5019 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -1242,16 +1242,19 @@ void rxrpc_data_ready(struct sock *udp_sk)
goto discard_unlock;
 
if (sp->hdr.callNumber == chan->last_call) {
-   /* For the previous service call, if completed 
successfully, we
-* discard all further packets.
+   if (chan->call ||
+   sp->hdr.type == RXRPC_PACKET_TYPE_ABORT)
+   goto discard_unlock;
+
+   /* For the previous service call, if completed
+* successfully, we discard all further packets.
 */
if (rxrpc_conn_is_service(conn) &&
-   (chan->last_type == RXRPC_PACKET_TYPE_ACK ||
-sp->hdr.type == RXRPC_PACKET_TYPE_ABORT))
+   chan->last_type == RXRPC_PACKET_TYPE_ACK)
goto discard_unlock;
 
-   /* But otherwise we need to retransmit the final packet 
from
-* data cached in the connection record.
+   /* But otherwise we need to retransmit the final packet
+* from data cached in the connection record.
 */
rxrpc_post_packet_to_conn(conn, skb);
goto out_unlock;



[PATCH net-next 07/12] rxrpc: Fix checker warnings and errors

2018-03-30 Thread David Howells
Fix various issues detected by checker.

Errors:

 (*) rxrpc_discard_prealloc() should be using rcu_assign_pointer to set
 call->socket.

Warnings:

 (*) rxrpc_service_connection_reaper() should be passing NULL rather than 0 to
 trace_rxrpc_conn() as the where argument.

 (*) rxrpc_disconnect_client_call() should get its net pointer via the
 call->conn rather than call->sock to avoid a warning about accessing
 an RCU pointer without protection.

 (*) Proc seq start/stop functions need annotation as they pass locks
 between the functions.

False positives:

 (*) Checker doesn't correctly handle of seq-retry lock context balance in
 rxrpc_find_service_conn_rcu().

 (*) Checker thinks execution may proceed past the BUG() in
 rxrpc_publish_service_conn().

 (*) Variable length array warnings from SKCIPHER_REQUEST_ON_STACK() in
 rxkad.c.
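
For context, the __acquires()/__releases() annotations used below tell
sparse that a function intentionally enters or leaves with a lock held.
A minimal generic sketch of the seq_file pattern (foo_lock and foo_list
are made-up names, not rxrpc code):

	static void *foo_seq_start(struct seq_file *seq, loff_t *pos)
		__acquires(foo_lock)
	{
		read_lock(&foo_lock);	/* held from ->start to ->stop */
		return seq_list_start(&foo_list, *pos);
	}

	static void foo_seq_stop(struct seq_file *seq, void *v)
		__releases(foo_lock)
	{
		read_unlock(&foo_lock);
	}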

Signed-off-by: David Howells 
---

 net/rxrpc/call_accept.c |3 ++-
 net/rxrpc/call_object.c |1 +
 net/rxrpc/conn_client.c |2 +-
 net/rxrpc/conn_object.c |2 +-
 net/rxrpc/proc.c|6 ++
 net/rxrpc/sendmsg.c |2 ++
 6 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
index 92ebd1d7e0bb..4ce24c000653 100644
--- a/net/rxrpc/call_accept.c
+++ b/net/rxrpc/call_accept.c
@@ -225,7 +225,7 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx)
tail = b->call_backlog_tail;
while (CIRC_CNT(head, tail, size) > 0) {
struct rxrpc_call *call = b->call_backlog[tail];
-   call->socket = rx;
+   rcu_assign_pointer(call->socket, rx);
if (rx->discard_new_call) {
_debug("discard %lx", call->user_call_ID);
rx->discard_new_call(call, call->user_call_ID);
@@ -456,6 +456,7 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx,
 unsigned long user_call_ID,
 rxrpc_notify_rx_t notify_rx)
	__releases(&rx->sk.sk_lock.slock)
+   __acquires(call->user_mutex)
 {
struct rxrpc_call *call;
struct rb_node *parent, **pp;
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index 147657dfe757..85b12c472522 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -219,6 +219,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock 
*rx,
 gfp_t gfp,
 unsigned int debug_id)
	__releases(&rx->sk.sk_lock.slock)
+	__acquires(&call->user_mutex)
 {
struct rxrpc_call *call, *xcall;
struct rxrpc_net *rxnet = rxrpc_net(sock_net(>sk));
diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
index 064175068059..041da40dbf93 100644
--- a/net/rxrpc/conn_client.c
+++ b/net/rxrpc/conn_client.c
@@ -776,7 +776,7 @@ void rxrpc_disconnect_client_call(struct rxrpc_call *call)
unsigned int channel = call->cid & RXRPC_CHANNELMASK;
struct rxrpc_connection *conn = call->conn;
struct rxrpc_channel *chan = >channels[channel];
-	struct rxrpc_net *rxnet = rxrpc_net(sock_net(&call->socket->sk));
+   struct rxrpc_net *rxnet = conn->params.local->rxnet;
 
trace_rxrpc_client(conn, channel, rxrpc_client_chan_disconnect);
call->conn = NULL;
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index ccbac190add1..bfc46fd69a62 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -418,7 +418,7 @@ void rxrpc_service_connection_reaper(struct work_struct 
*work)
 */
		if (atomic_cmpxchg(&conn->usage, 1, 0) != 1)
continue;
-   trace_rxrpc_conn(conn, rxrpc_conn_reap_service, 0, 0);
+   trace_rxrpc_conn(conn, rxrpc_conn_reap_service, 0, NULL);
 
if (rxrpc_conn_is_client(conn))
BUG();
diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c
index f79f260c6ddc..7e45db058823 100644
--- a/net/rxrpc/proc.c
+++ b/net/rxrpc/proc.c
@@ -29,6 +29,8 @@ static const char *const 
rxrpc_conn_states[RXRPC_CONN__NR_STATES] = {
  * generate a list of extant and dead calls in /proc/net/rxrpc_calls
  */
 static void *rxrpc_call_seq_start(struct seq_file *seq, loff_t *_pos)
+   __acquires(rcu)
+   __acquires(rxnet->call_lock)
 {
struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq));
 
@@ -45,6 +47,8 @@ static void *rxrpc_call_seq_next(struct seq_file *seq, void 
*v, loff_t *pos)
 }
 
 static void rxrpc_call_seq_stop(struct seq_file *seq, void *v)
+   __releases(rxnet->call_lock)
+   __releases(rcu)
 {
struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq));
 
@@ -135,6 +139,7 @@ const struct file_operations rxrpc_call_seq_fops = {
  * generate a list of extant virtual connections in /proc/net/rxrpc_conns
  */
 static void *rxrpc_connection_seq_start(struct seq_file *seq, loff_t 

[PATCH net-next 06/12] rxrpc: remove unused static variables

2018-03-30 Thread David Howells
From: Sebastian Andrzej Siewior 

The rxrpc_security_methods and rxrpc_security_sem user has been removed
in 648af7fca159 ("rxrpc: Absorb the rxkad security module"). This was
noticed by kbuild test robot for the -RT tree but is also true for !RT.

Reported-by: kbuild test robot 
Signed-off-by: Sebastian Andrzej Siewior 
Signed-off-by: David Howells 
---

 net/rxrpc/security.c |3 ---
 1 file changed, 3 deletions(-)

diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c
index e9f428351293..c4479afe8ae7 100644
--- a/net/rxrpc/security.c
+++ b/net/rxrpc/security.c
@@ -19,9 +19,6 @@
 #include 
 #include "ar-internal.h"
 
-static LIST_HEAD(rxrpc_security_methods);
-static DECLARE_RWSEM(rxrpc_security_sem);
-
 static const struct rxrpc_security *rxrpc_security_types[] = {
[RXRPC_SECURITY_NONE]   = _no_security,
 #ifdef CONFIG_RXKAD



[PATCH net-next 08/12] rxrpc: Fix potential call vs socket/net destruction race

2018-03-30 Thread David Howells
rxrpc_call structs don't pin sockets or network namespaces, but may attempt
to access both after their refcount reaches 0 so that they can detach
themselves from the network namespace.  However, there's no guarantee that
the socket still exists at this point (so sock_net(&call->socket->sk) may
be invalid) and the namespace may have gone away if the call isn't pinning
a peer.

Fix this by (a) carrying a net pointer in the rxrpc_call struct and (b)
waiting for all calls to be destroyed when the network namespace goes away.

This was detected by checker:

net/rxrpc/call_object.c:634:57: warning: incorrect type in argument 1 
(different address spaces)
net/rxrpc/call_object.c:634:57:expected struct sock const *sk
net/rxrpc/call_object.c:634:57:got struct sock [noderef] *

Fixes: 2baec2c3f854 ("rxrpc: Support network namespacing")
Signed-off-by: David Howells 
---

 net/rxrpc/ar-internal.h |2 ++
 net/rxrpc/call_accept.c |1 +
 net/rxrpc/call_object.c |   16 +---
 net/rxrpc/net_ns.c  |1 +
 4 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 8a348e0a9d95..2a2b0fdfb157 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -75,6 +75,7 @@ struct rxrpc_net {
	u32			epoch;		/* Local epoch for detecting local-end reset */
	struct list_head	calls;		/* List of calls active in this namespace */
	rwlock_t		call_lock;	/* Lock for ->calls */
+	atomic_t		nr_calls;	/* Count of allocated calls */
 
	struct list_head	conn_proc_list;	/* List of conns in this namespace for proc */
	struct list_head	service_conns;	/* Service conns in this namespace */
@@ -528,6 +529,7 @@ struct rxrpc_call {
	struct rxrpc_connection	*conn;		/* connection carrying call */
	struct rxrpc_peer	*peer;		/* Peer record for remote address */
	struct rxrpc_sock __rcu	*socket;	/* socket responsible */
+	struct rxrpc_net	*rxnet;		/* Network namespace to which call belongs */
	struct mutex		user_mutex;	/* User access mutex */
	unsigned long		ack_at;		/* When deferred ACK needs to happen */
	unsigned long		ack_lost_at;	/* When ACK is figured as lost */
diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
index 4ce24c000653..493545033e42 100644
--- a/net/rxrpc/call_accept.c
+++ b/net/rxrpc/call_accept.c
@@ -138,6 +138,7 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx,
 
	write_unlock(&rx->call_lock);
 
+   rxnet = call->rxnet;
	write_lock(&rxnet->call_lock);
	list_add_tail(&call->link, &rxnet->calls);
	write_unlock(&rxnet->call_lock);
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index 85b12c472522..f721c2b7e234 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -103,6 +103,7 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, 
gfp_t gfp,
unsigned int debug_id)
 {
struct rxrpc_call *call;
+	struct rxrpc_net *rxnet = rxrpc_net(sock_net(&rx->sk));
 
call = kmem_cache_zalloc(rxrpc_call_jar, gfp);
if (!call)
@@ -153,6 +154,9 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, 
gfp_t gfp,
 
call->cong_cwnd = 2;
call->cong_ssthresh = RXRPC_RXTX_BUFF_SIZE - 1;
+
+   call->rxnet = rxnet;
+	atomic_inc(&rxnet->nr_calls);
return call;
 
 nomem_2:
@@ -222,7 +226,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock 
*rx,
__acquires(>user_mutex)
 {
struct rxrpc_call *call, *xcall;
-	struct rxrpc_net *rxnet = rxrpc_net(sock_net(&rx->sk));
+   struct rxrpc_net *rxnet;
struct rb_node *parent, **pp;
const void *here = __builtin_return_address(0);
int ret;
@@ -272,6 +276,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock 
*rx,
 
	write_unlock(&rx->call_lock);
 
+   rxnet = call->rxnet;
	write_lock(&rxnet->call_lock);
	list_add_tail(&call->link, &rxnet->calls);
	write_unlock(&rxnet->call_lock);
@@ -617,7 +622,7 @@ void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx)
  */
 void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace op)
 {
-   struct rxrpc_net *rxnet;
+   struct rxrpc_net *rxnet = call->rxnet;
const void *here = __builtin_return_address(0);
int n;
 
@@ -631,7 +636,6 @@ void rxrpc_put_call(struct rxrpc_call *call, enum 
rxrpc_call_trace op)
ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE);
 
	if (!list_empty(&call->link)) {
-		rxnet = rxrpc_net(sock_net(&call->socket->sk));
		write_lock(&rxnet->call_lock);
		list_del_init(&call->link);
		write_unlock(&rxnet->call_lock);
@@ -647,11 +651,14 @@ void 

[PATCH net-next 05/12] rxrpc: Fix resend event time calculation

2018-03-30 Thread David Howells
From: Marc Dionne 

Commit a158bdd3 ("rxrpc: Fix call timeouts") reworked the time calculation
for the next resend event.  For this calculation, "oldest" will be before
"now", so ktime_sub(oldest, now) will yield a negative value.  When passed
to nsecs_to_jiffies which expects an unsigned value, the end result will be
a very large value, and a resend event scheduled far into the future.  This
could cause calls to stall if some packets were lost.
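
To see why, note that ktime values are signed nanoseconds while
nsecs_to_jiffies() takes a u64, so the negative difference wraps:

	/* oldest is in the past, so this is negative, e.g. -1000000 */
	s64 delta = ktime_to_ns(ktime_sub(oldest, now));

	/* the implicit u64 conversion turns that into ~2^64 - 10^6,
	 * scheduling the resend absurdly far in the future
	 */
	unsigned long j = nsecs_to_jiffies(delta);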

Fix by ordering the arguments to ktime_sub correctly.

Fixes: a158bdd3247b ("rxrpc: Fix call timeouts")
Signed-off-by: Marc Dionne 
Signed-off-by: David Howells 
---

 net/rxrpc/call_event.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index 3dee89c7a06e..6e0d788b4dc4 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -226,7 +226,7 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned 
long now_j)
   ktime_to_ns(ktime_sub(skb->tstamp, 
max_age)));
}
 
-   resend_at = nsecs_to_jiffies(ktime_to_ns(ktime_sub(oldest, now)));
+   resend_at = nsecs_to_jiffies(ktime_to_ns(ktime_sub(now, oldest)));
resend_at += jiffies + rxrpc_resend_timeout;
WRITE_ONCE(call->resend_at, resend_at);
 



[PATCH net-next 11/12] rxrpc: Add a tracepoint to track rxrpc_peer refcounting

2018-03-30 Thread David Howells
Add a tracepoint to track reference counting on the rxrpc_peer struct.

Signed-off-by: David Howells 
---

 include/trace/events/rxrpc.h |   42 +++
 net/rxrpc/ar-internal.h  |   23 +++
 net/rxrpc/peer_event.c   |2 +
 net/rxrpc/peer_object.c  |   65 +-
 4 files changed, 110 insertions(+), 22 deletions(-)

diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h
index 0410dfeb79c6..9e96c2fe2793 100644
--- a/include/trace/events/rxrpc.h
+++ b/include/trace/events/rxrpc.h
@@ -50,6 +50,14 @@ enum rxrpc_local_trace {
rxrpc_local_queued,
 };
 
+enum rxrpc_peer_trace {
+   rxrpc_peer_got,
+   rxrpc_peer_new,
+   rxrpc_peer_processing,
+   rxrpc_peer_put,
+   rxrpc_peer_queued_error,
+};
+
 enum rxrpc_conn_trace {
rxrpc_conn_got,
rxrpc_conn_new_client,
@@ -230,6 +238,13 @@ enum rxrpc_congest_change {
EM(rxrpc_local_put, "PUT") \
E_(rxrpc_local_queued,  "QUE")
 
+#define rxrpc_peer_traces \
+   EM(rxrpc_peer_got,  "GOT") \
+   EM(rxrpc_peer_new,  "NEW") \
+   EM(rxrpc_peer_processing,   "PRO") \
+   EM(rxrpc_peer_put,  "PUT") \
+   E_(rxrpc_peer_queued_error, "QER")
+
 #define rxrpc_conn_traces \
EM(rxrpc_conn_got,  "GOT") \
EM(rxrpc_conn_new_client,   "NWc") \
@@ -482,6 +497,33 @@ TRACE_EVENT(rxrpc_local,
  __entry->where)
);
 
+TRACE_EVENT(rxrpc_peer,
+   TP_PROTO(struct rxrpc_peer *peer, enum rxrpc_peer_trace op,
+int usage, const void *where),
+
+   TP_ARGS(peer, op, usage, where),
+
+   TP_STRUCT__entry(
+   __field(unsigned int,   peer)
+   __field(int,op  )
+   __field(int,usage   )
+   __field(const void *,   where   )
+),
+
+   TP_fast_assign(
+   __entry->peer = peer->debug_id;
+   __entry->op = op;
+   __entry->usage = usage;
+   __entry->where = where;
+  ),
+
+   TP_printk("P=%08x %s u=%d sp=%pSR",
+ __entry->peer,
+ __print_symbolic(__entry->op, rxrpc_peer_traces),
+ __entry->usage,
+ __entry->where)
+   );
+
 TRACE_EVENT(rxrpc_conn,
TP_PROTO(struct rxrpc_connection *conn, enum rxrpc_conn_trace op,
 int usage, const void *where),
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index d40d54b78567..c46583bc255d 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -1041,25 +1041,10 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local 
*,
 struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *, gfp_t);
 struct rxrpc_peer *rxrpc_lookup_incoming_peer(struct rxrpc_local *,
  struct rxrpc_peer *);
-
-static inline struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *peer)
-{
-	atomic_inc(&peer->usage);
-   return peer;
-}
-
-static inline
-struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *peer)
-{
-	return atomic_inc_not_zero(&peer->usage) ? peer : NULL;
-}
-
-extern void __rxrpc_put_peer(struct rxrpc_peer *peer);
-static inline void rxrpc_put_peer(struct rxrpc_peer *peer)
-{
-	if (peer && atomic_dec_and_test(&peer->usage))
-   __rxrpc_put_peer(peer);
-}
+struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *);
+struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *);
+void rxrpc_put_peer(struct rxrpc_peer *);
+void __rxrpc_queue_peer_error(struct rxrpc_peer *);
 
 /*
  * proc.c
diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c
index d01eb9a06448..78c2f95d1f22 100644
--- a/net/rxrpc/peer_event.c
+++ b/net/rxrpc/peer_event.c
@@ -192,7 +192,7 @@ void rxrpc_error_report(struct sock *sk)
rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
 
/* The ref we obtained is passed off to the work item */
-	rxrpc_queue_work(&peer->error_distributor);
+   __rxrpc_queue_peer_error(peer);
_leave("");
 }
 
diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c
index 94a6dbfcf129..a4a750aea1e5 100644
--- a/net/rxrpc/peer_object.c
+++ b/net/rxrpc/peer_object.c
@@ -386,9 +386,54 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local 
*local,
 }
 
 /*
- * Discard a ref on a remote peer record.
+ * Get a ref on a peer record.
  */
-void __rxrpc_put_peer(struct rxrpc_peer *peer)
+struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *peer)
+{
+   const void *here = __builtin_return_address(0);
+   int n;
+
+   n = 

[PATCH net-next 10/12] rxrpc: Fix apparent leak of rxrpc_local objects

2018-03-30 Thread David Howells
rxrpc_local objects cannot be disposed of until all the connections that
point to them have been RCU'd, as a connection object holds a refcount on the
local endpoint it is communicating through.  Currently, this can cause an
assertion failure to occur when a network namespace is destroyed as there's
no check that the RCU destructors for the connections have been run before
we start trying to destroy local endpoints.

The kernel reports:

rxrpc: AF_RXRPC: Leaked local 36a41bc1 {5}
[ cut here ]
kernel BUG at ../net/rxrpc/local_object.c:439!

Fix this by keeping a count of the live connections and waiting for it to
go to zero at the end of rxrpc_destroy_all_connections().
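
Note that the counter is initialised to 1 (see the net_ns.c hunk) and the
initial count is only dropped during teardown, so the waiter cannot see a
transient zero while connections are still being set up.  The pattern, in
outline:

	atomic_set(&rxnet->nr_conns, 1);		/* namespace init */
	...
	atomic_inc(&rxnet->nr_conns);			/* per connection */
	...
	if (atomic_dec_and_test(&rxnet->nr_conns))	/* RCU destructor */
		wake_up_atomic_t(&rxnet->nr_conns);
	...
	atomic_dec(&rxnet->nr_conns);			/* drop the initial 1 */
	wait_on_atomic_t(&rxnet->nr_conns, atomic_t_wait,
			 TASK_UNINTERRUPTIBLE);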

Fixes: dee46364ce6f ("rxrpc: Add RCU destruction for connections and calls")
Signed-off-by: David Howells 
---

 net/rxrpc/ar-internal.h  |1 +
 net/rxrpc/call_accept.c  |2 ++
 net/rxrpc/conn_client.c  |1 +
 net/rxrpc/conn_object.c  |8 
 net/rxrpc/conn_service.c |1 +
 net/rxrpc/net_ns.c   |1 +
 6 files changed, 14 insertions(+)

diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index cc51d3eb0548..d40d54b78567 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -77,6 +77,7 @@ struct rxrpc_net {
	rwlock_t		call_lock;	/* Lock for ->calls */
	atomic_t		nr_calls;	/* Count of allocated calls */
 
+	atomic_t		nr_conns;
	struct list_head	conn_proc_list;	/* List of conns in this namespace for proc */
	struct list_head	service_conns;	/* Service conns in this namespace */
	rwlock_t		conn_lock;	/* Lock for ->conn_proc_list, ->service_conns */
diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
index 5a9b1d916124..f67017dcb25e 100644
--- a/net/rxrpc/call_accept.c
+++ b/net/rxrpc/call_accept.c
@@ -219,6 +219,8 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx)
	list_del(&conn->proc_link);
	write_unlock(&rxnet->conn_lock);
kfree(conn);
+	if (atomic_dec_and_test(&rxnet->nr_conns))
+		wake_up_atomic_t(&rxnet->nr_conns);
tail = (tail + 1) & (size - 1);
}
 
diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
index 041da40dbf93..5736f643c516 100644
--- a/net/rxrpc/conn_client.c
+++ b/net/rxrpc/conn_client.c
@@ -207,6 +207,7 @@ rxrpc_alloc_client_connection(struct rxrpc_conn_parameters 
*cp, gfp_t gfp)
if (ret < 0)
goto error_2;
 
+	atomic_inc(&rxnet->nr_conns);
	write_lock(&rxnet->conn_lock);
	list_add_tail(&conn->proc_link, &rxnet->conn_proc_list);
	write_unlock(&rxnet->conn_lock);
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index bfc46fd69a62..0950ee3d26f5 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -365,6 +365,9 @@ static void rxrpc_destroy_connection(struct rcu_head *rcu)
key_put(conn->params.key);
key_put(conn->server_key);
rxrpc_put_peer(conn->params.peer);
+
+	if (atomic_dec_and_test(&conn->params.local->rxnet->nr_conns))
+		wake_up_atomic_t(&conn->params.local->rxnet->nr_conns);
rxrpc_put_local(conn->params.local);
 
kfree(conn);
@@ -458,6 +461,7 @@ void rxrpc_destroy_all_connections(struct rxrpc_net *rxnet)
 
_enter("");
 
+	atomic_dec(&rxnet->nr_conns);
rxrpc_destroy_all_client_connections(rxnet);
 
	del_timer_sync(&rxnet->service_conn_reap_timer);
@@ -475,5 +479,9 @@ void rxrpc_destroy_all_connections(struct rxrpc_net *rxnet)
 
	ASSERT(list_empty(&rxnet->conn_proc_list));
 
+   /* We need to wait for the connections to be destroyed by RCU as they
+* pin things that we still need to get rid of.
+*/
+	wait_on_atomic_t(&rxnet->nr_conns, atomic_t_wait, TASK_UNINTERRUPTIBLE);
_leave("");
 }
diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c
index f6fcdb3130a1..80773a50c755 100644
--- a/net/rxrpc/conn_service.c
+++ b/net/rxrpc/conn_service.c
@@ -132,6 +132,7 @@ struct rxrpc_connection 
*rxrpc_prealloc_service_connection(struct rxrpc_net *rxn
conn->state = RXRPC_CONN_SERVICE_PREALLOC;
	atomic_set(&conn->usage, 2);
 
+	atomic_inc(&rxnet->nr_conns);
	write_lock(&rxnet->conn_lock);
	list_add_tail(&conn->link, &rxnet->service_conns);
	list_add_tail(&conn->proc_link, &rxnet->conn_proc_list);
diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c
index 101019b0be34..fa9ce60e7bfa 100644
--- a/net/rxrpc/net_ns.c
+++ b/net/rxrpc/net_ns.c
@@ -57,6 +57,7 @@ static __net_init int rxrpc_init_net(struct net *net)
	rwlock_init(&rxnet->call_lock);
	atomic_set(&rxnet->nr_calls, 1);
 
+	atomic_set(&rxnet->nr_conns, 1);
	INIT_LIST_HEAD(&rxnet->conn_proc_list);
	INIT_LIST_HEAD(&rxnet->service_conns);
	rwlock_init(&rxnet->conn_lock);



[PATCH net-next 09/12] rxrpc: Add a tracepoint to track rxrpc_local refcounting

2018-03-30 Thread David Howells
Add a tracepoint to track reference counting on the rxrpc_local struct.

Signed-off-by: David Howells 
---

 include/trace/events/rxrpc.h |   43 
 net/rxrpc/ar-internal.h  |   27 +++--
 net/rxrpc/call_accept.c  |3 +-
 net/rxrpc/local_object.c |   65 +-
 4 files changed, 111 insertions(+), 27 deletions(-)

diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h
index 2ea788f6f95d..0410dfeb79c6 100644
--- a/include/trace/events/rxrpc.h
+++ b/include/trace/events/rxrpc.h
@@ -42,6 +42,14 @@ enum rxrpc_skb_trace {
rxrpc_skb_tx_seen,
 };
 
+enum rxrpc_local_trace {
+   rxrpc_local_got,
+   rxrpc_local_new,
+   rxrpc_local_processing,
+   rxrpc_local_put,
+   rxrpc_local_queued,
+};
+
 enum rxrpc_conn_trace {
rxrpc_conn_got,
rxrpc_conn_new_client,
@@ -215,6 +223,13 @@ enum rxrpc_congest_change {
EM(rxrpc_skb_tx_rotated,"Tx ROT") \
E_(rxrpc_skb_tx_seen,   "Tx SEE")
 
+#define rxrpc_local_traces \
+   EM(rxrpc_local_got, "GOT") \
+   EM(rxrpc_local_new, "NEW") \
+   EM(rxrpc_local_processing,  "PRO") \
+   EM(rxrpc_local_put, "PUT") \
+   E_(rxrpc_local_queued,  "QUE")
+
 #define rxrpc_conn_traces \
EM(rxrpc_conn_got,  "GOT") \
EM(rxrpc_conn_new_client,   "NWc") \
@@ -416,6 +431,7 @@ enum rxrpc_congest_change {
 #define E_(a, b) TRACE_DEFINE_ENUM(a);
 
 rxrpc_skb_traces;
+rxrpc_local_traces;
 rxrpc_conn_traces;
 rxrpc_client_traces;
 rxrpc_call_traces;
@@ -439,6 +455,33 @@ rxrpc_congest_changes;
 #define EM(a, b)   { a, b },
 #define E_(a, b)   { a, b }
 
+TRACE_EVENT(rxrpc_local,
+   TP_PROTO(struct rxrpc_local *local, enum rxrpc_local_trace op,
+int usage, const void *where),
+
+   TP_ARGS(local, op, usage, where),
+
+   TP_STRUCT__entry(
+   __field(unsigned int,   local   )
+   __field(int,op  )
+   __field(int,usage   )
+   __field(const void *,   where   )
+),
+
+   TP_fast_assign(
+   __entry->local = local->debug_id;
+   __entry->op = op;
+   __entry->usage = usage;
+   __entry->where = where;
+  ),
+
+   TP_printk("L=%08x %s u=%d sp=%pSR",
+ __entry->local,
+ __print_symbolic(__entry->op, rxrpc_local_traces),
+ __entry->usage,
+ __entry->where)
+   );
+
 TRACE_EVENT(rxrpc_conn,
TP_PROTO(struct rxrpc_connection *conn, enum rxrpc_conn_trace op,
 int usage, const void *where),
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 2a2b0fdfb157..cc51d3eb0548 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -981,31 +981,12 @@ extern void rxrpc_process_local_events(struct rxrpc_local 
*);
  * local_object.c
  */
 struct rxrpc_local *rxrpc_lookup_local(struct net *, const struct 
sockaddr_rxrpc *);
-void __rxrpc_put_local(struct rxrpc_local *);
+struct rxrpc_local *rxrpc_get_local(struct rxrpc_local *);
+struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *);
+void rxrpc_put_local(struct rxrpc_local *);
+void rxrpc_queue_local(struct rxrpc_local *);
 void rxrpc_destroy_all_locals(struct rxrpc_net *);
 
-static inline void rxrpc_get_local(struct rxrpc_local *local)
-{
-	atomic_inc(&local->usage);
-}
-
-static inline
-struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *local)
-{
-	return atomic_inc_not_zero(&local->usage) ? local : NULL;
-}
-
-static inline void rxrpc_put_local(struct rxrpc_local *local)
-{
-	if (local && atomic_dec_and_test(&local->usage))
-   __rxrpc_put_local(local);
-}
-
-static inline void rxrpc_queue_local(struct rxrpc_local *local)
-{
-	rxrpc_queue_work(&local->processor);
-}
-
 /*
  * misc.c
  */
diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
index 493545033e42..5a9b1d916124 100644
--- a/net/rxrpc/call_accept.c
+++ b/net/rxrpc/call_accept.c
@@ -296,8 +296,7 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct 
rxrpc_sock *rx,
b->conn_backlog[conn_tail] = NULL;
	smp_store_release(&b->conn_backlog_tail,
  (conn_tail + 1) & (RXRPC_BACKLOG_MAX - 1));
-   rxrpc_get_local(local);
-   conn->params.local = local;
+   conn->params.local = rxrpc_get_local(local);
conn->params.peer = peer;
rxrpc_see_connection(conn);
rxrpc_new_incoming_connection(rx, conn, skb);

[PATCH net-next 12/12] rxrpc: Fix leak of rxrpc_peer objects

2018-03-30 Thread David Howells
When a new client call is requested, an rxrpc_conn_parameters struct object
is passed in with a bunch of parameters set, such as the local endpoint to
use.  A pointer to the target peer record is also placed in there by
rxrpc_get_client_conn() - and this is removed if and only if a new
connection object is allocated.  Thus it leaks if a new connection object
isn't allocated.

Fix this by putting any peer object attached to the rxrpc_conn_parameters
object in the function that allocated it.
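
The resulting ownership rule is that whoever supplied the
rxrpc_conn_parameters must drop cp.peer once call setup is done;
rxrpc_put_peer() is NULL-safe, so the put can be unconditional.  A
sketch of a caller (argument names abbreviated):

	struct rxrpc_conn_parameters cp;

	memset(&cp, 0, sizeof(cp));
	/* ... fill in cp.local, cp.key, etc.; connection lookup may
	 * leave a peer ref behind in cp.peer ...
	 */
	call = rxrpc_new_client_call(rx, &cp, srx, user_call_ID, gfp,
				     debug_id);
	rxrpc_put_peer(cp.peer);	/* no-op if cp.peer is NULL */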

Fixes: 19ffa01c9c45 ("rxrpc: Use structs to hold connection params and protocol 
info")
Signed-off-by: David Howells 
---

 net/rxrpc/af_rxrpc.c|2 ++
 net/rxrpc/ar-internal.h |1 +
 net/rxrpc/net_ns.c  |1 +
 net/rxrpc/peer_object.c |   21 +
 net/rxrpc/sendmsg.c |1 +
 5 files changed, 26 insertions(+)

diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 0b3026b8fa40..9a2c8e7c000e 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -324,6 +324,7 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket 
*sock,
	mutex_unlock(&call->user_mutex);
}
 
+   rxrpc_put_peer(cp.peer);
_leave(" = %p", call);
return call;
 }
@@ -447,6 +448,7 @@ int rxrpc_kernel_retry_call(struct socket *sock, struct 
rxrpc_call *call,
	ret = rxrpc_retry_client_call(rx, call, &cp, &srx, GFP_KERNEL);
 
	mutex_unlock(&call->user_mutex);
+   rxrpc_put_peer(cp.peer);
_leave(" = %d", ret);
return ret;
 }
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index c46583bc255d..90d7079e0aa9 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -1041,6 +1041,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *,
 struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *, gfp_t);
 struct rxrpc_peer *rxrpc_lookup_incoming_peer(struct rxrpc_local *,
  struct rxrpc_peer *);
+void rxrpc_destroy_all_peers(struct rxrpc_net *);
 struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *);
 struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *);
 void rxrpc_put_peer(struct rxrpc_peer *);
diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c
index fa9ce60e7bfa..c7a023fb22d0 100644
--- a/net/rxrpc/net_ns.c
+++ b/net/rxrpc/net_ns.c
@@ -118,6 +118,7 @@ static __net_exit void rxrpc_exit_net(struct net *net)
	cancel_work_sync(&rxnet->peer_keepalive_work);
rxrpc_destroy_all_calls(rxnet);
rxrpc_destroy_all_connections(rxnet);
+   rxrpc_destroy_all_peers(rxnet);
rxrpc_destroy_all_locals(rxnet);
proc_remove(rxnet->proc_net);
 }
diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c
index a4a750aea1e5..1b7e8107b3ae 100644
--- a/net/rxrpc/peer_object.c
+++ b/net/rxrpc/peer_object.c
@@ -463,6 +463,27 @@ void rxrpc_put_peer(struct rxrpc_peer *peer)
}
 }
 
+/*
+ * Make sure all peer records have been discarded.
+ */
+void rxrpc_destroy_all_peers(struct rxrpc_net *rxnet)
+{
+   struct rxrpc_peer *peer;
+   int i;
+
+   for (i = 0; i < HASH_SIZE(rxnet->peer_hash); i++) {
+		if (hlist_empty(&rxnet->peer_hash[i]))
+   continue;
+
+		hlist_for_each_entry(peer, &rxnet->peer_hash[i], hash_link) {
+   pr_err("Leaked peer %u {%u} %pISp\n",
+  peer->debug_id,
+			       atomic_read(&peer->usage),
+			       &peer->srx.transport);
+   }
+   }
+}
+
 /**
  * rxrpc_kernel_get_peer - Get the peer address of a call
  * @sock: The socket on which the call is in progress.
diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
index a62980a80151..206e802ccbdc 100644
--- a/net/rxrpc/sendmsg.c
+++ b/net/rxrpc/sendmsg.c
@@ -586,6 +586,7 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, 
struct msghdr *msg,
				  atomic_inc_return(&rxrpc_debug_id));
/* The socket is now unlocked */
 
+   rxrpc_put_peer(cp.peer);
_leave(" = %p\n", call);
return call;
 }



[PATCH] mt7601u: phy: mark expected switch fall-through

2018-03-30 Thread Gustavo A. R. Silva
In preparation to enabling -Wimplicit-fallthrough, mark switch cases
where we are expecting to fall through.
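
The convention the warning recognises is a comment placed where the
break would otherwise go; a minimal generic example:

	switch (type) {
	case TYPE_A:
		handle_a();
		/* fall through */
	case TYPE_B:
		handle_b();
		break;
	}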

Signed-off-by: Gustavo A. R. Silva 
---
 drivers/net/wireless/mediatek/mt7601u/phy.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/wireless/mediatek/mt7601u/phy.c 
b/drivers/net/wireless/mediatek/mt7601u/phy.c
index ca09a5d..9a90f1f 100644
--- a/drivers/net/wireless/mediatek/mt7601u/phy.c
+++ b/drivers/net/wireless/mediatek/mt7601u/phy.c
@@ -795,6 +795,7 @@ mt7601u_phy_rf_pa_mode_val(struct mt7601u_dev *dev, int 
phy_mode, int tx_rate)
switch (phy_mode) {
case MT_PHY_TYPE_OFDM:
tx_rate += 4;
+   /* fall through */
case MT_PHY_TYPE_CCK:
reg = dev->rf_pa_mode[0];
break;
-- 
2.7.4



[PATCH v5 00/14] Report PCI device link status

2018-03-30 Thread Bjorn Helgaas
This is mostly Tal's work to reduce code duplication in drivers and unify
the approach for reporting PCIe link speed/width and whether the device is
being limited by a slower upstream link.

This v5 series is based on Tal's v4 [1].

Changes since v4:
  - Added patches to replace uses of pcie_get_minimum_link() in bnx2x,
bnxt_en, cxgb4, fm10k, and ixgbe.  Note that this is a user-visible
change to the log messages, and in some cases changes dev_warn() to
dev_info().  I hope we can converge on something that works for
everybody, and it's OK if we need to tweak the text and/or level used
in pcie_print_link_status() to get there.

  - Rebased on top of Jay Fang's patch that adds 16 GT/s decoding support.

  - Changed pcie_get_speed_cap() and pcie_get_width_cap() to return the
values directly instead of returning both an error code and the value
via a reference parameter.  I don't think the callers can really use
both the error and the value.

  - Moved some declarations from linux/pci.h to drivers/pci/pci.h so
they're not visible outside the PCI subsystem.  Also removed
corresponding EXPORT_SYMBOL()s.  If we need these outside the PCI core,
we can export them again, but that's not needed yet.

  - Reworked pcie_bandwidth_available() so it finds the uppermost limiting
device and returns width/speed info for that device (previously it
could return width from one device and speed from a different one).

The incremental diff between the v4 series (based on v4.17-rc1) and this v5
series (based on v4.17-rc1 + Jay Fang's patch) is attached.  This diff
doesn't include the new patches to bnx2x, bnxt_en, cxgb4, fm10k, and ixgbe.

I don't have any of this hardware, so this is only compile-tested.

Bjorn


[1] 
https://lkml.kernel.org/r/1522394086-3555-1-git-send-email-ta...@mellanox.com

---

Bjorn Helgaas (6):
  bnx2x: Report PCIe link properties with pcie_print_link_status()
  bnxt_en: Report PCIe link properties with pcie_print_link_status()
  cxgb4: Report PCIe link properties with pcie_print_link_status()
  fm10k: Report PCIe link properties with pcie_print_link_status()
  ixgbe: Report PCIe link properties with pcie_print_link_status()
  PCI: Remove unused pcie_get_minimum_link()

Tal Gilboa (8):
  PCI: Add pcie_get_speed_cap() to find max supported link speed
  PCI: Add pcie_get_width_cap() to find max supported link width
  PCI: Add pcie_bandwidth_capable() to compute max supported link bandwidth
  PCI: Add pcie_bandwidth_available() to compute bandwidth available to 
device
  PCI: Add pcie_print_link_status() to log link speed and whether it's 
limited
  net/mlx4_core: Report PCIe link properties with pcie_print_link_status()
  net/mlx5: Report PCIe link properties with pcie_print_link_status()
  net/mlx5e: Use pcie_bandwidth_available() to compute bandwidth


 drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c  |   23 +--
 drivers/net/ethernet/broadcom/bnxt/bnxt.c |   19 --
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c   |   75 -
 drivers/net/ethernet/intel/fm10k/fm10k_pci.c  |   87 ---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   47 --
 drivers/net/ethernet/mellanox/mlx4/main.c |   81 --
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |   32 
 drivers/net/ethernet/mellanox/mlx5/core/main.c|4 +
 drivers/pci/pci-sysfs.c   |   38 +
 drivers/pci/pci.c |  167 ++---
 drivers/pci/pci.h |   20 +++
 include/linux/pci.h   |6 +
 12 files changed, 189 insertions(+), 410 deletions(-)



diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 1bbd6cd20213..93291ec4a3d1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3864,25 +3864,6 @@ void mlx5e_build_default_indir_rqt(u32 *indirection_rqt, 
int len,
indirection_rqt[i] = i % num_channels;
 }
 
-static int mlx5e_get_pci_bw(struct mlx5_core_dev *mdev, u32 *pci_bw)
-{
-   enum pcie_link_width width;
-   enum pci_bus_speed speed;
-   int err = 0;
-   int bw;
-
-	err = pcie_bandwidth_available(mdev->pdev, &speed, &width, &bw, NULL);
-   if (err)
-   return err;
-
-   if (speed == PCI_SPEED_UNKNOWN || width == PCIE_LNK_WIDTH_UNKNOWN)
-   return -EINVAL;
-
-   *pci_bw = bw;
-
-   return 0;
-}
-
 static bool cqe_compress_heuristic(u32 link_speed, u32 pci_bw)
 {
return (link_speed && pci_bw &&
@@ -3968,7 +3949,7 @@ void mlx5e_build_nic_params(struct mlx5_core_dev *mdev,
params->num_tc   = 1;
 
	mlx5e_get_max_linkspeed(mdev, &link_speed);
-	mlx5e_get_pci_bw(mdev, &pci_bw);
+	pci_bw = pcie_bandwidth_available(mdev->pdev, NULL, NULL, NULL);

[PATCH v5 01/14] PCI: Add pcie_get_speed_cap() to find max supported link speed

2018-03-30 Thread Bjorn Helgaas
From: Tal Gilboa 

Add pcie_get_speed_cap() to find the max link speed supported by a device.
Change max_link_speed_show() to use pcie_get_speed_cap().

Signed-off-by: Tal Gilboa 
[bhelgaas: return speed directly instead of error and *speed, don't export
outside drivers/pci]
Signed-off-by: Bjorn Helgaas 
Reviewed-by: Tariq Toukan 
---
 drivers/pci/pci-sysfs.c |   28 ++--
 drivers/pci/pci.c   |   44 
 drivers/pci/pci.h   |   10 ++
 3 files changed, 56 insertions(+), 26 deletions(-)

diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index 7dc5be545d18..c2ea05fbbf1d 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -158,33 +158,9 @@ static DEVICE_ATTR_RO(resource);
 static ssize_t max_link_speed_show(struct device *dev,
   struct device_attribute *attr, char *buf)
 {
-   struct pci_dev *pci_dev = to_pci_dev(dev);
-   u32 linkcap;
-   int err;
-   const char *speed;
-
-	err = pcie_capability_read_dword(pci_dev, PCI_EXP_LNKCAP, &linkcap);
-   if (err)
-   return -EINVAL;
-
-   switch (linkcap & PCI_EXP_LNKCAP_SLS) {
-   case PCI_EXP_LNKCAP_SLS_16_0GB:
-   speed = "16 GT/s";
-   break;
-   case PCI_EXP_LNKCAP_SLS_8_0GB:
-   speed = "8 GT/s";
-   break;
-   case PCI_EXP_LNKCAP_SLS_5_0GB:
-   speed = "5 GT/s";
-   break;
-   case PCI_EXP_LNKCAP_SLS_2_5GB:
-   speed = "2.5 GT/s";
-   break;
-   default:
-   speed = "Unknown speed";
-   }
+   struct pci_dev *pdev = to_pci_dev(dev);
 
-   return sprintf(buf, "%s\n", speed);
+   return sprintf(buf, "%s\n", PCIE_SPEED2STR(pcie_get_speed_cap(pdev)));
 }
 static DEVICE_ATTR_RO(max_link_speed);
 
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index f6a4dd10d9b0..b29d3436ee9f 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -5146,6 +5146,50 @@ int pcie_get_minimum_link(struct pci_dev *dev, enum 
pci_bus_speed *speed,
 }
 EXPORT_SYMBOL(pcie_get_minimum_link);
 
+/**
+ * pcie_get_speed_cap - query for the PCI device's link speed capability
+ * @dev: PCI device to query
+ *
+ * Query the PCI device speed capability.  Return the maximum link speed
+ * supported by the device.
+ */
+enum pci_bus_speed pcie_get_speed_cap(struct pci_dev *dev)
+{
+   u32 lnkcap2, lnkcap;
+
+   /*
+* PCIe r4.0 sec 7.5.3.18 recommends using the Supported Link
+* Speeds Vector in Link Capabilities 2 when supported, falling
+* back to Max Link Speed in Link Capabilities otherwise.
+*/
+	pcie_capability_read_dword(dev, PCI_EXP_LNKCAP2, &lnkcap2);
+   if (lnkcap2) { /* PCIe r3.0-compliant */
+   if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_16_0GB)
+   return PCIE_SPEED_16_0GT;
+   else if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_8_0GB)
+   return PCIE_SPEED_8_0GT;
+   else if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_5_0GB)
+   return PCIE_SPEED_5_0GT;
+   else if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_2_5GB)
+   return PCIE_SPEED_2_5GT;
+   return PCI_SPEED_UNKNOWN;
+   }
+
+	pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &lnkcap);
+   if (lnkcap) {
+   if (lnkcap & PCI_EXP_LNKCAP_SLS_16_0GB)
+   return PCIE_SPEED_16_0GT;
+   else if (lnkcap & PCI_EXP_LNKCAP_SLS_8_0GB)
+   return PCIE_SPEED_8_0GT;
+   else if (lnkcap & PCI_EXP_LNKCAP_SLS_5_0GB)
+   return PCIE_SPEED_5_0GT;
+   else if (lnkcap & PCI_EXP_LNKCAP_SLS_2_5GB)
+   return PCIE_SPEED_2_5GT;
+   }
+
+   return PCI_SPEED_UNKNOWN;
+}
+
 /**
  * pci_select_bars - Make BAR mask from the type of resource
  * @dev: the PCI device for which BAR mask is made
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index fcd81911b127..1186d8be6055 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -253,6 +253,16 @@ bool pci_bus_clip_resource(struct pci_dev *dev, int idx);
 void pci_reassigndev_resource_alignment(struct pci_dev *dev);
 void pci_disable_bridge_window(struct pci_dev *dev);
 
+/* PCIe link information */
+#define PCIE_SPEED2STR(speed) \
+   ((speed) == PCIE_SPEED_16_0GT ? "16 GT/s" : \
+(speed) == PCIE_SPEED_8_0GT ? "8 GT/s" : \
+(speed) == PCIE_SPEED_5_0GT ? "5 GT/s" : \
+(speed) == PCIE_SPEED_2_5GT ? "2.5 GT/s" : \
+"Unknown speed")
+
+enum pci_bus_speed pcie_get_speed_cap(struct pci_dev *dev);
+
 /* Single Root I/O Virtualization */
 struct pci_sriov {
int pos;/* Capability position */



[PATCH v5 03/14] PCI: Add pcie_bandwidth_capable() to compute max supported link bandwidth

2018-03-30 Thread Bjorn Helgaas
From: Tal Gilboa 

Add pcie_bandwidth_capable() to compute the max link bandwidth supported by
a device, based on the max link speed and width, adjusted by the encoding
overhead.

The maximum bandwidth of the link is computed as:

  max_link_speed * max_link_width * (1 - encoding_overhead)

The encoding overhead is about 20% for 2.5 and 5.0 GT/s links using 8b/10b
encoding, and about 1.5% for 8 GT/s or higher speed links using 128b/130b
encoding.
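
Working the per-lane numbers: 2.5 GT/s * 8/10 = 2 Gb/s (2000 Mb/s),
5 GT/s * 8/10 = 4 Gb/s (4000 Mb/s), and 8 GT/s * 128/130 ~= 7.877 Gb/s
(7877 Mb/s), which is where the constants in the PCIE_SPEED2MBS_ENC()
macro below come from.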

Signed-off-by: Tal Gilboa 
[bhelgaas: adjust for pcie_get_speed_cap() and pcie_get_width_cap()
signatures, don't export outside drivers/pci]
Signed-off-by: Bjorn Helgaas 
Reviewed-by: Tariq Toukan 
---
 drivers/pci/pci.c |   21 +
 drivers/pci/pci.h |9 +
 2 files changed, 30 insertions(+)

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 43075be79388..9ce89e254197 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -5208,6 +5208,27 @@ enum pcie_link_width pcie_get_width_cap(struct pci_dev 
*dev)
return PCIE_LNK_WIDTH_UNKNOWN;
 }
 
+/**
+ * pcie_bandwidth_capable - calculates a PCI device's link bandwidth capability
+ * @dev: PCI device
+ * @speed: storage for link speed
+ * @width: storage for link width
+ *
+ * Calculate a PCI device's link bandwidth by querying for its link speed
+ * and width, multiplying them, and applying encoding overhead.
+ */
+u32 pcie_bandwidth_capable(struct pci_dev *dev, enum pci_bus_speed *speed,
+  enum pcie_link_width *width)
+{
+   *speed = pcie_get_speed_cap(dev);
+   *width = pcie_get_width_cap(dev);
+
+   if (*speed == PCI_SPEED_UNKNOWN || *width == PCIE_LNK_WIDTH_UNKNOWN)
+   return 0;
+
+   return *width * PCIE_SPEED2MBS_ENC(*speed);
+}
+
 /**
  * pci_select_bars - Make BAR mask from the type of resource
  * @dev: the PCI device for which BAR mask is made
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 66738f1050c0..2a50172b9803 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -261,8 +261,17 @@ void pci_disable_bridge_window(struct pci_dev *dev);
 (speed) == PCIE_SPEED_2_5GT ? "2.5 GT/s" : \
 "Unknown speed")
 
+/* PCIe speed to Mb/s with encoding overhead: 20% for gen2, ~1.5% for gen3 */
+#define PCIE_SPEED2MBS_ENC(speed) \
+   ((speed) == PCIE_SPEED_8_0GT ? 7877 : \
+(speed) == PCIE_SPEED_5_0GT ? 4000 : \
+(speed) == PCIE_SPEED_2_5GT ? 2000 : \
+0)
+
 enum pci_bus_speed pcie_get_speed_cap(struct pci_dev *dev);
 enum pcie_link_width pcie_get_width_cap(struct pci_dev *dev);
+u32 pcie_bandwidth_capable(struct pci_dev *dev, enum pci_bus_speed *speed,
+  enum pcie_link_width *width);
 
 /* Single Root I/O Virtualization */
 struct pci_sriov {



[PATCH v5 02/14] PCI: Add pcie_get_width_cap() to find max supported link width

2018-03-30 Thread Bjorn Helgaas
From: Tal Gilboa 

Add pcie_get_width_cap() to find the max link width supported by a device.
Change max_link_width_show() to use pcie_get_width_cap().

Signed-off-by: Tal Gilboa 
[bhelgaas: return width directly instead of error and *width, don't export
outside drivers/pci]
Signed-off-by: Bjorn Helgaas 
Reviewed-by: Tariq Toukan 
---
 drivers/pci/pci-sysfs.c |   10 ++
 drivers/pci/pci.c   |   18 ++
 drivers/pci/pci.h   |1 +
 3 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index c2ea05fbbf1d..63d0952684fb 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -167,15 +167,9 @@ static DEVICE_ATTR_RO(max_link_speed);
 static ssize_t max_link_width_show(struct device *dev,
   struct device_attribute *attr, char *buf)
 {
-   struct pci_dev *pci_dev = to_pci_dev(dev);
-   u32 linkcap;
-   int err;
-
-	err = pcie_capability_read_dword(pci_dev, PCI_EXP_LNKCAP, &linkcap);
-   if (err)
-   return -EINVAL;
+   struct pci_dev *pdev = to_pci_dev(dev);
 
-   return sprintf(buf, "%u\n", (linkcap & PCI_EXP_LNKCAP_MLW) >> 4);
+   return sprintf(buf, "%u\n", pcie_get_width_cap(pdev));
 }
 static DEVICE_ATTR_RO(max_link_width);
 
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index b29d3436ee9f..43075be79388 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -5190,6 +5190,24 @@ enum pci_bus_speed pcie_get_speed_cap(struct pci_dev 
*dev)
return PCI_SPEED_UNKNOWN;
 }
 
+/**
+ * pcie_get_width_cap - query for the PCI device's link width capability
+ * @dev: PCI device to query
+ *
+ * Query the PCI device width capability.  Return the maximum link width
+ * supported by the device.
+ */
+enum pcie_link_width pcie_get_width_cap(struct pci_dev *dev)
+{
+   u32 lnkcap;
+
+	pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &lnkcap);
+   if (lnkcap)
+   return (lnkcap & PCI_EXP_LNKCAP_MLW) >> 4;
+
+   return PCIE_LNK_WIDTH_UNKNOWN;
+}
+
 /**
  * pci_select_bars - Make BAR mask from the type of resource
  * @dev: the PCI device for which BAR mask is made
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 1186d8be6055..66738f1050c0 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -262,6 +262,7 @@ void pci_disable_bridge_window(struct pci_dev *dev);
 "Unknown speed")
 
 enum pci_bus_speed pcie_get_speed_cap(struct pci_dev *dev);
+enum pcie_link_width pcie_get_width_cap(struct pci_dev *dev);
 
 /* Single Root I/O Virtualization */
 struct pci_sriov {



[PATCH v5 04/14] PCI: Add pcie_bandwidth_available() to compute bandwidth available to device

2018-03-30 Thread Bjorn Helgaas
From: Tal Gilboa 

Add pcie_bandwidth_available() to compute the bandwidth available to a
device.  This may be limited by the device itself or by a slower upstream
link leading to the device.

The available bandwidth at each link along the path is computed as:

  link_speed * link_width * (1 - encoding_overhead)

The encoding overhead is about 20% for 2.5 and 5.0 GT/s links using 8b/10b
encoding, and about 1.5% for 8 GT/s or higher speed links using 128b/130b
encoding.

Also return the device with the slowest link and the speed and width of
that link.
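
A typical use is to compare what the device is capable of with what the
slot actually delivers, which is essentially what pcie_print_link_status(),
added later in this series, does.  Sketched (from inside the PCI core,
since pcie_bandwidth_capable() is not exported):

	enum pci_bus_speed speed, speed_cap;
	enum pcie_link_width width, width_cap;
	struct pci_dev *limiting;
	u32 bw, bw_cap;

	bw_cap = pcie_bandwidth_capable(pdev, &speed_cap, &width_cap);
	bw = pcie_bandwidth_available(pdev, &limiting, &speed, &width);
	if (bw < bw_cap)
		pci_info(pdev, "%u Mb/s available, capable of %u Mb/s\n",
			 bw, bw_cap);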

Signed-off-by: Tal Gilboa 
[bhelgaas: changelog, leave pcie_get_minimum_link() alone for now, return
bw directly, use pci_upstream_bridge(), check "next_bw <= bw" to find
uppermost limiting device, return speed/width of the limiting device]
Signed-off-by: Bjorn Helgaas 
---
 drivers/pci/pci.c   |   54 +++
 include/linux/pci.h |3 +++
 2 files changed, 57 insertions(+)

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 9ce89e254197..e00d56b12747 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -5146,6 +5146,60 @@ int pcie_get_minimum_link(struct pci_dev *dev, enum 
pci_bus_speed *speed,
 }
 EXPORT_SYMBOL(pcie_get_minimum_link);
 
+/**
+ * pcie_bandwidth_available - determine minimum link settings of a PCIe
+ *   device and its bandwidth limitation
+ * @dev: PCI device to query
+ * @limiting_dev: storage for device causing the bandwidth limitation
+ * @speed: storage for speed of limiting device
+ * @width: storage for width of limiting device
+ *
+ * Walk up the PCI device chain and find the point where the minimum
+ * bandwidth is available.  Return the bandwidth available there and (if
+ * limiting_dev, speed, and width pointers are supplied) information about
+ * that point.
+ */
+u32 pcie_bandwidth_available(struct pci_dev *dev, struct pci_dev 
**limiting_dev,
+enum pci_bus_speed *speed,
+enum pcie_link_width *width)
+{
+   u16 lnksta;
+   enum pci_bus_speed next_speed;
+   enum pcie_link_width next_width;
+   u32 bw, next_bw;
+
+   *speed = PCI_SPEED_UNKNOWN;
+   *width = PCIE_LNK_WIDTH_UNKNOWN;
+   bw = 0;
+
+   while (dev) {
+		pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &lnksta);
+
+   next_speed = pcie_link_speed[lnksta & PCI_EXP_LNKSTA_CLS];
+   next_width = (lnksta & PCI_EXP_LNKSTA_NLW) >>
+   PCI_EXP_LNKSTA_NLW_SHIFT;
+
+   next_bw = next_width * PCIE_SPEED2MBS_ENC(next_speed);
+
+   /* Check if current device limits the total bandwidth */
+   if (!bw || next_bw <= bw) {
+   bw = next_bw;
+
+   if (limiting_dev)
+   *limiting_dev = dev;
+   if (speed)
+   *speed = next_speed;
+   if (width)
+   *width = next_width;
+   }
+
+   dev = pci_upstream_bridge(dev);
+   }
+
+   return bw;
+}
+EXPORT_SYMBOL(pcie_bandwidth_available);
+
 /**
  * pcie_get_speed_cap - query for the PCI device's link speed capability
  * @dev: PCI device to query
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 8043a5937ad0..f2bf2b7a66c7 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1083,6 +1083,9 @@ int pcie_get_mps(struct pci_dev *dev);
 int pcie_set_mps(struct pci_dev *dev, int mps);
 int pcie_get_minimum_link(struct pci_dev *dev, enum pci_bus_speed *speed,
  enum pcie_link_width *width);
+u32 pcie_bandwidth_available(struct pci_dev *dev, struct pci_dev 
**limiting_dev,
+enum pci_bus_speed *speed,
+enum pcie_link_width *width);
 void pcie_flr(struct pci_dev *dev);
 int __pci_reset_function_locked(struct pci_dev *dev);
 int pci_reset_function(struct pci_dev *dev);



[PATCH v5 07/14] net/mlx5: Report PCIe link properties with pcie_print_link_status()

2018-03-30 Thread Bjorn Helgaas
From: Tal Gilboa 

Use pcie_print_link_status() to report PCIe link speed and possible
limitations.

Signed-off-by: Tal Gilboa 
[bhelgaas: changelog]
Signed-off-by: Bjorn Helgaas 
Reviewed-by: Tariq Toukan 
---
 drivers/net/ethernet/mellanox/mlx5/core/main.c |4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 2ef641c91c26..622f02d34aae 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1043,6 +1043,10 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
	dev_info(&pdev->dev, "firmware version: %d.%d.%d\n", fw_rev_maj(dev),
 fw_rev_min(dev), fw_rev_sub(dev));
 
+   /* Only PFs hold the relevant PCIe information for this query */
+   if (mlx5_core_is_pf(dev))
+   pcie_print_link_status(dev->pdev);
+
/* on load removing any previous indication of internal error, device is
 * up
 */



[PATCH v5 08/14] net/mlx5e: Use pcie_bandwidth_available() to compute bandwidth

2018-03-30 Thread Bjorn Helgaas
From: Tal Gilboa 

Use the new pcie_bandwidth_available() function to calculate maximum
available bandwidth through the PCI chain instead of computing it ourselves
with mlx5e_get_pci_bw().

This is used to detect when the device is capable of more bandwidth than is
available in the current slot.  The driver may adjust compression settings
accordingly.

Note that pcie_bandwidth_available() accounts for PCIe encoding overhead, so
it is more accurate than mlx5e_get_pci_bw() was.

Signed-off-by: Tal Gilboa 
[bhelgaas: remove mlx5e_get_pci_bw() wrapper altogether]
Signed-off-by: Bjorn Helgaas 
Reviewed-by: Tariq Toukan 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |   32 +
 1 file changed, 1 insertion(+), 31 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 47bab842c5ee..93291ec4a3d1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -3864,36 +3864,6 @@ void mlx5e_build_default_indir_rqt(u32 *indirection_rqt, int len,
indirection_rqt[i] = i % num_channels;
 }
 
-static int mlx5e_get_pci_bw(struct mlx5_core_dev *mdev, u32 *pci_bw)
-{
-   enum pcie_link_width width;
-   enum pci_bus_speed speed;
-   int err = 0;
-
-   err = pcie_get_minimum_link(mdev->pdev, &speed, &width);
-   if (err)
-   return err;
-
-   if (speed == PCI_SPEED_UNKNOWN || width == PCIE_LNK_WIDTH_UNKNOWN)
-   return -EINVAL;
-
-   switch (speed) {
-   case PCIE_SPEED_2_5GT:
-   *pci_bw = 2500 * width;
-   break;
-   case PCIE_SPEED_5_0GT:
-   *pci_bw = 5000 * width;
-   break;
-   case PCIE_SPEED_8_0GT:
-   *pci_bw = 8000 * width;
-   break;
-   default:
-   return -EINVAL;
-   }
-
-   return 0;
-}
-
 static bool cqe_compress_heuristic(u32 link_speed, u32 pci_bw)
 {
return (link_speed && pci_bw &&
@@ -3979,7 +3949,7 @@ void mlx5e_build_nic_params(struct mlx5_core_dev *mdev,
params->num_tc   = 1;
 
	mlx5e_get_max_linkspeed(mdev, &link_speed);
-   mlx5e_get_pci_bw(mdev, &pci_bw);
+   pci_bw = pcie_bandwidth_available(mdev->pdev, NULL, NULL, NULL);
mlx5_core_dbg(mdev, "Max link speed = %d, PCI BW = %d\n",
  link_speed, pci_bw);
 

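Since mlx5e only needs the aggregate figure, it passes NULL for all three
out-parameters. A sketch of both call styles, assuming the out-parameters
are optional as in the guarded initialization earlier in this series:

    /* aggregate number only */
    u32 pci_bw = pcie_bandwidth_available(mdev->pdev, NULL, NULL, NULL);

    /* full detail about the limiting link */
    struct pci_dev *limiting = NULL;
    enum pci_bus_speed speed;
    enum pcie_link_width width;
    u32 bw = pcie_bandwidth_available(mdev->pdev, &limiting, &speed, &width);
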


[PATCH v5 06/14] net/mlx4_core: Report PCIe link properties with pcie_print_link_status()

2018-03-30 Thread Bjorn Helgaas
From: Tal Gilboa 

Use pcie_print_link_status() to report PCIe link speed and possible
limitations instead of implementing this in the driver itself.

Signed-off-by: Tal Gilboa 
Signed-off-by: Tariq Toukan 
[bhelgaas: changelog]
Signed-off-by: Bjorn Helgaas 
---
 drivers/net/ethernet/mellanox/mlx4/main.c |   81 -
 1 file changed, 1 insertion(+), 80 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c
index 4d84cab77105..30cacac54e69 100644
--- a/drivers/net/ethernet/mellanox/mlx4/main.c
+++ b/drivers/net/ethernet/mellanox/mlx4/main.c
@@ -623,85 +623,6 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap)
return 0;
 }
 
-static int mlx4_get_pcie_dev_link_caps(struct mlx4_dev *dev,
-  enum pci_bus_speed *speed,
-  enum pcie_link_width *width)
-{
-   u32 lnkcap1, lnkcap2;
-   int err1, err2;
-
-#define  PCIE_MLW_CAP_SHIFT 4  /* start of MLW mask in link capabilities */
-
-   *speed = PCI_SPEED_UNKNOWN;
-   *width = PCIE_LNK_WIDTH_UNKNOWN;
-
-   err1 = pcie_capability_read_dword(dev->persist->pdev, PCI_EXP_LNKCAP,
- &lnkcap1);
-   err2 = pcie_capability_read_dword(dev->persist->pdev, PCI_EXP_LNKCAP2,
- &lnkcap2);
-   if (!err2 && lnkcap2) { /* PCIe r3.0-compliant */
-   if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_8_0GB)
-   *speed = PCIE_SPEED_8_0GT;
-   else if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_5_0GB)
-   *speed = PCIE_SPEED_5_0GT;
-   else if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_2_5GB)
-   *speed = PCIE_SPEED_2_5GT;
-   }
-   if (!err1) {
-   *width = (lnkcap1 & PCI_EXP_LNKCAP_MLW) >> PCIE_MLW_CAP_SHIFT;
-   if (!lnkcap2) { /* pre-r3.0 */
-   if (lnkcap1 & PCI_EXP_LNKCAP_SLS_5_0GB)
-   *speed = PCIE_SPEED_5_0GT;
-   else if (lnkcap1 & PCI_EXP_LNKCAP_SLS_2_5GB)
-   *speed = PCIE_SPEED_2_5GT;
-   }
-   }
-
-   if (*speed == PCI_SPEED_UNKNOWN || *width == PCIE_LNK_WIDTH_UNKNOWN) {
-   return err1 ? err1 :
-   err2 ? err2 : -EINVAL;
-   }
-   return 0;
-}
-
-static void mlx4_check_pcie_caps(struct mlx4_dev *dev)
-{
-   enum pcie_link_width width, width_cap;
-   enum pci_bus_speed speed, speed_cap;
-   int err;
-
-#define PCIE_SPEED_STR(speed) \
-   (speed == PCIE_SPEED_8_0GT ? "8.0GT/s" : \
-speed == PCIE_SPEED_5_0GT ? "5.0GT/s" : \
-speed == PCIE_SPEED_2_5GT ? "2.5GT/s" : \
-"Unknown")
-
-   err = mlx4_get_pcie_dev_link_caps(dev, &speed_cap, &width_cap);
-   if (err) {
-   mlx4_warn(dev,
- "Unable to determine PCIe device BW capabilities\n");
-   return;
-   }
-
-   err = pcie_get_minimum_link(dev->persist->pdev, &speed, &width);
-   if (err || speed == PCI_SPEED_UNKNOWN ||
-   width == PCIE_LNK_WIDTH_UNKNOWN) {
-   mlx4_warn(dev,
- "Unable to determine PCI device chain minimum BW\n");
-   return;
-   }
-
-   if (width != width_cap || speed != speed_cap)
-   mlx4_warn(dev,
- "PCIe BW is different than device's capability\n");
-
-   mlx4_info(dev, "PCIe link speed is %s, device supports %s\n",
- PCIE_SPEED_STR(speed), PCIE_SPEED_STR(speed_cap));
-   mlx4_info(dev, "PCIe link width is x%d, device supports x%d\n",
- width, width_cap);
-   return;
-}
-
 /*The function checks if there are live vf, return the num of them*/
 static int mlx4_how_many_lives_vf(struct mlx4_dev *dev)
 {
@@ -3475,7 +3396,7 @@ static int mlx4_load_one(struct pci_dev *pdev, int pci_dev_data,
 * express device capabilities are under-satisfied by the bus.
 */
if (!mlx4_is_slave(dev))
-   mlx4_check_pcie_caps(dev);
+   pcie_print_link_status(dev->persist->pdev);
 
/* In master functions, the communication channel must be initialized
 * after obtaining its address from fw */



[PATCH v5 09/14] bnx2x: Report PCIe link properties with pcie_print_link_status()

2018-03-30 Thread Bjorn Helgaas
From: Bjorn Helgaas 

Use pcie_print_link_status() to report PCIe link speed and possible
limitations instead of implementing this in the driver itself.

Note that pcie_get_minimum_link() can return misleading information because
it finds the slowest link and the narrowest link without considering the
total bandwidth of the link.  If the path contains a 16 GT/s x1 link and a
2.5 GT/s x16 link, pcie_get_minimum_link() returns 2.5 GT/s x1, which
corresponds to 250 MB/s of bandwidth, not the actual available bandwidth of
about 2000 MB/s for a 16 GT/s x1 link.

Signed-off-by: Bjorn Helgaas 
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c |   23 ++
 1 file changed, 6 insertions(+), 17 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index 74fc9af4aadb..c92601f1b0f3 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -13922,8 +13922,6 @@ static int bnx2x_init_one(struct pci_dev *pdev,
 {
struct net_device *dev = NULL;
struct bnx2x *bp;
-   enum pcie_link_width pcie_width;
-   enum pci_bus_speed pcie_speed;
int rc, max_non_def_sbs;
int rx_count, tx_count, rss_count, doorbell_size;
int max_cos_est;
@@ -14091,21 +14089,12 @@ static int bnx2x_init_one(struct pci_dev *pdev,
dev_addr_add(bp->dev, bp->fip_mac, NETDEV_HW_ADDR_T_SAN);
rtnl_unlock();
}
-   if (pcie_get_minimum_link(bp->pdev, &pcie_speed, &pcie_width) ||
-   pcie_speed == PCI_SPEED_UNKNOWN ||
-   pcie_width == PCIE_LNK_WIDTH_UNKNOWN)
-   BNX2X_DEV_INFO("Failed to determine PCI Express Bandwidth\n");
-   else
-   BNX2X_DEV_INFO(
-  "%s (%c%d) PCI-E x%d %s found at mem %lx, IRQ %d, node 
addr %pM\n",
-  board_info[ent->driver_data].name,
-  (CHIP_REV(bp) >> 12) + 'A', (CHIP_METAL(bp) >> 4),
-  pcie_width,
-  pcie_speed == PCIE_SPEED_2_5GT ? "2.5GHz" :
-  pcie_speed == PCIE_SPEED_5_0GT ? "5.0GHz" :
-  pcie_speed == PCIE_SPEED_8_0GT ? "8.0GHz" :
-  "Unknown",
-  dev->base_addr, bp->pdev->irq, dev->dev_addr);
+   BNX2X_DEV_INFO(
+  "%s (%c%d) PCI-E found at mem %lx, IRQ %d, node addr %pM\n",
+  board_info[ent->driver_data].name,
+  (CHIP_REV(bp) >> 12) + 'A', (CHIP_METAL(bp) >> 4),
+  dev->base_addr, bp->pdev->irq, dev->dev_addr);
+   pcie_print_link_status(bp->pdev);
 
bnx2x_register_phc(bp);
 

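The 250 MB/s versus ~2000 MB/s figures in the changelog above work out as
follows (informal arithmetic, not part of the patch):

    /*
     * 2.5 GT/s and 5 GT/s links use 8b/10b encoding (20% overhead);
     * 8 GT/s and 16 GT/s links use 128b/130b (~1.5% overhead).
     *
     *   2.5 GT/s x1:  2.5e9 * 8/10    / 8 bits  =  250 MB/s
     *   16  GT/s x1:  16e9  * 128/130 / 8 bits  ~ 1969 MB/s
     *
     * So reporting the second link as "2.5 GT/s x1" understates its
     * usable bandwidth by roughly a factor of eight.
     */
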


[PATCH v5 11/14] cxgb4: Report PCIe link properties with pcie_print_link_status()

2018-03-30 Thread Bjorn Helgaas
From: Bjorn Helgaas 

Use pcie_print_link_status() to report PCIe link speed and possible
limitations instead of implementing this in the driver itself.

Note that pcie_get_minimum_link() can return misleading information because
it finds the slowest link and the narrowest link without considering the
total bandwidth of the link.  If the path contains a 16 GT/s x1 link and a
2.5 GT/s x16 link, pcie_get_minimum_link() returns 2.5 GT/s x1, which
corresponds to 250 MB/s of bandwidth, not the actual available bandwidth of
about 2000 MB/s for a 16 GT/s x1 link.

Signed-off-by: Bjorn Helgaas 
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c |   75 ---
 1 file changed, 1 insertion(+), 74 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 56bc626ef006..2d6864c8199e 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -4762,79 +4762,6 @@ static int init_rss(struct adapter *adap)
return 0;
 }
 
-static int cxgb4_get_pcie_dev_link_caps(struct adapter *adap,
-   enum pci_bus_speed *speed,
-   enum pcie_link_width *width)
-{
-   u32 lnkcap1, lnkcap2;
-   int err1, err2;
-
-#define  PCIE_MLW_CAP_SHIFT 4   /* start of MLW mask in link capabilities */
-
-   *speed = PCI_SPEED_UNKNOWN;
-   *width = PCIE_LNK_WIDTH_UNKNOWN;
-
-   err1 = pcie_capability_read_dword(adap->pdev, PCI_EXP_LNKCAP,
- &lnkcap1);
-   err2 = pcie_capability_read_dword(adap->pdev, PCI_EXP_LNKCAP2,
- &lnkcap2);
-   if (!err2 && lnkcap2) { /* PCIe r3.0-compliant */
-   if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_8_0GB)
-   *speed = PCIE_SPEED_8_0GT;
-   else if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_5_0GB)
-   *speed = PCIE_SPEED_5_0GT;
-   else if (lnkcap2 & PCI_EXP_LNKCAP2_SLS_2_5GB)
-   *speed = PCIE_SPEED_2_5GT;
-   }
-   if (!err1) {
-   *width = (lnkcap1 & PCI_EXP_LNKCAP_MLW) >> PCIE_MLW_CAP_SHIFT;
-   if (!lnkcap2) { /* pre-r3.0 */
-   if (lnkcap1 & PCI_EXP_LNKCAP_SLS_5_0GB)
-   *speed = PCIE_SPEED_5_0GT;
-   else if (lnkcap1 & PCI_EXP_LNKCAP_SLS_2_5GB)
-   *speed = PCIE_SPEED_2_5GT;
-   }
-   }
-
-   if (*speed == PCI_SPEED_UNKNOWN || *width == PCIE_LNK_WIDTH_UNKNOWN)
-   return err1 ? err1 : err2 ? err2 : -EINVAL;
-   return 0;
-}
-
-static void cxgb4_check_pcie_caps(struct adapter *adap)
-{
-   enum pcie_link_width width, width_cap;
-   enum pci_bus_speed speed, speed_cap;
-
-#define PCIE_SPEED_STR(speed) \
-   (speed == PCIE_SPEED_8_0GT ? "8.0GT/s" : \
-speed == PCIE_SPEED_5_0GT ? "5.0GT/s" : \
-speed == PCIE_SPEED_2_5GT ? "2.5GT/s" : \
-"Unknown")
-
-   if (cxgb4_get_pcie_dev_link_caps(adap, &speed_cap, &width_cap)) {
-   dev_warn(adap->pdev_dev,
-"Unable to determine PCIe device BW capabilities\n");
-   return;
-   }
-
-   if (pcie_get_minimum_link(adap->pdev, &speed, &width) ||
-   speed == PCI_SPEED_UNKNOWN || width == PCIE_LNK_WIDTH_UNKNOWN) {
-   dev_warn(adap->pdev_dev,
-"Unable to determine PCI Express bandwidth.\n");
-   return;
-   }
-
-   dev_info(adap->pdev_dev, "PCIe link speed is %s, device supports %s\n",
-PCIE_SPEED_STR(speed), PCIE_SPEED_STR(speed_cap));
-   dev_info(adap->pdev_dev, "PCIe link width is x%d, device supports 
x%d\n",
-width, width_cap);
-   if (speed < speed_cap || width < width_cap)
-   dev_info(adap->pdev_dev,
-"A slot with more lanes and/or higher speed is "
-"suggested for optimal performance.\n");
-}
-
 /* Dump basic information about the adapter */
 static void print_adapter_info(struct adapter *adapter)
 {
@@ -5466,7 +5393,7 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
}
 
/* check for PCI Express bandwidth capabiltites */
-   cxgb4_check_pcie_caps(adapter);
+   pcie_print_link_status(pdev);
 
err = init_rss(adapter);
if (err)



[PATCH v5 13/14] ixgbe: Report PCIe link properties with pcie_print_link_status()

2018-03-30 Thread Bjorn Helgaas
From: Bjorn Helgaas 

Use pcie_print_link_status() to report PCIe link speed and possible
limitations instead of implementing this in the driver itself.

Note that pcie_get_minimum_link() can return misleading information because
it finds the slowest link and the narrowest link without considering the
total bandwidth of the link.  If the path contains a 16 GT/s x1 link and a
2.5 GT/s x16 link, pcie_get_minimum_link() returns 2.5 GT/s x1, which
corresponds to 250 MB/s of bandwidth, not the actual available bandwidth of
about 2000 MB/s for a 16 GT/s x1 link.

Signed-off-by: Bjorn Helgaas 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |   47 +
 1 file changed, 1 insertion(+), 46 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 0da5aa2c8aba..38bb9c17d333 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -270,9 +270,6 @@ static void ixgbe_check_minimum_link(struct ixgbe_adapter *adapter,
 int expected_gts)
 {
struct ixgbe_hw *hw = &adapter->hw;
-   int max_gts = 0;
-   enum pci_bus_speed speed = PCI_SPEED_UNKNOWN;
-   enum pcie_link_width width = PCIE_LNK_WIDTH_UNKNOWN;
struct pci_dev *pdev;
 
/* Some devices are not connected over PCIe and thus do not negotiate
@@ -288,49 +285,7 @@ static void ixgbe_check_minimum_link(struct ixgbe_adapter *adapter,
else
pdev = adapter->pdev;
 
-   if (pcie_get_minimum_link(pdev, &speed, &width) ||
-   speed == PCI_SPEED_UNKNOWN || width == PCIE_LNK_WIDTH_UNKNOWN) {
-   e_dev_warn("Unable to determine PCI Express bandwidth.\n");
-   return;
-   }
-
-   switch (speed) {
-   case PCIE_SPEED_2_5GT:
-   /* 8b/10b encoding reduces max throughput by 20% */
-   max_gts = 2 * width;
-   break;
-   case PCIE_SPEED_5_0GT:
-   /* 8b/10b encoding reduces max throughput by 20% */
-   max_gts = 4 * width;
-   break;
-   case PCIE_SPEED_8_0GT:
-   /* 128b/130b encoding reduces throughput by less than 2% */
-   max_gts = 8 * width;
-   break;
-   default:
-   e_dev_warn("Unable to determine PCI Express bandwidth.\n");
-   return;
-   }
-
-   e_dev_info("PCI Express bandwidth of %dGT/s available\n",
-  max_gts);
-   e_dev_info("(Speed:%s, Width: x%d, Encoding Loss:%s)\n",
-  (speed == PCIE_SPEED_8_0GT ? "8.0GT/s" :
-   speed == PCIE_SPEED_5_0GT ? "5.0GT/s" :
-   speed == PCIE_SPEED_2_5GT ? "2.5GT/s" :
-   "Unknown"),
-  width,
-  (speed == PCIE_SPEED_2_5GT ? "20%" :
-   speed == PCIE_SPEED_5_0GT ? "20%" :
-   speed == PCIE_SPEED_8_0GT ? "<2%" :
-   "Unknown"));
-
-   if (max_gts < expected_gts) {
-   e_dev_warn("This is not sufficient for optimal performance of 
this card.\n");
-   e_dev_warn("For optimal performance, at least %dGT/s of 
bandwidth is required.\n",
-   expected_gts);
-   e_dev_warn("A slot with more lanes and/or higher speed is 
suggested.\n");
-   }
+   pcie_print_link_status(pdev);
 }
 
 static void ixgbe_service_event_schedule(struct ixgbe_adapter *adapter)



[PATCH v5 10/14] bnxt_en: Report PCIe link properties with pcie_print_link_status()

2018-03-30 Thread Bjorn Helgaas
From: Bjorn Helgaas 

Use pcie_print_link_status() to report PCIe link speed and possible
limitations instead of implementing this in the driver itself.

Note that pcie_get_minimum_link() can return misleading information because
it finds the slowest link and the narrowest link without considering the
total bandwidth of the link.  If the path contains a 16 GT/s x1 link and a
2.5 GT/s x16 link, pcie_get_minimum_link() returns 2.5 GT/s x1, which
corresponds to 250 MB/s of bandwidth, not the actual available bandwidth of
about 2000 MB/s for a 16 GT/s x1 link.

Signed-off-by: Bjorn Helgaas 
---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c |   19 +--
 1 file changed, 1 insertion(+), 18 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 1500243b9886..3be42431e029 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -8469,22 +8469,6 @@ static int bnxt_init_mac_addr(struct bnxt *bp)
return rc;
 }
 
-static void bnxt_parse_log_pcie_link(struct bnxt *bp)
-{
-   enum pcie_link_width width = PCIE_LNK_WIDTH_UNKNOWN;
-   enum pci_bus_speed speed = PCI_SPEED_UNKNOWN;
-
-   if (pcie_get_minimum_link(pci_physfn(bp->pdev), &speed, &width) ||
-   speed == PCI_SPEED_UNKNOWN || width == PCIE_LNK_WIDTH_UNKNOWN)
-   netdev_info(bp->dev, "Failed to determine PCIe Link Info\n");
-   else
-   netdev_info(bp->dev, "PCIe: Speed %s Width x%d\n",
-   speed == PCIE_SPEED_2_5GT ? "2.5GT/s" :
-   speed == PCIE_SPEED_5_0GT ? "5.0GT/s" :
-   speed == PCIE_SPEED_8_0GT ? "8.0GT/s" :
-   "Unknown", width);
-}
-
 static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 {
static int version_printed;
@@ -8694,8 +8678,7 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
netdev_info(dev, "%s found at mem %lx, node addr %pM\n",
board_info[ent->driver_data].name,
(long)pci_resource_start(pdev, 0), dev->dev_addr);
-
-   bnxt_parse_log_pcie_link(bp);
+   pcie_print_link_status(pdev);
 
return 0;
 



[PATCH v5 12/14] fm10k: Report PCIe link properties with pcie_print_link_status()

2018-03-30 Thread Bjorn Helgaas
From: Bjorn Helgaas 

Use pcie_print_link_status() to report PCIe link speed and possible
limitations instead of implementing this in the driver itself.

Note that pcie_get_minimum_link() can return misleading information because
it finds the slowest link and the narrowest link without considering the
total bandwidth of the link.  If the path contains a 16 GT/s x1 link and a
2.5 GT/s x16 link, pcie_get_minimum_link() returns 2.5 GT/s x1, which
corresponds to 250 MB/s of bandwidth, not the actual available bandwidth of
about 2000 MB/s for a 16 GT/s x1 link.

Signed-off-by: Bjorn Helgaas 
---
 drivers/net/ethernet/intel/fm10k/fm10k_pci.c |   87 --
 1 file changed, 1 insertion(+), 86 deletions(-)

diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
index a434fecfdfeb..aa05fb534942 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
@@ -2120,91 +2120,6 @@ static int fm10k_sw_init(struct fm10k_intfc *interface,
return 0;
 }
 
-static void fm10k_slot_warn(struct fm10k_intfc *interface)
-{
-   enum pcie_link_width width = PCIE_LNK_WIDTH_UNKNOWN;
-   enum pci_bus_speed speed = PCI_SPEED_UNKNOWN;
-   struct fm10k_hw *hw = &interface->hw;
-   int max_gts = 0, expected_gts = 0;
-
-   if (pcie_get_minimum_link(interface->pdev, &speed, &width) ||
-   speed == PCI_SPEED_UNKNOWN || width == PCIE_LNK_WIDTH_UNKNOWN) {
-   dev_warn(&interface->pdev->dev,
-"Unable to determine PCI Express bandwidth.\n");
-   return;
-   }
-
-   switch (speed) {
-   case PCIE_SPEED_2_5GT:
-   /* 8b/10b encoding reduces max throughput by 20% */
-   max_gts = 2 * width;
-   break;
-   case PCIE_SPEED_5_0GT:
-   /* 8b/10b encoding reduces max throughput by 20% */
-   max_gts = 4 * width;
-   break;
-   case PCIE_SPEED_8_0GT:
-   /* 128b/130b encoding has less than 2% impact on throughput */
-   max_gts = 8 * width;
-   break;
-   default:
-   dev_warn(&interface->pdev->dev,
-"Unable to determine PCI Express bandwidth.\n");
-   return;
-   }
-
-   dev_info(&interface->pdev->dev,
-"PCI Express bandwidth of %dGT/s available\n",
-max_gts);
-   dev_info(&interface->pdev->dev,
-"(Speed:%s, Width: x%d, Encoding Loss:%s, Payload:%s)\n",
-(speed == PCIE_SPEED_8_0GT ? "8.0GT/s" :
- speed == PCIE_SPEED_5_0GT ? "5.0GT/s" :
- speed == PCIE_SPEED_2_5GT ? "2.5GT/s" :
- "Unknown"),
-hw->bus.width,
-(speed == PCIE_SPEED_2_5GT ? "20%" :
- speed == PCIE_SPEED_5_0GT ? "20%" :
- speed == PCIE_SPEED_8_0GT ? "<2%" :
- "Unknown"),
-(hw->bus.payload == fm10k_bus_payload_128 ? "128B" :
- hw->bus.payload == fm10k_bus_payload_256 ? "256B" :
- hw->bus.payload == fm10k_bus_payload_512 ? "512B" :
- "Unknown"));
-
-   switch (hw->bus_caps.speed) {
-   case fm10k_bus_speed_2500:
-   /* 8b/10b encoding reduces max throughput by 20% */
-   expected_gts = 2 * hw->bus_caps.width;
-   break;
-   case fm10k_bus_speed_5000:
-   /* 8b/10b encoding reduces max throughput by 20% */
-   expected_gts = 4 * hw->bus_caps.width;
-   break;
-   case fm10k_bus_speed_8000:
-   /* 128b/130b encoding has less than 2% impact on throughput */
-   expected_gts = 8 * hw->bus_caps.width;
-   break;
-   default:
-   dev_warn(&interface->pdev->dev,
-"Unable to determine expected PCI Express bandwidth.\n");
-   return;
-   }
-
-   if (max_gts >= expected_gts)
-   return;
-
-   dev_warn(&interface->pdev->dev,
-"This device requires %dGT/s of bandwidth for optimal performance.\n",
-expected_gts);
-   dev_warn(&interface->pdev->dev,
-"A %sslot with x%d lanes is suggested.\n",
-(hw->bus_caps.speed == fm10k_bus_speed_2500 ? "2.5GT/s " :
- hw->bus_caps.speed == fm10k_bus_speed_5000 ? "5.0GT/s " :
- hw->bus_caps.speed == fm10k_bus_speed_8000 ? "8.0GT/s " : ""),
-hw->bus_caps.width);
-}
-
 /**
  * fm10k_probe - Device Initialization Routine
  * @pdev: PCI device information struct
@@ -2326,7 +2241,7 @@ static int fm10k_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
mod_timer(>service_timer, (HZ * 2) + jiffies);
 
/* print warning for non-optimal configurations */
-   fm10k_slot_warn(interface);
+   pcie_print_link_status(interface->pdev);
 
/* report MAC 

[PATCH v5 14/14] PCI: Remove unused pcie_get_minimum_link()

2018-03-30 Thread Bjorn Helgaas
From: Bjorn Helgaas 

In some cases pcie_get_minimum_link() returned misleading information
because it found the slowest link and the narrowest link without
considering the total bandwidth of the link.  For example, if the path
contained a 16 GT/s x1 link and a 2.5 GT/s x16 link,
pcie_get_minimum_link() returned 2.5 GT/s x1, which corresponds to 250 MB/s
of bandwidth, not the actual available bandwidth of about 2000 MB/s for a
16 GT/s x1 link.

Callers should use pcie_print_link_status() instead, or
pcie_bandwidth_available() if they need more detailed information.

Signed-off-by: Bjorn Helgaas 
---
 drivers/pci/pci.c   |   43 ---
 include/linux/pci.h |2 --
 2 files changed, 45 deletions(-)

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index cec7aed09f6b..b6951c44ae6c 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -5103,49 +5103,6 @@ int pcie_set_mps(struct pci_dev *dev, int mps)
 }
 EXPORT_SYMBOL(pcie_set_mps);
 
-/**
- * pcie_get_minimum_link - determine minimum link settings of a PCI device
- * @dev: PCI device to query
- * @speed: storage for minimum speed
- * @width: storage for minimum width
- *
- * This function will walk up the PCI device chain and determine the minimum
- * link width and speed of the device.
- */
-int pcie_get_minimum_link(struct pci_dev *dev, enum pci_bus_speed *speed,
- enum pcie_link_width *width)
-{
-   int ret;
-
-   *speed = PCI_SPEED_UNKNOWN;
-   *width = PCIE_LNK_WIDTH_UNKNOWN;
-
-   while (dev) {
-   u16 lnksta;
-   enum pci_bus_speed next_speed;
-   enum pcie_link_width next_width;
-
-   ret = pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &lnksta);
-   if (ret)
-   return ret;
-
-   next_speed = pcie_link_speed[lnksta & PCI_EXP_LNKSTA_CLS];
-   next_width = (lnksta & PCI_EXP_LNKSTA_NLW) >>
-   PCI_EXP_LNKSTA_NLW_SHIFT;
-
-   if (next_speed < *speed)
-   *speed = next_speed;
-
-   if (next_width < *width)
-   *width = next_width;
-
-   dev = dev->bus->self;
-   }
-
-   return 0;
-}
-EXPORT_SYMBOL(pcie_get_minimum_link);
-
 /**
  * pcie_bandwidth_available - determine minimum link settings of a PCIe
  *   device and its bandwidth limitation
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 38f7957121ef..5ccee29fe1b1 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1081,8 +1081,6 @@ int pcie_get_readrq(struct pci_dev *dev);
 int pcie_set_readrq(struct pci_dev *dev, int rq);
 int pcie_get_mps(struct pci_dev *dev);
 int pcie_set_mps(struct pci_dev *dev, int mps);
-int pcie_get_minimum_link(struct pci_dev *dev, enum pci_bus_speed *speed,
- enum pcie_link_width *width);
u32 pcie_bandwidth_available(struct pci_dev *dev, struct pci_dev **limiting_dev,
 enum pci_bus_speed *speed,
 enum pcie_link_width *width);



[PATCH v5 05/14] PCI: Add pcie_print_link_status() to log link speed and whether it's limited

2018-03-30 Thread Bjorn Helgaas
From: Tal Gilboa 

Add pcie_print_link_status().  This logs the current settings of the link
(speed, width, and total available bandwidth).

If the device is capable of more bandwidth but is limited by a slower
upstream link, we include information about the link that limits the
device's performance.

The user may be able to move the device to a different slot for better
performance.

This provides a unified method for all PCI devices to report status and
issues, instead of each device reporting in a different way, using
different code.

Signed-off-by: Tal Gilboa 
[bhelgaas: changelog, reword log messages, print device capabilities when
not limited]
Signed-off-by: Bjorn Helgaas 
---
 drivers/pci/pci.c   |   29 +
 include/linux/pci.h |1 +
 2 files changed, 30 insertions(+)

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index e00d56b12747..cec7aed09f6b 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -5283,6 +5283,35 @@ u32 pcie_bandwidth_capable(struct pci_dev *dev, enum pci_bus_speed *speed,
return *width * PCIE_SPEED2MBS_ENC(*speed);
 }
 
+/**
+ * pcie_print_link_status - Report the PCI device's link speed and width
+ * @dev: PCI device to query
+ *
+ * Report the available bandwidth at the device.  If this is less than the
+ * device is capable of, report the device's maximum possible bandwidth and
+ * the upstream link that limits its performance to less than that.
+ */
+void pcie_print_link_status(struct pci_dev *dev)
+{
+   enum pcie_link_width width, width_cap;
+   enum pci_bus_speed speed, speed_cap;
+   struct pci_dev *limiting_dev = NULL;
+   u32 bw_avail, bw_cap;
+
+   bw_cap = pcie_bandwidth_capable(dev, &speed_cap, &width_cap);
+   bw_avail = pcie_bandwidth_available(dev, &limiting_dev, &speed, &width);
+
+   if (bw_avail >= bw_cap)
+   pci_info(dev, "%d Mb/s available bandwidth (%s x%d link)\n",
+bw_cap, PCIE_SPEED2STR(speed_cap), width_cap);
+   else
+   pci_info(dev, "%d Mb/s available bandwidth, limited by %s x%d 
link at %s (capable of %d Mb/s with %s x%d link)\n",
+bw_avail, PCIE_SPEED2STR(speed), width,
+limiting_dev ? pci_name(limiting_dev) : "",
+bw_cap, PCIE_SPEED2STR(speed_cap), width_cap);
+}
+EXPORT_SYMBOL(pcie_print_link_status);
+
 /**
  * pci_select_bars - Make BAR mask from the type of resource
  * @dev: the PCI device for which BAR mask is made
diff --git a/include/linux/pci.h b/include/linux/pci.h
index f2bf2b7a66c7..38f7957121ef 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1086,6 +1086,7 @@ int pcie_get_minimum_link(struct pci_dev *dev, enum pci_bus_speed *speed,
 u32 pcie_bandwidth_available(struct pci_dev *dev, struct pci_dev **limiting_dev,
 enum pci_bus_speed *speed,
 enum pcie_link_width *width);
+void pcie_print_link_status(struct pci_dev *dev);
 void pcie_flr(struct pci_dev *dev);
 int __pci_reset_function_locked(struct pci_dev *dev);
 int pci_reset_function(struct pci_dev *dev);

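Plugging illustrative values into the two pci_info() format strings above,
the resulting log lines look roughly like this (device names and numbers
invented for the example):

    pci 0000:01:00.0: 63016 Mb/s available bandwidth (8 GT/s x8 link)
    pci 0000:02:00.0: 16000 Mb/s available bandwidth, limited by 2.5 GT/s x8 link at 0000:00:01.0 (capable of 63016 Mb/s with 8 GT/s x8 link)
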


[PATCH] Bluetooth: Mark expected switch fall-throughs

2018-03-30 Thread Gustavo A. R. Silva
In preparation to enabling -Wimplicit-fallthrough, mark switch cases
where we are expecting to fall through.

Signed-off-by: Gustavo A. R. Silva 
---
 net/bluetooth/mgmt.c| 1 +
 net/bluetooth/rfcomm/sock.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 6e9fc86..8a80d48 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -4801,6 +4801,7 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev,
case MGMT_LTK_P256_DEBUG:
authenticated = 0x00;
type = SMP_LTK_P256_DEBUG;
+   /* fall through */
default:
continue;
}
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index 93a3b21..d606e92 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -221,6 +221,7 @@ static void __rfcomm_sock_close(struct sock *sk)
case BT_CONFIG:
case BT_CONNECTED:
rfcomm_dlc_close(d, 0);
+   /* fall through */
 
default:
sock_set_flag(sk, SOCK_ZAPPED);
-- 
2.7.4

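The convention the patch follows, in miniature (case names illustrative):

    switch (event) {
    case EV_SETUP:
    	prepare();
    	/* fall through */	/* tells -Wimplicit-fallthrough this is intended */
    case EV_RUN:
    	run();
    	break;
    default:
    	break;
    }
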


[PATCH net-next] hv_netvsc: Clean up extra parameter from rndis_filter_receive_data()

2018-03-30 Thread Haiyang Zhang
From: Haiyang Zhang 

The variables, msg and data, have the same value. This patch removes
the extra one.

Signed-off-by: Haiyang Zhang 
---
 drivers/net/hyperv/rndis_filter.c | 16 +---
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c
index 4a4952363e8a..e2b68d9328a7 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -365,14 +365,15 @@ static inline void *rndis_get_ppi(struct rndis_packet *rpkt, u32 type)
 
 static int rndis_filter_receive_data(struct net_device *ndev,
 struct netvsc_device *nvdev,
-struct rndis_message *msg,
 struct vmbus_channel *channel,
-void *data, u32 data_buflen)
+struct rndis_message *msg,
+u32 data_buflen)
 {
struct rndis_packet *rndis_pkt = &msg->msg.pkt;
const struct ndis_tcp_ip_checksum_info *csum_info;
const struct ndis_pkt_8021q_info *vlan;
u32 data_offset;
+   void *data;
 
/* Remove the rndis header and pass it back up the stack */
data_offset = RNDIS_HEADER_SIZE + rndis_pkt->data_offset;
@@ -393,14 +394,15 @@ static int rndis_filter_receive_data(struct net_device *ndev,
 
vlan = rndis_get_ppi(rndis_pkt, IEEE_8021Q_INFO);
 
+   csum_info = rndis_get_ppi(rndis_pkt, TCPIP_CHKSUM_PKTINFO);
+
+   data = (void *)msg + data_offset;
+
/*
 * Remove the rndis trailer padding from rndis packet message
 * rndis_pkt->data_len tell us the real data length, we only copy
 * the data packet to the stack, without the rndis trailer padding
 */
-   data = (void *)((unsigned long)data + data_offset);
-   csum_info = rndis_get_ppi(rndis_pkt, TCPIP_CHKSUM_PKTINFO);
-
return netvsc_recv_callback(ndev, nvdev, channel,
data, rndis_pkt->data_len,
csum_info, vlan);
@@ -419,8 +421,8 @@ int rndis_filter_receive(struct net_device *ndev,
 
switch (rndis_msg->ndis_msg_type) {
case RNDIS_MSG_PACKET:
-   return rndis_filter_receive_data(ndev, net_dev, rndis_msg,
-channel, data, buflen);
+   return rndis_filter_receive_data(ndev, net_dev, channel,
+rndis_msg, buflen);
case RNDIS_MSG_INIT_C:
case RNDIS_MSG_QUERY_C:
case RNDIS_MSG_SET_C:
-- 
2.15.1

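Why the parameter was redundant, in two lines (sketch): every caller passed
the same pointer as both msg and data, so the old and new expressions
compute the same address:

    void *payload = (void *)msg + data_offset;               /* new form */
    /* == (void *)((unsigned long)data + data_offset) when data == msg  */
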


Re: [bpf-next PATCH v3 0/4] bpf, sockmap BPF_F_INGRESS support

2018-03-30 Thread Daniel Borkmann
On 03/28/2018 09:49 PM, John Fastabend wrote:
> This series adds the BPF_F_INGRESS flag support to the redirect APIs.
> Bringing the sockmap API in-line with the cls_bpf redirect APIs.
> 
> We add it to both variants of sockmap programs, the first patch adds
> support for tx ulp hooks and the third patch adds support for the recv
> skb hooks. Patches two and four add tests for the corresponding
> ingress redirect hooks.
> 
> Follow on patches can address busy polling support, but next series
> from me will move the sockmap sample program into selftests.
> 
> v2: added static to function definition caught by kbuild bot
> v3: fixed an error branch with missing mem_uncharge
> in recvmsg op moved receive_queue check outside of RCU region

Applied this yesterday to bpf-next, thanks John!


Re: [PATCH v2 bpf-next 0/2] sockmap: fix sg api usage

2018-03-30 Thread Daniel Borkmann
On 03/30/2018 05:39 AM, John Fastabend wrote:
> On 03/29/2018 05:20 PM, Prashant Bhole wrote:
>> These patches fix sg api usage in sockmap. Previously sockmap didn't
>> use sg_init_table(), which caused hitting BUG_ON in sg api, when
>> CONFIG_DEBUG_SG is enabled
>>
>> v1: added sg_init_table() calls wherever needed.
>>
>> v2:
>> - Patch1 adds new helper function in sg api. sg_init_marker()
>> - Patch2 sg_init_marker() and sg_init_table() in appropriate places
>>
>> Background:
>> While reviewing v1, John Fastabend raised a valid point about
>> unnecessary memset in sg_init_table() because sockmap uses sg table
>> which embedded in a struct. As enclosing struct is zeroed out, there
>> is unnecessary memset in sg_init_table.
>>
>> So Daniel Borkmann suggested to define another static inline function
>> in scatterlist.h which only initializes sg_magic. Also this function 
>> will be called from sg_init_table. From this suggestion I defined a
>> function sg_init_marker() which sets sg_magic and calls sg_mark_end()
> 
> Series looks good to me thanks for finding and fixing this!

Applied to bpf-next, thanks Prashant!

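Roughly what the description above implies for the new helper; a sketch,
not necessarily the exact merged code:

    static inline void sg_init_marker(struct scatterlist *sgl,
    				      unsigned int nents)
    {
    #ifdef CONFIG_DEBUG_SG
    	unsigned int i;

    	for (i = 0; i < nents; i++)
    		sgl[i].sg_magic = SG_MAGIC;	/* what sg_init_table() would set */
    #endif
    	sg_mark_end(&sgl[nents - 1]);		/* terminate without the memset */
    }
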

[PATCH v2 net-next 12/12] inet: frags: break the 2GB limit for frags storage

2018-03-30 Thread Eric Dumazet
Some users are willing to provision huge amounts of memory to be able
to perform reassembly reasonably well under pressure.

Current memory tracking is using one atomic_t and integers.

Switch to atomic_long_t so that 64bit arches can use more than 2GB,
without any cost for 32bit arches.

Note that this patch avoids an overflow error when high_thresh is set
to ~2GB: with 32-bit counters the memory counter wraps before it can
exceed such a threshold, so this test in inet_frag_alloc() was never true:

if (... || frag_mem_limit(nf) > nf->high_thresh)

Tested:

$ echo 160 >/proc/sys/net/ipv4/ipfrag_high_thresh



$ grep FRAG /proc/net/sockstat
FRAG: inuse 14705885 memory 1602880

$ nstat -n ; sleep 1 ; nstat | grep Reas
IpReasmReqds                    3317150            0.0
IpReasmFails                    3317112            0.0

Signed-off-by: Eric Dumazet 
---
 Documentation/networking/ip-sysctl.txt  |  4 ++--
 include/net/inet_frag.h | 20 ++--
 net/ieee802154/6lowpan/reassembly.c | 10 +-
 net/ipv4/ip_fragment.c  | 10 +-
 net/ipv4/proc.c |  2 +-
 net/ipv6/netfilter/nf_conntrack_reasm.c | 10 +-
 net/ipv6/proc.c |  2 +-
 net/ipv6/reassembly.c   |  6 +++---
 8 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 6f2a3670e44b6662ce53c16cb7ca1e4f61274c15..5dc1a040a2f1db610873de26c2d79bc57ac5a1a2 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -133,10 +133,10 @@ min_adv_mss - INTEGER
 
 IP Fragmentation:
 
-ipfrag_high_thresh - INTEGER
+ipfrag_high_thresh - LONG INTEGER
Maximum memory used to reassemble IP fragments.
 
-ipfrag_low_thresh - INTEGER
+ipfrag_low_thresh - LONG INTEGER
(Obsolete since linux-4.17)
Maximum memory used to reassemble IP fragments before the kernel
begins to remove incomplete fragment queues to free up resources.
diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 95e353e3305b43253084d972e32538138bcc5454..a52e7273e7a59bc8ce47b21d29235a740add8db0 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -8,11 +8,11 @@ struct netns_frags {
struct rhashtable   rhashtable cacheline_aligned_in_smp;
 
/* Keep atomic mem on separate cachelines in structs that include it */
-   atomic_t mem cacheline_aligned_in_smp;
+   atomic_long_t   mem cacheline_aligned_in_smp;
/* sysctls */
+   long high_thresh;
+   long low_thresh;
int timeout;
-   int high_thresh;
-   int low_thresh;
int max_dist;
struct inet_frags   *f;
 };
@@ -102,7 +102,7 @@ void inet_frags_fini(struct inet_frags *);
 
 static inline int inet_frags_init_net(struct netns_frags *nf)
 {
-   atomic_set(&nf->mem, 0);
+   atomic_long_set(&nf->mem, 0);
return rhashtable_init(&nf->rhashtable, &nf->f->rhash_params);
 }
 void inet_frags_exit_net(struct netns_frags *nf);
@@ -119,19 +119,19 @@ static inline void inet_frag_put(struct inet_frag_queue 
*q)
 
 /* Memory Tracking Functions. */
 
-static inline int frag_mem_limit(struct netns_frags *nf)
+static inline long frag_mem_limit(const struct netns_frags *nf)
 {
-   return atomic_read(&nf->mem);
+   return atomic_long_read(&nf->mem);
 }
 
-static inline void sub_frag_mem_limit(struct netns_frags *nf, int i)
+static inline void sub_frag_mem_limit(struct netns_frags *nf, long val)
 {
-   atomic_sub(i, &nf->mem);
+   atomic_long_sub(val, &nf->mem);
 }
 
-static inline void add_frag_mem_limit(struct netns_frags *nf, int i)
+static inline void add_frag_mem_limit(struct netns_frags *nf, long val)
 {
-   atomic_add(i, &nf->mem);
+   atomic_long_add(val, &nf->mem);
 }
 
 /* RFC 3168 support :
diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index 7eaa0617e277b829b801aee4e75f0ec61b2daf41..1f0857937ad187b48ff2af5e9c8570cf2b133fd2 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -411,23 +411,23 @@ int lowpan_frag_rcv(struct sk_buff *skb, u8 frag_type)
 }
 
 #ifdef CONFIG_SYSCTL
-static int zero;
+static long zero;
 
 static struct ctl_table lowpan_frags_ns_ctl_table[] = {
{
.procname   = "6lowpanfrag_high_thresh",
.data   = &init_net.ieee802154_lowpan.frags.high_thresh,
-   .maxlen = sizeof(int),
+   .maxlen = sizeof(unsigned long),
.mode   = 0644,
-   .proc_handler   = proc_dointvec_minmax,
+   .proc_handler   = proc_doulongvec_minmax,
.extra1 = &init_net.ieee802154_lowpan.frags.low_thresh
},
{
.procname   = 

[PATCH v2 net-next 11/12] inet: frags: remove inet_frag_maybe_warn_overflow()

2018-03-30 Thread Eric Dumazet
This function is obsolete, after rhashtable addition to inet defrag.

Signed-off-by: Eric Dumazet 
---
 include/net/inet_frag.h |  2 --
 net/ieee802154/6lowpan/reassembly.c |  5 ++---
 net/ipv4/inet_fragment.c| 11 ---
 net/ipv4/ip_fragment.c  |  5 ++---
 net/ipv6/netfilter/nf_conntrack_reasm.c |  5 ++---
 net/ipv6/reassembly.c   |  5 ++---
 6 files changed, 8 insertions(+), 25 deletions(-)

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 0e8e159d88f7f77254fae5a49f1c7ba07b967e11..95e353e3305b43253084d972e32538138bcc5454 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -110,8 +110,6 @@ void inet_frags_exit_net(struct netns_frags *nf);
 void inet_frag_kill(struct inet_frag_queue *q);
 void inet_frag_destroy(struct inet_frag_queue *q);
 struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key);
-void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
-  const char *prefix);
 
 static inline void inet_frag_put(struct inet_frag_queue *q)
 {
diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index dd743c287bc229b1ba354e834af7bec34dcb8643..7eaa0617e277b829b801aee4e75f0ec61b2daf41 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -84,10 +84,9 @@ fq_find(struct net *net, const struct lowpan_802154_cb *cb,
struct inet_frag_queue *q;
 
q = inet_frag_find(&ieee802154_lowpan->frags, &key);
-   if (IS_ERR_OR_NULL(q)) {
-   inet_frag_maybe_warn_overflow(q, pr_fmt());
+   if (!q)
return NULL;
-   }
+
return container_of(q, struct lowpan_frag_queue, q);
 }
 
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index bbb0ff2c262e2d73630b441a088a036397df6f28..2323129c9ccbd4bc7c87f100863ab764a47f5765 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -216,14 +216,3 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key)
return inet_frag_create(nf, key);
 }
 EXPORT_SYMBOL(inet_frag_find);
-
-void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
-  const char *prefix)
-{
-   static const char msg[] = "inet_frag_find: Fragment hash bucket"
-   " list length grew over limit. Dropping fragment.\n";
-
-   if (PTR_ERR(q) == -ENOBUFS)
-   net_dbg_ratelimited("%s%s", prefix, msg);
-}
-EXPORT_SYMBOL(inet_frag_maybe_warn_overflow);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index b844f517b75bd6a52538e9f7687e039e22c93bc7..b0366224f314ae521d8c1f8fe04c53e419292b4c 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -221,10 +221,9 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph,
struct inet_frag_queue *q;
 
q = inet_frag_find(&net->ipv4.frags, &key);
-   if (IS_ERR_OR_NULL(q)) {
-   inet_frag_maybe_warn_overflow(q, pr_fmt());
+   if (!q)
return NULL;
-   }
+
return container_of(q, struct ipq, q);
 }
 
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 0ad3df551d9884ba30f2d40658ee81a61720e947..d866412b8f6c432f04c0968f08f820fdc561262b 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -178,10 +178,9 @@ static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user,
struct inet_frag_queue *q;
 
q = inet_frag_find(&net->nf_frag.frags, &key);
-   if (IS_ERR_OR_NULL(q)) {
-   inet_frag_maybe_warn_overflow(q, pr_fmt());
+   if (!q)
return NULL;
-   }
+
return container_of(q, struct frag_queue, q);
 }
 
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 70acad126d044a0f6a1efc63f307805fcf7b1df9..2a77fda5e3bca1b6ce8c24df11e741653a15c665 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -155,10 +155,9 @@ fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif)
key.iif = 0;
 
q = inet_frag_find(&net->ipv6.frags, &key);
-   if (IS_ERR_OR_NULL(q)) {
-   inet_frag_maybe_warn_overflow(q, pr_fmt());
+   if (!q)
return NULL;
-   }
+
return container_of(q, struct frag_queue, q);
 }
 
-- 
2.17.0.rc1.321.gba9d0f2565-goog



[PATCH v2 net-next 10/12] inet: frags: get rid of inet_frag_evicting()

2018-03-30 Thread Eric Dumazet
This refactors ip_expire(), removing one level of indentation.

Note: in the future, we should try hard to avoid the skb_clone()
since this is a serious performance cost.
Under DDOS, the ICMP message won't be sent because of rate limits.

The fact that ip6_expire_frag_queue() does not use skb_clone() is
disturbing too. Presumably IPv6 has the same issue as the one we
fixed in commit ec4fbd64751d
("inet: frag: release spinlock before calling icmp_send()")

Signed-off-by: Eric Dumazet 
---
 include/net/inet_frag.h |  5 
 net/ipv4/ip_fragment.c  | 61 -
 net/ipv6/reassembly.c   |  4 ---
 3 files changed, 30 insertions(+), 40 deletions(-)

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 4b5449df0aadf1f75144c98317bf5305ec91d88b..0e8e159d88f7f77254fae5a49f1c7ba07b967e11 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -119,11 +119,6 @@ static inline void inet_frag_put(struct inet_frag_queue *q)
inet_frag_destroy(q);
 }
 
-static inline bool inet_frag_evicting(struct inet_frag_queue *q)
-{
-   return false;
-}
-
 /* Memory Tracking Functions. */
 
 static inline int frag_mem_limit(struct netns_frags *nf)
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 44f4fa306e224a6f76183b1c04935f01ceb4fe2b..b844f517b75bd6a52538e9f7687e039e22c93bc7 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -143,8 +143,11 @@ static bool frag_expire_skip_icmp(u32 user)
 static void ip_expire(struct timer_list *t)
 {
struct inet_frag_queue *frag = from_timer(frag, t, timer);
-   struct ipq *qp;
+   struct sk_buff *clone, *head;
+   const struct iphdr *iph;
struct net *net;
+   struct ipq *qp;
+   int err;
 
qp = container_of(frag, struct ipq, q);
net = container_of(qp->q.net, struct net, ipv4.frags);
@@ -158,45 +161,41 @@ static void ip_expire(struct timer_list *t)
ipq_kill(qp);
__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
 
-   if (!inet_frag_evicting(&qp->q)) {
-   struct sk_buff *clone, *head = qp->q.fragments;
-   const struct iphdr *iph;
-   int err;
+   head = qp->q.fragments;
 
-   __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
+   __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
 
-   if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments)
-   goto out;
+   if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !head)
+   goto out;
 
-   head->dev = dev_get_by_index_rcu(net, qp->iif);
-   if (!head->dev)
-   goto out;
+   head->dev = dev_get_by_index_rcu(net, qp->iif);
+   if (!head->dev)
+   goto out;
 
 
-   /* skb has no dst, perform route lookup again */
-   iph = ip_hdr(head);
-   err = ip_route_input_noref(head, iph->daddr, iph->saddr,
+   /* skb has no dst, perform route lookup again */
+   iph = ip_hdr(head);
+   err = ip_route_input_noref(head, iph->daddr, iph->saddr,
   iph->tos, head->dev);
-   if (err)
-   goto out;
+   if (err)
+   goto out;
 
-   /* Only an end host needs to send an ICMP
-* "Fragment Reassembly Timeout" message, per RFC792.
-*/
-   if (frag_expire_skip_icmp(qp->q.key.v4.user) &&
-   (skb_rtable(head)->rt_type != RTN_LOCAL))
-   goto out;
+   /* Only an end host needs to send an ICMP
+* "Fragment Reassembly Timeout" message, per RFC792.
+*/
+   if (frag_expire_skip_icmp(qp->q.key.v4.user) &&
+   (skb_rtable(head)->rt_type != RTN_LOCAL))
+   goto out;
 
-   clone = skb_clone(head, GFP_ATOMIC);
+   clone = skb_clone(head, GFP_ATOMIC);
 
-   /* Send an ICMP "Fragment Reassembly Timeout" message. */
-   if (clone) {
-   spin_unlock(&qp->q.lock);
-   icmp_send(clone, ICMP_TIME_EXCEEDED,
- ICMP_EXC_FRAGTIME, 0);
-   consume_skb(clone);
-   goto out_rcu_unlock;
-   }
+   /* Send an ICMP "Fragment Reassembly Timeout" message. */
+   if (clone) {
+   spin_unlock(&qp->q.lock);
+   icmp_send(clone, ICMP_TIME_EXCEEDED,
+ ICMP_EXC_FRAGTIME, 0);
+   consume_skb(clone);
+   goto out_rcu_unlock;
}
 out:
spin_unlock(&qp->q.lock);
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 3fc853e4492abb109062d662296c0b470763042a..70acad126d044a0f6a1efc63f307805fcf7b1df9 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -106,10 +106,6 @@ void ip6_expire_frag_queue(struct net *net, struct 

[PATCH v2 net-next 08/12] inet: frags: use rhashtables for reassembly units

2018-03-30 Thread Eric Dumazet
Some applications still rely on IP fragmentation and, to be fair, the
Linux reassembly unit does not work under any serious load.

It uses static hash tables of 1024 buckets, and up to 128 items per bucket (!!!)

A work queue is supposed to garbage collect items when host is under memory
pressure, and doing a hash rebuild, changing seed used in hash computations.

This work queue blocks softirqs for up to 25 ms when doing a hash rebuild,
occurring every 5 seconds if the host is under fire.

Then there is the problem of sharing this hash table for all netns.

It is time to switch to rhashtables, and allocate one of them per netns
to speed up netns dismantle, since this is a critical metric these days.

Lookup is now using RCU. A followup patch will even remove
the refcount hold/release left from prior implementation and save
a couple of atomic operations.

Before this patch, 16 cpus (16 RX queue NIC) could not handle more
than 1 Mpps frags DDOS.

After the patch, I reach 7 Mpps without any tuning, and can use up to 2GB
of storage for the fragments.

$ grep FRAG /proc/net/sockstat
FRAG: inuse 1966916 memory 2140004608

A followup patch will change the limits for 64bit arches.

Signed-off-by: Eric Dumazet 
Cc: Florian Westphal 
Cc: Nikolay Aleksandrov 
Cc: Jesper Dangaard Brouer 
Cc: Alexander Aring 
Cc: Stefan Schmidt 
---
 Documentation/networking/ip-sysctl.txt  |   7 +-
 include/net/inet_frag.h |  81 +++---
 include/net/ipv6.h  |  16 +-
 net/ieee802154/6lowpan/6lowpan_i.h  |  26 +-
 net/ieee802154/6lowpan/reassembly.c |  93 +++
 net/ipv4/inet_fragment.c| 352 +---
 net/ipv4/ip_fragment.c  | 112 
 net/ipv6/netfilter/nf_conntrack_reasm.c |  51 +---
 net/ipv6/reassembly.c   | 110 
 9 files changed, 269 insertions(+), 579 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 33f35f049ad57ad6c06ed6e089966e346d72d108..6f2a3670e44b6662ce53c16cb7ca1e4f61274c15 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -134,13 +134,10 @@ min_adv_mss - INTEGER
 IP Fragmentation:
 
 ipfrag_high_thresh - INTEGER
-   Maximum memory used to reassemble IP fragments. When
-   ipfrag_high_thresh bytes of memory is allocated for this purpose,
-   the fragment handler will toss packets until ipfrag_low_thresh
-   is reached. This also serves as a maximum limit to namespaces
-   different from the initial one.
+   Maximum memory used to reassemble IP fragments.
 
 ipfrag_low_thresh - INTEGER
+   (Obsolete since linux-4.17)
Maximum memory used to reassemble IP fragments before the kernel
begins to remove incomplete fragment queues to free up resources.
The kernel still accepts new fragments for defragmentation.
diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 69e531ed81894393e07cac9e953825fcb55ef42a..3fec0d3a0d0186e98afb951784e1fe7329ba6d77 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -2,7 +2,11 @@
 #ifndef __NET_FRAG_H__
 #define __NET_FRAG_H__
 
+#include <linux/rhashtable.h>
+
 struct netns_frags {
+   struct rhashtable   rhashtable cacheline_aligned_in_smp;
+
/* Keep atomic mem on separate cachelines in structs that include it */
atomic_t mem cacheline_aligned_in_smp;
/* sysctls */
@@ -26,12 +30,30 @@ enum {
INET_FRAG_COMPLETE  = BIT(2),
 };
 
+struct frag_v4_compare_key {
+   __be32  saddr;
+   __be32  daddr;
+   u32 user;
+   u32 vif;
+   __be16  id;
+   u16 protocol;
+};
+
+struct frag_v6_compare_key {
+   struct in6_addr saddr;
+   struct in6_addr daddr;
+   u32 user;
+   __be32  id;
+   u32 iif;
+};
+
 /**
  * struct inet_frag_queue - fragment queue
  *
- * @lock: spinlock protecting the queue
+ * @node: rhash node
+ * @key: keys identifying this frag.
  * @timer: queue expiration timer
- * @list: hash bucket list
+ * @lock: spinlock protecting this frag
  * @refcnt: reference count of the queue
  * @fragments: received fragments head
  * @fragments_tail: received fragments tail
@@ -41,12 +63,16 @@ enum {
  * @flags: fragment queue flags
  * @max_size: maximum received fragment size
  * @net: namespace that this frag belongs to
- * @list_evictor: list of queues to forcefully evict (e.g. due to low memory)
+ * @rcu: rcu head for freeing deferral
  */
 struct inet_frag_queue {
-   spinlock_t  lock;
+   struct rhash_head   node;
+   union {
+   struct frag_v4_compare_key v4;
+   struct frag_v6_compare_key v6;
+   } key;
struct timer_list

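The RCU lookup path the changelog above describes, approximately (a sketch
of the per-netns lookup; error handling trimmed):

    struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key)
    {
    	struct inet_frag_queue *fq;

    	rcu_read_lock();
    	fq = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params);
    	if (fq) {
    		if (!refcount_inc_not_zero(&fq->refcnt))
    			fq = NULL;	/* raced with teardown */
    		rcu_read_unlock();
    		return fq;
    	}
    	rcu_read_unlock();
    	return inet_frag_create(nf, key);	/* not found: allocate + insert */
    }
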
[PATCH v2 net-next 09/12] inet: frags: remove some helpers

2018-03-30 Thread Eric Dumazet
Remove sum_frag_mem_limit(), ip_frag_mem() & ip6_frag_mem()

Also since we use rhashtable we can bring back the number of fragments
in "grep FRAG /proc/net/sockstat /proc/net/sockstat6" that was
removed in commit 434d305405ab ("inet: frag: don't account number
of fragment queues")

Signed-off-by: Eric Dumazet 
---
 include/net/inet_frag.h | 5 -
 include/net/ip.h| 1 -
 include/net/ipv6.h  | 7 ---
 net/ipv4/ip_fragment.c  | 5 -
 net/ipv4/proc.c | 6 +++---
 net/ipv6/proc.c | 5 +++--
 6 files changed, 6 insertions(+), 23 deletions(-)

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 3fec0d3a0d0186e98afb951784e1fe7329ba6d77..4b5449df0aadf1f75144c98317bf5305ec91d88b 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -141,11 +141,6 @@ static inline void add_frag_mem_limit(struct netns_frags *nf, int i)
-   atomic_add(i, &nf->mem);
 }
 
-static inline int sum_frag_mem_limit(struct netns_frags *nf)
-{
-   return atomic_read(&nf->mem);
-}
-
 /* RFC 3168 support :
  * We want to check ECN values of all fragments, do detect invalid 
combinations.
  * In ipq->ecn, we store the OR value of each ip4_frag_ecn() fragment value.
diff --git a/include/net/ip.h b/include/net/ip.h
index 36f8f7811093c37de06194dc7410b7596f8bf9fa..ecffd843e7b896a83416847fdaa452be6223f3dc 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -588,7 +588,6 @@ static inline struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *s
return skb;
 }
 #endif
-int ip_frag_mem(struct net *net);
 
 /*
  * Functions provided by ip_forward.c
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 6fa9a2bc589665dfa9ce84813f33e5e86e12fd74..37455e84034779fab96c231fa069957a6dcf2b42 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -379,13 +379,6 @@ static inline bool ipv6_accept_ra(struct inet6_dev *idev)
idev->cnf.accept_ra;
 }
 
-#if IS_ENABLED(CONFIG_IPV6)
-static inline int ip6_frag_mem(struct net *net)
-{
-   return sum_frag_mem_limit(&net->ipv6.frags);
-}
-#endif
-
 #define IPV6_FRAG_HIGH_THRESH  (4 * 1024*1024) /* 4194304 */
 #define IPV6_FRAG_LOW_THRESH   (3 * 1024*1024) /* 3145728 */
 #define IPV6_FRAG_TIMEOUT  (60 * HZ)   /* 60 seconds */
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 4021820db6f291b255cc53aeca91dd74aef29934..44f4fa306e224a6f76183b1c04935f01ceb4fe2b 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -83,11 +83,6 @@ static u8 ip4_frag_ecn(u8 tos)
 
 static struct inet_frags ip4_frags;
 
-int ip_frag_mem(struct net *net)
-{
-   return sum_frag_mem_limit(&net->ipv4.frags);
-}
-
 static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 struct net_device *dev);
 
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index adfb75340275d240487574257c10feb295df44fe..aacfce0d7d82cf59269a69ef4d6ac8d9955b0bdc 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -54,7 +54,6 @@
 static int sockstat_seq_show(struct seq_file *seq, void *v)
 {
struct net *net = seq->private;
-   unsigned int frag_mem;
int orphans, sockets;
 
orphans = percpu_counter_sum_positive(&tcp_orphan_count);
@@ -72,8 +71,9 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
sock_prot_inuse_get(net, &udplite_prot));
seq_printf(seq, "RAW: inuse %d\n",
sock_prot_inuse_get(net, &raw_prot));
-   frag_mem = ip_frag_mem(net);
-   seq_printf(seq,  "FRAG: inuse %u memory %u\n", !!frag_mem, frag_mem);
+   seq_printf(seq,  "FRAG: inuse %u memory %u\n",
+  atomic_read(&net->ipv4.frags.rhashtable.nelems),
+  frag_mem_limit(&net->ipv4.frags));
return 0;
 }
 
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index 6e57028d2e9160be264d07f9312658fcb677a568..8befeb91e0712ecc4d05c4c0a6ecca1808dcbcac 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -38,7 +38,6 @@
 static int sockstat6_seq_show(struct seq_file *seq, void *v)
 {
struct net *net = seq->private;
-   unsigned int frag_mem = ip6_frag_mem(net);
 
seq_printf(seq, "TCP6: inuse %d\n",
sock_prot_inuse_get(net, &tcpv6_prot));
@@ -48,7 +47,9 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v)
sock_prot_inuse_get(net, &udplitev6_prot));
seq_printf(seq, "RAW6: inuse %d\n",
sock_prot_inuse_get(net, &rawv6_prot));
-   seq_printf(seq, "FRAG6: inuse %u memory %u\n", !!frag_mem, frag_mem);
+   seq_printf(seq, "FRAG6: inuse %u memory %u\n",
+  atomic_read(&net->ipv6.frags.rhashtable.nelems),
+  frag_mem_limit(&net->ipv6.frags));
return 0;
 }
 
-- 
2.17.0.rc1.321.gba9d0f2565-goog



[PATCH v2 net-next 04/12] inet: frags: refactor ipv6_frag_init()

2018-03-30 Thread Eric Dumazet
We want to call inet_frags_init() earlier.

This is a prereq to "inet: frags: use rhashtables for reassembly units"

Signed-off-by: Eric Dumazet 
---
 net/ipv6/reassembly.c | 27 +++
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 4855de6f673a4753526679ca29dcdaebecb5777f..f0071b113a92fcff15ac57610170c12b17cb59ba 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -742,18 +742,6 @@ int __init ipv6_frag_init(void)
 {
int ret;
 
-   ret = inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT);
-   if (ret)
-   goto out;
-
-   ret = ip6_frags_sysctl_register();
-   if (ret)
-   goto err_sysctl;
-
-   ret = register_pernet_subsys(&ip6_frags_ops);
-   if (ret)
-   goto err_pernet;
-
ip6_frags.hashfn = ip6_hashfn;
ip6_frags.constructor = ip6_frag_init;
ip6_frags.destructor = NULL;
@@ -762,8 +750,21 @@ int __init ipv6_frag_init(void)
ip6_frags.frag_expire = ip6_frag_expire;
ip6_frags.frags_cache_name = ip6_frag_cache_name;
ret = inet_frags_init(&ip6_frags);
+   if (ret)
+   goto out;
+
+   ret = inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT);
+   if (ret)
+   goto err_protocol;
+
+   ret = ip6_frags_sysctl_register();
+   if (ret)
+   goto err_sysctl;
+
+   ret = register_pernet_subsys(&ip6_frags_ops);
if (ret)
goto err_pernet;
+
 out:
return ret;
 
@@ -771,6 +772,8 @@ int __init ipv6_frag_init(void)
ip6_frags_sysctl_unregister();
 err_sysctl:
inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT);
+err_protocol:
+   inet_frags_fini(&ip6_frags);
goto out;
 }
 
-- 
2.17.0.rc1.321.gba9d0f2565-goog

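The ordering rule generalizes across patches 04-06 of the series; a sketch
with hypothetical foo_* names, not taken from the kernel:

    static int __init foo_frag_init(void)
    {
    	int ret;

    	ret = inet_frags_init(&foo_frags);	/* core first */
    	if (ret)
    		return ret;

    	ret = foo_sysctl_register();		/* hypothetical later step */
    	if (ret)
    		goto err_sysctl;

    	ret = register_pernet_subsys(&foo_frags_ops);
    	if (ret)
    		goto err_pernet;

    	return 0;

    err_pernet:
    	foo_sysctl_unregister();
    err_sysctl:
    	inet_frags_fini(&foo_frags);		/* unwind in reverse order */
    	return ret;
    }
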


[PATCH v2 net-next 07/12] rhashtable: add schedule points

2018-03-30 Thread Eric Dumazet
Rehashing and destroying large hash table takes a lot of time,
and happens in process context. It is safe to add cond_resched()
in rhashtable_rehash_table() and rhashtable_free_and_destroy()

Signed-off-by: Eric Dumazet 
---
 lib/rhashtable.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 
47de025b624520f75e521bef46dc9b28baa6a1a0..2b2b79974b614a94e5325e8c2271804cb27069aa
 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -333,6 +333,7 @@ static int rhashtable_rehash_table(struct rhashtable *ht)
err = rhashtable_rehash_chain(ht, old_hash);
if (err)
return err;
+   cond_resched();
}
 
/* Publish the new table pointer. */
@@ -1112,6 +1113,7 @@ void rhashtable_free_and_destroy(struct rhashtable *ht,
for (i = 0; i < tbl->size; i++) {
struct rhash_head *pos, *next;
 
+   cond_resched();
for (pos = rht_dereference(*rht_bucket(tbl, i), ht),
 next = !rht_is_a_nulls(pos) ?
rht_dereference(pos->next, ht) : NULL;
-- 
2.17.0.rc1.321.gba9d0f2565-goog



[PATCH v2 net-next 06/12] inet: frags: refactor ipfrag_init()

2018-03-30 Thread Eric Dumazet
We need to call inet_frags_init() before register_pernet_subsys(),
as a prereq for the following patch ("inet: frags: use rhashtables for
reassembly units")

Signed-off-by: Eric Dumazet 
---
 net/ipv4/ip_fragment.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 
cd2b4c9419fc1552d367b572926e314b11cb6c00..1a3bc85d6f5ea8f36b8f3d221cad632906b317a2
 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -896,8 +896,6 @@ static struct pernet_operations ip4_frags_ops = {
 
 void __init ipfrag_init(void)
 {
-   ip4_frags_ctl_register();
-   register_pernet_subsys(&ip4_frags_ops);
ip4_frags.hashfn = ip4_hashfn;
ip4_frags.constructor = ip4_frag_init;
ip4_frags.destructor = ip4_frag_free;
@@ -907,4 +905,6 @@ void __init ipfrag_init(void)
ip4_frags.frags_cache_name = ip_frag_cache_name;
	if (inet_frags_init(&ip4_frags))
panic("IP: failed to allocate ip4_frags cache\n");
+   ip4_frags_ctl_register();
+   register_pernet_subsys(&ip4_frags_ops);
 }
-- 
2.17.0.rc1.321.gba9d0f2565-goog



[PATCH v2 net-next 05/12] inet: frags: refactor lowpan_net_frag_init()

2018-03-30 Thread Eric Dumazet
We want to call lowpan_net_frag_init() earlier.
Similar to commit "inet: frags: refactor ipv6_frag_init()"

This is a prereq to "inet: frags: use rhashtables for reassembly units"

Signed-off-by: Eric Dumazet 
---
 net/ieee802154/6lowpan/reassembly.c | 20 +++-
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/net/ieee802154/6lowpan/reassembly.c 
b/net/ieee802154/6lowpan/reassembly.c
index 
6badc05b7baedac2051a1aaea15f9e9b180c..ddada12a044de293f904a1dc7a5ff398d089d101
 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -615,14 +615,6 @@ int __init lowpan_net_frag_init(void)
 {
int ret;
 
-   ret = lowpan_frags_sysctl_register();
-   if (ret)
-   return ret;
-
-   ret = register_pernet_subsys(&lowpan_frags_ops);
-   if (ret)
-   goto err_pernet;
-
lowpan_frags.hashfn = lowpan_hashfn;
lowpan_frags.constructor = lowpan_frag_init;
lowpan_frags.destructor = NULL;
@@ -631,12 +623,22 @@ int __init lowpan_net_frag_init(void)
lowpan_frags.frag_expire = lowpan_frag_expire;
lowpan_frags.frags_cache_name = lowpan_frags_cache_name;
	ret = inet_frags_init(&lowpan_frags);
+   if (ret)
+   goto out;
+
+   ret = lowpan_frags_sysctl_register();
+   if (ret)
+   goto err_sysctl;
+
+   ret = register_pernet_subsys(&lowpan_frags_ops);
if (ret)
goto err_pernet;
-
+out:
return ret;
 err_pernet:
lowpan_frags_sysctl_unregister();
+err_sysctl:
+   inet_frags_fini(&lowpan_frags);
return ret;
 }
 
-- 
2.17.0.rc1.321.gba9d0f2565-goog



[PATCH v2 net-next 03/12] inet: frags: add a pointer to struct netns_frags

2018-03-30 Thread Eric Dumazet
In order to simplify the API, add to struct netns_frags a pointer to the
struct inet_frags it uses. This will allow us to make things less complex.

These functions no longer have a struct inet_frags parameter:

inet_frag_destroy(struct inet_frag_queue *q  /*, struct inet_frags *f */)
inet_frag_put(struct inet_frag_queue *q /*, struct inet_frags *f */)
inet_frag_kill(struct inet_frag_queue *q /*, struct inet_frags *f */)
inet_frags_exit_net(struct netns_frags *nf /*, struct inet_frags *f */)
ip6_expire_frag_queue(struct net *net, struct frag_queue *fq)
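
The callees can now recover the struct inet_frags through the back-pointer
instead of taking it as an argument. A schematic user-space model of the
idea (all names are illustrative, not the kernel's):

	struct ops;			/* shared behaviour table */

	struct ns_state {		/* per-namespace data */
		struct ops *f;		/* new: back-pointer to the ops */
	};

	struct queue {
		struct ns_state *net;	/* each queue knows its namespace */
	};

	/* Before: void queue_put(struct queue *q, struct ops *f); */
	static void queue_put(struct queue *q)
	{
		struct ops *f = q->net->f;	/* recovered, not passed in */
		(void)f;	/* ... drop reference, maybe destroy ... */
	}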

Signed-off-by: Eric Dumazet 
---
 include/net/inet_frag.h | 11 ++-
 include/net/ipv6.h  |  3 +--
 net/ieee802154/6lowpan/reassembly.c | 13 +++--
 net/ipv4/inet_fragment.c| 17 ++---
 net/ipv4/ip_fragment.c  |  9 +
 net/ipv6/netfilter/nf_conntrack_reasm.c | 16 +---
 net/ipv6/reassembly.c   | 20 ++--
 7 files changed, 48 insertions(+), 41 deletions(-)

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 
b1d62176f3b4fcf100bd263e8eae0db656a3d9b6..69e531ed81894393e07cac9e953825fcb55ef42a
 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -10,6 +10,7 @@ struct netns_frags {
int high_thresh;
int low_thresh;
int max_dist;
+   struct inet_frags   *f;
 };
 
 /**
@@ -109,20 +110,20 @@ static inline int inet_frags_init_net(struct netns_frags *nf)
	atomic_set(&nf->mem, 0);
return 0;
 }
-void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f);
+void inet_frags_exit_net(struct netns_frags *nf);
 
-void inet_frag_kill(struct inet_frag_queue *q, struct inet_frags *f);
-void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f);
+void inet_frag_kill(struct inet_frag_queue *q);
+void inet_frag_destroy(struct inet_frag_queue *q);
 struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
struct inet_frags *f, void *key, unsigned int hash);
 
 void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
   const char *prefix);
 
-static inline void inet_frag_put(struct inet_frag_queue *q, struct inet_frags *f)
+static inline void inet_frag_put(struct inet_frag_queue *q)
 {
	if (refcount_dec_and_test(&q->refcnt))
-   inet_frag_destroy(q, f);
+   inet_frag_destroy(q);
 }
 
 static inline bool inet_frag_evicting(struct inet_frag_queue *q)
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 
5c18836672e9d1c560cdce15f5b34928c337abfd..57b7fe43d2ab8e0ef3d663b7a5ee201affd5ca1f
 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -607,8 +607,7 @@ struct frag_queue {
u8  ecn;
 };
 
-void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq,
-  struct inet_frags *frags);
+void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq);
 
 static inline bool ipv6_addr_any(const struct in6_addr *a)
 {
diff --git a/net/ieee802154/6lowpan/reassembly.c 
b/net/ieee802154/6lowpan/reassembly.c
index 
2aaab4bba42961647a4d3d1c0b8497917d5065ce..6badc05b7baedac2051a1aaea15f9e9b180c
 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -94,10 +94,10 @@ static void lowpan_frag_expire(struct timer_list *t)
if (fq->q.flags & INET_FRAG_COMPLETE)
goto out;
 
-   inet_frag_kill(&fq->q, &lowpan_frags);
+   inet_frag_kill(&fq->q);
 out:
	spin_unlock(&fq->q.lock);
-   inet_frag_put(&fq->q, &lowpan_frags);
+   inet_frag_put(&fq->q);
 }
 
 static inline struct lowpan_frag_queue *
@@ -230,7 +230,7 @@ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev,
struct sk_buff *fp, *head = fq->q.fragments;
int sum_truesize;
 
-   inet_frag_kill(&fq->q, &lowpan_frags);
+   inet_frag_kill(&fq->q);
 
/* Make the one we just received the head. */
if (prev) {
@@ -438,7 +438,7 @@ int lowpan_frag_rcv(struct sk_buff *skb, u8 frag_type)
ret = lowpan_frag_queue(fq, skb, frag_type);
	spin_unlock(&fq->q.lock);
 
-   inet_frag_put(&fq->q, &lowpan_frags);
+   inet_frag_put(&fq->q);
return ret;
}
 
@@ -586,13 +586,14 @@ static int __net_init lowpan_frags_init_net(struct net *net)
ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH;
ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT;
+   ieee802154_lowpan->frags.f = &lowpan_frags;
 
	res = inet_frags_init_net(&ieee802154_lowpan->frags);
if (res < 0)
return res;
res = lowpan_frags_ns_sysctl_register(net);
if (res < 0)
-   inet_frags_exit_net(&ieee802154_lowpan->frags, &lowpan_frags);
+   inet_frags_exit_net(_lowpan->frags);
   

  1   2   3   4   >