[PATCH net-next 2/2] tcp: ack immediately when a cwr packet arrives

2018-06-29 Thread Lawrence Brakmo
We observed high 99th and 99.9th percentile latencies when doing RPCs
with DCTCP. The problem is triggered when the last packet of a request
arrives CE marked. The reply will carry the ECE mark, causing TCP to
shrink its cwnd to 1 (because there are no packets in flight). When the
1st packet of the next request arrived, its ACK was sometimes delayed,
adding up to 40ms to the latency.

This patch ensures that arriving CWR-marked data packets are ACKed
immediately.

Signed-off-by: Lawrence Brakmo 
---
 net/ipv4/tcp_input.c | 25 +
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 76ca88f63b70..b024d36f0d56 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -98,6 +98,8 @@ int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
 #define FLAG_UPDATE_TS_RECENT  0x4000 /* tcp_replace_ts_recent() */
 #define FLAG_NO_CHALLENGE_ACK  0x8000 /* do not call tcp_send_challenge_ack() */
 #define FLAG_ACK_MAYBE_DELAYED 0x10000 /* Likely a delayed ACK */
+#define FLAG_OFO_POSSIBLE  0x20000 /* Possible OFO */
+#define FLAG_CWR   0x40000 /* CWR in this ACK */
 
 #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
 #define FLAG_NOT_DUP   (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
@@ -5087,7 +5089,7 @@ static inline void tcp_data_snd_check(struct sock *sk)
 /*
  * Check if sending an ack is needed.
  */
-static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
+static void __tcp_ack_snd_check(struct sock *sk, int flags)
 {
struct tcp_sock *tp = tcp_sk(sk);
unsigned long rtt, delay;
@@ -5102,13 +5104,16 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
(tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat ||
 __tcp_select_window(sk) >= tp->rcv_wnd)) ||
/* We ACK each frame or... */
-   tcp_in_quickack_mode(sk)) {
+   tcp_in_quickack_mode(sk) ||
+   /* We received CWR */
+   flags & FLAG_CWR) {
 send_now:
tcp_send_ack(sk);
return;
}
 
-   if (!ofo_possible || RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
+   if (!(flags & FLAG_OFO_POSSIBLE) ||
+   RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
tcp_send_delayed_ack(sk);
return;
}
@@ -5134,13 +5139,13 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
  HRTIMER_MODE_REL_PINNED_SOFT);
 }
 
-static inline void tcp_ack_snd_check(struct sock *sk)
+static inline void tcp_ack_snd_check(struct sock *sk, int flags)
 {
if (!inet_csk_ack_scheduled(sk)) {
/* We sent a data segment already. */
return;
}
-   __tcp_ack_snd_check(sk, 1);
+   __tcp_ack_snd_check(sk, flags | FLAG_OFO_POSSIBLE);
 }
 
 /*
@@ -5489,6 +5494,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
goto discard;
}
} else {
+   int flags;
int eaten = 0;
bool fragstolen = false;
 
@@ -5525,7 +5531,9 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
goto no_ack;
}
 
-   __tcp_ack_snd_check(sk, 0);
+   flags = (tcp_flag_word(th) & TCP_FLAG_CWR) ?
+   FLAG_CWR : 0;
+   __tcp_ack_snd_check(sk, flags);
 no_ack:
if (eaten)
kfree_skb_partial(skb, fragstolen);
@@ -5561,7 +5569,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
tcp_data_queue(sk, skb);
 
tcp_data_snd_check(sk);
-   tcp_ack_snd_check(sk);
+   tcp_ack_snd_check(sk, (tcp_flag_word(th) & TCP_FLAG_CWR) ?
+ FLAG_CWR : 0);
return;
 
 csum_error:
@@ -6150,7 +6159,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
/* tcp_data could move socket to TIME-WAIT */
if (sk->sk_state != TCP_CLOSE) {
tcp_data_snd_check(sk);
-   tcp_ack_snd_check(sk);
+   tcp_ack_snd_check(sk, 0);
}
 
if (!queued) {
-- 
2.17.1



[PATCH net-next 1/2] tcp: notify when a delayed ack is sent

2018-06-29 Thread Lawrence Brakmo
DCTCP depends on the CA_EVENT_NON_DELAYED_ACK and CA_EVENT_DELAYED_ACK
notifications to keep track if it needs to send an ACK for packets that
were received with a particular ECN state but whose ACK was delayed.

Under some circumstances, for example when a delayed ACK is sent with a
data packet, DCTCP state was not being updated due to a lack of
notification that the previously delayed ACK was sent. As a result, it
would sometimes send a duplicate ACK when a new data packet arrived.

This patch ensures that DCTCP's state is correctly updated so it will
not send the duplicate ACK.
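
For context, DCTCP's consumer of these events (net/ipv4/tcp_dctcp.c in
this era) tracks the delayed-ACK state roughly as sketched below; this
is paraphrased for illustration and is not part of the patch. It shows
why a missed CA_EVENT_NON_DELAYED_ACK leaves the flag stale:

/* Paraphrased sketch of DCTCP's event consumer (not part of this
 * patch): the flag is only cleared when CA_EVENT_NON_DELAYED_ACK
 * arrives, which is exactly the notification this patch adds.
 */
static void dctcp_update_ack_reserved(struct sock *sk, enum tcp_ca_event ev)
{
        struct dctcp *ca = inet_csk_ca(sk);

        switch (ev) {
        case CA_EVENT_DELAYED_ACK:
                if (!ca->delayed_ack_reserved)
                        ca->delayed_ack_reserved = 1;
                break;
        case CA_EVENT_NON_DELAYED_ACK:
                if (ca->delayed_ack_reserved)
                        ca->delayed_ack_reserved = 0;
                break;
        default:
                break;
        }
}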

Signed-off-by: Lawrence Brakmo 
---
 net/ipv4/tcp_output.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f8f6129160dd..41f6ad7a21e4 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -172,6 +172,8 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
__sock_put(sk);
}
tcp_dec_quickack_mode(sk, pkts);
+   if (inet_csk_ack_scheduled(sk))
+   tcp_ca_event(sk, CA_EVENT_NON_DELAYED_ACK);
inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
 }
 
-- 
2.17.1



[PATCH net-next 0/2] tcp: fix high tail latencies in DCTCP

2018-06-29 Thread Lawrence Brakmo
We have observed high tail latencies when using DCTCP for RPCs as
compared to using Cubic. For example, in one setup there are 2 hosts
sending to a 3rd one, with each sender having 3 flows (1 stream,
1 1MB back-to-back RPCs and 1 10KB back-to-back RPCs). The following
table shows the 99% and 99.9% latencies for both Cubic and DCTCP:

            Cubic 99%  Cubic 99.9%  DCTCP 99%  DCTCP 99.9%
  1MB RPCs     2.6ms        5.5ms       43ms        208ms
 10KB RPCs     1.1ms        1.3ms       53ms        212ms

Looking at tcpdump traces showed that there are two causes for the
latency.  

  1) RTOs caused by the receiver sending a dup ACK and not ACKing
 the last (and only) packet sent.
  2) Delaying ACKs when the sender has a cwnd of 1, so everything
 pauses for the duration of the delayed ACK.

The first patch fixes the cause of the dup ACKs: DCTCP state was not
updated when an ACK that was initially delayed was later sent with a
data packet.

The second patch ensures that an ACK is sent immediately when a
CWR-marked packet arrives.

With the patches the latencies for DCTCP now look like:

            DCTCP 99%  DCTCP 99.9%
  1MB RPCs     4.8ms        6.5ms
 10KB RPCs     143us        184us

Note that while the 1MB RPCs tail latencies are higher than Cubic's,
the 10KB latencies are much smaller than Cubic's. These patches fix
issues on the receiver, but tcpdump traces indicate there is an
opportunity to also fix an issue at the sender that adds about 3ms
to the tail latencies.

The following trace shows the issue that triggers an RTO (fixed by these
patches):

   Host A sends the last packets of the request
   Host B receives them, and the last packet is marked with congestion (CE)
   Host B sends ACKs for packets not marked with congestion
   Host B sends data packet with reply and ACK for packet marked with
  congestion (TCP flag ECE)
   Host A receives ACKs with no ECE flag
   Host A receives data packet with ACK for the last packet of request
  and which has TCP ECE bit set
   Host A sends 1st data packet of the next request with TCP flag CWR
   Host B receives the packet (as seen in tcpdump at B), no CE flag
   Host B sends a dup ACK that also has the TCP ECE flag
   Host A RTO timer fires!
   Host A resends the packet
   Host A receives an ACK for everything it has sent (i.e. Host B
  did receive 1st packet of request)
   Host A sends more packets…

[PATCH net-next 1/2] tcp: notify when a delayed ack is sent
[PATCH net-next 2/2] tcp: ack immediately when a cwr packet arrives

 net/ipv4/tcp_input.c  | 25 +
 net/ipv4/tcp_output.c |  2 ++
 2 files changed, 19 insertions(+), 8 deletions(-)




Re: [RFC v2 PATCH 1/4] eBPF: Add new eBPF prog type BPF_PROG_TYPE_SOCKET_SG_FILTER

2018-06-29 Thread Tushar Dave

On 06/29/2018 01:27 AM, Daniel Borkmann wrote:

On 06/19/2018 08:00 PM, Tushar Dave wrote:

Add new eBPF prog type BPF_PROG_TYPE_SOCKET_SG_FILTER which uses the
existing socket filter infrastructure for bpf program attach and load.
A SOCKET_SG_FILTER eBPF program receives struct scatterlist as its bpf
context, in contrast to SOCKET_FILTER, which deals with struct skb. This
is useful for kernel entities that don't have an skb to represent packet
data but want to run an eBPF socket filter on packet data that is in the
form of a struct scatterlist, e.g. IB/RDMA.

Signed-off-by: Tushar Dave 
Acked-by: Sowmini Varadhan 
---
  include/linux/bpf_types.h  |  1 +
  include/linux/filter.h |  8 +
  include/uapi/linux/bpf.h   |  7 
  kernel/bpf/syscall.c   |  1 +
  kernel/bpf/verifier.c  |  1 +
  net/core/filter.c  | 77 --
  samples/bpf/bpf_load.c | 11 --
  tools/bpf/bpftool/prog.c   |  1 +
  tools/include/uapi/linux/bpf.h |  7 
  tools/lib/bpf/libbpf.c |  3 ++
  tools/lib/bpf/libbpf.h |  2 ++
  11 files changed, 114 insertions(+), 5 deletions(-)


[...]
  
+static bool socksg_filter_is_valid_access(int off, int size,

+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+   switch (off) {
+   case offsetof(struct sg_filter_md, data):
+   info->reg_type = PTR_TO_PACKET;
+   break;
+   case offsetof(struct sg_filter_md, data_end):
+   info->reg_type = PTR_TO_PACKET_END;
+   break;
+   }
+
+   if (off < 0 || off >= sizeof(struct sg_filter_md))
+   return false;
+   if (off % size != 0)
+   return false;
+   if (size != sizeof(__u64))
+   return false;
+
+   return true;
+}


Just a note, I don't know much about rds, but when you make this writable
for rds/tcp you definitely must make sure that it can be handled properly
in there, meaning that when a program rewrites packet data, this data is
private to the BPF prog (to avoid races/corruption) and the rewritten
data is correctly handled from there.


Sure thing. When I add something like bpf_sg_store_bytes(), I will make
sure to take care of rewrites.
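
For illustration only, such a helper could mirror bpf_skb_store_bytes()
but operate on the current sg element; the name, signature, and
semantics below are a hypothetical sketch, not code from this series:

/* Hypothetical bpf_sg_store_bytes() sketch: rewrite bytes within the
 * sg element the program is currently looking at. Not part of the RFC.
 */
BPF_CALL_4(bpf_sg_store_bytes, struct bpf_scatterlist *, bpfsg,
           u32, offset, const void *, from, u32, len)
{
        u8 *start = bpfsg->start;
        u8 *end = bpfsg->end;

        /* Only allow rewrites that stay inside the current element. */
        if (offset > end - start || len > end - start - offset)
                return -EFAULT;

        memcpy(start + offset, from, len);
        return 0;
}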

Thanks.

-Tushar




[PATCH net-next 13/13] selftests: mlxsw: Add scale test for resources

2018-06-29 Thread Petr Machata
From: Yuval Mintz 

Add a scale test capable of validating that offloaded network
functionality is indeed functional at scale when configured to
the different KVD profiles available.

Start by testing that offloaded routes are functional at scale by
passing traffic on each one of them in turn.

Signed-off-by: Yuval Mintz 
Signed-off-by: Petr Machata 
---
 .../drivers/net/mlxsw/spectrum/resource_scale.sh   | 55 ++
 1 file changed, 55 insertions(+)
 create mode 100755 
tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh

diff --git 
a/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh 
b/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh
new file mode 100755
index ..a0a80e1a69e8
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+NUM_NETIFS=6
+source ../../../../net/forwarding/lib.sh
+source ../../../../net/forwarding/tc_common.sh
+source devlink_lib_spectrum.sh
+
+current_test=""
+
+cleanup()
+{
+   pre_cleanup
+   if [ ! -z $current_test ]; then
+   ${current_test}_cleanup
+   fi
+   devlink_sp_size_kvd_to_default
+}
+
+devlink_sp_read_kvd_defaults
+trap cleanup EXIT
+
+ALL_TESTS="router tc_flower mirror_gre"
+for current_test in ${TESTS:-$ALL_TESTS}; do
+   source ${current_test}_scale.sh
+
+   num_netifs_var=${current_test^^}_NUM_NETIFS
+   num_netifs=${!num_netifs_var:-$NUM_NETIFS}
+
+   for profile in $KVD_PROFILES; do
+   RET=0
+   devlink_sp_resource_kvd_profile_set $profile
+   if [[ $RET -gt 0 ]]; then
+   log_test "'$current_test' [$profile] setting"
+   continue
+   fi
+
+   for should_fail in 0 1; do
+   RET=0
+   target=$(${current_test}_get_target "$should_fail")
+   ${current_test}_setup_prepare
+   setup_wait $num_netifs
+   ${current_test}_test "$target" "$should_fail"
+   ${current_test}_cleanup
+   if [[ "$should_fail" -eq 0 ]]; then
+   log_test "'$current_test' [$profile] $target"
+   else
+   log_test "'$current_test' [$profile] overflow $target"
+   fi
+   done
+   done
+done
+current_test=""
+
+exit "$RET"
-- 
2.4.11



[PATCH net-next 12/13] selftests: mlxsw: Add target for mirror-to-gretap test on spectrum

2018-06-29 Thread Petr Machata
Add a wrapper around mlxsw/mirror_gre_scale.sh that parameterizes the
number of offloadable mirrors on Spectrum machines.

Signed-off-by: Petr Machata 
Reviewed-by: Jiri Pirko 
---
 .../drivers/net/mlxsw/spectrum/mirror_gre_scale.sh  | 13 +
 1 file changed, 13 insertions(+)
 create mode 100644 
tools/testing/selftests/drivers/net/mlxsw/spectrum/mirror_gre_scale.sh

diff --git 
a/tools/testing/selftests/drivers/net/mlxsw/spectrum/mirror_gre_scale.sh 
b/tools/testing/selftests/drivers/net/mlxsw/spectrum/mirror_gre_scale.sh
new file mode 100644
index ..8d2186c7c62b
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum/mirror_gre_scale.sh
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0
+source ../mirror_gre_scale.sh
+
+mirror_gre_get_target()
+{
+   local should_fail=$1; shift
+
+   if ((! should_fail)); then
+   echo 3
+   else
+   echo 4
+   fi
+}
-- 
2.4.11



[PATCH net-next 11/13] selftests: mlxsw: Add scale test for mirror-to-gretap

2018-06-29 Thread Petr Machata
Test that it's possible to offload a given number of mirrors.

Signed-off-by: Petr Machata 
Reviewed-by: Jiri Pirko 
---
 .../drivers/net/mlxsw/mirror_gre_scale.sh  | 197 +
 1 file changed, 197 insertions(+)
 create mode 100644 
tools/testing/selftests/drivers/net/mlxsw/mirror_gre_scale.sh

diff --git a/tools/testing/selftests/drivers/net/mlxsw/mirror_gre_scale.sh 
b/tools/testing/selftests/drivers/net/mlxsw/mirror_gre_scale.sh
new file mode 100644
index ..6f3a70df63bc
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/mirror_gre_scale.sh
@@ -0,0 +1,197 @@
+# SPDX-License-Identifier: GPL-2.0
+
+# Test offloading a number of mirrors-to-gretap. The test creates a number of
+# tunnels. Then it adds one flower mirror for each of the tunnels, matching a
+# given host IP. Then it generates traffic at each of the host IPs and checks
+# that the traffic has been mirrored at the appropriate tunnel.
+#
+#   +--+   +--+
+#   | H1   |   |   H2 |
+#   | + $h1|   |$h2 + |
+#   | | 2001:db8:1:X::1/64 |   | 2001:db8:1:X::2/64 | |
+#   +-|+   +|-+
+# | |
+#   +-|-|-+
+#   | SW  o--> mirrors  | |
+#   | +---|-|---+ |
+#   | |   + $swp1BR   $swp2 +   | |
+#   | +-+ |
+#   | |
+#   | + $swp3  + gt6- (ip6gretap)  |
+#   | | 2001:db8:2:X::1/64 : loc=2001:db8:2:X::1  |
+#   | |: rem=2001:db8:2:X::2  |
+#   | |: ttl=100  |
+#   | |: tos=inherit  |
+#   | |:  |
+#   +-|:--+
+# |:
+#   +-|:--+
+#   | H3  + $h3+ h3-gt6- (ip6gretap)   |
+#   |   2001:db8:2:X::2/64   loc=2001:db8:2:X::2  |
+#   |rem=2001:db8:2:X::1  |
+#   |ttl=100  |
+#   |tos=inherit  |
+#   | |
+#   +-+
+
+source ../../../../net/forwarding/mirror_lib.sh
+
+MIRROR_NUM_NETIFS=6
+
+mirror_gre_ipv6_addr()
+{
+   local net=$1; shift
+   local num=$1; shift
+
+   printf "2001:db8:%x:%x" $net $num
+}
+
+mirror_gre_tunnels_create()
+{
+   local count=$1; shift
+   local should_fail=$1; shift
+
+   MIRROR_GRE_BATCH_FILE="$(mktemp)"
+   for ((i=0; i < count; ++i)); do
+   local match_dip=$(mirror_gre_ipv6_addr 1 $i)::2
+   local htun=h3-gt6-$i
+   local tun=gt6-$i
+
+   ((mirror_gre_tunnels++))
+
+   ip address add dev $h1 $(mirror_gre_ipv6_addr 1 $i)::1/64
+   ip address add dev $h2 $(mirror_gre_ipv6_addr 1 $i)::2/64
+
+   ip address add dev $swp3 $(mirror_gre_ipv6_addr 2 $i)::1/64
+   ip address add dev $h3 $(mirror_gre_ipv6_addr 2 $i)::2/64
+
+   tunnel_create $tun ip6gretap \
+ $(mirror_gre_ipv6_addr 2 $i)::1 \
+ $(mirror_gre_ipv6_addr 2 $i)::2 \
+ ttl 100 tos inherit allow-localremote
+
+   tunnel_create $htun ip6gretap \
+ $(mirror_gre_ipv6_addr 2 $i)::2 \
+ $(mirror_gre_ipv6_addr 2 $i)::1
+   ip link set $htun vrf v$h3
+   matchall_sink_create $htun
+
+   cat >> $MIRROR_GRE_BATCH_FILE <<-EOF
+   filter add dev $swp1 ingress pref 1000 \
+   protocol ipv6 \
+   flower $tcflags dst_ip $match_dip \
+   action mirred egress mirror dev $tun
+   EOF
+   done
+
+   tc -b $MIRROR_GRE_BATCH_FILE
+   check_err_fail $should_fail $? "Mirror rule insertion"
+}
+

[PATCH net-next 10/13] selftests: mlxsw: Add target for tc flower test on spectrum

2018-06-29 Thread Petr Machata
Add a wrapper around mlxsw/tc_flower_scale.sh that parameterizes the
generic tc flower scale test template with Spectrum-specific target
values.

Signed-off-by: Petr Machata 
Reviewed-by: Yuval Mintz 
---
 .../drivers/net/mlxsw/spectrum/tc_flower_scale.sh | 19 +++
 1 file changed, 19 insertions(+)
 create mode 100644 
tools/testing/selftests/drivers/net/mlxsw/spectrum/tc_flower_scale.sh

diff --git 
a/tools/testing/selftests/drivers/net/mlxsw/spectrum/tc_flower_scale.sh 
b/tools/testing/selftests/drivers/net/mlxsw/spectrum/tc_flower_scale.sh
new file mode 100644
index ..f9bfd8937765
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum/tc_flower_scale.sh
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: GPL-2.0
+source ../tc_flower_scale.sh
+
+tc_flower_get_target()
+{
+   local should_fail=$1; shift
+
+   # 6144 (6x1024) is the theoretical maximum.
+   # One bank of 512 rules is taken by the 18-byte MC router rule.
+   # One rule is the ACL catch-all.
+   # 6144 - 512 - 1 = 5631
+   local target=5631
+
+   if ((! should_fail)); then
+   echo $target
+   else
+   echo $((target + 1))
+   fi
+}
-- 
2.4.11



[PATCH net-next 09/13] selftests: mlxsw: Add tc flower scale test

2018-06-29 Thread Petr Machata
Add a test of the capacity to offload flower rules.

This is a generic portion of the test that is meant to be called from a
driver that supplies a particular number of rules to be tested with.

Signed-off-by: Petr Machata 
Reviewed-by: Yuval Mintz 
---
 .../selftests/drivers/net/mlxsw/tc_flower_scale.sh | 134 +
 1 file changed, 134 insertions(+)
 create mode 100644 tools/testing/selftests/drivers/net/mlxsw/tc_flower_scale.sh

diff --git a/tools/testing/selftests/drivers/net/mlxsw/tc_flower_scale.sh 
b/tools/testing/selftests/drivers/net/mlxsw/tc_flower_scale.sh
new file mode 100644
index ..a6d733d2a4b4
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/tc_flower_scale.sh
@@ -0,0 +1,134 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Test for resource limit of offloaded flower rules. The test adds a given
+# number of flower matches for different IPv6 addresses, then generates traffic,
+# and ensures each was hit exactly once. This file contains functions to set up
+# a testing topology and run the test, and is meant to be sourced from a test
+# script that calls the testing routine with a given number of rules.
+
+TC_FLOWER_NUM_NETIFS=2
+
+tc_flower_h1_create()
+{
+   simple_if_init $h1
+   tc qdisc add dev $h1 clsact
+}
+
+tc_flower_h1_destroy()
+{
+   tc qdisc del dev $h1 clsact
+   simple_if_fini $h1
+}
+
+tc_flower_h2_create()
+{
+   simple_if_init $h2
+   tc qdisc add dev $h2 clsact
+}
+
+tc_flower_h2_destroy()
+{
+   tc qdisc del dev $h2 clsact
+   simple_if_fini $h2
+}
+
+tc_flower_setup_prepare()
+{
+   h1=${NETIFS[p1]}
+   h2=${NETIFS[p2]}
+
+   vrf_prepare
+
+   tc_flower_h1_create
+   tc_flower_h2_create
+}
+
+tc_flower_cleanup()
+{
+   pre_cleanup
+
+   tc_flower_h2_destroy
+   tc_flower_h1_destroy
+
+   vrf_cleanup
+
+   if [[ -v TC_FLOWER_BATCH_FILE ]]; then
+   rm -f $TC_FLOWER_BATCH_FILE
+   fi
+}
+
+tc_flower_addr()
+{
+   local num=$1; shift
+
+   printf "2001:db8:1::%x" $num
+}
+
+tc_flower_rules_create()
+{
+   local count=$1; shift
+   local should_fail=$1; shift
+
+   TC_FLOWER_BATCH_FILE="$(mktemp)"
+
+   for ((i = 0; i < count; ++i)); do
+   cat >> $TC_FLOWER_BATCH_FILE <<-EOF
+   filter add dev $h2 ingress \
+   prot ipv6 \
+   pref 1000 \
+   flower $tcflags dst_ip $(tc_flower_addr $i) \
+   action drop
+   EOF
+   done
+
+   tc -b $TC_FLOWER_BATCH_FILE
+   check_err_fail $should_fail $? "Rule insertion"
+}
+
+__tc_flower_test()
+{
+   local count=$1; shift
+   local should_fail=$1; shift
+   local last=$((count - 1))
+
+   tc_flower_rules_create $count $should_fail
+
+   for ((i = 0; i < count; ++i)); do
+   $MZ $h1 -q -c 1 -t ip -p 20 -b bc -6 \
+   -A 2001:db8:2::1 \
+   -B $(tc_flower_addr $i)
+   done
+
+   MISMATCHES=$(
+   tc -j -s filter show dev $h2 ingress |
+   jq -r '[ .[] | select(.kind == "flower") | .options |
+values as $rule | .actions[].stats.packets |
+select(. != 1) | "\(.) on \($rule.keys.dst_ip)" ] |
+  join(", ")'
+   )
+
+   test -z "$MISMATCHES"
+   check_err $? "Expected to capture 1 packet for each IP, but got $MISMATCHES"
+}
+
+tc_flower_test()
+{
+   local count=$1; shift
+   local should_fail=$1; shift
+
+   # We use lower 16 bits of IPv6 address for match. Also there are only 16
+   # bits of rule priority space.
+   if ((count > 65536)); then
+   check_err 1 "Invalid count of $count. At most 65536 rules supported"
+   return
+   fi
+
+   if ! tc_offload_check $TC_FLOWER_NUM_NETIFS; then
+   check_err 1 "Could not test offloaded functionality"
+   return
+   fi
+
+   tcflags="skip_sw"
+   __tc_flower_test $count $should_fail
+}
-- 
2.4.11



[PATCH net-next 08/13] selftests: mlxsw: Add target for router test on spectrum

2018-06-29 Thread Petr Machata
From: Yuval Mintz 

IPv4 routes in Spectrum are based on the kvd single-hash, but as it's
a hash we need to assume we cannot reach 100% of its capacity.

Add a wrapper that provides us with good/bad target numbers for the
Spectrum ASIC.

Signed-off-by: Yuval Mintz 
Reviewed-by: Petr Machata 
[pe...@mellanox.com: Drop shebang.]
Signed-off-by: Petr Machata 
---
 .../drivers/net/mlxsw/spectrum/router_scale.sh | 18 ++
 1 file changed, 18 insertions(+)
 create mode 100644 
tools/testing/selftests/drivers/net/mlxsw/spectrum/router_scale.sh

diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum/router_scale.sh 
b/tools/testing/selftests/drivers/net/mlxsw/spectrum/router_scale.sh
new file mode 100644
index ..21c4697d5bab
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum/router_scale.sh
@@ -0,0 +1,18 @@
+# SPDX-License-Identifier: GPL-2.0
+source ../router_scale.sh
+
+router_get_target()
+{
+   local should_fail=$1
+   local target
+
+   target=$(devlink_resource_size_get kvd hash_single)
+
+   if [[ $should_fail -eq 0 ]]; then
+   target=$((target * 85 / 100))
+   else
+   target=$((target + 1))
+   fi
+
+   echo $target
+}
-- 
2.4.11



[PATCH net-next 07/13] selftests: mlxsw: Add router test

2018-06-29 Thread Petr Machata
From: Arkadi Sharshevsky 

This test is aimed at both standalone use and internal use by the
resource infra. The test receives the number of routes to offload and
checks:
- The routes were offloaded correctly.
- Traffic passes for each route.

Signed-off-by: Arkadi Sharshevsky 
Signed-off-by: Yuval Mintz 
Reviewed-by: Petr Machata 
Signed-off-by: Petr Machata 
---
 .../selftests/drivers/net/mlxsw/router_scale.sh| 167 +
 1 file changed, 167 insertions(+)
 create mode 100644 tools/testing/selftests/drivers/net/mlxsw/router_scale.sh

diff --git a/tools/testing/selftests/drivers/net/mlxsw/router_scale.sh 
b/tools/testing/selftests/drivers/net/mlxsw/router_scale.sh
new file mode 100644
index ..d231649b4f01
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/router_scale.sh
@@ -0,0 +1,167 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ROUTER_NUM_NETIFS=4
+
+router_h1_create()
+{
+   simple_if_init $h1 192.0.1.1/24
+   ip route add 193.0.0.0/8 via 192.0.1.2 dev $h1
+}
+
+router_h1_destroy()
+{
+   ip route del 193.0.0.0/8 via 192.0.1.2 dev $h1
+   simple_if_fini $h1 192.0.1.1/24
+}
+
+router_h2_create()
+{
+   simple_if_init $h2 192.0.2.1/24
+   tc qdisc add dev $h2 handle ffff: ingress
+}
+
+router_h2_destroy()
+{
+   tc qdisc del dev $h2 handle ffff: ingress
+   simple_if_fini $h2 192.0.2.1/24
+}
+
+router_create()
+{
+   ip link set dev $rp1 up
+   ip link set dev $rp2 up
+
+   ip address add 192.0.1.2/24 dev $rp1
+   ip address add 192.0.2.2/24 dev $rp2
+}
+
+router_destroy()
+{
+   ip address del 192.0.2.2/24 dev $rp2
+   ip address del 192.0.1.2/24 dev $rp1
+
+   ip link set dev $rp2 down
+   ip link set dev $rp1 down
+}
+
+router_setup_prepare()
+{
+   h1=${NETIFS[p1]}
+   rp1=${NETIFS[p2]}
+
+   rp2=${NETIFS[p3]}
+   h2=${NETIFS[p4]}
+
+   h1mac=$(mac_get $h1)
+   rp1mac=$(mac_get $rp1)
+
+   vrf_prepare
+
+   router_h1_create
+   router_h2_create
+
+   router_create
+}
+
+router_offload_validate()
+{
+   local route_count=$1
+   local offloaded_count
+
+   offloaded_count=$(ip route | grep -o 'offload' | wc -l)
+   [[ $offloaded_count -ge $route_count ]]
+}
+
+router_routes_create()
+{
+   local route_count=$1
+   local count=0
+
+   ROUTE_FILE="$(mktemp)"
+
+   for i in {0..255}
+   do
+   for j in {0..255}
+   do
+   for k in {0..255}
+   do
+   if [[ $count -eq $route_count ]]; then
+   break 3
+   fi
+
+   echo route add 193.${i}.${j}.${k}/32 via \
+  192.0.2.1 dev $rp2  >> $ROUTE_FILE
+   ((count++))
+   done
+   done
+   done
+
+   ip -b $ROUTE_FILE &> /dev/null
+}
+
+router_routes_destroy()
+{
+   if [[ -v ROUTE_FILE ]]; then
+   rm -f $ROUTE_FILE
+   fi
+}
+
+router_test()
+{
+   local route_count=$1
+   local should_fail=$2
+   local count=0
+
+   RET=0
+
+   router_routes_create $route_count
+
+   router_offload_validate $route_count
+   check_err_fail $should_fail $? "Offload of $route_count routes"
+   if [[ $RET -ne 0 ]] || [[ $should_fail -eq 1 ]]; then
+   return
+   fi
+
+   tc filter add dev $h2 ingress protocol ip pref 1 flower \
+   skip_sw dst_ip 193.0.0.0/8 action drop
+
+   for i in {0..255}
+   do
+   for j in {0..255}
+   do
+   for k in {0..255}
+   do
+   if [[ $count -eq $route_count ]]; then
+   break 3
+   fi
+
+   $MZ $h1 -c 1 -p 64 -a $h1mac -b $rp1mac \
+   -A 192.0.1.1 -B 193.${i}.${j}.${k} \
+   -t ip -q
+   ((count++))
+   done
+   done
+   done
+
+   tc_check_packets "dev $h2 ingress" 1 $route_count
+   check_err $? "Offload mismatch"
+
+   tc filter del dev $h2 ingress protocol ip pref 1 flower \
+   skip_sw dst_ip 193.0.0.0/8 action drop
+
+   router_routes_destroy
+}
+
+router_cleanup()
+{
+   pre_cleanup
+
+   router_routes_destroy
+   router_destroy
+
+   router_h2_destroy
+   router_h1_destroy
+
+   vrf_cleanup
+}
-- 
2.4.11



[PATCH net-next 06/13] selftests: mlxsw: Add devlink KVD resource test

2018-06-29 Thread Petr Machata
From: Yuval Mintz 

Add a selftest that can be used to perform a basic sanity check of the
devlink resource API, as well as test the behavior of KVD manipulation
in the driver.

This is the first case of a HW-only test - in order to test the devlink
resource a driver capable of exposing resources has to be provided
first.

Signed-off-by: Yuval Mintz 
[pe...@mellanox.com: Extracted two patches out of this patch. Tweaked
commit message.]
Signed-off-by: Petr Machata 
---
 .../net/mlxsw/spectrum/devlink_resources.sh| 117 +
 1 file changed, 117 insertions(+)
 create mode 100755 
tools/testing/selftests/drivers/net/mlxsw/spectrum/devlink_resources.sh

diff --git 
a/tools/testing/selftests/drivers/net/mlxsw/spectrum/devlink_resources.sh 
b/tools/testing/selftests/drivers/net/mlxsw/spectrum/devlink_resources.sh
new file mode 100755
index ..b1fe960e398a
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum/devlink_resources.sh
@@ -0,0 +1,117 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+NUM_NETIFS=1
+source devlink_lib_spectrum.sh
+
+setup_prepare()
+{
+   devlink_sp_read_kvd_defaults
+}
+
+cleanup()
+{
+   pre_cleanup
+   devlink_sp_size_kvd_to_default
+}
+
+trap cleanup EXIT
+
+setup_prepare
+
+profiles_test()
+{
+   local i
+
+   log_info "Running profile tests"
+
+   for i in $KVD_PROFILES; do
+   RET=0
+   devlink_sp_resource_kvd_profile_set $i
+   log_test "'$i' profile"
+   done
+
+   # Default is explicitly tested at end to ensure it's actually applied
+   RET=0
+   devlink_sp_resource_kvd_profile_set "default"
+   log_test "'default' profile"
+}
+
+resources_min_test()
+{
+   local size
+   local i
+   local j
+
+   log_info "Running KVD-minimum tests"
+
+   for i in $KVD_CHILDREN; do
+   RET=0
+   size=$(devlink_resource_get kvd "$i" | jq '.["size_min"]')
+   devlink_resource_size_set "$size" kvd "$i"
+
+   # In case of linear, need to minimize sub-resources as well
+   if [[ "$i" == "linear" ]]; then
+   for j in $KVDL_CHILDREN; do
+   devlink_resource_size_set 0 kvd linear "$j"
+   done
+   fi
+
+   devlink_reload
+   devlink_sp_size_kvd_to_default
+   log_test "'$i' minimize [$size]"
+   done
+}
+
+resources_max_test()
+{
+   local min_size
+   local size
+   local i
+   local j
+
+   log_info "Running KVD-maximum tests"
+   for i in $KVD_CHILDREN; do
+   RET=0
+   devlink_sp_resource_minimize
+
+   # Calculate the maximum possible size for the given partition
+   size=$(devlink_resource_size_get kvd)
+   for j in $KVD_CHILDREN; do
+   if [ "$i" != "$j" ]; then
+   min_size=$(devlink_resource_get kvd "$j" | \
+  jq '.["size_min"]')
+   size=$((size - min_size))
+   fi
+   done
+
+   # Test almost maximum size
+   devlink_resource_size_set "$((size - 128))" kvd "$i"
+   devlink_reload
+   log_test "'$i' almost maximize [$((size - 128))]"
+
+   # Test above maximum size
+   devlink resource set "$DEVLINK_DEV" \
+   path "kvd/$i" size $((size + 128)) &> /dev/null
+   check_fail $? "Set kvd/$i to size $((size + 128)) should fail"
+   log_test "'$i' Overflow rejection [$((size + 128))]"
+
+   # Test maximum size
+   if [ "$i" == "hash_single" ] || [ "$i" == "hash_double" ]; then
+   echo "SKIP: Observed problem with exact max $i"
+   continue
+   fi
+
+   devlink_resource_size_set "$size" kvd "$i"
+   devlink_reload
+   log_test "'$i' maximize [$size]"
+
+   devlink_sp_size_kvd_to_default
+   done
+}
+
+profiles_test
+resources_min_test
+resources_max_test
+
+exit "$RET"
-- 
2.4.11



[PATCH net-next 05/13] selftests: mlxsw: Add devlink_lib_spectrum.sh

2018-06-29 Thread Petr Machata
This library builds on top of devlink_lib.sh and contains functionality
specific to Spectrum ASICs, e.g., re-partitioning the various KVD
sub-parts.

Signed-off-by: Yuval Mintz 
[pe...@mellanox.com: Split this out from another patch. Fix line length
in devlink_sp_read_kvd_defaults().]
Signed-off-by: Petr Machata 
---
 MAINTAINERS|   1 +
 .../net/mlxsw/spectrum/devlink_lib_spectrum.sh | 119 +
 2 files changed, 120 insertions(+)
 create mode 100644 
tools/testing/selftests/drivers/net/mlxsw/spectrum/devlink_lib_spectrum.sh

diff --git a/MAINTAINERS b/MAINTAINERS
index f3e766e504d8..f3dd6c4f61e1 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9154,6 +9154,7 @@ S:Supported
 W: http://www.mellanox.com
 Q: http://patchwork.ozlabs.org/project/netdev/list/
 F: drivers/net/ethernet/mellanox/mlxsw/
+F: tools/testing/selftests/drivers/net/mlxsw/
 
 MELLANOX FIRMWARE FLASH LIBRARY (mlxfw)
 M: ml...@mellanox.com
diff --git 
a/tools/testing/selftests/drivers/net/mlxsw/spectrum/devlink_lib_spectrum.sh 
b/tools/testing/selftests/drivers/net/mlxsw/spectrum/devlink_lib_spectrum.sh
new file mode 100644
index ..73035e25085d
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum/devlink_lib_spectrum.sh
@@ -0,0 +1,119 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+source "../../../../net/forwarding/devlink_lib.sh"
+
+if [ "$DEVLINK_VIDDID" != "15b3:cb84" ]; then
+   echo "SKIP: test is tailored for Mellanox Spectrum"
+   exit 1
+fi
+
+# Needed for returning to default
+declare -A KVD_DEFAULTS
+
+KVD_CHILDREN="linear hash_single hash_double"
+KVDL_CHILDREN="singles chunks large_chunks"
+
+devlink_sp_resource_minimize()
+{
+   local size
+   local i
+
+   for i in $KVD_CHILDREN; do
+   size=$(devlink_resource_get kvd "$i" | jq '.["size_min"]')
+   devlink_resource_size_set "$size" kvd "$i"
+   done
+
+   for i in $KVDL_CHILDREN; do
+   size=$(devlink_resource_get kvd linear "$i" | \
+  jq '.["size_min"]')
+   devlink_resource_size_set "$size" kvd linear "$i"
+   done
+}
+
+devlink_sp_size_kvd_to_default()
+{
+   local need_reload=0
+   local i
+
+   for i in $KVD_CHILDREN; do
+   local size=$(echo "${KVD_DEFAULTS[kvd_$i]}" | jq '.["size"]')
+   current_size=$(devlink_resource_size_get kvd "$i")
+
+   if [ "$size" -ne "$current_size" ]; then
+   devlink_resource_size_set "$size" kvd "$i"
+   need_reload=1
+   fi
+   done
+
+   for i in $KVDL_CHILDREN; do
+   local size=$(echo "${KVD_DEFAULTS[kvd_linear_$i]}" | \
+jq '.["size"]')
+   current_size=$(devlink_resource_size_get kvd linear "$i")
+
+   if [ "$size" -ne "$current_size" ]; then
+   devlink_resource_size_set "$size" kvd linear "$i"
+   need_reload=1
+   fi
+   done
+
+   if [ "$need_reload" -ne "0" ]; then
+   devlink_reload
+   fi
+}
+
+devlink_sp_read_kvd_defaults()
+{
+   local key
+   local i
+
+   KVD_DEFAULTS[kvd]=$(devlink_resource_get "kvd")
+   for i in $KVD_CHILDREN; do
+   key=kvd_$i
+   KVD_DEFAULTS[$key]=$(devlink_resource_get kvd "$i")
+   done
+
+   for i in $KVDL_CHILDREN; do
+   key=kvd_linear_$i
+   KVD_DEFAULTS[$key]=$(devlink_resource_get kvd linear "$i")
+   done
+}
+
+KVD_PROFILES="default scale ipv4_max"
+
+devlink_sp_resource_kvd_profile_set()
+{
+   local profile=$1
+
+   case "$profile" in
+   scale)
+   devlink_resource_size_set 64000 kvd linear
+   devlink_resource_size_set 15616 kvd linear singles
+   devlink_resource_size_set 32000 kvd linear chunks
+   devlink_resource_size_set 16384 kvd linear large_chunks
+   devlink_resource_size_set 128000 kvd hash_single
+   devlink_resource_size_set 48000 kvd hash_double
+   devlink_reload
+   ;;
+   ipv4_max)
+   devlink_resource_size_set 64000 kvd linear
+   devlink_resource_size_set 15616 kvd linear singles
+   devlink_resource_size_set 32000 kvd linear chunks
+   devlink_resource_size_set 16384 kvd linear large_chunks
+   devlink_resource_size_set 144000 kvd hash_single
+   devlink_resource_size_set 32768 kvd hash_double
+   devlink_reload
+   ;;
+   default)
+   devlink_resource_size_set 98304 kvd linear
+   devlink_resource_size_set 16384 kvd linear singles
+   devlink_resource_size_set 49152 kvd linear chunks
+   devlink_resource_size_set 32768 kvd linear large_chunks
+   

[PATCH net-next 04/13] selftests: forwarding: Add devlink_lib.sh

2018-06-29 Thread Petr Machata
This helper library contains wrappers to devlink functionality agnostic
to the underlying device.

Signed-off-by: Yuval Mintz 
[pe...@mellanox.com: Split this out from another patch.]
Signed-off-by: Petr Machata 
---
 .../selftests/net/forwarding/devlink_lib.sh| 108 +
 1 file changed, 108 insertions(+)
 create mode 100644 tools/testing/selftests/net/forwarding/devlink_lib.sh

diff --git a/tools/testing/selftests/net/forwarding/devlink_lib.sh 
b/tools/testing/selftests/net/forwarding/devlink_lib.sh
new file mode 100644
index ..5ab1e5f43022
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/devlink_lib.sh
@@ -0,0 +1,108 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+##
+# Source library
+
+relative_path="${BASH_SOURCE%/*}"
+if [[ "$relative_path" == "${BASH_SOURCE}" ]]; then
+   relative_path="."
+fi
+
+source "$relative_path/lib.sh"
+
+##
+# Defines
+
+DEVLINK_DEV=$(devlink port show | grep "${NETIFS[p1]}" | \
+ grep -v "${NETIFS[p1]}[0-9]" | cut -d" " -f1 | \
+ rev | cut -d"/" -f2- | rev)
+if [ -z "$DEVLINK_DEV" ]; then
+   echo "SKIP: ${NETIFS[p1]} has no devlink device registered for it"
+   exit 1
+fi
+if [[ "$(echo $DEVLINK_DEV | grep -c pci)" -eq 0 ]]; then
+   echo "SKIP: devlink device's bus is not PCI"
+   exit 1
+fi
+
+DEVLINK_VIDDID=$(lspci -s $(echo $DEVLINK_DEV | cut -d"/" -f2) \
+-n | cut -d" " -f3)
+
+##
+# Sanity checks
+
+devlink -j resource show "$DEVLINK_DEV" &> /dev/null
+if [ $? -ne 0 ]; then
+   echo "SKIP: iproute2 too old, missing devlink resource support"
+   exit 1
+fi
+
+##
+# Devlink helpers
+
+devlink_resource_names_to_path()
+{
+   local resource
+   local path=""
+
+   for resource in "${@}"; do
+   if [ "$path" == "" ]; then
+   path="$resource"
+   else
+   path="${path}/$resource"
+   fi
+   done
+
+   echo "$path"
+}
+
+devlink_resource_get()
+{
+   local name=$1
+   local resource_name=.[][\"$DEVLINK_DEV\"]
+
+   resource_name="$resource_name | .[] | select (.name == \"$name\")"
+
+   shift
+   for resource in "${@}"; do
+   resource_name="${resource_name} | .[\"resources\"][] | \
+  select (.name == \"$resource\")"
+   done
+
+   devlink -j resource show "$DEVLINK_DEV" | jq "$resource_name"
+}
+
+devlink_resource_size_get()
+{
+   local size=$(devlink_resource_get "$@" | jq '.["size_new"]')
+
+   if [ "$size" == "null" ]; then
+   devlink_resource_get "$@" | jq '.["size"]'
+   else
+   echo "$size"
+   fi
+}
+
+devlink_resource_size_set()
+{
+   local new_size=$1
+   local path
+
+   shift
+   path=$(devlink_resource_names_to_path "$@")
+   devlink resource set "$DEVLINK_DEV" path "$path" size "$new_size"
+   check_err $? "Failed setting path $path to size $new_size"
+}
+
+devlink_reload()
+{
+   local still_pending
+
+   devlink dev reload "$DEVLINK_DEV" &> /dev/null
+   check_err $? "Failed reload"
+
+   still_pending=$(devlink resource show "$DEVLINK_DEV" | \
+   grep -c "size_new")
+   check_err $still_pending "Failed reload - There are still unset sizes"
+}
-- 
2.4.11



[PATCH net-next 03/13] selftests: forwarding: lib: Parameterize NUM_NETIFS in two functions

2018-06-29 Thread Petr Machata
setup_wait() and tc_offload_check() both assume that all NUM_NETIFS
interfaces are relevant for a given test. However, the scale test script
acts as an umbrella for a number of sub-tests, some of which may not
require all the interfaces.

Thus it's suboptimal for tc_offload_check() to query all the interfaces.
In case of setup_wait() it's incorrect, because the sub-test in question
of course doesn't configure any interfaces beyond what it needs, and
setup_wait() then ends up waiting indefinitely for the extraneous
interfaces to come up.

For that reason, give setup_wait() and tc_offload_check() an optional
parameter with a number of interfaces to probe. Fall back to global
NUM_NETIFS if the parameter is not given.

Signed-off-by: Petr Machata 
Reviewed-by: Yuval Mintz 
---
 tools/testing/selftests/net/forwarding/lib.sh | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/net/forwarding/lib.sh 
b/tools/testing/selftests/net/forwarding/lib.sh
index 5f4b7ed2c65a..e073918bbe15 100644
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -220,7 +220,9 @@ setup_wait_dev()
 
 setup_wait()
 {
-   for i in $(eval echo {1..$NUM_NETIFS}); do
+   local num_netifs=${1:-$NUM_NETIFS}
+
+   for ((i = 1; i <= num_netifs; ++i)); do
setup_wait_dev ${NETIFS[p$i]}
done
 
@@ -481,7 +483,9 @@ forwarding_restore()
 
 tc_offload_check()
 {
-   for i in $(eval echo {1..$NUM_NETIFS}); do
+   local num_netifs=${1:-$NUM_NETIFS}
+
+   for ((i = 1; i <= num_netifs; ++i)); do
ethtool -k ${NETIFS[p$i]} \
| grep "hw-tc-offload: on" &> /dev/null
if [[ $? -ne 0 ]]; then
-- 
2.4.11



[PATCH net-next 01/13] selftests: forwarding: Allow lib.sh sourcing from other directories

2018-06-29 Thread Petr Machata
From: Yuval Mintz 

The devlink related scripts are mlxsw-specific. As a result, they'll
reside in a different directory - but would still need the common logic
implemented in lib.sh.
So as a preliminary step, allow lib.sh to be sourced from other
directories as well.

Signed-off-by: Yuval Mintz 
Signed-off-by: Petr Machata 
---
 tools/testing/selftests/net/forwarding/lib.sh | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/net/forwarding/lib.sh 
b/tools/testing/selftests/net/forwarding/lib.sh
index d1f14f83979e..59272824ef37 100644
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -14,8 +14,13 @@ PAUSE_ON_CLEANUP=${PAUSE_ON_CLEANUP:=no}
 NETIF_TYPE=${NETIF_TYPE:=veth}
 NETIF_CREATE=${NETIF_CREATE:=yes}
 
-if [[ -f forwarding.config ]]; then
-   source forwarding.config
+relative_path="${BASH_SOURCE%/*}"
+if [[ "$relative_path" == "${BASH_SOURCE}" ]]; then
+   relative_path="."
+fi
+
+if [[ -f $relative_path/forwarding.config ]]; then
+   source "$relative_path/forwarding.config"
 fi
 
 ##
-- 
2.4.11



[PATCH net-next 02/13] selftests: forwarding: lib: Add check_err_fail()

2018-06-29 Thread Petr Machata
In the scale testing scenarios, one usually has a condition that is
expected to either fail, or pass, depending on which side of the scale
is being tested.

To capture this logic, add a function check_err_fail(), which dispatches
either to check_err() or check_fail(), depending on the value of the
first argument, should_fail.

Signed-off-by: Petr Machata 
Reviewed-by: Yuval Mintz 
---
 tools/testing/selftests/net/forwarding/lib.sh | 13 +
 1 file changed, 13 insertions(+)

diff --git a/tools/testing/selftests/net/forwarding/lib.sh 
b/tools/testing/selftests/net/forwarding/lib.sh
index 59272824ef37..5f4b7ed2c65a 100644
--- a/tools/testing/selftests/net/forwarding/lib.sh
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -156,6 +156,19 @@ check_fail()
fi
 }
 
+check_err_fail()
+{
+   local should_fail=$1; shift
+   local err=$1; shift
+   local what=$1; shift
+
+   if ((should_fail)); then
+   check_fail $err "$what succeeded, but should have failed"
+   else
+   check_err $err "$what failed"
+   fi
+}
+
 log_test()
 {
local test_name=$1
-- 
2.4.11



[PATCH net-next 00/13] mlxsw: Add resource scale tests

2018-06-29 Thread Petr Machata
There are a number of tests that check features of the Linux networking
stack. By running them on suitable interfaces, one can exercise the
mlxsw offloading code. However none of these tests attempts to push
mlxsw to the limits supported by the ASIC.

As an additional wrinkle, the "limits supported by the ASIC" themselves
may not be a set of fixed numbers, but rather depend on a profile that
determines how the ASIC resources are allocated for different purposes.

This patchset introduces several tests that verify capability of mlxsw
to offload amounts of routes, flower rules, and mirroring sessions that
match predicted ASIC capacity, at different configuration profiles.
Additionally they verify that amounts exceeding the predicted capacity
can *not* be offloaded.

These are not generic tests, but ones that are tailored for mlxsw
specifically. For that reason they are not added to net/forwarding
selftests subdirectory, but rather to a newly-added drivers/net/mlxsw.

Patches #1, #2 and #3 tweak the generic forwarding/lib.sh to support the
new additions.

In patches #4 and #5, new libraries for interfacing with devlink are
introduced, first a generic one, then a Spectrum-specific one.

In patch #6, a devlink resource test is introduced.

Patches #7 and #8, #9 and #10, and #11 and #12 introduce three scale
tests: router, flower and mirror-to-gretap. The first of each pair of
patches introduces a generic portion of the test (mlxsw-specific), the
second introduces a Spectrum-specific wrapper.

Patch #13 then introduces a scale test driver that runs (possibly a
subset of) the tests introduced by patches from previous paragraph.

Arkadi Sharshevsky (1):
  selftests: mlxsw: Add router test

Petr Machata (8):
  selftests: forwarding: lib: Add check_err_fail()
  selftests: forwarding: lib: Parameterize NUM_NETIFS in two functions
  selftests: forwarding: Add devlink_lib.sh
  selftests: mlxsw: Add devlink_lib_spectrum.sh
  selftests: mlxsw: Add tc flower scale test
  selftests: mlxsw: Add target for tc flower test on spectrum
  selftests: mlxsw: Add scale test for mirror-to-gretap
  selftests: mlxsw: Add target for mirror-to-gretap test on spectrum

Yuval Mintz (4):
  selftests: forwarding: Allow lib.sh sourcing from other directories
  selftests: mlxsw: Add devlink KVD resource test
  selftests: mlxsw: Add target for router test on spectrum
  selftests: mlxsw: Add scale test for resources

 MAINTAINERS|   1 +
 .../drivers/net/mlxsw/mirror_gre_scale.sh  | 197 +
 .../selftests/drivers/net/mlxsw/router_scale.sh| 167 +
 .../net/mlxsw/spectrum/devlink_lib_spectrum.sh | 119 +
 .../net/mlxsw/spectrum/devlink_resources.sh| 117 
 .../drivers/net/mlxsw/spectrum/mirror_gre_scale.sh |  13 ++
 .../drivers/net/mlxsw/spectrum/resource_scale.sh   |  55 ++
 .../drivers/net/mlxsw/spectrum/router_scale.sh |  18 ++
 .../drivers/net/mlxsw/spectrum/tc_flower_scale.sh  |  19 ++
 .../selftests/drivers/net/mlxsw/tc_flower_scale.sh | 134 ++
 .../selftests/net/forwarding/devlink_lib.sh| 108 +++
 tools/testing/selftests/net/forwarding/lib.sh  |  30 +++-
 12 files changed, 974 insertions(+), 4 deletions(-)
 create mode 100644 
tools/testing/selftests/drivers/net/mlxsw/mirror_gre_scale.sh
 create mode 100644 tools/testing/selftests/drivers/net/mlxsw/router_scale.sh
 create mode 100644 
tools/testing/selftests/drivers/net/mlxsw/spectrum/devlink_lib_spectrum.sh
 create mode 100755 
tools/testing/selftests/drivers/net/mlxsw/spectrum/devlink_resources.sh
 create mode 100644 
tools/testing/selftests/drivers/net/mlxsw/spectrum/mirror_gre_scale.sh
 create mode 100755 
tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh
 create mode 100644 
tools/testing/selftests/drivers/net/mlxsw/spectrum/router_scale.sh
 create mode 100644 
tools/testing/selftests/drivers/net/mlxsw/spectrum/tc_flower_scale.sh
 create mode 100644 tools/testing/selftests/drivers/net/mlxsw/tc_flower_scale.sh
 create mode 100644 tools/testing/selftests/net/forwarding/devlink_lib.sh

-- 
2.4.11



Re: [RFC v2 PATCH 2/4] ebpf: Add sg_filter_run and sg helper

2018-06-29 Thread Tushar Dave

On 06/29/2018 01:32 AM, Daniel Borkmann wrote:

On 06/19/2018 08:00 PM, Tushar Dave wrote:
[...]

+int sg_filter_run(struct sock *sk, struct scatterlist *sg)
+{
+   struct sk_filter *filter;
+   int err;
+
+   rcu_read_lock();
+   filter = rcu_dereference(sk->sk_filter);
+   if (filter) {
+   struct bpf_scatterlist bpfsg;
+   int num_sg;
+
+   if (!sg) {
+   err = -EINVAL;
+   goto out;
+   }
+
+   num_sg = sg_nents(sg);
+   if (num_sg <= 0) {
+   err = -EINVAL;
+   goto out;
+   }
+
+   /* We store a reference to the sg list so it can later be used by
+* eBPF helpers to retrieve the next sg element.
+*/
+   bpfsg.num_sg = num_sg;
+   bpfsg.cur_sg = 0;
+   bpfsg.sg = sg;
+
+   /* For the first sg element, we store the pkt access pointers
+* into start and end so eBPF program can have pkt access using
+* data and data_end. The pkt access for subsequent element of
+* sg list is possible when eBPF program invokes bpf_sg_next
+* which takes care of setting start and end to the correct sg
+* element.
+*/
+   bpfsg.start = sg_virt(sg);
+   bpfsg.end = bpfsg.start + sg->length;
+   BPF_PROG_RUN(filter->prog, &bpfsg);


The return code from the BPF prog is ignored entirely here; I thought you
wanted to use it also for dropping packets? If the UAPI gets frozen like
this, then it's baked in stone.


Yeah, I am going to add the return codes necessary for pass, drop and
forward. I will do that. Thanks.
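
A rough sketch of what that verdict handling in sg_filter_run() could
look like; the SG_PASS/SG_DROP values below are placeholders, as no such
UAPI exists yet:

        u32 res;

        res = BPF_PROG_RUN(filter->prog, &bpfsg);
        switch (res) {
        case SG_PASS:
                err = 0;
                break;
        case SG_DROP:
                err = -EPERM;
                break;
        default:
                err = -EINVAL;  /* unknown verdict from the program */
        }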

-Tushar




+   }
+out:
+   rcu_read_unlock();
+
+   return err;
+}
+EXPORT_SYMBOL(sg_filter_run);




Re: [RFC v2 PATCH 2/4] ebpf: Add sg_filter_run and sg helper

2018-06-29 Thread Tushar Dave

On 06/29/2018 01:18 AM, Daniel Borkmann wrote:

On 06/19/2018 08:00 PM, Tushar Dave wrote:

When sg_filter_run() is invoked it runs the attached eBPF
SOCKET_SG_FILTER program which deals with struct scatterlist.

In addition, this patch also adds bpf_sg_next helper function that
allows users to retrieve the next sg element from sg list.

Signed-off-by: Tushar Dave 
Acked-by: Sowmini Varadhan 
---
  include/linux/filter.h|  2 +
  include/uapi/linux/bpf.h  | 10 -
  net/core/filter.c | 72 +++
  tools/include/uapi/linux/bpf.h| 10 -
  tools/testing/selftests/bpf/bpf_helpers.h |  3 ++
  5 files changed, 95 insertions(+), 2 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 71618b1..d176402 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1072,4 +1072,6 @@ struct bpf_sock_ops_kern {
 */
  };
  
+int sg_filter_run(struct sock *sk, struct scatterlist *sg);

+
  #endif /* __LINUX_FILTER_H__ */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index ef0a7b6..036432b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2076,6 +2076,13 @@ struct bpf_stack_build_id {
   *Return
   *A 64-bit integer containing the current cgroup id based
   *on the cgroup within which the current task is running.
+ *
+ * int bpf_sg_next(struct bpf_scatterlist *sg)
+ * Description
+ * This helper allows user to retrieve next sg element from
+ * sg list.
+ * Return
+ * Returns 0 on success, or a negative error in case of failure.
   */
  #define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -2158,7 +2165,8 @@ struct bpf_stack_build_id {
FN(rc_repeat),  \
FN(rc_keydown), \
FN(skb_cgroup_id),  \
-   FN(get_current_cgroup_id),
+   FN(get_current_cgroup_id),  \
+   FN(sg_next),
  
  /* integer value in 'imm' field of BPF_CALL instruction selects which helper

   * function eBPF program intends to call
diff --git a/net/core/filter.c b/net/core/filter.c
index 8f67942..702ff5b 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -121,6 +121,53 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
  }
  EXPORT_SYMBOL(sk_filter_trim_cap);
  
+int sg_filter_run(struct sock *sk, struct scatterlist *sg)

+{
+   struct sk_filter *filter;
+   int err;
+
+   rcu_read_lock();
+   filter = rcu_dereference(sk->sk_filter);
+   if (filter) {
+   struct bpf_scatterlist bpfsg;
+   int num_sg;
+
+   if (!sg) {
+   err = -EINVAL;
+   goto out;
+   }
+
+   num_sg = sg_nents(sg);
+   if (num_sg <= 0) {
+   err = -EINVAL;
+   goto out;
+   }
+
+   /* We store a reference to the sg list so it can later be used by
+* eBPF helpers to retrieve the next sg element.
+*/
+   bpfsg.num_sg = num_sg;
+   bpfsg.cur_sg = 0;
+   bpfsg.sg = sg;
+
+   /* For the first sg element, we store the pkt access pointers
+* into start and end so eBPF program can have pkt access using
+* data and data_end. The pkt access for subsequent element of
+* sg list is possible when eBPF program invokes bpf_sg_next
+* which takes care of setting start and end to the correct sg
+* element.
+*/
+   bpfsg.start = sg_virt(sg);
+   bpfsg.end = bpfsg.start + sg->length;
+   BPF_PROG_RUN(filter->prog, &bpfsg);
+   }
+out:
+   rcu_read_unlock();
+
+   return err;
+}
+EXPORT_SYMBOL(sg_filter_run);
+
  BPF_CALL_1(bpf_skb_get_pay_offset, struct sk_buff *, skb)
  {
return skb_get_poff(skb);
@@ -3753,6 +3800,29 @@ static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
.arg1_type  = ARG_PTR_TO_CTX,
  };
  
+BPF_CALL_1(bpf_sg_next, struct bpf_scatterlist *, bpfsg)

+{
+   struct scatterlist *sg = bpfsg->sg;
+   int cur_sg = bpfsg->cur_sg;
+
+   cur_sg++;
+   if (cur_sg >= bpfsg->num_sg)
+   return -ENODATA;
+
+   bpfsg->cur_sg = cur_sg;
+   bpfsg->start = sg_virt(&sg[cur_sg]);
+   bpfsg->end = bpfsg->start + sg[cur_sg].length;
+
+   return 0;
+}
+
+static const struct bpf_func_proto bpf_sg_next_proto = {
+   .func   = bpf_sg_next,
+   .gpl_only   = false,
+   .ret_type   = RET_INTEGER,
+   .arg1_type  = ARG_PTR_TO_CTX,
+};


Should be added to bpf_helper_changes_pkt_data() in order to enforce a reload
of all pkt pointers. Otherwise this is buggy 
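
For the archive: the change being asked for is an entry in net/core/filter.c's
bpf_helper_changes_pkt_data(), roughly as sketched below (existing entries
abbreviated; a sketch, not a tested patch):

bool bpf_helper_changes_pkt_data(void *func)
{
        if (func == bpf_skb_vlan_push ||
            func == bpf_skb_pull_data ||
            /* ... existing helpers elided ... */
            func == bpf_sg_next)
                return true;

        return false;
}

With bpf_sg_next listed there, the verifier forces the program to reload
data/data_end after every call to the helper.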

Re: [RFC v2 PATCH 1/4] eBPF: Add new eBPF prog type BPF_PROG_TYPE_SOCKET_SG_FILTER

2018-06-29 Thread Tushar Dave

On 06/29/2018 01:48 AM, Daniel Borkmann wrote:

On 06/29/2018 09:25 AM, Daniel Borkmann wrote:

On 06/19/2018 08:00 PM, Tushar Dave wrote:

Add new eBPF prog type BPF_PROG_TYPE_SOCKET_SG_FILTER which uses the
existing socket filter infrastructure for bpf program attach and load.
A SOCKET_SG_FILTER eBPF program receives struct scatterlist as its bpf
context, in contrast to SOCKET_FILTER, which deals with struct skb. This
is useful for kernel entities that don't have an skb to represent packet
data but want to run an eBPF socket filter on packet data that is in the
form of a struct scatterlist, e.g. IB/RDMA.

Signed-off-by: Tushar Dave 
Acked-by: Sowmini Varadhan 

[...]

  static void __bpf_prog_release(struct bpf_prog *prog)
  {
-   if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER) {
+   if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER ||
+   prog->type == BPF_PROG_TYPE_SOCKET_SG_FILTER) {
bpf_prog_put(prog);
} else {
bpf_release_orig_filter(prog);
@@ -1551,10 +1552,16 @@ int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
  
  static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk)

  {
+   struct bpf_prog *prog;
+
if (sock_flag(sk, SOCK_FILTER_LOCKED))
return ERR_PTR(-EPERM);
  
-	return bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);

+   prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
+   if (IS_ERR(prog))
+   prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_SG_FILTER);
+
+   return prog;
  }


Hmm, I don't think this works: this now means that as unprivileged I can
attach a new BPF_PROG_TYPE_SOCKET_SG_FILTER to a non-rds socket, e.g.
normal tcp/udp, through the SO_ATTACH_BPF sockopt, where the input
context is an skb instead of an sg list, and thus crash my box?


hmm.. I see the problem.

... probably best to just make a setsockopt specific to rds here, so the
two are fully separated.


Yes, it makes sense to make the setsockopt specific to RDS; that gives us
complete separation (and no need to worry about the above problem).
Btw, to make the setsockopt specific to RDS, though, we have to export
sk_attach_bpf().
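
A minimal sketch of the RDS side, assuming an exported sk_attach_bpf();
the option name RDS_ATTACH_SG_FILTER and its placement inside
rds_setsockopt() are hypothetical, not existing UAPI:

/* Hypothetical sketch inside rds_setsockopt(); RDS_ATTACH_SG_FILTER
 * is an assumed option name, and sk_attach_bpf() would need an
 * EXPORT_SYMBOL for modular RDS. Declarations elided.
 */
case RDS_ATTACH_SG_FILTER:
        if (optlen != sizeof(int)) {
                ret = -EINVAL;
                break;
        }
        if (get_user(fd, (int __user *)optval)) {
                ret = -EFAULT;
                break;
        }
        ret = sk_attach_bpf(fd, sock->sk);
        break;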



Also worth exploring whether you can reuse as much as possible from the
struct sk_msg_buff context and in general the BPF_PROG_TYPE_SK_MSG type
that is using it, which we already have in sockmap today. At least it
feels like some of the concepts are a bit similar. For pulling in more
payload you have bpf_msg_pull_data() there, which I think might be more
user-friendly, at least in that you have the full payload from start to
the 'current' end available and don't need to navigate through individual
sg entries back/forth, which could perhaps end up being a bit painful for
users. Though I can see that it's a middle ground between some
skb_load_bytes()-alike helper that would copy the pieces out of the sg
entries vs needing to linearize. What are the requirements here; would it
make sense to offer both as an option, or is this impractical based on
what you've measured?


Yes, sockmap also deals with struct scatterlist, so from that perspective
I will certainly try to reuse code wherever I can, e.g. most likely get
rid of struct bpf_scatterlist and use struct sk_msg_buff.

From a use-case perspective, we want to look at the complete payload
(which includes the RDS header as well) and based on that take actions
like pass, drop or forward. So, I agree that it makes sense to not
iterate over sg elements back/forth [1]. I guess bpf_msg_pull_data()
would do the work. If there is a need we may add an sg_load_bytes()-like
helper.

Thanks for taking the time to review. I will work on the suggested
changes.

-Tushar

[1]
btw, bpf_sg_next() was added just to showcase an example; if there is
no real need for it I will get rid of it.
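
For reference, a minimal sketch of how a program would walk elements
with it (struct sg_filter_md and bpf_sg_next() exist only in this RFC;
note the pointer reload after the helper call, which is what the
bpf_helper_changes_pkt_data() change discussed earlier would enforce):

/* Sketch of a SOCKET_SG_FILTER program; RFC-only context and helper. */
SEC("socksg")
int sg_inspect(struct sg_filter_md *sg)
{
        void *data = (void *)(long)sg->data;
        void *data_end = (void *)(long)sg->data_end;

        if (data + 1 > data_end)        /* verifier-mandated bounds check */
                return 0;
        /* ... inspect the first element's payload ... */

        if (bpf_sg_next(sg))            /* advance to the next element */
                return 0;

        /* data/data_end must be reloaded after the helper call. */
        data = (void *)(long)sg->data;
        data_end = (void *)(long)sg->data_end;
        if (data + 1 > data_end)
                return 0;
        /* ... inspect the second element's payload ... */

        return 0;
}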


[PATCH net-next 3/9] nfp: implement netpoll ndo (thus enabling netconsole)

2018-06-29 Thread Jakub Kicinski
NFP NAPI handling will only complete the TXed packets when called
with a budget of 0, so implement ndo_poll_controller by scheduling NAPI
on all TX queues.

Signed-off-by: Jakub Kicinski 
---
 .../ethernet/netronome/nfp/nfp_net_common.c| 18 ++
 1 file changed, 18 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index d4c27f849f9b..edc6ef682f6d 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -3115,6 +3115,21 @@ nfp_net_vlan_rx_kill_vid(struct net_device *netdev, __be16 proto, u16 vid)
return nfp_net_reconfig_mbox(nn, NFP_NET_CFG_MBOX_CMD_CTAG_FILTER_KILL);
 }
 
+#ifdef CONFIG_NET_POLL_CONTROLLER
+static void nfp_net_netpoll(struct net_device *netdev)
+{
+   struct nfp_net *nn = netdev_priv(netdev);
+   int i;
+
+   /* nfp_net's NAPIs are statically allocated so even if there is a race
+* with reconfig path this will simply try to schedule some disabled
+* NAPI instances.
+*/
+   for (i = 0; i < nn->dp.num_stack_tx_rings; i++)
+   napi_schedule_irqoff(&nn->r_vecs[i].napi);
+}
+#endif
+
 static void nfp_net_stat64(struct net_device *netdev,
   struct rtnl_link_stats64 *stats)
 {
@@ -3482,6 +3497,9 @@ const struct net_device_ops nfp_net_netdev_ops = {
.ndo_get_stats64= nfp_net_stat64,
.ndo_vlan_rx_add_vid= nfp_net_vlan_rx_add_vid,
.ndo_vlan_rx_kill_vid   = nfp_net_vlan_rx_kill_vid,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+   .ndo_poll_controller= nfp_net_netpoll,
+#endif
.ndo_set_vf_mac = nfp_app_set_vf_mac,
.ndo_set_vf_vlan= nfp_app_set_vf_vlan,
.ndo_set_vf_spoofchk= nfp_app_set_vf_spoofchk,
-- 
2.17.1



[PATCH net-next 4/9] nfp: make use of napi_consume_skb()

2018-06-29 Thread Jakub Kicinski
Use napi_consume_skb() in nfp_net_tx_complete() to get bulk free.
Pass 0 as budget for ctrl queue completion since it runs out of
a tasklet.

Signed-off-by: Jakub Kicinski 
Reviewed-by: Dirk van der Merwe 
---
 drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 11 ++-
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index edc6ef682f6d..7df5ca37bfb8 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -945,11 +945,12 @@ static int nfp_net_tx(struct sk_buff *skb, struct 
net_device *netdev)
 
 /**
  * nfp_net_tx_complete() - Handled completed TX packets
- * @tx_ring:   TX ring structure
+ * @tx_ring:   TX ring structure
+ * @budget:    NAPI budget (only used as bool to determine if in NAPI context)
  *
  * Return: Number of completed TX descriptors
  */
-static void nfp_net_tx_complete(struct nfp_net_tx_ring *tx_ring)
+static void nfp_net_tx_complete(struct nfp_net_tx_ring *tx_ring, int budget)
 {
struct nfp_net_r_vector *r_vec = tx_ring->r_vec;
struct nfp_net_dp *dp = &r_vec->nfp_net->dp;
@@ -999,7 +1000,7 @@ static void nfp_net_tx_complete(struct nfp_net_tx_ring 
*tx_ring)
 
/* check for last gather fragment */
if (fidx == nr_frags - 1)
-   dev_consume_skb_any(skb);
+   napi_consume_skb(skb, budget);
 
tx_ring->txbufs[idx].dma_addr = 0;
tx_ring->txbufs[idx].skb = NULL;
@@ -1828,7 +1829,7 @@ static int nfp_net_poll(struct napi_struct *napi, int 
budget)
unsigned int pkts_polled = 0;
 
if (r_vec->tx_ring)
-   nfp_net_tx_complete(r_vec->tx_ring);
+   nfp_net_tx_complete(r_vec->tx_ring, budget);
if (r_vec->rx_ring)
pkts_polled = nfp_net_rx(r_vec->rx_ring, budget);
 
@@ -2062,7 +2063,7 @@ static void nfp_ctrl_poll(unsigned long arg)
struct nfp_net_r_vector *r_vec = (void *)arg;
 
spin_lock_bh(&r_vec->lock);
-   nfp_net_tx_complete(r_vec->tx_ring);
+   nfp_net_tx_complete(r_vec->tx_ring, 0);
__nfp_ctrl_tx_queued(r_vec);
spin_unlock_bh(&r_vec->lock);
 
-- 
2.17.1



[PATCH net-next 2/9] nfp: fail probe if serial or interface id is missing

2018-06-29 Thread Jakub Kicinski
On some platforms with broken ACPI tables we may not have access
to the Serial Number PCIe capability.  This capability is crucial
for us for switchdev operation as we use serial number as switch ID,
and for communication with management FW where interface ID is used.

If we can't determine the Serial Number we have to fail device probe.

Signed-off-by: Jakub Kicinski 
---
 .../netronome/nfp/nfpcore/nfp6000_pcie.c  | 16 +-
 .../ethernet/netronome/nfp/nfpcore/nfp_cpp.h  |  4 ++--
 .../netronome/nfp/nfpcore/nfp_cppcore.c   | 22 ++-
 3 files changed, 28 insertions(+), 14 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp6000_pcie.c 
b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp6000_pcie.c
index 749655c329b2..c8d0b1016a64 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp6000_pcie.c
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp6000_pcie.c
@@ -1248,7 +1248,7 @@ static void nfp6000_free(struct nfp_cpp *cpp)
kfree(nfp);
 }
 
-static void nfp6000_read_serial(struct device *dev, u8 *serial)
+static int nfp6000_read_serial(struct device *dev, u8 *serial)
 {
struct pci_dev *pdev = to_pci_dev(dev);
int pos;
@@ -1256,25 +1256,29 @@ static void nfp6000_read_serial(struct device *dev, u8 
*serial)
 
pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_DSN);
if (!pos) {
-   memset(serial, 0, NFP_SERIAL_LEN);
-   return;
+   dev_err(dev, "can't find PCIe Serial Number Capability\n");
+   return -EINVAL;
}
 
pci_read_config_dword(pdev, pos + 4, &reg);
put_unaligned_be16(reg >> 16, serial + 4);
pci_read_config_dword(pdev, pos + 8, &reg);
put_unaligned_be32(reg, serial);
+
+   return 0;
 }
 
-static u16 nfp6000_get_interface(struct device *dev)
+static int nfp6000_get_interface(struct device *dev)
 {
struct pci_dev *pdev = to_pci_dev(dev);
int pos;
u32 reg;
 
pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_DSN);
-   if (!pos)
-   return NFP_CPP_INTERFACE(NFP_CPP_INTERFACE_TYPE_PCI, 0, 0xff);
+   if (!pos) {
+   dev_err(dev, "can't find PCIe Serial Number Capability\n");
+   return -EINVAL;
+   }
 
pci_read_config_dword(pdev, pos + 4, &reg);
 
diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cpp.h 
b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cpp.h
index b0da3d436850..c338d539fa96 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cpp.h
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cpp.h
@@ -364,8 +364,8 @@ struct nfp_cpp_operations {
int (*init)(struct nfp_cpp *cpp);
void (*free)(struct nfp_cpp *cpp);
 
-   void (*read_serial)(struct device *dev, u8 *serial);
-   u16 (*get_interface)(struct device *dev);
+   int (*read_serial)(struct device *dev, u8 *serial);
+   int (*get_interface)(struct device *dev);
 
int (*area_init)(struct nfp_cpp_area *area,
 u32 dest, unsigned long long address,
diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c 
b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c
index ef30597aa319..73de57a09800 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_cppcore.c
@@ -1163,10 +1163,10 @@ nfp_cpp_from_operations(const struct nfp_cpp_operations 
*ops,
 {
const u32 arm = NFP_CPP_ID(NFP_CPP_TARGET_ARM, NFP_CPP_ACTION_RW, 0);
struct nfp_cpp *cpp;
+   int ifc, err;
u32 mask[2];
u32 xpbaddr;
size_t tgt;
-   int err;
 
cpp = kzalloc(sizeof(*cpp), GFP_KERNEL);
if (!cpp) {
@@ -1176,9 +1176,19 @@ nfp_cpp_from_operations(const struct nfp_cpp_operations 
*ops,
 
cpp->op = ops;
cpp->priv = priv;
-   cpp->interface = ops->get_interface(parent);
-   if (ops->read_serial)
-   ops->read_serial(parent, cpp->serial);
+
+   ifc = ops->get_interface(parent);
+   if (ifc < 0) {
+   err = ifc;
+   goto err_free_cpp;
+   }
+   cpp->interface = ifc;
+   if (ops->read_serial) {
+   err = ops->read_serial(parent, cpp->serial);
+   if (err)
+   goto err_free_cpp;
+   }
+
rwlock_init(&cpp->resource_lock);
init_waitqueue_head(&cpp->waitq);
lockdep_set_class(&cpp->resource_lock, &nfp_cpp_resource_lock_key);
@@ -1191,7 +1201,7 @@ nfp_cpp_from_operations(const struct nfp_cpp_operations 
*ops,
err = device_register(&cpp->dev);
if (err < 0) {
put_device(&cpp->dev);
-   goto err_dev;
+   goto err_free_cpp;
}
 
dev_set_drvdata(>dev, cpp);
@@ -1238,7 +1248,7 @@ nfp_cpp_from_operations(const struct nfp_cpp_operations 
*ops,
 
 err_out:
device_unregister(&cpp->dev);
-err_dev:
+err_free_cpp:
kfree(cpp);
 

[PATCH net-next 1/9] nfp: expose ring stats of inactive rings via ethtool

2018-06-29 Thread Jakub Kicinski
After the user changes the ring count, statistics for deactivated
rings disappear from ethtool -S output.  This causes loss of
information to the user and means that ethtool stats may not
add up to interface stats.  Always expose counters from all
the rings.  Note that we allocate at most num_possible_cpus()
rings, so the number of rings should be reasonable.

The alternative of only listing stats for rings which were
ever in use could be confusing.

Signed-off-by: Jakub Kicinski 
Reviewed-by: Dirk van der Merwe 
---
 .../ethernet/netronome/nfp/nfp_net_ethtool.c  | 50 +++
 1 file changed, 19 insertions(+), 31 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
index 26d1cc4e2906..2aeb4622f1ea 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
@@ -452,7 +452,7 @@ static unsigned int nfp_vnic_get_sw_stats_count(struct 
net_device *netdev)
 {
struct nfp_net *nn = netdev_priv(netdev);
 
-   return NN_RVEC_GATHER_STATS + nn->dp.num_r_vecs * NN_RVEC_PER_Q_STATS;
+   return NN_RVEC_GATHER_STATS + nn->max_r_vecs * NN_RVEC_PER_Q_STATS;
 }
 
 static u8 *nfp_vnic_get_sw_stats_strings(struct net_device *netdev, u8 *data)
@@ -460,7 +460,7 @@ static u8 *nfp_vnic_get_sw_stats_strings(struct net_device 
*netdev, u8 *data)
struct nfp_net *nn = netdev_priv(netdev);
int i;
 
-   for (i = 0; i < nn->dp.num_r_vecs; i++) {
+   for (i = 0; i < nn->max_r_vecs; i++) {
data = nfp_pr_et(data, "rvec_%u_rx_pkts", i);
data = nfp_pr_et(data, "rvec_%u_tx_pkts", i);
data = nfp_pr_et(data, "rvec_%u_tx_busy", i);
@@ -486,7 +486,7 @@ static u64 *nfp_vnic_get_sw_stats(struct net_device 
*netdev, u64 *data)
u64 tmp[NN_RVEC_GATHER_STATS];
unsigned int i, j;
 
-   for (i = 0; i < nn->dp.num_r_vecs; i++) {
+   for (i = 0; i < nn->max_r_vecs; i++) {
unsigned int start;
 
do {
@@ -521,15 +521,13 @@ static u64 *nfp_vnic_get_sw_stats(struct net_device 
*netdev, u64 *data)
return data;
 }
 
-static unsigned int
-nfp_vnic_get_hw_stats_count(unsigned int rx_rings, unsigned int tx_rings)
+static unsigned int nfp_vnic_get_hw_stats_count(unsigned int num_vecs)
 {
-   return NN_ET_GLOBAL_STATS_LEN + (rx_rings + tx_rings) * 2;
+   return NN_ET_GLOBAL_STATS_LEN + num_vecs * 4;
 }
 
 static u8 *
-nfp_vnic_get_hw_stats_strings(u8 *data, unsigned int rx_rings,
- unsigned int tx_rings, bool repr)
+nfp_vnic_get_hw_stats_strings(u8 *data, unsigned int num_vecs, bool repr)
 {
int swap_off, i;
 
@@ -549,36 +547,29 @@ nfp_vnic_get_hw_stats_strings(u8 *data, unsigned int 
rx_rings,
for (i = NN_ET_SWITCH_STATS_LEN * 2; i < NN_ET_GLOBAL_STATS_LEN; i++)
data = nfp_pr_et(data, nfp_net_et_stats[i].name);
 
-   for (i = 0; i < tx_rings; i++) {
-   data = nfp_pr_et(data, "txq_%u_pkts", i);
-   data = nfp_pr_et(data, "txq_%u_bytes", i);
-   }
-
-   for (i = 0; i < rx_rings; i++) {
+   for (i = 0; i < num_vecs; i++) {
data = nfp_pr_et(data, "rxq_%u_pkts", i);
data = nfp_pr_et(data, "rxq_%u_bytes", i);
+   data = nfp_pr_et(data, "txq_%u_pkts", i);
+   data = nfp_pr_et(data, "txq_%u_bytes", i);
}
 
return data;
 }
 
 static u64 *
-nfp_vnic_get_hw_stats(u64 *data, u8 __iomem *mem,
- unsigned int rx_rings, unsigned int tx_rings)
+nfp_vnic_get_hw_stats(u64 *data, u8 __iomem *mem, unsigned int num_vecs)
 {
unsigned int i;
 
for (i = 0; i < NN_ET_GLOBAL_STATS_LEN; i++)
*data++ = readq(mem + nfp_net_et_stats[i].off);
 
-   for (i = 0; i < tx_rings; i++) {
-   *data++ = readq(mem + NFP_NET_CFG_TXR_STATS(i));
-   *data++ = readq(mem + NFP_NET_CFG_TXR_STATS(i) + 8);
-   }
-
-   for (i = 0; i < rx_rings; i++) {
+   for (i = 0; i < num_vecs; i++) {
*data++ = readq(mem + NFP_NET_CFG_RXR_STATS(i));
*data++ = readq(mem + NFP_NET_CFG_RXR_STATS(i) + 8);
+   *data++ = readq(mem + NFP_NET_CFG_TXR_STATS(i));
+   *data++ = readq(mem + NFP_NET_CFG_TXR_STATS(i) + 8);
}
 
return data;
@@ -633,8 +624,7 @@ static void nfp_net_get_strings(struct net_device *netdev,
switch (stringset) {
case ETH_SS_STATS:
data = nfp_vnic_get_sw_stats_strings(netdev, data);
-   data = nfp_vnic_get_hw_stats_strings(data, nn->dp.num_rx_rings,
-nn->dp.num_tx_rings,
+   data = nfp_vnic_get_hw_stats_strings(data, nn->max_r_vecs,
 false);
data = nfp_mac_get_stats_strings(netdev, data);
data = 

[PATCH net-next 0/9] nfp: flower updates and netconsole

2018-06-29 Thread Jakub Kicinski
Hi!

This set contains assorted updates to the driver base and flower.
The first patch is a follow-up to a counter-calculation fix which
went into net: for ethtool counters we should also make sure they
remain visible even after ring reconfiguration.  The next patch
is a safety measure: in case we are running on a machine with a
broken BIOS, we should fail the probe when the risk of incorrect
operation is too high.  The next two patches add netpoll support
and make use of napi_consume_skb().  Last we populate bus info
on all representors.

Pieter adds support for offload of the checksum action in flower.

John follows up on another fix he's done in net: we used to set TTL
values on tunnels to the stack's default, now John does a full
route lookup to get more precise information, and he populates the
ToS field as well.  Last but not least he follows up on Jiri's
request to enable LAG offload in the case where the team driver is
used and the hash function is therefore unknown.


Jakub Kicinski (5):
  nfp: expose ring stats of inactive rings via ethtool
  nfp: fail probe if serial or interface id is missing
  nfp: implement netpoll ndo (thus enabling netconsole)
  nfp: make use of napi_consume_skb()
  nfp: populate bus-info on representors

John Hurley (3):
  nfp: flower: extract ipv4 udp tunnel ttl from route
  nfp: flower: offload tos and tunnel flags for ipv4 udp tunnels
  nfp: flower: enabled offloading of Team LAG

Pieter Jansen van Vuuren (1):
  nfp: flower: ignore checksum actions when performing pedit actions

 .../ethernet/netronome/nfp/flower/action.c| 108 --
 .../net/ethernet/netronome/nfp/flower/cmsg.h  |   4 +-
 .../ethernet/netronome/nfp/flower/lag_conf.c  |   5 +-
 .../ethernet/netronome/nfp/nfp_net_common.c   |  29 -
 .../ethernet/netronome/nfp/nfp_net_ethtool.c  |  58 --
 .../netronome/nfp/nfpcore/nfp6000_pcie.c  |  16 ++-
 .../ethernet/netronome/nfp/nfpcore/nfp_cpp.h  |   4 +-
 .../netronome/nfp/nfpcore/nfp_cppcore.c   |  22 +++-
 8 files changed, 178 insertions(+), 68 deletions(-)

-- 
2.17.1



[PATCH net-next 5/9] nfp: populate bus-info on representors

2018-06-29 Thread Jakub Kicinski
We used to leave bus-info in the ethtool driver info empty for
representors, in case multi-PCIe-to-single-host cards make
the association between PCIe device and NFP many-to-one.
It seems these attempts are futile; we need to link the
representors to one PCIe device in sysfs to get consistent
naming, plus devlink uses one PCIe device as a handle, anyway.
The multi-PCIe-to-single-system support won't be clean,
if it ever comes.

Turns out some user space (RHEL tests) likes to read bus-info,
so just populate it.

While at it remove unnecessary app NULL-check, representors
are spawned by an app, so it must exist.

Signed-off-by: Jakub Kicinski 
Reviewed-by: Dirk van der Merwe 
---
 drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c | 8 +++-
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c 
b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
index 2aeb4622f1ea..6a79c8e4a7a4 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ethtool.c
@@ -233,12 +233,10 @@ nfp_net_get_drvinfo(struct net_device *netdev, struct 
ethtool_drvinfo *drvinfo)
 static void
 nfp_app_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *drvinfo)
 {
-   struct nfp_app *app;
-
-   app = nfp_app_from_netdev(netdev);
-   if (!app)
-   return;
+   struct nfp_app *app = nfp_app_from_netdev(netdev);
 
+   strlcpy(drvinfo->bus_info, pci_name(app->pdev),
+   sizeof(drvinfo->bus_info));
nfp_get_drvinfo(app, app->pdev, "*", drvinfo);
 }
 
-- 
2.17.1



[PATCH net-next 6/9] nfp: flower: ignore checksum actions when performing pedit actions

2018-06-29 Thread Jakub Kicinski
From: Pieter Jansen van Vuuren 

Hardware will automatically update the checksum in headers when a set
action has been performed. This means the driver can ignore an explicit
checksum action when performing a set action.

Signed-off-by: Pieter Jansen van Vuuren 
Reviewed-by: Jakub Kicinski 
Reviewed-by: Simon Horman 
---
 .../ethernet/netronome/nfp/flower/action.c| 80 +--
 1 file changed, 72 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/action.c 
b/drivers/net/ethernet/netronome/nfp/flower/action.c
index 4a6d2db75071..61ba8d4f99f1 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/action.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/action.c
@@ -34,6 +34,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -398,8 +399,27 @@ nfp_fl_set_tport(const struct tc_action *action, int idx, 
u32 off,
return 0;
 }
 
+static u32 nfp_fl_csum_l4_to_flag(u8 ip_proto)
+{
+   switch (ip_proto) {
+   case 0:
+   /* Filter doesn't force proto match,
+* both TCP and UDP will be updated if encountered
+*/
+   return TCA_CSUM_UPDATE_FLAG_TCP | TCA_CSUM_UPDATE_FLAG_UDP;
+   case IPPROTO_TCP:
+   return TCA_CSUM_UPDATE_FLAG_TCP;
+   case IPPROTO_UDP:
+   return TCA_CSUM_UPDATE_FLAG_UDP;
+   default:
+   /* All other protocols will be ignored by FW */
+   return 0;
+   }
+}
+
 static int
-nfp_fl_pedit(const struct tc_action *action, char *nfp_action, int *a_len)
+nfp_fl_pedit(const struct tc_action *action, struct tc_cls_flower_offload *flow,
+             char *nfp_action, int *a_len, u32 *csum_updated)
 {
struct nfp_fl_set_ipv6_addr set_ip6_dst, set_ip6_src;
struct nfp_fl_set_ip4_addrs set_ip_addr;
@@ -409,6 +429,7 @@ nfp_fl_pedit(const struct tc_action *action, char 
*nfp_action, int *a_len)
int idx, nkeys, err;
size_t act_size;
u32 offset, cmd;
+   u8 ip_proto = 0;
 
memset(&set_ip6_dst, 0, sizeof(set_ip6_dst));
memset(&set_ip6_src, 0, sizeof(set_ip6_src));
@@ -451,6 +472,15 @@ nfp_fl_pedit(const struct tc_action *action, char 
*nfp_action, int *a_len)
return err;
}
 
+   if (dissector_uses_key(flow->dissector, FLOW_DISSECTOR_KEY_BASIC)) {
+   struct flow_dissector_key_basic *basic;
+
+   basic = skb_flow_dissector_target(flow->dissector,
+ FLOW_DISSECTOR_KEY_BASIC,
+ flow->key);
+   ip_proto = basic->ip_proto;
+   }
+
if (set_eth.head.len_lw) {
act_size = sizeof(set_eth);
memcpy(nfp_action, &set_eth, act_size);
@@ -459,6 +489,10 @@ nfp_fl_pedit(const struct tc_action *action, char 
*nfp_action, int *a_len)
act_size = sizeof(set_ip_addr);
memcpy(nfp_action, &set_ip_addr, act_size);
*a_len += act_size;
+
+   /* Hardware will automatically fix IPv4 and TCP/UDP checksum. */
+   *csum_updated |= TCA_CSUM_UPDATE_FLAG_IPV4HDR |
+   nfp_fl_csum_l4_to_flag(ip_proto);
} else if (set_ip6_dst.head.len_lw && set_ip6_src.head.len_lw) {
/* TC compiles set src and dst IPv6 address as a single action,
 * the hardware requires this to be 2 separate actions.
@@ -471,18 +505,30 @@ nfp_fl_pedit(const struct tc_action *action, char 
*nfp_action, int *a_len)
memcpy(&nfp_action[sizeof(set_ip6_src)], &set_ip6_dst,
   act_size);
*a_len += act_size;
+
+   /* Hardware will automatically fix TCP/UDP checksum. */
+   *csum_updated |= nfp_fl_csum_l4_to_flag(ip_proto);
} else if (set_ip6_dst.head.len_lw) {
act_size = sizeof(set_ip6_dst);
memcpy(nfp_action, &set_ip6_dst, act_size);
*a_len += act_size;
+
+   /* Hardware will automatically fix TCP/UDP checksum. */
+   *csum_updated |= nfp_fl_csum_l4_to_flag(ip_proto);
} else if (set_ip6_src.head.len_lw) {
act_size = sizeof(set_ip6_src);
memcpy(nfp_action, &set_ip6_src, act_size);
*a_len += act_size;
+
+   /* Hardware will automatically fix TCP/UDP checksum. */
+   *csum_updated |= nfp_fl_csum_l4_to_flag(ip_proto);
} else if (set_tport.head.len_lw) {
act_size = sizeof(set_tport);
memcpy(nfp_action, &set_tport, act_size);
*a_len += act_size;
+
+   /* Hardware will automatically fix TCP/UDP checksum. */
+   *csum_updated |= nfp_fl_csum_l4_to_flag(ip_proto);
}
 
return 0;
@@ -493,12 +539,18 @@ nfp_flower_output_action(struct nfp_app *app, const 
struct tc_action *a,
 

[PATCH net-next 9/9] nfp: flower: enabled offloading of Team LAG

2018-06-29 Thread Jakub Kicinski
From: John Hurley 

Currently the NFP fw only supports L3/L4 hashing, so it rejects the offload
of filters that output to LAG ports implementing other hash algorithms.
Team, however, uses a BPF function for the hash, so its hash type is not
defined. To support Team offload, accept hashes that are reported as
'unknown' (only Team reports such hash types). In this case, use the NFP
default of L3/L4 hashing for egress port selection.

Signed-off-by: John Hurley 
Reviewed-by: Jakub Kicinski 
Reviewed-by: Simon Horman 
---
 drivers/net/ethernet/netronome/nfp/flower/lag_conf.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/lag_conf.c 
b/drivers/net/ethernet/netronome/nfp/flower/lag_conf.c
index 0c4c957717ea..bf10598f66ae 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/lag_conf.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/lag_conf.c
@@ -564,8 +564,9 @@ nfp_fl_lag_changeupper_event(struct nfp_fl_lag *lag,
if (lag_upper_info &&
lag_upper_info->tx_type != NETDEV_LAG_TX_TYPE_ACTIVEBACKUP &&
(lag_upper_info->tx_type != NETDEV_LAG_TX_TYPE_HASH ||
-   (lag_upper_info->hash_type != NETDEV_LAG_HASH_L34 &&
-   lag_upper_info->hash_type != NETDEV_LAG_HASH_E34))) {
+(lag_upper_info->hash_type != NETDEV_LAG_HASH_L34 &&
+ lag_upper_info->hash_type != NETDEV_LAG_HASH_E34 &&
+ lag_upper_info->hash_type != NETDEV_LAG_HASH_UNKNOWN))) {
can_offload = false;
nfp_flower_cmsg_warn(priv->app,
 "Unable to offload tx_type %u hash %u\n",
-- 
2.17.1



[PATCH net-next 8/9] nfp: flower: offload tos and tunnel flags for ipv4 udp tunnels

2018-06-29 Thread Jakub Kicinski
From: John Hurley 

Extract the tos and the tunnel flags from the tunnel key and offload these
action fields. Only the checksum and tunnel key flags are implemented in
fw, so reject offloads of other flags. The tunnel key flag is always
considered set in the fw, so enforce that it is set in the rule. Note that
the compulsory setting of the tunnel key flag and optional setting of
checksum is in line with how tc currently generates ipv4 udp tunnel
actions.

Signed-off-by: John Hurley 
Reviewed-by: Jakub Kicinski 
Reviewed-by: Simon Horman 
---
 drivers/net/ethernet/netronome/nfp/flower/action.c | 9 +
 drivers/net/ethernet/netronome/nfp/flower/cmsg.h   | 4 ++--
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/action.c 
b/drivers/net/ethernet/netronome/nfp/flower/action.c
index d421b7fbce96..e56b815a8dc6 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/action.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/action.c
@@ -45,6 +45,8 @@
 #include "main.h"
 #include "../nfp_net_repr.h"
 
+#define NFP_FL_SUPPORTED_IPV4_UDP_TUN_FLAGS    (TUNNEL_CSUM | TUNNEL_KEY)
+
 static void nfp_fl_pop_vlan(struct nfp_fl_pop_vlan *pop_vlan)
 {
size_t act_size = sizeof(struct nfp_fl_pop_vlan);
@@ -274,6 +276,13 @@ nfp_fl_set_ipv4_udp_tun(struct nfp_fl_set_ipv4_udp_tun 
*set_tun,
set_tun->ttl = net->ipv4.sysctl_ip_default_ttl;
}
 
+   set_tun->tos = ip_tun->key.tos;
+
+   if (!(ip_tun->key.tun_flags & TUNNEL_KEY) ||
+   ip_tun->key.tun_flags & ~NFP_FL_SUPPORTED_IPV4_UDP_TUN_FLAGS)
+   return -EOPNOTSUPP;
+   set_tun->tun_flags = ip_tun->key.tun_flags;
+
/* Complete pre_tunnel action. */
pre_tun->ipv4_dst = ip_tun->key.u.ipv4.dst;
 
diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h 
b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
index 4a7f3510a296..15f1eacd76b6 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
+++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
@@ -203,9 +203,9 @@ struct nfp_fl_set_ipv4_udp_tun {
__be16 reserved;
__be64 tun_id __packed;
__be32 tun_type_index;
-   __be16 reserved2;
+   __be16 tun_flags;
u8 ttl;
-   u8 reserved3;
+   u8 tos;
__be32 extra[2];
 };
 
-- 
2.17.1



[PATCH net-next 7/9] nfp: flower: extract ipv4 udp tunnel ttl from route

2018-06-29 Thread Jakub Kicinski
From: John Hurley 

Previously the ttl for ipv4 udp tunnels was set to the namespace default.
Modify this to attempt to extract the ttl from a full route lookup on the
tunnel destination. If this is not possible then resort to the default.

Signed-off-by: John Hurley 
Reviewed-by: Jakub Kicinski 
Reviewed-by: Simon Horman 
---
 .../ethernet/netronome/nfp/flower/action.c| 19 ++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/action.c 
b/drivers/net/ethernet/netronome/nfp/flower/action.c
index 61ba8d4f99f1..d421b7fbce96 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/action.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/action.c
@@ -236,9 +236,12 @@ nfp_fl_set_ipv4_udp_tun(struct nfp_fl_set_ipv4_udp_tun 
*set_tun,
size_t act_size = sizeof(struct nfp_fl_set_ipv4_udp_tun);
struct ip_tunnel_info *ip_tun = tcf_tunnel_info(action);
u32 tmp_set_ip_tun_type_index = 0;
+   struct flowi4 flow = {};
/* Currently support one pre-tunnel so index is always 0. */
int pretun_idx = 0;
+   struct rtable *rt;
struct net *net;
+   int err;
 
if (ip_tun->options_len)
return -EOPNOTSUPP;
@@ -255,7 +258,21 @@ nfp_fl_set_ipv4_udp_tun(struct nfp_fl_set_ipv4_udp_tun 
*set_tun,
 
set_tun->tun_type_index = cpu_to_be32(tmp_set_ip_tun_type_index);
set_tun->tun_id = ip_tun->key.tun_id;
-   set_tun->ttl = net->ipv4.sysctl_ip_default_ttl;
+
+   /* Do a route lookup to determine ttl - if fails then use default.
+* Note that CONFIG_INET is a requirement of CONFIG_NET_SWITCHDEV so
+* must be defined here.
+*/
+   flow.daddr = ip_tun->key.u.ipv4.dst;
+   flow.flowi4_proto = IPPROTO_UDP;
rt = ip_route_output_key(net, &flow);
+   err = PTR_ERR_OR_ZERO(rt);
+   if (!err) {
+   set_tun->ttl = ip4_dst_hoplimit(&rt->dst);
+   ip_rt_put(rt);
+   } else {
+   set_tun->ttl = net->ipv4.sysctl_ip_default_ttl;
+   }
 
/* Complete pre_tunnel action. */
pre_tun->ipv4_dst = ip_tun->key.u.ipv4.dst;
-- 
2.17.1



Re: [PATCH bpf 3/3] bpf: undo prog rejection on read-only lock failure

2018-06-29 Thread Daniel Borkmann
On 06/29/2018 08:42 PM, Kees Cook wrote:
> On Thu, Jun 28, 2018 at 2:34 PM, Daniel Borkmann  wrote:
>> Kees suggested that if set_memory_*() can fail, we should annotate it with
>> __must_check, and all callers need to deal with it gracefully given those
>> set_memory_*() markings aren't "advisory", but they're expected to actually
>> do what they say. This might be an option worth moving forward with in
>> future, but it would at the same time require that set_memory_*() calls
>> from supporting archs are guaranteed to be "atomic" in that they provide
>> rollback if part of the range fails. Once that happened, the transition
>> from RW -> RO could be made more robust that way, while the subsequent
>> RO -> RW transition /must/ continue guaranteeing to always succeed the
>> undo part.
> 
> Does this mean we can have BPF filters that aren't read-only then?
> What's the situation where set_memory_ro() fails? (Can it be induced
> by the user?)

My understanding is that cpa_process_alias() would attempt to also change
attributes of physmap ranges, and it found that a large page had to be
split for this but failed in doing so, thus attributes couldn't be updated
there due to a page alloc error. The primary mapping, which would be
directly the addr passed to set_memory_ro(), was however set to read-only
despite the error. While for reproduction I had a toggle on the
alloc_pages() in split_large_page() to have it fail, I could only trigger
it occasionally; I used the selftest suite in a loop to stress test and it
hit about once or twice over hours.

Thanks,
Daniel
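
(For the archive, a hypothetical sketch of the __must_check direction
discussed above; the wrapper name and rollback policy are illustrative,
not current kernel API:)

static int __must_check lock_prog_ro(unsigned long addr, int numpages)
{
	int err;

	err = set_memory_ro(addr, numpages);
	if (err) {
		/* The primary mapping may have flipped to RO even though
		 * the alias update failed; the RO -> RW undo is assumed
		 * to always succeed, as discussed above.
		 */
		set_memory_rw(addr, numpages);
		return err;
	}
	return 0;
}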


Re: [PATCH net-next] openvswitch: kernel datapath clone action

2018-06-29 Thread Pravin Shelar
On Thu, Jun 28, 2018 at 8:20 AM, Yifeng Sun  wrote:
> Add 'clone' action to kernel datapath by using existing functions.
> When actions within clone don't modify the current flow, the flow
> key is not cloned before executing clone actions.
>
> This is a follow up patch for this incomplete work:
> https://patchwork.ozlabs.org/patch/722096/
>
> Signed-off-by: Yifeng Sun 
> Signed-off-by: Andy Zhou 
> ---
>  include/uapi/linux/openvswitch.h |  8 +
>  net/openvswitch/actions.c| 33 ++
>  net/openvswitch/flow_netlink.c   | 73 
> 
>  3 files changed, 114 insertions(+)
>
> diff --git a/include/uapi/linux/openvswitch.h 
> b/include/uapi/linux/openvswitch.h
> index 863aaba..5de8583 100644
> --- a/include/uapi/linux/openvswitch.h
> +++ b/include/uapi/linux/openvswitch.h
> @@ -625,6 +625,11 @@ struct sample_arg {
>   * 'OVS_SAMPLE_ATTR_PROBABILITY'.
>   */
>  };
> +
> +#define OVS_CLONE_ATTR_EXEC  0   /* Specify a u32 value. When nonzero,
> + * actions in clone will not change flow
> + * keys. False otherwise.
> + */
>  #endif

This symbol is used only in datapath, so we can move it to kernel
headers from uapi.

> diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
> index 30a5df2..e31 100644
> --- a/net/openvswitch/actions.c
> +++ b/net/openvswitch/actions.c
> @@ -1057,6 +1057,28 @@ static int sample(struct datapath *dp, struct sk_buff 
> *skb,
>  clone_flow_key);
>  }
>
> +/* When 'last' is true, clone() should always consume the 'skb'.
> + * Otherwise, clone() should keep 'skb' intact regardless what
> + * actions are executed within clone().
> + */
> +static int clone(struct datapath *dp, struct sk_buff *skb,
> +struct sw_flow_key *key, const struct nlattr *attr,
> +bool last)
> +{
> +   struct nlattr *actions;
> +   struct nlattr *clone_arg;
> +   int rem = nla_len(attr);
> +   bool clone_flow_key;
> +
> +   /* The first action is always 'OVS_CLONE_ATTR_ARG'. */
> +   clone_arg = nla_data(attr);
> +   clone_flow_key = !nla_get_u32(clone_arg);
> +   actions = nla_next(clone_arg, );
> +

Since OVS_CLONE_ATTR_EXEC means do not clone the key, it can be named
accordingly.


Re: [PATCH v3 net-next 7/9] net: ipv4: listified version of ip_rcv

2018-06-29 Thread kbuild test robot
Hi Edward,

I love your patch! Yet something to improve:

[auto build test ERROR on net-next/master]

url:
https://github.com/0day-ci/linux/commits/Edward-Cree/Handle-multiple-received-packets-at-each-stage/20180630-042204
config: x86_64-randconfig-x003-201825 (attached as .config)
compiler: gcc-7 (Debian 7.3.0-16) 7.3.0
reproduce:
# save the attached .config to linux build tree
make ARCH=x86_64 

All errors (new ones prefixed by >>):

   net/ipv4/ip_input.c: In function 'ip_sublist_rcv':
>> net/ipv4/ip_input.c:524:14: error: passing argument 6 of 'NF_HOOK_LIST' from 
>> incompatible pointer type [-Werror=incompatible-pointer-types]
   head, dev, NULL, ip_rcv_finish);
 ^~~
   In file included from include/uapi/linux/netfilter_ipv4.h:9:0,
from include/linux/netfilter_ipv4.h:7,
from net/ipv4/ip_input.c:145:
   include/linux/netfilter.h:387:1: note: expected 'struct list_head *' but 
argument is of type 'struct net_device *'
NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock 
*sk,
^~~~
   net/ipv4/ip_input.c:524:25: error: passing argument 8 of 'NF_HOOK_LIST' from 
incompatible pointer type [-Werror=incompatible-pointer-types]
   head, dev, NULL, ip_rcv_finish);
^
   In file included from include/uapi/linux/netfilter_ipv4.h:9:0,
from include/linux/netfilter_ipv4.h:7,
from net/ipv4/ip_input.c:145:
   include/linux/netfilter.h:387:1: note: expected 'struct net_device *' but 
argument is of type 'int (*)(struct net *, struct sock *, struct sk_buff *)'
NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock 
*sk,
^~~~
>> net/ipv4/ip_input.c:523:2: error: too few arguments to function 
>> 'NF_HOOK_LIST'
 NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL,
 ^~~~
   In file included from include/uapi/linux/netfilter_ipv4.h:9:0,
from include/linux/netfilter_ipv4.h:7,
from net/ipv4/ip_input.c:145:
   include/linux/netfilter.h:387:1: note: declared here
NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock 
*sk,
^~~~
   cc1: some warnings being treated as errors

vim +/NF_HOOK_LIST +524 net/ipv4/ip_input.c

   517  
   518  static void ip_sublist_rcv(struct list_head *head, struct net_device 
*dev,
   519 struct net *net)
   520  {
   521  struct sk_buff *skb, *next;
   522  
 > 523  NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL,
 > 524   head, dev, NULL, ip_rcv_finish);
   525  list_for_each_entry_safe(skb, next, head, list)
   526  ip_rcv_finish(net, NULL, skb);
   527  }
   528  

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: application/gzip


Re: [net-next 01/12] net/mlx5e: Add UDP GSO support

2018-06-29 Thread Willem de Bruijn
On Fri, Jun 29, 2018 at 2:24 AM Saeed Mahameed  wrote:
>
> From: Boris Pismenny 
>
> This patch enables UDP GSO support. We enable this by using two WQEs:
> the first is a UDP LSO WQE for all segments with equal length, and the
> second is for the last segment in case it has different length.
> Due to HW limitation, before sending, we must adjust the packet length fields.
>
> We measure performance between two Intel(R) Xeon(R) CPU E5-2643 v2 @3.50GHz
> machines connected back-to-back with Connectx4-Lx (40Gbps) NICs.
> We compare single stream UDP, UDP GSO and UDP GSO with offload.
> Performance:
> | Mode            | MSS (bytes) | Throughput (Gbps) | CPU utilization (%) |
> | UDP GSO offload | 1472        | 35.6              | 8%                  |
> | UDP GSO         | 1472        | 25.5              | 17%                 |
> | UDP             | 1472        | 10.2              | 17%                 |
> | UDP GSO offload | 1024        | 35.6              | 8%                  |
> | UDP GSO         | 1024        | 19.2              | 17%                 |
> | UDP             | 1024        | 5.7               | 17%                 |
> | UDP GSO offload | 512         | 33.8              | 16%                 |
> | UDP GSO         | 512         | 10.4              | 17%                 |
> | UDP             | 512         | 3.5               | 17%                 |

Very nice results :)
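
(For context: the "UDP GSO" rows above correspond to the sender using
the UDP_SEGMENT socket option. A minimal userspace sketch, assuming a
connected UDP socket fd on a kernel with UDP GSO support:)

#include <sys/socket.h>
#include <netinet/in.h>

#ifndef UDP_SEGMENT
#define UDP_SEGMENT 103		/* from linux/udp.h */
#endif

static int enable_udp_gso(int fd, int mss)
{
	/* each large send() is then split into mss-byte datagrams */
	return setsockopt(fd, IPPROTO_UDP, UDP_SEGMENT, &mss, sizeof(mss));
}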

> +static void mlx5e_udp_gso_prepare_last_skb(struct sk_buff *skb,
> +  struct sk_buff *nskb,
> +  int remaining)
> +{
> +   int bytes_needed = remaining, remaining_headlen, remaining_page_offset;
> +   int headlen = skb_transport_offset(skb) + sizeof(struct udphdr);
> +   int payload_len = remaining + sizeof(struct udphdr);
> +   int k = 0, i, j;
> +
> +   skb_copy_bits(skb, 0, nskb->data, headlen);
> +   nskb->dev = skb->dev;
> +   skb_reset_mac_header(nskb);
> +   skb_set_network_header(nskb, skb_network_offset(skb));
> +   skb_set_transport_header(nskb, skb_transport_offset(skb));
> +   skb_set_tail_pointer(nskb, headlen);
> +
> +   /* How many frags do we need? */
> +   for (i = skb_shinfo(skb)->nr_frags - 1; i >= 0; i--) {
> +   bytes_needed -= skb_frag_size(&skb_shinfo(skb)->frags[i]);
> +   k++;
> +   if (bytes_needed <= 0)
> +   break;
> +   }
> +
> +   /* Fill the first frag and split it if necessary */
> +   j = skb_shinfo(skb)->nr_frags - k;
> +   remaining_page_offset = -bytes_needed;
> +   skb_fill_page_desc(nskb, 0,
> +  skb_shinfo(skb)->frags[j].page.p,
> +  skb_shinfo(skb)->frags[j].page_offset + remaining_page_offset,
> +  skb_shinfo(skb)->frags[j].size - remaining_page_offset);
> +
> +   skb_frag_ref(skb, j);
> +
> +   /* Fill the rest of the frags */
> +   for (i = 1; i < k; i++) {
> +   j = skb_shinfo(skb)->nr_frags - k + i;
> +
> +   skb_fill_page_desc(nskb, i,
> +  skb_shinfo(skb)->frags[j].page.p,
> +  skb_shinfo(skb)->frags[j].page_offset,
> +  skb_shinfo(skb)->frags[j].size);
> +   skb_frag_ref(skb, j);
> +   }
> +   skb_shinfo(nskb)->nr_frags = k;
> +
> +   remaining_headlen = remaining - skb->data_len;
> +
> +   /* headlen contains remaining data? */
> +   if (remaining_headlen > 0)
> +   skb_copy_bits(skb, skb->len - remaining, nskb->data + headlen,
> + remaining_headlen);
> +   nskb->len = remaining + headlen;
> +   nskb->data_len =  payload_len - sizeof(struct udphdr) +
> +   max_t(int, 0, remaining_headlen);
> +   nskb->protocol = skb->protocol;
> +   if (nskb->protocol == htons(ETH_P_IP)) {
> +   ip_hdr(nskb)->id = htons(ntohs(ip_hdr(nskb)->id) +
> +skb_shinfo(skb)->gso_segs);
> +   ip_hdr(nskb)->tot_len =
> +   htons(payload_len + sizeof(struct iphdr));
> +   } else {
> +   ipv6_hdr(nskb)->payload_len = htons(payload_len);
> +   }
> +   udp_hdr(nskb)->len = htons(payload_len);
> +   skb_shinfo(nskb)->gso_size = 0;
> +   nskb->ip_summed = skb->ip_summed;
> +   nskb->csum_start = skb->csum_start;
> +   nskb->csum_offset = skb->csum_offset;
> +   nskb->queue_mapping = skb->queue_mapping;
> +}
> +
> +/* might send skbs and update wqe and pi */
> +struct sk_buff *mlx5e_udp_gso_handle_tx_skb(struct net_device *netdev,
> +   struct mlx5e_txqsq *sq,
> +   struct sk_buff *skb,
> +   struct mlx5e_tx_wqe **wqe,
> +   u16 *pi)
> +{
> +   int payload_len = skb_shinfo(skb)->gso_size + sizeof(struct udphdr);
> +   int headlen = 

Re: [patch net-next v2 0/9] net: sched: introduce chain templates support with offloading to mlxsw

2018-06-29 Thread Cong Wang
On Fri, Jun 29, 2018 at 10:06 AM Samudrala, Sridhar
 wrote:
>
> So instead of introducing 'chaintemplate' object in the kernel, can't we add 
> 'chain'
> object in the kernel that takes the 'template' as an attribute?

This is exactly what I mean above. Making the chain a standalone object
in kernel would benefit:

1. Align with 'tc chain' in iproute2, add/del an object is natural

2. Template is an attribute of this object when creating it:
# tc chain add template 
# tc chain add ... # non-template chain

3. Easier for sharing by qdiscs:
# tc chain add X block Y ...
# tc filter add ... chain X block Y ...
# tc qdisc add dev eth0 block Y ...

The current 'ingress_block 22 ingress' syntax is ugly.


Anyone know if strongswan works with vrf?

2018-06-29 Thread Ben Greear

Hello,

We're trying to create lots of strongswan VPN tunnels on network devices
bound to different VRFs.  We are using Fedora-24 on the client side, with a 
4.16.15+ kernel
and updated 'ip' package, etc.

So far, no luck getting it to work.

Any idea if this is supported or not?

Thanks,
Ben
--
Ben Greear 
Candela Technologies Inc  http://www.candelatech.com



Re: [PATCH v3 net-next 7/9] net: ipv4: listified version of ip_rcv

2018-06-29 Thread kbuild test robot
Hi Edward,

I love your patch! Perhaps something to improve:

[auto build test WARNING on net-next/master]

url:
https://github.com/0day-ci/linux/commits/Edward-Cree/Handle-multiple-received-packets-at-each-stage/20180630-042204
config: i386-randconfig-a0-201825 (attached as .config)
compiler: gcc-4.9 (Debian 4.9.4-2) 4.9.4
reproduce:
# save the attached .config to linux build tree
make ARCH=i386 

All warnings (new ones prefixed by >>):

   net//ipv4/ip_input.c: In function 'ip_sublist_rcv':
>> net//ipv4/ip_input.c:524:14: warning: passing argument 6 of 'NF_HOOK_LIST' 
>> from incompatible pointer type
   head, dev, NULL, ip_rcv_finish);
 ^
   In file included from include/uapi/linux/netfilter_ipv4.h:9:0,
from include/linux/netfilter_ipv4.h:7,
from net//ipv4/ip_input.c:145:
   include/linux/netfilter.h:387:1: note: expected 'struct list_head *' but 
argument is of type 'struct net_device *'
NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock 
*sk,
^
   net//ipv4/ip_input.c:524:25: warning: passing argument 8 of 'NF_HOOK_LIST' 
from incompatible pointer type
   head, dev, NULL, ip_rcv_finish);
^
   In file included from include/uapi/linux/netfilter_ipv4.h:9:0,
from include/linux/netfilter_ipv4.h:7,
from net//ipv4/ip_input.c:145:
   include/linux/netfilter.h:387:1: note: expected 'struct net_device *' but 
argument is of type 'int (*)(struct net *, struct sock *, struct sk_buff *)'
NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock 
*sk,
^
   net//ipv4/ip_input.c:523:2: error: too few arguments to function 
'NF_HOOK_LIST'
 NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL,
 ^
   In file included from include/uapi/linux/netfilter_ipv4.h:9:0,
from include/linux/netfilter_ipv4.h:7,
from net//ipv4/ip_input.c:145:
   include/linux/netfilter.h:387:1: note: declared here
NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock 
*sk,
^

vim +/NF_HOOK_LIST +524 net//ipv4/ip_input.c

   517  
   518  static void ip_sublist_rcv(struct list_head *head, struct net_device 
*dev,
   519 struct net *net)
   520  {
   521  struct sk_buff *skb, *next;
   522  
   523  NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL,
 > 524   head, dev, NULL, ip_rcv_finish);
   525  list_for_each_entry_safe(skb, next, head, list)
   526  ip_rcv_finish(net, NULL, skb);
   527  }
   528  

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: application/gzip


[PATCH ipsec-next] xfrm: Allow Set Mark to be Updated Using UPDSA

2018-06-29 Thread Nathan Harold
Allow UPDSA to change "set mark" to permit
policy separation of packet routing decisions from
SA keying in systems that use mark-based routing.

The set mark, used as a routing and firewall mark
for outbound packets, is made updatable, which
allows routing decisions to be handled independently
of keying/SA creation. To maintain consistency with
other optional attributes, the set mark is only
updated if sent with a non-zero value.

The per-SA lock and the xfrm_state_lock are taken in
that order to avoid a deadlock with
xfrm_timer_handler(), which also takes the locks in
that order.

Signed-off-by: Nathan Harold 
Change-Id: Ia05c6733a94c1901cd1e54eb7c7e237704678d71
---
 net/xfrm/xfrm_state.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index e04a510ec992..c9ffcdfa89f6 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -1562,6 +1562,15 @@ int xfrm_state_update(struct xfrm_state *x)
if (x1->curlft.use_time)
xfrm_state_check_expire(x1);
 
+   if (x->props.smark.m || x->props.smark.v) {
+   spin_lock_bh(&net->xfrm.xfrm_state_lock);
+
+   x1->props.smark = x->props.smark;
+
+   __xfrm_state_bump_genids(x1);
+   spin_unlock_bh(&net->xfrm.xfrm_state_lock);
+   }
+
err = 0;
x->km.state = XFRM_STATE_DEAD;
__xfrm_state_put(x);
-- 
2.18.0.399.gad0ab374a1-goog



Re: [Patch net] net: use dev_change_tx_queue_len() for SIOCSIFTXQLEN

2018-06-29 Thread Eric Dumazet



On 06/29/2018 01:42 PM, Cong Wang wrote:
> As noticed by Eric, we need to switch to the helper
> dev_change_tx_queue_len() for the SIOCSIFTXQLEN call path too,
> otherwise we still miss dev_qdisc_change_tx_queue_len().
> 
> Fixes: 6a643ddb5624 ("net: introduce helper dev_change_tx_queue_len()")
> Reported-by: Eric Dumazet 
> Signed-off-by: Cong Wang 
> ---
>

Reviewed-by: Eric Dumazet 

Thanks !



[PATCH net] net: fib_rules: bring back rule_exists to match rule during add

2018-06-29 Thread Roopa Prabhu
From: Roopa Prabhu 

After commit f9d4b0c1e969 ("fib_rules: move common handling of newrule
delrule msgs into fib_nl2rule"), rule_exists got replaced by rule_find
for existing rule lookup in both the add and del paths. While this
is good for the delete path, it solves a few problems but opens up
a few invalid key matches in the add path.

$ip -4 rule add table main tos 10 fwmark 1
$ip -4 rule add table main tos 10
RTNETLINK answers: File exists

The problem here is that rule_find does not check whether the key masks
in the new and old rule are the same, and hence ends up matching a more
specific rule. Rule key masks cannot be easily compared today without
an elaborate if-else block. It's best to introduce key masks for easier
and accurate rule comparison in the future. Until then, due to fear of
regressions, this patch re-introduces the older, looser rule_exists
during add. Also fixes both rule_exists and rule_find to cover missing
attributes.

Fixes: f9d4b0c1e969 ("fib_rules: move common handling of newrule delrule msgs 
into fib_nl2rule")
Signed-off-by: Roopa Prabhu 
---
This does bring back old problems with rule_exists. But at this hour, old
problems are better than new problems! The pref compare in rule_exists can
be fixed, but it's best done in net-next. I will look at adding more
coverage in fib rule selftests. Thanks.
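
For the curious, an illustrative-only sketch of the "key masks" direction
(the struct and names are hypothetical, not the in-tree struct fib_rule):
record which keys the user supplied and compare that set first, so
"tos 10 fwmark 1" can never match a plain "tos 10" rule.

struct rule_keys {
	u32 present;		/* bitmask of keys the user actually set */
	u32 mark, mark_mask;
	u8  tos;
};

static bool rule_keys_match(const struct rule_keys *a,
			    const struct rule_keys *b)
{
	if (a->present != b->present)
		return false;	/* different key sets never match */

	return a->mark == b->mark && a->mark_mask == b->mark_mask &&
	       a->tos == b->tos;
}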

 net/core/fib_rules.c | 72 +++-
 1 file changed, 71 insertions(+), 1 deletion(-)

diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index bc8425d..f64aa13 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -444,6 +444,9 @@ static struct fib_rule *rule_find(struct fib_rules_ops *ops,
if (rule->ip_proto && r->ip_proto != rule->ip_proto)
continue;
 
+   if (rule->proto && r->proto != rule->proto)
+   continue;
+
if (fib_rule_port_range_set(&rule->sport_range) &&
    !fib_rule_port_range_compare(&r->sport_range,
                                 &rule->sport_range))
@@ -653,6 +656,73 @@ static int fib_nl2rule(struct sk_buff *skb, struct 
nlmsghdr *nlh,
return err;
 }
 
+static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh,
+  struct nlattr **tb, struct fib_rule *rule)
+{
+   struct fib_rule *r;
+
list_for_each_entry(r, &ops->rules_list, list) {
+   if (r->action != rule->action)
+   continue;
+
+   if (r->table != rule->table)
+   continue;
+
+   if (r->pref != rule->pref)
+   continue;
+
+   if (memcmp(r->iifname, rule->iifname, IFNAMSIZ))
+   continue;
+
+   if (memcmp(r->oifname, rule->oifname, IFNAMSIZ))
+   continue;
+
+   if (r->mark != rule->mark)
+   continue;
+
+   if (r->suppress_ifgroup != rule->suppress_ifgroup)
+   continue;
+
+   if (r->suppress_prefixlen != rule->suppress_prefixlen)
+   continue;
+
+   if (r->mark_mask != rule->mark_mask)
+   continue;
+
+   if (r->tun_id != rule->tun_id)
+   continue;
+
+   if (r->fr_net != rule->fr_net)
+   continue;
+
+   if (r->l3mdev != rule->l3mdev)
+   continue;
+
+   if (!uid_eq(r->uid_range.start, rule->uid_range.start) ||
+   !uid_eq(r->uid_range.end, rule->uid_range.end))
+   continue;
+
+   if (r->ip_proto != rule->ip_proto)
+   continue;
+
+   if (r->proto != rule->proto)
+   continue;
+
+   if (!fib_rule_port_range_compare(&r->sport_range,
+                                    &rule->sport_range))
+   continue;
+
+   if (!fib_rule_port_range_compare(&r->dport_range,
+                                    &rule->dport_range))
+   continue;
+
+   if (!ops->compare(r, frh, tb))
+   continue;
+   return 1;
+   }
+   return 0;
+}
+
 int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
   struct netlink_ext_ack *extack)
 {
@@ -687,7 +757,7 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr 
*nlh,
goto errout;
 
if ((nlh->nlmsg_flags & NLM_F_EXCL) &&
-   rule_find(ops, frh, tb, rule, user_priority)) {
+   rule_exists(ops, frh, tb, rule)) {
err = -EEXIST;
goto errout_free;
}
-- 
2.1.4



[Patch net] net: use dev_change_tx_queue_len() for SIOCSIFTXQLEN

2018-06-29 Thread Cong Wang
As noticed by Eric, we need to switch to the helper
dev_change_tx_queue_len() for the SIOCSIFTXQLEN call path too,
otherwise we still miss dev_qdisc_change_tx_queue_len().

Fixes: 6a643ddb5624 ("net: introduce helper dev_change_tx_queue_len()")
Reported-by: Eric Dumazet 
Signed-off-by: Cong Wang 
---
 net/core/dev_ioctl.c | 11 ++-
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index a04e1e88bf3a..50537ff961a7 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -285,16 +285,9 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, 
unsigned int cmd)
if (ifr->ifr_qlen < 0)
return -EINVAL;
if (dev->tx_queue_len ^ ifr->ifr_qlen) {
-   unsigned int orig_len = dev->tx_queue_len;
-
-   dev->tx_queue_len = ifr->ifr_qlen;
-   err = call_netdevice_notifiers(
-   NETDEV_CHANGE_TX_QUEUE_LEN, dev);
-   err = notifier_to_errno(err);
-   if (err) {
-   dev->tx_queue_len = orig_len;
+   err = dev_change_tx_queue_len(dev, ifr->ifr_qlen);
+   if (err)
return err;
-   }
}
return 0;
 
-- 
2.14.4



Re: [PATCH v1 net-next 14/14] net/sched: Make etf report drops on error_queue

2018-06-29 Thread Jesus Sanchez-Palencia




On 06/29/2018 11:49 AM, Willem de Bruijn wrote:
 diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c
 +static void report_sock_error(struct sk_buff *skb, u32 err, u8 code)
 +{
 +   struct sock_exterr_skb *serr;
 +   ktime_t txtime = skb->tstamp;
 +
 +   if (!skb->sk || !(skb->sk->sk_txtime_flags & 
 SK_TXTIME_RECV_ERR_MASK))
 +   return;
 +
 +   skb = skb_clone_sk(skb);
 +   if (!skb)
 +   return;
 +
 +   sock_hold(skb->sk);
>>>
>>> Why take an extra reference? The skb holds a ref on the sk.
>>
>>
>> Yes, the cloned skb holds a ref on the socket, but the documentation of
>> skb_clone_sk() makes this explicit suggestion:
>>
>> (...)
>>  * When passing buffers allocated with this function to sock_queue_err_skb
>>  * it is necessary to wrap the call with sock_hold/sock_put in order to
>>  * prevent the socket from being released prior to being enqueued on
>>  * the sk_error_queue.
>>  */
>>
>> which I believe is here just so we are protected against a possible race 
>> after
>> skb_orphan() is called from sock_queue_err_skb(). Please let me know if I'm
>> misreading anything.
> 
> Yes, indeed. Code only has to worry about that if there are no
> concurrent references on the socket.
> 
> I may be mistaken, but I believe that this complicated logic exists
> only for cases where the timestamp may be queued after the original
> skb has been released. Specifically, when a tx timestamp is returned
> from a hardware device after transmission of the original skb. Then
> the cloned timestamp skb needs its own reference on the sk while it
> is waiting for the timestamp data (i.e., until the device completion
> arrives) and then we need a temporary extra ref to work around the
> skb_orphan in sock_queue_err_skb.
> 
> Compare skb_complete_tx_timestamp with skb_tstamp_tx. The second is
> used in the regular datapath to clone an skb and queue it on the
> error queue immediately, while holding the original skb. This does
> not call skb_clone_sk and does not need the extra sock_hold. This
> should be good enough for this code path, too. As the skb holds a
> ref on skb->sk, the socket cannot go away in the middle of
> report_sock_error.


Oh, that makes sense. Great, I will give this a try and add it to the v2.

Thanks,
Jesus
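
(For reference, a minimal sketch of the pattern Willem describes above;
error-fill details are elided and the shape mirrors skb_tstamp_tx()
rather than skb_clone_sk() + sock_hold():)

static void report_sock_error(struct sk_buff *skb, u32 err, u8 code)
{
	struct sock *sk = skb->sk;
	struct sk_buff *clone;

	if (!sk)
		return;

	clone = skb_clone(skb, GFP_ATOMIC);	/* takes no extra sk ref */
	if (!clone)
		return;

	/* ... fill in the sock_exterr_skb fields on 'clone' here ... */

	/* 'skb' is still held by the caller and pins 'sk', so no
	 * sock_hold()/sock_put() pair is needed around the queueing.
	 */
	if (sock_queue_err_skb(sk, clone))
		kfree_skb(clone);
}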



Re: [PATCH net] tcp: prevent bogus FRTO undos with non-SACK flows

2018-06-29 Thread Ilpo Järvinen
On Fri, 29 Jun 2018, Neal Cardwell wrote:

> On Fri, Jun 29, 2018 at 6:07 AM Ilpo Järvinen  
> wrote:
> >
> > If SACK is not enabled and the first cumulative ACK after the RTO
> > retransmission covers more than the retransmitted skb, a spurious
> > FRTO undo will trigger (assuming FRTO is enabled for that RTO).
> > The reason is that any non-retransmitted segment acknowledged will
> > set FLAG_ORIG_SACK_ACKED in tcp_clean_rtx_queue even if there is
> > no indication that it would have been delivered for real (the
> > scoreboard is not kept with TCPCB_SACKED_ACKED bits in the non-SACK
> > case so the check for that bit won't help like it does with SACK).
> > Having FLAG_ORIG_SACK_ACKED set results in the spurious FRTO undo
> > in tcp_process_loss.
> >
> > We need to use more strict condition for non-SACK case and check
> > that none of the cumulatively ACKed segments were retransmitted
> > to prove that progress is due to original transmissions. Only then
> > keep FLAG_ORIG_SACK_ACKED set, allowing FRTO undo to proceed in
> > non-SACK case.
> >
> > (FLAG_ORIG_SACK_ACKED is planned to be renamed to FLAG_ORIG_PROGRESS
> > to better indicate its purpose but to keep this change minimal, it
> > will be done in another patch).
> >
> > Besides burstiness and congestion control violations, this problem
> > can result in an RTO loop: when the loss recovery is prematurely
> > undone, only new data will be transmitted (if available) and
> > the next retransmission can occur only after a new RTO, which in case
> > of multiple losses (that are not for consecutive packets) requires
> > one RTO per loss to recover.
> >
> > Signed-off-by: Ilpo Järvinen 
> > ---
> >  net/ipv4/tcp_input.c | 9 +
> >  1 file changed, 9 insertions(+)
> >
> > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> > index 045d930..8e5522c 100644
> > --- a/net/ipv4/tcp_input.c
> > +++ b/net/ipv4/tcp_input.c
> > @@ -3181,6 +3181,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 
> > prior_fack,
> >
> > if (tcp_is_reno(tp)) {
> > tcp_remove_reno_sacks(sk, pkts_acked);
> > +
> > +   /* If any of the cumulatively ACKed segments was
> > +* retransmitted, non-SACK case cannot confirm that
> > +* progress was due to original transmission due to
> > +* lack of TCPCB_SACKED_ACKED bits even if some of
> > +* the packets may have been never retransmitted.
> > +*/
> > +   if (flag & FLAG_RETRANS_DATA_ACKED)
> > +   flag &= ~FLAG_ORIG_SACK_ACKED;
> > } else {
> > int delta;
> 
> Thanks, Ilpo. I ran this through our TCP packetdrill tests and only
> got one failure, which detected the change you made, so that on the
> first ACK in a non-SACK F-RTO case we stay in CA_Loss.

Great, thanks for testing.

> However, depending on the exact scenario we are concerned about here,
> this may not be a complete solution. Consider the packetdrill test I
> cooked below, which is for a scenario where we imagine a non-SACK
> connection, with data packet #1 and all dupacks lost. With your patch
> we don't have an F-RTO undo on the first cumulative ACK, which is a
> definite improvement. However, we end up with an F-RTO undo on the
> *second* cumulative ACK, because the second ACK didn't cover any
> retransmitted data. That means that we end up in the second round trip
> back in the initial slow-start mode with a very high cwnd and infinite
> ssthresh, even though data was actually lost in the first round trip.
>
> I'm not sure how much we care about all these cases. Perhaps we should
> just disable F-RTO if the connection has no SACK enabled? These
> non-SACK connections are just such a rare case at this point that I
> would propose it's not worth spending too much time on this.
> 
> The following packetdrill script passes when Ilpo's patch is applied.
> This documents the behavior, which I think is questionable:
> 
> 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
>+0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
>+0 bind(3, ..., ...) = 0
>+0 listen(3, 1) = 0
> 
>+0 < S 0:0(0) win 32792 
>+0 > S. 0:0(0) ack 1 
>  +.02 < . 1:1(0) ack 1 win 257
>+0 accept(3, ..., ...) = 4
>+0 write(4, ..., 27000) = 27000
>+0 > . 1:10001(1) ack 1
> 
> // Suppose 1:1001 is lost and all dupacks are lost.
> 
> // RTO and retransmit head. This fills a real hole.
>  +.22 > . 1:1001(1000) ack 1
> 
> +.005 < . 1:1(0) ack 2001 win 257

Why did the receiver send a cumulative ACK only for 2001?

>+0 > . 10001:13001(3000) ack 1
>+0 %{ assert tcpi_ca_state == TCP_CA_Loss, tcpi_ca_state }%
>+0 %{ assert tcpi_snd_cwnd == 3, tcpi_snd_cwnd }%
>+0 %{ assert tcpi_snd_ssthresh == 7, tcpi_snd_ssthresh }%
> 
> +.002 < . 1:1(0) ack 10001 win 257

...And then magically all packets 

Re: [PATCH net] net: fib_rules: add protocol check in rule_find

2018-06-29 Thread Roopa Prabhu
On Thu, Jun 28, 2018 at 9:59 PM, Roopa Prabhu  wrote:
> On Wed, Jun 27, 2018 at 6:27 PM, Roopa Prabhu  
> wrote:
>> From: Roopa Prabhu 
>>
>> After commit f9d4b0c1e969 ("fib_rules: move common handling of newrule
>> delrule msgs into fib_nl2rule"), rule_find is strict about checking
>> for an existing rule. rule_find must check against all
>> user given attributes, else it may match against a subset
>> of attributes and return an existing rule.
>>
>> In the below case, without support for protocol match, rule_find
>> will match only against 'table main' and return an existing rule.
>>
>> $ip -4 rule add table main protocol boot
>> RTNETLINK answers: File exists
>>
>> This patch adds protocol support to rule_find, forcing it to
>> check protocol match if given by the user.
>>
>> Fixes: f9d4b0c1e969 ("fib_rules: move common handling of newrule delrule 
>> msgs into fib_nl2rule")
>> Signed-off-by: Roopa Prabhu 
>> ---
>> I spent some time looking at all match keys today and protocol
>> was the only missing one (protocol is not in a released kernel yet).
>> The only way this could be avoided is to move back to the old loose
>> rule_find. I am worried about this new strict checking surprising users,
>> but going back to the previous loose checking does not seem right either.
>> If there is a reason to believe that users did rely on the previous
>> behaviour, I will be happy to revert. Here is another example of old and
>> new behaviour.
>>
>> old rule_find behaviour:
>> $ip -4 rule add table main protocol boot
>> $ip -4 rule add table main protocol boot
>> $ip -4 rule add table main protocol boot
>> $ip rule show
>> 0:  from all lookup local
>> 32763:  from all lookup main  proto boot
>> 32764:  from all lookup main  proto boot
>> 32765:  from all lookup main  proto boot
>> 32766:  from all lookup main
>> 32767:  from all lookup default
>>
>> new rule_find behaviour (after this patch):
>> $ip -4 rule add table main protocol boot
>> $ip -4 rule add table main protocol boot
>> RTNETLINK answers: File exists
>>
>
> I found the case where the new rule_find breaks for add.
> $ip -4 rule add table main tos 10 fwmark 1
> $ip -4 rule add table main tos 10
> RTNETLINK answers: File exists
>
> The key masks in the new and old rule need to be compared. They cannot
> be easily compared today without an elaborate if-else block. It's best
> to introduce key masks for easier and accurate rule comparison, but
> that is best done in net-next. I will submit an incremental patch
> tomorrow to restore the previous rule_exists for the add case, and the
> rest should be good.
>
> The current patch in context is needed regardless.
>
> Thanks (and sorry about the iterations).

as I write the commit msg for the new incremental patch, it seems
better to merge this one with the new one.

Please ignore this patch, I will send an updated patch in a bit. thanks.


[PATCH v3 net-next 9/9] net: don't bother calling list RX functions on empty lists

2018-06-29 Thread Edward Cree
Generally the check should be very cheap, as the sk_buff_head is in cache.

Signed-off-by: Edward Cree 
---
 net/core/dev.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 4c5ebfab9bc8..d6084b0cd9ce 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4773,7 +4773,8 @@ static void __netif_receive_skb_list(struct list_head 
*head)
 
/* Handle the previous sublist */
list_cut_before(&sublist, head, &skb->list);
-   __netif_receive_skb_list_core(&sublist, pfmemalloc);
+   if (!list_empty(&sublist))
+   __netif_receive_skb_list_core(&sublist, pfmemalloc);
pfmemalloc = !pfmemalloc;
/* See comments in __netif_receive_skb */
if (pfmemalloc)
@@ -4783,7 +4784,8 @@ static void __netif_receive_skb_list(struct list_head 
*head)
}
}
/* Handle the remaining sublist */
-   __netif_receive_skb_list_core(head, pfmemalloc);
+   if (!list_empty(head))
+   __netif_receive_skb_list_core(head, pfmemalloc);
/* Restore pflags */
if (pfmemalloc)
memalloc_noreclaim_restore(noreclaim_flag);
@@ -4944,6 +4946,8 @@ void netif_receive_skb_list(struct list_head *head)
 {
struct sk_buff *skb;
 
+   if (list_empty(head))
+   return;
list_for_each_entry(skb, head, list)
trace_netif_receive_skb_list_entry(skb);
netif_receive_skb_list_internal(head);


[PATCH v3 net-next 6/9] net: core: propagate SKB lists through packet_type lookup

2018-06-29 Thread Edward Cree
__netif_receive_skb_core() does a depressingly large amount of per-packet
 work that can't easily be listified, because the another_round looping
 makes it nontrivial to slice up into smaller functions.
Fortunately, most of that work disappears in the fast path:
 * Hardware devices generally don't have an rx_handler
 * Unless you're tcpdumping or something, there is usually only one ptype
 * VLAN processing comes before the protocol ptype lookup, so doesn't force
   a pt_prev deliver
 so normally, __netif_receive_skb_core() will run straight through and pass
 back the one ptype found in ptype_base[hash of skb->protocol].
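
(That is, the common case boils down to roughly the one hash-bucket walk
below, paraphrased from deliver_ptype_list_skb() and its call site in
net/core/dev.c; simplified, so details may differ. The sole match ends up
in pt_prev and is delivered after the loop:)

	struct packet_type *ptype, *pt_prev = NULL;
	__be16 type = skb->protocol;

	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type != type)
			continue;
		if (pt_prev)
			deliver_skb(skb, pt_prev, orig_dev);
		pt_prev = ptype;	/* usually set exactly once */
	}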

Signed-off-by: Edward Cree 
---
 net/core/dev.c | 72 +++---
 1 file changed, 64 insertions(+), 8 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index d2454678bc82..edd67b1f1e12 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4494,7 +4494,8 @@ static inline int nf_ingress(struct sk_buff *skb, struct 
packet_type **pt_prev,
return 0;
 }
 
-static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
+static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc,
+   struct packet_type **ppt_prev)
 {
struct packet_type *ptype, *pt_prev;
rx_handler_func_t *rx_handler;
@@ -4624,8 +4625,7 @@ static int __netif_receive_skb_core(struct sk_buff *skb, 
bool pfmemalloc)
if (pt_prev) {
if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
goto drop;
-   else
-   ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+   *ppt_prev = pt_prev;
} else {
 drop:
if (!deliver_exact)
@@ -4643,6 +4643,18 @@ static int __netif_receive_skb_core(struct sk_buff *skb, 
bool pfmemalloc)
return ret;
 }
 
+static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
+{
+   struct net_device *orig_dev = skb->dev;
+   struct packet_type *pt_prev = NULL;
+   int ret;
+
+   ret = __netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
+   if (pt_prev)
+   ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+   return ret;
+}
+
 /**
  * netif_receive_skb_core - special purpose version of netif_receive_skb
  * @skb: buffer to process
@@ -4663,19 +4675,63 @@ int netif_receive_skb_core(struct sk_buff *skb)
int ret;
 
rcu_read_lock();
-   ret = __netif_receive_skb_core(skb, false);
+   ret = __netif_receive_skb_one_core(skb, false);
rcu_read_unlock();
 
return ret;
 }
 EXPORT_SYMBOL(netif_receive_skb_core);
 
-static void __netif_receive_skb_list_core(struct list_head *head, bool 
pfmemalloc)
+static inline void __netif_receive_skb_list_ptype(struct list_head *head,
+ struct packet_type *pt_prev,
+ struct net_device *orig_dev)
 {
struct sk_buff *skb, *next;
 
+   if (!pt_prev)
+   return;
+   if (list_empty(head))
+   return;
+
list_for_each_entry_safe(skb, next, head, list)
-   __netif_receive_skb_core(skb, pfmemalloc);
+   pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+}
+
+static void __netif_receive_skb_list_core(struct list_head *head, bool 
pfmemalloc)
+{
+   /* Fast-path assumptions:
+* - There is no RX handler.
+* - Only one packet_type matches.
+* If either of these fails, we will end up doing some per-packet
+* processing in-line, then handling the 'last ptype' for the whole
+* sublist.  This can't cause out-of-order delivery to any single ptype,
+* because the 'last ptype' must be constant across the sublist, and all
+* other ptypes are handled per-packet.
+*/
+   /* Current (common) ptype of sublist */
+   struct packet_type *pt_curr = NULL;
+   /* Current (common) orig_dev of sublist */
+   struct net_device *od_curr = NULL;
+   struct list_head sublist;
+   struct sk_buff *skb, *next;
+
+   list_for_each_entry_safe(skb, next, head, list) {
+   struct net_device *orig_dev = skb->dev;
+   struct packet_type *pt_prev = NULL;
+
+   __netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
+   if (pt_curr != pt_prev || od_curr != orig_dev) {
+   /* dispatch old sublist */
+   list_cut_before(&sublist, head, &skb->list);
+   __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
+   /* start new sublist */
+   pt_curr = pt_prev;
+   od_curr = orig_dev;
+   }
+   }
+
+   /* dispatch final sublist */
+   __netif_receive_skb_list_ptype(head, pt_curr, od_curr);
 }
 
 static int __netif_receive_skb(struct sk_buff *skb)
@@ -4695,10 

[PATCH v3 net-next 7/9] net: ipv4: listified version of ip_rcv

2018-06-29 Thread Edward Cree
Also involved adding a way to run a netfilter hook over a list of packets.
 Rather than attempting to make netfilter know about lists (which would be
 a major project in itself) we just let it call the regular okfn (in this
 case ip_rcv_finish()) for any packets it steals, and have it give us back
 a list of packets it's synchronously accepted (which normally NF_HOOK
 would automatically call okfn() on, but we want to be able to potentially
 pass the list to a listified version of okfn().)
The netfilter hooks themselves are indirect calls that still happen per-
 packet (see nf_hook_entry_hookfn()), but again, changing that can be left
 for future work.

There is potential for out-of-order receives if the netfilter hook ends up
 synchronously stealing packets, as they will be processed before any
 accepts earlier in the list.  However, it was already possible for an
 asynchronous accept to cause out-of-order receives, so presumably this is
 considered OK.

Signed-off-by: Edward Cree 
---
 include/linux/netdevice.h |  3 +++
 include/linux/netfilter.h | 25 +
 include/net/ip.h  |  2 ++
 net/core/dev.c|  8 +++---
 net/ipv4/af_inet.c|  1 +
 net/ipv4/ip_input.c   | 68 ++-
 6 files changed, 97 insertions(+), 10 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e104b2e4a735..fe81a2bfcd08 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2291,6 +2291,9 @@ struct packet_type {
 struct net_device *,
 struct packet_type *,
 struct net_device *);
+   void(*list_func) (struct list_head *,
+ struct packet_type *,
+ struct net_device *);
bool(*id_match)(struct packet_type *ptype,
struct sock *sk);
void*af_packet_priv;
diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index dd2052f0efb7..2a41f53be1ce 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -288,6 +288,20 @@ NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, 
struct sock *sk, struct
return ret;
 }
 
+static inline void
+NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
+struct list_head *head, struct net_device *in, struct net_device 
*out,
+int (*okfn)(struct net *, struct sock *, struct sk_buff *))
+{
+   struct sk_buff *skb, *next;
+
+   list_for_each_entry_safe(skb, next, head, list) {
+   int ret = nf_hook(pf, hook, net, sk, skb, in, out, okfn);
+   if (ret != 1)
list_del(&skb->list);
+   }
+}
+
 /* Call setsockopt() */
 int nf_setsockopt(struct sock *sk, u_int8_t pf, int optval, char __user *opt,
  unsigned int len);
@@ -369,6 +383,17 @@ NF_HOOK(uint8_t pf, unsigned int hook, struct net *net, 
struct sock *sk,
return okfn(net, sk, skb);
 }
 
+static inline void
+NF_HOOK_LIST(uint8_t pf, unsigned int hook, struct net *net, struct sock *sk,
+struct list_head *head, struct list_head *sublist,
+struct net_device *in, struct net_device *out,
+int (*okfn)(struct net *, struct sock *, struct sk_buff *))
+{
+   INIT_LIST_HEAD(sublist);
+   /* Move everything to the sublist */
+   list_splice_init(head, sublist);
+}
+
 static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
  struct sock *sk, struct sk_buff *skb,
  struct net_device *indev, struct net_device *outdev,
diff --git a/include/net/ip.h b/include/net/ip.h
index 0d2281b4b27a..1de72f9cb23c 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -138,6 +138,8 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct 
sock *sk,
  struct ip_options_rcu *opt);
 int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
   struct net_device *orig_dev);
+void ip_list_rcv(struct list_head *head, struct packet_type *pt,
+struct net_device *orig_dev);
 int ip_local_deliver(struct sk_buff *skb);
 int ip_mr_input(struct sk_buff *skb);
 int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb);
diff --git a/net/core/dev.c b/net/core/dev.c
index edd67b1f1e12..4c5ebfab9bc8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4692,9 +4692,11 @@ static inline void __netif_receive_skb_list_ptype(struct 
list_head *head,
return;
if (list_empty(head))
return;
-
-   list_for_each_entry_safe(skb, next, head, list)
-   pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+   if (pt_prev->list_func != NULL)
+   

[PATCH v3 net-next 8/9] net: ipv4: listify ip_rcv_finish

2018-06-29 Thread Edward Cree
ip_rcv_finish_core(), if it does not drop, sets skb->dst by either early
 demux or route lookup.  The last step, calling dst_input(skb), is left to
 the caller; in the listified case, we split to form sublists with a common
 dst, but then ip_sublist_rcv_finish() just calls dst_input(skb) in a loop.
The next step in listification would thus be to add a list_input() method
 to struct dst_entry.

Early demux is an indirect call based on iph->protocol; this is another
 opportunity for listification which is not taken here (it would require
 slicing up ip_rcv_finish_core() to allow splitting on protocol changes).

Signed-off-by: Edward Cree 
---
 net/ipv4/ip_input.c | 54 +++--
 1 file changed, 48 insertions(+), 6 deletions(-)

diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 914240830bdf..24b9b0210aeb 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -307,7 +307,8 @@ static inline bool ip_rcv_options(struct sk_buff *skb)
return true;
 }
 
-static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+static int ip_rcv_finish_core(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
 {
const struct iphdr *iph = ip_hdr(skb);
int (*edemux)(struct sk_buff *skb);
@@ -393,7 +394,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, 
struct sk_buff *skb)
goto drop;
}
 
-   return dst_input(skb);
+   return NET_RX_SUCCESS;
 
 drop:
kfree_skb(skb);
@@ -405,6 +406,15 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, 
struct sk_buff *skb)
goto drop;
 }
 
+static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+   int ret = ip_rcv_finish_core(net, sk, skb);
+
+   if (ret != NET_RX_DROP)
+   ret = dst_input(skb);
+   return ret;
+}
+
 /*
  * Main IP Receive routine.
  */
@@ -515,15 +525,47 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, 
struct packet_type *pt,
   ip_rcv_finish);
 }
 
-static void ip_sublist_rcv(struct list_head *head, struct net_device *dev,
-  struct net *net)
+static void ip_sublist_rcv_finish(struct list_head *head)
 {
struct sk_buff *skb, *next;
 
+   list_for_each_entry_safe(skb, next, head, list)
+   dst_input(skb);
+}
+
+static void ip_list_rcv_finish(struct net *net, struct sock *sk,
+  struct list_head *head)
+{
+   struct dst_entry *curr_dst = NULL;
+   struct sk_buff *skb, *next;
+   struct list_head sublist;
+
+   list_for_each_entry_safe(skb, next, head, list) {
+   struct dst_entry *dst;
+
+   if (ip_rcv_finish_core(net, sk, skb) == NET_RX_DROP)
+   continue;
+
+   dst = skb_dst(skb);
+   if (curr_dst != dst) {
+   /* dispatch old sublist */
+   list_cut_before(&sublist, head, &skb->list);
+   if (!list_empty(&sublist))
+   ip_sublist_rcv_finish(&sublist);
+   /* start new sublist */
+   curr_dst = dst;
+   }
+   }
+   /* dispatch final sublist */
+   ip_sublist_rcv_finish(head);
+}
+
+static void ip_sublist_rcv(struct list_head *head, struct net_device *dev,
+  struct net *net)
+{
NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL,
 head, dev, NULL, ip_rcv_finish);
-   list_for_each_entry_safe(skb, next, head, list)
-   ip_rcv_finish(net, NULL, skb);
+   ip_list_rcv_finish(net, NULL, head);
 }
 
 /* Receive a list of IP packets */



[PATCH v3 net-next 5/9] net: core: another layer of lists, around PF_MEMALLOC skb handling

2018-06-29 Thread Edward Cree
First example of a layer splitting the list (rather than merely taking
 individual packets off it).
Involves new list.h function, list_cut_before(), like list_cut_position()
 but cuts on the other side of the given entry.
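
For illustration, a minimal usage sketch (not part of the patch):

	/* head: [A, B, C, D]; entry points at C */
	LIST_HEAD(sub);

	list_cut_before(&sub, head, entry);
	/* now sub: [A, B] and head: [C, D], whereas
	 * list_cut_position(&sub, head, entry) would have cut
	 * after C, leaving sub: [A, B, C] and head: [D] */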

Signed-off-by: Edward Cree 
---
 include/linux/list.h | 30 ++
 net/core/dev.c   | 44 
 2 files changed, 66 insertions(+), 8 deletions(-)

diff --git a/include/linux/list.h b/include/linux/list.h
index 4b129df4d46b..de04cc5ed536 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -285,6 +285,36 @@ static inline void list_cut_position(struct list_head 
*list,
__list_cut_position(list, head, entry);
 }
 
+/**
+ * list_cut_before - cut a list into two, before given entry
+ * @list: a new list to add all removed entries
+ * @head: a list with entries
+ * @entry: an entry within head, could be the head itself
+ *
+ * This helper moves the initial part of @head, up to but
+ * excluding @entry, from @head to @list.  You should pass
+ * in @entry an element you know is on @head.  @list should
+ * be an empty list or a list you do not care about losing
+ * its data.
+ * If @entry == @head, all entries on @head are moved to
+ * @list.
+ */
+static inline void list_cut_before(struct list_head *list,
+  struct list_head *head,
+  struct list_head *entry)
+{
+   if (head->next == entry) {
+   INIT_LIST_HEAD(list);
+   return;
+   }
+   list->next = head->next;
+   list->next->prev = list;
+   list->prev = entry->prev;
+   list->prev->next = list;
+   head->next = entry;
+   entry->prev = head;
+}
+
 static inline void __list_splice(const struct list_head *list,
 struct list_head *prev,
 struct list_head *next)
diff --git a/net/core/dev.c b/net/core/dev.c
index d7f2a880aeed..d2454678bc82 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4670,6 +4670,14 @@ int netif_receive_skb_core(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(netif_receive_skb_core);
 
+static void __netif_receive_skb_list_core(struct list_head *head, bool 
pfmemalloc)
+{
+   struct sk_buff *skb, *next;
+
+   list_for_each_entry_safe(skb, next, head, list)
+   __netif_receive_skb_core(skb, pfmemalloc);
+}
+
 static int __netif_receive_skb(struct sk_buff *skb)
 {
int ret;
@@ -4695,6 +4703,34 @@ static int __netif_receive_skb(struct sk_buff *skb)
return ret;
 }
 
+static void __netif_receive_skb_list(struct list_head *head)
+{
+   unsigned long noreclaim_flag = 0;
+   struct sk_buff *skb, *next;
+   bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
+
+   list_for_each_entry_safe(skb, next, head, list) {
+   if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) 
{
+   struct list_head sublist;
+
+   /* Handle the previous sublist */
+   list_cut_before(&sublist, head, &skb->list);
+   __netif_receive_skb_list_core(&sublist, pfmemalloc);
+   pfmemalloc = !pfmemalloc;
+   /* See comments in __netif_receive_skb */
+   if (pfmemalloc)
+   noreclaim_flag = memalloc_noreclaim_save();
+   else
+   memalloc_noreclaim_restore(noreclaim_flag);
+   }
+   }
+   /* Handle the remaining sublist */
+   __netif_receive_skb_list_core(head, pfmemalloc);
+   /* Restore pflags */
+   if (pfmemalloc)
+   memalloc_noreclaim_restore(noreclaim_flag);
+}
+
 static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
 {
struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
@@ -4729,14 +4765,6 @@ static int generic_xdp_install(struct net_device *dev, 
struct netdev_bpf *xdp)
return ret;
 }
 
-static void __netif_receive_skb_list(struct list_head *head)
-{
-   struct sk_buff *skb, *next;
-
-   list_for_each_entry_safe(skb, next, head, list)
-   __netif_receive_skb(skb);
-}
-
 static int netif_receive_skb_internal(struct sk_buff *skb)
 {
int ret;



[PATCH v3 net-next 4/9] net: core: Another step of skb receive list processing

2018-06-29 Thread Edward Cree
netif_receive_skb_list_internal() now processes a list and hands it
 on to the next function.

Signed-off-by: Edward Cree 
---
 net/core/dev.c | 61 +-
 1 file changed, 56 insertions(+), 5 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 99167ff83919..d7f2a880aeed 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4729,6 +4729,14 @@ static int generic_xdp_install(struct net_device *dev, 
struct netdev_bpf *xdp)
return ret;
 }
 
+static void __netif_receive_skb_list(struct list_head *head)
+{
+   struct sk_buff *skb, *next;
+
+   list_for_each_entry_safe(skb, next, head, list)
+   __netif_receive_skb(skb);
+}
+
 static int netif_receive_skb_internal(struct sk_buff *skb)
 {
int ret;
@@ -4769,6 +4777,50 @@ static int netif_receive_skb_internal(struct sk_buff 
*skb)
return ret;
 }
 
+static void netif_receive_skb_list_internal(struct list_head *head)
+{
+   struct bpf_prog *xdp_prog = NULL;
+   struct sk_buff *skb, *next;
+
+   list_for_each_entry_safe(skb, next, head, list) {
+   net_timestamp_check(netdev_tstamp_prequeue, skb);
+   if (skb_defer_rx_timestamp(skb))
+   /* Handled, remove from list */
+   list_del(&skb->list);
+   }
+
+   if (static_branch_unlikely(_xdp_needed_key)) {
+   preempt_disable();
+   rcu_read_lock();
+   list_for_each_entry_safe(skb, next, head, list) {
+   xdp_prog = rcu_dereference(skb->dev->xdp_prog);
+   if (do_xdp_generic(xdp_prog, skb) != XDP_PASS)
+   /* Dropped, remove from list */
+   list_del(&skb->list);
+   }
+   rcu_read_unlock();
+   preempt_enable();
+   }
+
+   rcu_read_lock();
+#ifdef CONFIG_RPS
+   if (static_key_false(&rps_needed)) {
+   list_for_each_entry_safe(skb, next, head, list) {
+   struct rps_dev_flow voidflow, *rflow = &voidflow;
+   int cpu = get_rps_cpu(skb->dev, skb, &rflow);
+
+   if (cpu >= 0) {
+   enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
+   /* Handled, remove from list */
+   list_del(&skb->list);
+   }
+   }
+   }
+#endif
+   __netif_receive_skb_list(head);
+   rcu_read_unlock();
+}
+
 /**
  * netif_receive_skb - process receive buffer from network
  * @skb: buffer to process
@@ -4796,20 +4848,19 @@ EXPORT_SYMBOL(netif_receive_skb);
  * netif_receive_skb_list - process many receive buffers from network
  * @head: list of skbs to process.
  *
- * For now, just calls netif_receive_skb() in a loop, ignoring the
- * return value.
+ * Since return value of netif_receive_skb() is normally ignored, and
+ * wouldn't be meaningful for a list, this function returns void.
  *
  * This function may only be called from softirq context and interrupts
  * should be enabled.
  */
 void netif_receive_skb_list(struct list_head *head)
 {
-   struct sk_buff *skb, *next;
+   struct sk_buff *skb;
 
list_for_each_entry(skb, head, list)
trace_netif_receive_skb_list_entry(skb);
-   list_for_each_entry_safe(skb, next, head, list)
-   netif_receive_skb_internal(skb);
+   netif_receive_skb_list_internal(head);
 }
 EXPORT_SYMBOL(netif_receive_skb_list);
 



[PATCH v3 net-next 2/9] sfc: batch up RX delivery

2018-06-29 Thread Edward Cree
Improves packet rate of 1-byte UDP receives by up to 10%.

Signed-off-by: Edward Cree 
---
 drivers/net/ethernet/sfc/efx.c| 12 
 drivers/net/ethernet/sfc/net_driver.h |  3 +++
 drivers/net/ethernet/sfc/rx.c |  7 ++-
 3 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 570ec72266f3..b24c2e21db8e 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -264,11 +264,17 @@ static int efx_check_disabled(struct efx_nic *efx)
 static int efx_process_channel(struct efx_channel *channel, int budget)
 {
struct efx_tx_queue *tx_queue;
+   struct list_head rx_list;
int spent;
 
if (unlikely(!channel->enabled))
return 0;
 
+   /* Prepare the batch receive list */
+   EFX_WARN_ON_PARANOID(channel->rx_list != NULL);
+   INIT_LIST_HEAD(&rx_list);
+   channel->rx_list = &rx_list;
+
efx_for_each_channel_tx_queue(tx_queue, channel) {
tx_queue->pkts_compl = 0;
tx_queue->bytes_compl = 0;
@@ -291,6 +297,10 @@ static int efx_process_channel(struct efx_channel 
*channel, int budget)
}
}
 
+   /* Receive any packets we queued up */
+   netif_receive_skb_list(channel->rx_list);
+   channel->rx_list = NULL;
+
return spent;
 }
 
@@ -555,6 +565,8 @@ static int efx_probe_channel(struct efx_channel *channel)
goto fail;
}
 
+   channel->rx_list = NULL;
+
return 0;
 
 fail:
diff --git a/drivers/net/ethernet/sfc/net_driver.h 
b/drivers/net/ethernet/sfc/net_driver.h
index 65568925c3ef..961b92979640 100644
--- a/drivers/net/ethernet/sfc/net_driver.h
+++ b/drivers/net/ethernet/sfc/net_driver.h
@@ -448,6 +448,7 @@ enum efx_sync_events_state {
  * __efx_rx_packet(), or zero if there is none
  * @rx_pkt_index: Ring index of first buffer for next packet to be delivered
  * by __efx_rx_packet(), if @rx_pkt_n_frags != 0
+ * @rx_list: list of SKBs from current RX, awaiting processing
  * @rx_queue: RX queue for this channel
  * @tx_queue: TX queues for this channel
  * @sync_events_state: Current state of sync events on this channel
@@ -500,6 +501,8 @@ struct efx_channel {
unsigned int rx_pkt_n_frags;
unsigned int rx_pkt_index;
 
+   struct list_head *rx_list;
+
struct efx_rx_queue rx_queue;
struct efx_tx_queue tx_queue[EFX_TXQ_TYPES];
 
diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c
index d2e254f2f72b..396ff01298cd 100644
--- a/drivers/net/ethernet/sfc/rx.c
+++ b/drivers/net/ethernet/sfc/rx.c
@@ -634,7 +634,12 @@ static void efx_rx_deliver(struct efx_channel *channel, u8 
*eh,
return;
 
/* Pass the packet up */
-   netif_receive_skb(skb);
+   if (channel->rx_list != NULL)
+   /* Add to list, will pass up later */
list_add_tail(&skb->list, channel->rx_list);
+   else
+   /* No list, so pass it up now */
+   netif_receive_skb(skb);
 }
 
 /* Handle a received packet.  Second half: Touches packet payload. */



[PATCH v3 net-next 3/9] net: core: unwrap skb list receive slightly further

2018-06-29 Thread Edward Cree
Signed-off-by: Edward Cree 
---
 include/trace/events/net.h | 7 +++
 net/core/dev.c | 4 +++-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/include/trace/events/net.h b/include/trace/events/net.h
index 9c886739246a..00aa72ce0e7c 100644
--- a/include/trace/events/net.h
+++ b/include/trace/events/net.h
@@ -223,6 +223,13 @@ DEFINE_EVENT(net_dev_rx_verbose_template, 
netif_receive_skb_entry,
TP_ARGS(skb)
 );
 
+DEFINE_EVENT(net_dev_rx_verbose_template, netif_receive_skb_list_entry,
+
+   TP_PROTO(const struct sk_buff *skb),
+
+   TP_ARGS(skb)
+);
+
 DEFINE_EVENT(net_dev_rx_verbose_template, netif_rx_entry,
 
TP_PROTO(const struct sk_buff *skb),
diff --git a/net/core/dev.c b/net/core/dev.c
index 110c8dfebc01..99167ff83919 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4806,8 +4806,10 @@ void netif_receive_skb_list(struct list_head *head)
 {
struct sk_buff *skb, *next;
 
+   list_for_each_entry(skb, head, list)
+   trace_netif_receive_skb_list_entry(skb);
list_for_each_entry_safe(skb, next, head, list)
-   netif_receive_skb(skb);
+   netif_receive_skb_internal(skb);
 }
 EXPORT_SYMBOL(netif_receive_skb_list);
 



[PATCH v3 net-next 1/9] net: core: trivial netif_receive_skb_list() entry point

2018-06-29 Thread Edward Cree
Just calls netif_receive_skb() in a loop.

Signed-off-by: Edward Cree 
---
 include/linux/netdevice.h |  1 +
 net/core/dev.c| 19 +++
 2 files changed, 20 insertions(+)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c6b377a15869..e104b2e4a735 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3365,6 +3365,7 @@ int netif_rx(struct sk_buff *skb);
 int netif_rx_ni(struct sk_buff *skb);
 int netif_receive_skb(struct sk_buff *skb);
 int netif_receive_skb_core(struct sk_buff *skb);
+void netif_receive_skb_list(struct list_head *head);
 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
 void napi_gro_flush(struct napi_struct *napi, bool flush_old);
 struct sk_buff *napi_get_frags(struct napi_struct *napi);
diff --git a/net/core/dev.c b/net/core/dev.c
index dffed642e686..110c8dfebc01 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4792,6 +4792,25 @@ int netif_receive_skb(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(netif_receive_skb);
 
+/**
+ * netif_receive_skb_list - process many receive buffers from network
+ * @head: list of skbs to process.
+ *
+ * For now, just calls netif_receive_skb() in a loop, ignoring the
+ * return value.
+ *
+ * This function may only be called from softirq context and interrupts
+ * should be enabled.
+ */
+void netif_receive_skb_list(struct list_head *head)
+{
+   struct sk_buff *skb, *next;
+
+   list_for_each_entry_safe(skb, next, head, list)
+   netif_receive_skb(skb);
+}
+EXPORT_SYMBOL(netif_receive_skb_list);
+
 DEFINE_PER_CPU(struct work_struct, flush_works);
 
 /* Network device is going away, flush any packets still pending */



[PATCH v3 net-next 0/9] Handle multiple received packets at each stage

2018-06-29 Thread Edward Cree
This patch series adds the capability for the network stack to receive a
 list of packets and process them as a unit, rather than handling each
 packet singly in sequence.  This is done by factoring out the existing
 datapath code at each layer and wrapping it in list handling code.

The motivation for this change is twofold:
* Instruction cache locality.  Currently, running the entire network
  stack receive path on a packet involves more code than will fit in the
  lowest-level icache, meaning that when the next packet is handled, the
  code has to be reloaded from more distant caches.  By handling packets
  in "row-major order", we ensure that the code at each layer is hot for
  most of the list.  (There is a corresponding downside in _data_ cache
  locality, since we are now touching every packet at every layer, but in
  practice there is easily enough room in dcache to hold one cacheline of
  each of the 64 packets in a NAPI poll.)
* Reduction of indirect calls.  Owing to Spectre mitigations, indirect
  function calls are now more expensive than ever; they are also heavily
  used in the network stack's architecture (see [1]).  By replacing 64
  indirect calls to the next-layer per-packet function with a single
  indirect call to the next-layer list function, we can save CPU cycles.

Drivers pass an SKB list to the stack at the end of the NAPI poll; this
 gives a natural batch size (the NAPI poll weight) and avoids waiting at
 the software level for further packets to make a larger batch (which
 would add latency).  It also means that the batch size is automatically
 tuned by the existing interrupt moderation mechanism.
The stack then runs each layer of processing over all the packets in the
 list before proceeding to the next layer.  Where the 'next layer' (or
 the context in which it must run) differs among the packets, the stack
 splits the list; this 'late demux' means that packets which differ only
 in later headers (e.g. same L2/L3 but different L4) can traverse the
 early part of the stack together.
Also, where the next layer is not (yet) list-aware, the stack can revert
 to calling the rest of the stack in a loop; this allows gradual/creeping
 listification, with no 'flag day' patch needed to listify everything.
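
As a rough illustration of the pattern (not code from the series; ctx_of()
and layer_process_sublist() are invented stand-ins for "the next-layer
context" and "the next-layer list function"), each listified layer does
essentially:

	struct sk_buff *skb, *next;
	struct list_head sublist;
	void *curr_ctx = NULL;

	list_for_each_entry_safe(skb, next, head, list) {
		void *ctx = ctx_of(skb);	/* e.g. ptype, dst, pfmemalloc */

		if (ctx != curr_ctx) {
			/* dispatch the sublist gathered so far */
			list_cut_before(&sublist, head, &skb->list);
			layer_process_sublist(&sublist, curr_ctx);
			/* start a new sublist */
			curr_ctx = ctx;
		}
	}
	/* dispatch the final sublist */
	layer_process_sublist(head, curr_ctx);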

Patches 1-2 simply place received packets on a list during the event
 processing loop on the sfc EF10 architecture, then call the normal stack
 for each packet singly at the end of the NAPI poll.  (Analogues of patch
 #2 for other NIC drivers should be fairly straightforward.)
Patches 3-9 extend the list processing as far as the IP receive handler.

Patches 1-2 alone give about a 10% improvement in packet rate in the
 baseline test; adding patches 3-9 raises this to around 25%.

Performance measurements were made with NetPerf UDP_STREAM, using 1-byte
 packets and a single core to handle interrupts on the RX side; this was
 in order to measure as simply as possible the packet rate handled by a
 single core.  Figures are in Mbit/s; divide by 8 to obtain Mpps.  The
 setup was tuned for maximum reproducibility, rather than raw performance.
 Full details and more results (both with and without retpolines) from a
 previous version of the patch series are presented in [2].

The baseline test uses four streams, and multiple RXQs all bound to a
 single CPU (the netperf binary is bound to a neighbouring CPU).  These
 tests were run with retpolines.
net-next: 6.91 Mb/s (datum)
 after 9: 8.46 Mb/s (+22.5%)
Note however that these results are not robust; changes in the parameters
 of the test sometimes shrink the gain to single-digit percentages.  For
 instance, when using only a single RXQ, only a 4% gain was seen.

One test variation was the use of software filtering/firewall rules.
 Adding a single iptables rule (UDP port drop on a port range not matching
 the test traffic), thus making the netfilter hook have work to do,
 reduced baseline performance but showed a similar gain from the patches:
net-next: 5.02 Mb/s (datum)
 after 9: 6.78 Mb/s (+35.1%)

Similarly, testing with a set of TC flower filters (kindly supplied by
 Cong Wang) gave the following:
net-next: 6.83 Mb/s (datum)
 after 9: 8.86 Mb/s (+29.7%)

These data suggest that the batching approach remains effective in the
 presence of software switching rules, and perhaps even improves the
 performance of those rules by allowing them and their codepaths to stay
 in cache between packets.

Changes from v2:
* Used standard list handling (and skb->list) instead of the skb-queue
  functions (that use skb->next, skb->prev).
  - As part of this, changed from a "dequeue, process, enqueue" model to
using list_for_each_safe, list_del, and (new) list_cut_before.
* Altered __netif_receive_skb_core() changes in patch 6 as per Willem de
  Bruijn's suggestions (separate **ppt_prev from *pt_prev; renaming).
* Removed patches to Generic XDP, since they were producing no benefit.
  I may revisit them later.
* Removed RFC tags.

Changes from v1:
* 

Re: [PATCH net-next v2] tcp: force cwnd at least 2 in tcp_cwnd_reduction

2018-06-29 Thread Lawrence Brakmo
On 6/28/18, 1:48 PM, "netdev-ow...@vger.kernel.org on behalf of Neal Cardwell" 
 wrote:

On Thu, Jun 28, 2018 at 4:20 PM Lawrence Brakmo  wrote:
>
> I just looked at 4.18 traces and the behavior is as follows:
>
>Host A sends the last packets of the request
>
>Host B receives them, and the last packet is marked with congestion (CE)
>
>Host B sends ACKs for packets not marked with congestion
>
>Host B sends data packet with reply and ACK for packet marked with congestion (TCP flag ECE)
>
>Host A receives ACKs with no ECE flag
>
>Host A receives data packet with ACK for the last packet of request and has TCP ECE bit set
>
>Host A sends 1st data packet of the next request with TCP flag CWR
>
>Host B receives the packet (as seen in tcpdump at B), no CE flag
>
>Host B sends a dup ACK that also has the TCP ECE flag
>
>Host A RTO timer fires!
>
>Host A then sends the next packet
>
>Host A receives an ACK for everything it has sent (i.e. Host B did receive 1st packet of request)
>
>Host A sends more packets…

Thanks, Larry! This is very interesting. I don't know the cause, but
this reminds me of an issue  Steve Ibanez raised on the netdev list
last December, where he was seeing cases with DCTCP where a CWR packet
would be received and buffered by Host B but not ACKed by Host B. This
was the thread "Re: Linux ECN Handling", starting around December 5. I
have cc-ed Steve.

I wonder if this may somehow be related to the DCTCP logic to rewind
tp->rcv_nxt and call tcp_send_ack(), and then restore tp->rcv_nxt, if
DCTCP notices that the incoming CE bits have been changed while the
receiver thinks it is holding on to a delayed ACK (in
dctcp_ce_state_0_to_1() and dctcp_ce_state_1_to_0()). I wonder if the
"synthetic" call to tcp_send_ack() somehow has side effects in the
delayed ACK state machine that can cause the connection to forget that
it still needs to fire a delayed ACK, even though it just sent an ACK
just now.
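
(For reference, the rewind logic in question, paraphrased from
dctcp_ce_state_0_to_1() in net/ipv4/tcp_dctcp.c and simplified, so details
may differ:)

	if (!ca->ce_state && ca->delayed_ack_reserved) {
		u32 tmp_rcv_nxt = tp->rcv_nxt;

		/* generate a "synthetic" ACK for the pre-CE data */
		tp->rcv_nxt = ca->prior_rcv_nxt;
		tcp_send_ack(sk);

		/* recover the current rcv_nxt */
		tp->rcv_nxt = tmp_rcv_nxt;
	}
	ca->prior_rcv_nxt = tp->rcv_nxt;
	ca->ce_state = 1;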

neal

You were right, Neal. One of the bugs is related to this and is caused by a lack
of a state update to DCTCP: DCTCP is first informed that the ACK was delayed, but
it is not updated when the ACK is later sent with a data packet.

I am working on a patch to fix this which hopefully should be out today. Thanks 
everyone for the great feedback! 



Re: [PATCH v1 net-next 14/14] net/sched: Make etf report drops on error_queue

2018-06-29 Thread Willem de Bruijn
> >> diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c
> >> index 5514a8aa3bd5..166f4b72875b 100644
> >> --- a/net/sched/sch_etf.c
> >> +++ b/net/sched/sch_etf.c
> >> @@ -11,6 +11,7 @@
> >>  #include 
> >>  #include 
> >>  #include 
> >> +#include 
> >>  #include 
> >>  #include 
> >>  #include 
> >> @@ -124,6 +125,35 @@ static void reset_watchdog(struct Qdisc *sch)
> >> qdisc_watchdog_schedule_ns(&q->watchdog, ktime_to_ns(next));
> >>  }
> >>
> >> +static void report_sock_error(struct sk_buff *skb, u32 err, u8 code)
> >> +{
> >> +   struct sock_exterr_skb *serr;
> >> +   ktime_t txtime = skb->tstamp;
> >> +
> >> +   if (!skb->sk || !(skb->sk->sk_txtime_flags & 
> >> SK_TXTIME_RECV_ERR_MASK))
> >> +   return;
> >> +
> >> +   skb = skb_clone_sk(skb);
> >> +   if (!skb)
> >> +   return;
> >> +
> >> +   sock_hold(skb->sk);
> >
> > Why take an extra reference? The skb holds a ref on the sk.
>
>
> Yes, the cloned skb holds a ref on the socket, but the documentation of
> skb_clone_sk() makes this explicit suggestion:
>
> (...)
>  * When passing buffers allocated with this function to sock_queue_err_skb
>  * it is necessary to wrap the call with sock_hold/sock_put in order to
>  * prevent the socket from being released prior to being enqueued on
>  * the sk_error_queue.
>  */
>
> which I believe is here just so we are protected against a possible race after
> skb_orphan() is called from sock_queue_err_skb(). Please let me know if I'm
> misreading anything.

Yes, indeed. Code only has to worry about that if there are no concurrent
references on the socket.

I may be mistaken, but I believe that this complicated logic exists only for
cases where the timestamp may be queued after the original skb has been
released. Specifically, when a tx timestamp is returned from a hardware device
after transmission of the original skb. Then the cloned timestamp skb needs its
own reference on the sk while it is waiting for the timestamp data (i.e., until
the device completion arrives), and then we need a temporary extra ref to work
around the skb_orphan in sock_queue_err_skb.

Compare skb_complete_tx_timestamp with skb_tstamp_tx. The second is used in the
regular datapath to clone an skb and queue it on the error queue immediately,
while holding the original skb. This does not call skb_clone_sk and does not
need the extra sock_hold. This should be good enough for this code path, too.
As the skb holds a ref on skb->sk, the socket cannot go away in the middle of
report_sock_error.
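
Something along these lines should work (untested sketch; the serr field
setup is elided, same as in your patch):

	static void report_sock_error(struct sk_buff *skb, u32 err, u8 code)
	{
		struct sk_buff *clone;

		if (!skb->sk ||
		    !(skb->sk->sk_txtime_flags & SK_TXTIME_RECV_ERR_MASK))
			return;

		clone = skb_clone(skb, GFP_ATOMIC);
		if (!clone)
			return;

		/* ... fill in SKB_EXT_ERR(clone)->ee as before ... */

		/* the original skb still pins skb->sk here, so no extra
		 * sock_hold()/sock_put() pair is needed around this: */
		if (sock_queue_err_skb(skb->sk, clone))
			kfree_skb(clone);
	}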


Re: [PATCH bpf 3/3] bpf: undo prog rejection on read-only lock failure

2018-06-29 Thread Kees Cook
On Thu, Jun 28, 2018 at 2:34 PM, Daniel Borkmann  wrote:
> Kees suggested that if set_memory_*() can fail, we should annotate it with
> __must_check, and all callers need to deal with it gracefully given those
> set_memory_*() markings aren't "advisory", but they're expected to actually
> do what they say. This might be an option worth to move forward in future
> but would at the same time require that set_memory_*() calls from supporting
> archs are guaranteed to be "atomic" in that they provide rollback if part
> of the range fails, once that happened, the transition from RW -> RO could
> be made more robust that way, while subsequent RO -> RW transition /must/
> continue guaranteeing to always succeed the undo part.

Does this mean we can have BPF filters that aren't read-only then?
What's the situation where set_memory_ro() fails? (Can it be induced
by the user?)

-Kees

-- 
Kees Cook
Pixel Security


Re: [PATCH bpf 0/3] Three BPF fixes

2018-06-29 Thread Alexei Starovoitov
On Thu, Jun 28, 2018 at 11:34:56PM +0200, Daniel Borkmann wrote:
> This set contains three fixes that are mostly JIT and set_memory_*()
> related. The third in the series in particular fixes the syzkaller
> bugs that were still pending; aside from local reproduction & testing,
> also 'syz test' wasn't able to trigger them anymore. I've tested this
> series on x86_64, arm64 and s390x, and kbuild bot wasn't yelling either
> for the rest. For details, please see patches as usual, thanks!

Applied, Thanks



Re: [PATCH v1 net-next 14/14] net/sched: Make etf report drops on error_queue

2018-06-29 Thread Jesus Sanchez-Palencia
Hi Willem,


On 06/28/2018 07:27 AM, Willem de Bruijn wrote:

(...)

> 
>>  struct sock_txtime {
>> clockid_t   clockid;/* reference clockid */
>> -   u16 flags;  /* bit 0: txtime in deadline_mode */
>> +   u16 flags;  /* bit 0: txtime in deadline_mode
>> +* bit 1: report drops on sk err 
>> queue
>> +*/
>>  };
> 
> If this is shared with userspace, should be defined in an uapi header.
> Same on the flag bits below. Self documenting code is preferable over
> comments.


Fixed for v2.


> 
>>  /*
>> diff --git a/include/net/sock.h b/include/net/sock.h
>> index 73f4404e49e4..e681a45cfe7e 100644
>> --- a/include/net/sock.h
>> +++ b/include/net/sock.h
>> @@ -473,6 +473,7 @@ struct sock {
>> u16 sk_clockid;
>> u16 sk_txtime_flags;
>>  #define SK_TXTIME_DEADLINE_MASK    BIT(0)
>> +#define SK_TXTIME_RECV_ERR_MASK    BIT(1)
> 
> Integer bitfields are (arguably) more readable. There is no requirement
> that the user interface be the same as the in-kernel implementation. Indeed
> if you can save bits in struct sock, that is preferable (but not so for the 
> ABI,
> which cannot easily be extended).


Sure, changed for v2.

(...)


>> diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c
>> index 5514a8aa3bd5..166f4b72875b 100644
>> --- a/net/sched/sch_etf.c
>> +++ b/net/sched/sch_etf.c
>> @@ -11,6 +11,7 @@
>>  #include 
>>  #include 
>>  #include 
>> +#include 
>>  #include 
>>  #include 
>>  #include 
>> @@ -124,6 +125,35 @@ static void reset_watchdog(struct Qdisc *sch)
>> qdisc_watchdog_schedule_ns(&q->watchdog, ktime_to_ns(next));
>>  }
>>
>> +static void report_sock_error(struct sk_buff *skb, u32 err, u8 code)
>> +{
>> +   struct sock_exterr_skb *serr;
>> +   ktime_t txtime = skb->tstamp;
>> +
>> +   if (!skb->sk || !(skb->sk->sk_txtime_flags & 
>> SK_TXTIME_RECV_ERR_MASK))
>> +   return;
>> +
>> +   skb = skb_clone_sk(skb);
>> +   if (!skb)
>> +   return;
>> +
>> +   sock_hold(skb->sk);
> 
> Why take an extra reference? The skb holds a ref on the sk.


Yes, the cloned skb holds a ref on the socket, but the documentation of
skb_clone_sk() makes this explicit suggestion:

(...)
 * When passing buffers allocated with this function to sock_queue_err_skb
 * it is necessary to wrap the call with sock_hold/sock_put in order to
 * prevent the socket from being released prior to being enqueued on
 * the sk_error_queue.
 */

which I believe is here just so we are protected against a possible race after
skb_orphan() is called from sock_queue_err_skb(). Please let me know if I'm
misreading anything.

And for v2 I will move the sock_hold() call to immediately before the
sock_queue_err_skb() to avoid any future confusion.



> 
>> +
>> +   serr = SKB_EXT_ERR(skb);
>> +   serr->ee.ee_errno = err;
>> +   serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL;
> 
> I suggest adding a new SO_EE_ORIGIN_TXTIME as opposed to overloading
> the existing
> local origin. Then the EE_CODE can start at 1, as ee_code can be
> demultiplexed by origin.


OK, it looks better indeed. Fixed for v2.


> 
>> +   serr->ee.ee_type = 0;
>> +   serr->ee.ee_code = code;
>> +   serr->ee.ee_pad = 0;
>> +   serr->ee.ee_data = (txtime >> 32); /* high part of tstamp */
>> +   serr->ee.ee_info = txtime; /* low part of tstamp */
>> +
>> +   if (sock_queue_err_skb(skb->sk, skb))
>> +   kfree_skb(skb);
>> +
>> +   sock_put(skb->sk);
>> +}


Thanks,
Jesus


[PATCH net 0/5] s390/qeth: fixes 2018-06-29

2018-06-29 Thread Julian Wiedmann
Hi Dave,

please apply a few qeth fixes for -net and your 4.17 stable queue.

Patches 1-3 fix several issues wrt to MAC address management that were
introduced during the 4.17 cycle.
Patch 4 tackles a long-standing issue with busy multi-connection workloads
on devices in af_iucv mode.
Patch 5 makes sure to re-enable all active HW offloads, after a card was
previously set offline and thus lost its HW context.

Thanks,
Julian


Julian Wiedmann (4):
  Revert "s390/qeth: use Read device to query hypervisor for MAC"
  s390/qeth: fix race when setting MAC address
  s390/qeth: don't clobber buffer on async TX completion
  s390/qeth: consistently re-enable device features

Vasily Gorbik (1):
  s390/qeth: avoid using is_multicast_ether_addr_64bits on (u8 *)[6]

 drivers/s390/net/qeth_core.h  | 13 ++-
 drivers/s390/net/qeth_core_main.c | 47 +++
 drivers/s390/net/qeth_l2_main.c   | 24 
 drivers/s390/net/qeth_l3_main.c   |  3 ++-
 4 files changed, 57 insertions(+), 30 deletions(-)

-- 
2.16.4



[PATCH net 1/5] Revert "s390/qeth: use Read device to query hypervisor for MAC"

2018-06-29 Thread Julian Wiedmann
This reverts commit b7493e91c11a757cf0f8ab26989642ee4bb2c642.

On its own, querying RDEV for a MAC address works fine. But when upgrading
from a qeth that previously queried DDEV on a z/VM NIC (ie. any kernel with
commit ec61bd2fd2a2), the RDEV query now returns a _different_ MAC address
than the DDEV query.

If the NIC is configured with MACPROTECT, z/VM apparently requires us to
use the MAC that was initially returned (on DDEV) and registered. So after
upgrading to a kernel that uses RDEV, the SETVMAC registration cmd for the
new MAC address fails and we end up with a non-operable interface.

To avoid regressions on upgrade, switch back to using DDEV for the MAC
address query. The downgrade path (first RDEV, later DDEV) is fine, in this
case both queries return the same MAC address.

Fixes: b7493e91c11a ("s390/qeth: use Read device to query hypervisor for MAC")
Reported-by: Michal Kubecek 
Tested-by: Karsten Graul 
Signed-off-by: Julian Wiedmann 
---
 drivers/s390/net/qeth_core_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/s390/net/qeth_core_main.c 
b/drivers/s390/net/qeth_core_main.c
index 8e1474f1ffac..9d9182ed8ac4 100644
--- a/drivers/s390/net/qeth_core_main.c
+++ b/drivers/s390/net/qeth_core_main.c
@@ -4834,7 +4834,7 @@ int qeth_vm_request_mac(struct qeth_card *card)
goto out;
}
 
-   ccw_device_get_id(CARD_RDEV(card), &id);
+   ccw_device_get_id(CARD_DDEV(card), &id);
request->resp_buf_len = sizeof(*response);
request->resp_version = DIAG26C_VERSION2;
request->op_code = DIAG26C_GET_MAC;
-- 
2.16.4



[PATCH net 4/5] s390/qeth: don't clobber buffer on async TX completion

2018-06-29 Thread Julian Wiedmann
If qeth_qdio_output_handler() detects that a transmit requires async
completion, it replaces the pending buffer's metadata object
(qeth_qdio_out_buffer) so that this queue buffer can be re-used while
the data is pending completion.

Later when the CQ indicates async completion of such a metadata object,
qeth_qdio_cq_handler() tries to free any data associated with this
object (since HW has now completed the transfer). By calling
qeth_clear_output_buffer(), it erroneously operates on the queue buffer
that _previously_ belonged to this transfer ... but which has been
potentially re-used several times by now.
This results in double-frees of the buffer's data, and failing
transmits as the buffer descriptor is scrubbed in mid-air.

The correct way of handling this situation is to
1. scrub the queue buffer when it is prepared for re-use, and
2. later obtain the data addresses from the async-completion notifier
   (ie. the AOB), instead of the queue buffer.

All this only affects qeth devices used for af_iucv HiperTransport.

Fixes: 0da9581ddb0f ("qeth: exploit asynchronous delivery of storage blocks")
Signed-off-by: Julian Wiedmann 
---
 drivers/s390/net/qeth_core.h  | 11 +++
 drivers/s390/net/qeth_core_main.c | 22 --
 2 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h
index 2a5fec55bf60..940fd7b558d3 100644
--- a/drivers/s390/net/qeth_core.h
+++ b/drivers/s390/net/qeth_core.h
@@ -829,6 +829,17 @@ struct qeth_trap_id {
 /*some helper functions*/
 #define QETH_CARD_IFNAME(card) (((card)->dev)? (card)->dev->name : "")
 
+static inline void qeth_scrub_qdio_buffer(struct qdio_buffer *buf,
+ unsigned int elements)
+{
+   unsigned int i;
+
+   for (i = 0; i < elements; i++)
+   memset(&buf->element[i], 0, sizeof(struct qdio_buffer_element));
+   buf->element[14].sflags = 0;
+   buf->element[15].sflags = 0;
+}
+
 /**
  * qeth_get_elements_for_range() - find number of SBALEs to cover range.
  * @start: Start of the address range.
diff --git a/drivers/s390/net/qeth_core_main.c 
b/drivers/s390/net/qeth_core_main.c
index 9d9182ed8ac4..d20a69a3bc40 100644
--- a/drivers/s390/net/qeth_core_main.c
+++ b/drivers/s390/net/qeth_core_main.c
@@ -73,9 +73,6 @@ static void qeth_notify_skbs(struct qeth_qdio_out_q *queue,
struct qeth_qdio_out_buffer *buf,
enum iucv_tx_notify notification);
 static void qeth_release_skbs(struct qeth_qdio_out_buffer *buf);
-static void qeth_clear_output_buffer(struct qeth_qdio_out_q *queue,
-   struct qeth_qdio_out_buffer *buf,
-   enum qeth_qdio_buffer_states newbufstate);
 static int qeth_init_qdio_out_buf(struct qeth_qdio_out_q *, int);
 
 struct workqueue_struct *qeth_wq;
@@ -489,6 +486,7 @@ static void qeth_qdio_handle_aob(struct qeth_card *card,
struct qaob *aob;
struct qeth_qdio_out_buffer *buffer;
enum iucv_tx_notify notification;
+   unsigned int i;
 
aob = (struct qaob *) phys_to_virt(phys_aob_addr);
QETH_CARD_TEXT(card, 5, "haob");
@@ -513,10 +511,18 @@ static void qeth_qdio_handle_aob(struct qeth_card *card,
qeth_notify_skbs(buffer->q, buffer, notification);
 
buffer->aob = NULL;
-   qeth_clear_output_buffer(buffer->q, buffer,
-QETH_QDIO_BUF_HANDLED_DELAYED);
+   /* Free dangling allocations. The attached skbs are handled by
+* qeth_cleanup_handled_pending().
+*/
+   for (i = 0;
+i < aob->sb_count && i < QETH_MAX_BUFFER_ELEMENTS(card);
+i++) {
+   if (aob->sba[i] && buffer->is_header[i])
+   kmem_cache_free(qeth_core_header_cache,
+   (void *) aob->sba[i]);
+   }
+   atomic_set(&buffer->state, QETH_QDIO_BUF_HANDLED_DELAYED);
 
-   /* from here on: do not touch buffer anymore */
qdio_release_aob(aob);
 }
 
@@ -3759,6 +3765,10 @@ static void qeth_qdio_output_handler(struct ccw_device 
*ccwdev,
QETH_CARD_TEXT(queue->card, 5, "aob");
QETH_CARD_TEXT_(queue->card, 5, "%lx",
virt_to_phys(buffer->aob));
+
+   /* prepare the queue slot for re-use: */
+   qeth_scrub_qdio_buffer(buffer->buffer,
+  QETH_MAX_BUFFER_ELEMENTS(card));
if (qeth_init_qdio_out_buf(queue, bidx)) {
QETH_CARD_TEXT(card, 2, "outofbuf");
qeth_schedule_recovery(card);
-- 
2.16.4



[PATCH net 5/5] s390/qeth: consistently re-enable device features

2018-06-29 Thread Julian Wiedmann
commit e830baa9c3f0 ("qeth: restore device features after recovery") and
commit ce3443564145 ("s390/qeth: rely on kernel for feature recovery")
made sure that the HW functions for device features get re-programmed
after recovery.

But we missed that the same handling is also required when a card is
first set offline (destroying all HW context), and then online again.
Fix this by moving the re-enable action out of the recovery-only path.

Signed-off-by: Julian Wiedmann 
---
 drivers/s390/net/qeth_core.h  |  2 +-
 drivers/s390/net/qeth_core_main.c | 23 +++
 drivers/s390/net/qeth_l2_main.c   |  5 ++---
 drivers/s390/net/qeth_l3_main.c   |  3 ++-
 4 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/drivers/s390/net/qeth_core.h b/drivers/s390/net/qeth_core.h
index 940fd7b558d3..a246a618f9a4 100644
--- a/drivers/s390/net/qeth_core.h
+++ b/drivers/s390/net/qeth_core.h
@@ -1040,7 +1040,7 @@ struct qeth_cmd_buffer *qeth_get_setassparms_cmd(struct 
qeth_card *,
 __u16, __u16,
 enum qeth_prot_versions);
 int qeth_set_features(struct net_device *, netdev_features_t);
-void qeth_recover_features(struct net_device *dev);
+void qeth_enable_hw_features(struct net_device *dev);
 netdev_features_t qeth_fix_features(struct net_device *, netdev_features_t);
 netdev_features_t qeth_features_check(struct sk_buff *skb,
  struct net_device *dev,
diff --git a/drivers/s390/net/qeth_core_main.c 
b/drivers/s390/net/qeth_core_main.c
index d20a69a3bc40..d01ac29fd986 100644
--- a/drivers/s390/net/qeth_core_main.c
+++ b/drivers/s390/net/qeth_core_main.c
@@ -6469,28 +6469,27 @@ static int qeth_set_ipa_rx_csum(struct qeth_card *card, 
bool on)
 #define QETH_HW_FEATURES (NETIF_F_RXCSUM | NETIF_F_IP_CSUM | NETIF_F_TSO | \
  NETIF_F_IPV6_CSUM)
 /**
- * qeth_recover_features() - Restore device features after recovery
- * @dev:   the recovering net_device
- *
- * Caller must hold rtnl lock.
+ * qeth_enable_hw_features() - (Re-)Enable HW functions for device features
+ * @dev:   a net_device
  */
-void qeth_recover_features(struct net_device *dev)
+void qeth_enable_hw_features(struct net_device *dev)
 {
-   netdev_features_t features = dev->features;
struct qeth_card *card = dev->ml_priv;
+   netdev_features_t features;
 
+   rtnl_lock();
+   features = dev->features;
/* force-off any feature that needs an IPA sequence.
 * netdev_update_features() will restart them.
 */
dev->features &= ~QETH_HW_FEATURES;
netdev_update_features(dev);
-
-   if (features == dev->features)
-   return;
-   dev_warn(&card->gdev->dev,
-"Device recovery failed to restore all offload features\n");
+   if (features != dev->features)
+   dev_warn(&card->gdev->dev,
+"Device recovery failed to restore all offload features\n");
+   rtnl_unlock();
 }
-EXPORT_SYMBOL_GPL(qeth_recover_features);
+EXPORT_SYMBOL_GPL(qeth_enable_hw_features);
 
 int qeth_set_features(struct net_device *dev, netdev_features_t features)
 {
diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c
index 5464515b71f1..2487f0aeb165 100644
--- a/drivers/s390/net/qeth_l2_main.c
+++ b/drivers/s390/net/qeth_l2_main.c
@@ -1119,6 +1119,8 @@ static int __qeth_l2_set_online(struct ccwgroup_device 
*gdev, int recovery_mode)
netif_carrier_off(card->dev);
 
qeth_set_allowed_threads(card, 0xffffffff, 0);
+
+   qeth_enable_hw_features(card->dev);
if (recover_flag == CARD_STATE_RECOVER) {
if (recovery_mode &&
card->info.type != QETH_CARD_TYPE_OSN) {
@@ -1130,9 +1132,6 @@ static int __qeth_l2_set_online(struct ccwgroup_device 
*gdev, int recovery_mode)
}
/* this also sets saved unicast addresses */
qeth_l2_set_rx_mode(card->dev);
-   rtnl_lock();
-   qeth_recover_features(card->dev);
-   rtnl_unlock();
}
/* let user_space know that device is online */
kobject_uevent(&gdev->dev.kobj, KOBJ_CHANGE);
diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c
index e7fa479adf47..5905dc63e256 100644
--- a/drivers/s390/net/qeth_l3_main.c
+++ b/drivers/s390/net/qeth_l3_main.c
@@ -2662,6 +2662,8 @@ static int __qeth_l3_set_online(struct ccwgroup_device 
*gdev, int recovery_mode)
netif_carrier_on(card->dev);
else
netif_carrier_off(card->dev);
+
+   qeth_enable_hw_features(card->dev);
if (recover_flag == CARD_STATE_RECOVER) {
rtnl_lock();
if (recovery_mode)
@@ -2669,7 +2671,6 @@ static int __qeth_l3_set_online(struct ccwgroup_device 
*gdev, int recovery_mode)
else

[PATCH net 2/5] s390/qeth: fix race when setting MAC address

2018-06-29 Thread Julian Wiedmann
When qeth_l2_set_mac_address() finds the card in a non-reachable state,
it merely copies the new MAC address into dev->dev_addr so that
__qeth_l2_set_online() can later register it with the HW.

But __qeth_l2_set_online() may very well be running concurrently, so we
can't trust the card state without appropriate locking:
If the online sequence is past the point where it registers
dev->dev_addr (but not yet in SOFTSETUP state), any address change needs
to be properly programmed into the HW. Otherwise the netdevice ends up
with a different MAC address than what's set in the HW, and inbound
traffic is not forwarded as expected.

This is most likely to occur for OSD in LPAR, where
commit 21b1702af12e ("s390/qeth: improve fallback to random MAC address")
now triggers eg. systemd to immediately change the MAC when the netdevice
is registered with a NET_ADDR_RANDOM address.

Fixes: bcacfcbc82b4 ("s390/qeth: fix MAC address update sequence")
Signed-off-by: Julian Wiedmann 
---
 drivers/s390/net/qeth_l2_main.c | 15 +++
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c
index a7cb37da6a21..7daf125dae76 100644
--- a/drivers/s390/net/qeth_l2_main.c
+++ b/drivers/s390/net/qeth_l2_main.c
@@ -501,27 +501,34 @@ static int qeth_l2_set_mac_address(struct net_device 
*dev, void *p)
return -ERESTARTSYS;
}
 
+   /* avoid racing against concurrent state change: */
+   if (!mutex_trylock(&card->conf_mutex))
+   return -EAGAIN;
+
if (!qeth_card_hw_is_reachable(card)) {
ether_addr_copy(dev->dev_addr, addr->sa_data);
-   return 0;
+   goto out_unlock;
}
 
/* don't register the same address twice */
if (ether_addr_equal_64bits(dev->dev_addr, addr->sa_data) &&
(card->info.mac_bits & QETH_LAYER2_MAC_REGISTERED))
-   return 0;
+   goto out_unlock;
 
/* add the new address, switch over, drop the old */
rc = qeth_l2_send_setmac(card, addr->sa_data);
if (rc)
-   return rc;
+   goto out_unlock;
ether_addr_copy(old_addr, dev->dev_addr);
ether_addr_copy(dev->dev_addr, addr->sa_data);
 
if (card->info.mac_bits & QETH_LAYER2_MAC_REGISTERED)
qeth_l2_remove_mac(card, old_addr);
card->info.mac_bits |= QETH_LAYER2_MAC_REGISTERED;
-   return 0;
+
+out_unlock:
+   mutex_unlock(&card->conf_mutex);
+   return rc;
 }
 
 static void qeth_promisc_to_bridge(struct qeth_card *card)
-- 
2.16.4



[PATCH net 3/5] s390/qeth: avoid using is_multicast_ether_addr_64bits on (u8 *)[6]

2018-06-29 Thread Julian Wiedmann
From: Vasily Gorbik 

*ether_addr*_64bits functions have been introduced to optimize
performance-critical paths, which access the 6-byte ethernet address as a
u64 value to get "nice" assembly. A harmless hack works nicely on ethernet
addresses shoved into a structure or a larger buffer, until busted by
KASAN on something like a plain (u8 *)[6].

qeth_l2_set_mac_address calls qeth_l2_remove_mac passing
u8 old_addr[ETH_ALEN] as an argument.

Adding/removing macs for an ethernet adapter is not that performance
critical. Moreover, is_multicast_ether_addr_64bits itself on s390 is not
faster than is_multicast_ether_addr:

is_multicast_ether_addr(%r2) -> %r2
llc %r2,0(%r2)
risbg   %r2,%r2,63,191,0

is_multicast_ether_addr_64bits(%r2) -> %r2
llgc%r2,0(%r2)
risbg   %r2,%r2,63,191,0

So, let's just use is_multicast_ether_addr instead of
is_multicast_ether_addr_64bits.

Fixes: bcacfcbc82b4 ("s390/qeth: fix MAC address update sequence")
Reviewed-by: Julian Wiedmann 
Signed-off-by: Vasily Gorbik 
Signed-off-by: Julian Wiedmann 
---
 drivers/s390/net/qeth_l2_main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c
index 7daf125dae76..5464515b71f1 100644
--- a/drivers/s390/net/qeth_l2_main.c
+++ b/drivers/s390/net/qeth_l2_main.c
@@ -140,7 +140,7 @@ static int qeth_l2_send_setmac(struct qeth_card *card, __u8 
*mac)
 
 static int qeth_l2_write_mac(struct qeth_card *card, u8 *mac)
 {
-   enum qeth_ipa_cmds cmd = is_multicast_ether_addr_64bits(mac) ?
+   enum qeth_ipa_cmds cmd = is_multicast_ether_addr(mac) ?
IPA_CMD_SETGMAC : IPA_CMD_SETVMAC;
int rc;
 
@@ -157,7 +157,7 @@ static int qeth_l2_write_mac(struct qeth_card *card, u8 
*mac)
 
 static int qeth_l2_remove_mac(struct qeth_card *card, u8 *mac)
 {
-   enum qeth_ipa_cmds cmd = is_multicast_ether_addr_64bits(mac) ?
+   enum qeth_ipa_cmds cmd = is_multicast_ether_addr(mac) ?
IPA_CMD_DELGMAC : IPA_CMD_DELVMAC;
int rc;
 
-- 
2.16.4



How should NFS, SUNRPC interact with Network and User Namespaces?

2018-06-29 Thread Sargun Dhillon
Today, sunrpc lives in net/sunrpc. As far as I can tell, the primary
production consumer of it is NFS. The RPC clients have the concept of
being tied back to a network namespace. On the other hand, NFS has its
own superblock with its own user namespace.

When sunrpc converts kuids to UIDs to send over the wire, should it use
the user namespace of the network namespace that the RPC client is
associated with? This is required for auth_unix (UID based).
Alternatively, should the sunrpc RPC client use the user namespace
associated with the NFS superblock?
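
For illustration, the two candidate conversions being asked about would
look roughly like this (a sketch only; the helper names are made up,
while from_kuid() and the namespace fields are mainline):

#include <linux/fs.h>
#include <linux/uidgid.h>
#include <net/net_namespace.h>

/* Option A: scope the on-the-wire UID by the netns that owns the
 * RPC transport.
 */
static uid_t wire_uid_from_netns(const struct net *net, kuid_t kuid)
{
        return from_kuid(net->user_ns, kuid);
}

/* Option B: scope it by the user namespace of the NFS superblock. */
static uid_t wire_uid_from_sb(const struct super_block *sb, kuid_t kuid)
{
        return from_kuid(sb->s_user_ns, kuid);
}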


Re: [patch net-next v2 0/9] net: sched: introduce chain templates support with offloading to mlxsw

2018-06-29 Thread Samudrala, Sridhar

On 6/29/2018 6:05 AM, Jiri Pirko wrote:

Fri, Jun 29, 2018 at 02:54:36PM CEST, dsah...@gmail.com wrote:

On 6/29/18 6:48 AM, Jiri Pirko wrote:

Fri, Jun 29, 2018 at 02:12:21PM CEST, j...@mojatatu.com wrote:

On 29/06/18 04:39 AM, Jiri Pirko wrote:

Fri, Jun 29, 2018 at 12:25:53AM CEST, xiyou.wangc...@gmail.com wrote:

On Thu, Jun 28, 2018 at 6:10 AM Jiri Pirko  wrote:

Add a template of type flower allowing insertion of rules matching on the
last 2 bytes of the destination mac address:
# tc chaintemplate add dev dummy0 ingress proto ip flower dst_mac 00:00:00:00:00:00/00:00:00:00:FF:FF

The template is now shown in the list:
# tc chaintemplate show dev dummy0 ingress
chaintemplate flower chain 0
dst_mac 00:00:00:00:00:00/00:00:00:00:ff:ff
eth_type ipv4

Add another template, this time for chain number 22:
# tc chaintemplate add dev dummy0 ingress proto ip chain 22 flower dst_ip 0.0.0.0/16
# tc chaintemplate show dev dummy0 ingress
chaintemplate flower chain 0
dst_mac 00:00:00:00:00:00/00:00:00:00:ff:ff
eth_type ipv4
chaintemplate flower chain 22
eth_type ipv4
dst_ip 0.0.0.0/16

So, if I want to check the template of a chain, I have to use
'tc chaintemplate... chain X'.

If I want to check the filters in a chain, I have to use
'tc filter show  chain X'.

If you introduce 'tc chain', it would just need one command:
`tc chain show ... X` which could list its template first and
followed by filters in this chain, something like:

# tc chain show dev eth0 chain X
template: # could be none

filter1
...
filter2
...

Isn't it more elegant?

Well, that is just another iproute2 command. It would use the same
kernel uapi. Filters+templates. Sure, why not. Can be easily introduced.
Let's do it in a follow-up iproute2 patch.


Half a dozen or 6 - take your pick, really.
I would call the template an attribute as opposed to a stand-alone
object, i.e. a chain of filters may have a template. If you have to
introduce a new object then Sridhar's suggested syntax seems appealing.

I think what I have makes sense. It maps nicely to what it really is
inside the kernel. What Cong proposes looks like a nice extension of the
userspace app to do more things in one go. Will address that in a
followup iproute2 patch.

The resolution of the syntax affects the uapi changes proposed. You are
wanting to add new RTM commands which suggests new objects. If a
template is an attribute of an existing object then the netlink API
should indicate that.

There is no existing "chain" object. So no, no uapi changes.


So instead of introducing a 'chaintemplate' object in the kernel, can't we
add a 'chain' object that takes the 'template' as an attribute?




Re: [PATCH net-next v2 3/4] net: check tunnel option type in tunnel flags

2018-06-29 Thread Jakub Kicinski
On Fri, 29 Jun 2018 09:04:15 +0200, Daniel Borkmann wrote:
> On 06/28/2018 06:54 PM, Jakub Kicinski wrote:
> > On Thu, 28 Jun 2018 09:42:06 +0200, Jiri Benc wrote:  
> >> On Wed, 27 Jun 2018 11:49:49 +0200, Daniel Borkmann wrote:  
> >>> Looks good to me, and yes in BPF case a mask like TUNNEL_OPTIONS_PRESENT
> >>> is right approach since this is opaque info and solely defined by the BPF
> >>> prog that is using the generic helper.
> >>
> >> Wouldn't it make sense to introduce some safeguards here (in a backward
> >> compatible way, of course)? It's easy to mistakenly set data for a
> >> different tunnel type in a BPF program and then be surprised by the
> >> result. It might help users if such usage was detected by the kernel,
> >> one way or another.  
> > 
> > Well, that's how it works today ;)  
> 
> Well, it was designed like that on purpose, to be i) agnostic of the
> underlying device, ii) to not clutter BPF API with tens of different
> APIs effectively doing the same thing, and at the same time to avoid
> adding protocol specifics. E.g. at least core bits of
> bpf_skb_{set,get}_tunnel_key() will work whether I use vxlan or geneve
> underneath (we are actually using it this way) and I could use things
> like tun_id to encode custom meta data from BPF for either of them
> depending on flavor picked by orchestration system. For the tunnel
> options in bpf_skb_{set,get}_tunnel_opt() it's similar although here
> there needs to be awareness of the underlying dev depending on whether
> you encode data into e.g. gbp or tlvs, etc. However, downside right now
> I can see with a patch like below is that:
> 
> i) People might still just keep using 'TUNNEL_OPTIONS_PRESENT path'
> since available and backwards compatible with current/older kernels,
> ii) we cut bits away from size over time for each new tunnel proto
> added in future that would support tunnel options, iii) that extension
> is one-sided (at least below) and same would be needed in getter part,
> and iv) there needs to be a way for the case when folks add new tunnel
> options where we don't need to worry that we forget updating
> BPF_F_TUN_* each time otherwise this will easily slip through and again
> people will just rely on using TUNNEL_OPTIONS_PRESENT catchall. Given
> latter and in particular point i) I wouldn't think it's worth the pain,
> the APIs were added to BPF in v4.6 so this would buy them 2 more years
> wrt kernel compatibility with same functionality level. And point v), I
> just noticed the patch is actually buggy: size is ARG_CONST_SIZE and
> verifier will attempt to check the value whether the buffer passed in
> argument 2 is valid or not, so using flags here in upper bits would let
> verification fail, you'd really have to make a new helper just for
> this.

Ah, indeed.  I'd rather avoid a new helper, if we reuse an old one
people can always write a program like:

        err = helper(all_flags);
        if (err == -EINVAL)
                err = helper(fallback_flags);

With a new helper the program will not even load on old kernels :(

Could we add the new flags to bpf_skb_set_tunnel_key(), maybe?  It is a
bit ugly because only options care about the flags and in theory
bpf_skb_set_tunnel_key() doesn't have to be called before
bpf_skb_set_tunnel_opt() ...
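
Spelled out, that would let programs probe at runtime; a sketch in BPF C,
assuming a hypothetical BPF_F_TUN_GENEVE_OPT flag were accepted by
bpf_skb_set_tunnel_key() (no such flag exists today; older kernels would
reject the unknown bit, which is what makes the fallback work):

#include <linux/bpf.h>
#include "bpf_helpers.h"        /* selftests/libbpf helper declarations */

/* Hypothetical flag, for illustration only -- not in any UAPI. */
#define BPF_F_TUN_GENEVE_OPT    (1ULL << 60)

static __always_inline int set_tun_key_compat(struct __sk_buff *skb,
                                              struct bpf_tunnel_key *key)
{
        int err;

        /* Kernels that know the flag type-check the tunnel options... */
        err = bpf_skb_set_tunnel_key(skb, key, sizeof(*key),
                                     BPF_F_ZERO_CSUM_TX |
                                     BPF_F_TUN_GENEVE_OPT);
        if (err)
                /* ...older kernels reject the unknown flag: fall back. */
                err = bpf_skb_set_tunnel_key(skb, key, sizeof(*key),
                                             BPF_F_ZERO_CSUM_TX);
        return err;
}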

Alternatively we could do this:

diff --git a/net/core/filter.c b/net/core/filter.c
index dade922678f6..9edcc2947be5 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3582,7 +3582,9 @@ BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb,
if (unlikely(size > IP_TUNNEL_OPTS_MAX))
return -ENOMEM;
 
-   ip_tunnel_info_opts_set(info, from, size, TUNNEL_OPTIONS_PRESENT);
+   ip_tunnel_info_opts_set(info, from, size, TUNNEL_VXLAN_OPT |
+ TUNNEL_GENEVE_OPT |
+ TUNNEL_ERSPAN_OPT);
 
return 0;
 }

to force ourselves to solve the problem when a next protocol is added ;)

Or should we just leave things as they are?  As you explain the new
helper would really be of marginal use given the backward compatibility
requirement and availability of the old one...


[PATCH net] alx: take rtnl before calling __alx_open from resume

2018-06-29 Thread Sabrina Dubroca
The __alx_open function can be called from ndo_open, which is called
under RTNL, or from alx_resume, which isn't. Since commit d768319cd427,
we're calling the netif_set_real_num_{tx,rx}_queues functions, which
need to be called under RTNL.

This is similar to commit 0c2cc02e571a ("igb: Move the calls to set the
Tx and Rx queues into igb_open").

Fixes: d768319cd427 ("alx: enable multiple tx queues")
Signed-off-by: Sabrina Dubroca 
---
 drivers/net/ethernet/atheros/alx/main.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/atheros/alx/main.c b/drivers/net/ethernet/atheros/alx/main.c
index 567ee54504bc..5e5022fa1d04 100644
--- a/drivers/net/ethernet/atheros/alx/main.c
+++ b/drivers/net/ethernet/atheros/alx/main.c
@@ -1897,13 +1897,19 @@ static int alx_resume(struct device *dev)
struct pci_dev *pdev = to_pci_dev(dev);
struct alx_priv *alx = pci_get_drvdata(pdev);
struct alx_hw *hw = &alx->hw;
+   int err;
 
alx_reset_phy(hw);
 
if (!netif_running(alx->dev))
return 0;
netif_device_attach(alx->dev);
-   return __alx_open(alx, true);
+
+   rtnl_lock();
+   err = __alx_open(alx, true);
+   rtnl_unlock();
+
+   return err;
 }
 
 static SIMPLE_DEV_PM_OPS(alx_pm_ops, alx_suspend, alx_resume);
-- 
2.18.0



Re: Link modes representation in phylib

2018-06-29 Thread Maxime Chevallier
On Fri, 29 Jun 2018 17:34:41 +0200
Andrew Lunn  wrote:

>> Wow indeed that will help a lot. Just so that we're in sync, do you
>> plan to add those helpers, or should I take this branch as a base for
>> the conversion and go on ?  
>
>I'm still working on it. I can probably push again in the next few
>minutes. But they won't be compile tested, i.e. broken...
>
>It might make more sense if you work on the core, and leave me to
>cleanup the MAC drivers. You can use my branch, if you need to change
>the helpers, just expect it to be unstable, get rebased, and might not
>even compile...

Ok no problem, I'll keep working on the core while keeping an eye on
your branch then.

Thanks a lot,

Maxime


Re: Link modes representation in phylib

2018-06-29 Thread Andrew Lunn
> Wow indeed that will help a lot. Just so that we're in sync, do you
> plan to add those helpers, or should I take this branch as a base for
> the conversion and go on ?

I'm still working on it. I can probably push again in the next few
minutes. But they won't be compile tested, i.e. broken...

It might make more sense if you work on the core, and leave me to
cleanup the MAC drivers. You can use my branch, if you need to change
the helpers, just expect it to be unstable, get rebased, and might not
even compile...

 Andrew


[PATCH net] sfc: correctly initialise filter rwsem for farch

2018-06-29 Thread Bert Kenward
Fixes: fc7a6c287ff3 ("sfc: use a semaphore to lock farch filters too")
Suggested-by: Joseph Korty 
Signed-off-by: Bert Kenward 
---
 drivers/net/ethernet/sfc/farch.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/sfc/farch.c b/drivers/net/ethernet/sfc/farch.c
index 8edf20967c82..e045a5d6b938 100644
--- a/drivers/net/ethernet/sfc/farch.c
+++ b/drivers/net/ethernet/sfc/farch.c
@@ -2794,6 +2794,7 @@ int efx_farch_filter_table_probe(struct efx_nic *efx)
if (!state)
return -ENOMEM;
efx->filter_state = state;
+   init_rwsem(&state->lock);
 
table = &state->table[EFX_FARCH_FILTER_TABLE_RX_IP];
table->id = EFX_FARCH_FILTER_TABLE_RX_IP;
-- 
2.13.6



Re: Link modes representation in phylib

2018-06-29 Thread Maxime Chevallier
Hello Andrew,

On Fri, 29 Jun 2018 15:43:43 +0200
Andrew Lunn  wrote:

>> Thanks for the suggestion. I see how this can be done with
>> phydrv->supported and phydev->lp_advertising, however I'm not sure how
>> we should deal with the fact that ethernet drivers directly access
>> fields such as "advertising" or "supported".  
>
>Hi Maxime
>
>I started cleaning up some of the MAC drivers. Take a look at
>
>https://github.com/lunn/linux.git v4.18-rc1-net-next-phy-cleanup
>
>That moved a lot of the MAC code into helpers, making it easier to
>update. It might make sense to add a couple of more helpers, for what
>remains.

Wow indeed that will help a lot. Just so that we're in sync, do you
plan to add those helpers, or should I take this branch as a base for
the conversion and go on ?

Thanks for this,

Maxime

-- 
Maxime Chevallier, Bootlin (formerly Free Electrons)
Embedded Linux and kernel engineering
https://bootlin.com


Re: [pull request][net-next 00/12] Mellanox, mlx5e updates 2018-06-28

2018-06-29 Thread David Miller
From: Saeed Mahameed 
Date: Thu, 28 Jun 2018 14:50:51 -0700

> The following pull request includes updates for mlx5e netdevice driver.
> For more information please see tag log below.
> 
> Please pull and let me know if there's any problem.

Pulled, thanks Saeed.


Re: [PATCH net-next v2 0/4] net: Geneve options support for TC act_tunnel_key

2018-06-29 Thread David Miller
From: Jakub Kicinski 
Date: Tue, 26 Jun 2018 21:39:33 -0700

> Simon & Pieter say:
> 
> This set adds Geneve Options support to the TC tunnel key action.
> It provides the plumbing required to configure Geneve variable length
> options.  The options can be configured in the form CLASS:TYPE:DATA,
> where CLASS is represented as a 16bit hexadecimal value, TYPE as an 8bit
> hexadecimal value and DATA as a variable length hexadecimal value.
> Additionally multiple options may be listed using a comma delimiter.
> 
> v2:
>  - fix sparse warnings in patches 3 and 4 (first one reported by
>build bot).

Series applied, thanks.


[PATCH net-next v2 1/1] tc-testing: initial version of tunnel_key unit tests

2018-06-29 Thread Keara Leibovitz
Create unit tests for the tc tunnel_key action.

v2:
For the tests expecting failures, added non-zero exit codes in the
teardowns. This prevents those tests from failing if the act_tunnel_key
module is unloaded.
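
(For readers unfamiliar with tdc: a setup/teardown entry of the form
["$TC actions flush action tunnel_key", 0, 1, 255] appears to list the
command followed by the exit codes the framework accepts for it, which
is how the relaxed teardowns described above avoid spurious failures.)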

Signed-off-by: Keara Leibovitz 
---
 .../tc-testing/tc-tests/actions/tunnel_key.json| 749 +
 1 file changed, 749 insertions(+)
 create mode 100644 tools/testing/selftests/tc-testing/tc-tests/actions/tunnel_key.json

diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/tunnel_key.json b/tools/testing/selftests/tc-testing/tc-tests/actions/tunnel_key.json
new file mode 100644
index 000000000000..d878ce1c7d08
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tc-tests/actions/tunnel_key.json
@@ -0,0 +1,749 @@
+[
+{
+"id": "2b11",
+"name": "Add tunnel_key set action with mandatory parameters",
+"category": [
+"actions",
+"tunnel_key"
+],
+"setup": [
+[
+"$TC actions flush action tunnel_key",
+0,
+1,
+255
+]
+],
+"cmdUnderTest": "$TC actions add action tunnel_key set src_ip 10.10.10.1 dst_ip 20.20.20.2 id 1",
+"expExitCode": "0",
+"verifyCmd": "$TC actions list action tunnel_key",
+"matchPattern": "action order [0-9]+: tunnel_key.*set.*src_ip 10.10.10.1.*dst_ip 20.20.20.2.*key_id 1",
+"matchCount": "1",
+"teardown": [
+"$TC actions flush action tunnel_key"
+]
+},
+{
+"id": "dc6b",
+"name": "Add tunnel_key set action with missing mandatory src_ip parameter",
+"category": [
+"actions",
+"tunnel_key"
+],
+"setup": [
+[
+"$TC actions flush action tunnel_key",
+0,
+1,
+255
+]
+],
+"cmdUnderTest": "$TC actions add action tunnel_key set dst_ip 20.20.20.2 id 100",
+"expExitCode": "255",
+"verifyCmd": "$TC actions list action tunnel_key",
+"matchPattern": "action order [0-9]+: tunnel_key set.*dst_ip 20.20.20.2.*key_id 100",
+"matchCount": "0",
+"teardown": [
+   [
+   "$TC actions flush action tunnel_key",
+   0,
+   1,
+   255
+   ]
+]
+},
+{
+"id": "7f25",
+"name": "Add tunnel_key set action with missing mandatory dst_ip parameter",
+"category": [
+"actions",
+"tunnel_key"
+],
+"setup": [
+[
+"$TC actions flush action tunnel_key",
+0,
+1,
+255
+]
+],
+"cmdUnderTest": "$TC actions add action tunnel_key set src_ip 10.10.10.1 id 100",
+"expExitCode": "255",
+"verifyCmd": "$TC actions list action tunnel_key",
+"matchPattern": "action order [0-9]+: tunnel_key set.*src_ip 10.10.10.1.*key_id 100",
+"matchCount": "0",
+"teardown": [
+   [
+   "$TC actions flush action tunnel_key",
+   0,
+   1,
+   255
+   ]
+]
+},
+{
+"id": "ba4e",
+"name": "Add tunnel_key set action with missing mandatory id parameter",
+"category": [
+"actions",
+"tunnel_key"
+],
+"setup": [
+[
+"$TC actions flush action tunnel_key",
+0,
+1,
+255
+]
+],
+"cmdUnderTest": "$TC actions add action tunnel_key set src_ip 10.10.10.1 dst_ip 20.20.20.2",
+"expExitCode": "255",
+"verifyCmd": "$TC actions list action tunnel_key",
+"matchPattern": "action order [0-9]+: tunnel_key set.*src_ip 10.10.10.1.*dst_ip 20.20.20.2",
+"matchCount": "0",
+"teardown": [
+   [
+   "$TC actions flush action tunnel_key",
+   0,
+   1,
+   255
+   ]
+]
+},
+{
+"id": "a5e0",
+"name": "Add tunnel_key set action with invalid src_ip parameter",
+"category": [
+"actions",
+"tunnel_key"
+],
+"setup": [
+[
+"$TC actions flush action tunnel_key",
+0,
+1,
+255
+]
+],
+"cmdUnderTest": "$TC actions add action tunnel_key set src_ip 300.168.100.1 dst_ip 192.168.200.1 id 7 index 1",
+"expExitCode": "1",
+"verifyCmd": "$TC actions get action tunnel_key index 1",
+"matchPattern": "action order [0-9]+: tunnel_key set.*src_ip 300.168.100.1.*dst_ip 192.168.200.1.*key_id 7.*index 1 ref",
+"matchCount": "0",
+"teardown": [
+   [
+   

Re: [PATCH net-next V4 1/2] cxgb4: Add support for FW_ETH_TX_PKT_VM_WR

2018-06-29 Thread David Miller
From: Ganesh Goudar 
Date: Tue, 26 Jun 2018 17:10:25 +0530

> From: Arjun Vynipadath 
> 
> The present TX work request (FW_ETH_TX_PKT_WR) can't be used for
> host->vf communication, since it doesn't loop back the outgoing
> packets to virtual interfaces on the same port. This can be done
> using FW_ETH_TX_PKT_VM_WR.
> This fix depends on ethtool_flags to determine which WR to use for
> the TX path. Support for setting these flags by the user is added in
> the next commit.
> 
> Based on the original work by : Casey Leedom 
> 
> Signed-off-by: Casey Leedom 
> Signed-off-by: Arjun Vynipadath 
> Signed-off-by: Ganesh Goudar 

Applied.


Re: [PATCH net-next V4 2/2] cxgb4: Support ethtool private flags

2018-06-29 Thread David Miller
From: Ganesh Goudar 
Date: Tue, 26 Jun 2018 17:10:50 +0530

> From: Arjun Vynipadath 
> 
> This is used to change TX work requests, which helps in
> host->vf communication.
> 
> Signed-off-by: Arjun Vynipadath 
> Signed-off-by: Casey Leedom 
> Signed-off-by: Ganesh Goudar 

Applied.


Re: [PATCH net-next 1/1] tc-testing: initial version of tunnel_key unit tests

2018-06-29 Thread Keara Leibovitz
Please disregard - wrong patch.

Keara


Re: Diagnosing network module for missing link establishment (cxgb3, Chelsio T320)

2018-06-29 Thread U.Mutlu

Andrew Lunn wrote on 06/29/2018 03:33 PM:

Kernel: 3.16.0-4-amd64 (Debian v8 stock kernel)


This is very old. Please at least try the current version of Debian,
"Stretch".

Andrew



I had the same idea and attached the card to another machine with a recent 
kernel:
# uname -a
Linux c5 4.14.1-my1a #1 SMP Wed Nov 22 11:20:35 CET 2017 x86_64 GNU/Linux

But the version of the cxgb3.ko is the same, and the same effect happens: no 
link establishment.


So, what should I do to get it working?
I could also re-compile the cxgb3 driver with diagnostic settings
enabled, if it has any (I haven't studied that yet).


Is there an easier method to find the reason, and hopefully get it to
establish a link?


Btw, to narrow down the problem, I'm simply attaching the fibre cable
from port 1 to port 2 on the same card. Is doing so ok?

Thx


# dmesg | grep "Chelsio\|cxgb\|eth[0-9]"
[0.670473] cxgb3: Chelsio T3 Network Driver - version 1.1.5-ko
[0.691397] alx 0000:04:00.0 eth0: Qualcomm Atheros AR816x/AR817x Ethernet [d0:50:99:0e:89:04]

[1.044840] cxgb3 0000:01:00.0: Port 0 using 4 queue sets.
[1.044883] cxgb3 0000:01:00.0: Port 1 using 4 queue sets.
[1.044924] cxgb3 0000:01:00.0 eth1: Chelsio T320 10GBASE-R RNIC (rev 3) PCI Express x8 MSI-X
[1.044964] cxgb3: eth1: 128MB CM, 256MB PMTX, 256MB PMRX, S/N: PT37080022
[1.044999] cxgb3 0000:01:00.0 eth2: Chelsio T320 10GBASE-R RNIC (rev 3) PCI Express x8 MSI-X

[1.728291] cxgb3 0000:01:00.0 eth6: renamed from eth1
[1.748264] systemd-udevd[368]: renamed network interface eth1 to eth6
[1.748351] cxgb3 0000:01:00.0 eth7: renamed from eth2
[1.777472] alx 0000:04:00.0 eth3: renamed from eth0
[1.777522] systemd-udevd[378]: renamed network interface eth2 to eth7
[1.789052] systemd-udevd[372]: renamed network interface eth0 to eth3
[   31.248196] alx 0000:04:00.0 eth3: NIC Up: 1 Gbps Full



[PATCH net-next 1/1] tc-testing: initial version of tunnel_key unit tests

2018-06-29 Thread Keara Leibovitz
Create unit tests for the tc tunnel_key action.


Signed-off-by: Keara Leibovitz 
---
 .../tc-testing/tc-tests/actions/tunnel_key.json| 676 +
 1 file changed, 676 insertions(+)
 create mode 100644 tools/testing/selftests/tc-testing/tc-tests/actions/tunnel_key.json

diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/tunnel_key.json b/tools/testing/selftests/tc-testing/tc-tests/actions/tunnel_key.json
new file mode 100644
index 000000000000..bfe522ac8177
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tc-tests/actions/tunnel_key.json
@@ -0,0 +1,676 @@
+[
+{
+"id": "2b11",
+"name": "Add tunnel_key set action with mandatory parameters",
+"category": [
+"actions",
+"tunnel_key"
+],
+"setup": [
+[
+"$TC actions flush action tunnel_key",
+0,
+1,
+255
+]
+],
+"cmdUnderTest": "$TC actions add action tunnel_key set src_ip 10.10.10.1 dst_ip 20.20.20.2 id 1",
+"expExitCode": "0",
+"verifyCmd": "$TC actions list action tunnel_key",
+"matchPattern": "action order [0-9]+: tunnel_key.*set.*src_ip 10.10.10.1.*dst_ip 20.20.20.2.*key_id 1",
+"matchCount": "1",
+"teardown": [
+"$TC actions flush action tunnel_key"
+]
+},
+{
+"id": "dc6b",
+"name": "Add tunnel_key set action with missing mandatory src_ip parameter",
+"category": [
+"actions",
+"tunnel_key"
+],
+"setup": [
+[
+"$TC actions flush action tunnel_key",
+0,
+1,
+255
+]
+],
+"cmdUnderTest": "$TC actions add action tunnel_key set dst_ip 20.20.20.2 id 100",
+"expExitCode": "255",
+"verifyCmd": "$TC actions list action tunnel_key",
+"matchPattern": "action order [0-9]+: tunnel_key set.*dst_ip 20.20.20.2.*key_id 100",
+"matchCount": "0",
+"teardown": [
+"$TC actions flush action tunnel_key"
+]
+},
+{
+"id": "7f25",
+"name": "Add tunnel_key set action with missing mandatory dst_ip parameter",
+"category": [
+"actions",
+"tunnel_key"
+],
+"setup": [
+[
+"$TC actions flush action tunnel_key",
+0,
+1,
+255
+]
+],
+"cmdUnderTest": "$TC actions add action tunnel_key set src_ip 10.10.10.1 id 100",
+"expExitCode": "255",
+"verifyCmd": "$TC actions list action tunnel_key",
+"matchPattern": "action order [0-9]+: tunnel_key set.*src_ip 10.10.10.1.*key_id 100",
+"matchCount": "0",
+"teardown": [
+"$TC actions flush action tunnel_key"
+]
+},
+{
+"id": "ba4e",
+"name": "Add tunnel_key set action with missing mandatory id parameter",
+"category": [
+"actions",
+"tunnel_key"
+],
+"setup": [
+[
+"$TC actions flush action tunnel_key",
+0,
+1,
+255
+]
+],
+"cmdUnderTest": "$TC actions add action tunnel_key set src_ip 10.10.10.1 dst_ip 20.20.20.2",
+"expExitCode": "255",
+"verifyCmd": "$TC actions list action tunnel_key",
+"matchPattern": "action order [0-9]+: tunnel_key set.*src_ip 10.10.10.1.*dst_ip 20.20.20.2",
+"matchCount": "0",
+"teardown": [
+"$TC actions flush action tunnel_key"
+]
+},
+{
+"id": "a5e0",
+"name": "Add tunnel_key set action with invalid src_ip parameter",
+"category": [
+"actions",
+"tunnel_key"
+],
+"setup": [
+[
+"$TC actions flush action tunnel_key",
+0,
+1,
+255
+]
+],
+"cmdUnderTest": "$TC actions add action tunnel_key set src_ip 300.168.100.1 dst_ip 192.168.200.1 id 7 index 1",
+"expExitCode": "1",
+"verifyCmd": "$TC actions get action tunnel_key index 1",
+"matchPattern": "action order [0-9]+: tunnel_key set.*src_ip 300.168.100.1.*dst_ip 192.168.200.1.*key_id 7.*index 1 ref",
+"matchCount": "0",
+"teardown": [
+"$TC actions flush action tunnel_key"
+]
+},
+{
+"id": "eaa8",
+"name": "Add tunnel_key set action with invalid dst_ip parameter",
+"category": [
+"actions",
+"tunnel_key"
+],
+"setup": [
+[
+"$TC actions flush action tunnel_key",
+0,
+1,
+255
+]
+],
+"cmdUnderTest": 

Re: [bpf PATCH v4 3/4] bpf: sockhash fix omitted bucket lock in sock_close

2018-06-29 Thread John Fastabend
On 06/29/2018 06:45 AM, Daniel Borkmann wrote:
> On 06/25/2018 05:34 PM, John Fastabend wrote:
> [...]
>> @@ -2302,9 +2347,12 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
>>  goto bucket_err;
>>  }
>>  
>> -e->hash_link = l_new;
>> -e->htab = container_of(map, struct bpf_htab, map);
>> +rcu_assign_pointer(e->hash_link, l_new);
>> +rcu_assign_pointer(e->htab,
>> +   container_of(map, struct bpf_htab, map));
>> +write_lock_bh(&sock->sk_callback_lock);
>>  list_add_tail(&e->list, &psock->maps);
>> +write_unlock_bh(&sock->sk_callback_lock);
>>  
>>  /* add new element to the head of the list, so that
>>   * concurrent search will find it before old elem
>> @@ -2314,8 +2362,10 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
>>  psock = smap_psock_sk(l_old->sk);
>>  
>>  hlist_del_rcu(&l_old->hash_node);
>> +write_lock_bh(&l_old->sk->sk_callback_lock);
>>  smap_list_hash_remove(psock, l_old);
>>  smap_release_sock(psock, l_old->sk);
>> +write_unlock_bh(&l_old->sk->sk_callback_lock);
>>  free_htab_elem(htab, l_old);
>>  }
>>  raw_spin_unlock_bh(&b->lock);
> 
> Rather a question for clarification that came up while staring at this part
> in sock_hash_ctx_update_elem():
> 
> In the 'err' label, wouldn't we also need the sk_callback_lock given we access
> a potential psock, and modify its state (refcnt) would this be needed as well,
> or are we good here since we're under RCU context anyway? Hmm, if latter is
> indeed the case, then I'm wondering why we would need the above /sock/'s
> sk_callback_lock for modifying list handling inside the psock and couldn't use
> some locking that is only in scope of the actual struct smap_psock?
> 

So... originally I used sk_callback_lock to cover both the maps list and
modifying sk callbacks (its real intention!) but with the addition of
sockhash that has gotten increasingly clumsy. At this point it seems like
best/cleanest option would be to move sk_callback_lock to _only_ cover
callback updates and create a new lock to protect maps. Then we avoid
this pain.
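
A rough sketch of that direction, with a lock whose only job is the
psock->maps list ('maps_lock' is a name made up here; the real naming
and placement would be up to the respin):

#include <linux/list.h>
#include <linux/spinlock.h>

struct smap_psock {
        /* ... existing fields ... */
        spinlock_t maps_lock;           /* protects 'maps' only */
        struct list_head maps;          /* entries back-referencing maps */
};

static void smap_psock_map_add(struct smap_psock *psock,
                               struct list_head *entry)
{
        spin_lock_bh(&psock->maps_lock);
        list_add_tail(entry, &psock->maps);
        spin_unlock_bh(&psock->maps_lock);
}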

Question is do we want to do this in bpf or bpf-next? I'm thinking we
respin this series, yet again, with the above and hopefully at that
point the locking will be cleaner.

Your observation about err label is correct, see below.

> My other question is, if someone calls the update helper with BPF_NOEXIST 
> which
> we seem to support with bailing out via -EEXIST, and if there's currently a
> psock attached already to the sock, then we drop this one off from the socket?
> Is that correct / expected?

I have a patch I'll submit shortly to fix this. This was missed fallout from
allowing socks in multiple maps. Checking to see if it has a psock is not
sufficient to know if we can dec the refcnt. It used to be in earlier versions
before the multiple map support. Also because the psock can be referenced still
from another map the sk_callback_lock (or new map specific lock) is needed
to protect maps.

> 
> Thanks,
> Daniel
> 



Re: [PATCH net] tcp: prevent bogus FRTO undos with non-SACK flows

2018-06-29 Thread Neal Cardwell
On Fri, Jun 29, 2018 at 6:07 AM Ilpo Järvinen  wrote:
>
> If SACK is not enabled and the first cumulative ACK after the RTO
> retransmission covers more than the retransmitted skb, a spurious
> FRTO undo will trigger (assuming FRTO is enabled for that RTO).
> The reason is that any non-retransmitted segment acknowledged will
> set FLAG_ORIG_SACK_ACKED in tcp_clean_rtx_queue even if there is
> no indication that it would have been delivered for real (the
> scoreboard is not kept with TCPCB_SACKED_ACKED bits in the non-SACK
> case so the check for that bit won't help like it does with SACK).
> Having FLAG_ORIG_SACK_ACKED set results in the spurious FRTO undo
> in tcp_process_loss.
>
> We need to use a stricter condition for the non-SACK case and check
> that none of the cumulatively ACKed segments were retransmitted
> to prove that progress is due to original transmissions. Only then
> keep FLAG_ORIG_SACK_ACKED set, allowing FRTO undo to proceed in
> non-SACK case.
>
> (FLAG_ORIG_SACK_ACKED is planned to be renamed to FLAG_ORIG_PROGRESS
> to better indicate its purpose but to keep this change minimal, it
> will be done in another patch).
>
> Besides burstiness and congestion control violations, this problem
> can result in RTO loop: When the loss recovery is prematurely
> undoed, only new data will be transmitted (if available) and
> the next retransmission can occur only after a new RTO which in case
> of multiple losses (that are not for consecutive packets) requires
> one RTO per loss to recover.
>
> Signed-off-by: Ilpo Järvinen 
> ---
>  net/ipv4/tcp_input.c | 9 +
>  1 file changed, 9 insertions(+)
>
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 045d930..8e5522c 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -3181,6 +3181,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
>
> if (tcp_is_reno(tp)) {
> tcp_remove_reno_sacks(sk, pkts_acked);
> +
> +   /* If any of the cumulatively ACKed segments was
> +* retransmitted, non-SACK case cannot confirm that
> +* progress was due to original transmission due to
> +* lack of TCPCB_SACKED_ACKED bits even if some of
> +* the packets may have been never retransmitted.
> +*/
> +   if (flag & FLAG_RETRANS_DATA_ACKED)
> +   flag &= ~FLAG_ORIG_SACK_ACKED;
> } else {
> int delta;

Thanks, Ilpo. I ran this through our TCP packetdrill tests and only
got one failure, which detected the change you made, so that on the
first ACK in a non-SACK F-RTO case we stay in CA_Loss.

However, depending on the exact scenario we are concerned about here,
this may not be a complete solution. Consider the packetdrill test I
cooked below, which is for a scenario where we imagine a non-SACK
connection, with data packet #1 and all dupacks lost. With your patch
we don't have an F-RTO undo on the first cumulative ACK, which is a
definite improvement. However, we end up with an F-RTO undo on the
*second* cumulative ACK, because the second ACK didn't cover any
retransmitted data. That means that we end up in the second round trip
back in the initial slow-start mode with a very high cwnd and infinite
ssthresh, even though data was actually lost in the first round trip.

I'm not sure how much we care about all these cases. Perhaps we should
just disable F-RTO if the connection has no SACK enabled? These
non-SACK connections are just such a rare case at this point that I
would propose it's not worth spending too much time on this.
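
If that route were taken, the gating itself would be a one-liner along
these lines (a sketch, not a tested patch; the real condition where
tp->frto is armed in tcp_enter_loss() has more terms):

#include <net/tcp.h>

/* Arm F-RTO only when the flow negotiated SACK. */
static bool tcp_should_arm_frto(const struct net *net,
                                const struct tcp_sock *tp)
{
        return net->ipv4.sysctl_tcp_frto && tcp_is_sack(tp);
}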

The following packetdrill script passes when Ilpo's patch is applied.
This documents the behavior, which I think is questionable:

0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
   +0 bind(3, ..., ...) = 0
   +0 listen(3, 1) = 0

   +0 < S 0:0(0) win 32792 
   +0 > S. 0:0(0) ack 1 
 +.02 < . 1:1(0) ack 1 win 257
   +0 accept(3, ..., ...) = 4
   +0 write(4, ..., 27000) = 27000
   +0 > . 1:10001(10000) ack 1

// Suppose 1:1001 is lost and all dupacks are lost.

// RTO and retransmit head. This fills a real hole.
 +.22 > . 1:1001(1000) ack 1

+.005 < . 1:1(0) ack 2001 win 257
   +0 > . 10001:13001(3000) ack 1
   +0 %{ assert tcpi_ca_state == TCP_CA_Loss, tcpi_ca_state }%
   +0 %{ assert tcpi_snd_cwnd == 3, tcpi_snd_cwnd }%
   +0 %{ assert tcpi_snd_ssthresh == 7, tcpi_snd_ssthresh }%

+.002 < . 1:1(0) ack 10001 win 257
   +0 %{ assert tcpi_ca_state == TCP_CA_Open, tcpi_ca_state }%
   +0 %{ assert tcpi_snd_cwnd == 18, tcpi_snd_cwnd }%
   +0 %{ assert tcpi_snd_ssthresh == 2147483647, tcpi_snd_ssthresh }%
   +0 > . 13001:15001(2000) ack 1

---

neal


[PATCH net-next] strparser: Call skb_unclone conditionally

2018-06-29 Thread Vakul Garg
Calling skb_unclone() is expensive as it triggers a memcpy operation.
Instead of calling skb_unclone() unconditionally, call it only when skb
has a shared frag_list. This improves tls rx throughput significantly.
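
For reference, the predicate being added is the trivial shared-info
check from include/linux/skbuff.h, so the common case with no frag_list
now skips the unclone (and its memcpy) entirely:

static inline bool skb_has_frag_list(const struct sk_buff *skb)
{
        return skb_shinfo(skb)->frag_list != NULL;
}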

Signed-off-by: Vakul Garg 
Suggested-by: Boris Pismenny 
---
 net/strparser/strparser.c | 26 +++---
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c
index 373836615c57..4f40a90ca016 100644
--- a/net/strparser/strparser.c
+++ b/net/strparser/strparser.c
@@ -155,11 +155,13 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
/* We are going to append to the frags_list of head.
 * Need to unshare the frag_list.
 */
-   err = skb_unclone(head, GFP_ATOMIC);
-   if (err) {
-   STRP_STATS_INCR(strp->stats.mem_fail);
-   desc->error = err;
-   return 0;
+   if (skb_has_frag_list(head)) {
+   err = skb_unclone(head, GFP_ATOMIC);
+   if (err) {
+   STRP_STATS_INCR(strp->stats.mem_fail);
+   desc->error = err;
+   return 0;
+   }
}
 
if (unlikely(skb_shinfo(head)->frag_list)) {
@@ -216,14 +218,16 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
memset(stm, 0, sizeof(*stm));
stm->strp.offset = orig_offset + eaten;
} else {
-   /* Unclone since we may be appending to an skb that we
+   /* Unclone if we are appending to an skb that we
 * already share a frag_list with.
 */
-   err = skb_unclone(skb, GFP_ATOMIC);
-   if (err) {
-   STRP_STATS_INCR(strp->stats.mem_fail);
-   desc->error = err;
-   break;
+   if (skb_has_frag_list(skb)) {
+   err = skb_unclone(skb, GFP_ATOMIC);
+   if (err) {
+   STRP_STATS_INCR(strp->stats.mem_fail);
+   desc->error = err;
+   break;
+   }
}
 
stm = _strp_msg(head);
-- 
2.13.6



Re: [PATCH 1/2] net: handle NULL ->poll gracefully

2018-06-29 Thread Linus Torvalds
On Fri, Jun 29, 2018 at 6:46 AM Linus Torvalds
 wrote:
>
> Hmm. I'll have to think about it.

I took yours over the revert. It's just smaller and nicer, and the
conditional should predict fine for any case where it might matter.

Linus


Re: [PATCH 1/2] net: handle NULL ->poll gracefully

2018-06-29 Thread Linus Torvalds
On Fri, Jun 29, 2018 at 6:37 AM Christoph Hellwig  wrote:
>
> The big aio poll revert broke various network protocols that don't
> implement ->poll, as a patch in the aio poll series removed sock_no_poll
> and made the common code handle this case.

Hmm. I just did the revert of commit 984652dd8b1f ("net: remove
sock_no_poll") in my tree.

But I haven't pushed it out, and your patch is certainly smaller/nicer
than the revert.

That said, the revert does have the advantage of avoiding a
conditional in the network poll() function (at the cost of a more
expensive indirect call for when sock_no_poll actually triggers, but
only "odd" network protocols have that).

Hmm. I'll have to think about it.

   Linus


Re: [bpf PATCH v4 3/4] bpf: sockhash fix omitted bucket lock in sock_close

2018-06-29 Thread Daniel Borkmann
On 06/25/2018 05:34 PM, John Fastabend wrote:
[...]
> @@ -2302,9 +2347,12 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
>   goto bucket_err;
>   }
>  
> - e->hash_link = l_new;
> - e->htab = container_of(map, struct bpf_htab, map);
> + rcu_assign_pointer(e->hash_link, l_new);
> + rcu_assign_pointer(e->htab,
> +    container_of(map, struct bpf_htab, map));
> + write_lock_bh(&sock->sk_callback_lock);
>   list_add_tail(&e->list, &psock->maps);
> + write_unlock_bh(&sock->sk_callback_lock);
>  
>   /* add new element to the head of the list, so that
>    * concurrent search will find it before old elem
> @@ -2314,8 +2362,10 @@ static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
>   psock = smap_psock_sk(l_old->sk);
>  
>   hlist_del_rcu(&l_old->hash_node);
> + write_lock_bh(&l_old->sk->sk_callback_lock);
>   smap_list_hash_remove(psock, l_old);
>   smap_release_sock(psock, l_old->sk);
> + write_unlock_bh(&l_old->sk->sk_callback_lock);
>   free_htab_elem(htab, l_old);
>   }
>   raw_spin_unlock_bh(&b->lock);

Rather a question for clarification that came up while staring at this part
in sock_hash_ctx_update_elem():

In the 'err' label, wouldn't we also need the sk_callback_lock given we access
a potential psock, and modify its state (refcnt) would this be needed as well,
or are we good here since we're under RCU context anyway? Hmm, if latter is
indeed the case, then I'm wondering why we would need the above /sock/'s
sk_callback_lock for modifying list handling inside the psock and couldn't use
some locking that is only in scope of the actual struct smap_psock?

My other question is, if someone calls the update helper with BPF_NOEXIST which
we seem to support with bailing out via -EEXIST, and if there's currently a
psock attached already to the sock, then we drop this one off from the socket?
Is that correct / expected?

Thanks,
Daniel


Re: Link modes representation in phylib

2018-06-29 Thread Andrew Lunn
> Thanks for the suggestion. I see how this can be done with
> phydrv->supported and phydev->lp_advertising, however I'm not sure how
> we should deal with the fact that ethernet drivers directly access
> fields such as "advertising" or "supported".

Hi Maxime

I started cleaning up some of the MAC drivers. Take a look at

https://github.com/lunn/linux.git v4.18-rc1-net-next-phy-cleanup

That moved a lot of the MAC code into helpers, making it easier to
update. It might make sense to add a couple of more helpers, for what
remains.
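
As a taste of the helper style on that branch, MAC drivers would stop
poking phydev->supported directly and call something like this instead
(phy_remove_link_mode() is the shape used there; treat the exact helper
set as unstable, per the warnings in this thread):

#include <linux/phy.h>

/* MAC probe path: trim link modes the MAC cannot handle, instead of
 * open-coding phydev->supported &= ~SUPPORTED_1000baseT_Half etc.
 */
static void mac_limit_phy_modes(struct phy_device *phydev)
{
        phy_remove_link_mode(phydev, ETHTOOL_LINK_MODE_1000baseT_Half_BIT);
        phy_remove_link_mode(phydev, ETHTOOL_LINK_MODE_1000baseT_Full_BIT);
}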

Andrew


Re: [PATCH 6/6] fs: replace f_ops->get_poll_head with a static ->f_poll_head pointer

2018-06-29 Thread Linus Torvalds
On Fri, Jun 29, 2018 at 6:29 AM Christoph Hellwig  wrote:
> No need for poll_table_entry, we just need a wait_queue_head.
> poll_table_entry is an select.c internal (except for two nasty driver) -
> neither epoll nor most in-kernel callers use it.

Well, you need the poll_table for the "poll_wait()", and you would
need to be able to have multiple wait-queues even though you only have
one file you're waiting for.

But yes, it doesn't really need to be the full complex poll_table_entry model.

  Linus


[PATCH 2/2] aio: mark __aio_sigset::sigmask const

2018-06-29 Thread Christoph Hellwig
From: Avi Kivity 

io_pgetevents() will not change the signal mask. Mark it const
to make it clear and to reduce the need for casts in user code.

Reviewed-by: Christoph Hellwig 
Signed-off-by: Avi Kivity 
Signed-off-by: Al Viro 
[hch: reapply the patch that got incorrectly reverted]
Signed-off-by: Christoph Hellwig 
---
 include/uapi/linux/aio_abi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h
index d4e768d55d14..3c5038b587ba 100644
--- a/include/uapi/linux/aio_abi.h
+++ b/include/uapi/linux/aio_abi.h
@@ -111,7 +111,7 @@ struct iocb {
 #undef IFLITTLE
 
 struct __aio_sigset {
-   sigset_t __user *sigmask;
+   const sigset_t __user   *sigmask;
size_t  sigsetsize;
 };
 
-- 
2.18.0



[PATCH 1/2] net: handle NULL ->poll gracefully

2018-06-29 Thread Christoph Hellwig
The big aio poll revert broke various network protocols that don't
implement ->poll, as a patch in the aio poll series removed sock_no_poll
and made the common code handle this case.

Fixes: a11e1d43 ("Revert changes to convert to ->poll_mask() and aio 
IOCB_CMD_POLL")
Signed-off-by: Christoph Hellwig 
---
 net/socket.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/socket.c b/net/socket.c
index a564c6ed19d5..85633622c94d 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1133,6 +1133,8 @@ static __poll_t sock_poll(struct file *file, poll_table *wait)
__poll_t events = poll_requested_events(wait);
 
sock_poll_busy_loop(sock, events);
+   if (!sock->ops->poll)
+   return 0;
return sock->ops->poll(file, sock, wait) | sock_poll_busy_flag(sock);
 }
 
-- 
2.18.0



Fixes for the AIO poll revert

2018-06-29 Thread Christoph Hellwig
Two fixups for incorrectly reverted bits.




Re: [patch net-next v2 0/9] net: sched: introduce chain templates support with offloading to mlxsw

2018-06-29 Thread David Miller
From: David Ahern 
Date: Fri, 29 Jun 2018 06:54:36 -0600

> The resolution of the syntax affects the uapi changes proposed. You are
> wanting to add new RTM commands which suggests new objects. If a
> template is an attribute of an existing object then the netlink API
> should indicate that.

+1


Re: Diagnosing network module for missing link establishment (cxgb3, Chelsio T320)

2018-06-29 Thread Andrew Lunn
> Kernel: 3.16.0-4-amd64 (Debian v8 stock kernel)

This is very old. Please at least try the current version of Debian,
"Stretch".

Andrew


Re: [PATCH 6/6] fs: replace f_ops->get_poll_head with a static ->f_poll_head pointer

2018-06-29 Thread Christoph Hellwig
On Thu, Jun 28, 2018 at 10:30:27PM +0100, Al Viro wrote:
> > Because I think that what it can do is simply to do the ->poll() calls
> > outside the iocb locks, and then just attach the poll table to the
> > kioctx afterwards.
> 
> I'd do a bit more - embed the first poll_table_entry into poll iocb itself,
> so that the instances that use only one queue wouldn't need any allocations
> at all.

No need for poll_table_entry, we just need a wait_queue_head.
poll_table_entry is a select.c internal (except for two nasty drivers) -
neither epoll nor most in-kernel callers use it.


Re: [PATCH 6/6] fs: replace f_ops->get_poll_head with a static ->f_poll_head pointer

2018-06-29 Thread Christoph Hellwig
On Thu, Jun 28, 2018 at 02:11:17PM -0700, Linus Torvalds wrote:
> Christoph, do you have a test program for IOCB_CMD_POLL and what it's
> actually supposed to do?

https://pagure.io/libaio/c/9c6935e81854d1585bbfa48c35b185849d746864?branch=aio-poll

is the actual test in libaio.  In addition to that the seastar library
actually has a real life user.  But given that is c++ with all modern
bells and whistles you'll probably have an as hard time as me actually
understanding that.

> Because I think that what it can do is simply to do the ->poll() calls
> outside the iocb locks, and then just attach the poll table to the
> kioctx afterwards.

We could do that on the submit side fairly easily.  The problem
is really the completion side, where I'd much rather avoid introducing a
spurious context switch.  Right now even with a NULL qproc we can't
guarantee any of that.  So we'll need to schedule out to a workqueue,
and then from that schedule the potential multiple NULL qproc calls,
which might actually block elsewhere even if __pollwait is never called.

> This whole "poll must not block" is a complete red herring. It doesn't
> come from any other requirements than BAD AIO GARBAGE CODE.

It comes from wanting to avoid a totally pointless context switch.
The aio code itself works just fine when called from a workqueue; we
have exactly that case when file systems do non-trivial operations in
their end_io handler.


Re: Link modes representation in phylib

2018-06-29 Thread Maxime Chevallier
Hello Andrew,

On Tue, 19 Jun 2018 17:21:55 +0200
Andrew Lunn  wrote:

>> What I propose is that we add 3 link_mode fields in phy_device, and keep
>> the legacy fields for now. It would be up to the driver to fill the new
>> "supported" field in config_init, kind of like what's done in the
>> marvell10g driver.  
>
>Hi Maxime
>
>You can do this conversion in the core. If features == 0, and some
>bits are set in the features link_mode, do the conversion at probe
>time. The same can be done for lp_advertising, when the call into the
>drivers read_status() has completed.

Thanks for the suggestion. I see how this can be done with
phydrv->supported and phydev->lp_advertising, however I'm not sure how
we should deal with the fact that ethernet drivers directly access
fields such as "advertising" or "supported".
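
For what it's worth, the probe-time conversion could reuse the existing
ethtool helper; a minimal sketch (the destination is left as a comment
because its name depends on how the new phy_device fields end up being
defined):

#include <linux/ethtool.h>
#include <linux/phy.h>

static void phy_probe_convert_features(struct phy_device *phydev,
                                       u32 legacy_features)
{
        __ETHTOOL_DECLARE_LINK_MODE_MASK(modes);

        ethtool_convert_legacy_u32_to_link_mode(modes, legacy_features);
        /* ...then copy 'modes' into the new phy_device bitmap field,
         * and keep it in sync with the legacy 'supported'/'advertising'
         * words for drivers that still read those directly.
         */
}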

Should we update the new fields in phy_device when functions such as
"phy_start" or "phy_start_aneg" are called, just in case the ethernet
driver modified the phydev->supported / phydev->advertising fields
in the meantime ?

My concern is that phylink will rely on the new link_mode
representation to be up-to-date, and ethernet drivers that aren't
converted to phylink, but link to a new PHY will rely on the legacy
representation.

That might be just fine if we take good care making sure both the
legacy and the new representation are well sync'd, but since I don't
have a good big-picture view of all this, I prefer having your opinion
first :)

>> Would that be acceptable ?  
>
>It sounds reasonable. Lets see what the code looks like.

Ok I'll be glad to send a series for that, I just want to make sure I
go in the right direction

Thanks,

Maxime


Re: [PATCHv3 net-next] sctp: add support for SCTP_REUSE_PORT sockopt

2018-06-29 Thread David Miller
From: Xin Long 
Date: Thu, 28 Jun 2018 15:31:00 +0800

> This feature is actually already supported by sk->sk_reuse which can be
> set by socket level opt SO_REUSEADDR. But it's not working exactly as
> RFC6458 demands in section 8.1.27, like:
> 
>   - This option only supports one-to-one style SCTP sockets
>   - This socket option must not be used after calling bind()
> or sctp_bindx().
> 
> Besides, SCTP_REUSE_PORT sockopt should be provided for user's programs.
> Otherwise, the programs with SCTP_REUSE_PORT from other systems will not
> work in linux.
> 
> To separate it from the socket level version, this patch adds 'reuse' in
> sctp_sock and it works pretty much as sk->sk_reuse, but with some extra
> setup limitations that are needed when it is being enabled.
> 
> "It should be noted that the behavior of the socket-level socket option
> to reuse ports and/or addresses for SCTP sockets is unspecified", so it
> leaves SO_REUSEADDR as is for the compatibility.
> 
> Note that the name SCTP_REUSE_PORT is somewhat confusing, as its
> functionality is nearly identical to SO_REUSEADDR, but with some
> extra restrictions. Here it uses 'reuse' in sctp_sock instead of
> 'reuseport'. As for sk->sk_reuseport support for SCTP, it will be
> added in another patch.
> 
> Thanks to Neil for making this clear.
> 
> v1->v2:
>   - add sctp_sk->reuse to separate it from the socket level version.
> v2->v3:
>   - improve changelog according to Marcelo's suggestion.
> 
> Acked-by: Neil Horman 
> Signed-off-by: Xin Long 

Applied, thanks for working on getting the semantics right.
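
For readers wanting to try it, a minimal userspace sketch matching the
semantics above (one-to-one style socket, option set before bind();
assumes headers that already know SCTP_REUSE_PORT):

#include <netinet/in.h>
#include <netinet/sctp.h>       /* SCTP_REUSE_PORT, once headers catch up */
#include <sys/socket.h>

static int sctp_reuse_listener(void)
{
        int on = 1;
        int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_SCTP);

        if (fd < 0)
                return -1;
        /* Must be done before bind()/sctp_bindx(), per the rules above. */
        if (setsockopt(fd, IPPROTO_SCTP, SCTP_REUSE_PORT,
                       &on, sizeof(on)) < 0)
                return -1;
        return fd;
}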


[PATCH bpf] bpf: hash_map: decrement counter on error

2018-06-29 Thread Mauricio Vasquez B
Decrement the number of elements in the map in case the allocation
of a new node fails.
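
The shape of the fix, reduced to a standalone sketch: every failure
path taken after the element count has been bumped funnels through one
label that undoes the increment (names here are illustrative, not the
kernel's):

#include <stdatomic.h>
#include <stdlib.h>

struct counted_pool { atomic_int count; int max; };

static void *pool_alloc(struct counted_pool *p, size_t size)
{
        void *elem;

        if (atomic_fetch_add(&p->count, 1) >= p->max)
                goto dec_count;         /* over capacity: undo the inc */

        elem = malloc(size);
        if (!elem)
                goto dec_count;         /* allocation failed: undo too */

        return elem;

dec_count:
        atomic_fetch_sub(&p->count, 1);
        return NULL;
}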

Signed-off-by: Mauricio Vasquez B 
---
 kernel/bpf/hashtab.c | 16 +++-
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 3ca2198..513d9df 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -747,13 +747,15 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 * old element will be freed immediately.
 * Otherwise return an error
 */
-   atomic_dec(&htab->count);
-   return ERR_PTR(-E2BIG);
+   l_new = ERR_PTR(-E2BIG);
+   goto dec_count;
}
l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
 htab->map.numa_node);
-   if (!l_new)
-   return ERR_PTR(-ENOMEM);
+   if (!l_new) {
+   l_new = ERR_PTR(-ENOMEM);
+   goto dec_count;
+   }
}
 
memcpy(l_new->key, key, key_size);
@@ -766,7 +768,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
  GFP_ATOMIC | __GFP_NOWARN);
if (!pptr) {
kfree(l_new);
-   return ERR_PTR(-ENOMEM);
+   l_new = ERR_PTR(-ENOMEM);
+   goto dec_count;
}
}
 
@@ -780,6 +783,9 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 
l_new->hash = hash;
return l_new;
+dec_count:
+   atomic_dec(&htab->count);
+   return l_new;
 }
 
 static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
-- 
2.7.4


