Re: tip -ENOBOOT - bisected to locking/refcounts, x86/asm: Implement fast refcount overflow protection

2017-08-31 Thread Mike Galbraith
On Thu, 2017-08-31 at 11:45 -0700, Kees Cook wrote:
> On Thu, Aug 31, 2017 at 10:19 AM, Mike Galbraith  wrote:
> > On Thu, 2017-08-31 at 10:00 -0700, Kees Cook wrote:
> >>
> >> Oh! So it's gcc-version sensitive? That's alarming. Is this mapping 
> >> correct:
> >>
> >> 4.8.5: WARN, eventual kernel hang
> >> 6.3.1, 7.0.1: WARN, but continues working
> >
> > Yeah, that's correct.  I find that troubling, simply because this gcc
> > version has been through one hell of a lot of kernels with me.  Yeah, I
> > know, that doesn't exempt it from having bugs, but color me suspicious.
> 
> I still can't hit this with a 4.8.5 build. :(
> 
> With _RATELIMIT removed, this should, in theory, report whatever goes
> negative first...

I applied the other patch you posted, and built with gcc-6.3.1 to
remove the gcc-4.8.5 aspect.  The resulting splat is below.

[1.293962] NET: Registered protocol family 10
[1.294635] refcount_t silent saturation at in6_dev_get+0x25/0x104 in swapper/0[1], uid/euid: 0/0
[1.295616] [ cut here ]
[1.296120] WARNING: CPU: 0 PID: 1 at kernel/panic.c:612 refcount_error_report+0x94/0x9e
[1.296950] Modules linked in:
[1.297276] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.13.0.g152d54a-tip-default #53
[1.299179] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.0.0-prebuilt.qemu-project.org 04/01/2014
[1.300743] task: 88013ab84040 task.stack: c962c000
[1.301825] RIP: 0010:refcount_error_report+0x94/0x9e
[1.302804] RSP: 0018:c962fc10 EFLAGS: 00010282
[1.303791] RAX: 0055 RBX: 81a34274 RCX: 81c605e8
[1.304991] RDX: 0001 RSI: 0096 RDI: 0246
[1.306189] RBP: c962fd58 R08:  R09: 0175
[1.307392] R10:  R11: 0001 R12: 88013ab84040
[1.308583] R13:  R14: 0004 R15: 81a256c8
[1.309768] FS:  () GS:88013fc0() knlGS:
[1.311052] CS:  0010 DS:  ES:  CR0: 80050033
[1.312100] CR2: 7f4631fe8df0 CR3: 000137d09003 CR4: 001606f0
[1.313301] Call Trace:
[1.314012]  ex_handler_refcount+0x63/0x70
[1.314893]  fixup_exception+0x32/0x40
[1.315737]  do_trap+0x8c/0x170
[1.316519]  do_error_trap+0x70/0xd0
[1.317340]  ? in6_dev_get+0x23/0x104
[1.318172]  ? netlink_broadcast_filtered+0x2bd/0x430
[1.319156]  ? kmem_cache_alloc_trace+0xce/0x5d0
[1.320098]  ? set_debug_rodata+0x11/0x11
[1.320964]  invalid_op+0x1e/0x30
[1.322520] RIP: 0010:in6_dev_get+0x25/0x104
[1.323631] RSP: 0018:c962fe00 EFLAGS: 00010202
[1.324614] RAX: 880137de2400 RBX: 880137df4600 RCX: 880137de24f0
[1.325793] RDX: 88013a5e4000 RSI: fe00 RDI: 88013a5e4000
[1.326964] RBP: 00d1 R08:  R09: 880137de7600
[1.328150] R10:  R11: 8801398a4df8 R12: 
[1.329374] R13: 82137872 R14: 014200ca R15: 
[1.330547]  ? set_debug_rodata+0x11/0x11
[1.331392]  ip6_route_init_special_entries+0x2a/0x89
[1.332369]  addrconf_init+0x9e/0x203
[1.333173]  inet6_init+0x1af/0x365
[1.333956]  ? af_unix_init+0x4e/0x4e
[1.334753]  do_one_initcall+0x4e/0x190
[1.33]  ? set_debug_rodata+0x11/0x11
[1.336369]  kernel_init_freeable+0x189/0x20e
[1.337230]  ? rest_init+0xd0/0xd0
[1.337999]  kernel_init+0xa/0xf7
[1.338744]  ret_from_fork+0x25/0x30
[1.339500] Code: 48 8b 95 80 00 00 00 41 55 49 8d 8c 24 f0 0a 00 00 45 8b 84 24 10 09 00 00 41 89 c1 48 89 de 48 c7 c7 60 7a a3 81 e8 07 de 05 00 <0f> ff 58 5b 5d 41 5c 41 5d c3 0f 1f 44 00 00 55 48 89 e5 41 56
[1.342243] ---[ end trace b5d40c0fccce776c ]---

Back yours out, and...

# tracer: nop
#
#  _-=> irqs-off
# / _=> need-resched
#| / _---=> hardirq/softirq
#|| / _--=> preempt-depth
#||| / delay
#   TASK-PID   CPU#  TIMESTAMP  FUNCTION
#  | |   |      | |
   swapper/0-1 [000] ...1 1.974114: in6_dev_getx: refs.counter:-1073741824
   swapper/0-1 [000] ...1 1.974116: in6_dev_getx: refs.counter:-1073741824

---
 arch/x86/include/asm/refcount.h |    9 +++++++++
 include/net/addrconf.h          |   12 ++++++++++++
 net/ipv6/route.c                |    4 ++--
 3 files changed, 23 insertions(+), 2 deletions(-)

--- a/arch/x86/include/asm/refcount.h
+++ b/arch/x86/include/asm/refcount.h
@@ -55,6 +55,15 @@ static __always_inline void refcount_inc
: : "cc", "cx");
 }
 
+static __always_inline void refcount_inc_x(refcount_t *r)
+{
+   trace_printk("refs.counter:%d\n", r->refs.counter);
+   asm 

Re: netdev watchdog enp1s0 (tg3): transmit queue 0 timed out

2017-08-31 Thread Frans van Berckel
Dear Michael and Siva,

On Thu, 2017-08-31 at 23:36 -0700, Michael Chan wrote:
> On Thu, Aug 31, 2017 at 11:10 PM, Frans van Berckel
>  wrote:
> > 
> >  a long list of likely the same type of error codes.
> > 
> 
> Please post the entire register dump.

[  237.169194] tg3 :01:00.0 enp1s0: Link is up at 1000 Mbps, full
duplex
[  237.169335] tg3 :01:00.0 enp1s0: Flow control is on for TX and
on for RX
[  237.169375] IPv6: ADDRCONF(NETDEV_CHANGE): enp1s0: link becomes
ready
[  243.683910] tg3 :01:00.0 enp1s0: DMA Status error.  Resetting
chip.
[  243.759610] hrtimer: interrupt took 9464192 ns
[  245.317566] tg3 :01:00.0 enp1s0: 0x: 0x165914e4,
0x00100406, 0x0221, 0x0010
[  245.476295] tg3 :01:00.0 enp1s0: 0x0010: 0xefcf0004,
0x, 0x, 0x
[  245.476295] tg3 :01:00.0 enp1s0: 0x0020: 0x,
0x, 0x, 0x01eb1028
[  245.476295] tg3 :01:00.0 enp1s0: 0x0030: 0x,
0x0048, 0x, 0x0105
[  245.476295] tg3 :01:00.0 enp1s0: 0x0040: 0x,
0x, 0xc0025001, 0x64002000
[  245.476295] tg3 :01:00.0 enp1s0: 0x0050: 0x00fc5803,
0x80818283, 0x0087d005, 0xfee0300c
[  245.476295] tg3 :01:00.0 enp1s0: 0x0060: 0x,
0x4122, 0x42010298, 0x7618
[  245.476295] tg3 :01:00.0 enp1s0: 0x0070: 0x10f2,
0x00a0, 0x, 0x
[  245.476295] tg3 :01:00.0 enp1s0: 0x0080: 0x165914e4,
0x3c1d0002, 0x04130034, 0x3c085082
[  245.476295] tg3 :01:00.0 enp1s0: 0x0090: 0x01009509,
0x, 0x, 0x00ce
[  245.476295] tg3 :01:00.0 enp1s0: 0x00a0: 0x,
0x0006, 0x, 0x000c
[  245.476295] tg3 :01:00.0 enp1s0: 0x00b0: 0x,
0x0001, 0x, 0x0001
[  245.476295] tg3 :01:00.0 enp1s0: 0x00c0: 0x,
0x, 0x000e, 0x
[  245.476295] tg3 :01:00.0 enp1s0: 0x00d0: 0x00010010,
0x0fa0, 0x00122104, 0x00036c11
[  245.476295] tg3 :01:00.0 enp1s0: 0x00e0: 0x1011,
0x, 0x, 0x
[  245.476295] tg3 :01:00.0 enp1s0: 0x0100: 0x13c10001,
0x00014000, 0x0011c000, 0x000e3010
[  245.476295] tg3 :01:00.0 enp1s0: 0x0110: 0x,
0x11c1, 0x00a0, 0x
[  245.476295] tg3 :01:00.0 enp1s0: 0x0130: 0x,
0x, 0x, 0x16010002
[  245.476295] tg3 :01:00.0 enp1s0: 0x0150: 0x80ff,
0x, 0x, 0x
[  245.476295] tg3 :01:00.0 enp1s0: 0x0160: 0x16c10003,
0xfe3b2547, 0x001ec9ff, 0x00010004
[  245.476295] tg3 :01:00.0 enp1s0: 0x0170: 0x,
0x0007810e, 0x0001, 0x2c0c3c0e
[  245.476295] tg3 :01:00.0 enp1s0: 0x0180: 0x3f062304,
0x, 0x, 0x
[  245.476295] tg3 :01:00.0 enp1s0: 0x0200: 0x,
0x0001, 0x, 0x0001
[  245.476295] tg3 :01:00.0 enp1s0: 0x0210: 0x,
0x0001, 0x, 0x0001
[  245.476295] tg3 :01:00.0 enp1s0: 0x0220: 0x,
0x0001, 0x, 0x0001
[  245.476295] tg3 :01:00.0 enp1s0: 0x0230: 0x,
0x0001, 0x, 0x0001
[  245.476295] tg3 :01:00.0 enp1s0: 0x0240: 0x,
0x0001, 0x, 0x0001
[  245.476295] tg3 :01:00.0 enp1s0: 0x0250: 0x,
0x0001, 0x, 0x0001
[  245.476295] tg3 :01:00.0 enp1s0: 0x0260: 0x,
0x0001, 0x, 0x00ce
[  245.476295] tg3 :01:00.0 enp1s0: 0x0270: 0x,
0x0001, 0x, 0x0001
[  245.476295] tg3 :01:00.0 enp1s0: 0x0280: 0x,
0x0006, 0x, 0x0006
[  245.476295] tg3 :01:00.0 enp1s0: 0x0290: 0x,
0x0006, 0x, 0x0006
[  245.476295] tg3 :01:00.0 enp1s0: 0x02a0: 0x,
0x0006, 0x, 0x0006
[  245.476295] tg3 :01:00.0 enp1s0: 0x02b0: 0x,
0x0006, 0x, 0x0006
[  245.476295] tg3 :01:00.0 enp1s0: 0x02c0: 0x,
0x0006, 0x, 0x0006
[  245.476295] tg3 :01:00.0 enp1s0: 0x02d0: 0x,
0x0006, 0x, 0x0006
[  245.476295] tg3 :01:00.0 enp1s0: 0x02e0: 0x,
0x0006, 0x, 0x0006
[  245.476295] tg3 :01:00.0 enp1s0: 0x02f0: 0x,
0x0006, 0x, 0x0006
[  245.476295] tg3 :01:00.0 enp1s0: 0x0300: 0x,
0x000f, 0x, 0x000f
[  245.476295] tg3 :01:00.0 enp1s0: 0x0310: 0x,
0x000f, 0x, 0x000f
[  245.476295] tg3 :01:00.0 enp1s0: 0x0320: 0x,
0x000f, 0x, 0x000f
[  245.476295] tg3 :01:00.0 enp1s0: 0x0330: 0x,
0x000f, 0x, 0x000f
[  245.476295] tg3 :01:00.0 enp1s0: 0x0340: 0x,
0x000f, 0x, 0x000f
[  245.476295] tg3 :01:00.0 enp1s0: 0x0350: 0x,
0x000f, 0x, 0x000f
[  245.476295] tg3 :01:00.0 en

Re: netdev watchdog enp1s0 (tg3): transmit queue 0 timed out

2017-08-31 Thread Michael Chan
On Thu, Aug 31, 2017 at 11:10 PM, Frans van Berckel  wrote:
> Dear NetDev Team,
>
> > I am new to this machine. I bought a Dell PowerEdge SC1435 on a
> > marketplace website. Booting today's Fedora (or even Debian) amd64
> > Live CD from USB goes all fine.
>
> [0.00] Linux version 4.11.8-300.fc26.x86_64 (mockbuild@bkernel0
> 2.phx2.fedoraproject.org) (gcc version 7.1.1 20170622 (Red Hat 7.1.1-
> 3) (GCC) ) #1 SMP Thu Jun 29 20:09:48 UTC 2017
>
> > Until I plugged in the ethernet cable for the first time. I got
> > console output that frightens me a bit. It's about the driver for the
> > BCM95721. The kernel reports a DMA Status error, and next it calls a
> > watchdog for enp1s0 (tg3): transmit queue 0 timed out.
>
> [  237.169194] tg3 :01:00.0 enp1s0: Link is up at 1000 Mbps, full duplex
> [  237.169335] tg3 :01:00.0 enp1s0: Flow control is on for TX and on for RX
> [  237.169375] IPv6: ADDRCONF(NETDEV_CHANGE): enp1s0: link becomes ready
> [  243.683910] tg3 :01:00.0 enp1s0: DMA Status error.  Resetting chip.
> [  243.759610] hrtimer: interrupt took 9464192 ns
> [  245.317566] tg3 :01:00.0 enp1s0: 0x: 0x165914e4, 0x00100406, 0x0221, 0x0010
>
>  a long list of likely the same type of error codes.
>

Please post the entire register dump.


[PATCH net-next 3/3] bpf: Only set node->ref = 1 if it has not been set

2017-08-31 Thread Martin KaFai Lau
This patch writes 'node->ref = 1' only if node->ref is 0.
The number of lookups/s for a ~1M entries LRU map increased by
~30% (260097 to 343313).

The other writes of 'node->ref = 0' are not changed.  In those cases, the
same cache line has to be dirtied anyway.

First column: Size of the LRU hash
Second column: Number of lookups/s

Before:
> echo "$((2**20+1)): $(./map_perf_test 1024 1 $((2**20+1)) 1000 | awk 
> '{print $3}')"
1048577: 260097

After:
> echo "$((2**20+1)): $(./map_perf_test 1024 1 $((2**20+1)) 1000 | awk 
> '{print $3}')"
1048577: 343313

Signed-off-by: Martin KaFai Lau 
---
 kernel/bpf/bpf_lru_list.h | 3 ++-
 kernel/bpf/hashtab.c  | 7 ++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h
index 5c35a98d02bf..7d4f89b7cb84 100644
--- a/kernel/bpf/bpf_lru_list.h
+++ b/kernel/bpf/bpf_lru_list.h
@@ -69,7 +69,8 @@ static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node)
/* ref is an approximation on access frequency.  It does not
 * have to be very accurate.  Hence, no protection is used.
 */
-   node->ref = 1;
+   if (!node->ref)
+   node->ref = 1;
 }
 
 int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 682f4543fefa..431126f31ea3 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -519,9 +519,14 @@ static u32 htab_lru_map_gen_lookup(struct bpf_map *map,
 {
struct bpf_insn *insn = insn_buf;
const int ret = BPF_REG_0;
+   const int ref_reg = BPF_REG_1;
 
*insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem);
-   *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2);
+   *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4);
+   *insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret,
+ offsetof(struct htab_elem, lru_node) +
+ offsetof(struct bpf_lru_node, ref));
+   *insn++ = BPF_JMP_IMM(BPF_JNE, ref_reg, 0, 1);
*insn++ = BPF_ST_MEM(BPF_B, ret,
 offsetof(struct htab_elem, lru_node) +
 offsetof(struct bpf_lru_node, ref),
-- 
2.9.5
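
For readers not fluent in BPF instruction emission, the emitted sequence
above corresponds roughly to the following C sketch (the function name is
ours, not the kernel's; 'elem' is what __htab_map_lookup_elem leaves in R0):

/* Rough C rendering of the emitted BPF above (illustrative only). */
static void *inlined_lru_lookup_sketch(struct bpf_map *map, void *key)
{
	struct htab_elem *elem = __htab_map_lookup_elem(map, key);

	if (!elem)
		return NULL;
	if (!elem->lru_node.ref)	/* new in this patch: test first... */
		elem->lru_node.ref = 1;	/* ...so hot lookups skip the store */
	/* the value lives right after the 8-byte-aligned key */
	return (void *)elem + offsetof(struct htab_elem, key) +
	       round_up(map->key_size, 8);
}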



[PATCH net-next 0/3] bpf: Improve LRU map lookup performance

2017-08-31 Thread Martin KaFai Lau
This patchset improves the lookup performance of the LRU map.
Please see individual patch for details.

Martin KaFai Lau (3):
  bpf: Add lru_hash_lookup performance test
  bpf: Inline LRU map lookup
  bpf: Only set node->ref = 1 if it has not been set

 kernel/bpf/bpf_lru_list.h|  3 +-
 kernel/bpf/hashtab.c | 24 +
 samples/bpf/map_perf_test_kern.c | 44 +++
 samples/bpf/map_perf_test_user.c | 77 ++--
 4 files changed, 138 insertions(+), 10 deletions(-)

-- 
2.9.5



[PATCH net-next 2/3] bpf: Inline LRU map lookup

2017-08-31 Thread Martin KaFai Lau
Inline the lru map lookup to save the cost in making calls to
bpf_map_lookup_elem() and htab_lru_map_lookup_elem().

Different LRU hash sizes are tested.  The benefit diminishes when
cache misses start to dominate with the bigger LRU hashes.
Considering the change is simple, it is still worth optimizing.

First column: Size of the LRU hash
Second column: Number of lookups/s

Before:
> for i in $(seq 9 20); do echo "$((2**i+1)): $(./map_perf_test 1024 1 $((2**i+1)) 1000 | awk '{print $3}')"; done
513: 1132020
1025: 1056826
2049: 1007024
4097: 853298
8193: 742723
16385: 712600
32769: 688142
65537: 677028
131073: 619437
262145: 498770
524289: 316695
1048577: 260038

After:
> for i in $(seq 9 20); do echo "$((2**i+1)): $(./map_perf_test 1024 1 $((2**i+1)) 1000 | awk '{print $3}')"; done
513: 1221851
1025: 1144695
2049: 1049902
4097: 884460
8193: 773731
16385: 729673
32769: 721989
65537: 715530
131073: 671665
262145: 516987
524289: 321125
1048577: 260048

Signed-off-by: Martin KaFai Lau 
---
 kernel/bpf/hashtab.c | 19 +++
 1 file changed, 19 insertions(+)

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index d246905f2bb1..682f4543fefa 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -514,6 +514,24 @@ static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
return NULL;
 }
 
+static u32 htab_lru_map_gen_lookup(struct bpf_map *map,
+  struct bpf_insn *insn_buf)
+{
+   struct bpf_insn *insn = insn_buf;
+   const int ret = BPF_REG_0;
+
+   *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem);
+   *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2);
+   *insn++ = BPF_ST_MEM(BPF_B, ret,
+offsetof(struct htab_elem, lru_node) +
+offsetof(struct bpf_lru_node, ref),
+1);
+   *insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
+   offsetof(struct htab_elem, key) +
+   round_up(map->key_size, 8));
+   return insn - insn_buf;
+}
+
 /* It is called from the bpf_lru_list when the LRU needs to delete
  * older elements from the htab.
  */
@@ -1137,6 +1155,7 @@ const struct bpf_map_ops htab_lru_map_ops = {
.map_lookup_elem = htab_lru_map_lookup_elem,
.map_update_elem = htab_lru_map_update_elem,
.map_delete_elem = htab_lru_map_delete_elem,
+   .map_gen_lookup = htab_lru_map_gen_lookup,
 };
 
 /* Called from eBPF program */
-- 
2.9.5



[PATCH net-next 1/3] bpf: Add lru_hash_lookup performance test

2017-08-31 Thread Martin KaFai Lau
Create a new case to test the LRU lookup performance.

At the beginning, the LRU map is fully loaded (i.e. the number of keys
is equal to map->max_entries).  The lookups walk from key 0 to
num_map_entries and then repeat from 0 again.

This patch also creates an anonymous struct to properly
name the test params in stress_lru_hmap_alloc() in map_perf_test_kern.c.

Signed-off-by: Martin KaFai Lau 
---
 samples/bpf/map_perf_test_kern.c | 44 +++
 samples/bpf/map_perf_test_user.c | 77 ++--
 2 files changed, 112 insertions(+), 9 deletions(-)

diff --git a/samples/bpf/map_perf_test_kern.c b/samples/bpf/map_perf_test_kern.c
index ca3b22ed577a..098c857f1eda 100644
--- a/samples/bpf/map_perf_test_kern.c
+++ b/samples/bpf/map_perf_test_kern.c
@@ -88,6 +88,13 @@ struct bpf_map_def SEC("maps") array_map = {
.max_entries = MAX_ENTRIES,
 };
 
+struct bpf_map_def SEC("maps") lru_hash_lookup_map = {
+   .type = BPF_MAP_TYPE_LRU_HASH,
+   .key_size = sizeof(u32),
+   .value_size = sizeof(long),
+   .max_entries = MAX_ENTRIES,
+};
+
 SEC("kprobe/sys_getuid")
 int stress_hmap(struct pt_regs *ctx)
 {
@@ -148,12 +155,23 @@ int stress_percpu_hmap_alloc(struct pt_regs *ctx)
 SEC("kprobe/sys_connect")
 int stress_lru_hmap_alloc(struct pt_regs *ctx)
 {
+   char fmt[] = "Failed at stress_lru_hmap_alloc. ret:%d\n";
+   union {
+   u16 dst6[8];
+   struct {
+   u16 magic0;
+   u16 magic1;
+   u16 tcase;
+   u16 unused16;
+   u32 unused32;
+   u32 key;
+   };
+   } test_params;
struct sockaddr_in6 *in6;
-   u16 test_case, dst6[8];
+   u16 test_case;
int addrlen, ret;
-   char fmt[] = "Failed at stress_lru_hmap_alloc. ret:%d\n";
long val = 1;
-   u32 key = bpf_get_prandom_u32();
+   u32 key = 0;
 
in6 = (struct sockaddr_in6 *)PT_REGS_PARM2(ctx);
addrlen = (int)PT_REGS_PARM3(ctx);
@@ -161,14 +179,18 @@ int stress_lru_hmap_alloc(struct pt_regs *ctx)
if (addrlen != sizeof(*in6))
return 0;
 
-   ret = bpf_probe_read(dst6, sizeof(dst6), &in6->sin6_addr);
+   ret = bpf_probe_read(test_params.dst6, sizeof(test_params.dst6),
+&in6->sin6_addr);
if (ret)
goto done;
 
-   if (dst6[0] != 0xdead || dst6[1] != 0xbeef)
+   if (test_params.magic0 != 0xdead ||
+   test_params.magic1 != 0xbeef)
return 0;
 
-   test_case = dst6[7];
+   test_case = test_params.tcase;
+   if (test_case != 3)
+   key = bpf_get_prandom_u32();
 
if (test_case == 0) {
ret = bpf_map_update_elem(&lru_hash_map, &key, &val, BPF_ANY);
@@ -188,6 +210,16 @@ int stress_lru_hmap_alloc(struct pt_regs *ctx)
 
ret = bpf_map_update_elem(nolocal_lru_map, &key, &val,
  BPF_ANY);
+   } else if (test_case == 3) {
+   u32 i;
+
+   key = test_params.key;
+
+#pragma clang loop unroll(full)
+   for (i = 0; i < 32; i++) {
+   bpf_map_lookup_elem(&lru_hash_lookup_map, &key);
+   key++;
+   }
} else {
ret = -EINVAL;
}
diff --git a/samples/bpf/map_perf_test_user.c b/samples/bpf/map_perf_test_user.c
index bccbf8478e43..f388254896f6 100644
--- a/samples/bpf/map_perf_test_user.c
+++ b/samples/bpf/map_perf_test_user.c
@@ -46,6 +46,7 @@ enum test_type {
HASH_LOOKUP,
ARRAY_LOOKUP,
INNER_LRU_HASH_PREALLOC,
+   LRU_HASH_LOOKUP,
NR_TESTS,
 };
 
@@ -60,6 +61,7 @@ const char *test_map_names[NR_TESTS] = {
[HASH_LOOKUP] = "hash_map",
[ARRAY_LOOKUP] = "array_map",
[INNER_LRU_HASH_PREALLOC] = "inner_lru_hash_map",
+   [LRU_HASH_LOOKUP] = "lru_hash_lookup_map",
 };
 
 static int test_flags = ~0;
@@ -67,6 +69,8 @@ static uint32_t num_map_entries;
 static uint32_t inner_lru_hash_size;
 static int inner_lru_hash_idx = -1;
 static int array_of_lru_hashs_idx = -1;
+static int lru_hash_lookup_idx = -1;
+static int lru_hash_lookup_test_entries = 32;
 static uint32_t max_cnt = 100;
 
 static int check_test_flags(enum test_type t)
@@ -86,6 +90,32 @@ static void test_hash_prealloc(int cpu)
   cpu, max_cnt * 10ll / (time_get_ns() - start_time));
 }
 
+static int pre_test_lru_hash_lookup(int tasks)
+{
+   int fd = map_fd[lru_hash_lookup_idx];
+   uint32_t key;
+   long val = 1;
+   int ret;
+
+   if (num_map_entries > lru_hash_lookup_test_entries)
+   lru_hash_lookup_test_entries = num_map_entries;
+
+   /* Populate the lru_hash_map for LRU_HASH_LOOKUP perf test.
+*
+* It is fine that the user requests for a map with
+* num_map_entries < 3

netdev watchdog enp1s0 (tg3): transmit queue 0 timed out

2017-08-31 Thread Frans van Berckel
Dear NetDev Team,

I am new to this machine. I bought a Dell PowerEdge SC1435 on a
marketplace website. Booting today's Fedora (or even Debian) amd64
Live CD from USB goes all fine.

[0.00] Linux version 4.11.8-300.fc26.x86_64 (mockbuild@bkernel0
2.phx2.fedoraproject.org) (gcc version 7.1.1 20170622 (Red Hat 7.1.1-
3) (GCC) ) #1 SMP Thu Jun 29 20:09:48 UTC 2017

Until I plugged in the ethernet cable for the first time. I got
console output that frightens me a bit. It's about the driver for the
BCM95721. The kernel reports a DMA Status error, and next it calls a
watchdog for enp1s0 (tg3): transmit queue 0 timed out.

[  237.169194] tg3 :01:00.0 enp1s0: Link is up at 1000 Mbps, full duplex
[  237.169335] tg3 :01:00.0 enp1s0: Flow control is on for TX and on for RX
[  237.169375] IPv6: ADDRCONF(NETDEV_CHANGE): enp1s0: link becomes ready
[  243.683910] tg3 :01:00.0 enp1s0: DMA Status error.  Resetting chip.
[  243.759610] hrtimer: interrupt took 9464192 ns
[  245.317566] tg3 :01:00.0 enp1s0: 0x: 0x165914e4, 0x00100406, 0x0221, 0x0010

 a long list of likely the same type of error codes.

[  245.760055] tg3 :01:00.0 enp1s0: 0x7810: 0x,
0x0060, 0x000d, 0x
[  245.762065] tg3 :01:00.0 enp1s0: 0: Host status block
[0005:0016:(:0007:):(0007:000c)]
[  245.764128] tg3 :01:00.0 enp1s0: 0: NAPI info
[0015:0015:(000f:000c:01ff):0006:(00ce:::)]
[  245.867391] tg3 :01:00.0: tg3_stop_block timed out, ofs=2c00
enable_bit=2
[  245.971098] tg3 :01:00.0: tg3_stop_block timed out, ofs=4800
enable_bit=2
[  245.993343] tg3 :01:00.0 enp1s0: Link is down
[  249.731158] tg3 :01:00.0 enp1s0: Link is up at 1000 Mbps, full
duplex
[  249.731336] tg3 :01:00.0 enp1s0: Flow control is on for TX and
on for RX
[  254.944022] [ cut here ]
[  254.945010] WARNING: CPU: 1 PID: 0 at net/sched/sch_generic.c:316
dev_watchdog+0x221/0x230
[  254.945010] NETDEV WATCHDOG: enp1s0 (tg3): transmit queue 0 timed
out
[  254.945010] Modules linked in: xt_CHECKSUM ipt_MASQUERADE
nf_nat_masquerade_ipv4 tun fuse nf_conntrack_netbios_ns
nf_conntrack_broadcast xt_CT ip6t_rpfilter ip6t_REJECT nf_reject_ipv6
xt_conntrack ip_set nfnetlink ebtable_nat ebtable_broute bridge
ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6
ip6table_mangle ip6table_raw ip6table_security iptable_nat
nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack
libcrc32c iptable_mangle iptable_raw iptable_security ebtable_filter
ebtables ip6table_filter ip6_tables powernow_k8 amd64_edac_mod
edac_mce_amd edac_core kvm_amd kvm irqbypass amdkfd amd_iommu_v2 dcdbas
radeon k8temp ipmi_ssif i2c_algo_bit ttm drm_kms_helper drm ipmi_si
ipmi_devintf tpm_tis ipmi_msghandler tpm_tis_core tpm i2c_piix4 shpchp
nls_utf8 isofs squashfs ata_generic
[  254.945010]  pata_acpi uas usb_storage 8021q garp stp llc mrp
serio_raw tg3 sata_sil24 ptp pps_core pata_serverworks sunrpc
scsi_transport_iscsi loop
[  254.945010] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.11.8-
300.fc26.x86_64 #1
[  254.945010] Hardware name: Dell Inc. PowerEdge SC1435/0YR707, BIOS
2.2.5 03/21/2008
[  254.945010] Call Trace:
[  254.945010]  
[  254.945010]  dump_stack+0x63/0x84
[  254.945010]  __warn+0xcb/0xf0
[  254.945010]  warn_slowpath_fmt+0x5a/0x80
[  254.945010]  dev_watchdog+0x221/0x230
[  254.945010]  ? qdisc_rcu_free+0x50/0x50
[  254.945010]  call_timer_fn+0x33/0x130
[  254.945010]  run_timer_softirq+0x3ee/0x440
[  254.945010]  ? ktime_get+0x40/0xb0
[  254.945010]  ? lapic_next_event+0x1d/0x30
[  254.945010]  __do_softirq+0xea/0x2e3
[  254.945010]  irq_exit+0xfb/0x100
[  254.945010]  smp_apic_timer_interrupt+0x3d/0x50
[  254.945010]  apic_timer_interrupt+0x93/0xa0
[  254.945010] RIP: 0010:native_safe_halt+0x6/0x10
[  254.945010] RSP: 0018:babf4038be60 EFLAGS: 0246 ORIG_RAX:
ff10
[  254.945010] RAX: 687475412d40 RBX: 926d39d04880 RCX:

[  254.945010] RDX:  RSI:  RDI:

[  254.945010] RBP: babf4038be60 R08: 926d3d052ae0 R09:

[  254.945010] R10:  R11:  R12:
0001
[  254.945010] R13: 926d39d04880 R14:  R15:

[  254.945010]  
[  254.945010]  default_idle+0x20/0xe0
[  254.945010]  amd_e400_idle+0x3f/0x50
[  254.945010]  arch_cpu_idle+0xf/0x20
[  254.945010]  default_idle_call+0x23/0x30
[  254.945010]  do_idle+0x170/0x200
[  254.945010]  cpu_startup_entry+0x71/0x80
[  254.945010]  start_secondary+0x154/0x190
[  254.945010]  start_cpu+0x14/0x14
[  255.080137] ---[ end trace 25a535e6d8610c90 ]---
[  255.080137] tg3 :01:00.0 enp1s0: transmit timed out, resetting

And resetting is what it does again and again, until you take out the
UTP cable. This counts for the second ethernet port as well. Is there
something with the machine BIOS or a setting that I ca

Re: [PATCH 2/3] security: bpf: Add eBPF LSM hooks and security field to eBPF map

2017-08-31 Thread Jeffrey Vander Stoep
On Thu, Aug 31, 2017 at 7:05 PM, Alexei Starovoitov
 wrote:
> On Thu, Aug 31, 2017 at 01:56:34PM -0700, Chenbo Feng wrote:
>> From: Chenbo Feng 
>>
>> Introduce a pointer into struct bpf_map to hold the security information
>> about the map. The actual security struct varies based on the security
>> models implemented. Place the LSM hooks before each of the unrestricted
>> eBPF operations; the map_update_elem and map_delete_elem operations are
>> checked by security_map_modify. The map_lookup_elem and map_get_next_key
>> operations are checked by security_map_read.
>>
>> Signed-off-by: Chenbo Feng 
>
> ...
>
>> @@ -410,6 +418,10 @@ static int map_lookup_elem(union bpf_attr *attr)
>>   if (IS_ERR(map))
>>   return PTR_ERR(map);
>>
>> + err = security_map_read(map);
>> + if (err)
>> + return -EACCES;
>> +
>>   key = memdup_user(ukey, map->key_size);
>>   if (IS_ERR(key)) {
>>   err = PTR_ERR(key);
>> @@ -490,6 +502,10 @@ static int map_update_elem(union bpf_attr *attr)
>>   if (IS_ERR(map))
>>   return PTR_ERR(map);
>>
>> + err = security_map_modify(map);
>
> I don't feel these extra hooks are really thought through.
> With such hook you'll disallow map_update for given map. That's it.
> The key/values etc won't be used in such security decision.
> In such case you don't need such hooks in update/lookup at all.
> Only in map_creation and object_get calls where FD can be received.
> In other words I suggest following standard unix practices:
> Do permission checks in open() and allow read/write() if the FD is valid.
> Same here. Do permission checks in prog_load/map_create/obj_pin/get
> and that will be enough to jail the bpf subsystem.
> bpf cmds that need to be fast (like lookup and update) should not
> have security hooks.
>

I do think we want to distinguish between read/write (or read/modify)
for these objects. Essentially, we want to implement the example
described in patch 0/3 where eBPF objects can be passed to less
privileged processes which can read, but not modify the map. What
would be the best way to do this? Add a mode field to the bpf_map
struct?
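
For illustration only, here is a minimal sketch of the FD-time check Alexei
describes, with a hypothetical read-only bit on the map (the field, helper,
and semantics below are assumptions, not part of the posted patches):

/* Hypothetical: decide read vs. write access once, when the map FD is
 * handed out, so the per-lookup/update fast paths need no LSM hook. */
struct bpf_map_mode_sketch {
	bool read_only;		/* assumed mode bit, set at create/pin time */
};

static int map_get_fd_checked(struct bpf_map_mode_sketch *mode,
			      bool want_write)
{
	if (want_write && mode->read_only)
		return -EACCES;	/* refuse a writable FD up front */
	/* ...hand out the FD; later reads/writes are not re-checked... */
	return 0;
}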


RE: netdev carrier changes is one even after ethernet link up.

2017-08-31 Thread Bhadram Varka
Thanks for responding. Now responding inline

> -Original Message-
> From: Florian Fainelli [mailto:f.faine...@gmail.com]
> Sent: Friday, September 01, 2017 5:53 AM
> To: Bhadram Varka ; and...@lunn.ch
> Cc: linux-netdev 
> Subject: Re: netdev carrier changes is one even after ethernet link up.
> 
> On 08/30/2017 10:53 PM, Bhadram Varka wrote:
> > Hi,
> >
> >
> >
> > I have observed that carrier_changes is one even when the
> > ethernet link is up.
> >
> >
> >
> > After investigating the code, below is my observation –
> >
> >
> >
> > ethernet_driver_probe()
> >
> > +--->phy_connect()
> >
> > | +--->phy_attach_direct()
> >
> > |   +---> netif_carrier_off(): which increments
> > carrier_changes to one.
> >
> > +--->register_netdevice() : does the carrier_changes become zero here?
> >
> > +--->netif_carrier_off(): does not increment carrier_changes since
> > __LINK_STATE_NOCARRIER is already set.
> >
> >
> >
> > The ethernet driver's open will start the PHY and trigger the
> > phy_state_machine.
> >
> > The phy_state_machine workqueue calls netif_carrier_on() once the link
> > is UP.
> >
> > netif_carrier_on() increments carrier_changes by one.
> 
> If the call trace is correct, then there are at least two problems here:
> 
> - phy_connect() does start the PHY machine, which means that as soon as it
> detects a link state of any kind (up or down) it can call
> netif_carrier_off() or netif_carrier_on() respectively
> 
> - as soon as you call register_netdevice() notifiers run and other parts of 
> the
> kernel or user-space programs can see an inconsistent link state
> 
> I would suggest doing the following sequence instead:
> 
> netif_carrier_off()
> register_netdevice()
> phy_connect()
> 
> Which should result in a consistent link state and carrier value.
> 
Yes, it will address the issue.

If we do phy_connect() in ndo_open it makes carrier_changes two.
But if we do it in the probe function then it's not working.

In ethernet driver probe - (below sequence is not working)
phy_connect()
register_netdevice()
netif_carrier_off()

working sequence:
In probe():
register_netdevice()
ndo_open:
   phy_connect()

After reverting https://lkml.org/lkml/2016/1/9/173, this works if we do
phy_connect in probe as well.
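
For illustration, a minimal probe-order sketch along the lines Florian
suggests (the foo_* names, bus id, and PHY interface mode are hypothetical):

/* Hypothetical probe implementing the suggested ordering: carrier off
 * first, then register, then connect the PHY, so neither notifiers nor
 * user space ever observe an inconsistent link state. */
static void foo_adjust_link(struct net_device *ndev)
{
	/* react to PHY link changes */
}

static int foo_probe(struct net_device *ndev)
{
	int ret;

	netif_carrier_off(ndev);	/* 1: start with carrier off */

	ret = register_netdev(ndev);	/* 2: notifiers see a sane state */
	if (ret)
		return ret;

	/* 3: only now may the PHY state machine toggle the carrier */
	ndev->phydev = phy_connect(ndev, "foo-mdio:00", foo_adjust_link,
				   PHY_INTERFACE_MODE_RGMII);
	if (IS_ERR(ndev->phydev)) {
		unregister_netdev(ndev);
		return PTR_ERR(ndev->phydev);
	}
	return 0;
}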

Thanks,
Bhadram.
> >
> >
> >
> > After link is UP if we check the carrier_changes sysfs node - it will
> > be one only.
> >
> >
> >
> > $ cat /sys/class/net/eth0/carrier_changes
> >
> > 1
> >
> >
> >
> > After reverting the change - https://lkml.org/lkml/2016/1/9/173 (net:
> > phy: turn carrier off on phy attach) then I could see the carrier
> > changes incremented to 2 after Link UP.
> >
> > $ cat /sys/class/net/eth0/carrier_changes
> >
> > 2
> >
> >
> >
> > Thanks,
> >
> > Bhadram.
> >
> > --
> > -- This email message is for the sole use of the intended recipient(s)
> > and may contain confidential information.  Any unauthorized review,
> > use, disclosure or distribution is prohibited.  If you are not the
> > intended recipient, please contact the sender by reply email and
> > destroy all copies of the original message.
> > --
> > --
> 
> 
> --
> Florian


RE: netdev carrier changes is one even after ethernet link up.

2017-08-31 Thread Bhadram Varka
Thanks for responding.

-Original Message-
From: Florian Fainelli [mailto:f.faine...@gmail.com] 
Sent: Friday, September 01, 2017 5:53 AM
To: Bhadram Varka ; and...@lunn.ch
Cc: linux-netdev 
Subject: Re: netdev carrier changes is one even after ethernet link up.

On 08/30/2017 10:53 PM, Bhadram Varka wrote:
> Hi,
> 
>  
> 
> I have observed that carrier_changes is one even when the
> ethernet link is up.
> 
>  
> 
> After investigating the code, below is my observation –
> 
>  
> 
> ethernet_driver_probe()
> 
> +--->phy_connect()
> 
> | +--->phy_attach_direct()
> 
> |   +---> netif_carrier_off(): which increments
> carrier_changes to one.
> 
> +--->register_netdevice() : does the carrier_changes become zero here?
> 
> +--->netif_carrier_off(): does not increment carrier_changes since
> __LINK_STATE_NOCARRIER is already set.
> 
>  
> 
> The ethernet driver's open will start the PHY and trigger the
> phy_state_machine.
> 
> The phy_state_machine workqueue calls netif_carrier_on() once the link is UP.
> 
> netif_carrier_on() increments carrier_changes by one.

If the call trace is correct, then there are at least two problems here:

- phy_connect() does start the PHY machine, which means that as soon as it
detects a link state of any kind (up or down) it can call
netif_carrier_off() or netif_carrier_on() respectively

- as soon as you call register_netdevice() notifiers run and other parts of the 
kernel or user-space programs can see an inconsistent link state

I would suggest doing the following sequence instead:

netif_carrier_off()
register_netdevice()
phy_connect()

Which should result in a consistent link state and carrier value.

Yes, it will address the issue.

If we do phy_connect() in ndo_open it makes carrier_changes one.
But if we do it in the probe function then it's not working.

In ethernet driver probe - (below sequence is not working)
phy_connect()
register_netdevice()
netif_carrier_off()

working sequence:
In probe():
register_netdevice()
ndo_open:
   phy_connect()

Thanks,
Bhadram.
> 
>  
> 
> After link is UP if we check the carrier_changes sysfs node - it will 
> be one only.
> 
>  
> 
> $ cat /sys/class/net/eth0/carrier_changes
> 
> 1
> 
>  
> 
> After reverting the change - https://lkml.org/lkml/2016/1/9/173 (net:
> phy: turn carrier off on phy attach) then I could see the carrier 
> changes incremented to 2 after Link UP.
> 
> $ cat /sys/class/net/eth0/carrier_changes
> 
> 2
> 
>  
> 
> Thanks,
> 
> Bhadram.
> 
> --
> -- This email message is for the sole use of the intended recipient(s) 
> and may contain confidential information.  Any unauthorized review, 
> use, disclosure or distribution is prohibited.  If you are not the 
> intended recipient, please contact the sender by reply email and 
> destroy all copies of the original message.
> --
> --


--
Florian


Re: [PATCH 13/31] timer: Remove meaningless .data/.function assignments

2017-08-31 Thread Greg Kroah-Hartman
On Thu, Aug 31, 2017 at 04:29:25PM -0700, Kees Cook wrote:
> Several timer users needlessly reset their .function/.data fields during
> their timer callback, but nothing else changes them. Some users do not
> use their .data field at all. Each instance is removed here.
> 
> Cc: Krzysztof Halasa 
> Cc: Aditya Shankar 
> Cc: Ganesh Krishna 
> Cc: Greg Kroah-Hartman 
> Cc: Jens Axboe 
> Cc: netdev@vger.kernel.org
> Cc: linux-wirel...@vger.kernel.org
> Cc: de...@driverdev.osuosl.org
> Signed-off-by: Kees Cook 
> ---
>  drivers/block/amiflop.c   | 3 +--
>  drivers/net/wan/hdlc_cisco.c  | 2 --
>  drivers/net/wan/hdlc_fr.c | 2 --
>  drivers/staging/wilc1000/wilc_wfi_cfgoperations.c | 4 +---
>  4 files changed, 2 insertions(+), 9 deletions(-)

For the staging driver:

Acked-by: Greg Kroah-Hartman 


Re: [PATCH v3 net-next 0/7] bpf: Add option to set mark and priority in cgroup sock programs

2017-08-31 Thread David Miller
From: David Ahern 
Date: Thu, 31 Aug 2017 15:05:43 -0700

> Add option to set mark and priority in addition to bound device for newly
> created sockets. Also, allow the bpf programs to use the get_current_uid_gid
> helper meaning socket marks, priority and device can be set based on the
> uid/gid of the running process.
> 
> Sample programs are updated to demonstrate the new options.
> 
> v3
> - no changes to Patches 1 and 2 which Alexei acked in previous versions
> - dropped change related to recursive programs in a cgroup
> - updated tests per dropped patch
> 
> v2
> - added flag to control recursive behavior as requested by Alexei
> - added comment to sock_filter_func_proto regarding use of
>   get_current_uid_gid helper
> - updated test programs for recursive option

Series applied, please follow up to Daniel's feedback.

Thanks.


[PATCH] net: ethernet: ibm-emac: Add 5482 PHY init for OpenBlocks 600

2017-08-31 Thread Benjamin Herrenschmidt
The vendor patches initialize those registers to get the
PHY working properly.

Sadly I don't have that PHY spec, and whatever Broadcom PHY
code we already have doesn't seem to document these two shadow
registers (unless I miscalculated the address), so I'm keeping
this as "vendor magic for that board". The vendor has long
abandoned that product, but I find it handy to test ppc405
kernels and so would like to keep it alive upstream :-)

Signed-off-by: Benjamin Herrenschmidt 
---

Note: Ideally, the whole driver should switch over to the
generic PHY layer. However this is a much bigger undertaking
which requires access to a bunch of HW to test, and for which
I have neither the time nor the HW available these days.

(Some of the HW could prove hard to find ...)
---
 drivers/net/ethernet/ibm/emac/phy.c | 30 ++
 1 file changed, 30 insertions(+)

diff --git a/drivers/net/ethernet/ibm/emac/phy.c b/drivers/net/ethernet/ibm/emac/phy.c
index 35865d05fccd..daa10de542fb 100644
--- a/drivers/net/ethernet/ibm/emac/phy.c
+++ b/drivers/net/ethernet/ibm/emac/phy.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include <linux/of.h>
 
 #include "emac.h"
 #include "phy.h"
@@ -363,6 +364,34 @@ static struct mii_phy_def bcm5248_phy_def = {
.ops= &generic_phy_ops
 };
 
+static int bcm5482_init(struct mii_phy *phy)
+{
+   if (!of_machine_is_compatible("plathome,obs600"))
+   return 0;
+
+   /* Magic inits from vendor original patches */
+   phy_write(phy, 0x1c, 0xa410);
+   phy_write(phy, 0x1c, 0x8804);
+
+   return 0;
+}
+
+static const struct mii_phy_ops bcm5482_phy_ops = {
+   .init   = bcm5482_init,
+   .setup_aneg = genmii_setup_aneg,
+   .setup_forced   = genmii_setup_forced,
+   .poll_link  = genmii_poll_link,
+   .read_link  = genmii_read_link
+};
+
+static struct mii_phy_def bcm5482_phy_def = {
+
+   .phy_id = 0x0143bcb0,
.phy_id_mask= 0x0ffffff0,
+   .name   = "BCM5482 Gigabit Ethernet",
+   .ops= &bcm5482_phy_ops
+};
+
 static int m88e1111_init(struct mii_phy *phy)
 {
	pr_debug("%s: Marvell 88E1111 Ethernet\n", __func__);
@@ -499,6 +528,7 @@ static struct mii_phy_def *mii_phy_table[] = {
&et1011c_phy_def,
&cis8201_phy_def,
&bcm5248_phy_def,
+   &bcm5482_phy_def,
&m88e1111_phy_def,
&m88e1112_phy_def,
&ar8035_phy_def,
-- 
2.13.5



Re: [RFC net-next 0/8] net: dsa: Multi-queue awareness

2017-08-31 Thread Florian Fainelli


On 08/31/2017 05:05 PM, Andrew Lunn wrote:
> On Wed, Aug 30, 2017 at 05:18:44PM -0700, Florian Fainelli wrote:
>> This patch series is sent as reference, especially because the last patch
>> is trying not to create too many layer violations, but clearly a few
>> are being created here anyway.
>>
>> Essentially what I am trying to achieve is that you have a stacked device
>> which is multi-queue aware, that applications will be using, and for which
>> they can control the queue selection (using mq) the way they want. One such
>> stacked network device is created for each port of the switch (this is what DSA
>> does). When a skb is submitted from say net_device X, we can derive its port
>> number and look at the queue_mapping value to determine which port of the
>> switch and queue we should be sending this to. The information is embedded 
>> in a
>> tag (4 bytes) and is used by the switch to steer the transmission.
>>
>> These stacked devices will actually transmit using a "master" or conduit
>> network device which has a number of queues as well. In one version of the
>> hardware that I work with, we have up to 4 ports, each with 8 queues, and the
>> master device has a total of 32 hardware queues, so a 1:1 mapping is easy. 
>> With
>> another version of the hardware, same number of ports and queues, but only 16
>> hardware queues, so only a 2:1 mapping is possible.
>>
>> In order for congestion information to work properly, I need to establish a
>> mapping, preferably before transmission starts (but reconfiguration while
>> interfaces are running would be possible too) between these stacked device's
>> queue and the conduit interface's queue.
>>
>> Comments, flames, rotten tomatoes, anything!
> 
> Right, I think I understand.
> 
> This works just for traffic between the host and ports.  The host can
> set the egress queue. And I assume the queues are priorities, either
> absolute or weighted round robin, etc.
> 
> But this has no effect on traffic going from port to port. At some
> point, I expect you will want to offload TC for that.

You are absolutely right, this patch series aims at having the host be
able to steer traffic towards particular switch port egress queues which
are configured with specific priorities. At the moment it really is
mapping one priority value (in the 802.1p sense) to one queue number and
letting the switch scheduler figure things out.

With this patch set you can now use the multiq filter of tc and do
exactly what is documented under Documentation/networking/multiqueue.txt
and get the desired matches to be steered towards the queue you defined.

> 
> How will the two interact? Could the TC rules also act on traffic from
> the host to a port? Would it be simpler in the long run to just
> implement TC rules?

I suppose that you could somehow use TC to influence how the traffic
from host to CPU works, but without a "CPU" port representor the
question is how we get that done. If we used "eth0" we would need to
call back into the switch driver for programming...

Regarding the last patch in this series, what I would ideally to replace
it with is something along the lines of:

tc bind dev sw0p0 queue 0 dev eth0 queue 16

I am not sure if this is an action, or a filter, or something else...
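
For concreteness, here is a sketch of the steering described above; the
FOO_* tag layout is made up purely for illustration, while
skb_get_queue_mapping() is the real helper:

/* Hypothetical: fold the slave port number and the skb's queue_mapping
 * into the 4-byte tag so the switch steers the frame to that port's
 * egress queue. */
#define FOO_TAG_OPCODE		(1u << 29)	/* made-up opcode bits */
#define FOO_TAG_TC_SHIFT	10		/* made-up queue field */

static u32 foo_build_tx_tag(int port, const struct sk_buff *skb)
{
	u16 queue = skb_get_queue_mapping(skb);	/* selected via mq/tc */

	return FOO_TAG_OPCODE |
	       ((u32)(queue & 0x7) << FOO_TAG_TC_SHIFT) |  /* queues 0-7 */
	       BIT(port);				   /* destination port */
}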
-- 
Florian


Re: [RFC net-next 1/8] net: dsa: Allow switch drivers to indicate number of RX/TX queues

2017-08-31 Thread Florian Fainelli


On 08/31/2017 04:44 PM, Andrew Lunn wrote:
> On Wed, Aug 30, 2017 at 05:18:45PM -0700, Florian Fainelli wrote:
>> Let switch drivers indicate how many RX and TX queues they support. Some
>> switches, such as the Broadcom Starfighter 2, are designed with 8 egress
>> queues.
> 
> Marvell switches also have egress queues.
> 
> Does the SF2 have ingress queues? Marvell's don't, as far as I know.  So
> I wonder if num_rx_queues is useful?

At the moment probably not, since we are not doing anything useful other
than creating the network devices with the indicated number of queues.

> 
> Do switches in general have ingress queues? 

They do; at least the Starfighter 2 does, and from the Broadcom tag you
can get such information (BRCM_EG_TC_SHIFT) and you could presumably
record that queue on the SKB. I don't have a use case for that (yet?).
-- 
Florian


[RFC] tools: selftests: psock_tpacket: skip un-supported tpacket_v3 test

2017-08-31 Thread Orson Zhai
The TPACKET_V3 test of PACKET_TX_RING will fail with kernel versions
lower than v4.11. TX ring support was added by commit 7f953ab2ba46
("af_packet: TX_RING support for TPACKET_V3") on Jan 3, 2017.

So skip this test item instead of reporting a failure on old kernels.

Signed-off-by: Orson Zhai 
---
 tools/testing/selftests/net/psock_tpacket.c | 17 ++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/net/psock_tpacket.c b/tools/testing/selftests/net/psock_tpacket.c
index 7f6cd9fdacf3..f0cfc18c3726 100644
--- a/tools/testing/selftests/net/psock_tpacket.c
+++ b/tools/testing/selftests/net/psock_tpacket.c
@@ -57,6 +57,7 @@
 #include 
 #include 
 #include 
+#include <errno.h>
 
 #include "psock_lib.h"
 
@@ -676,7 +677,7 @@ static void __v3_fill(struct ring *ring, unsigned int blocks, int type)
ring->flen = ring->req3.tp_block_size;
 }
 
-static void setup_ring(int sock, struct ring *ring, int version, int type)
+static int setup_ring(int sock, struct ring *ring, int version, int type)
 {
int ret = 0;
unsigned int blocks = 256;
@@ -703,7 +704,11 @@ static void setup_ring(int sock, struct ring *ring, int version, int type)
 
if (ret == -1) {
perror("setsockopt");
-   exit(1);
+   if (errno == EINVAL) {
+   printf("[SKIP] This type seems un-supported in current 
kernel, skipped.\n");
+   return -1;
+   } else
+   exit(1);
}
 
ring->rd_len = ring->rd_num * sizeof(*ring->rd);
@@ -715,6 +720,7 @@ static void setup_ring(int sock, struct ring *ring, int version, int type)
 
total_packets = 0;
total_bytes = 0;
+   return 0;
 }
 
 static void mmap_ring(int sock, struct ring *ring)
@@ -830,7 +836,12 @@ static int test_tpacket(int version, int type)
 
sock = pfsocket(version);
memset(&ring, 0, sizeof(ring));
-   setup_ring(sock, &ring, version, type);
+   if (setup_ring(sock, &ring, version, type)) {
+   /* skip test when error of invalid argument */
+   close(sock);
+   return 0;
+   }
+
mmap_ring(sock, &ring);
bind_ring(sock, &ring);
walk_ring(sock, &ring);
-- 
2.12.2



Re: virtio_net: ethtool supported link modes

2017-08-31 Thread Jason Wang



On 2017-09-01 01:04, Radu Rendec wrote:

Hello,

Looking at the code in virtnet_set_link_ksettings, it seems the speed
and duplex can be set to any valid value. The driver will "remember"
them and report them back in virtnet_get_link_ksettings.

However, the supported link modes (link_modes.supported in struct
ethtool_link_ksettings) is always 0, indicating that no speed/duplex
setting is supported.

Does it make more sense to set (at least a few of) the supported link
modes, such as 10baseT_Half ... 1000baseT_Full?

I would expect to see consistency between what is reported in
link_modes.supported and what can actually be set. Could you please
share your opinion on this?


I think they may make sense only if there's a hardware implementation of
virtio. And we'd probably need to extend the virtio spec to add new commands.
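
For reference, a minimal sketch of what advertising modes could look like in
a driver's .get_link_ksettings, assuming the device actually honored them
(the foo_* names are hypothetical; the ethtool_link_ksettings_* helpers are
the regular kernel API):

static int foo_get_link_ksettings(struct net_device *dev,
				  struct ethtool_link_ksettings *cmd)
{
	/* advertise only what set_link_ksettings would really accept */
	ethtool_link_ksettings_zero_link_mode(cmd, supported);
	ethtool_link_ksettings_add_link_mode(cmd, supported, 10baseT_Half);
	ethtool_link_ksettings_add_link_mode(cmd, supported, 100baseT_Full);
	ethtool_link_ksettings_add_link_mode(cmd, supported, 1000baseT_Full);

	cmd->base.speed = SPEED_1000;	/* whatever was configured last */
	cmd->base.duplex = DUPLEX_FULL;
	return 0;
}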


Thanks



Thank you,
Radu Rendec





Re: [PATCH net-next] doc: document MSG_ZEROCOPY

2017-08-31 Thread Willem de Bruijn
On Thu, Aug 31, 2017 at 11:10 PM, Alexei Starovoitov
 wrote:
> On Thu, Aug 31, 2017 at 11:04:41PM -0400, Willem de Bruijn wrote:
>> On Thu, Aug 31, 2017 at 10:10 PM, Alexei Starovoitov
>>  wrote:
>> > On Thu, Aug 31, 2017 at 05:00:13PM -0400, Willem de Bruijn wrote:
>> >> From: Willem de Bruijn 
>> >>
>> >> Documentation for this feature was missing from the patchset.
>> >> Copied a lot from the netdev 2.1 paper, addressing some small
>> >> interface changes since then.
>> >>
>> >> Signed-off-by: Willem de Bruijn 
>> > ...
>> >> +Notification Batching
>> >> +~
>> >> +
>> >> +Multiple outstanding packets can be read at once using the recvmmsg
>> >> +call. This is often not needed. In each message the kernel returns not
>> >> +a single value, but a range. It coalesces consecutive notifications
>> >> +while one is outstanding for reception on the error queue.
>> >> +
>> >> +When a new notification is about to be queued, it checks whether the
>> >> +new value extends the range of the notification at the tail of the
>> >> +queue. If so, it drops the new notification packet and instead increases
>> >> +the range upper value of the outstanding notification.
>> >
> > Would it make sense to mention that the max notification range is 32-bit?
> > So for each 4 GB of transmitted bytes there will be a notification.
> > On modern 40Gbps NICs that's not a lot; it means there will be
> > at least one notification every second.
> > Or did I misread the code?
>>
>> You're right. The doc does mention that the counter and range
>> are 32-bit. I can state more explicitly that that bounds the working
>> set size to 4GB. Do you expect this to be problematic? Processing
>> a single notification per 4GB of data should not be a significant
>> cost in itself.
>
> I think 4GB is fine. Just there was an idea that in cases when
> notification of transmission can be known by other means

Some kind of unspoofable response from the peer (i.e., not just
a tcp ack), or a kernel mechanism independent from the error
queue? The first does not guarantee that a retransmit is
not in progress.

> the user space
> could have skipped reading the errqueue completely, but looks like it
> still needs to poll.

If a process has no need to see the notification, say because
it is sending out a buffer that is constant for the process lifetime,
then it could conceivably skip the recv, and poll with it. The code
as written will not coalesce more than 4GB of data, but that could
be revised.
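
For readers following along, here is a minimal user-space sketch of draining
one coalesced notification as discussed above (error handling trimmed; the
interface is the one described in the quoted documentation):

#include <errno.h>
#include <linux/errqueue.h>
#include <stdio.h>
#include <sys/socket.h>

/* Read one (possibly coalesced) MSG_ZEROCOPY completion; each
 * notification carries a [ee_info, ee_data] range of finished sends. */
static int read_zerocopy_notification(int fd)
{
	char control[128];
	struct msghdr msg = { .msg_control = control,
			      .msg_controllen = sizeof(control) };
	struct cmsghdr *cm;

	if (recvmsg(fd, &msg, MSG_ERRQUEUE) == -1)
		return errno == EAGAIN ? 0 : -1;

	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
		struct sock_extended_err *serr =
			(struct sock_extended_err *)CMSG_DATA(cm);

		if (serr->ee_errno != 0 ||
		    serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY)
			continue;
		printf("sends %u..%u completed\n",
		       serr->ee_info, serr->ee_data);
	}
	return 0;
}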


Re: [PATCH v2 net-next 1/8] bpf: Add support for recursively running cgroup sock filters

2017-08-31 Thread Alexei Starovoitov
On Thu, Aug 31, 2017 at 07:22:01AM -0700, Tejun Heo wrote:
> Hello, David, Alexei.
> 
> Sorry about late reply.
> 
> On Sun, Aug 27, 2017 at 08:49:23AM -0600, David Ahern wrote:
> > On 8/25/17 8:49 PM, Alexei Starovoitov wrote:
> > > 
> > >> +if (prog && curr_recursive && !new_recursive)
> > >> +/* if a parent has recursive prog attached, only
> > >> + * allow recursive programs in descendent cgroup
> > >> + */
> > >> +return -EINVAL;
> > >> +
> > >>  old_prog = cgrp->bpf.prog[type];
> > > 
> > > ... I'm struggling to completely understand how it interacts
> > > with BPF_F_ALLOW_OVERRIDE.
> > 
> > The 2 flags are completely independent. The existing override logic is
> > unchanged. If a program can not be overridden, then the new recursive
> > flag is irrelevant.
> 
> I'm not sure all four combos of the two flags make sense.  Can't we
> have something simpler like the following?
> 
> 1. None: No further bpf programs allowed in the subtree.
> 
> 2. Overridable: If a sub-cgroup installs the same bpf program, this
>    one yields to that one.
> 
> 3. Recursive: If a sub-cgroup installs the same bpf program, that
>    cgroup program gets run in addition to this one.
> 
> Note that we can have combinations of overridables and recursives -
> both allow further programs in the sub-hierarchy and the only
> distinction is whether that specific program behaves when that
> happens.

If I understand the proposal correctly, in case of:
A (with recurs) -> B (with override) -> C (with recurse) -> D (with override)
when something happens in D, you propose to run D,C,A?

With the order of execution from children to parent?
That's a bit different from what I was proposing with the 'multi-prog' flag,
but the more I think about it the more I like it.
In your case detach is sort of transparent to everything around.
And you would also allow saying 'None' for one of the subtrees too, right?
So something like:
A (with recurs) -> B (with override) -> C (with recurse) -> D (None) -> E
would mean that E cannot attach anything and events in E will
call D->C->A, right?
I will work on a patch for the above and see how it looks.
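
A loose sketch of the child-to-parent walk for the first example (an event
in D running D, then C, then A); the bpf.prog/bpf.recursive fields are
assumptions for illustration, and the 'None' case is not handled:

/* Hypothetical: run the event cgroup's program first, then walk up,
 * running recursive ancestors and skipping overridable ones that a
 * descendant has overridden. */
static int run_cgroup_progs_sketch(struct cgroup *cgrp,
				   struct sk_buff *skb, int type)
{
	struct cgroup *c;
	int ret = 0;

	for (c = cgrp; c && !ret; c = cgroup_parent(c)) {
		struct bpf_prog *prog = c->bpf.prog[type];

		if (!prog)
			continue;
		if (c == cgrp || c->bpf.recursive[type])
			ret = BPF_PROG_RUN(prog, skb);
		/* overridable ancestors are skipped on the walk */
	}
	return ret;
}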



Re: [PATCH net-next] virtio-net: invoke zerocopy callback on xmit path if no tx napi

2017-08-31 Thread Jason Wang



On 2017-08-31 22:30, Willem de Bruijn wrote:

Incomplete results at this stage, but I do see this correlation between
flows. It occurs even while not running out of zerocopy descriptors,
which I cannot yet explain.

Running two threads in a guest, each with a udp socket, each
sending up to 100 datagrams, or until EAGAIN, every msec.

Sender A sends 1B datagrams.
Sender B sends VHOST_GOODCOPY_LEN, which is enough
to trigger zcopy_used in vhost net.

A local receive process on the host receives both flows. To avoid
a deep copy when looping the packet onto the receive path,
changed skb_orphan_frags_rx to always return false (gross hack).

The flow with the larger packets is redirected through netem on ifb0:

   modprobe ifb
   ip link set dev ifb0 up
   tc qdisc add dev ifb0 root netem limit $LIMIT rate 1MBit

   tc qdisc add dev tap0 ingress
   tc filter add dev tap0 parent ffff: protocol ip \
   u32 match ip dport 8000 0xffff \
   action mirred egress redirect dev ifb0

For 10 second run, packet count with various ifb0 queue lengths $LIMIT:

no filter
   rx.A: ~840,000
   rx.B: ~840,000

limit 1
   rx.A: ~500,000
   rx.B: ~3100
   ifb0: 3273 sent, 371141 dropped

limit 100
   rx.A: ~9000
   rx.B: ~4200
   ifb0: 4630 sent, 1491 dropped

limit 1000
   rx.A: ~6800
   rx.B: ~4200
   ifb0: 4651 sent, 0 dropped

Sender B is always correctly rate limited to 1 MBps or less. With a
short queue, it ends up dropping a lot and sending even less.

When a queue builds up for sender B, sender A throughput is strongly
correlated with queue length. With queue length 1, it can send almost
at unthrottled speed. But even at limit 100 its throughput is on the
same order as sender B.

What is surprising to me is that this happens even though the number
of ubuf_info in use at limit 100 is around 100 at all times. In other words,
it does not exhaust the pool.

When forcing zcopy_used to be false for all packets, this effect of
sender A throughput being correlated with sender B does not happen.

no filter
   rx.A: ~850,000
   rx.B: ~850,000

limit 100
   rx.A: ~850,000
   rx.B: ~4200
   ifb0: 4518 sent, 876182 dropped

Also relevant is that with zerocopy, the sender processes back off
and report the same count as the receiver. Without zerocopy,
both senders send at full speed, even if only 4200 packets from flow
B arrive at the receiver.

This is with the default virtio_net driver, so without napi-tx.

It appears that the zerocopy notifications are pausing the guest.
Will look at that now.

It was indeed as simple as that. With 256 descriptors, queuing even
a hundred or so packets causes the guest to stall the device as soon
as the qdisc is installed.

Adding this check

+   in_use = nvq->upend_idx - nvq->done_idx;
+   if (nvq->upend_idx < nvq->done_idx)
+   in_use += UIO_MAXIOV;
+
+   if (in_use > (vq->num >> 2))
+   zcopy_used = false;

Has the desired behavior of reverting zerocopy requests to copying.

Without this change, the result is, as previously reported, throughput
dropping to hundreds of packets per second on both flows.

With the change, pps as observed for a few seconds at handle_tx is

zerocopy=165 copy=168435
zerocopy=0 copy=168500
zerocopy=65 copy=168535

Both flows continue to send at more or less normal rate, with only
sender B observing massive drops at the netem.

With the queue removed the rate reverts to

zerocopy=58878 copy=110239
zerocopy=58833 copy=110207

This is not a 50/50 split, which implies that some packets from the large
packet flow are still converted to copying. Without the change the rate
without the queue was 80k zerocopy vs 80k copy, so this choice of
(vq->num >> 2) appears too conservative.

However, testing with (vq->num >> 1) was not as effective at mitigating
stalls. I did not save that data, unfortunately. Can run more tests on fine
tuning this variable, if the idea sounds good.


Looks like there are still two cases left:

1) sndbuf is not INT_MAX
2) tx napi is used for virtio-net

1) could be a corner case, and for 2) what you suggest here may not
solve the issue since it still does in-order completion.


Thanks



Re: [PATCH net-next] doc: document MSG_ZEROCOPY

2017-08-31 Thread Alexei Starovoitov
On Thu, Aug 31, 2017 at 11:04:41PM -0400, Willem de Bruijn wrote:
> On Thu, Aug 31, 2017 at 10:10 PM, Alexei Starovoitov
>  wrote:
> > On Thu, Aug 31, 2017 at 05:00:13PM -0400, Willem de Bruijn wrote:
> >> From: Willem de Bruijn 
> >>
> >> Documentation for this feature was missing from the patchset.
> >> Copied a lot from the netdev 2.1 paper, addressing some small
> >> interface changes since then.
> >>
> >> Signed-off-by: Willem de Bruijn 
> > ...
> >> +Notification Batching
> >> +~~~~~~~~~~~~~~~~~~~~~
> >> +
> >> +Multiple outstanding packets can be read at once using the recvmmsg
> >> +call. This is often not needed. In each message the kernel returns not
> >> +a single value, but a range. It coalesces consecutive notifications
> >> +while one is outstanding for reception on the error queue.
> >> +
> >> +When a new notification is about to be queued, it checks whether the
> >> +new value extends the range of the notification at the tail of the
> >> +queue. If so, it drops the new notification packet and instead increases
> >> +the range upper value of the outstanding notification.
> >
> > Would it make sense to mention that the max notification range is 32-bit?
> > So for each 4 GB of transmitted bytes there will be a notification.
> > On modern 40Gbps NICs that's not a lot; it means there will be
> > at least one notification every second.
> > Or did I misread the code?
> 
> You're right. The doc does mention that the counter and range
> are 32-bit. I can state more explicitly that that bounds the working
> set size to 4GB. Do you expect this to be problematic? Processing
> a single notification per 4GB of data should not be a significant
> cost in itself.

I think 4GB is fine. There was just an idea that, in cases where
notification of transmission can be known by other means, the user space
could have skipped reading the errqueue completely, but it looks like it
still needs to poll. That's fine.

> > Thanks for the doc!
> 
> Thanks for reviewing :)
> 
> >
> > Acked-by: Alexei Starovoitov 
> >


Re: [PATCH net-next] virtio-net: invoke zerocopy callback on xmit path if no tx napi

2017-08-31 Thread Jason Wang



On 2017-08-30 11:11, Willem de Bruijn wrote:

On Tue, Aug 29, 2017 at 9:45 PM, Jason Wang  wrote:


On 2017-08-30 03:35, Willem de Bruijn wrote:

On Fri, Aug 25, 2017 at 9:03 PM, Willem de Bruijn
 wrote:

On Fri, Aug 25, 2017 at 7:32 PM, Michael S. Tsirkin 
wrote:

On Fri, Aug 25, 2017 at 06:44:36PM -0400, Willem de Bruijn wrote:


[...]


Incomplete results at this stage, but I do see this correlation between
flows. It occurs even while not running out of zerocopy descriptors,
which I cannot yet explain.

Running two threads in a guest, each with a udp socket, each
sending up to 100 datagrams, or until EAGAIN, every msec.

Sender A sends 1B datagrams.
Sender B sends VHOST_GOODCOPY_LEN, which is enough
to trigger zcopy_used in vhost net.

A local receive process on the host receives both flows. To avoid
a deep copy when looping the packet onto the receive path,
changed skb_orphan_frags_rx to always return false (gross hack).

The flow with the larger packets is redirected through netem on ifb0:

modprobe ifb
ip link set dev ifb0 up
tc qdisc add dev ifb0 root netem limit $LIMIT rate 1MBit

tc qdisc add dev tap0 ingress
tc filter add dev tap0 parent ffff: protocol ip \
u32 match ip dport 8000 0xffff \
action mirred egress redirect dev ifb0

For 10 second run, packet count with various ifb0 queue lengths $LIMIT:

no filter
rx.A: ~840,000
rx.B: ~840,000


Just to make sure I understand the case here: what does rx.B mean here? I
thought all traffic sent by Sender B had been redirected to ifb0?

It has been, but the packet still arrives at the destination socket.
IFB is a special virtual device that applies traffic shaping and
then reinjects the packet at the point where it was intercepted by mirred.

rx.B is indeed arrival rate at the receiver, similar to rx.A.



I see, then ifb looks pretty fit to the test.




Re: [PATCH net-next] doc: document MSG_ZEROCOPY

2017-08-31 Thread Willem de Bruijn
On Thu, Aug 31, 2017 at 10:10 PM, Alexei Starovoitov
 wrote:
> On Thu, Aug 31, 2017 at 05:00:13PM -0400, Willem de Bruijn wrote:
>> From: Willem de Bruijn 
>>
>> Documentation for this feature was missing from the patchset.
>> Copied a lot from the netdev 2.1 paper, addressing some small
>> interface changes since then.
>>
>> Signed-off-by: Willem de Bruijn 
> ...
>> +Notification Batching
>> +~
>> +
>> +Multiple outstanding packets can be read at once using the recvmmsg
>> +call. This is often not needed. In each message the kernel returns not
>> +a single value, but a range. It coalesces consecutive notifications
>> +while one is outstanding for reception on the error queue.
>> +
>> +When a new notification is about to be queued, it checks whether the
>> +new value extends the range of the notification at the tail of the
>> +queue. If so, it drops the new notification packet and instead increases
>> +the range upper value of the outstanding notification.
>
> Would it make sense to mention that max notification range is 32-bit?
> So each 4Gbyte of xmit bytes there will be a notification.
> In modern 40Gbps NICs it's not a lot. Means that there will be
> at least one notification every second.
> Or I misread the code?

You're right. The doc does mention that the counter and range
are 32-bit. I can state more explicitly that that bounds the working
set size to 4GB. Do you expect this to be problematic? Processing
a single notification per 4GB of data should not be a significant
cost in itself.
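
For completeness, this is roughly how user space consumes the coalesced
notifications described above. It is a condensed sketch only (error
handling trimmed, and it assumes a socket that already sent with
MSG_ZEROCOPY); ee_info/ee_data carry the inclusive range:

#include <stdio.h>
#include <sys/socket.h>
#include <linux/errqueue.h>

/* Read zerocopy completions from the error queue. Each notification
 * carries an inclusive range [ee_info, ee_data] of send calls whose
 * buffers the kernel no longer references.
 */
static int read_zerocopy_notifications(int fd)
{
	char control[100];
	struct msghdr msg = { 0 };
	struct cmsghdr *cm;
	struct sock_extended_err *serr;

	msg.msg_control = control;
	msg.msg_controllen = sizeof(control);

	if (recvmsg(fd, &msg, MSG_ERRQUEUE) == -1)
		return -1;	/* e.g. EAGAIN: nothing queued yet */

	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
		serr = (struct sock_extended_err *)CMSG_DATA(cm);
		if (serr->ee_errno != 0 ||
		    serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY)
			continue;
		printf("sends %u..%u completed\n",
		       serr->ee_info, serr->ee_data);
	}
	return 0;
}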

> Thanks for the doc!

Thanks for reviewing :)

>
> Acked-by: Alexei Starovoitov 
>


Re: [PATCH net-next v5 2/2] tcp_diag: report TCP MD5 signing keys and addresses

2017-08-31 Thread Eric Dumazet
On Thu, 2017-08-31 at 09:59 -0700, Ivan Delalande wrote:
> Report TCP MD5 (RFC2385) signing keys, addresses and address prefixes to
> processes with CAP_NET_ADMIN requesting INET_DIAG_INFO. Currently it is
> not possible to retrieve these from the kernel once they have been
> configured on sockets.
> 
> Signed-off-by: Ivan Delalande 
> ---
>  include/uapi/linux/inet_diag.h |   1 +
>  include/uapi/linux/tcp.h   |   9 
>  net/ipv4/tcp_diag.c| 109 ++---
>  3 files changed, 113 insertions(+), 6 deletions(-)

Acked-by: Eric Dumazet 



Re: [PATCH net-next v5 1/2] inet_diag: allow protocols to provide additional data

2017-08-31 Thread Eric Dumazet
On Thu, 2017-08-31 at 09:59 -0700, Ivan Delalande wrote:
> Extend inet_diag_handler to allow individual protocols to report
> additional data on INET_DIAG_INFO through idiag_get_aux. The size
> can be dynamic and is computed by idiag_get_aux_size.
> 
> Signed-off-by: Ivan Delalande 
> ---
>  include/linux/inet_diag.h |  7 +++
>  net/ipv4/inet_diag.c  | 22 ++
>  2 files changed, 25 insertions(+), 4 deletions(-)

Acked-by: Eric Dumazet 




Re: [PATCH] bnx2x: drop packets where gso_size is too big for hardware

2017-08-31 Thread Daniel Axtens
Eric Dumazet  writes:

> If you had this test in bnx2x_features_check(), packet could be
> segmented by core networking stack before reaching bnx2x_start_xmit() by
> clearing NETIF_F_GSO_MASK
>
> -> No drop would be involved.

Thanks for the pointer - networking code is all a bit new to me.

I'm just struggling at the moment to figure out the right way to
calculate the length. My original patch uses gso_size + hlen, but:

 - On reflection, while this solves the immediate bug, I'm not 100% sure
   this is the right thing to be calculating

 - If it is, then we have the problem that hlen is calculated in a bunch
   of weird and wonderful ways which make it a nightmare to extract.

Yuval (or anyone else who groks the driver properly) - what's the right
test to be doing here to make sure we don't write too much data to the
card?
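
To make the question concrete, the shape I have in mind is roughly the
following. BNX2X_MAX_GSO_LEN is a placeholder constant, and whether
gso_size plus header length is even the right quantity is exactly the
open question:

/* Sketch: clear the GSO feature bits when a resulting segment would
 * exceed what the hardware can transmit, so the core stack falls back
 * to software segmentation instead of the driver having to drop.
 */
static netdev_features_t bnx2x_features_check(struct sk_buff *skb,
					      struct net_device *dev,
					      netdev_features_t features)
{
	if (skb_is_gso(skb) &&
	    skb_shinfo(skb)->gso_size + skb_headlen(skb) > BNX2X_MAX_GSO_LEN)
		features &= ~NETIF_F_GSO_MASK;

	return features;
}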

Regards,
Daniel




RE: [RFC PATCH] net: frag limit checks need to use percpu_counter_compare

2017-08-31 Thread liujian (CE)



Best Regards,
liujian


> -Original Message-
> From: Michal Kubecek [mailto:mkube...@suse.cz]
> Sent: Friday, September 01, 2017 12:24 AM
> To: Jesper Dangaard Brouer
> Cc: liujian (CE); netdev@vger.kernel.org; Florian Westphal
> Subject: Re: [RFC PATCH] net: frag limit checks need to use
> percpu_counter_compare
> 
> On Thu, Aug 31, 2017 at 12:20:19PM +0200, Jesper Dangaard Brouer wrote:
> > To: Liujian can you please test this patch?
> >  I want to understand if using __percpu_counter_compare() solves  the
> > problem correctness wise (even-though this will be slower  than using
> > a simple atomic_t on your big system).

I have tested the patch; it works.
1. make sure frag_mem_limit reach to thresh
  ===>FRAG: inuse 0 memory 0 frag_mem_limit 5386864
2. change NIC rx irq's affinity to a fixed CPU
3. iperf -u -c 9.83.1.41 -l 1 -i 1 -t 1000 -P 10 -b 20M
  And check /proc/net/snmp, there are no ReasmFails.

And I think adding some counter sync points, as you said, would be a better way.

> > Fix a bug in the fragmentation code's use of the percpu_counter API that
> > causes issues on systems with many CPUs.
> >
> > The frag_mem_limit() just reads the global counter (fbc->count),
> > without considering that other CPUs can each hold up to the batch
> > size (130K) that hasn't been subtracted yet.  Due to the 3MBytes lower
> > thresh limit, this becomes dangerous at >=24 CPUs (3*1024*1024/130000=24).
> >
> > The __percpu_counter_compare() does the right thing, and takes into
> > account the number of (online) CPUs and batch size, to account for
> > this and call __percpu_counter_sum() when needed.
> >
> > On systems with many CPUs this will unfortunately always result in the
> > heavier fully locked __percpu_counter_sum() which touch the
> > per_cpu_ptr of all (online) CPUs.
> >
> > On systems with a smaller number of CPUs this solution is also not
> > optimal, because __percpu_counter_compare()/__percpu_counter_sum()
> > doesn't help synchronize the global counter.
> >  Florian Westphal have an idea of adding some counter sync points,
> > which should help address this issue.
> > ---
> >  include/net/inet_frag.h  |   16 ++--
> >  net/ipv4/inet_fragment.c |6 +++---
> >  2 files changed, 17 insertions(+), 5 deletions(-)
> >
> > diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index
> > 6fdcd2427776..b586e320783d 100644
> > --- a/include/net/inet_frag.h
> > +++ b/include/net/inet_frag.h
> > @@ -147,9 +147,21 @@ static inline bool inet_frag_evicting(struct
> inet_frag_queue *q)
> >   */
> >  static unsigned int frag_percpu_counter_batch = 130000;
> >
> > -static inline int frag_mem_limit(struct netns_frags *nf)
> > +static inline bool frag_mem_over_limit(struct netns_frags *nf, int thresh)
> >  {
> > -   return percpu_counter_read(&nf->mem);
> > +   /* When reading counter here, __percpu_counter_compare() call
> > +* will invoke __percpu_counter_sum() when needed, which
> > +* depends on num_online_cpus()*batch size, as each CPU can
> > +* potentially hold a batch count.
> > +*
> > +* With many CPUs this heavier sum operation will
> > +* unfortunately always occur.
> > +*/
> > +   if (__percpu_counter_compare(&nf->mem, thresh,
> > +frag_percpu_counter_batch) > 0)
> > +   return true;
> > +   else
> > +   return false;
> >  }
> >
> >  static inline void sub_frag_mem_limit(struct netns_frags *nf, int i)
> > diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index
> > 96e95e83cc61..ee2cf56900e6 100644
> > --- a/net/ipv4/inet_fragment.c
> > +++ b/net/ipv4/inet_fragment.c
> > @@ -120,7 +120,7 @@ static void inet_frag_secret_rebuild(struct
> > inet_frags *f)  static bool inet_fragq_should_evict(const struct
> > inet_frag_queue *q)  {
> > return q->net->low_thresh == 0 ||
> > -  frag_mem_limit(q->net) >= q->net->low_thresh;
> > +   frag_mem_over_limit(q->net, q->net->low_thresh);
> >  }
> >
> >  static unsigned int
> > @@ -355,7 +355,7 @@ static struct inet_frag_queue
> > *inet_frag_alloc(struct netns_frags *nf,  {
> > struct inet_frag_queue *q;
> >
> > -   if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) {
> > +   if (!nf->high_thresh || frag_mem_over_limit(nf, nf->high_thresh)) {
> > inet_frag_schedule_worker(f);
> > return NULL;
> > }
> 
> If we go this way (which would IMHO require some benchmarks to make sure it
> doesn't harm performance too much) we can drop the explicit checks for zero
> thresholds which were added to work around the unreliability of fast checks of
> percpu counters (or at least the second one was by commit
> 30759219f562 ("net: disable fragment reassembly if high_thresh is zero").
> 
> Michal Kubecek
> 
> > @@ -396,7 +396,7 @@ struct inet_frag_queue *inet_frag_find(struct
> netns_frags *nf,
> > struct inet_frag_queue *q;
> > int depth = 0;
> >
> > -   if (frag_mem_limit(nf) > nf->low_thresh)
> > +  

Re: [PATCH net-next] doc: document MSG_ZEROCOPY

2017-08-31 Thread Alexei Starovoitov
On Thu, Aug 31, 2017 at 05:00:13PM -0400, Willem de Bruijn wrote:
> From: Willem de Bruijn 
> 
> Documentation for this feature was missing from the patchset.
> Copied a lot from the netdev 2.1 paper, addressing some small
> interface changes since then.
> 
> Signed-off-by: Willem de Bruijn 
...
> +Notification Batching
> +~~~~~~~~~~~~~~~~~~~~~
> +
> +Multiple outstanding packets can be read at once using the recvmmsg
> +call. This is often not needed. In each message the kernel returns not
> +a single value, but a range. It coalesces consecutive notifications
> +while one is outstanding for reception on the error queue.
> +
> +When a new notification is about to be queued, it checks whether the
> +new value extends the range of the notification at the tail of the
> +queue. If so, it drops the new notification packet and instead increases
> +the range upper value of the outstanding notification.

Would it make sense to mention that max notification range is 32-bit?
So each 4Gbyte of xmit bytes there will be a notification.
In modern 40Gbps NICs it's not a lot. Means that there will be
at least one notification every second.
Or I misread the code?

Thanks for the doc!

Acked-by: Alexei Starovoitov 



Re: [PATCH 2/3] security: bpf: Add eBPF LSM hooks and security field to eBPF map

2017-08-31 Thread Alexei Starovoitov
On Thu, Aug 31, 2017 at 01:56:34PM -0700, Chenbo Feng wrote:
> From: Chenbo Feng 
> 
> Introduce a pointer into struct bpf_map to hold the security information
> about the map. The actual security struct varies based on the security
> models implemented. Place the LSM hooks before each of the unrestricted
> eBPF operations, the map_update_elem and map_delete_elem operations are
> checked by security_map_modify. The map_lookup_elem and map_get_next_key
> operations are checked by security_map_read.
> 
> Signed-off-by: Chenbo Feng 

...

> @@ -410,6 +418,10 @@ static int map_lookup_elem(union bpf_attr *attr)
>   if (IS_ERR(map))
>   return PTR_ERR(map);
>  
> + err = security_map_read(map);
> + if (err)
> + return -EACCES;
> +
>   key = memdup_user(ukey, map->key_size);
>   if (IS_ERR(key)) {
>   err = PTR_ERR(key);
> @@ -490,6 +502,10 @@ static int map_update_elem(union bpf_attr *attr)
>   if (IS_ERR(map))
>   return PTR_ERR(map);
>  
> + err = security_map_modify(map);

I don't feel these extra hooks are really thought through.
With such a hook you'll disallow map_update for a given map. That's it.
The keys/values etc. won't be used in such a security decision.
In that case you don't need such hooks in update/lookup at all.
Only in map_creation and object_get calls where an FD can be received.
In other words I suggest following standard unix practices:
do permission checks in open() and allow read/write() if the FD is valid.
Same here. Do permission checks in prog_load/map_create/obj_pin/get
and that will be enough to jail the bpf subsystem.
bpf cmds that need to be fast (like lookup and update) should not
have security hooks.



Re: [PATCH v3 net-next 6/7] samples/bpf: Update cgrp2 socket tests

2017-08-31 Thread Alexei Starovoitov
On Thu, Aug 31, 2017 at 03:05:49PM -0700, David Ahern wrote:
> Update cgrp2 bpf sock tests to check that device, mark and priority
> can all be set on a socket via bpf programs attached to a cgroup.
> 
> Signed-off-by: David Ahern 

Acked-by: Alexei Starovoitov 



Re: [PATCH v3 net-next 7/7] samples/bpf: Update cgroup socket examples to use uid gid helper

2017-08-31 Thread Alexei Starovoitov
On Thu, Aug 31, 2017 at 03:05:50PM -0700, David Ahern wrote:
> Signed-off-by: David Ahern 

Acked-by: Alexei Starovoitov 



Re: [PATCH v3 net-next 5/7] samples/bpf: Add option to dump socket settings

2017-08-31 Thread Alexei Starovoitov
On Thu, Aug 31, 2017 at 03:05:48PM -0700, David Ahern wrote:
> Add option to dump socket settings. Will be used in the next patch
> to verify bpf programs are correctly setting mark, priority and
> device based on the cgroup attachment for the program run.
> 
> Signed-off-by: David Ahern 

Acked-by: Alexei Starovoitov 



Re: [PATCH v3 net-next 4/7] samples/bpf: Add detach option to test_cgrp2_sock

2017-08-31 Thread Alexei Starovoitov
On Thu, Aug 31, 2017 at 03:05:47PM -0700, David Ahern wrote:
> Add option to detach programs from a cgroup.
> 
> Signed-off-by: David Ahern 

Acked-by: Alexei Starovoitov 


Re: [PATCH v3 net-next 3/7] samples/bpf: Update sock test to allow setting mark and priority

2017-08-31 Thread Alexei Starovoitov
On Thu, Aug 31, 2017 at 03:05:46PM -0700, David Ahern wrote:
> Update sock test to set mark and priority on socket create.
> 
> Signed-off-by: David Ahern 

Acked-by: Alexei Starovoitov 



[RFC iproute2 1/2] update headers with CBS API [RFC]

2017-08-31 Thread Vinicius Costa Gomes
Signed-off-by: Vinicius Costa Gomes 
---
 include/linux/pkt_sched.h | 29 +
 1 file changed, 29 insertions(+)

diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index 099bf552..ba6c9a54 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -871,4 +871,33 @@ struct tc_pie_xstats {
__u32 maxq; /* maximum queue size */
__u32 ecn_mark; /* packets marked with ecn*/
 };
+
+/* CBS */
+/* FIXME: this is only for usage with ndo_setup_tc(), this should be
+ * in another header someplace else. Is pkt_cls.h the right place?
+ */
+struct tc_cbs_qopt_offload {
+   __u8    enable;
+   __s32   queue;
+   __s32   hicredit;
+   __s32   locredit;
+   __s32   idleslope;
+   __s32   sendslope;
+};
+
+struct tc_cbs_qopt {
+   __s32   hicredit;
+   __s32   locredit;
+   __s32   idleslope;
+   __s32   sendslope;
+};
+
+enum {
+   TCA_CBS_UNSPEC,
+   TCA_CBS_PARMS,
+   __TCA_CBS_MAX,
+};
+
+#define TCA_CBS_MAX (__TCA_CBS_MAX - 1)
+
 #endif
-- 
2.14.1



[RFC iproute2 2/2] tc: Add support for the CBS qdisc

2017-08-31 Thread Vinicius Costa Gomes
The Credit Based Shaper (CBS) queueing discipline allows bandwidth
reservation with sub-millisecond precision. It is defined by the
802.1Q-2014 specification (section 8.6.8.2 and Annex L).

The syntax is:

tc qdisc add dev DEV parent NODE cbs locredit BYTES hicredit BYTES \
 sendslope BPS idleslope BPS

Signed-off-by: Vinicius Costa Gomes 
---
 tc/Makefile |   1 +
 tc/q_cbs.c  | 134 
 2 files changed, 135 insertions(+)
 create mode 100644 tc/q_cbs.c

diff --git a/tc/Makefile b/tc/Makefile
index a9b4b8e6..f0091217 100644
--- a/tc/Makefile
+++ b/tc/Makefile
@@ -73,6 +73,7 @@ TCMODULES += q_hhf.o
 TCMODULES += q_clsact.o
 TCMODULES += e_bpf.o
 TCMODULES += f_matchall.o
+TCMODULES += q_cbs.o
 
 TCSO :=
 ifeq ($(TC_CONFIG_ATM),y)
diff --git a/tc/q_cbs.c b/tc/q_cbs.c
new file mode 100644
index ..0120e838
--- /dev/null
+++ b/tc/q_cbs.c
@@ -0,0 +1,134 @@
+/*
+ * q_cbs.c Credit Based Shaper (CBS).
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors:Alexey Kuznetsov, 
+ *
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "utils.h"
+#include "tc_util.h"
+
+static void explain(void)
+{
+   fprintf(stderr, "Usage: ... tbf hicredit BYTES locredit BYTES sendslope 
BPS idleslope BPS\n");
+}
+
+static void explain1(const char *arg, const char *val)
+{
+   fprintf(stderr, "cbs: illegal value for \"%s\": \"%s\"\n", arg, val);
+}
+
+static int cbs_parse_opt(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n)
+{
+   int ok = 0;
+   struct tc_cbs_qopt opt = {};
+   struct rtattr *tail;
+
+   while (argc > 0) {
+   if (matches(*argv, "hicredit") == 0) {
+   NEXT_ARG();
+   if (opt.hicredit) {
+   fprintf(stderr, "cbs: duplicate \"hicredit\" 
specification\n");
+   return -1;
+   }
+   if (get_s32(&opt.hicredit, *argv, 0)) {
+   explain1("hicredit", *argv);
+   return -1;
+   }
+   ok++;
+   } else if (matches(*argv, "locredit") == 0) {
+   NEXT_ARG();
+   if (opt.locredit) {
+   fprintf(stderr, "cbs: duplicate \"locredit\" 
specification\n");
+   return -1;
+   }
+   if (get_s32(&opt.locredit, *argv, 0)) {
+   explain1("locredit", *argv);
+   return -1;
+   }
+   ok++;
+   } else if (matches(*argv, "sendslope") == 0) {
+   NEXT_ARG();
+   if (opt.sendslope) {
+   fprintf(stderr, "cbs: duplicate \"sendslope\" 
specification\n");
+   return -1;
+   }
+   if (get_s32(&opt.sendslope, *argv, 0)) {
+   explain1("sendslope", *argv);
+   return -1;
+   }
+   ok++;
+   } else if (matches(*argv, "idleslope") == 0) {
+   NEXT_ARG();
+   if (opt.idleslope) {
+   fprintf(stderr, "cbs: duplicate \"idleslope\" 
specification\n");
+   return -1;
+   }
+   if (get_s32(&opt.idleslope, *argv, 0)) {
+   explain1("idleslope", *argv);
+   return -1;
+   }
+   ok++;
+   } else if (strcmp(*argv, "help") == 0) {
+   explain();
+   return -1;
+   } else {
+   fprintf(stderr, "cbs: unknown parameter \"%s\"\n", 
*argv);
+   explain();
+   return -1;
+   }
+   argc--; argv++;
+   }
+
+   tail = NLMSG_TAIL(n);
+   addattr_l(n, 1024, TCA_OPTIONS, NULL, 0);
+   addattr_l(n, 2024, TCA_CBS_PARMS, &opt, sizeof(opt));
+   tail->rta_len = (void *) NLMSG_TAIL(n) - (void *) tail;
+   return 0;
+}
+
+static int cbs_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt)
+{
+   struct rtattr *tb[TCA_CBS_MAX+1];
+   struct tc_cbs_qopt *qopt;
+
+   if (opt == NULL)
+   return 0;
+
+   parse_rtattr_nested(tb, TCA_CBS_MAX, opt);
+
+   if (tb[TCA_CBS_PARMS] == NULL)
+   return -1;
+
+  

[RFC net-next 5/5] samples/tsn: Add script for calculating CBS config

2017-08-31 Thread Vinicius Costa Gomes
From: Andre Guedes 

Add a script that takes as input the parameters of the Credit-based
shaper used on FQTSS - link rate, max frame size of best effort
traffic, idleslope and maximum frame size of the time-sensitive
traffic class - for SR classes A and B, and calculates how the CBS
qdisc must be configured for each traffic class.

For example, if you want to have Class A with a bandwidth of 300 Mbps
and Class B of 200 Mbps, and the max frame size of both classes'
traffic is 1500 bytes:

$ ./calculate_cbs_params.py -A 300000 -a 1500 -B 200000 -b 1500

would give you the correct cbs qdisc config command lines to be used.

This script is just a helper to ease testing of the TSN samples -
talker and listener - and shouldn't be taken as highly accurate.

Signed-off-by: Andre Guedes 
Signed-off-by: Jesus Sanchez-Palencia 
---
 samples/tsn/calculate_cbs_params.py | 112 
 1 file changed, 112 insertions(+)
 create mode 100755 samples/tsn/calculate_cbs_params.py

diff --git a/samples/tsn/calculate_cbs_params.py b/samples/tsn/calculate_cbs_params.py
new file mode 100755
index ..9c46210b699f
--- /dev/null
+++ b/samples/tsn/calculate_cbs_params.py
@@ -0,0 +1,112 @@
+#!/usr/bin/env python3
+#
+# Copyright (c) 2017, Intel Corporation
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in the
+#   documentation and/or other materials provided with the distribution.
+# * Neither the name of Intel Corporation nor the names of its contributors
+#   may be used to endorse or promote products derived from this software
+#   without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import argparse
+import math
+import sys
+
+def print_cbs_params_for_class_a(args):
+idleslope = args.idleslope_a
+sendslope = idleslope - args.link_speed
+
+# According to 802.1Q-2014 spec, Annex L, hiCredit and
+# loCredit for SR class A are calculated following the
+# equations L-10 and L-12, respectively.
+hicredit = math.ceil(idleslope * args.frame_non_sr / args.link_speed)
+locredit = math.ceil(sendslope * args.frame_a / args.link_speed)
+
+print("Class A --> # tc qdisc replace dev IFACE parent ID cbs " \
+  "locredit %d hicredit %d sendslope %d idleslope %d" % \
+  (locredit, hicredit, sendslope, idleslope))
+
+def print_cbs_params_for_class_b(args):
+idleslope = args.idleslope_b
+sendslope = idleslope - args.link_speed
+
+# Annex L doesn't present a straightforward equation to
+# calculate hiCredit for Class B so we have to derive it
+# based on generic equations presented in that Annex.
+#
+# L-3 is the primary equation to calculate hiCredit. Section
+# L.2 states that the 'maxInterferenceSize' for SR class B
+# is the maximum burst size for SR class A plus the
+# maxInterferenceSize from SR class A (which is equal to the
+# maximum frame from non-SR traffic).
+#
+# The maximum burst size for SR class A equation is shown in
+# L-16. Merging L-16 into L-3 we get the resulting equation
+# which calculates hiCredit B (refer to section L.3 in case
+# you're not familiar with the legend):
+#
+# hiCredit B = Rb * ( Mo / (Ro - Ra)  +  Ma / Ro )
+#
+hicredit = math.ceil(idleslope * \
+   ((args.frame_non_sr / (args.link_speed - args.idleslope_a)) + \
+   (args.frame_a / args.link_speed)))
+
+# loCredit B is calculated following equation L-2.
+locredit = math.ceil(sendslope * args.frame_b / args.link_speed)
+
+print("Class B --> # tc qdisc replace dev IFACE parent ID cbs " \
+  "locredit %d hicredit %d sendslope %d idleslope %d" % \
+  (locredit, hicredit, sendslope, idleslope))
+

[RFC net-next 4/5] sample: Add TSN Talker and Listener examples

2017-08-31 Thread Vinicius Costa Gomes
From: Jesus Sanchez-Palencia 

Add two examples so one can easily test a 'TSN distributed system'
running with standard kernel interfaces. Both 'talker' and 'listener'
sides are provided, and use an AF_PACKET socket for Tx / Rx of frames.

Running the examples is rather simple.
For the talker, just the interface and SO_PRIORITY are expected as
parameters:

$ ./talker -i enp3s0 -p 3

For the listener, only the interface is needed:

$ ./listener -i enp3s0

The multicast MAC address being used is currently hardcoded on both
examples for simplicity.

Note that the listener side uses a BPF filter so only frames sent to
the correct "stream" destination address are received by the socket.
If you modify the address used by the talker, you must also adapt
the BPF filter otherwise no frames will be received by the socket.

The listener example will print the rate of packets reception after
every 1 second. This makes it easier to verify if the bandwidth
configured for a traffic class is being respected or not.

Signed-off-by: Jesus Sanchez-Palencia 
Signed-off-by: Vinicius Costa Gomes 
Signed-off-by: Andre Guedes 
Signed-off-by: Iván Briano 
---
 samples/tsn/listener.c | 254 +
 samples/tsn/talker.c   | 136 ++
 2 files changed, 390 insertions(+)
 create mode 100644 samples/tsn/listener.c
 create mode 100644 samples/tsn/talker.c

diff --git a/samples/tsn/listener.c b/samples/tsn/listener.c
new file mode 100644
index ..2d17bdfbea99
--- /dev/null
+++ b/samples/tsn/listener.c
@@ -0,0 +1,254 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ *   this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in the
+ *   documentation and/or other materials provided with the distribution.
+ * * Neither the name of Intel Corporation nor the names of its 
contributors
+ *   may be used to endorse or promote products derived from this software
+ *   without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define MAX_FRAME_SIZE 1500
+
+/* XXX: If this address is changed, the BPF filter must be adjusted. */
+static uint8_t multicast_macaddr[] = { 0xBB, 0xAA, 0xBB, 0xAA, 0xBB, 0xAA };
+static char ifname[IFNAMSIZ];
+static uint64_t data_count;
+static int arg_count;
+
+/*
+ * BPF Filter so we only receive frames from the destination MAC address of our
+ * SRP stream. This is hardcoded in multicast_macaddr[].
+ */
+static struct sock_filter dst_addr_filter[] = {
+   { 0x20,  0,  0, 0x00000000 }, /* Load DST address: first 32bits only */
+   { 0x15,  0,  3, 0xbbaabbaa }, /* Compare with first 32bits from MAC */
+   { 0x28,  0,  0, 0x00000004 }, /* Load DST address: remaining 16bits */
+   { 0x15,  0,  1, 0xbbaa }, /* Compare with last 16bits from MAC */
+   { 0x06,  0,  0, 0xffffffff }, /* Ret -1: accept the frame. */
+   { 0x06,  0,  0, 0x00000000 }, /* Ret 0. Jump here if any mismatches. */
+};
+
+/* BPF program */
+static struct sock_fprog bpf = {
+   .len = 6, /* Number of instructions on BPF filter */
+   .filter = dst_addr_filter,
+};
+
+static struct argp_option options[] = {
+   {"ifname", 'i', "IFNAME", 0, "Network Interface" },
+   { 0 }
+};
+
+static error_t parser(int key, char *arg, struct argp_state *s)
+{
+   switch (key) {
+   case 'i':
+   strncpy(ifname, arg, sizeof(ifname) - 1);
+   arg_count++;
+   break;
+   case ARGP_KEY_END:
+   if (arg_count < 1)
+   argp_failure(s, 1, 0, "Options missing. Check --help");
+   break;
+   }
+
+   return 0;
+}
+
+static struct argp argp = { options, par

[RFC net-next 0/5] TSN: Add qdisc-based config interfaces for traffic shapers

2017-08-31 Thread Vinicius Costa Gomes
Hi,

This patchset is an RFC on a proposal of how the Traffic Control subsystem can
be used to offload the configuration of traffic shapers into network devices
that provide support for them in HW. Our goal here is to start upstreaming
support for features related to the Time-Sensitive Networking (TSN) set of
standards into the kernel.

As part of this work, we've assessed previous public discussions related to TSN
enabling: patches from Henrik Austad (Cisco), the presentation from Eric Mann
at Linux Plumbers 2012, patches from Gangfeng Huang (National Instruments) and
the current state of the OpenAVNU project (https://github.com/AVnu/OpenAvnu/).

Please note that the patches provided as part of this RFC are implementing what
is needed only for 802.1Qav (FQTSS) only, but we'd like to take advantage of
this discussion and share our WIP ideas for the 802.1Qbv and 802.1Qbu interfaces
as well. The current patches are only providing support for HW offload of the
configs.


Overview


Time-sensitive Networking (TSN) is a set of standards that aim to address
resources availability for providing bandwidth reservation and bounded latency
on Ethernet based LANs. The proposal described here aims to cover mainly what is
needed to enable the following standards: 802.1Qat, 802.1Qav, 802.1Qbv and
802.1Qbu.

The initial target of this work is the Intel i210 NIC, but other controllers'
datasheet were also taken into account, like the Renesas RZ/A1H RZ/A1M group and
the Synopsis DesignWare Ethernet QoS controller.


Proposal


Feature-wise, what is covered here are configuration interfaces for HW
implementations of the Credit-Based shaper (CBS, 802.1Qav), Time-Aware shaper
(802.1Qbv) and Frame Preemption (802.1Qbu). CBS is a per-queue shaper, while
Qbv and Qbu must be configured per port, with the configuration covering all
queues. Given that these features are related to traffic shaping, and that the
traffic control subsystem already provides a queueing discipline that offloads
config into the device driver (i.e. mqprio), designing new qdiscs for the
specific purpose of offloading the config for each shaper seemed like a good
fit.

For steering traffic into the correct queues, we use the socket option
SO_PRIORITY and then a mechanism to map priority to traffic classes / Tx queues.
The qdisc mqprio is currently used in our tests.

As for the shapers config interface:

 * CBS (802.1Qav)

   This patchset is proposing a new qdisc called 'cbs'. Its 'tc' cmd line is:
   $ tc qdisc add dev IFACE parent ID cbs locredit N hicredit M sendslope S \
 idleslope I

   Note that the parameters for this qdisc are the ones defined by the
   802.1Q-2014 spec, so no hardware specific functionality is exposed here.


 * Time-aware shaper (802.1Qbv):

   The idea we are currently exploring is to add a "time-aware", priority based
   qdisc, that also exposes the Tx queues available and provides a mechanism for
   mapping priority <-> traffic class <-> Tx queues in a similar fashion as
   mqprio. We are calling this qdisc 'taprio', and its 'tc' cmd line would be:

   $ tc qdisc add dev ens4 parent root handle 100 taprio num_tc 4 \
   map 2 2 1 0 3 3 3 3 3 3 3 3 3 3 3 3 \
   queues 0 1 2 3  \
   sched-file gates.sched [base-time <time>]   \
   [cycle-time <time>] [extension-time <time>]

   The sched-file is multi-line, with each line being of the following format:
   <cmd> <gate mask> <interval in nanoseconds>

   Qbv only defines one <cmd>: "S" for 'SetGates'

   For example:

   S 0x01 300
   S 0x03 500

   This means that there are two intervals, the first will have the gate
   for traffic class 0 open for 300 nanoseconds, the second will have
   both traffic classes open for 500 nanoseconds.

   Additionally, an option to set just one entry of the gate control list will
   also be provided by 'taprio':

   $ tc qdisc (...) \
 sched-row <row> <cmd> <gate mask> <interval> \
 [base-time <time>] [cycle-time <time>] \
 [extension-time <time>]


 * Frame Preemption (802.1Qbu):

   To control even further the latency, it may prove useful to signal which
   traffic classes are marked as preemptable. For that, 'taprio' provides the
   preemption command so you set each traffic class as preemptable or not:

   $ tc qdisc (...) \
preemption 0 1 1 1


 * Time-aware shaper + Preemption:

   As an example of how Qbv and Qbu can be used together, we may specify
   both the schedule and the preempt-mask, and this way we may also
   specify the Set-Gates-and-Hold and Set-Gates-and-Release commands as
   specified in the Qbu spec:

   $ tc qdisc add dev ens4 parent root handle 100 taprio num_tc 4 \
   map 2 2 1 0 3 3 3 3 3 3 3 3 3 3 3 3\
   queues 0 1 2 3 \
   preemption 0 1 1 1 \
   sched-file preempt_gates.sched

 The sched-file is multi-line, with each line being of the following format:
  <cmd> <gate mask> <interval>

F

[RFC net-next 1/5] net/sched: Introduce the user API for the CBS shaper

2017-08-31 Thread Vinicius Costa Gomes
Export the API necessary for configuring the CBS shaper (implemented
in the next patch) via the tc tool.

Signed-off-by: Vinicius Costa Gomes 
---
 include/uapi/linux/pkt_sched.h | 29 +
 1 file changed, 29 insertions(+)

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 099bf5528fed..aa4a3e5421be 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -871,4 +871,33 @@ struct tc_pie_xstats {
__u32 maxq; /* maximum queue size */
__u32 ecn_mark; /* packets marked with ecn*/
 };
+
+/* CBS */
+/* FIXME: this is only for usage with ndo_setup_tc(), this should be
+ * in another header someplace else. Is pkt_cls.h the right place?
+ */
+struct tc_cbs_qopt_offload {
+   u8  enable;
+   s32 queue;
+   s32 hicredit;
+   s32 locredit;
+   s32 idleslope;
+   s32 sendslope;
+};
+
+struct tc_cbs_qopt {
+   __s32   hicredit;
+   __s32   locredit;
+   __s32   idleslope;
+   __s32   sendslope;
+};
+
+enum {
+   TCA_CBS_UNSPEC,
+   TCA_CBS_PARMS,
+   __TCA_CBS_MAX,
+};
+
+#define TCA_CBS_MAX (__TCA_CBS_MAX - 1)
+
 #endif
-- 
2.14.1



[RFC net-next 2/5] net/sched: Introduce Credit Based Shaper (CBS) qdisc

2017-08-31 Thread Vinicius Costa Gomes
This queueing discipline implements the shaper algorithm defined by
the 802.1Q-2014 Section 8.6.8.2 and detailed in Annex L.

Its primary usage is to apply some bandwidth reservation to user
defined traffic classes, which are mapped to different queues via the
mqprio qdisc.

Initially, it only supports offloading the traffic shaping work to
supporting controllers.

Later, when a software implementation is added, the current dependency
on being installed "under" mqprio can be lifted.

Signed-off-by: Vinicius Costa Gomes 
Signed-off-by: Jesus Sanchez-Palencia 
---
 include/linux/netdevice.h |   1 +
 net/sched/Kconfig |  11 ++
 net/sched/Makefile|   1 +
 net/sched/sch_cbs.c   | 286 ++
 4 files changed, 299 insertions(+)
 create mode 100644 net/sched/sch_cbs.c

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 35de8312e0b5..dd9a2ecd0c03 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -775,6 +775,7 @@ enum tc_setup_type {
TC_SETUP_CLSFLOWER,
TC_SETUP_CLSMATCHALL,
TC_SETUP_CLSBPF,
+   TC_SETUP_CBS,
 };
 
 /* These structures hold the attributes of xdp state that are being passed
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index e70ed26485a2..c03d86a7775e 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -172,6 +172,17 @@ config NET_SCH_TBF
  To compile this code as a module, choose M here: the
  module will be called sch_tbf.
 
+config NET_SCH_CBS
+   tristate "Credit Based Shaper (CBS)"
+   ---help---
+ Say Y here if you want to use the Credit Based Shaper (CBS) packet
+ scheduling algorithm.
+
+ See the top of <file:net/sched/sch_cbs.c> for more details.
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_cbs.
+
 config NET_SCH_GRED
tristate "Generic Random Early Detection (GRED)"
---help---
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 7b915d226de7..80c8f92d162d 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -52,6 +52,7 @@ obj-$(CONFIG_NET_SCH_FQ_CODEL)+= sch_fq_codel.o
 obj-$(CONFIG_NET_SCH_FQ)   += sch_fq.o
 obj-$(CONFIG_NET_SCH_HHF)  += sch_hhf.o
 obj-$(CONFIG_NET_SCH_PIE)  += sch_pie.o
+obj-$(CONFIG_NET_SCH_CBS)  += sch_cbs.o
 
 obj-$(CONFIG_NET_CLS_U32)  += cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)   += cls_route.o
diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c
new file mode 100644
index ..1c86a9e14150
--- /dev/null
+++ b/net/sched/sch_cbs.c
@@ -0,0 +1,286 @@
+/*
+ * net/sched/sch_cbs.c Credit Based Shaper
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors:    Vinicius Costa Gomes
+ *
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+struct cbs_sched_data {
+   struct Qdisc *qdisc; /* Inner qdisc, default - pfifo queue */
+   s32 queue;
+   s32 locredit;
+   s32 hicredit;
+   s32 sendslope;
+   s32 idleslope;
+};
+
+static int cbs_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+  struct sk_buff **to_free)
+{
+   struct cbs_sched_data *q = qdisc_priv(sch);
+   int ret;
+
+   ret = qdisc_enqueue(skb, q->qdisc, to_free);
+   if (ret != NET_XMIT_SUCCESS) {
+   if (net_xmit_drop_count(ret))
+   qdisc_qstats_drop(sch);
+   return ret;
+   }
+
+   qdisc_qstats_backlog_inc(sch, skb);
+   sch->q.qlen++;
+   return NET_XMIT_SUCCESS;
+}
+
+static struct sk_buff *cbs_dequeue(struct Qdisc *sch)
+{
+   struct cbs_sched_data *q = qdisc_priv(sch);
+   struct sk_buff *skb;
+
+   skb = q->qdisc->ops->peek(q->qdisc);
+   if (skb) {
+   skb = qdisc_dequeue_peeked(q->qdisc);
+   if (unlikely(!skb))
+   return NULL;
+
+   qdisc_qstats_backlog_dec(sch, skb);
+   sch->q.qlen--;
+   qdisc_bstats_update(sch, skb);
+
+   return skb;
+   }
+   return NULL;
+}
+
+static void cbs_reset(struct Qdisc *sch)
+{
+   struct cbs_sched_data *q = qdisc_priv(sch);
+
+   qdisc_reset(q->qdisc);
+}
+
+static const struct nla_policy cbs_policy[TCA_CBS_MAX + 1] = {
+   [TCA_CBS_PARMS] = { .len = sizeof(struct tc_cbs_qopt) },
+};
+
+static int cbs_change(struct Qdisc *sch, struct nlattr *opt)
+{
+   struct cbs_sched_data *q = qdisc_priv(sch);
+   struct tc_cbs_qopt_offload cbs = { };
+   struct nlattr *tb[TCA_CBS_MAX + 1];
+   const struct net_device_ops *ops;
+   struct tc_cbs_qopt *qopt;
+   struct net_device *dev;
+   int err;
+
+  

[RFC net-next 3/5] igb: Add support for CBS offload

2017-08-31 Thread Vinicius Costa Gomes
From: Andre Guedes 

This patch adds support for Credit-Based Shaper (CBS) qdisc offload
from the Traffic Control system. This support enables us to leverage the
Forwarding and Queuing for Time-Sensitive Streams (FQTSS) features
from Intel i210 Ethernet Controller. FQTSS is the former 802.1Qav
standard which was merged into 802.1Q in 2014. It enables traffic
prioritization and bandwidth reservation via the Credit-Based Shaper
which is implemented in hardware by i210 controller.

The patch introduces the igb_setup_tc() function which implements the
support for CBS qdisc hardware offload in the IGB driver. CBS offload
is the only traffic control offload supported by the driver at the
moment.

FQTSS transmission mode from i210 controller is automatically enabled
by the IGB driver when the CBS is enabled for the first hardware
queue. Likewise, FQTSS mode is automatically disabled when CBS is
disabled for the last hardware queue. Changing FQTSS mode requires NIC
reset.

FQTSS feature is supported by i210 controller only.

Signed-off-by: Andre Guedes 
Signed-off-by: Jesus Sanchez-Palencia 
---
 drivers/net/ethernet/intel/igb/e1000_defines.h |  23 ++
 drivers/net/ethernet/intel/igb/e1000_regs.h|   8 +
 drivers/net/ethernet/intel/igb/igb.h   |   6 +
 drivers/net/ethernet/intel/igb/igb_main.c  | 349 +
 4 files changed, 386 insertions(+)

diff --git a/drivers/net/ethernet/intel/igb/e1000_defines.h b/drivers/net/ethernet/intel/igb/e1000_defines.h
index 1de82f247312..83cabff1e0ab 100644
--- a/drivers/net/ethernet/intel/igb/e1000_defines.h
+++ b/drivers/net/ethernet/intel/igb/e1000_defines.h
@@ -353,7 +353,18 @@
 #define E1000_RXPBS_CFG_TS_EN   0x80000000
 
 #define I210_RXPBSIZE_DEFAULT  0x000000A2 /* RXPBSIZE default */
+#define I210_RXPBSIZE_MASK 0x0000003F
+#define I210_RXPBSIZE_PB_32KB  0x00000020
 #define I210_TXPBSIZE_DEFAULT  0x04000014 /* TXPBSIZE default */
+#define I210_TXPBSIZE_MASK 0xC0FFFFFF
+#define I210_TXPBSIZE_PB0_8KB  (8 << 0)
+#define I210_TXPBSIZE_PB1_8KB  (8 << 6)
+#define I210_TXPBSIZE_PB2_4KB  (4 << 12)
+#define I210_TXPBSIZE_PB3_4KB  (4 << 18)
+
+#define I210_DTXMXPKTSZ_DEFAULT    0x00000098
+
+#define I210_SR_QUEUES_NUM 2
 
 /* SerDes Control */
 #define E1000_SCTL_DISABLE_SERDES_LOOPBACK 0x0400
@@ -1051,4 +1062,16 @@
 #define E1000_VLAPQF_P_VALID(_n)   (0x1 << (3 + (_n) * 4))
 #define E1000_VLAPQF_QUEUE_MASK0x03
 
+/* TX Qav Control fields */
+#define E1000_TQAVCTRL_XMIT_MODE   BIT(0)
+#define E1000_TQAVCTRL_DATAFETCHARBBIT(4)
+#define E1000_TQAVCTRL_DATATRANARB BIT(8)
+
+/* TX Qav Credit Control fields */
+#define E1000_TQAVCC_IDLESLOPE_MASK    0xFFFF
+#define E1000_TQAVCC_QUEUEMODE BIT(31)
+
+/* Transmit Descriptor Control fields */
+#define E1000_TXDCTL_PRIORITY  BIT(27)
+
 #endif
diff --git a/drivers/net/ethernet/intel/igb/e1000_regs.h b/drivers/net/ethernet/intel/igb/e1000_regs.h
index 58adbf234e07..8eee081d395f 100644
--- a/drivers/net/ethernet/intel/igb/e1000_regs.h
+++ b/drivers/net/ethernet/intel/igb/e1000_regs.h
@@ -421,6 +421,14 @@ do { \
 
 #define E1000_I210_FLA 0x1201C
 
+#define E1000_I210_DTXMXPKTSZ  0x355C
+
+#define E1000_I210_TXDCTL(_n)  (0x0E028 + ((_n) * 0x40))
+
+#define E1000_I210_TQAVCTRL0x3570
+#define E1000_I210_TQAVCC(_n)  (0x3004 + ((_n) * 0x40))
+#define E1000_I210_TQAVHC(_n)  (0x300C + ((_n) * 0x40))
+
 #define E1000_INVM_DATA_REG(_n)    (0x12120 + 4*(_n))
 #define E1000_INVM_SIZE    64 /* Number of INVM Data Registers */
 
diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h
index 06ffb2bc713e..92845692087a 100644
--- a/drivers/net/ethernet/intel/igb/igb.h
+++ b/drivers/net/ethernet/intel/igb/igb.h
@@ -281,6 +281,11 @@ struct igb_ring {
u16 count;  /* number of desc. in the ring */
u8 queue_index; /* logical index of the ring*/
u8 reg_idx; /* physical index of the ring */
+   bool cbs_enable;/* indicates if CBS is enabled */
+   s32 idleslope;  /* idleSlope in kbps */
+   s32 sendslope;  /* sendSlope in kbps */
+   s32 hicredit;   /* hiCredit in bytes */
+   s32 locredit;   /* loCredit in bytes */
 
/* everything past this point are written often */
u16 next_to_clean;
@@ -621,6 +626,7 @@ struct igb_adapter {
 #define IGB_FLAG_EEE   BIT(14)
 #define IGB_FLAG_VLAN_PROMISC  BIT(15)
 #define IGB_FLAG_RX_LEGACY BIT(16)
+#define IGB_FLAG_FQTSS BIT(17)
 
 /* Media Auto Sense */
 #define IGB_MAS_ENABLE_0   0X0001
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index fd4a46b03cc8..47cabca0c99a 100644
--- a/drivers/n

Re: [PATCH 31/31] timer: Switch to testing for .function instead of .data

2017-08-31 Thread Dmitry Torokhov
On Thu, Aug 31, 2017 at 4:59 PM, Kees Cook  wrote:
> On Thu, Aug 31, 2017 at 4:45 PM, Dmitry Torokhov
>  wrote:
>> On Thu, Aug 31, 2017 at 4:29 PM, Kees Cook  wrote:
>>> In several places, .data is checked for initialization to gate early
>>> calls to del_timer_sync(). Checking for .function is equally valid, so
>>> switch to this in all callers.
>>
>> Not seeing the rest of the patches, it is unclear from the patch
>> description why this is needed/wanted.
>
> The CC list would have been really giant, but here is the first patch
> and the earlier series list:
>
> https://lkml.org/lkml/2017/8/31/904
> https://lkml.org/lkml/2017/8/30/760
>
> tl;dr: We're going to switch all struct timer_list callbacks to get
> the timer pointer as the argument instead of from the .data field.
> This patch is one step in removing open-coded users of the .data
> field.
>

And that is exactly what should have been in the patch description.

FWIW for input bits:

Acked-by: Dmitry Torokhov 

Thanks.

-- 
Dmitry


Re: [PATCH 2/3] security: bpf: Add eBPF LSM hooks and security field to eBPF map

2017-08-31 Thread Chenbo Feng
On Thu, Aug 31, 2017 at 3:38 PM, Daniel Borkmann  wrote:
> On 08/31/2017 10:56 PM, Chenbo Feng wrote:
>>
>> From: Chenbo Feng 
>>
>> Introduce a pointer into struct bpf_map to hold the security information
>> about the map. The actual security struct varies based on the security
>> models implemented. Place the LSM hooks before each of the unrestricted
>> eBPF operations, the map_update_elem and map_delete_elem operations are
>> checked by security_map_modify. The map_lookup_elem and map_get_next_key
>> operations are checked by security_map_read.
>>
>> Signed-off-by: Chenbo Feng 
>
>
> Against which tree is this by the way, net-next? There are
> changes here which require a rebase of your set.
>

This patch series is rebased on the security subsystem tree, since
patch 1/3 is a patch for the security system, I assume. But I am not
sure where this specific patch should go in. If I should submit this
one to net-next, I will rebase it against net-next and submit again.
>> ---
>>   include/linux/bpf.h  |  3 +++
>>   kernel/bpf/syscall.c | 28 
>>   2 files changed, 31 insertions(+)
>>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index b69e7a5869ff..ca3e6ff7091d 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -53,6 +53,9 @@ struct bpf_map {
>> struct work_struct work;
>> atomic_t usercnt;
>> struct bpf_map *inner_map_meta;
>> +#ifdef CONFIG_SECURITY
>> +   void *security;
>> +#endif
>>   };
>>
>>   /* function argument constraints */
>> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>> index 045646da97cc..b15580bcf3b1 100644
>> --- a/kernel/bpf/syscall.c
>> +++ b/kernel/bpf/syscall.c
>> @@ -279,6 +279,10 @@ static int map_create(union bpf_attr *attr)
>> if (err)
>> return -EINVAL;
>>
>> +   err = security_map_create();
>
>
> Seems a bit limited to me, don't you want to be able to
> also differentiate between different map types? Same goes
> for security_prog_load() wrt prog types, no?
>
I don't want to introduce extra complexity if it's not necessary, so I
only included what's needed for the selinux implementation for now. But
I agree that the map and program type information could be useful when
people develop more complex security checks. I will add a union
bpf_attr *attr pointer to the security hook functions for future needs.
>> +   if (err)
>> +   return -EACCES;
>> +
>> /* find map type and init map: hashtable vs rbtree vs bloom vs ...
>> */
>> map = find_and_alloc_map(attr);
>> if (IS_ERR(map))
>> @@ -291,6 +295,10 @@ static int map_create(union bpf_attr *attr)
>> if (err)
>> goto free_map_nouncharge;
>>
>> +   err = security_post_create(map);
>> +   if (err < 0)
>> +   goto free_map;
>> +
>
>
> So the hook you implement in patch 3/3 does:
>
> +static int selinux_bpf_post_create(struct bpf_map *map)
> +{
> +   struct bpf_security_struct *bpfsec;
> +
> +   bpfsec = kzalloc(sizeof(*bpfsec), GFP_KERNEL);
> +   if (!bpfsec)
> +   return -ENOMEM;
> +
> +   bpfsec->sid = current_sid();
> +   map->security = bpfsec;
> +
> +   return 0;
> +}
>
> Where do you kfree() bpfsec when the map gets released
> normally or in error path?
>
Will add it in the next version. Thanks for pointing it out.
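Probably something along these lines, paired with the post_create hook
(the free-side hook name is hypothetical until the next version):

/* Hypothetical counterpart hook, called from the map release path,
 * to free what selinux_bpf_post_create() allocated into map->security.
 */
static void selinux_bpf_map_free(struct bpf_map *map)
{
	struct bpf_security_struct *bpfsec = map->security;

	map->security = NULL;
	kfree(bpfsec);
}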
>> err = bpf_map_alloc_id(map);
>> if (err)
>> goto free_map;
>> @@ -410,6 +418,10 @@ static int map_lookup_elem(union bpf_attr *attr)
>> if (IS_ERR(map))
>> return PTR_ERR(map);
>>
>> +   err = security_map_read(map);
>> +   if (err)
>> +   return -EACCES;
>
>
> How about actually dropping ref?
>
My bad, thanks again.
>> +
>> key = memdup_user(ukey, map->key_size);
>> if (IS_ERR(key)) {
>> err = PTR_ERR(key);
>> @@ -490,6 +502,10 @@ static int map_update_elem(union bpf_attr *attr)
>> if (IS_ERR(map))
>> return PTR_ERR(map);
>>
>> +   err = security_map_modify(map);
>> +   if (err)
>> +   return -EACCES;
>
>
> Ditto ...
>
>> key = memdup_user(ukey, map->key_size);
>> if (IS_ERR(key)) {
>> err = PTR_ERR(key);
>> @@ -573,6 +589,10 @@ static int map_delete_elem(union bpf_attr *attr)
>> if (IS_ERR(map))
>> return PTR_ERR(map);
>>
>> +   err = security_map_modify(map);
>> +   if (err)
>> +   return -EACCES;
>
>
> Ditto ...
>
>> key = memdup_user(ukey, map->key_size);
>> if (IS_ERR(key)) {
>> err = PTR_ERR(key);
>> @@ -616,6 +636,10 @@ static int map_get_next_key(union bpf_attr *attr)
>> if (IS_ERR(map))
>> return PTR_ERR(map);
>>
>> +   err = security_map_read(map);
>> +   if (err)
>> +   return -EACCES;
>
>
> And once again here ... :(
>
>
>> if (ukey) {
>> key = memdup_

Re: [PATCH 19/31] timer: Remove open-coded casts for .data and .function

2017-08-31 Thread Tyrel Datwyler
On 08/31/2017 04:29 PM, Kees Cook wrote:
> This standardizes the callback and data prototypes in several places that
> perform casting, in an effort to remove more open-coded .data and
> .function uses in favor of setup_timer().
> 
> Cc: Samuel Ortiz 
> Cc: Tyrel Datwyler 
> Cc: Benjamin Herrenschmidt 
> Cc: Paul Mackerras 
> Cc: Michael Ellerman 
> Cc: "James E.J. Bottomley" 
> Cc: "Martin K. Petersen" 
> Cc: netdev@vger.kernel.org
> Cc: linux-s...@vger.kernel.org
> Cc: linuxppc-...@lists.ozlabs.org
> Signed-off-by: Kees Cook 
> ---
>  drivers/net/irda/bfin_sir.c  |  5 +++--
>  drivers/scsi/ibmvscsi/ibmvfc.c   | 14 ++
>  drivers/scsi/ibmvscsi/ibmvscsi.c |  8 
>  3 files changed, 13 insertions(+), 14 deletions(-)

Resending from correct email address.

For ibmvfc & ibmvscsi portions:

Acked-by: Tyrel Datwyler 



Re: [PATCH 19/31] timer: Remove open-coded casts for .data and .function

2017-08-31 Thread Tyrel Datwyler
On 08/31/2017 04:29 PM, Kees Cook wrote:
> This standardizes the callback and data prototypes in several places that
> perform casting, in an effort to remove more open-coded .data and
> .function uses in favor of setup_timer().
> 
> Cc: Samuel Ortiz 
> Cc: Tyrel Datwyler 
> Cc: Benjamin Herrenschmidt 
> Cc: Paul Mackerras 
> Cc: Michael Ellerman 
> Cc: "James E.J. Bottomley" 
> Cc: "Martin K. Petersen" 
> Cc: netdev@vger.kernel.org
> Cc: linux-s...@vger.kernel.org
> Cc: linuxppc-...@lists.ozlabs.org
> Signed-off-by: Kees Cook 
> ---
>  drivers/net/irda/bfin_sir.c  |  5 +++--
>  drivers/scsi/ibmvscsi/ibmvfc.c   | 14 ++
>  drivers/scsi/ibmvscsi/ibmvscsi.c |  8 
>  3 files changed, 13 insertions(+), 14 deletions(-)
For ibmvfc & ibmvscsi portions:

Acked-by: Tyrel Datwyler 


Re: netdev carrier changes is one even after ethernet link up.

2017-08-31 Thread Florian Fainelli
On 08/30/2017 10:53 PM, Bhadram Varka wrote:
> Hi,
> 
>  
> 
> I have observed that carrier_changes is one even when the ethernet
> link is up.
> 
>  
> 
> After investigating the code, below is my observation:
> 
>  
> 
> ethernet_driver_probe()
> 
> +--->phy_connect()
> 
> | +--->phy_attach_direct()
> 
> |   +---> netif_carrier_off(): which increments
> carrier_changes to one.
> 
> +--->register_netdevice() : will carrier_changes become zero here?
> 
> +--->netif_carrier_off(): does not increment carrier_changes since
> __LINK_STATE_NOCARRIER is already set.
> 
>  
> 
> From the ethernet driver, open will start the PHY and trigger the
> phy_state_machine.
> 
> The phy_state_machine workqueue calls netif_carrier_on() once the link is UP.
> 
> netif_carrier_on() increments the carrier_changes by one.

If the call trace is correct, then there are at least two problems here:

- phy_connect() does start the PHY machine which means that as soon as
it detects a link state of any kind (up or down) it can call
netif_carrier_off() respectively netif_carrier_on()

- as soon as you call register_netdevice() notifiers run and other parts
of the kernel or user-space programs can see an inconsistent link state

I would suggest doing the following sequence instead:

netif_carrier_off()
register_netdevice()
phy_connect()

Which should result in a consistent link state and carrier value.
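
In driver probe terms, a minimal sketch of that ordering (driver and
handler names hypothetical):

static int foo_netdev_init(struct net_device *ndev, const char *phy_name)
{
	struct phy_device *phydev;
	int ret;

	/* Start from a consistent carrier-off state before the device
	 * becomes visible to notifiers and user space.
	 */
	netif_carrier_off(ndev);

	ret = register_netdev(ndev);
	if (ret)
		return ret;

	/* Only now let phylib drive carrier transitions. */
	phydev = phy_connect(ndev, phy_name, foo_adjust_link,
			     PHY_INTERFACE_MODE_RGMII);
	if (IS_ERR(phydev)) {
		unregister_netdev(ndev);
		return PTR_ERR(phydev);
	}

	return 0;
}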

> 
>  
> 
> After the link is UP, if we check the carrier_changes sysfs node, it
> will be only one.
> 
>  
> 
> $ cat /sys/class/net/eth0/carrier_changes
> 
> 1
> 
>  
> 
> After reverting the change - https://lkml.org/lkml/2016/1/9/173 (net:
> phy: turn carrier off on phy attach) then I could see the carrier
> changes incremented to 2 after Link UP.
> 
> $ cat /sys/class/net/eth0/carrier_changes
> 
> 2
> 
>  
> 
> Thanks,
> 
> Bhadram.
> 
> 
> 


-- 
Florian


Re: [PATCH net-next v5 2/2] tcp_diag: report TCP MD5 signing keys and addresses

2017-08-31 Thread Ivan Delalande
On Fri, Sep 01, 2017 at 01:26:33AM +0200, Sabrina Dubroca wrote:
> 2017-08-31, 09:59:39 -0700, Ivan Delalande wrote:
> > diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
> > index a748c74aa8b7..abbf0edcf6c2 100644
> > --- a/net/ipv4/tcp_diag.c
> > +++ b/net/ipv4/tcp_diag.c
> [...]
> > +static int tcp_diag_get_aux(struct sock *sk, bool net_admin,
> > +   struct sk_buff *skb)
> > +{
> > +#ifdef CONFIG_TCP_MD5SIG
> > +   if (net_admin) {
> 
> In tcp_diag_get_aux_size() you put a check for sk_fullsock. I don't
> see anything preventing you from reaching this with a !fullsock?

Currently handler->idiag_get_aux is only called from inet_sk_diag_fill
which has a `BUG_ON(!sk_fullsock(sk));`, but I could add another
explicit check in that function if you think it's more consistent.
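
Something like this at the top of tcp_diag_get_aux(), if you prefer the
explicit guard (sketch; today it would be redundant with the BUG_ON in
inet_sk_diag_fill):

static int tcp_diag_get_aux(struct sock *sk, bool net_admin,
			    struct sk_buff *skb)
{
	/* Mirror the sk_fullsock() check in tcp_diag_get_aux_size() so
	 * this helper never touches fields that only exist on full socks.
	 */
	if (!sk_fullsock(sk))
		return 0;

	/* ... existing CONFIG_TCP_MD5SIG reporting follows ... */
}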

Actually, I wasn't sure when adding this idiag_get_aux in v2 if it
should be called from inet_twsk_diag_fill, inet_req_diag_fill and
inet_csk_diag_fill, or just the last one. I chose that simpler approach
for now to avoid duplicating these state checks in the idiag_get_aux
defined by protocols, and because we didn't need it for INET_DIAG_MD5SIG,
but it shouldn't be too hard to change. Do you think this could be
useful for other protocols or attributes?

Thank you,
-- 
Ivan Delalande
Arista Networks


Re: [PATCH v2 net-next 0/2] net: ubuf_info.refcnt conversion

2017-08-31 Thread Eric Dumazet
On Thu, 2017-08-31 at 16:48 -0700, Eric Dumazet wrote:
> Yet another atomic_t -> refcount_t conversion, split in two patches.
> 
> First patch prepares the automatic conversion done in the second patch.
> 
> Eric Dumazet (2):
>   net: prepare (struct ubuf_info)->refcnt conversion
>   net: convert (struct ubuf_info)->refcnt to refcount_t
> 
>  drivers/vhost/net.c|  2 +-
>  include/linux/skbuff.h |  5 +++--
>  net/core/skbuff.c  | 14 --
>  net/ipv4/tcp.c |  2 --
>  4 files changed, 8 insertions(+), 15 deletions(-)
> 

David please ignore this series, I will send a V3 :)




Re: [RFC net-next 0/8] net: dsa: Multi-queue awareness

2017-08-31 Thread Andrew Lunn
On Wed, Aug 30, 2017 at 05:18:44PM -0700, Florian Fainelli wrote:
> This patch series is sent as a reference, especially because the last patch
> is trying not to create too many layer violations, but clearly a few
> are being created here anyway.
> 
> Essentially what I am trying to achieve is that you have a stacked device 
> which
> is multi-queue aware, that applications will be using, and for which they can
> control the queue selection (using mq) the way they want. Each of each stacked
> network devices are created for each port of the switch (this is what DSA
> does). When a skb is submitted from say net_device X, we can derive its port
> number and look at the queue_mapping value to determine which port of the
> switch and queue we should be sending this to. The information is embedded in 
> a
> tag (4 bytes) and is used by the switch to steer the transmission.
> 
> These stacked devices will actually transmit using a "master" or conduit
> network device which has a number of queues as well. In one version of the
> hardware that I work with, we have up to 4 ports, each with 8 queues, and the
> master device has a total of 32 hardware queues, so a 1:1 mapping is easy. 
> With
> another version of the hardware, same number of ports and queues, but only 16
> hardware queues, so only a 2:1 mapping is possible.
> 
> In order for congestion information to work properly, I need to establish a
> mapping, preferably before transmission starts (but reconfiguration while
> interfaces are running would be possible too) between these stacked device's
> queue and the conduit interface's queue.
> 
> Comments, flames, rotten tomatoes, anything!

Right, i think i understand.

This works just for traffic between the host and ports.  The host can
set the egress queue. And i assume the queues are priorities, either
absolute or weighted round robin, etc.

But this has no effect on traffic going from port to port. At some
point, i expect you will want to offload TC for that.

How will the two interact? Could the TC rules also act on traffic from
the host to a port? Would it be simpler in the long run to just
implement TC rules?

  Andrew
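
As a rough sketch of the queue mapping Florian describes (function and
parameter names are hypothetical, not taken from the series): with 32
conduit queues the 4x8 port queues map 1:1, with 16 they fold 2:1.

static u16 dsa_port_to_conduit_queue(unsigned int port, u16 queue,
                                     unsigned int queues_per_port,
                                     unsigned int conduit_queues)
{
        /* 4 ports x 8 queues onto 32 conduit queues: 1:1 mapping.
         * Onto 16 conduit queues: two port queues share one (2:1).
         */
        return (port * queues_per_port + queue) % conduit_queues;
}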


Re: [PATCH 31/31] timer: Switch to testing for .function instead of .data

2017-08-31 Thread Kees Cook
On Thu, Aug 31, 2017 at 4:45 PM, Dmitry Torokhov
 wrote:
> On Thu, Aug 31, 2017 at 4:29 PM, Kees Cook  wrote:
>> In several places, .data is checked for initialization to gate early
>> calls to del_timer_sync(). Checking for .function is equally valid, so
>> switch to this in all callers.
>
> Not seeing the rest of patches it is unclear from the patch
> description why this is needed/wanted.

The CC list would have been really giant, but here is the first patch
and the earlier series list:

https://lkml.org/lkml/2017/8/31/904
https://lkml.org/lkml/2017/8/30/760

tl;dr: We're going to switch all struct timer_list callbacks to get
the timer pointer as the argument instead of from the .data field.
This patch is one step in removing open-coded users of the .data
field.
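
A minimal sketch of the intended shape, assuming the conversion lands as
described (container_of() recovers the enclosing object from the timer
pointer, making the .data field unnecessary):

struct foo {
        struct timer_list timer;
        int state;
};

/* old style: context is smuggled through the .data field */
static void foo_timeout_old(unsigned long data)
{
        struct foo *foo = (struct foo *)data;
        /* ... use foo->state ... */
}

/* new style: the callback receives the timer pointer directly */
static void foo_timeout_new(struct timer_list *t)
{
        struct foo *foo = container_of(t, struct foo, timer);
        /* ... use foo->state ... */
}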

-Kees

-- 
Kees Cook
Pixel Security


Re: [PATCH v2 net-next 2/2] net: convert (struct ubuf_info)->refcnt to refcount_t

2017-08-31 Thread Willem de Bruijn
On Thu, Aug 31, 2017 at 7:48 PM, Eric Dumazet  wrote:
> refcount_t type and corresponding API should be
> used instead of atomic_t when the variable is used as
> a reference counter. This allows to avoid accidental
> refcounter overflows that might lead to use-after-free
> situations.
>
> v2: added the change in drivers/vhost/net.c as spotted
> by Willem.
>
> Signed-off-by: Eric Dumazet 

Acked-by: Willem de Bruijn 


Re: [PATCH v2 net-next 1/2] net: prepare (struct ubuf_info)->refcnt conversion

2017-08-31 Thread Willem de Bruijn
On Thu, Aug 31, 2017 at 7:48 PM, Eric Dumazet  wrote:
> In order to convert this atomic_t refcnt to refcount_t,
> we need to init the refcount to one to not trigger
> a 0 -> 1 transition.
>
> This also removes one atomic operation in fast path.
>
> v2: removed dead code in sock_zerocopy_put_abort()
> as suggested by Willem.
>
> Signed-off-by: Eric Dumazet 

Acked-by: Willem de Bruijn 


[PATCH v2 net-next 1/2] net: prepare (struct ubuf_info)->refcnt conversion

2017-08-31 Thread Eric Dumazet
In order to convert this atomic_t refcnt to refcount_t,
we need to init the refcount to one to not trigger
a 0 -> 1 transition.

This also removes one atomic operation in fast path.

v2: removed dead code in sock_zerocopy_put_abort()
as suggested by Willem.

Signed-off-by: Eric Dumazet 
---
 net/core/skbuff.c | 10 ++
 net/ipv4/tcp.c|  2 --
 2 files changed, 2 insertions(+), 10 deletions(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 
917da73d3ab3b82163cf0a9ee944da09cb5a391f..1a754d7896ceac1eb85e3b13c1422b4d6a88fedf
 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -963,7 +963,7 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, 
size_t size)
uarg->len = 1;
uarg->bytelen = size;
uarg->zerocopy = 1;
-   atomic_set(&uarg->refcnt, 0);
+   atomic_set(&uarg->refcnt, 1);
sock_hold(sk);
 
return uarg;
@@ -1005,6 +1005,7 @@ struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, 
size_t size,
uarg->len++;
uarg->bytelen = bytelen;
atomic_set(&sk->sk_zckey, ++next);
+   sock_zerocopy_get(uarg);
return uarg;
}
}
@@ -1102,13 +1103,6 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg)
atomic_dec(&sk->sk_zckey);
uarg->len--;
 
-   /* sock_zerocopy_put expects a ref. Most sockets take one per
-* skb, which is zero on abort. tcp_sendmsg holds one extra, to
-* avoid an skb send inside the main loop triggering uarg free.
-*/
-   if (sk->sk_type != SOCK_STREAM)
-   atomic_inc(&uarg->refcnt);
-
sock_zerocopy_put(uarg);
}
 }
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 
21ca2df274c5130a13d31a391a1408d779af34af..fff4d7bc3b8b46174e7bd0a04d7c212307e55cb5
 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1190,8 +1190,6 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr 
*msg, size_t size)
goto out_err;
}
 
-   /* skb may be freed in main loop, keep extra ref on uarg */
-   sock_zerocopy_get(uarg);
if (!(sk_check_csum_caps(sk) && sk->sk_route_caps & NETIF_F_SG))
uarg->zerocopy = 0;
}
-- 
2.14.1.581.gf28d330327-goog



[PATCH 25/31] net/atm/mpc: Use separate static data field with with static timer

2017-08-31 Thread Kees Cook
In preparation for changing the timer callback argument to the timer
pointer, move to a separate static data variable.

Cc: "David S. Miller" 
Cc: Andrew Morton 
Cc: Alexey Dobriyan 
Cc: "Reshetova, Elena" 
Cc: netdev@vger.kernel.org
Signed-off-by: Kees Cook 
---
 net/atm/mpc.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/net/atm/mpc.c b/net/atm/mpc.c
index 5baa31b8500c..fc24a46500ae 100644
--- a/net/atm/mpc.c
+++ b/net/atm/mpc.c
@@ -95,7 +95,7 @@ static netdev_tx_t mpc_send_packet(struct sk_buff *skb,
 static int mpoa_event_listener(struct notifier_block *mpoa_notifier,
   unsigned long event, void *dev);
 static void mpc_timer_refresh(void);
-static void mpc_cache_check(unsigned long checking_time);
+static void mpc_cache_check(unsigned long unused);
 
 static struct llc_snap_hdr llc_snap_mpoa_ctrl = {
0xaa, 0xaa, 0x03,
@@ -121,7 +121,8 @@ static struct notifier_block mpoa_notifier = {
 
 struct mpoa_client *mpcs = NULL; /* FIXME */
 static struct atm_mpoa_qos *qos_head = NULL;
-static DEFINE_TIMER(mpc_timer, NULL);
+static DEFINE_TIMER(mpc_timer, mpc_cache_check);
+static unsigned long checking_time;
 
 
 static struct mpoa_client *find_mpc_by_itfnum(int itf)
@@ -1411,12 +1412,11 @@ static void clean_up(struct k_message *msg, struct 
mpoa_client *mpc, int action)
 static void mpc_timer_refresh(void)
 {
mpc_timer.expires = jiffies + (MPC_P2 * HZ);
-   mpc_timer.data = mpc_timer.expires;
-   mpc_timer.function = mpc_cache_check;
+   checking_time = mpc_timer.expires;
add_timer(&mpc_timer);
 }
 
-static void mpc_cache_check(unsigned long checking_time)
+static void mpc_cache_check(unsigned long unused)
 {
struct mpoa_client *mpc = mpcs;
static unsigned long previous_resolving_check_time;
-- 
2.7.4



[PATCH v2 net-next 2/2] net: convert (struct ubuf_info)->refcnt to refcount_t

2017-08-31 Thread Eric Dumazet
refcount_t type and corresponding API should be
used instead of atomic_t when the variable is used as
a reference counter. This allows to avoid accidental
refcounter overflows that might lead to use-after-free
situations.

v2: added the change in drivers/vhost/net.c as spotted
by Willem.

Signed-off-by: Eric Dumazet 
---
 drivers/vhost/net.c| 2 +-
 include/linux/skbuff.h | 5 +++--
 net/core/skbuff.c  | 6 +++---
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 
ba08b78ed630c429ccf415af69bf27f2433af4f0..8d2bcae53a2ec9ea1c876625e581bcd429abe365
 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -533,7 +533,7 @@ static void handle_tx(struct vhost_net *net)
ubuf->callback = vhost_zerocopy_callback;
ubuf->ctx = nvq->ubufs;
ubuf->desc = nvq->upend_idx;
-   atomic_set(&ubuf->refcnt, 1);
+   refcount_set(&ubuf->refcnt, 1);
msg.msg_control = ubuf;
msg.msg_controllen = sizeof(ubuf);
ubufs = nvq->ubufs;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 
7594e19bce622a38dc39c054093c3da15b99b67b..316a92b45351f53709886ee0099cbc83b66f1b15
 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -456,7 +457,7 @@ struct ubuf_info {
u32 bytelen;
};
};
-   atomic_t refcnt;
+   refcount_t refcnt;
 
struct mmpin {
struct user_struct *user;
@@ -472,7 +473,7 @@ struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, 
size_t size,
 
 static inline void sock_zerocopy_get(struct ubuf_info *uarg)
 {
-   atomic_inc(&uarg->refcnt);
+   refcount_inc(&uarg->refcnt);
 }
 
 void sock_zerocopy_put(struct ubuf_info *uarg);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 
1a754d7896ceac1eb85e3b13c1422b4d6a88fedf..06b9ddca3188442d1d1d2052fc060cf5b1e2a6f4
 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -963,7 +963,7 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, 
size_t size)
uarg->len = 1;
uarg->bytelen = size;
uarg->zerocopy = 1;
-   atomic_set(&uarg->refcnt, 1);
+   refcount_set(&uarg->refcnt, 1);
sock_hold(sk);
 
return uarg;
@@ -1086,7 +1086,7 @@ EXPORT_SYMBOL_GPL(sock_zerocopy_callback);
 
 void sock_zerocopy_put(struct ubuf_info *uarg)
 {
-   if (uarg && atomic_dec_and_test(&uarg->refcnt)) {
+   if (uarg && refcount_dec_and_test(&uarg->refcnt)) {
if (uarg->callback)
uarg->callback(uarg, uarg->zerocopy);
else
@@ -1483,7 +1483,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int 
ntail,
if (skb_orphan_frags(skb, gfp_mask))
goto nofrags;
if (skb_zcopy(skb))
-   atomic_inc(&skb_uarg(skb)->refcnt);
+   refcount_inc(&skb_uarg(skb)->refcnt);
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
skb_frag_ref(skb, i);
 
-- 
2.14.1.581.gf28d330327-goog



[PATCH v2 net-next 0/2] net: ubuf_info.refcnt conversion

2017-08-31 Thread Eric Dumazet
Yet another atomic_t -> refcount_t conversion, split in two patches.

First patch prepares the automatic conversion done in the second patch.

Eric Dumazet (2):
  net: prepare (struct ubuf_info)->refcnt conversion
  net: convert (struct ubuf_info)->refcnt to refcount_t

 drivers/vhost/net.c|  2 +-
 include/linux/skbuff.h |  5 +++--
 net/core/skbuff.c  | 14 --
 net/ipv4/tcp.c |  2 --
 4 files changed, 8 insertions(+), 15 deletions(-)

-- 
2.14.1.581.gf28d330327-goog



Re: [PATCH 31/31] timer: Switch to testing for .function instead of .data

2017-08-31 Thread Dmitry Torokhov
On Thu, Aug 31, 2017 at 4:29 PM, Kees Cook  wrote:
> In several places, .data is checked for initialization to gate early
> calls to del_timer_sync(). Checking for .function is equally valid, so
> switch to this in all callers.

Not seeing the rest of patches it is unclear from the patch
description why this is needed/wanted.

Thanks.

-- 
Dmitry


Re: [PATCH net] ipv4: Don't override return code from ip_route_input_noref()

2017-08-31 Thread Sabrina Dubroca
2017-08-31, 18:11:41 +0200, Stefano Brivio wrote:
> After ip_route_input() calls ip_route_input_noref(), another
> check on skb_dst() is done, but if this fails, we shouldn't
> override the return code from ip_route_input_noref(), as it
> could have been more specific (i.e. -EHOSTUNREACH).
> 
> This also saves one call to skb_dst_force_safe() and one to
> skb_dst() in case the ip_route_input_noref() check fails.
> 
> Reported-by: Sabrina Dubroca 
> Fixes: ad65a2f05695 ("ipv4: call dst_hold_safe() properly")

That should be instead:

Fixes: 9df16efadd2a ("ipv4: call dst_hold_safe() properly")

Acked-by: Sabrina Dubroca 

-- 
Sabrina


Re: [RFC net-next 1/8] net: dsa: Allow switch drivers to indicate number of RX/TX queues

2017-08-31 Thread Andrew Lunn
On Wed, Aug 30, 2017 at 05:18:45PM -0700, Florian Fainelli wrote:
> Let switch drivers indicate how many RX and TX queues they support. Some
> switches, such as the Broadcom Starfighter 2, are designed with 8 egress
> queues.

Marvell switches also have egress queues.

Does the SF2 have ingress queues? Marvell's don't, as far as I know. So
I wonder if num_rx_queues is useful?

Do switches in general have ingress queues? 

   Andrew


[PATCH 20/31] net/core: Collapse redundant sk_timer callback data assignments

2017-08-31 Thread Kees Cook
The core sk_timer initializer can provide the common .data assignment
instead of it being set separately in users.

Cc: "David S. Miller" 
Cc: Ralf Baechle 
Cc: Andrew Hendry 
Cc: Eric Dumazet 
Cc: Paolo Abeni 
Cc: David Howells 
Cc: Colin Ian King 
Cc: Ingo Molnar 
Cc: linzhang 
Cc: netdev@vger.kernel.org
Cc: linux-h...@vger.kernel.org
Cc: linux-...@vger.kernel.org
Signed-off-by: Kees Cook 
---
 net/core/sock.c   | 2 +-
 net/netrom/nr_timer.c | 1 -
 net/rose/rose_timer.c | 1 -
 net/x25/af_x25.c  | 1 -
 net/x25/x25_timer.c   | 1 -
 5 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/net/core/sock.c b/net/core/sock.c
index ac2a404c73eb..5281a998ab32 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2623,7 +2623,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk_init_common(sk);
sk->sk_send_head=   NULL;
 
-   init_timer(&sk->sk_timer);
+   setup_timer(&sk->sk_timer, NULL, (unsigned long)sk);
 
sk->sk_allocation   =   GFP_KERNEL;
sk->sk_rcvbuf   =   sysctl_rmem_default;
diff --git a/net/netrom/nr_timer.c b/net/netrom/nr_timer.c
index 94d05806a9a2..f84ce71f1f5f 100644
--- a/net/netrom/nr_timer.c
+++ b/net/netrom/nr_timer.c
@@ -45,7 +45,6 @@ void nr_init_timers(struct sock *sk)
setup_timer(&nr->idletimer, nr_idletimer_expiry, (unsigned long)sk);
 
/* initialized by sock_init_data */
-   sk->sk_timer.data = (unsigned long)sk;
sk->sk_timer.function = &nr_heartbeat_expiry;
 }
 
diff --git a/net/rose/rose_timer.c b/net/rose/rose_timer.c
index bc5469d6d9cb..6baa415b199a 100644
--- a/net/rose/rose_timer.c
+++ b/net/rose/rose_timer.c
@@ -36,7 +36,6 @@ void rose_start_heartbeat(struct sock *sk)
 {
del_timer(&sk->sk_timer);
 
-   sk->sk_timer.data = (unsigned long)sk;
sk->sk_timer.function = &rose_heartbeat_expiry;
sk->sk_timer.expires  = jiffies + 5 * HZ;
 
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 5a1a98df3499..a5ac385b9120 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -414,7 +414,6 @@ static void __x25_destroy_socket(struct sock *sk)
/* Defer: outstanding buffers */
sk->sk_timer.expires  = jiffies + 10 * HZ;
sk->sk_timer.function = x25_destroy_timer;
-   sk->sk_timer.data = (unsigned long)sk;
add_timer(&sk->sk_timer);
} else {
/* drop last reference so sock_put will free */
diff --git a/net/x25/x25_timer.c b/net/x25/x25_timer.c
index 5c5db1a36399..de5cec41d100 100644
--- a/net/x25/x25_timer.c
+++ b/net/x25/x25_timer.c
@@ -36,7 +36,6 @@ void x25_init_timers(struct sock *sk)
setup_timer(&x25->timer, x25_timer_expiry, (unsigned long)sk);
 
/* initialized by sock_init_data */
-   sk->sk_timer.data = (unsigned long)sk;
sk->sk_timer.function = &x25_heartbeat_expiry;
 }
 
-- 
2.7.4



[PATCH 19/31] timer: Remove open-coded casts for .data and .function

2017-08-31 Thread Kees Cook
This standardizes the callback and data prototypes in several places that
perform casting, in an effort to remove more open-coded .data and
.function uses in favor of setup_timer().

Cc: Samuel Ortiz 
Cc: Tyrel Datwyler 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
Cc: "James E.J. Bottomley" 
Cc: "Martin K. Petersen" 
Cc: netdev@vger.kernel.org
Cc: linux-s...@vger.kernel.org
Cc: linuxppc-...@lists.ozlabs.org
Signed-off-by: Kees Cook 
---
 drivers/net/irda/bfin_sir.c  |  5 +++--
 drivers/scsi/ibmvscsi/ibmvfc.c   | 14 ++
 drivers/scsi/ibmvscsi/ibmvscsi.c |  8 
 3 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/drivers/net/irda/bfin_sir.c b/drivers/net/irda/bfin_sir.c
index 3151b580dbd6..c9413bd580a7 100644
--- a/drivers/net/irda/bfin_sir.c
+++ b/drivers/net/irda/bfin_sir.c
@@ -317,8 +317,9 @@ static void bfin_sir_dma_rx_chars(struct net_device *dev)
async_unwrap_char(dev, &self->stats, &self->rx_buff, 
port->rx_dma_buf.buf[i]);
 }
 
-void bfin_sir_rx_dma_timeout(struct net_device *dev)
+void bfin_sir_rx_dma_timeout(unsigned long data)
 {
+   struct net_device *dev = (struct net_device *)data;
struct bfin_sir_self *self = netdev_priv(dev);
struct bfin_sir_port *port = self->sir_port;
int x_pos, pos;
@@ -406,7 +407,7 @@ static int bfin_sir_startup(struct bfin_sir_port *port, 
struct net_device *dev)
enable_dma(port->rx_dma_channel);
 
port->rx_dma_timer.data = (unsigned long)(dev);
-   port->rx_dma_timer.function = (void *)bfin_sir_rx_dma_timeout;
+   port->rx_dma_timer.function = bfin_sir_rx_dma_timeout;
 
 #else
 
diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
index cc4e05be8d4a..1be20688dd1f 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.c
+++ b/drivers/scsi/ibmvscsi/ibmvfc.c
@@ -1393,8 +1393,9 @@ static int ibmvfc_map_sg_data(struct scsi_cmnd *scmd,
  *
  * Called when an internally generated command times out
  **/
-static void ibmvfc_timeout(struct ibmvfc_event *evt)
+static void ibmvfc_timeout(unsigned long data)
 {
+   struct ibmvfc_event *evt = (struct ibmvfc_event *)data;
struct ibmvfc_host *vhost = evt->vhost;
dev_err(vhost->dev, "Command timed out (%p). Resetting connection\n", 
evt);
ibmvfc_reset_host(vhost);
@@ -1424,12 +1425,10 @@ static int ibmvfc_send_event(struct ibmvfc_event *evt,
BUG();
 
list_add_tail(&evt->queue, &vhost->sent);
-   init_timer(&evt->timer);
+   setup_timer(&evt->timer, ibmvfc_timeout, (unsigned long)evt);
 
if (timeout) {
-   evt->timer.data = (unsigned long) evt;
evt->timer.expires = jiffies + (timeout * HZ);
-   evt->timer.function = (void (*)(unsigned long))ibmvfc_timeout;
add_timer(&evt->timer);
}
 
@@ -3696,8 +3695,9 @@ static void ibmvfc_tgt_adisc_cancel_done(struct 
ibmvfc_event *evt)
  * out, reset the CRQ. When the ADISC comes back as cancelled,
  * log back into the target.
  **/
-static void ibmvfc_adisc_timeout(struct ibmvfc_target *tgt)
+static void ibmvfc_adisc_timeout(unsigned long data)
 {
+   struct ibmvfc_target *tgt = (struct ibmvfc_target *)data;
struct ibmvfc_host *vhost = tgt->vhost;
struct ibmvfc_event *evt;
struct ibmvfc_tmf *tmf;
@@ -3782,9 +3782,7 @@ static void ibmvfc_tgt_adisc(struct ibmvfc_target *tgt)
if (timer_pending(&tgt->timer))
mod_timer(&tgt->timer, jiffies + (IBMVFC_ADISC_TIMEOUT * HZ));
else {
-   tgt->timer.data = (unsigned long) tgt;
tgt->timer.expires = jiffies + (IBMVFC_ADISC_TIMEOUT * HZ);
-   tgt->timer.function = (void (*)(unsigned 
long))ibmvfc_adisc_timeout;
add_timer(&tgt->timer);
}
 
@@ -3916,7 +3914,7 @@ static int ibmvfc_alloc_target(struct ibmvfc_host *vhost, 
u64 scsi_id)
tgt->vhost = vhost;
tgt->need_login = 1;
tgt->cancel_key = vhost->task_set++;
-   init_timer(&tgt->timer);
+   setup_timer(&tgt->timer, ibmvfc_adisc_timeout, (unsigned long)tgt);
kref_init(&tgt->kref);
ibmvfc_init_tgt(tgt, ibmvfc_tgt_implicit_logout);
spin_lock_irqsave(vhost->host->host_lock, flags);
diff --git a/drivers/scsi/ibmvscsi/ibmvscsi.c b/drivers/scsi/ibmvscsi/ibmvscsi.c
index da22b3665cb0..44ae85903a00 100644
--- a/drivers/scsi/ibmvscsi/ibmvscsi.c
+++ b/drivers/scsi/ibmvscsi/ibmvscsi.c
@@ -837,8 +837,9 @@ static void ibmvscsi_reset_host(struct ibmvscsi_host_data 
*hostdata)
  *
  * Called when an internally generated command times out
 */
-static void ibmvscsi_timeout(struct srp_event_struct *evt_struct)
+static void ibmvscsi_timeout(unsigned long data)
 {
+   struct srp_event_struct *evt_struct = (struct srp_event_struct *)data;
struct ibmvscsi_host_data *hostdata = evt_struct->hostdata;
 
dev_err(hostdata->dev, "Command timed out (%x). Resetting connection\n",
@

[PATCH 30/31] appletalk: Remove unneeded synchronization

2017-08-31 Thread Kees Cook
The use of del_timer_sync() will make sure a timer is not rescheduled.
As such, there is no need to add external signals to kill timers. In
preparation for switching the timer callback argument to the timer
pointer, this drops the .data argument since it doesn't serve a meaningful
purpose here.

Cc: David Howells 
Cc: netdev@vger.kernel.org
Signed-off-by: Kees Cook 
---
 drivers/net/appletalk/ltpc.c | 6 --
 1 file changed, 6 deletions(-)

diff --git a/drivers/net/appletalk/ltpc.c b/drivers/net/appletalk/ltpc.c
index e4aa374caa4d..cc3dc9337eae 100644
--- a/drivers/net/appletalk/ltpc.c
+++ b/drivers/net/appletalk/ltpc.c
@@ -880,14 +880,10 @@ static void ltpc_poll(unsigned long l)
}
ltpc_poll_counter--;
}
-  
-   if (!dev)
-   return;  /* we've been downed */
 
/* poll 20 times per second */
idle(dev);
ltpc_timer.expires = jiffies + HZ/20;
-   
add_timer(

[PATCH 31/31] timer: Switch to testing for .function instead of .data

2017-08-31 Thread Kees Cook
In several places, .data is checked for initialization to gate early
calls to del_timer_sync(). Checking for .function is equally valid, so
switch to this in all callers.

Cc: "Rafael J. Wysocki" 
Cc: Pavel Machek 
Cc: Len Brown 
Cc: Greg Kroah-Hartman 
Cc: Mike Marciniszyn 
Cc: Dennis Dalessandro 
Cc: Doug Ledford 
Cc: Sean Hefty 
Cc: Hal Rosenstock 
Cc: Dmitry Torokhov 
Cc: Jeff Kirsher 
Cc: linux...@vger.kernel.org
Cc: linux-r...@vger.kernel.org
Cc: linux-in...@vger.kernel.org
Cc: intel-wired-...@lists.osuosl.org
Cc: netdev@vger.kernel.org
Signed-off-by: Kees Cook 
---
 drivers/base/power/wakeup.c |  3 +--
 drivers/infiniband/hw/hfi1/chip.c   |  6 ++
 drivers/infiniband/hw/hfi1/init.c   |  2 +-
 drivers/infiniband/hw/qib/qib_iba7220.c |  2 +-
 drivers/infiniband/hw/qib/qib_iba7322.c |  2 +-
 drivers/infiniband/hw/qib/qib_init.c| 14 +-
 drivers/infiniband/hw/qib/qib_mad.c |  2 +-
 drivers/input/input.c   |  5 ++---
 drivers/net/ethernet/intel/i40e/i40e_main.c |  2 +-
 9 files changed, 15 insertions(+), 23 deletions(-)

diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
index 144e6d8fafc8..79a3c1b204af 100644
--- a/drivers/base/power/wakeup.c
+++ b/drivers/base/power/wakeup.c
@@ -479,8 +479,7 @@ static bool wakeup_source_not_registered(struct 
wakeup_source *ws)
 * Use timer struct to check if the given source is initialized
 * by wakeup_source_add.
 */
-   return ws->timer.function != pm_wakeup_timer_fn ||
-  ws->timer.data != (unsigned long)ws;
+   return ws->timer.function != pm_wakeup_timer_fn;
 }
 
 /*
diff --git a/drivers/infiniband/hw/hfi1/chip.c 
b/drivers/infiniband/hw/hfi1/chip.c
index 94b54850ec75..53a6596cd7d6 100644
--- a/drivers/infiniband/hw/hfi1/chip.c
+++ b/drivers/infiniband/hw/hfi1/chip.c
@@ -5513,9 +5513,8 @@ static int init_rcverr(struct hfi1_devdata *dd)
 
 static void free_rcverr(struct hfi1_devdata *dd)
 {
-   if (dd->rcverr_timer.data)
+   if (dd->rcverr_timer.function)
del_timer_sync(&dd->rcverr_timer);
-   dd->rcverr_timer.data = 0;
 }
 
 static void handle_rxe_err(struct hfi1_devdata *dd, u32 unused, u64 reg)
@@ -11992,9 +11991,8 @@ static void free_cntrs(struct hfi1_devdata *dd)
struct hfi1_pportdata *ppd;
int i;
 
-   if (dd->synth_stats_timer.data)
+   if (dd->synth_stats_timer.function)
del_timer_sync(&dd->synth_stats_timer);
-   dd->synth_stats_timer.data = 0;
ppd = (struct hfi1_pportdata *)(dd + 1);
for (i = 0; i < dd->num_pports; i++, ppd++) {
kfree(ppd->cntrs);
diff --git a/drivers/infiniband/hw/hfi1/init.c 
b/drivers/infiniband/hw/hfi1/init.c
index 4a11d4da4c92..bc2af709c111 100644
--- a/drivers/infiniband/hw/hfi1/init.c
+++ b/drivers/infiniband/hw/hfi1/init.c
@@ -839,7 +839,7 @@ static void stop_timers(struct hfi1_devdata *dd)
 
for (pidx = 0; pidx < dd->num_pports; ++pidx) {
ppd = dd->pport + pidx;
-   if (ppd->led_override_timer.data) {
+   if (ppd->led_override_timer.function) {
del_timer_sync(&ppd->led_override_timer);
atomic_set(&ppd->led_override_timer_active, 0);
}
diff --git a/drivers/infiniband/hw/qib/qib_iba7220.c 
b/drivers/infiniband/hw/qib/qib_iba7220.c
index b1d512c7ff4b..22fd65fe7193 100644
--- a/drivers/infiniband/hw/qib/qib_iba7220.c
+++ b/drivers/infiniband/hw/qib/qib_iba7220.c
@@ -1662,7 +1662,7 @@ static void qib_7220_quiet_serdes(struct qib_pportdata 
*ppd)
   dd->control | QLOGIC_IB_C_FREEZEMODE);
 
ppd->cpspec->chase_end = 0;
-   if (ppd->cpspec->chase_timer.data) /* if initted */
+   if (ppd->cpspec->chase_timer.function) /* if initted */
del_timer_sync(&ppd->cpspec->chase_timer);
 
if (ppd->cpspec->ibsymdelta || ppd->cpspec->iblnkerrdelta ||
diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c 
b/drivers/infiniband/hw/qib/qib_iba7322.c
index bb2439fff8fa..471aaf6bcbf2 100644
--- a/drivers/infiniband/hw/qib/qib_iba7322.c
+++ b/drivers/infiniband/hw/qib/qib_iba7322.c
@@ -2531,7 +2531,7 @@ static void qib_7322_mini_quiet_serdes(struct 
qib_pportdata *ppd)
cancel_delayed_work_sync(&ppd->cpspec->ipg_work);
 
ppd->cpspec->chase_end = 0;
-   if (ppd->cpspec->chase_timer.data) /* if initted */
+   if (ppd->cpspec->chase_timer.function) /* if initted */
del_timer_sync(&ppd->cpspec->chase_timer);
 
/*
diff --git a/drivers/infiniband/hw/qib/qib_init.c 
b/drivers/infiniband/hw/qib/qib_init.c
index 6c16ba1107ba..66fb0318660b 100644
--- a/drivers/infiniband/hw/qib/qib_init.c
+++ b/drivers/infiniband/hw/qib/qib_init.c
@@ -815,23 +815,19 @@ static void qib_stop_timers(struct qib_devdata *dd)
struct qib_pportdata *ppd;
int pidx;
 
-   if (dd->stats_timer.dat

[PATCH 13/31] timer: Remove meaningless .data/.function assignments

2017-08-31 Thread Kees Cook
Several timer users needlessly reset their .function/.data fields during
their timer callback, but nothing else changes them. Some users do not
use their .data field at all. Each instance is removed here.

Cc: Krzysztof Halasa 
Cc: Aditya Shankar 
Cc: Ganesh Krishna 
Cc: Greg Kroah-Hartman 
Cc: Jens Axboe 
Cc: netdev@vger.kernel.org
Cc: linux-wirel...@vger.kernel.org
Cc: de...@driverdev.osuosl.org
Signed-off-by: Kees Cook 
---
 drivers/block/amiflop.c   | 3 +--
 drivers/net/wan/hdlc_cisco.c  | 2 --
 drivers/net/wan/hdlc_fr.c | 2 --
 drivers/staging/wilc1000/wilc_wfi_cfgoperations.c | 4 +---
 4 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index c4b1cba27178..6680d75bc857 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -323,7 +323,7 @@ static void fd_deselect (int drive)
 
 }
 
-static void motor_on_callback(unsigned long nr)
+static void motor_on_callback(unsigned long ignored)
 {
if (!(ciaa.pra & DSKRDY) || --on_attempts == 0) {
complete_all(&motor_on_completion);
@@ -344,7 +344,6 @@ static int fd_motor_on(int nr)
fd_select(nr);
 
reinit_completion(&motor_on_completion);
-   motor_on_timer.data = nr;
mod_timer(&motor_on_timer, jiffies + HZ/2);
 
on_attempts = 10;
diff --git a/drivers/net/wan/hdlc_cisco.c b/drivers/net/wan/hdlc_cisco.c
index c696d42f4502..6c98d85f2773 100644
--- a/drivers/net/wan/hdlc_cisco.c
+++ b/drivers/net/wan/hdlc_cisco.c
@@ -276,8 +276,6 @@ static void cisco_timer(unsigned long arg)
spin_unlock(&st->lock);
 
st->timer.expires = jiffies + st->settings.interval * HZ;
-   st->timer.function = cisco_timer;
-   st->timer.data = arg;
add_timer(&st->timer);
 }
 
diff --git a/drivers/net/wan/hdlc_fr.c b/drivers/net/wan/hdlc_fr.c
index de42faca076a..7da2424c28a4 100644
--- a/drivers/net/wan/hdlc_fr.c
+++ b/drivers/net/wan/hdlc_fr.c
@@ -644,8 +644,6 @@ static void fr_timer(unsigned long arg)
state(hdlc)->settings.t391 * HZ;
}
 
-   state(hdlc)->timer.function = fr_timer;
-   state(hdlc)->timer.data = arg;
add_timer(&state(hdlc)->timer);
 }
 
diff --git a/drivers/staging/wilc1000/wilc_wfi_cfgoperations.c 
b/drivers/staging/wilc1000/wilc_wfi_cfgoperations.c
index 68fd5b3b8b2d..2fca2b017093 100644
--- a/drivers/staging/wilc1000/wilc_wfi_cfgoperations.c
+++ b/drivers/staging/wilc1000/wilc_wfi_cfgoperations.c
@@ -275,7 +275,7 @@ static void update_scan_time(void)
last_scanned_shadow[i].time_scan = jiffies;
 }
 
-static void remove_network_from_shadow(unsigned long arg)
+static void remove_network_from_shadow(unsigned long unused)
 {
unsigned long now = jiffies;
int i, j;
@@ -296,7 +296,6 @@ static void remove_network_from_shadow(unsigned long arg)
}
 
if (last_scanned_cnt != 0) {
-   hAgingTimer.data = arg;
mod_timer(&hAgingTimer, jiffies + msecs_to_jiffies(AGING_TIME));
}
 }
@@ -313,7 +312,6 @@ static int is_network_in_shadow(struct network_info 
*pstrNetworkInfo,
int i;
 
if (last_scanned_cnt == 0) {
-   hAgingTimer.data = (unsigned long)user_void;
mod_timer(&hAgingTimer, jiffies + msecs_to_jiffies(AGING_TIME));
state = -1;
} else {
-- 
2.7.4



[RFC PATCH] net: Introduce a socket option to enable picking tx queue based on rx queue.

2017-08-31 Thread Sridhar Samudrala
This patch introduces a new socket option SO_SYMMETRIC_QUEUES that can be used
to enable symmetric tx and rx queues on a socket.

This option is specifically useful for epoll based multi-threaded workloads
where each thread handles packets received on a single RX queue. In this model,
we have noticed that it helps to send the packets on the same TX queue
corresponding to the queue-pair associated with the RX queue specifically when
busy poll is enabled with epoll().

Two new fields are added to struct sock_common to cache the last rx ifindex and
the rx queue in the receive path of an SKB. __netdev_pick_tx() returns the
cached rx queue when this option is enabled and the TX is happening on the
same device.

Signed-off-by: Sridhar Samudrala 
---
 include/net/request_sock.h|  1 +
 include/net/sock.h| 17 +
 include/uapi/asm-generic/socket.h |  2 ++
 net/core/dev.c|  8 +++-
 net/core/sock.c   | 10 ++
 net/ipv4/tcp_input.c  |  1 +
 net/ipv4/tcp_ipv4.c   |  1 +
 net/ipv4/tcp_minisocks.c  |  1 +
 8 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 23e2205..c3bc12e 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -100,6 +100,7 @@ static inline struct sock *req_to_sk(struct request_sock 
*req)
req_to_sk(req)->sk_prot = sk_listener->sk_prot;
sk_node_init(&req_to_sk(req)->sk_node);
sk_tx_queue_clear(req_to_sk(req));
+   req_to_sk(req)->sk_symmetric_queues = sk_listener->sk_symmetric_queues;
req->saved_syn = NULL;
refcount_set(&req->rsk_refcnt, 0);
 
diff --git a/include/net/sock.h b/include/net/sock.h
index 03a3625..3421809 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -138,11 +138,14 @@ void SOCK_DEBUG(const struct sock *sk, const char *msg, 
...)
  * @skc_node: main hash linkage for various protocol lookup tables
  * @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
  * @skc_tx_queue_mapping: tx queue number for this connection
+ * @skc_rx_queue_mapping: rx queue number for this connection
+ * @skc_rx_ifindex: rx ifindex for this connection
  * @skc_flags: place holder for sk_flags
  * %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
  * %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
  * @skc_incoming_cpu: record/match cpu processing incoming packets
  * @skc_refcnt: reference count
+ * @skc_symmetric_queues: symmetric tx/rx queues
  *
  * This is the minimal network layer representation of sockets, the header
  * for struct sock and struct inet_timewait_sock.
@@ -177,6 +180,7 @@ struct sock_common {
unsigned char   skc_reuseport:1;
unsigned char   skc_ipv6only:1;
unsigned char   skc_net_refcnt:1;
+   unsigned char   skc_symmetric_queues:1;
int skc_bound_dev_if;
union {
struct hlist_node   skc_bind_node;
@@ -214,6 +218,8 @@ struct sock_common {
struct hlist_nulls_node skc_nulls_node;
};
int skc_tx_queue_mapping;
+   int skc_rx_queue_mapping;
+   int skc_rx_ifindex;
union {
int skc_incoming_cpu;
u32 skc_rcv_wnd;
@@ -324,6 +330,8 @@ struct sock {
 #define sk_nulls_node  __sk_common.skc_nulls_node
 #define sk_refcnt  __sk_common.skc_refcnt
 #define sk_tx_queue_mapping__sk_common.skc_tx_queue_mapping
+#define sk_rx_queue_mapping__sk_common.skc_rx_queue_mapping
+#define sk_rx_ifindex  __sk_common.skc_rx_ifindex
 
 #define sk_dontcopy_begin  __sk_common.skc_dontcopy_begin
 #define sk_dontcopy_end__sk_common.skc_dontcopy_end
@@ -340,6 +348,7 @@ struct sock {
 #define sk_reuseport   __sk_common.skc_reuseport
 #define sk_ipv6only__sk_common.skc_ipv6only
 #define sk_net_refcnt  __sk_common.skc_net_refcnt
+#define sk_symmetric_queues__sk_common.skc_symmetric_queues
 #define sk_bound_dev_if__sk_common.skc_bound_dev_if
 #define sk_bind_node   __sk_common.skc_bind_node
 #define sk_prot__sk_common.skc_prot
@@ -1676,6 +1685,14 @@ static inline int sk_tx_queue_get(const struct sock *sk)
return sk ? sk->sk_tx_queue_mapping : -1;
 }
 
+static inline void sk_mark_rx_queue(struct sock *sk, struct sk_buff *skb)
+{
+   if (sk->sk_symmetric_queues) {
+   sk->sk_rx_ifindex = skb->skb_iif;
+   sk->sk_rx_queue_mapping = skb_get_rx_queue(skb);
+   }
+}
+
 static inline void sk_set_socket(struct sock *sk, struct socket *sock)
 {
sk_tx_queue_clear(sk);
diff --git a/include/uapi/asm-generic/socket.h 
b/include/uapi/asm-generic/so
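
A hedged sketch of how an application would opt in; the uapi hunk defining
the option's value is cut off above, so the numeric placeholder below is
hypothetical and only the name comes from the patch description:

#include <stdio.h>
#include <sys/socket.h>

#ifndef SO_SYMMETRIC_QUEUES
#define SO_SYMMETRIC_QUEUES 60  /* placeholder, not the real value */
#endif

int enable_symmetric_queues(int fd)
{
        int one = 1;

        /* request symmetric tx/rx queue selection on this socket */
        if (setsockopt(fd, SOL_SOCKET, SO_SYMMETRIC_QUEUES,
                       &one, sizeof(one)) < 0) {
                perror("setsockopt(SO_SYMMETRIC_QUEUES)");
                return -1;
        }
        return 0;
}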

Re: [PATCH net-next v5 2/2] tcp_diag: report TCP MD5 signing keys and addresses

2017-08-31 Thread Sabrina Dubroca
2017-08-31, 09:59:39 -0700, Ivan Delalande wrote:
> diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
> index a748c74aa8b7..abbf0edcf6c2 100644
> --- a/net/ipv4/tcp_diag.c
> +++ b/net/ipv4/tcp_diag.c
[...]
> +static int tcp_diag_get_aux(struct sock *sk, bool net_admin,
> + struct sk_buff *skb)
> +{
> +#ifdef CONFIG_TCP_MD5SIG
> + if (net_admin) {

In tcp_diag_get_aux_size() you put a check for sk_fullsock. I don't
see anything preventing you from reaching this with a !fullsock?


> + struct tcp_md5sig_info *md5sig;
> + int err = 0;
> +
> + rcu_read_lock();
> + md5sig = rcu_dereference(tcp_sk(sk)->md5sig_info);
> + if (md5sig)
> + err = tcp_diag_put_md5sig(skb, md5sig);
> + rcu_read_unlock();
> + if (err < 0)
> + return err;
> + }
> +#endif
> +
> + return 0;
> +}
> +
> +static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin)
> +{
> + size_t size = 0;
> +
> +#ifdef CONFIG_TCP_MD5SIG
> + if (net_admin && sk_fullsock(sk)) {
> + const struct tcp_md5sig_info *md5sig;
> + const struct tcp_md5sig_key *key;
> + size_t md5sig_count = 0;
> +
> + rcu_read_lock();
> + md5sig = rcu_dereference(tcp_sk(sk)->md5sig_info);
> + if (md5sig) {
> + hlist_for_each_entry_rcu(key, &md5sig->head, node)
> + md5sig_count++;
> + }
> + rcu_read_unlock();
> + size += nla_total_size(md5sig_count *
> +sizeof(struct tcp_diag_md5sig));
> + }
> +#endif
> +
> + return size;
> +}
> +
>  static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
> const struct inet_diag_req_v2 *r, struct nlattr *bc)
>  {

-- 
Sabrina


[PATCH net-next 0/2] netvsc: transparent VF related cleanups

2017-08-31 Thread Stephen Hemminger
The first gets rid of unnecessary ref counting, and the second
allows removing the hv_netvsc driver even if a VF is present.

Stephen Hemminger (2):
  netvsc: cleanup datapath switch
  netvsc: allow driver to be removed even if VF is present

 drivers/net/hyperv/netvsc_drv.c | 55 -
 1 file changed, 16 insertions(+), 39 deletions(-)

-- 
2.11.0



[PATCH net-next 2/2] netvsc: allow driver to be removed even if VF is present

2017-08-31 Thread Stephen Hemminger
If a VF is attached, the netvsc driver module can still be
removed. We just have to make sure to do the cleanup.

Also, avoid extra rtnl round trip when calling unregister.

Signed-off-by: Stephen Hemminger 
---
 drivers/net/hyperv/netvsc_drv.c | 17 +
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 128165e49c9a..97ed4bdc439f 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -1834,9 +1834,6 @@ static int netvsc_register_vf(struct net_device 
*vf_netdev)
 
netdev_info(ndev, "VF registering: %s\n", vf_netdev->name);
 
-   /* Prevent this module from being unloaded while VF is registered */
-   try_module_get(THIS_MODULE);
-
dev_hold(vf_netdev);
rcu_assign_pointer(net_device_ctx->vf_netdev, vf_netdev);
return NOTIFY_OK;
@@ -1880,10 +1877,11 @@ static int netvsc_unregister_vf(struct net_device 
*vf_netdev)
 
netdev_info(ndev, "VF unregistering: %s\n", vf_netdev->name);
 
+   netdev_rx_handler_unregister(vf_netdev);
netdev_upper_dev_unlink(vf_netdev, ndev);
RCU_INIT_POINTER(net_device_ctx->vf_netdev, NULL);
dev_put(vf_netdev);
-   module_put(THIS_MODULE);
+
return NOTIFY_OK;
 }
 
@@ -1987,11 +1985,11 @@ static int netvsc_probe(struct hv_device *dev,
 
 static int netvsc_remove(struct hv_device *dev)
 {
-   struct net_device *net;
struct net_device_context *ndev_ctx;
+   struct net_device *vf_netdev;
+   struct net_device *net;
 
net = hv_get_drvdata(dev);
-
if (net == NULL) {
dev_err(&dev->device, "No net device to remove\n");
return 0;
@@ -2008,12 +2006,15 @@ static int netvsc_remove(struct hv_device *dev)
 * removed. Also blocks mtu and channel changes.
 */
rtnl_lock();
+   vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev);
+   if (vf_netdev)
+   netvsc_unregister_vf(vf_netdev);
+
rndis_filter_device_remove(dev,
   rtnl_dereference(ndev_ctx->nvdev));
+   unregister_netdevice(net);
rtnl_unlock();
 
-   unregister_netdev(net);
-
hv_set_drvdata(dev, NULL);
 
free_percpu(ndev_ctx->vf_stats);
-- 
2.11.0



[PATCH net-next 1/2] netvsc: cleanup datapath switch

2017-08-31 Thread Stephen Hemminger
Use one routine for datapath up/down. Don't need to reopen
the rndis layer.

Signed-off-by: Stephen Hemminger 
---
 drivers/net/hyperv/netvsc_drv.c | 38 +++---
 1 file changed, 7 insertions(+), 31 deletions(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index fac44c5c8d0d..128165e49c9a 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -1842,11 +1842,13 @@ static int netvsc_register_vf(struct net_device 
*vf_netdev)
return NOTIFY_OK;
 }
 
-static int netvsc_vf_up(struct net_device *vf_netdev)
+/* VF up/down change detected, schedule to change data path */
+static int netvsc_vf_changed(struct net_device *vf_netdev)
 {
struct net_device_context *net_device_ctx;
struct netvsc_device *netvsc_dev;
struct net_device *ndev;
+   bool vf_is_up = netif_running(vf_netdev);
 
ndev = get_netvsc_byref(vf_netdev);
if (!ndev)
@@ -1857,34 +1859,9 @@ static int netvsc_vf_up(struct net_device *vf_netdev)
if (!netvsc_dev)
return NOTIFY_DONE;
 
-   /* Bump refcount when datapath is acvive - Why? */
-   rndis_filter_open(netvsc_dev);
-
-   /* notify the host to switch the data path. */
-   netvsc_switch_datapath(ndev, true);
-   netdev_info(ndev, "Data path switched to VF: %s\n", vf_netdev->name);
-
-   return NOTIFY_OK;
-}
-
-static int netvsc_vf_down(struct net_device *vf_netdev)
-{
-   struct net_device_context *net_device_ctx;
-   struct netvsc_device *netvsc_dev;
-   struct net_device *ndev;
-
-   ndev = get_netvsc_byref(vf_netdev);
-   if (!ndev)
-   return NOTIFY_DONE;
-
-   net_device_ctx = netdev_priv(ndev);
-   netvsc_dev = rtnl_dereference(net_device_ctx->nvdev);
-   if (!netvsc_dev)
-   return NOTIFY_DONE;
-
-   netvsc_switch_datapath(ndev, false);
-   netdev_info(ndev, "Data path switched from VF: %s\n", vf_netdev->name);
-   rndis_filter_close(netvsc_dev);
+   netvsc_switch_datapath(ndev, vf_is_up);
+   netdev_info(ndev, "Data path switched %s VF: %s\n",
+   vf_is_up ? "to" : "from", vf_netdev->name);
 
return NOTIFY_OK;
 }
@@ -2094,9 +2071,8 @@ static int netvsc_netdev_event(struct notifier_block 
*this,
case NETDEV_UNREGISTER:
return netvsc_unregister_vf(event_dev);
case NETDEV_UP:
-   return netvsc_vf_up(event_dev);
case NETDEV_DOWN:
-   return netvsc_vf_down(event_dev);
+   return netvsc_vf_changed(event_dev);
default:
return NOTIFY_DONE;
}
-- 
2.11.0



Re: [PATCH net-next 2/2] net: convert (struct ubuf_info)->refcnt to refcount_t

2017-08-31 Thread Eric Dumazet
On Thu, 2017-08-31 at 18:45 -0400, Willem de Bruijn wrote:
> On Thu, Aug 31, 2017 at 4:30 PM, Eric Dumazet  wrote:
> > refcount_t type and corresponding API should be
> > used instead of atomic_t when the variable is used as
> > a reference counter. This allows to avoid accidental
> > refcounter overflows that might lead to use-after-free
> > situations.
> >
> > Signed-off-by: Eric Dumazet 
> > ---
> >  include/linux/skbuff.h | 5 +++--
> >  net/core/skbuff.c  | 8 
> >  2 files changed, 7 insertions(+), 6 deletions(-)
> >
> > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> > index 
> > 7594e19bce622a38dc39c054093c3da15b99b67b..316a92b45351f53709886ee0099cbc83b66f1b15
> >  100644
> > --- a/include/linux/skbuff.h
> > +++ b/include/linux/skbuff.h
> > @@ -22,6 +22,7 @@
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> >
> >  #include 
> >  #include 
> > @@ -456,7 +457,7 @@ struct ubuf_info {
> > u32 bytelen;
> > };
> > };
> > -   atomic_t refcnt;
> > +   refcount_t refcnt;
> >
> > struct mmpin {
> > struct user_struct *user;
> > @@ -472,7 +473,7 @@ struct ubuf_info *sock_zerocopy_realloc(struct sock 
> > *sk, size_t size,
> >
> >  static inline void sock_zerocopy_get(struct ubuf_info *uarg)
> >  {
> > -   atomic_inc(&uarg->refcnt);
> > +   refcount_inc(&uarg->refcnt);
> >  }
> >
> >  void sock_zerocopy_put(struct ubuf_info *uarg);
> > diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> > index 
> > 65b9ca3945f8fd2b1bef4aef5dd774be04e5d128..ed86ca9afd9d8d1ac47983acf6006c179285a612
> >  100644
> > --- a/net/core/skbuff.c
> > +++ b/net/core/skbuff.c
> > @@ -963,7 +963,7 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, 
> > size_t size)
> > uarg->len = 1;
> > uarg->bytelen = size;
> > uarg->zerocopy = 1;
> > -   atomic_set(&uarg->refcnt, 1);
> > +   refcount_set(&uarg->refcnt, 1);
> > sock_hold(sk);
> >
> > return uarg;
> > @@ -1086,7 +1086,7 @@ EXPORT_SYMBOL_GPL(sock_zerocopy_callback);
> >
> >  void sock_zerocopy_put(struct ubuf_info *uarg)
> >  {
> > -   if (uarg && atomic_dec_and_test(&uarg->refcnt)) {
> > +   if (uarg && refcount_dec_and_test(&uarg->refcnt)) {
> > if (uarg->callback)
> > uarg->callback(uarg, uarg->zerocopy);
> > else
> > @@ -1108,7 +1108,7 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg)
> >  * avoid an skb send inside the main loop triggering uarg 
> > free.
> >  */
> > if (sk->sk_type != SOCK_STREAM)
> > -   atomic_inc(&uarg->refcnt);
> > +   refcount_inc(&uarg->refcnt);
> 
> This would be a 0 -> 1 transition. It is taken only on the error path from
> a socket that does not take an explicit hold at the start of sendmsg.
> 

Arg.

> The code is vestigial, really, as the final patchset only includes tcp.
> It is safe to leave as is for now, or I can remove it.
> 

OK I probably should remove this chunk in patch 1/2 then.

> In 1f8b977ab32d ("sock: enable MSG_ZEROCOPY") I also updated
> other users of ubuf_info to initialize refcnt. We probably also want to
> convert the call in handle_tx in drivers/vhost/net.c.

Thanks ! :)




Re: [PATCH net-next 2/2] net: convert (struct ubuf_info)->refcnt to refcount_t

2017-08-31 Thread Willem de Bruijn
On Thu, Aug 31, 2017 at 4:30 PM, Eric Dumazet  wrote:
> refcount_t type and corresponding API should be
> used instead of atomic_t when the variable is used as
> a reference counter. This allows to avoid accidental
> refcounter overflows that might lead to use-after-free
> situations.
>
> Signed-off-by: Eric Dumazet 
> ---
>  include/linux/skbuff.h | 5 +++--
>  net/core/skbuff.c  | 8 
>  2 files changed, 7 insertions(+), 6 deletions(-)
>
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 
> 7594e19bce622a38dc39c054093c3da15b99b67b..316a92b45351f53709886ee0099cbc83b66f1b15
>  100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -22,6 +22,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>
>  #include 
>  #include 
> @@ -456,7 +457,7 @@ struct ubuf_info {
> u32 bytelen;
> };
> };
> -   atomic_t refcnt;
> +   refcount_t refcnt;
>
> struct mmpin {
> struct user_struct *user;
> @@ -472,7 +473,7 @@ struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, 
> size_t size,
>
>  static inline void sock_zerocopy_get(struct ubuf_info *uarg)
>  {
> -   atomic_inc(&uarg->refcnt);
> +   refcount_inc(&uarg->refcnt);
>  }
>
>  void sock_zerocopy_put(struct ubuf_info *uarg);
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index 
> 65b9ca3945f8fd2b1bef4aef5dd774be04e5d128..ed86ca9afd9d8d1ac47983acf6006c179285a612
>  100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -963,7 +963,7 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, 
> size_t size)
> uarg->len = 1;
> uarg->bytelen = size;
> uarg->zerocopy = 1;
> -   atomic_set(&uarg->refcnt, 1);
> +   refcount_set(&uarg->refcnt, 1);
> sock_hold(sk);
>
> return uarg;
> @@ -1086,7 +1086,7 @@ EXPORT_SYMBOL_GPL(sock_zerocopy_callback);
>
>  void sock_zerocopy_put(struct ubuf_info *uarg)
>  {
> -   if (uarg && atomic_dec_and_test(&uarg->refcnt)) {
> +   if (uarg && refcount_dec_and_test(&uarg->refcnt)) {
> if (uarg->callback)
> uarg->callback(uarg, uarg->zerocopy);
> else
> @@ -1108,7 +1108,7 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg)
>  * avoid an skb send inside the main loop triggering uarg 
> free.
>  */
> if (sk->sk_type != SOCK_STREAM)
> -   atomic_inc(&uarg->refcnt);
> +   refcount_inc(&uarg->refcnt);

This would be a 0 -> 1 transition. It is taken only on the error path from
a socket that does not take an explicit hold at the start of sendmsg.

The code is vestigial, really, as the final patchset only includes tcp.
It is safe to leave as is for now, or I can remove it.

In 1f8b977ab32d ("sock: enable MSG_ZEROCOPY") I also updated
other users of ubuf_info to initialize refcnt. We probably also want to
convert the call in handle_tx in drivers/vhost/net.c.
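
A minimal sketch of the refcount_t semantics at play here, assuming the
usual refcount API behavior: refcount_inc() refuses a 0 -> 1 transition
(it WARNs and saturates), since incrementing from zero looks like a
possible use-after-free. That is why the "prepare" patch initializes the
count to 1.

#include <linux/refcount.h>

static void refcount_zero_inc_demo(void)
{
        refcount_t r;

        refcount_set(&r, 0);
        refcount_inc(&r);       /* WARNs and saturates: inc on 0 refused */

        refcount_set(&r, 1);    /* init to 1 instead, as patch 1/2 does */
        refcount_inc(&r);       /* fine: 1 -> 2 */
}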


Re: [PATCH net-next] bridge: add tracepoint in br_fdb_update

2017-08-31 Thread Stephen Hemminger
On Thu, 31 Aug 2017 23:50:26 +0200
Jesper Dangaard Brouer  wrote:

> On Thu, 31 Aug 2017 11:43:25 -0700 (PDT)
> David Miller  wrote:
> 
> > From: Roopa Prabhu 
> > Date: Wed, 30 Aug 2017 22:18:13 -0700
> >   
> > > From: Roopa Prabhu 
> > > 
> > > This extends bridge fdb table tracepoints to also cover
> > > learned fdb entries in the br_fdb_update path. Note that
> > > unlike other tracepoints I have moved this to when the fdb
> > > is modified because this is in the datapath and can generate
> > > a lot of noise in the trace output. br_fdb_update is also called
> > > from added_by_user context in the NTF_USE case which is already
> > > traced ..hence the !added_by_user check.
> > > 
> > > Signed-off-by: Roopa Prabhu 
> > 
> > Applied.
> > 
> > Let's use dev->name for now and if the tooling can eventually
> > do transparent ifindex->name then we can consider redoing
> > a bunch of networking tracepoints.  
> 
> I agree! :-)
> 

Agreed, but it is yet another case of tracepoints not having
a stable ABI. There is no ABI guarantee on tracing, but it still
makes for user complaints.


Re: [PATCH 2/3] security: bpf: Add eBPF LSM hooks and security field to eBPF map

2017-08-31 Thread Daniel Borkmann

On 08/31/2017 10:56 PM, Chenbo Feng wrote:

From: Chenbo Feng 

Introduce a pointer into struct bpf_map to hold the security information
about the map. The actual security struct varies based on the security
models implemented. Place the LSM hooks before each of the unrestricted
eBPF operations, the map_update_elem and map_delete_elem operations are
checked by security_map_modify. The map_lookup_elem and map_get_next_key
operations are checked by security_map_read.

Signed-off-by: Chenbo Feng 


Against which tree is this by the way, net-next? There are
changes here which require a rebase of your set.


---
  include/linux/bpf.h  |  3 +++
  kernel/bpf/syscall.c | 28 
  2 files changed, 31 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b69e7a5869ff..ca3e6ff7091d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -53,6 +53,9 @@ struct bpf_map {
struct work_struct work;
atomic_t usercnt;
struct bpf_map *inner_map_meta;
+#ifdef CONFIG_SECURITY
+   void *security;
+#endif
  };

  /* function argument constraints */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 045646da97cc..b15580bcf3b1 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -279,6 +279,10 @@ static int map_create(union bpf_attr *attr)
if (err)
return -EINVAL;

+   err = security_map_create();


Seems a bit limited to me, don't you want to be able to
also differentiate between different map types? Same goes
for security_prog_load() wrt prog types, no?


+   if (err)
+   return -EACCES;
+
/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
map = find_and_alloc_map(attr);
if (IS_ERR(map))
@@ -291,6 +295,10 @@ static int map_create(union bpf_attr *attr)
if (err)
goto free_map_nouncharge;

+   err = security_post_create(map);
+   if (err < 0)
+   goto free_map;
+


So the hook you implement in patch 3/3 does:

+static int selinux_bpf_post_create(struct bpf_map *map)
+{
+   struct bpf_security_struct *bpfsec;
+
+   bpfsec = kzalloc(sizeof(*bpfsec), GFP_KERNEL);
+   if (!bpfsec)
+   return -ENOMEM;
+
+   bpfsec->sid = current_sid();
+   map->security = bpfsec;
+
+   return 0;
+}

Where do you kfree() bpfsec when the map gets released
normally or in error path?
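
A hedged sketch of the missing counterpart being asked about; the hook
name is hypothetical, the point being that the blob allocated in the
post-create hook needs a matching free when the map is released:

static void selinux_bpf_map_free(struct bpf_map *map)
{
        struct bpf_security_struct *bpfsec = map->security;

        map->security = NULL;
        kfree(bpfsec);
}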


err = bpf_map_alloc_id(map);
if (err)
goto free_map;
@@ -410,6 +418,10 @@ static int map_lookup_elem(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);

+   err = security_map_read(map);
+   if (err)
+   return -EACCES;


How about actually dropping ref?
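
That is, a sketch of the error path being suggested, assuming the existing
err_put label in these syscall handlers; returning -EACCES directly would
leak the fd reference taken by __bpf_map_get():

+       err = security_map_read(map);
+       if (err) {
+               err = -EACCES;
+               goto err_put;   /* drops the fdget() reference */
+       }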


+
key = memdup_user(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
@@ -490,6 +502,10 @@ static int map_update_elem(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);

+   err = security_map_modify(map);
+   if (err)
+   return -EACCES;


Ditto ...


key = memdup_user(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
@@ -573,6 +589,10 @@ static int map_delete_elem(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);

+   err = security_map_modify(map);
+   if (err)
+   return -EACCES;


Ditto ...


key = memdup_user(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
@@ -616,6 +636,10 @@ static int map_get_next_key(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);

+   err = security_map_read(map);
+   if (err)
+   return -EACCES;


And once again here ... :(


if (ukey) {
key = memdup_user(ukey, map->key_size);
if (IS_ERR(key)) {
@@ -935,6 +959,10 @@ static int bpf_prog_load(union bpf_attr *attr)
if (CHECK_ATTR(BPF_PROG_LOAD))
return -EINVAL;

+   err = security_prog_load();
+   if (err)
+   return -EACCES;
+
if (attr->prog_flags & ~BPF_F_STRICT_ALIGNMENT)
return -EINVAL;






Re: [PATCH net-next 1/2] net: prepare (struct ubuf_info)->refcnt conversion

2017-08-31 Thread Willem de Bruijn
On Thu, Aug 31, 2017 at 4:30 PM, Eric Dumazet  wrote:
> In order to convert this atomic_t refcnt to refcount_t,
> we need to init the refcount to one to not trigger
> a 0 -> 1 transition.
>
> This also removes one atomic operation in fast path.
>
> Signed-off-by: Eric Dumazet 

Acked-by: Willem de Bruijn 

Thanks, Eric.


[PATCH net-next 1/2] flow_dissector: Cleanup control flow

2017-08-31 Thread Tom Herbert
__skb_flow_dissect is riddled with gotos that make discerning the flow,
debugging, and extending the capability difficult. This patch
reorganizes things so that we only perform goto's after the two main
switch statements (no gotos within the cases now). It also eliminates
several goto labels so that there are only two labels that can be the
target of a goto.

Reported-by: Alexander Popov 
Signed-off-by: Tom Herbert 
---
 include/net/flow_dissector.h |   9 ++
 net/core/flow_dissector.c| 225 ---
 2 files changed, 156 insertions(+), 78 deletions(-)

diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h
index e2663e900b0a..c358c3ff6acc 100644
--- a/include/net/flow_dissector.h
+++ b/include/net/flow_dissector.h
@@ -19,6 +19,15 @@ struct flow_dissector_key_control {
 #define FLOW_DIS_FIRST_FRAGBIT(1)
 #define FLOW_DIS_ENCAPSULATION BIT(2)
 
+enum flow_dissect_ret {
+   FLOW_DISSECT_RET_OUT_GOOD,
+   FLOW_DISSECT_RET_OUT_BAD,
+   FLOW_DISSECT_RET_PROTO_AGAIN,
+   FLOW_DISSECT_RET_IPPROTO_AGAIN,
+   FLOW_DISSECT_RET_IPPROTO_AGAIN_EH,
+   FLOW_DISSECT_RET_CONTINUE,
+};
+
 /**
  * struct flow_dissector_key_basic:
  * @thoff: Transport header offset
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index e2eaa1ff948d..5110180a3e96 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -115,12 +115,6 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int 
thoff, u8 ip_proto,
 }
 EXPORT_SYMBOL(__skb_flow_get_ports);
 
-enum flow_dissect_ret {
-   FLOW_DISSECT_RET_OUT_GOOD,
-   FLOW_DISSECT_RET_OUT_BAD,
-   FLOW_DISSECT_RET_OUT_PROTO_AGAIN,
-};
-
 static enum flow_dissect_ret
 __skb_flow_dissect_mpls(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
@@ -341,7 +335,7 @@ __skb_flow_dissect_gre(const struct sk_buff *skb,
if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP)
return FLOW_DISSECT_RET_OUT_GOOD;
 
-   return FLOW_DISSECT_RET_OUT_PROTO_AGAIN;
+   return FLOW_DISSECT_RET_PROTO_AGAIN;
 }
 
 static void
@@ -431,6 +425,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
struct flow_dissector_key_icmp *key_icmp;
struct flow_dissector_key_tags *key_tags;
struct flow_dissector_key_vlan *key_vlan;
+   enum flow_dissect_ret fdret;
bool skip_vlan = false;
u8 ip_proto = 0;
bool ret;
@@ -482,14 +477,19 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
}
 
 proto_again:
+   fdret = FLOW_DISSECT_RET_CONTINUE;
+
switch (proto) {
case htons(ETH_P_IP): {
const struct iphdr *iph;
struct iphdr _iph;
-ip:
+
iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, 
hlen, &_iph);
-   if (!iph || iph->ihl < 5)
-   goto out_bad;
+   if (!iph || iph->ihl < 5) {
+   fdret = FLOW_DISSECT_RET_OUT_BAD;
+   break;
+   }
+
nhoff += iph->ihl * 4;
 
ip_proto = iph->protocol;
@@ -509,19 +509,25 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
key_control->flags |= FLOW_DIS_IS_FRAGMENT;
 
if (iph->frag_off & htons(IP_OFFSET)) {
-   goto out_good;
+   fdret = FLOW_DISSECT_RET_OUT_GOOD;
+   break;
} else {
key_control->flags |= FLOW_DIS_FIRST_FRAG;
-   if (!(flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG))
-   goto out_good;
+   if (!(flags &
+ FLOW_DISSECTOR_F_PARSE_1ST_FRAG)) {
+   fdret = FLOW_DISSECT_RET_OUT_GOOD;
+   break;
+   }
}
}
 
__skb_flow_dissect_ipv4(skb, flow_dissector,
target_container, data, iph);
 
-   if (flags & FLOW_DISSECTOR_F_STOP_AT_L3)
-   goto out_good;
+   if (flags & FLOW_DISSECTOR_F_STOP_AT_L3) {
+   fdret = FLOW_DISSECT_RET_OUT_GOOD;
+   break;
+   }
 
break;
}
@@ -529,10 +535,11 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
const struct ipv6hdr *iph;
struct ipv6hdr _iph;
 
-ipv6:
iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, 
hlen, &_iph);
-   if (!iph)
-   goto out_bad;
+   if (!iph) {
+   fdret = FLOW_DISSECT_RET_OUT_BAD;
+   break;
+   }
 
ip_proto = iph->nex

[PATCH net-next 2/2] flow_dissector: Add limits for encapsulation and EH

2017-08-31 Thread Tom Herbert
In the flow dissector there are no limits on the number of nested
encapsulations that might be dissected, which makes for a nice DOS
attack. This patch adds limits on dissecting nested encapsulations
as well as on dissecting over extension headers.

Reported-by: Hannes Frederic Sowa 
Signed-off-by: Tom Herbert 
---
 net/core/flow_dissector.c | 48 ---
 1 file changed, 45 insertions(+), 3 deletions(-)

diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 5110180a3e96..1bca748de27d 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -396,6 +396,35 @@ __skb_flow_dissect_ipv6(const struct sk_buff *skb,
key_ip->ttl = iph->hop_limit;
 }
 
+/* Maximum number of nested encapsulations that can be processed in
+ * __skb_flow_dissect
+ */
+#define MAX_FLOW_DISSECT_ENCAPS5
+
+static bool skb_flow_dissect_encap_allowed(int *num_encaps, unsigned int 
*flags)
+{
+   ++*num_encaps;
+
+   if (*num_encaps >= MAX_FLOW_DISSECT_ENCAPS) {
+   if (*num_encaps == MAX_FLOW_DISSECT_ENCAPS) {
+   /* Allow one more pass but disregard
+* further encapsulations
+*/
+   *flags |= FLOW_DISSECTOR_F_STOP_AT_ENCAP;
+   } else {
+   /* Max encaps reached */
+   return  false;
+   }
+   }
+
+   return true;
+}
+
+/* Maximum number of extension headers that can be processed in __skb_flow_dissect
+ * per IPv6 packet
+ */
+#define MAX_FLOW_DISSECT_EH5
+
 /**
  * __skb_flow_dissect - extract the flow_keys struct and return it
  * @skb: sk_buff to extract the flow from, can be NULL if the rest are 
specified
@@ -426,6 +455,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
struct flow_dissector_key_tags *key_tags;
struct flow_dissector_key_vlan *key_vlan;
enum flow_dissect_ret fdret;
+   int num_eh, num_encaps = 0;
bool skip_vlan = false;
u8 ip_proto = 0;
bool ret;
@@ -714,7 +744,9 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
case FLOW_DISSECT_RET_OUT_GOOD:
goto out_good;
case FLOW_DISSECT_RET_PROTO_AGAIN:
-   goto proto_again;
+   if (skb_flow_dissect_encap_allowed(&num_encaps, &flags))
+   goto proto_again;
+   goto out_good;
case FLOW_DISSECT_RET_CONTINUE:
case FLOW_DISSECT_RET_IPPROTO_AGAIN:
case FLOW_DISSECT_RET_IPPROTO_AGAIN_EH:
@@ -724,6 +756,8 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
goto out_bad;
}
 
+   num_eh = 0;
+
 ip_proto_again:
fdret = FLOW_DISSECT_RET_CONTINUE;
 
@@ -844,10 +878,18 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
/* Process result of IP proto processing */
switch (fdret) {
case FLOW_DISSECT_RET_PROTO_AGAIN:
-   goto proto_again;
+   if (skb_flow_dissect_encap_allowed(&num_encaps, &flags))
+   goto proto_again;
+   break;
case FLOW_DISSECT_RET_IPPROTO_AGAIN:
+   if (skb_flow_dissect_encap_allowed(&num_encaps, &flags))
+   goto ip_proto_again;
+   break;
case FLOW_DISSECT_RET_IPPROTO_AGAIN_EH:
-   goto ip_proto_again;
+   ++num_eh;
+   if (num_eh <= MAX_FLOW_DISSECT_EH)
+   goto ip_proto_again;
+   break;
case FLOW_DISSECT_RET_OUT_GOOD:
case FLOW_DISSECT_RET_CONTINUE:
break;
-- 
2.11.0



[PATCH net-next 0/2] flow_dissector: Flow dissector fixes

2017-08-31 Thread Tom Herbert
This patch set fixes some basic issues with the __skb_flow_dissect function.

Items addressed:
  - Cleanup control flow in the function; in particular eliminate a
bunch of gotos and implement a simplified control flow model
  - Add limits for the number of encapsulations and extension headers
that can be dissected

Tested:

Ran normal traffic, GUE, and VXLAN traffic.


Tom Herbert (2):
  flow_dissector: Cleanup control flow
  flow_dissector: Add limits for encapsulation and EH

 include/net/flow_dissector.h |   9 ++
 net/core/flow_dissector.c| 267 ++-
 2 files changed, 198 insertions(+), 78 deletions(-)

-- 
2.11.0



Re: [PATCH v3 net-next 2/7] bpf: Allow cgroup sock filters to use get_current_uid_gid helper

2017-08-31 Thread Daniel Borkmann

On 09/01/2017 12:05 AM, David Ahern wrote:

Allow BPF programs run on sock create to use the get_current_uid_gid
helper. IPv4 and IPv6 sockets are created in a process context so
there is always a valid uid/gid

Signed-off-by: David Ahern 
Acked-by: Alexei Starovoitov 


Acked-by: Daniel Borkmann 


Re: [PATCH v3 net-next 1/7] bpf: Add mark and priority to sock options that can be set

2017-08-31 Thread Daniel Borkmann

On 09/01/2017 12:05 AM, David Ahern wrote:

Add socket mark and priority to fields that can be set by
ebpf program when a socket is created.

Signed-off-by: David Ahern 
Acked-by: Alexei Starovoitov 
---
  include/uapi/linux/bpf.h |  2 ++
  net/core/filter.c| 26 ++
  2 files changed, 28 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d46cf326b95f..e9c89e20adff 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -758,6 +758,8 @@ struct bpf_sock {
__u32 family;
__u32 type;
__u32 protocol;
+   __u32 mark;
+   __u32 priority;
  };

  #define XDP_PACKET_HEADROOM 256
diff --git a/net/core/filter.c b/net/core/filter.c
index c6a37fe0285b..f51b9690adf3 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3455,6 +3455,10 @@ static bool sock_filter_is_valid_access(int off, int 
size,
switch (off) {
case offsetof(struct bpf_sock, bound_dev_if):
break;
+   case offsetof(struct bpf_sock, mark):
+   break;
+   case offsetof(struct bpf_sock, priority):
+   break;


Can also be follow-up, but please do keep this consistent
to all the other *_is_valid_access() helpers, meaning:

switch (off) {
case offsetof(struct bpf_sock, bound_dev_if):
case offsetof(struct bpf_sock, mark):
case offsetof(struct bpf_sock, priority):
break;
default:
return false;
}

Rest:

Acked-by: Daniel Borkmann 


Re: [PATCH 2/3] security: bpf: Add eBPF LSM hooks and security field to eBPF map

2017-08-31 Thread Chenbo Feng
On Thu, Aug 31, 2017 at 2:17 PM, Mimi Zohar  wrote:
> On Thu, 2017-08-31 at 13:56 -0700, Chenbo Feng wrote:
>> From: Chenbo Feng 
>>
>> Introduce a pointer into struct bpf_map to hold the security information
>> about the map. The actual security struct varies based on the security
>> models implemented. Place the LSM hooks before each of the unrestricted
>> eBPF operations: the map_update_elem and map_delete_elem operations are
>> checked by security_map_modify. The map_lookup_elem and map_get_next_key
>> operations are checked by security_map_read.
>>
>> Signed-off-by: Chenbo Feng 
>> ---
>>  include/linux/bpf.h  |  3 +++
>>  kernel/bpf/syscall.c | 28 
>>  2 files changed, 31 insertions(+)
>>
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index b69e7a5869ff..ca3e6ff7091d 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -53,6 +53,9 @@ struct bpf_map {
>>   struct work_struct work;
>>   atomic_t usercnt;
>>   struct bpf_map *inner_map_meta;
>> +#ifdef CONFIG_SECURITY
>> + void *security;
>> +#endif
>>  };
>>
>>  /* function argument constraints */
>> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
>> index 045646da97cc..b15580bcf3b1 100644
>> --- a/kernel/bpf/syscall.c
>> +++ b/kernel/bpf/syscall.c
>> @@ -279,6 +279,10 @@ static int map_create(union bpf_attr *attr)
>>   if (err)
>>   return -EINVAL;
>>
>> + err = security_map_create();
>> + if (err)
>> + return -EACCES;
>
> Any reason not to just return err?
>
> Mimi
>
Nope... return err might be better. I will fix this in the next version.

Thanks
Chenbo
>> +
>>   /* find map type and init map: hashtable vs rbtree vs bloom vs ... */
>>   map = find_and_alloc_map(attr);
>>   if (IS_ERR(map))
>> @@ -291,6 +295,10 @@ static int map_create(union bpf_attr *attr)
>>   if (err)
>>   goto free_map_nouncharge;
>>
>> + err = security_post_create(map);
>> + if (err < 0)
>> + goto free_map;
>> +
>>   err = bpf_map_alloc_id(map);
>>   if (err)
>>   goto free_map;
>> @@ -410,6 +418,10 @@ static int map_lookup_elem(union bpf_attr *attr)
>>   if (IS_ERR(map))
>>   return PTR_ERR(map);
>>
>> + err = security_map_read(map);
>> + if (err)
>> + return -EACCES;
>> +
>>   key = memdup_user(ukey, map->key_size);
>>   if (IS_ERR(key)) {
>>   err = PTR_ERR(key);
>> @@ -490,6 +502,10 @@ static int map_update_elem(union bpf_attr *attr)
>>   if (IS_ERR(map))
>>   return PTR_ERR(map);
>>
>> + err = security_map_modify(map);
>> + if (err)
>> + return -EACCES;
>> +
>>   key = memdup_user(ukey, map->key_size);
>>   if (IS_ERR(key)) {
>>   err = PTR_ERR(key);
>> @@ -573,6 +589,10 @@ static int map_delete_elem(union bpf_attr *attr)
>>   if (IS_ERR(map))
>>   return PTR_ERR(map);
>>
>> + err = security_map_modify(map);
>> + if (err)
>> + return -EACCES;
>> +
>>   key = memdup_user(ukey, map->key_size);
>>   if (IS_ERR(key)) {
>>   err = PTR_ERR(key);
>> @@ -616,6 +636,10 @@ static int map_get_next_key(union bpf_attr *attr)
>>   if (IS_ERR(map))
>>   return PTR_ERR(map);
>>
>> + err = security_map_read(map);
>> + if (err)
>> + return -EACCES;
>> +
>>   if (ukey) {
>>   key = memdup_user(ukey, map->key_size);
>>   if (IS_ERR(key)) {
>> @@ -935,6 +959,10 @@ static int bpf_prog_load(union bpf_attr *attr)
>>   if (CHECK_ATTR(BPF_PROG_LOAD))
>>   return -EINVAL;
>>
>> + err = security_prog_load();
>> + if (err)
>> + return -EACCES;
>> +
>>   if (attr->prog_flags & ~BPF_F_STRICT_ALIGNMENT)
>>   return -EINVAL;
>>
>


[PATCH v3 net-next 3/7] samples/bpf: Update sock test to allow setting mark and priority

2017-08-31 Thread David Ahern
Update sock test to set mark and priority on socket create.

Signed-off-by: David Ahern 
---
 samples/bpf/test_cgrp2_sock.c  | 134 -
 samples/bpf/test_cgrp2_sock.sh |   2 +-
 2 files changed, 119 insertions(+), 17 deletions(-)

diff --git a/samples/bpf/test_cgrp2_sock.c b/samples/bpf/test_cgrp2_sock.c
index c3cfb23e23b5..681abbe6c85e 100644
--- a/samples/bpf/test_cgrp2_sock.c
+++ b/samples/bpf/test_cgrp2_sock.c
@@ -19,59 +19,161 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #include "libbpf.h"
 
 char bpf_log_buf[BPF_LOG_BUF_SIZE];
 
-static int prog_load(int idx)
+static int prog_load(__u32 idx, __u32 mark, __u32 prio)
 {
-   struct bpf_insn prog[] = {
+   /* save pointer to context */
+   struct bpf_insn prog_start[] = {
BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+   };
+   struct bpf_insn prog_end[] = {
+   BPF_MOV64_IMM(BPF_REG_0, 1), /* r0 = verdict */
+   BPF_EXIT_INSN(),
+   };
+
+   /* set sk_bound_dev_if on socket */
+   struct bpf_insn prog_dev[] = {
BPF_MOV64_IMM(BPF_REG_3, idx),
BPF_MOV64_IMM(BPF_REG_2, offsetof(struct bpf_sock, 
bound_dev_if)),
BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3, offsetof(struct 
bpf_sock, bound_dev_if)),
-   BPF_MOV64_IMM(BPF_REG_0, 1), /* r0 = verdict */
-   BPF_EXIT_INSN(),
};
-   size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn);
 
-   return bpf_load_program(BPF_PROG_TYPE_CGROUP_SOCK, prog, insns_cnt,
+   /* set mark on socket */
+   struct bpf_insn prog_mark[] = {
+   BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+   BPF_MOV64_IMM(BPF_REG_3, mark),
+   BPF_MOV64_IMM(BPF_REG_2, offsetof(struct bpf_sock, mark)),
+   BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3, offsetof(struct 
bpf_sock, mark)),
+   };
+
+   /* set priority on socket */
+   struct bpf_insn prog_prio[] = {
+   BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+   BPF_MOV64_IMM(BPF_REG_3, prio),
+   BPF_MOV64_IMM(BPF_REG_2, offsetof(struct bpf_sock, priority)),
+   BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3, offsetof(struct 
bpf_sock, priority)),
+   };
+
+   struct bpf_insn *prog;
+   size_t insns_cnt;
+   void *p;
+   int ret;
+
+   insns_cnt = sizeof(prog_start) + sizeof(prog_end);
+   if (idx)
+   insns_cnt += sizeof(prog_dev);
+
+   if (mark)
+   insns_cnt += sizeof(prog_mark);
+
+   if (prio)
+   insns_cnt += sizeof(prog_prio);
+
+   p = prog = malloc(insns_cnt);
+   if (!prog) {
+   fprintf(stderr, "Failed to allocate memory for instructions\n");
+   return EXIT_FAILURE;
+   }
+
+   memcpy(p, prog_start, sizeof(prog_start));
+   p += sizeof(prog_start);
+
+   if (idx) {
+   memcpy(p, prog_dev, sizeof(prog_dev));
+   p += sizeof(prog_dev);
+   }
+
+   if (mark) {
+   memcpy(p, prog_mark, sizeof(prog_mark));
+   p += sizeof(prog_mark);
+   }
+
+   if (prio) {
+   memcpy(p, prog_prio, sizeof(prog_prio));
+   p += sizeof(prog_prio);
+   }
+
+   memcpy(p, prog_end, sizeof(prog_end));
+   p += sizeof(prog_end);
+
+   insns_cnt /= sizeof(struct bpf_insn);
+
+   ret = bpf_load_program(BPF_PROG_TYPE_CGROUP_SOCK, prog, insns_cnt,
"GPL", 0, bpf_log_buf, BPF_LOG_BUF_SIZE);
+
+   free(prog);
+
+   return ret;
 }
 
 static int usage(const char *argv0)
 {
-   printf("Usage: %s cg-path device-index\n", argv0);
+   printf("Usage: %s -b bind-to-dev -m mark -p prio cg-path\n", argv0);
return EXIT_FAILURE;
 }
 
 int main(int argc, char **argv)
 {
+   __u32 idx = 0, mark = 0, prio = 0;
+   const char *cgrp_path = NULL;
int cg_fd, prog_fd, ret;
-   unsigned int idx;
+   int rc;
+
+   while ((rc = getopt(argc, argv, "b:m:p:")) != -1) {
+   switch (rc) {
+   case 'b':
+   idx = if_nametoindex(optarg);
+   if (!idx) {
+   idx = strtoumax(optarg, NULL, 0);
+   if (!idx) {
+   printf("Invalid device name\n");
+   return EXIT_FAILURE;
+   }
+   }
+   break;
+   case 'm':
+   mark = strtoumax(optarg, NULL, 0);
+   break;
+   case 'p':
+   prio = strtoumax(optarg, NULL, 0);
+   break;
+   default:
+   return usage(argv[0]);
+   }
+   }
 
-   if (argc < 2)
+   if (optind == argc)
return usage(arg

[PATCH v3 net-next 4/7] samples/bpf: Add detach option to test_cgrp2_sock

2017-08-31 Thread David Ahern
Add option to detach programs from a cgroup.

Signed-off-by: David Ahern 
---
 samples/bpf/test_cgrp2_sock.c | 50 ++-
 1 file changed, 35 insertions(+), 15 deletions(-)

diff --git a/samples/bpf/test_cgrp2_sock.c b/samples/bpf/test_cgrp2_sock.c
index 681abbe6c85e..15396761c5cc 100644
--- a/samples/bpf/test_cgrp2_sock.c
+++ b/samples/bpf/test_cgrp2_sock.c
@@ -114,7 +114,12 @@ static int prog_load(__u32 idx, __u32 mark, __u32 prio)
 
 static int usage(const char *argv0)
 {
-   printf("Usage: %s -b bind-to-dev -m mark -p prio cg-path\n", argv0);
+   printf("Usage:\n");
+   printf("  Attach a program\n");
+   printf("  %s -b bind-to-dev -m mark -p prio cg-path\n", argv0);
+   printf("\n");
+   printf("  Detach a program\n");
+   printf("  %s -d cg-path\n", argv0);
return EXIT_FAILURE;
 }
 
@@ -123,10 +128,14 @@ int main(int argc, char **argv)
__u32 idx = 0, mark = 0, prio = 0;
const char *cgrp_path = NULL;
int cg_fd, prog_fd, ret;
+   int do_attach = 1;
int rc;
 
-   while ((rc = getopt(argc, argv, "b:m:p:")) != -1) {
+   while ((rc = getopt(argc, argv, "db:m:p:")) != -1) {
switch (rc) {
+   case 'd':
+   do_attach = 0;
+   break;
case 'b':
idx = if_nametoindex(optarg);
if (!idx) {
@@ -157,7 +166,7 @@ int main(int argc, char **argv)
return EXIT_FAILURE;
}
 
-   if (!idx && !mark && !prio) {
+   if (do_attach && !idx && !mark && !prio) {
fprintf(stderr,
"One of device, mark or priority must be given\n");
return EXIT_FAILURE;
@@ -169,20 +178,31 @@ int main(int argc, char **argv)
return EXIT_FAILURE;
}
 
-   prog_fd = prog_load(idx, mark, prio);
-   if (prog_fd < 0) {
-   printf("Failed to load prog: '%s'\n", strerror(errno));
-   printf("Output from kernel verifier:\n%s\n---\n",
-  bpf_log_buf);
-   return EXIT_FAILURE;
-   }
+   if (do_attach) {
+   prog_fd = prog_load(idx, mark, prio);
+   if (prog_fd < 0) {
+   printf("Failed to load prog: '%s'\n", strerror(errno));
+   printf("Output from kernel verifier:\n%s\n---\n",
+  bpf_log_buf);
+   return EXIT_FAILURE;
+   }
 
-   ret = bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_INET_SOCK_CREATE, 0);
-   if (ret < 0) {
-   printf("Failed to attach prog to cgroup: '%s'\n",
-  strerror(errno));
-   return EXIT_FAILURE;
+   ret = bpf_prog_attach(prog_fd, cg_fd,
+ BPF_CGROUP_INET_SOCK_CREATE, 0);
+   if (ret < 0) {
+   printf("Failed to attach prog to cgroup: '%s'\n",
+  strerror(errno));
+   return EXIT_FAILURE;
+   }
+   } else {
+   ret = bpf_prog_detach(cg_fd, BPF_CGROUP_INET_SOCK_CREATE);
+   if (ret < 0) {
+   printf("Failed to detach prog from cgroup: '%s'\n",
+  strerror(errno));
+   return EXIT_FAILURE;
+   }
}
 
+   close(cg_fd);
return EXIT_SUCCESS;
 }
-- 
2.1.4



[PATCH v3 net-next 2/7] bpf: Allow cgroup sock filters to use get_current_uid_gid helper

2017-08-31 Thread David Ahern
Allow BPF programs run on sock create to use the get_current_uid_gid
helper. IPv4 and IPv6 sockets are created in a process context so
there is always a valid uid/gid

Signed-off-by: David Ahern 
Acked-by: Alexei Starovoitov 
---
 net/core/filter.c | 16 +++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index f51b9690adf3..9dad3e7e2e10 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3150,6 +3150,20 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 }
 
 static const struct bpf_func_proto *
+sock_filter_func_proto(enum bpf_func_id func_id)
+{
+   switch (func_id) {
+   /* inet and inet6 sockets are created in a process
+* context so there is always a valid uid/gid
+*/
+   case BPF_FUNC_get_current_uid_gid:
+   return &bpf_get_current_uid_gid_proto;
+   default:
+   return bpf_base_func_proto(func_id);
+   }
+}
+
+static const struct bpf_func_proto *
 sk_filter_func_proto(enum bpf_func_id func_id)
 {
switch (func_id) {
@@ -4233,7 +4247,7 @@ const struct bpf_verifier_ops lwt_xmit_prog_ops = {
 };
 
 const struct bpf_verifier_ops cg_sock_prog_ops = {
-   .get_func_proto = bpf_base_func_proto,
+   .get_func_proto = sock_filter_func_proto,
.is_valid_access= sock_filter_is_valid_access,
.convert_ctx_access = sock_filter_convert_ctx_access,
 };
-- 
2.1.4



[PATCH v3 net-next 6/7] samples/bpf: Update cgrp2 socket tests

2017-08-31 Thread David Ahern
Update cgrp2 bpf sock tests to check that device, mark and priority
can all be set on a socket via bpf programs attached to a cgroup.

Signed-off-by: David Ahern 
---
 samples/bpf/test_cgrp2_sock.sh | 162 +++--
 1 file changed, 124 insertions(+), 38 deletions(-)

diff --git a/samples/bpf/test_cgrp2_sock.sh b/samples/bpf/test_cgrp2_sock.sh
index 1153c33e8964..a81f38eef417 100755
--- a/samples/bpf/test_cgrp2_sock.sh
+++ b/samples/bpf/test_cgrp2_sock.sh
@@ -1,47 +1,133 @@
-#!/bin/bash
-
-function config_device {
-   ip netns add at_ns0
-   ip link add veth0 type veth peer name veth0b
-   ip link set veth0b up
-   ip link set veth0 netns at_ns0
-   ip netns exec at_ns0 ip addr add 172.16.1.100/24 dev veth0
-   ip netns exec at_ns0 ip addr add 2401:db00::1/64 dev veth0 nodad
-   ip netns exec at_ns0 ip link set dev veth0 up
-   ip link add foo type vrf table 1234
-   ip link set foo up
-   ip addr add 172.16.1.101/24 dev veth0b
-   ip addr add 2401:db00::2/64 dev veth0b nodad
-   ip link set veth0b master foo
+#!/bin/sh
+
+# Test various socket options that can be set by attaching programs to cgroups.
+
+CGRP_MNT="/tmp/cgroupv2-test_cgrp2_sock"
+
+
+#
+print_result()
+{
+   local rc=$1
+   local status=" OK "
+
+   [ $rc -ne 0 ] && status="FAIL"
+
+   printf "%-50s[%4s]\n" "$2" "$status"
 }
 
-function attach_bpf {
-   rm -rf /tmp/cgroupv2
-   mkdir -p /tmp/cgroupv2
-   mount -t cgroup2 none /tmp/cgroupv2
-   mkdir -p /tmp/cgroupv2/foo
-   test_cgrp2_sock -b foo /tmp/cgroupv2/foo
-   echo $$ >> /tmp/cgroupv2/foo/cgroup.procs
+check_sock()
+{
+   out=$(test_cgrp2_sock)
+   echo $out | grep -q "$1"
+   if [ $? -ne 0 ]; then
+   print_result 1 "IPv4: $2"
+   echo "expected: $1"
+   echo "have: $out"
+   rc=1
+   else
+   print_result 0 "IPv4: $2"
+   fi
 }
 
-function cleanup {
-   set +ex
-   ip netns delete at_ns0
-   ip link del veth0
-   ip link del foo
-   umount /tmp/cgroupv2
-   rm -rf /tmp/cgroupv2
-   set -ex
+check_sock6()
+{
+   out=$(test_cgrp2_sock -6)
+   echo $out | grep -q "$1"
+   if [ $? -ne 0 ]; then
+   print_result 1 "IPv6: $2"
+   echo "expected: $1"
+   echo "have: $out"
+   rc=1
+   else
+   print_result 0 "IPv6: $2"
+   fi
 }
 
-function do_test {
-   ping -c1 -w1 172.16.1.100
-   ping6 -c1 -w1 2401:db00::1
+
+#
+
+cleanup()
+{
+   echo $$ >> ${CGRP_MNT}/cgroup.procs
+   rmdir ${CGRP_MNT}/sockopts
 }
 
+cleanup_and_exit()
+{
+   local rc=$1
+   local msg="$2"
+
+   [ -n "$msg" ] && echo "ERROR: $msg"
+
+   ip li del cgrp2_sock
+   umount ${CGRP_MNT}
+
+   exit $rc
+}
+
+
+
+# main
+
+rc=0
+
+ip li add cgrp2_sock type dummy 2>/dev/null
+
+set -e
+mkdir -p ${CGRP_MNT}
+mount -t cgroup2 none ${CGRP_MNT}
+set +e
+
+
+# make sure we have a known start point
 cleanup 2>/dev/null
-config_device
-attach_bpf
-do_test
-cleanup
-echo "*** PASS ***"
+
+mkdir -p ${CGRP_MNT}/sockopts
+[ $? -ne 0 ] && cleanup_and_exit 1 "Failed to create cgroup hierarchy"
+
+
+# set pid into cgroup
+echo $$ > ${CGRP_MNT}/sockopts/cgroup.procs
+
+# no bpf program attached, so socket should show no settings
+check_sock "dev , mark 0, priority 0" "No programs attached"
+check_sock6 "dev , mark 0, priority 0" "No programs attached"
+
+# verify device is set
+#
+test_cgrp2_sock -b cgrp2_sock ${CGRP_MNT}/sockopts
+if [ $? -ne 0 ]; then
+   cleanup_and_exit 1 "Failed to install program to set device"
+fi
+check_sock "dev cgrp2_sock, mark 0, priority 0" "Device set"
+check_sock6 "dev cgrp2_sock, mark 0, priority 0" "Device set"
+
+# verify mark is set
+#
+test_cgrp2_sock -m 666 ${CGRP_MNT}/sockopts
+if [ $? -ne 0 ]; then
+   cleanup_and_exit 1 "Failed to install program to set mark"
+fi
+check_sock "dev , mark 666, priority 0" "Mark set"
+check_sock6 "dev , mark 666, priority 0" "Mark set"
+
+# verify priority is set
+#
+test_cgrp2_sock -p 123 ${CGRP_MNT}/sockopts
+if [ $? -ne 0 ]; then
+   cleanup_and_exit 1 "Failed to install program to set priority"
+fi
+check_sock "dev , mark 0, priority 123" "Priority set"
+check_sock6 "dev , mark 0, priority 123" "Priority set"
+
+# all 3 at once
+#
+test_cgrp2_sock -b cgrp2_sock -m 666 -p 123 ${CGRP_MNT}/sockopts
+if [ $? -ne 0 ]; then
+   cleanup_and_exit 1 "Failed to install program to set device, mark and 
priority"
+fi
+check_sock "dev cgrp2_sock, mark 666, priority 123" "Priority set"
+check_sock6 "dev cgrp2_sock, mark 666, priority 123" "Priority set"
+
+clea

[PATCH v3 net-next 7/7] samples/bpf: Update cgroup socket examples to use uid gid helper

2017-08-31 Thread David Ahern
Signed-off-by: David Ahern 
---
 samples/bpf/sock_flags_kern.c |  5 +
 samples/bpf/test_cgrp2_sock.c | 12 +++-
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/samples/bpf/sock_flags_kern.c b/samples/bpf/sock_flags_kern.c
index 533dd11a6baa..05dcdf8a4baa 100644
--- a/samples/bpf/sock_flags_kern.c
+++ b/samples/bpf/sock_flags_kern.c
@@ -9,8 +9,13 @@ SEC("cgroup/sock1")
 int bpf_prog1(struct bpf_sock *sk)
 {
char fmt[] = "socket: family %d type %d protocol %d\n";
+   char fmt2[] = "socket: uid %u gid %u\n";
+   __u64 gid_uid = bpf_get_current_uid_gid();
+   __u32 uid = gid_uid & 0x;
+   __u32 gid = gid_uid >> 32;
 
bpf_trace_printk(fmt, sizeof(fmt), sk->family, sk->type, sk->protocol);
+   bpf_trace_printk(fmt2, sizeof(fmt2), uid, gid);
 
/* block PF_INET6, SOCK_RAW, IPPROTO_ICMPV6 sockets
 * ie., make ping6 fail
diff --git a/samples/bpf/test_cgrp2_sock.c b/samples/bpf/test_cgrp2_sock.c
index 5a688837720c..e79594dd629b 100644
--- a/samples/bpf/test_cgrp2_sock.c
+++ b/samples/bpf/test_cgrp2_sock.c
@@ -46,8 +46,18 @@ static int prog_load(__u32 idx, __u32 mark, __u32 prio)
 
/* set mark on socket */
struct bpf_insn prog_mark[] = {
-   BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+   /* get uid of process */
+   BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+BPF_FUNC_get_current_uid_gid),
+   BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0x),
+
+   /* if uid is 0, use given mark, else use the uid as the mark */
+   BPF_MOV64_REG(BPF_REG_3, BPF_REG_0),
+   BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
BPF_MOV64_IMM(BPF_REG_3, mark),
+
+   /* set the mark on the new socket */
+   BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
BPF_MOV64_IMM(BPF_REG_2, offsetof(struct bpf_sock, mark)),
BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3, offsetof(struct 
bpf_sock, mark)),
};
-- 
2.1.4



[PATCH v3 net-next 5/7] samples/bpf: Add option to dump socket settings

2017-08-31 Thread David Ahern
Add an option to dump socket settings. It will be used in the next patch
to verify that bpf programs are correctly setting mark, priority and
device based on the cgroup the program is attached to.

Signed-off-by: David Ahern 
---
 samples/bpf/test_cgrp2_sock.c | 75 +--
 1 file changed, 73 insertions(+), 2 deletions(-)

diff --git a/samples/bpf/test_cgrp2_sock.c b/samples/bpf/test_cgrp2_sock.c
index 15396761c5cc..5a688837720c 100644
--- a/samples/bpf/test_cgrp2_sock.c
+++ b/samples/bpf/test_cgrp2_sock.c
@@ -112,6 +112,70 @@ static int prog_load(__u32 idx, __u32 mark, __u32 prio)
return ret;
 }
 
+static int get_bind_to_device(int sd, char *name, size_t len)
+{
+   socklen_t optlen = len;
+   int rc;
+
+   name[0] = '\0';
+   rc = getsockopt(sd, SOL_SOCKET, SO_BINDTODEVICE, name, &optlen);
+   if (rc < 0)
+   perror("setsockopt(SO_BINDTODEVICE)");
+
+   return rc;
+}
+
+static unsigned int get_somark(int sd)
+{
+   unsigned int mark = 0;
+   socklen_t optlen = sizeof(mark);
+   int rc;
+
+   rc = getsockopt(sd, SOL_SOCKET, SO_MARK, &mark, &optlen);
+   if (rc < 0)
+   perror("getsockopt(SO_MARK)");
+
+   return mark;
+}
+
+static unsigned int get_priority(int sd)
+{
+   unsigned int prio = 0;
+   socklen_t optlen = sizeof(prio);
+   int rc;
+
+   rc = getsockopt(sd, SOL_SOCKET, SO_PRIORITY, &prio, &optlen);
+   if (rc < 0)
+   perror("getsockopt(SO_PRIORITY)");
+
+   return prio;
+}
+
+static int show_sockopts(int family)
+{
+   unsigned int mark, prio;
+   char name[16];
+   int sd;
+
+   sd = socket(family, SOCK_DGRAM, 17);
+   if (sd < 0) {
+   perror("socket");
+   return 1;
+   }
+
+   if (get_bind_to_device(sd, name, sizeof(name)) < 0)
+   return 1;
+
+   mark = get_somark(sd);
+   prio = get_priority(sd);
+
+   close(sd);
+
+   printf("sd %d: dev %s, mark %u, priority %u\n", sd, name, mark, prio);
+
+   return 0;
+}
+
 static int usage(const char *argv0)
 {
printf("Usage:\n");
@@ -120,6 +184,9 @@ static int usage(const char *argv0)
printf("\n");
printf("  Detach a program\n");
printf("  %s -d cg-path\n", argv0);
+   printf("\n");
+   printf("  Show inherited socket settings (mark, priority, and 
device)\n");
+   printf("  %s [-6]\n", argv0);
return EXIT_FAILURE;
 }
 
@@ -128,10 +195,11 @@ int main(int argc, char **argv)
__u32 idx = 0, mark = 0, prio = 0;
const char *cgrp_path = NULL;
int cg_fd, prog_fd, ret;
+   int family = PF_INET;
int do_attach = 1;
int rc;
 
-   while ((rc = getopt(argc, argv, "db:m:p:")) != -1) {
+   while ((rc = getopt(argc, argv, "db:m:p:6")) != -1) {
switch (rc) {
case 'd':
do_attach = 0;
@@ -152,13 +220,16 @@ int main(int argc, char **argv)
case 'p':
prio = strtoumax(optarg, NULL, 0);
break;
+   case '6':
+   family = PF_INET6;
+   break;
default:
return usage(argv[0]);
}
}
 
if (optind == argc)
-   return usage(argv[0]);
+   return show_sockopts(family);
 
cgrp_path = argv[optind];
if (!cgrp_path) {
-- 
2.1.4



[PATCH v3 net-next 1/7] bpf: Add mark and priority to sock options that can be set

2017-08-31 Thread David Ahern
Add socket mark and priority to fields that can be set by
ebpf program when a socket is created.

Signed-off-by: David Ahern 
Acked-by: Alexei Starovoitov 
---
 include/uapi/linux/bpf.h |  2 ++
 net/core/filter.c| 26 ++
 2 files changed, 28 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d46cf326b95f..e9c89e20adff 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -758,6 +758,8 @@ struct bpf_sock {
__u32 family;
__u32 type;
__u32 protocol;
+   __u32 mark;
+   __u32 priority;
 };
 
 #define XDP_PACKET_HEADROOM 256
diff --git a/net/core/filter.c b/net/core/filter.c
index c6a37fe0285b..f51b9690adf3 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3455,6 +3455,10 @@ static bool sock_filter_is_valid_access(int off, int 
size,
switch (off) {
case offsetof(struct bpf_sock, bound_dev_if):
break;
+   case offsetof(struct bpf_sock, mark):
+   break;
+   case offsetof(struct bpf_sock, priority):
+   break;
default:
return false;
}
@@ -3958,6 +3962,28 @@ static u32 sock_filter_convert_ctx_access(enum 
bpf_access_type type,
  offsetof(struct sock, sk_bound_dev_if));
break;
 
+   case offsetof(struct bpf_sock, mark):
+   BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_mark) != 4);
+
+   if (type == BPF_WRITE)
+   *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
+   offsetof(struct sock, sk_mark));
+   else
+   *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ offsetof(struct sock, sk_mark));
+   break;
+
+   case offsetof(struct bpf_sock, priority):
+   BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_priority) != 4);
+
+   if (type == BPF_WRITE)
+   *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
+   offsetof(struct sock, sk_priority));
+   else
+   *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ offsetof(struct sock, sk_priority));
+   break;
+
case offsetof(struct bpf_sock, family):
BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2);
 
-- 
2.1.4



[PATCH v3 net-next 0/7] bpf: Add option to set mark and priority in cgroup sock programs

2017-08-31 Thread David Ahern
Add option to set mark and priority in addition to bound device for newly
created sockets. Also, allow the bpf programs to use the get_current_uid_gid
helper, meaning socket mark, priority and device can be set based on the
uid/gid of the running process.

Sample programs are updated to demonstrate the new options.
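
For orientation, a minimal sketch of what such a program can look like in
restricted C, assuming the SEC() macro and the bpf_get_current_uid_gid()
declaration from samples/bpf/bpf_helpers.h (the samples in this series
build the equivalent instructions by hand instead):

#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"

/* Attach at BPF_CGROUP_INET_SOCK_CREATE: set the mark from the creating
 * process's uid and a fixed priority on every new socket in the cgroup.
 */
SEC("cgroup/sock")
int set_sock_opts(struct bpf_sock *sk)
{
	__u64 uidgid = bpf_get_current_uid_gid();

	sk->mark = (__u32)uidgid;	/* low 32 bits are the uid */
	sk->priority = 123;

	return 1;			/* non-zero verdict: allow the socket */
}

char _license[] SEC("license") = "GPL";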

v3
- no changes to Patches 1 and 2 which Alexei acked in previous versions
- dropped change related to recursive programs in a cgroup
- updated tests per dropped patch

v2
- added flag to control recursive behavior as requested by Alexei
- added comment to sock_filter_func_proto regarding use of
  get_current_uid_gid helper
- updated test programs for recursive option

David Ahern (7):
  bpf: Add mark and priority to sock options that can be set
  bpf: Allow cgroup sock filters to use get_current_uid_gid helper
  samples/bpf: Update sock test to allow setting mark and priority
  samples/bpf: Add detach option to test_cgrp2_sock
  samples/bpf: Add option to dump socket settings
  samples/bpf: Update cgrp2 socket tests
  samples/bpf: Update cgroup socket examples to use uid gid helper

 include/uapi/linux/bpf.h   |   2 +
 net/core/filter.c  |  42 ++-
 samples/bpf/sock_flags_kern.c  |   5 +
 samples/bpf/test_cgrp2_sock.c  | 255 -
 samples/bpf/test_cgrp2_sock.sh | 162 --
 5 files changed, 401 insertions(+), 65 deletions(-)

-- 
2.1.4



Re: [PATCH net-next] bridge: add tracepoint in br_fdb_update

2017-08-31 Thread Jesper Dangaard Brouer
On Thu, 31 Aug 2017 11:43:25 -0700 (PDT)
David Miller  wrote:

> From: Roopa Prabhu 
> Date: Wed, 30 Aug 2017 22:18:13 -0700
> 
> > From: Roopa Prabhu 
> > 
> > This extends bridge fdb table tracepoints to also cover
> > learned fdb entries in the br_fdb_update path. Note that
> > unlike other tracepoints I have moved this to when the fdb
> > is modified because this is in the datapath and can generate
> > a lot of noise in the trace output. br_fdb_update is also called
> > from added_by_user context in the NTF_USE case which is already
> > traced ..hence the !added_by_user check.
> > 
> > Signed-off-by: Roopa Prabhu   
> 
> Applied.
> 
> Let's use dev->name for now and if the tooling can eventually
> do transparent ifindex->name then we can consider redoing
> a bunch of networking tracepoints.

I agree! :-)

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer


Re: [patch net-next repost 0/8] mlxsw: Add IPv6 host dpipe table

2017-08-31 Thread David Miller
From: Jiri Pirko 
Date: Thu, 31 Aug 2017 17:59:11 +0200

> This patchset adds IPv6 host dpipe table support. This will provide the
> ability to observe the hardware offloaded IPv6 neighbors.

Series applied, thanks.

I noticed while reviewing this we are pretty much split on how
to access neigh->primary_key.

Half the code goes "(type *) n->primary_key" and the other half
(mostly in ipv6) goes "(type *) &n->primary_key"

I know both forms are basically equivalent, but consistency
matters :-)
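
A minimal sketch of why the two forms are interchangeable, assuming
primary_key is a trailing array member as in struct neighbour (the struct
and variable names here are illustrative):

#include <stdio.h>
#include <stdlib.h>

struct neigh_like {
	int refcnt;
	unsigned char primary_key[];	/* flexible array; key storage follows */
};

int main(void)
{
	struct neigh_like *n = malloc(sizeof(*n) + 16);

	if (!n)
		return 1;

	/* n->primary_key decays to unsigned char *, while &n->primary_key
	 * is a pointer to the array itself: same address, different type,
	 * and the (type *) cast erases the difference.
	 */
	printf("%p\n", (void *)n->primary_key);
	printf("%p\n", (void *)&n->primary_key);

	free(n);
	return 0;
}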


Re: [PATCH net-next] net/ncsi: Define {add, kill}_vid callbacks for !CONFIG_NET_NCSI

2017-08-31 Thread Benjamin Herrenschmidt
On Thu, 2017-08-31 at 08:24 -0700, Vernon Mauery wrote:
> > +int ncsi_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid)
> > +{
> > + return -ENOTTY;
> > +}
> > +int ncsi_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid)
> > +{
> > + return -ENOTTY;
> > +}
> 
> These should be static functions because they are defined in the header 
> file or you will get multiple symbol definitions.

static inline even, or you'll get warnings about them being unused, iirc.

Cheers,
Ben.
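
A minimal sketch of the stub pattern being described, assuming the
declarations live in include/net/ncsi.h for the !CONFIG_NET_NCSI case:

/* static: no multiple-definition errors when the header is included by
 * several translation units; inline: no unused-function warnings in the
 * units that never call them.
 */
static inline int ncsi_vlan_rx_add_vid(struct net_device *dev,
				       __be16 proto, u16 vid)
{
	return -ENOTTY;
}

static inline int ncsi_vlan_rx_kill_vid(struct net_device *dev,
					__be16 proto, u16 vid)
{
	return -ENOTTY;
}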



Re: [PATCH net-next 7/8] net: hns3: add vlan filter config of Ports

2017-08-31 Thread David Miller
From: Lipeng 
Date: Thu, 31 Aug 2017 21:39:08 +0800

> Config the self_define vlan_type as TPID(0x8100) for vlan identification.
> When normal port initialize vlan configure, set default vlan id as 0.
> 
> Signed-off-by: Mingguang Qu 
> Signed-off-by: Lipeng 

No, that's not what this patch is doing.

> @@ -3308,6 +3308,7 @@ static int hclge_add_mac_vlan_tbl(struct hclge_vport 
> *vport,
>   mc_desc[1].flag |= cpu_to_le16(HCLGE_CMD_FLAG_NEXT);
>   hclge_cmd_reuse_desc(&mc_desc[2], false);
>   mc_desc[2].flag &= cpu_to_le16(~HCLGE_CMD_FLAG_NEXT);
> +
>   memcpy(mc_desc[0].data, req,
>  sizeof(struct hclge_mac_vlan_tbl_entry));
>   ret = hclge_cmd_send(&hdev->hw, mc_desc, 3);

All it does is add an empty line.


Re: [PATCH] ath9k: remove cast to void pointer

2017-08-31 Thread Joe Perches
On Thu, 2017-08-31 at 18:37 +0530, Himanshu Jha wrote:
> casting to void pointer from any pointer type and vice-versa is done
> implicitly and therefore casting is not needed in such a case.

You said you were going to remember to mention the tool
and script that did this.


Re: [PATCH 2/3] security: bpf: Add eBPF LSM hooks and security field to eBPF map

2017-08-31 Thread Mimi Zohar
On Thu, 2017-08-31 at 13:56 -0700, Chenbo Feng wrote:
> From: Chenbo Feng 
> 
> Introduce a pointer into struct bpf_map to hold the security information
> about the map. The actual security struct varies based on the security
> models implemented. Place the LSM hooks before each of the unrestricted
> eBPF operations: the map_update_elem and map_delete_elem operations are
> checked by security_map_modify. The map_lookup_elem and map_get_next_key
> operations are checked by security_map_read.
> 
> Signed-off-by: Chenbo Feng 
> ---
>  include/linux/bpf.h  |  3 +++
>  kernel/bpf/syscall.c | 28 
>  2 files changed, 31 insertions(+)
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index b69e7a5869ff..ca3e6ff7091d 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -53,6 +53,9 @@ struct bpf_map {
>   struct work_struct work;
>   atomic_t usercnt;
>   struct bpf_map *inner_map_meta;
> +#ifdef CONFIG_SECURITY
> + void *security;
> +#endif
>  };
>  
>  /* function argument constraints */
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 045646da97cc..b15580bcf3b1 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -279,6 +279,10 @@ static int map_create(union bpf_attr *attr)
>   if (err)
>   return -EINVAL;
>  
> + err = security_map_create();
> + if (err)
> + return -EACCES;

Any reason not to just return err?

Mimi

> +
>   /* find map type and init map: hashtable vs rbtree vs bloom vs ... */
>   map = find_and_alloc_map(attr);
>   if (IS_ERR(map))
> @@ -291,6 +295,10 @@ static int map_create(union bpf_attr *attr)
>   if (err)
>   goto free_map_nouncharge;
>  
> + err = security_post_create(map);
> + if (err < 0)
> + goto free_map;
> +
>   err = bpf_map_alloc_id(map);
>   if (err)
>   goto free_map;
> @@ -410,6 +418,10 @@ static int map_lookup_elem(union bpf_attr *attr)
>   if (IS_ERR(map))
>   return PTR_ERR(map);
>  
> + err = security_map_read(map);
> + if (err)
> + return -EACCES;
> +
>   key = memdup_user(ukey, map->key_size);
>   if (IS_ERR(key)) {
>   err = PTR_ERR(key);
> @@ -490,6 +502,10 @@ static int map_update_elem(union bpf_attr *attr)
>   if (IS_ERR(map))
>   return PTR_ERR(map);
>  
> + err = security_map_modify(map);
> + if (err)
> + return -EACCES;
> +
>   key = memdup_user(ukey, map->key_size);
>   if (IS_ERR(key)) {
>   err = PTR_ERR(key);
> @@ -573,6 +589,10 @@ static int map_delete_elem(union bpf_attr *attr)
>   if (IS_ERR(map))
>   return PTR_ERR(map);
>  
> + err = security_map_modify(map);
> + if (err)
> + return -EACCES;
> +
>   key = memdup_user(ukey, map->key_size);
>   if (IS_ERR(key)) {
>   err = PTR_ERR(key);
> @@ -616,6 +636,10 @@ static int map_get_next_key(union bpf_attr *attr)
>   if (IS_ERR(map))
>   return PTR_ERR(map);
>  
> + err = security_map_read(map);
> + if (err)
> + return -EACCES;
> +
>   if (ukey) {
>   key = memdup_user(ukey, map->key_size);
>   if (IS_ERR(key)) {
> @@ -935,6 +959,10 @@ static int bpf_prog_load(union bpf_attr *attr)
>   if (CHECK_ATTR(BPF_PROG_LOAD))
>   return -EINVAL;
>  
> + err = security_prog_load();
> + if (err)
> + return -EACCES;
> +
>   if (attr->prog_flags & ~BPF_F_STRICT_ALIGNMENT)
>   return -EINVAL;
>  



[PATCH net-next] doc: document MSG_ZEROCOPY

2017-08-31 Thread Willem de Bruijn
From: Willem de Bruijn 

Documentation for this feature was missing from the patchset.
Copied a lot from the netdev 2.1 paper, addressing some small
interface changes since then.

Signed-off-by: Willem de Bruijn 
---
 Documentation/networking/msg_zerocopy.rst | 254 ++
 1 file changed, 254 insertions(+)
 create mode 100644 Documentation/networking/msg_zerocopy.rst

diff --git a/Documentation/networking/msg_zerocopy.rst 
b/Documentation/networking/msg_zerocopy.rst
new file mode 100644
index ..2e2a3410b947
--- /dev/null
+++ b/Documentation/networking/msg_zerocopy.rst
@@ -0,0 +1,254 @@
+
+
+MSG_ZEROCOPY
+
+
+Intro
+=
+
+The MSG_ZEROCOPY flag enables copy avoidance for socket send calls.
+The feature is currently implemented for TCP sockets.
+
+
+Opportunity and Caveats
+---
+
+Copying large buffers between user process and kernel can be
+expensive. Linux supports various interfaces that eschew copying,
+such as sendpage and splice. The MSG_ZEROCOPY flag extends the
+underlying copy avoidance mechanism to common socket send calls.
+
+Copy avoidance is not a free lunch. As implemented, with page pinning,
+it replaces per byte copy cost with page accounting and completion
+notification overhead. As a result, MSG_ZEROCOPY is generally only
+effective at writes over around 10 KB.
+
+Page pinning also changes system call semantics. It temporarily shares
+the buffer between process and network stack. Unlike with copying, the
+process cannot immediately overwrite the buffer after system call
+return without possibly modifying the data in flight. Kernel integrity
+is not affected, but a buggy program can possibly corrupt its own data
+stream.
+
+The kernel returns a notification when it is safe to modify data.
+Converting an existing application to MSG_ZEROCOPY is not always as
+trivial as just passing the flag, then.
+
+
+More Info
+-
+
+Much of this document was derived from a longer paper presented at
+netdev 2.1. For more in-depth information see that paper and talk,
+the excellent reporting over at LWN.net or read the original code.
+
+  paper, slides, video
+https://netdevconf.org/2.1/session.html?debruijn
+
+  LWN article
+https://lwn.net/Articles/726917/
+
+  patchset
+[PATCH net-next v4 0/9] socket sendmsg MSG_ZEROCOPY
+https://lwn.net/Articles/730010/
+https://www.spinics.net/lists/netdev/msg447552.html
+
+
+Interface
+=
+
+Passing the MSG_ZEROCOPY flag is the most obvious step to enable copy
+avoidance, but not the only one.
+
+Socket Setup
+
+
+The kernel is permissive when applications pass undefined flags to the
+send system call. By default it simply ignores these. To avoid enabling
+copy avoidance mode for legacy processes that accidentally pass this
+flag, a process must first signal intent by setting a socket option:
+
+::
+
+   if (setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one)))
+   error(1, errno, "setsockopt zerocopy");
+
+
+Transmission
+
+
+The change to send (or sendto, sendmsg, sendmmsg) itself is trivial.
+Pass the new flag.
+
+::
+
+   ret = send(fd, buf, sizeof(buf), MSG_ZEROCOPY);
+   if (ret != sizeof(buf))
+   error(1, errno, "send");
+
+
+Mixing copy avoidance and copying
+~
+
+Many workloads have a mixture of large and small buffers. Because copy
+avoidance is more expensive than copying for small packets, the
+feature is implemented as a flag. It is safe to mix calls with the flag
+with those without.
+
+
+Notifications
+-
+
+The kernel has to notify the process when it is safe to reuse a
+previously passed buffer. It queues completion notifications on the
+socket error queue, akin to the transmit timestamping interface.
+
+The notification itself is a simple scalar value. Each socket
+maintains an internal 32-bit counter. Each send call with MSG_ZEROCOPY
+that successfully sends data increments the counter. The counter is
+not incremented on failure or if called with length zero.
+
+
+Notification Reception
+~~
+
+The below snippet demonstrates the API. In the simplest case, each
+send syscall is followed by a poll and recvmsg on the error queue.
+
+Reading from the error queue is always a non-blocking operation. The
+poll call is there to block until an error is outstanding. It will set
+POLLERR in its output flags. That flag does not have to be set in the
+events field: errors are signaled unconditionally.
+
+::
+
+   pfd.fd = fd;
+   pfd.events = 0;
+   if (poll(&pfd, 1, -1) != 1 || !(pfd.revents & POLLERR))
+   error(1, errno, "poll");
+
+   ret = recvmsg(fd, &msg, MSG_ERRQUEUE);
+   if (ret == -1)
+   error(1, errno, "recvmsg");
+
+   read_notification(msg);
+
The example is for demonstration purposes only. In practice, it is more
+efficient to not wait for notifications, but
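
The snippet above leaves read_notification() to the reader. A minimal
sketch, assuming an IPv4 socket (the cmsg arrives as SOL_IP/IP_RECVERR;
IPv6 uses SOL_IPV6/IPV6_RECVERR) and the sock_extended_err layout from
linux/errqueue.h:

#include <linux/errqueue.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <error.h>
#include <stdio.h>

static void read_notification(struct msghdr *msg)
{
	struct sock_extended_err *serr;
	struct cmsghdr *cm;

	cm = CMSG_FIRSTHDR(msg);
	if (!cm || cm->cmsg_level != SOL_IP || cm->cmsg_type != IP_RECVERR)
		error(1, 0, "cmsg: wrong level/type");

	serr = (struct sock_extended_err *)CMSG_DATA(cm);
	if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY)
		error(1, 0, "serr: wrong origin");
	if (serr->ee_errno != 0)
		error(1, 0, "serr: wrong errno");

	/* ee_info..ee_data is the inclusive range of send calls whose
	 * buffers may now be reused; consecutive completions are coalesced.
	 */
	printf("completed: %u..%u\n", serr->ee_info, serr->ee_data);
}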

Re: [PATCH v4 4/5] net: stmmac: dwmac-sun8i: choose internal PHY via phy-is-integrated

2017-08-31 Thread Andrew Lunn
On Thu, Aug 31, 2017 at 03:18:03PM -0500, Rob Herring wrote:
> On Sat, Aug 26, 2017 at 11:20:51PM +0200, Andrew Lunn wrote:
> > Hi Corentin
> > 
> > I think we have now all agreed this is an mdio-mux, plus it is also an
> > MII mux. We should represent that in device tree. This patchset does
> > this. However, as it is now, the mux structure in DT is ignored. All
> > it does is search for the phy-is-integrated flags and goes on that.
> > 
> > I made the comment that the device tree representation cannot be
> > implemented using an MDIO mux driver, because of driver loading
> > issues.  However, the core of the MDIO mux code is just a library,
> > symbols exported as GPL, free for anything to use.
> > 
> > What I think should happen is the mdio-mux is implemented inside the
> > MAC driver, using the mux-core as a library. The device tree structure
> > of a mux is then reflected within Linux. The mux switch callback is
> > implemented within the MAC driver. So it can reset the MAC when the
> > mux is switched. The 'phy-is-integrated' property is then no longer
> > needed.
> > 
> > I would suggest a binding something like:
> 
> This is looks better to me, but...
>  
> > emac: ethernet@1c0b000 {
> > compatible = "allwinner,sun8i-h3-emac";
> > syscon = <&syscon>;
> > reg = <0x01c0b000 0x104>;
> > interrupts = ;
> > interrupt-names = "macirq";
> > resets = <&ccu RST_BUS_EMAC>;
> > reset-names = "stmmaceth";
> > clocks = <&ccu CLK_BUS_EMAC>;
> > clock-names = "stmmaceth";
> > #address-cells = <1>;
> > #size-cells = <0>;
> > 
> > phy-handle = <&int_mii_phy>;
> > phy-mode = "mii";
> > allwinner,leds-active-low;
> > 
> > mdio: mdio {
> > #address-cells = <1>;
> > #size-cells = <0>;
> > }
> 
> Why do you need this node still?

Hi Rob
 
It might not be needed, depending on how it is implemented. But:

Documentation/devicetree/bindings/net/mdio-mux.txt

It is normal for an mdio bus mux to have a phandle back to the parent
mdio bus.  Also, I think the stmmac driver will only instantiate the
mdio bus if there is a node for it in the device tree.

 Andrew


[PATCH 3/3] selinux: bpf: Implement the selinux checks for eBPF object

2017-08-31 Thread Chenbo Feng
From: Chenbo Feng 

Introduce 5 new selinux checks for eBPF object related operations. The
check is based on the ownership information of eBPF maps and the
capability of creating eBPF object.

Signed-off-by: Chenbo Feng 
---
 security/selinux/hooks.c| 54 +
 security/selinux/include/classmap.h |  2 ++
 security/selinux/include/objsec.h   |  4 +++
 3 files changed, 60 insertions(+)

diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 33fd061305c4..39ad7d9f335d 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -85,6 +85,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "avc.h"
 #include "objsec.h"
@@ -6245,6 +6246,52 @@ static void selinux_ib_free_security(void *ib_sec)
 }
 #endif
 
+#ifdef CONFIG_BPF_SYSCALL
+static int selinux_bpf_map_create(void)
+{
+   u32 sid = current_sid();
+
+   return avc_has_perm(sid, sid, SECCLASS_BPF, BPF__MAP_CREATE, NULL);
+}
+
+static int selinux_bpf_map_modify(struct bpf_map *map)
+{
+   struct bpf_security_struct *bpfsec = map->security;
+
+   return avc_has_perm(current_sid(), bpfsec->sid, SECCLASS_BPF,
+   BPF__MAP_MODIFY, NULL);
+}
+
+static int selinux_bpf_map_read(struct bpf_map *map)
+{
+   struct bpf_security_struct *bpfsec = map->security;
+
+   return avc_has_perm(current_sid(), bpfsec->sid, SECCLASS_BPF,
+   BPF__MAP_READ, NULL);
+}
+
+static int selinux_bpf_prog_load(void)
+{
+   u32 sid = current_sid();
+
+   return avc_has_perm(sid, sid, SECCLASS_BPF, BPF__PROG_LOAD, NULL);
+}
+
+static int selinux_bpf_post_create(struct bpf_map *map)
+{
+   struct bpf_security_struct *bpfsec;
+
+   bpfsec = kzalloc(sizeof(*bpfsec), GFP_KERNEL);
+   if (!bpfsec)
+   return -ENOMEM;
+
+   bpfsec->sid = current_sid();
+   map->security = bpfsec;
+
+   return 0;
+}
+#endif
+
 static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
LSM_HOOK_INIT(binder_set_context_mgr, selinux_binder_set_context_mgr),
LSM_HOOK_INIT(binder_transaction, selinux_binder_transaction),
@@ -6465,6 +6512,13 @@ static struct security_hook_list selinux_hooks[] 
__lsm_ro_after_init = {
LSM_HOOK_INIT(audit_rule_match, selinux_audit_rule_match),
LSM_HOOK_INIT(audit_rule_free, selinux_audit_rule_free),
 #endif
+#ifdef CONFIG_BPF_SYSCALL
+   LSM_HOOK_INIT(bpf_map_create, selinux_bpf_map_create),
+   LSM_HOOK_INIT(bpf_map_modify, selinux_bpf_map_modify),
+   LSM_HOOK_INIT(bpf_map_read, selinux_bpf_map_read),
+   LSM_HOOK_INIT(bpf_prog_load, selinux_bpf_prog_load),
+   LSM_HOOK_INIT(bpf_post_create, selinux_bpf_post_create),
+#endif
 };
 
 static __init int selinux_init(void)
diff --git a/security/selinux/include/classmap.h 
b/security/selinux/include/classmap.h
index b9fe3434b036..83c880fb17b4 100644
--- a/security/selinux/include/classmap.h
+++ b/security/selinux/include/classmap.h
@@ -235,6 +235,8 @@ struct security_class_mapping secclass_map[] = {
  { "access", NULL } },
{ "infiniband_endport",
  { "manage_subnet", NULL } },
+   { "bpf",
+ {"map_create", "map_modify", "map_read", "prog_load" } },
{ NULL }
   };
 
diff --git a/security/selinux/include/objsec.h 
b/security/selinux/include/objsec.h
index 6ebc61e370ff..ba564f662b0d 100644
--- a/security/selinux/include/objsec.h
+++ b/security/selinux/include/objsec.h
@@ -150,6 +150,10 @@ struct pkey_security_struct {
u32 sid;/* SID of pkey */
 };
 
+struct bpf_security_struct {
+   u32 sid;/* SID of bpf obj creator */
+};
+
 extern unsigned int selinux_checkreqprot;
 
 #endif /* _SELINUX_OBJSEC_H_ */
-- 
2.14.1.581.gf28d330327-goog



[PATCH 2/3] security: bpf: Add eBPF LSM hooks and security field to eBPF map

2017-08-31 Thread Chenbo Feng
From: Chenbo Feng 

Introduce a pointer into struct bpf_map to hold the security information
about the map. The actual security struct varies based on the security
models implemented. Place the LSM hooks before each of the unrestricted
eBPF operations: the map_update_elem and map_delete_elem operations are
checked by security_map_modify. The map_lookup_elem and map_get_next_key
operations are checked by security_map_read.
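
Patch 1/3, which defines these hooks, is not quoted in this thread; the
prototypes below are a sketch inferred from the call sites in this patch
(the !CONFIG_SECURITY no-op variants are an assumption, following the
usual LSM stub pattern):

#ifdef CONFIG_SECURITY
int security_map_create(void);
int security_post_create(struct bpf_map *map);
int security_map_read(struct bpf_map *map);
int security_map_modify(struct bpf_map *map);
int security_prog_load(void);
#else
static inline int security_map_create(void) { return 0; }
static inline int security_post_create(struct bpf_map *map) { return 0; }
static inline int security_map_read(struct bpf_map *map) { return 0; }
static inline int security_map_modify(struct bpf_map *map) { return 0; }
static inline int security_prog_load(void) { return 0; }
#endif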

Signed-off-by: Chenbo Feng 
---
 include/linux/bpf.h  |  3 +++
 kernel/bpf/syscall.c | 28 
 2 files changed, 31 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b69e7a5869ff..ca3e6ff7091d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -53,6 +53,9 @@ struct bpf_map {
struct work_struct work;
atomic_t usercnt;
struct bpf_map *inner_map_meta;
+#ifdef CONFIG_SECURITY
+   void *security;
+#endif
 };
 
 /* function argument constraints */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 045646da97cc..b15580bcf3b1 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -279,6 +279,10 @@ static int map_create(union bpf_attr *attr)
if (err)
return -EINVAL;
 
+   err = security_map_create();
+   if (err)
+   return -EACCES;
+
/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
map = find_and_alloc_map(attr);
if (IS_ERR(map))
@@ -291,6 +295,10 @@ static int map_create(union bpf_attr *attr)
if (err)
goto free_map_nouncharge;
 
+   err = security_post_create(map);
+   if (err < 0)
+   goto free_map;
+
err = bpf_map_alloc_id(map);
if (err)
goto free_map;
@@ -410,6 +418,10 @@ static int map_lookup_elem(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
 
+   err = security_map_read(map);
+   if (err)
+   return -EACCES;
+
key = memdup_user(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
@@ -490,6 +502,10 @@ static int map_update_elem(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
 
+   err = security_map_modify(map);
+   if (err)
+   return -EACCES;
+
key = memdup_user(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
@@ -573,6 +589,10 @@ static int map_delete_elem(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
 
+   err = security_map_modify(map);
+   if (err)
+   return -EACCES;
+
key = memdup_user(ukey, map->key_size);
if (IS_ERR(key)) {
err = PTR_ERR(key);
@@ -616,6 +636,10 @@ static int map_get_next_key(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
 
+   err = security_map_read(map);
+   if (err)
+   return -EACCES;
+
if (ukey) {
key = memdup_user(ukey, map->key_size);
if (IS_ERR(key)) {
@@ -935,6 +959,10 @@ static int bpf_prog_load(union bpf_attr *attr)
if (CHECK_ATTR(BPF_PROG_LOAD))
return -EINVAL;
 
+   err = security_prog_load();
+   if (err)
+   return -EACCES;
+
if (attr->prog_flags & ~BPF_F_STRICT_ALIGNMENT)
return -EINVAL;
 
-- 
2.14.1.581.gf28d330327-goog


