Re: tip -ENOBOOT - bisected to locking/refcounts, x86/asm: Implement fast refcount overflow protection
On Thu, 2017-08-31 at 11:45 -0700, Kees Cook wrote: > On Thu, Aug 31, 2017 at 10:19 AM, Mike Galbraith wrote: > > On Thu, 2017-08-31 at 10:00 -0700, Kees Cook wrote: > >> > >> Oh! So it's gcc-version sensitive? That's alarming. Is this mapping > >> correct: > >> > >> 4.8.5: WARN, eventual kernel hang > >> 6.3.1, 7.0.1: WARN, but continues working > > > > Yeah, that's correct. I find that troubling, simply because this gcc > > version has been through one hell of a lot of kernels with me. Yeah, I > > know, that doesn't exempt it from having bugs, but color me suspicious. > > I still can't hit this with a 4.8.5 build. :( > > With _RATELIMIT removed, this should, in theory, report whatever goes > negative first... I applied the other patch you posted, and built with gcc-6.3.1 to remove the gcc-4.8.5 aspect. Look below the resulting splat. [1.293962] NET: Registered protocol family 10 [1.294635] refcount_t silent saturation at in6_dev_get+0x25/0x104 in swapper/0[1], uid/euid: 0/0 [1.295616] [ cut here ] [1.296120] WARNING: CPU: 0 PID: 1 at kernel/panic.c:612 refcount_error_report+0x94/0x9e [1.296950] Modules linked in: [1.297276] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.13.0.g152d54a-tip-default #53 [1.299179] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.0.0-prebuilt.qemu-project.org 04/01/2014 [1.300743] task: 88013ab84040 task.stack: c962c000 [1.301825] RIP: 0010:refcount_error_report+0x94/0x9e [1.302804] RSP: 0018:c962fc10 EFLAGS: 00010282 [1.303791] RAX: 0055 RBX: 81a34274 RCX: 81c605e8 [1.304991] RDX: 0001 RSI: 0096 RDI: 0246 [1.306189] RBP: c962fd58 R08: R09: 0175 [1.307392] R10: R11: 0001 R12: 88013ab84040 [1.308583] R13: R14: 0004 R15: 81a256c8 [1.309768] FS: () GS:88013fc0() knlGS: [1.311052] CS: 0010 DS: ES: CR0: 80050033 [1.312100] CR2: 7f4631fe8df0 CR3: 000137d09003 CR4: 001606f0 [1.313301] Call Trace: [1.314012] ex_handler_refcount+0x63/0x70 [1.314893] fixup_exception+0x32/0x40 [1.315737] do_trap+0x8c/0x170 [1.316519] 
do_error_trap+0x70/0xd0 [1.317340] ? in6_dev_get+0x23/0x104 [1.318172] ? netlink_broadcast_filtered+0x2bd/0x430 [1.319156] ? kmem_cache_alloc_trace+0xce/0x5d0 [1.320098] ? set_debug_rodata+0x11/0x11 [1.320964] invalid_op+0x1e/0x30 [1.322520] RIP: 0010:in6_dev_get+0x25/0x104 [1.323631] RSP: 0018:c962fe00 EFLAGS: 00010202 [1.324614] RAX: 880137de2400 RBX: 880137df4600 RCX: 880137de24f0 [1.325793] RDX: 88013a5e4000 RSI: fe00 RDI: 88013a5e4000 [1.326964] RBP: 00d1 R08: R09: 880137de7600 [1.328150] R10: R11: 8801398a4df8 R12: [1.329374] R13: 82137872 R14: 014200ca R15: [1.330547] ? set_debug_rodata+0x11/0x11 [1.331392] ip6_route_init_special_entries+0x2a/0x89 [1.332369] addrconf_init+0x9e/0x203 [1.333173] inet6_init+0x1af/0x365 [1.333956] ? af_unix_init+0x4e/0x4e [1.334753] do_one_initcall+0x4e/0x190 [1.33] ? set_debug_rodata+0x11/0x11 [1.336369] kernel_init_freeable+0x189/0x20e [1.337230] ? rest_init+0xd0/0xd0 [1.337999] kernel_init+0xa/0xf7 [1.338744] ret_from_fork+0x25/0x30 [1.339500] Code: 48 8b 95 80 00 00 00 41 55 49 8d 8c 24 f0 0a 00 00 45 8b 84 24 10 09 00 00 41 89 c1 48 89 de 48 c7 c7 60 7a a3 81 e8 07 de 05 00 <0f> ff 58 5b 5d 41 5c 41 5d c3 0f 1f 44 00 00 55 48 89 e5 41 56 [1.342243] ---[ end trace b5d40c0fccce776c ]--- Back yours out, and... # tracer: nop # # _-=> irqs-off # / _=> need-resched #| / _---=> hardirq/softirq #|| / _--=> preempt-depth #||| / delay # TASK-PID CPU# TIMESTAMP FUNCTION # | | | | | swapper/0-1 [000] ...1 1.974114: in6_dev_getx: refs.counter:-1073741824 swapper/0-1 [000] ...1 1.974116: in6_dev_getx: refs.counter:-1073741824 --- arch/x86/include/asm/refcount.h |9 + include/net/addrconf.h | 12 net/ipv6/route.c|4 ++-- 3 files changed, 23 insertions(+), 2 deletions(-) --- a/arch/x86/include/asm/refcount.h +++ b/arch/x86/include/asm/refcount.h @@ -55,6 +55,15 @@ static __always_inline void refcount_inc : : "cc", "cx"); } +static __always_inline void refcount_inc_x(refcount_t *r) +{ + trace_printk("refs.counter:%d\n", r->refs.counter); + asm
Re: netdev watchdog enp1s0 (tg3): transmit queue 0 timed out
Dear Michael and Siva, On Thu, 2017-08-31 at 23:36 -0700, Michael Chan wrote: > On Thu, Aug 31, 2017 at 11:10 PM, Frans van Berckel > wrote: > > > > a long list of likely the same type of error codes. > > > > Please post the entire register dump. [ 237.169194] tg3 :01:00.0 enp1s0: Link is up at 1000 Mbps, full duplex [ 237.169335] tg3 :01:00.0 enp1s0: Flow control is on for TX and on for RX [ 237.169375] IPv6: ADDRCONF(NETDEV_CHANGE): enp1s0: link becomes ready [ 243.683910] tg3 :01:00.0 enp1s0: DMA Status error. Resetting chip. [ 243.759610] hrtimer: interrupt took 9464192 ns [ 245.317566] tg3 :01:00.0 enp1s0: 0x: 0x165914e4, 0x00100406, 0x0221, 0x0010 [ 245.476295] tg3 :01:00.0 enp1s0: 0x0010: 0xefcf0004, 0x, 0x, 0x [ 245.476295] tg3 :01:00.0 enp1s0: 0x0020: 0x, 0x, 0x, 0x01eb1028 [ 245.476295] tg3 :01:00.0 enp1s0: 0x0030: 0x, 0x0048, 0x, 0x0105 [ 245.476295] tg3 :01:00.0 enp1s0: 0x0040: 0x, 0x, 0xc0025001, 0x64002000 [ 245.476295] tg3 :01:00.0 enp1s0: 0x0050: 0x00fc5803, 0x80818283, 0x0087d005, 0xfee0300c [ 245.476295] tg3 :01:00.0 enp1s0: 0x0060: 0x, 0x4122, 0x42010298, 0x7618 [ 245.476295] tg3 :01:00.0 enp1s0: 0x0070: 0x10f2, 0x00a0, 0x, 0x [ 245.476295] tg3 :01:00.0 enp1s0: 0x0080: 0x165914e4, 0x3c1d0002, 0x04130034, 0x3c085082 [ 245.476295] tg3 :01:00.0 enp1s0: 0x0090: 0x01009509, 0x, 0x, 0x00ce [ 245.476295] tg3 :01:00.0 enp1s0: 0x00a0: 0x, 0x0006, 0x, 0x000c [ 245.476295] tg3 :01:00.0 enp1s0: 0x00b0: 0x, 0x0001, 0x, 0x0001 [ 245.476295] tg3 :01:00.0 enp1s0: 0x00c0: 0x, 0x, 0x000e, 0x [ 245.476295] tg3 :01:00.0 enp1s0: 0x00d0: 0x00010010, 0x0fa0, 0x00122104, 0x00036c11 [ 245.476295] tg3 :01:00.0 enp1s0: 0x00e0: 0x1011, 0x, 0x, 0x [ 245.476295] tg3 :01:00.0 enp1s0: 0x0100: 0x13c10001, 0x00014000, 0x0011c000, 0x000e3010 [ 245.476295] tg3 :01:00.0 enp1s0: 0x0110: 0x, 0x11c1, 0x00a0, 0x [ 245.476295] tg3 :01:00.0 enp1s0: 0x0130: 0x, 0x, 0x, 0x16010002 [ 245.476295] tg3 :01:00.0 enp1s0: 0x0150: 0x80ff, 0x, 0x, 0x [ 245.476295] tg3 :01:00.0 enp1s0: 0x0160: 
0x16c10003, 0xfe3b2547, 0x001ec9ff, 0x00010004 [ 245.476295] tg3 :01:00.0 enp1s0: 0x0170: 0x, 0x0007810e, 0x0001, 0x2c0c3c0e [ 245.476295] tg3 :01:00.0 enp1s0: 0x0180: 0x3f062304, 0x, 0x, 0x [ 245.476295] tg3 :01:00.0 enp1s0: 0x0200: 0x, 0x0001, 0x, 0x0001 [ 245.476295] tg3 :01:00.0 enp1s0: 0x0210: 0x, 0x0001, 0x, 0x0001 [ 245.476295] tg3 :01:00.0 enp1s0: 0x0220: 0x, 0x0001, 0x, 0x0001 [ 245.476295] tg3 :01:00.0 enp1s0: 0x0230: 0x, 0x0001, 0x, 0x0001 [ 245.476295] tg3 :01:00.0 enp1s0: 0x0240: 0x, 0x0001, 0x, 0x0001 [ 245.476295] tg3 :01:00.0 enp1s0: 0x0250: 0x, 0x0001, 0x, 0x0001 [ 245.476295] tg3 :01:00.0 enp1s0: 0x0260: 0x, 0x0001, 0x, 0x00ce [ 245.476295] tg3 :01:00.0 enp1s0: 0x0270: 0x, 0x0001, 0x, 0x0001 [ 245.476295] tg3 :01:00.0 enp1s0: 0x0280: 0x, 0x0006, 0x, 0x0006 [ 245.476295] tg3 :01:00.0 enp1s0: 0x0290: 0x, 0x0006, 0x, 0x0006 [ 245.476295] tg3 :01:00.0 enp1s0: 0x02a0: 0x, 0x0006, 0x, 0x0006 [ 245.476295] tg3 :01:00.0 enp1s0: 0x02b0: 0x, 0x0006, 0x, 0x0006 [ 245.476295] tg3 :01:00.0 enp1s0: 0x02c0: 0x, 0x0006, 0x, 0x0006 [ 245.476295] tg3 :01:00.0 enp1s0: 0x02d0: 0x, 0x0006, 0x, 0x0006 [ 245.476295] tg3 :01:00.0 enp1s0: 0x02e0: 0x, 0x0006, 0x, 0x0006 [ 245.476295] tg3 :01:00.0 enp1s0: 0x02f0: 0x, 0x0006, 0x, 0x0006 [ 245.476295] tg3 :01:00.0 enp1s0: 0x0300: 0x, 0x000f, 0x, 0x000f [ 245.476295] tg3 :01:00.0 enp1s0: 0x0310: 0x, 0x000f, 0x, 0x000f [ 245.476295] tg3 :01:00.0 enp1s0: 0x0320: 0x, 0x000f, 0x, 0x000f [ 245.476295] tg3 :01:00.0 enp1s0: 0x0330: 0x, 0x000f, 0x, 0x000f [ 245.476295] tg3 :01:00.0 enp1s0: 0x0340: 0x, 0x000f, 0x, 0x000f [ 245.476295] tg3 :01:00.0 enp1s0: 0x0350: 0x, 0x000f, 0x, 0x000f [ 245.476295] tg3 :01:00.0 en
Re: netdev watchdog enp1s0 (tg3): transmit queue 0 timed out
On Thu, Aug 31, 2017 at 11:10 PM, Frans van Berckel wrote: > Dear NetDev Team, > > I am new to this machine. On a marketplace website I > bought a Dell PowerEdge sc1435. Booting a today's Fedora (or even a > Debian) amd64 Live CD from usb, and goes all fine. > > [0.00] Linux version 4.11.8-300.fc26.x86_64 (mockbuild@bkernel0 > 2.phx2.fedoraproject.org) (gcc version 7.1.1 20170622 (Red Hat 7.1.1- > 3) (GCC) ) #1 SMP Thu Jun 29 20:09:48 UTC 2017 > > Until .. I plunged in the ethernet cable for the first time. I have got > console output, what frightens me a bit. It's about the driver for > bcm95721. This kernel does a DMA Status error. And next it's calling a > watchdog for enp1s0 (tg3): transmit queue 0 timed out. > > [ 237.169194] tg3 :01:00.0 enp1s0: Link is up at 1000 Mbps, full > du > plex > [ 237.169335] tg3 :01:00.0 enp1s0: Flow control is on for TX > and > on for RX > [ 237.169375] IPv6: ADDRCONF(NETDEV_CHANGE): enp1s0: link > becomes > ready > [ 243.683910] tg3 :01:00.0 enp1s0: DMA Status > error. Resetting > chip. > [ 243.759610] hrtimer: interrupt took 9464192 ns > [ 245.317566] tg3 :01:00.0 enp1s0: 0x: 0x165914e4, > 0x00100406, 0x0221, 0x0010 > > a long list of likely the same type of error codes. > Please post the entire register dump.
[PATCH net-next 3/3] bpf: Only set node->ref = 1 if it has not been set
This patch writes 'node->ref = 1' only if node->ref is 0. The number of lookups/s for a ~1M entries LRU map increased by ~30% (260097 to 343313). Other writes on 'node->ref = 0' is not changed. In those cases, the same cache line has to be changed anyway. First column: Size of the LRU hash Second column: Number of lookups/s Before: > echo "$((2**20+1)): $(./map_perf_test 1024 1 $((2**20+1)) 1000 | awk > '{print $3}')" 1048577: 260097 After: > echo "$((2**20+1)): $(./map_perf_test 1024 1 $((2**20+1)) 1000 | awk > '{print $3}')" 1048577: 343313 Signed-off-by: Martin KaFai Lau --- kernel/bpf/bpf_lru_list.h | 3 ++- kernel/bpf/hashtab.c | 7 ++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h index 5c35a98d02bf..7d4f89b7cb84 100644 --- a/kernel/bpf/bpf_lru_list.h +++ b/kernel/bpf/bpf_lru_list.h @@ -69,7 +69,8 @@ static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node) /* ref is an approximation on access frequency. It does not * have to be very accurate. Hence, no protection is used. */ - node->ref = 1; + if (!node->ref) + node->ref = 1; } int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset, diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 682f4543fefa..431126f31ea3 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -519,9 +519,14 @@ static u32 htab_lru_map_gen_lookup(struct bpf_map *map, { struct bpf_insn *insn = insn_buf; const int ret = BPF_REG_0; + const int ref_reg = BPF_REG_1; *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem); - *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2); + *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4); + *insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret, + offsetof(struct htab_elem, lru_node) + + offsetof(struct bpf_lru_node, ref)); + *insn++ = BPF_JMP_IMM(BPF_JNE, ref_reg, 0, 1); *insn++ = BPF_ST_MEM(BPF_B, ret, offsetof(struct htab_elem, lru_node) + offsetof(struct bpf_lru_node, ref), -- 2.9.5
[PATCH net-next 0/3] bpf: Improve LRU map lookup performance
This patchset improves the lookup performance of the LRU map. Please see individual patch for details. Martin KaFai Lau (3): bpf: Add lru_hash_lookup performance test bpf: Inline LRU map lookup bpf: Only set node->ref = 1 if it has not been set kernel/bpf/bpf_lru_list.h| 3 +- kernel/bpf/hashtab.c | 24 + samples/bpf/map_perf_test_kern.c | 44 +++ samples/bpf/map_perf_test_user.c | 77 ++-- 4 files changed, 138 insertions(+), 10 deletions(-) -- 2.9.5
[PATCH net-next 2/3] bpf: Inline LRU map lookup
Inline the lru map lookup to save the cost in making calls to bpf_map_lookup_elem() and htab_lru_map_lookup_elem(). Different LRU hash size is tested. The benefit diminishes when the cache miss starts to dominate in the bigger LRU hash. Considering the change is simple, it is still worth to optimize. First column: Size of the LRU hash Second column: Number of lookups/s Before: > for i in $(seq 9 20); do echo "$((2**i+1)): $(./map_perf_test 1024 1 > $((2**i+1)) 1000 | awk '{print $3}')"; done 513: 1132020 1025: 1056826 2049: 1007024 4097: 853298 8193: 742723 16385: 712600 32769: 688142 65537: 677028 131073: 619437 262145: 498770 524289: 316695 1048577: 260038 After: > for i in $(seq 9 20); do echo "$((2**i+1)): $(./map_perf_test 1024 1 > $((2**i+1)) 1000 | awk '{print $3}')"; done 513: 1221851 1025: 1144695 2049: 1049902 4097: 884460 8193: 773731 16385: 729673 32769: 721989 65537: 715530 131073: 671665 262145: 516987 524289: 321125 1048577: 260048 Signed-off-by: Martin KaFai Lau --- kernel/bpf/hashtab.c | 19 +++ 1 file changed, 19 insertions(+) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index d246905f2bb1..682f4543fefa 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -514,6 +514,24 @@ static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +static u32 htab_lru_map_gen_lookup(struct bpf_map *map, + struct bpf_insn *insn_buf) +{ + struct bpf_insn *insn = insn_buf; + const int ret = BPF_REG_0; + + *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem); + *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2); + *insn++ = BPF_ST_MEM(BPF_B, ret, +offsetof(struct htab_elem, lru_node) + +offsetof(struct bpf_lru_node, ref), +1); + *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, + offsetof(struct htab_elem, key) + + round_up(map->key_size, 8)); + return insn - insn_buf; +} + /* It is called from the bpf_lru_list when the LRU needs to delete * older elements from the htab. 
*/ @@ -1137,6 +1155,7 @@ const struct bpf_map_ops htab_lru_map_ops = { .map_lookup_elem = htab_lru_map_lookup_elem, .map_update_elem = htab_lru_map_update_elem, .map_delete_elem = htab_lru_map_delete_elem, + .map_gen_lookup = htab_lru_map_gen_lookup, }; /* Called from eBPF program */ -- 2.9.5
[PATCH net-next 1/3] bpf: Add lru_hash_lookup performance test
Create a new case to test the LRU lookup performance. At the beginning, the LRU map is fully loaded (i.e. the number of keys is equal to map->max_entries). The lookup is done through key 0 to num_map_entries and then repeats from 0 again. This patch also creates an anonymous struct to properly name the test params in stress_lru_hmap_alloc() in map_perf_test_kern.c. Signed-off-by: Martin KaFai Lau --- samples/bpf/map_perf_test_kern.c | 44 +++ samples/bpf/map_perf_test_user.c | 77 ++-- 2 files changed, 112 insertions(+), 9 deletions(-) diff --git a/samples/bpf/map_perf_test_kern.c b/samples/bpf/map_perf_test_kern.c index ca3b22ed577a..098c857f1eda 100644 --- a/samples/bpf/map_perf_test_kern.c +++ b/samples/bpf/map_perf_test_kern.c @@ -88,6 +88,13 @@ struct bpf_map_def SEC("maps") array_map = { .max_entries = MAX_ENTRIES, }; +struct bpf_map_def SEC("maps") lru_hash_lookup_map = { + .type = BPF_MAP_TYPE_LRU_HASH, + .key_size = sizeof(u32), + .value_size = sizeof(long), + .max_entries = MAX_ENTRIES, +}; + SEC("kprobe/sys_getuid") int stress_hmap(struct pt_regs *ctx) { @@ -148,12 +155,23 @@ int stress_percpu_hmap_alloc(struct pt_regs *ctx) SEC("kprobe/sys_connect") int stress_lru_hmap_alloc(struct pt_regs *ctx) { + char fmt[] = "Failed at stress_lru_hmap_alloc. ret:%dn"; + union { + u16 dst6[8]; + struct { + u16 magic0; + u16 magic1; + u16 tcase; + u16 unused16; + u32 unused32; + u32 key; + }; + } test_params; struct sockaddr_in6 *in6; - u16 test_case, dst6[8]; + u16 test_case; int addrlen, ret; - char fmt[] = "Failed at stress_lru_hmap_alloc. 
ret:%d\n"; long val = 1; - u32 key = bpf_get_prandom_u32(); + u32 key = 0; in6 = (struct sockaddr_in6 *)PT_REGS_PARM2(ctx); addrlen = (int)PT_REGS_PARM3(ctx); @@ -161,14 +179,18 @@ int stress_lru_hmap_alloc(struct pt_regs *ctx) if (addrlen != sizeof(*in6)) return 0; - ret = bpf_probe_read(dst6, sizeof(dst6), &in6->sin6_addr); + ret = bpf_probe_read(test_params.dst6, sizeof(test_params.dst6), +&in6->sin6_addr); if (ret) goto done; - if (dst6[0] != 0xdead || dst6[1] != 0xbeef) + if (test_params.magic0 != 0xdead || + test_params.magic1 != 0xbeef) return 0; - test_case = dst6[7]; + test_case = test_params.tcase; + if (test_case != 3) + key = bpf_get_prandom_u32(); if (test_case == 0) { ret = bpf_map_update_elem(&lru_hash_map, &key, &val, BPF_ANY); @@ -188,6 +210,16 @@ int stress_lru_hmap_alloc(struct pt_regs *ctx) ret = bpf_map_update_elem(nolocal_lru_map, &key, &val, BPF_ANY); + } else if (test_case == 3) { + u32 i; + + key = test_params.key; + +#pragma clang loop unroll(full) + for (i = 0; i < 32; i++) { + bpf_map_lookup_elem(&lru_hash_lookup_map, &key); + key++; + } } else { ret = -EINVAL; } diff --git a/samples/bpf/map_perf_test_user.c b/samples/bpf/map_perf_test_user.c index bccbf8478e43..f388254896f6 100644 --- a/samples/bpf/map_perf_test_user.c +++ b/samples/bpf/map_perf_test_user.c @@ -46,6 +46,7 @@ enum test_type { HASH_LOOKUP, ARRAY_LOOKUP, INNER_LRU_HASH_PREALLOC, + LRU_HASH_LOOKUP, NR_TESTS, }; @@ -60,6 +61,7 @@ const char *test_map_names[NR_TESTS] = { [HASH_LOOKUP] = "hash_map", [ARRAY_LOOKUP] = "array_map", [INNER_LRU_HASH_PREALLOC] = "inner_lru_hash_map", + [LRU_HASH_LOOKUP] = "lru_hash_lookup_map", }; static int test_flags = ~0; @@ -67,6 +69,8 @@ static uint32_t num_map_entries; static uint32_t inner_lru_hash_size; static int inner_lru_hash_idx = -1; static int array_of_lru_hashs_idx = -1; +static int lru_hash_lookup_idx = -1; +static int lru_hash_lookup_test_entries = 32; static uint32_t max_cnt = 100; static int check_test_flags(enum test_type t) @@ 
-86,6 +90,32 @@ static void test_hash_prealloc(int cpu) cpu, max_cnt * 10ll / (time_get_ns() - start_time)); } +static int pre_test_lru_hash_lookup(int tasks) +{ + int fd = map_fd[lru_hash_lookup_idx]; + uint32_t key; + long val = 1; + int ret; + + if (num_map_entries > lru_hash_lookup_test_entries) + lru_hash_lookup_test_entries = num_map_entries; + + /* Populate the lru_hash_map for LRU_HASH_LOOKUP perf test. +* +* It is fine that the user requests for a map with +* num_map_entries < 3
netdev watchdog enp1s0 (tg3): transmit queue 0 timed out
Dear NetDev Team, I am new to this machine. On a marketplace website I bought a Dell PowerEdge sc1435. Booting a today's Fedora (or even a Debian) amd64 Live CD from usb, and goes all fine. [0.00] Linux version 4.11.8-300.fc26.x86_64 (mockbuild@bkernel0 2.phx2.fedoraproject.org) (gcc version 7.1.1 20170622 (Red Hat 7.1.1- 3) (GCC) ) #1 SMP Thu Jun 29 20:09:48 UTC 2017 Until .. I plugged in the ethernet cable for the first time. I have got console output, what frightens me a bit. It's about the driver for bcm95721. This kernel does a DMA Status error. And next it's calling a watchdog for enp1s0 (tg3): transmit queue 0 timed out. [ 237.169194] tg3 :01:00.0 enp1s0: Link is up at 1000 Mbps, full du plex [ 237.169335] tg3 :01:00.0 enp1s0: Flow control is on for TX and on for RX [ 237.169375] IPv6: ADDRCONF(NETDEV_CHANGE): enp1s0: link becomes ready [ 243.683910] tg3 :01:00.0 enp1s0: DMA Status error. Resetting chip. [ 243.759610] hrtimer: interrupt took 9464192 ns [ 245.317566] tg3 :01:00.0 enp1s0: 0x: 0x165914e4, 0x00100406, 0x0221, 0x0010 a long list of likely the same type of error codes. 
[ 245.760055] tg3 :01:00.0 enp1s0: 0x7810: 0x, 0x0060, 0x000d, 0x [ 245.762065] tg3 :01:00.0 enp1s0: 0: Host status block [0005:0016:(:0007:):(0007:000c)] [ 245.764128] tg3 :01:00.0 enp1s0: 0: NAPI info [0015:0015:(000f:000c:01ff):0006:(00ce:::)] [ 245.867391] tg3 :01:00.0: tg3_stop_block timed out, ofs=2c00 enable_bit=2 [ 245.971098] tg3 :01:00.0: tg3_stop_block timed out, ofs=4800 enable_bit=2 [ 245.993343] tg3 :01:00.0 enp1s0: Link is down [ 249.731158] tg3 :01:00.0 enp1s0: Link is up at 1000 Mbps, full duplex [ 249.731336] tg3 :01:00.0 enp1s0: Flow control is on for TX and on for RX [ 254.944022] [ cut here ] [ 254.945010] WARNING: CPU: 1 PID: 0 at net/sched/sch_generic.c:316 dev_watchdog+0x221/0x230 [ 254.945010] NETDEV WATCHDOG: enp1s0 (tg3): transmit queue 0 timed out [ 254.945010] Modules linked in: xt_CHECKSUM ipt_MASQUERADE nf_nat_masquerade_ipv4 tun fuse nf_conntrack_netbios_ns nf_conntrack_broadcast xt_CT ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 xt_conntrack ip_set nfnetlink ebtable_nat ebtable_broute bridge ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_mangle ip6table_raw ip6table_security iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack libcrc32c iptable_mangle iptable_raw iptable_security ebtable_filter ebtables ip6table_filter ip6_tables powernow_k8 amd64_edac_mod edac_mce_amd edac_core kvm_amd kvm irqbypass amdkfd amd_iommu_v2 dcdbas radeon k8temp ipmi_ssif i2c_algo_bit ttm drm_kms_helper drm ipmi_si ipmi_devintf tpm_tis ipmi_msghandler tpm_tis_core tpm i2c_piix4 shpchp nls_utf8 isofs squashfs ata_generic [ 254.945010] pata_acpi uas usb_storage 8021q garp stp llc mrp serio_raw tg3 sata_sil24 ptp pps_core pata_serverworks sunrpc scsi_transport_iscsi loop [ 254.945010] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.11.8- 300.fc26.x86_64 #1 [ 254.945010] Hardware name: Dell Inc. 
PowerEdge SC1435/0YR707, BIOS 2.2.5 03/21/2008 [ 254.945010] Call Trace: [ 254.945010] [ 254.945010] dump_stack+0x63/0x84 [ 254.945010] __warn+0xcb/0xf0 [ 254.945010] warn_slowpath_fmt+0x5a/0x80 [ 254.945010] dev_watchdog+0x221/0x230 [ 254.945010] ? qdisc_rcu_free+0x50/0x50 [ 254.945010] call_timer_fn+0x33/0x130 [ 254.945010] run_timer_softirq+0x3ee/0x440 [ 254.945010] ? ktime_get+0x40/0xb0 [ 254.945010] ? lapic_next_event+0x1d/0x30 [ 254.945010] __do_softirq+0xea/0x2e3 [ 254.945010] irq_exit+0xfb/0x100 [ 254.945010] smp_apic_timer_interrupt+0x3d/0x50 [ 254.945010] apic_timer_interrupt+0x93/0xa0 [ 254.945010] RIP: 0010:native_safe_halt+0x6/0x10 [ 254.945010] RSP: 0018:babf4038be60 EFLAGS: 0246 ORIG_RAX: ff10 [ 254.945010] RAX: 687475412d40 RBX: 926d39d04880 RCX: [ 254.945010] RDX: RSI: RDI: [ 254.945010] RBP: babf4038be60 R08: 926d3d052ae0 R09: [ 254.945010] R10: R11: R12: 0001 [ 254.945010] R13: 926d39d04880 R14: R15: [ 254.945010] [ 254.945010] default_idle+0x20/0xe0 [ 254.945010] amd_e400_idle+0x3f/0x50 [ 254.945010] arch_cpu_idle+0xf/0x20 [ 254.945010] default_idle_call+0x23/0x30 [ 254.945010] do_idle+0x170/0x200 [ 254.945010] cpu_startup_entry+0x71/0x80 [ 254.945010] start_secondary+0x154/0x190 [ 254.945010] start_cpu+0x14/0x14 [ 255.080137] ---[ end trace 25a535e6d8610c90 ]--- [ 255.080137] tg3 :01:00.0 enp1s0: transmit timed out, resetting And resetting is what 'he' does again and again. Until you take out the UTP cable. This counts for the second ethernet port as well. Is there something with the machine bios or a setting that i ca
Re: [PATCH 2/3] security: bpf: Add eBPF LSM hooks and security field to eBPF map
On Thu, Aug 31, 2017 at 7:05 PM, Alexei Starovoitov wrote: > On Thu, Aug 31, 2017 at 01:56:34PM -0700, Chenbo Feng wrote: >> From: Chenbo Feng >> >> Introduce a pointer into struct bpf_map to hold the security information >> about the map. The actual security struct varies based on the security >> models implemented. Place the LSM hooks before each of the unrestricted >> eBPF operations, the map_update_elem and map_delete_elem operations are >> checked by security_map_modify. The map_lookup_elem and map_get_next_key >> operations are checked by securtiy_map_read. >> >> Signed-off-by: Chenbo Feng > > ... > >> @@ -410,6 +418,10 @@ static int map_lookup_elem(union bpf_attr *attr) >> if (IS_ERR(map)) >> return PTR_ERR(map); >> >> + err = security_map_read(map); >> + if (err) >> + return -EACCES; >> + >> key = memdup_user(ukey, map->key_size); >> if (IS_ERR(key)) { >> err = PTR_ERR(key); >> @@ -490,6 +502,10 @@ static int map_update_elem(union bpf_attr *attr) >> if (IS_ERR(map)) >> return PTR_ERR(map); >> >> + err = security_map_modify(map); > > I don't feel these extra hooks are really thought through. > With such hook you'll disallow map_update for given map. That's it. > The key/values etc won't be used in such security decision. > In such case you don't need such hooks in update/lookup at all. > Only in map_creation and object_get calls where FD can be received. > In other words I suggest to follow standard unix practices: > Do permissions checks in open() and allow read/write() if FD is valid. > Same here. Do permission checks in prog_load/map_create/obj_pin/get > and that will be enough to jail bpf subsystem. > bpf cmds that need to be fast (like lookup and update) should not > have security hooks. > I do think we want to distinguish between read/write (or read/modify) for these objects. Essentially, we want to implement the example described in patch 0/3 where eBPF objects can be passed to less privileged processes which can read, but not modify the map. 
What would be the best way to do this? Add a mode field to the bpf_map struct?
RE: netdev carrier changes is one even after ethernet link up.
Thanks for responding. Now responding inline > -Original Message- > From: Florian Fainelli [mailto:f.faine...@gmail.com] > Sent: Friday, September 01, 2017 5:53 AM > To: Bhadram Varka ; and...@lunn.ch > Cc: linux-netdev > Subject: Re: netdev carrier changes is one even after ethernet link up. > > On 08/30/2017 10:53 PM, Bhadram Varka wrote: > > Hi, > > > > > > > > I have observed that carrier_changes is one even in case of the > > ethernet link is up. > > > > > > > > After investigating the code below is my observation – > > > > > > > > ethernet_driver_probe() > > > > +--->phy_connect() > > > > | +--->phy_attach_direct() > > > > | +---> netif_carrier_off(): which increments > > carrier_changes to one. > > > > +--->register_netdevice() : will the carrier_changes becomes zero here ? > > > > +--->netif_carrier_off(): not increment the carrier_changes since > > __LINK_STATE_NOCARRIER already set. > > > > > > > > From ethernet driver open will start the PHY and trigger the > > phy_state_machine. > > > > Phy_state_machine workqueue calling netif_carrier_on() once the link is > UP. > > > > netif_carrier_on() increments the carrier_changes by one. > > If the call trace is correct, then there is at least two problems here: > > - phy_connect() does start the PHY machine which means that as soon as it > detects a link state of any kind (up or down) it can call > netif_carrier_off() respectively netif_carrier_on() > > - as soon as you call register_netdevice() notifiers run and other parts of > the > kernel or user-space programs can see an inconsistent link state > > I would suggest doing the following sequence instead: > > netif_carrier_off() > register_netdevice() > phy_connect() > > Which should result in a consistent link state and carrier value. > Yes, It will address the issue. If we did the phy_conect in ndo_open it will make the carrier changes as two. But if we did in probe function then it's not working. 
In ethernet driver probe - (below sequence is not working) phy_connect() register_netdevice() netif_carrier_off() working sequence: In probe(): register_netdevice() ndo_open: phy_connect() After reverting - https://lkml.org/lkml/2016/1/9/173 this works if we do phy_connect in probe as well. Thanks, Bhadram. > > > > > > > > After link is UP if we check the carrier_changes sysfs node - it will > > be one only. > > > > > > > > $ cat /sys/class/net/eth0/carrier_changes > > > > 1 > > > > > > > > After reverting the change - https://lkml.org/lkml/2016/1/9/173 (net: > > phy: turn carrier off on phy attach) then I could see the carrier > > changes incremented to 2 after Link UP. > > > > $ cat /sys/class/net/eth0/carrier_changes > > > > 2 > > > > > > > > Thanks, > > > > Bhadram. > > > > -- > > -- This email message is for the sole use of the intended recipient(s) > > and may contain confidential information. Any unauthorized review, > > use, disclosure or distribution is prohibited. If you are not the > > intended recipient, please contact the sender by reply email and > > destroy all copies of the original message. > > -- > > -- > > > -- > Florian
RE: netdev carrier changes is one even after ethernet link up.
Thanks for responding. -Original Message- From: Florian Fainelli [mailto:f.faine...@gmail.com] Sent: Friday, September 01, 2017 5:53 AM To: Bhadram Varka ; and...@lunn.ch Cc: linux-netdev Subject: Re: netdev carrier changes is one even after ethernet link up. On 08/30/2017 10:53 PM, Bhadram Varka wrote: > Hi, > > > > I have observed that carrier_changes is one even in case of the > ethernet link is up. > > > > After investigating the code below is my observation – > > > > ethernet_driver_probe() > > +--->phy_connect() > > | +--->phy_attach_direct() > > | +---> netif_carrier_off(): which increments > carrier_changes to one. > > +--->register_netdevice() : will the carrier_changes becomes zero here ? > > +--->netif_carrier_off(): not increment the carrier_changes since > __LINK_STATE_NOCARRIER already set. > > > > From ethernet driver open will start the PHY and trigger the > phy_state_machine. > > Phy_state_machine workqueue calling netif_carrier_on() once the link is UP. > > netif_carrier_on() increments the carrier_changes by one. If the call trace is correct, then there is at least two problems here: - phy_connect() does start the PHY machine which means that as soon as it detects a link state of any kind (up or down) it can call netif_carrier_off() respectively netif_carrier_on() - as soon as you call register_netdevice() notifiers run and other parts of the kernel or user-space programs can see an inconsistent link state I would suggest doing the following sequence instead: netif_carrier_off() register_netdevice() phy_connect() Which should result in a consistent link state and carrier value. Yes, It will address the issue. If we did the phy_conect in ndo_open it will make the carrier changes as one. But if we did in probe function then it's not working. In ethernet driver probe - (below sequence is not working) phy_connect() register_netdevice() netif_carrier_off() working sequence: In probe(): register_netdevice() ndo_open: phy_connect() Thanks, Bhadram. 
> > > > After link is UP if we check the carrier_changes sysfs node - it will > be one only. > > > > $ cat /sys/class/net/eth0/carrier_changes > > 1 > > > > After reverting the change - https://lkml.org/lkml/2016/1/9/173 (net: > phy: turn carrier off on phy attach) then I could see the carrier > changes incremented to 2 after Link UP. > > $ cat /sys/class/net/eth0/carrier_changes > > 2 > > > > Thanks, > > Bhadram. > > -- > -- This email message is for the sole use of the intended recipient(s) > and may contain confidential information. Any unauthorized review, > use, disclosure or distribution is prohibited. If you are not the > intended recipient, please contact the sender by reply email and > destroy all copies of the original message. > -- > -- -- Florian
Re: [PATCH 13/31] timer: Remove meaningless .data/.function assignments
On Thu, Aug 31, 2017 at 04:29:25PM -0700, Kees Cook wrote: > Several timer users needlessly reset their .function/.data fields during > their timer callback, but nothing else changes them. Some users do not > use their .data field at all. Each instance is removed here. > > Cc: Krzysztof Halasa > Cc: Aditya Shankar > Cc: Ganesh Krishna > Cc: Greg Kroah-Hartman > Cc: Jens Axboe > Cc: netdev@vger.kernel.org > Cc: linux-wirel...@vger.kernel.org > Cc: de...@driverdev.osuosl.org > Signed-off-by: Kees Cook > --- > drivers/block/amiflop.c | 3 +-- > drivers/net/wan/hdlc_cisco.c | 2 -- > drivers/net/wan/hdlc_fr.c | 2 -- > drivers/staging/wilc1000/wilc_wfi_cfgoperations.c | 4 +--- > 4 files changed, 2 insertions(+), 9 deletions(-) For the staging driver: Acked-by: Greg Kroah-Hartman
Re: [PATCH v3 net-next 0/7] bpf: Add option to set mark and priority in cgroup sock programs
From: David Ahern Date: Thu, 31 Aug 2017 15:05:43 -0700 > Add option to set mark and priority in addition to bound device for newly > created sockets. Also, allow the bpf programs to use the get_current_uid_gid > helper meaning socket marks, priority and device can be set based on the > uid/gid of the running process. > > Sample programs are updated to demonstrate the new options. > > v3 > - no changes to Patches 1 and 2 which Alexei acked in previous versions > - dropped change related to recursive programs in a cgroup > - updated tests per dropped patch > > v2 > - added flag to control recursive behavior as requested by Alexei > - added comment to sock_filter_func_proto regarding use of > get_current_uid_gid helper > - updated test programs for recursive option Series applied, please follow up to Daniel's feedback. Thanks.
[PATCH] net: ethernet: ibm-emac: Add 5482 PHY init for OpenBlocks 600
The vendor patches initialize those registers to get the PHY working properly. Sadly I don't have that PHY spec and whatever Broadcom PHY code we already have don't seem to document these two shadow registers (unless I miscalculated the address) so I'm keeping this as "vendor magic for that board". The vendor has long abandoned that product, but I find it handy to test ppc405 kernels and so would like to keep it alive upstream :-) Signed-off-by: Benjamin Herrenschmidt --- Note: Ideally, the whole driver should switch over to the generic PHY layer. However this is a much bigger undertaking which requires access to a bunch of HW to test, and for which I have neither the time nor the HW available these days. (Some of the HW could prove hard to find ...) --- drivers/net/ethernet/ibm/emac/phy.c | 30 ++ 1 file changed, 30 insertions(+) diff --git a/drivers/net/ethernet/ibm/emac/phy.c b/drivers/net/ethernet/ibm/emac/phy.c index 35865d05fccd..daa10de542fb 100644 --- a/drivers/net/ethernet/ibm/emac/phy.c +++ b/drivers/net/ethernet/ibm/emac/phy.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "emac.h" #include "phy.h" @@ -363,6 +364,34 @@ static struct mii_phy_def bcm5248_phy_def = { .ops= &generic_phy_ops }; +static int bcm5482_init(struct mii_phy *phy) +{ + if (!of_machine_is_compatible("plathome,obs600")) + return 0; + + /* Magic inits from vendor original patches */ + phy_write(phy, 0x1c, 0xa410); + phy_write(phy, 0x1c, 0x8804); + + return 0; +} + +static const struct mii_phy_ops bcm5482_phy_ops = { + .init = bcm5482_init, + .setup_aneg = genmii_setup_aneg, + .setup_forced = genmii_setup_forced, + .poll_link = genmii_poll_link, + .read_link = genmii_read_link +}; + +static struct mii_phy_def bcm5482_phy_def = { + + .phy_id = 0x0143bcb0, + .phy_id_mask= 0x0ff0, + .name = "BCM5482 Gigabit Ethernet", + .ops= &bcm5482_phy_ops +}; + static int m88e_init(struct mii_phy *phy) { pr_debug("%s: Marvell 88E Ethernet\n", __func__); @@ -499,6 +528,7 @@ static struct 
mii_phy_def *mii_phy_table[] = { &et1011c_phy_def, &cis8201_phy_def, &bcm5248_phy_def, + &bcm5482_phy_def, &m88e_phy_def, &m88e1112_phy_def, &ar8035_phy_def, -- 2.13.5
Re: [RFC net-next 0/8] net: dsa: Multi-queue awareness
On 08/31/2017 05:05 PM, Andrew Lunn wrote: > On Wed, Aug 30, 2017 at 05:18:44PM -0700, Florian Fainelli wrote: >> This patch series is sent as reference, especially because the last patch >> is trying not to be creating too many layer violations, but clearly there >> are a little bit being created here anyways. >> >> Essentially what I am trying to achieve is that you have a stacked device >> which >> is multi-queue aware, that applications will be using, and for which they can >> control the queue selection (using mq) the way they want. Each of each >> stacked >> network devices are created for each port of the switch (this is what DSA >> does). When a skb is submitted from say net_device X, we can derive its port >> number and look at the queue_mapping value to determine which port of the >> switch and queue we should be sending this to. The information is embedded >> in a >> tag (4 bytes) and is used by the switch to steer the transmission. >> >> These stacked devices will actually transmit using a "master" or conduit >> network device which has a number of queues as well. In one version of the >> hardware that I work with, we have up to 4 ports, each with 8 queues, and the >> master device has a total of 32 hardware queues, so a 1:1 mapping is easy. >> With >> another version of the hardware, same number of ports and queues, but only 16 >> hardware queues, so only a 2:1 mapping is possible. >> >> In order for congestion information to work properly, I need to establish a >> mapping, preferably before transmission starts (but reconfiguration while >> interfaces are running would be possible too) between these stacked device's >> queue and the conduit interface's queue. >> >> Comments, flames, rotten tomatoes, anything! > > Right, i think i understand. > > This works just for traffic between the host and ports. The host can > set the egress queue. And i assume the queues are priorities, either > absolute or weighted round robin, etc. 
> > But this has no effect on traffic going from port to port. At some > point, i expect you will want to offload TC for that. You are absolutely right, this patch series aims at having the host be able to steer traffic towards particular switch port egress queues which are configured with specific priorities. At the moment it really is mapping one priority value (in the 802.1p sense) to one queue number and let the switch scheduler figure things out. With this patch set you can now use the multiq filter of tc and do exactly what is documented under Documentation/networking/multiqueue.txt and get the desired matches to be steered towards the queue you defined. > > How will the two interact? Could the TC rules also act on traffic from > the host to a port? Would it be simpler in the long run to just > implement TC rules? I suppose that you could somehow use TC to influence how the traffic from host to CPU works, but without a "CPU" port representor the question is how do we get that done? If we used "eth0" we need to callback into the switch driver for programming.. Regarding the last patch in this series, what I would ideally to replace it with is something along the lines of: tc bind dev sw0p0 queue 0 dev eth0 queue 16 I am not sure if this is an action, or a filter, or something else... -- Florian
Re: [RFC net-next 1/8] net: dsa: Allow switch drivers to indicate number of RX/TX queues
On 08/31/2017 04:44 PM, Andrew Lunn wrote: > On Wed, Aug 30, 2017 at 05:18:45PM -0700, Florian Fainelli wrote: >> Let switch drivers indicate how many RX and TX queues they support. Some >> switches, such as Broadcom Starfighter 2 are designed with 8 egress >> queues. > > Marvell switches also have egress queues. > > Does the SF2 have ingress queues? Marvell don't as far as I know. So > I wonder if num_rx_queues is useful? At the moment probably not, since we are not doing anything useful other than creating the network devices with the indicated number of queues. > > Do switches in general have ingress queues? They do, at least the Starfighter 2 has, and from the Broadcom tag you can get such information (BRCM_EG_TC_SHIFT) and you could presumably record that queue on the SKB. I don't have a use case for that (yet?). -- Florian
[RFC] tools: selftests: psock_tpacket: skip un-supported tpacket_v3 test
The TPACKET_V3 test of PACKET_TX_RING will fail with kernel version lower than v4.11. Supported code of tx ring was add with commit id <7f953ab2ba46: af_packet: TX_RING support for TPACKET_V3> at Jan. 3 of 2017. So skip this item test instead of reporting failing for old kernels. Signed-off-by: Orson Zhai --- tools/testing/selftests/net/psock_tpacket.c | 17 ++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/net/psock_tpacket.c b/tools/testing/selftests/net/psock_tpacket.c index 7f6cd9fdacf3..f0cfc18c3726 100644 --- a/tools/testing/selftests/net/psock_tpacket.c +++ b/tools/testing/selftests/net/psock_tpacket.c @@ -57,6 +57,7 @@ #include #include #include +#include #include "psock_lib.h" @@ -676,7 +677,7 @@ static void __v3_fill(struct ring *ring, unsigned int blocks, int type) ring->flen = ring->req3.tp_block_size; } -static void setup_ring(int sock, struct ring *ring, int version, int type) +static int setup_ring(int sock, struct ring *ring, int version, int type) { int ret = 0; unsigned int blocks = 256; @@ -703,7 +704,11 @@ static void setup_ring(int sock, struct ring *ring, int version, int type) if (ret == -1) { perror("setsockopt"); - exit(1); + if (errno == EINVAL) { + printf("[SKIP] This type seems un-supported in current kernel, skipped.\n"); + return -1; + } else + exit(1); } ring->rd_len = ring->rd_num * sizeof(*ring->rd); @@ -715,6 +720,7 @@ static void setup_ring(int sock, struct ring *ring, int version, int type) total_packets = 0; total_bytes = 0; + return 0; } static void mmap_ring(int sock, struct ring *ring) @@ -830,7 +836,12 @@ static int test_tpacket(int version, int type) sock = pfsocket(version); memset(&ring, 0, sizeof(ring)); - setup_ring(sock, &ring, version, type); + if(setup_ring(sock, &ring, version, type)) { + /* skip test when error of invalid argument */ + close(sock); + return 0; + } + mmap_ring(sock, &ring); bind_ring(sock, &ring); walk_ring(sock, &ring); -- 2.12.2
Re: virtio_net: ethtool supported link modes
On 2017年09月01日 01:04, Radu Rendec wrote: Hello, Looking at the code in virtnet_set_link_ksettings, it seems the speed and duplex can be set to any valid value. The driver will "remember" them and report them back in virtnet_get_link_ksettings. However, the supported link modes (link_modes.supported in struct ethtool_link_ksettings) is always 0, indicating that no speed/duplex setting is supported. Does it make more sense to set (at least a few of) the supported link modes, such as 10baseT_Half ... 1000baseT_Full? I would expect to see consistency between what is reported in link_modes.supported and what can actually be set. Could you please share your opinion on this? I think this may make sense only if there's a hardware implementation for virtio. And we probably need to extend virtio spec for adding new commands. Thanks Thank you, Radu Rendec
Re: [PATCH net-next] doc: document MSG_ZEROCOPY
On Thu, Aug 31, 2017 at 11:10 PM, Alexei Starovoitov wrote: > On Thu, Aug 31, 2017 at 11:04:41PM -0400, Willem de Bruijn wrote: >> On Thu, Aug 31, 2017 at 10:10 PM, Alexei Starovoitov >> wrote: >> > On Thu, Aug 31, 2017 at 05:00:13PM -0400, Willem de Bruijn wrote: >> >> From: Willem de Bruijn >> >> >> >> Documentation for this feature was missing from the patchset. >> >> Copied a lot from the netdev 2.1 paper, addressing some small >> >> interface changes since then. >> >> >> >> Signed-off-by: Willem de Bruijn >> > ... >> >> +Notification Batching >> >> +~ >> >> + >> >> +Multiple outstanding packets can be read at once using the recvmmsg >> >> +call. This is often not needed. In each message the kernel returns not >> >> +a single value, but a range. It coalesces consecutive notifications >> >> +while one is outstanding for reception on the error queue. >> >> + >> >> +When a new notification is about to be queued, it checks whether the >> >> +new value extends the range of the notification at the tail of the >> >> +queue. If so, it drops the new notification packet and instead increases >> >> +the range upper value of the outstanding notification. >> > >> > Would it make sense to mention that max notification range is 32-bit? >> > So each 4Gbyte of xmit bytes there will be a notification. >> > In modern 40Gbps NICs it's not a lot. Means that there will be >> > at least one notification every second. >> > Or I misread the code? >> >> You're right. The doc does mention that the counter and range >> are 32-bit. I can state more explicitly that that bounds the working >> set size to 4GB. Do you expect this to be problematic? Processing >> a single notification per 4GB of data should not be a significant >> cost in itself. > > I think 4GB is fine. 
Just there was an idea that in cases when > notification of transmission can be known by other means Some kind of unspoofable response from the peer (i.e., not just a tcp ack), or a kernel mechanism independent from the error queue? The first does not guarantee that a retransmit is not in progress. > the user space > could have skipped reading errqueue completely, but looks like it > still needs to poll. If a process has no need to see the notification, say because it is sending out a buffer that is constant for the process lifetime, then it could conceivably skip the recv, and poll with it. The code as written will not coalesce more than 4GB of data, but that could be revised.
Re: [PATCH v2 net-next 1/8] bpf: Add support for recursively running cgroup sock filters
On Thu, Aug 31, 2017 at 07:22:01AM -0700, Tejun Heo wrote: > Hello, David, Alexei. > > Sorry about late reply. > > On Sun, Aug 27, 2017 at 08:49:23AM -0600, David Ahern wrote: > > On 8/25/17 8:49 PM, Alexei Starovoitov wrote: > > > > > >> +if (prog && curr_recursive && !new_recursive) > > >> +/* if a parent has recursive prog attached, only > > >> + * allow recursive programs in descendent cgroup > > >> + */ > > >> +return -EINVAL; > > >> + > > >> old_prog = cgrp->bpf.prog[type]; > > > > > > ... I'm struggling to completely understand how it interacts > > > with BPF_F_ALLOW_OVERRIDE. > > > > The 2 flags are completely independent. The existing override logic is > > unchanged. If a program can not be overridden, then the new recursive > > flag is irrelevant. > > I'm not sure all four combo of the two flags makes sense. Can't we > have something simpler like the following? > > 1. None: No further bpf programs allowed in the subtree. > > 2. Overridable: If a sub-cgroup installs the same bpf program, this >one yields to that one. > > 3. Recursive: If a sub-cgroup installs the same bpf program, that >cgroup program gets run in addition to this one. > > Note that we can have combinations of overridables and recursives - > both allow further programs in the sub-hierarchy and the only > distinction is whether that specific program behaves when that > happens. If I understand the proposal correctly in case of: A (with recurs) -> B (with override) -> C (with recurse) -> D (with override) when something happens in D, you propose to run D,C,A ? With the order of execution from children to parent? That's a bit a different then what I was proposing with 'multi-prog' flag, but the more I think about it the more I like it. In your case detach is sort of transparent to everything around. And you would also allow to say 'None' to one of the substrees too, right? 
So something like: A (with recurs) -> B (with override) -> C (with recurse) -> D (None) -> E would mean that E cannot attach anything and events in E will call D->C->A, right? I will work on a patch for the above and see how it looks.
Re: [PATCH net-next] virtio-net: invoke zerocopy callback on xmit path if no tx napi
On 2017年08月31日 22:30, Willem de Bruijn wrote: Incomplete results at this stage, but I do see this correlation between flows. It occurs even while not running out of zerocopy descriptors, which I cannot yet explain. Running two threads in a guest, each with a udp socket, each sending up to 100 datagrams, or until EAGAIN, every msec. Sender A sends 1B datagrams. Sender B sends VHOST_GOODCOPY_LEN, which is enough to trigger zcopy_used in vhost net. A local receive process on the host receives both flows. To avoid a deep copy when looping the packet onto the receive path, changed skb_orphan_frags_rx to always return false (gross hack). The flow with the larger packets is redirected through netem on ifb0: modprobe ifb ip link set dev ifb0 up tc qdisc add dev ifb0 root netem limit $LIMIT rate 1MBit tc qdisc add dev tap0 ingress tc filter add dev tap0 parent : protocol ip \ u32 match ip dport 8000 0x \ action mirred egress redirect dev ifb0 For 10 second run, packet count with various ifb0 queue lengths $LIMIT: no filter rx.A: ~840,000 rx.B: ~840,000 limit 1 rx.A: ~500,000 rx.B: ~3100 ifb0: 3273 sent, 371141 dropped limit 100 rx.A: ~9000 rx.B: ~4200 ifb0: 4630 sent, 1491 dropped limit 1000 rx.A: ~6800 rx.B: ~4200 ifb0: 4651 sent, 0 dropped Sender B is always correctly rate limited to 1 MBps or less. With a short queue, it ends up dropping a lot and sending even less. When a queue builds up for sender B, sender A throughput is strongly correlated with queue length. With queue length 1, it can send almost at unthrottled speed. But even at limit 100 its throughput is on the same order as sender B. What is surprising to me is that this happens even though the number of ubuf_info in use at limit 100 is around 100 at all times. In other words, it does not exhaust the pool. When forcing zcopy_used to be false for all packets, this effect of sender A throughput being correlated with sender B does not happen. 
no filter rx.A: ~850,000 rx.B: ~850,000 limit 100 rx.A: ~850,000 rx.B: ~4200 ifb0: 4518 sent, 876182 dropped Also relevant is that with zerocopy, the sender processes back off and report the same count as the receiver. Without zerocopy, both senders send at full speed, even if only 4200 packets from flow B arrive at the receiver. This is with the default virtio_net driver, so without napi-tx. It appears that the zerocopy notifications are pausing the guest. Will look at that now. It was indeed as simple as that. With 256 descriptors, queuing even a hundred or so packets causes the guest to stall the device as soon as the qdisc is installed. Adding this check + in_use = nvq->upend_idx - nvq->done_idx; + if (nvq->upend_idx < nvq->done_idx) + in_use += UIO_MAXIOV; + + if (in_use > (vq->num >> 2)) + zcopy_used = false; Has the desired behavior of reverting zerocopy requests to copying. Without this change, the result is, as previously reported, throughput dropping to hundreds of packets per second on both flows. With the change, pps as observed for a few seconds at handle_tx is zerocopy=165 copy=168435 zerocopy=0 copy=168500 zerocopy=65 copy=168535 Both flows continue to send at more or less normal rate, with only sender B observing massive drops at the netem. With the queue removed the rate reverts to zerocopy=58878 copy=110239 zerocopy=58833 copy=110207 This is not a 50/50 split, which implies that some packets from the large packet flow are still converted to copying. Without the change the rate without queue was 80k zerocopy vs 80k copy, so this choice of (vq->num >> 2) appears too conservative. However, testing with (vq->num >> 1) was not as effective at mitigating stalls. I did not save that data, unfortunately. Can run more tests on fine tuning this variable, if the idea sounds good. 
Looks like there're still two cases were left: 1) sndbuf is not INT_MAX 2) tx napi is used for virtio-net 1) could be a corner case, and for 2) what your suggest here may not solve the issue since it still do in order completion. Thanks
Re: [PATCH net-next] doc: document MSG_ZEROCOPY
On Thu, Aug 31, 2017 at 11:04:41PM -0400, Willem de Bruijn wrote: > On Thu, Aug 31, 2017 at 10:10 PM, Alexei Starovoitov > wrote: > > On Thu, Aug 31, 2017 at 05:00:13PM -0400, Willem de Bruijn wrote: > >> From: Willem de Bruijn > >> > >> Documentation for this feature was missing from the patchset. > >> Copied a lot from the netdev 2.1 paper, addressing some small > >> interface changes since then. > >> > >> Signed-off-by: Willem de Bruijn > > ... > >> +Notification Batching > >> +~ > >> + > >> +Multiple outstanding packets can be read at once using the recvmmsg > >> +call. This is often not needed. In each message the kernel returns not > >> +a single value, but a range. It coalesces consecutive notifications > >> +while one is outstanding for reception on the error queue. > >> + > >> +When a new notification is about to be queued, it checks whether the > >> +new value extends the range of the notification at the tail of the > >> +queue. If so, it drops the new notification packet and instead increases > >> +the range upper value of the outstanding notification. > > > > Would it make sense to mention that max notification range is 32-bit? > > So each 4Gbyte of xmit bytes there will be a notification. > > In modern 40Gbps NICs it's not a lot. Means that there will be > > at least one notification every second. > > Or I misread the code? > > You're right. The doc does mention that the counter and range > are 32-bit. I can state more explicitly that that bounds the working > set size to 4GB. Do you expect this to be problematic? Processing > a single notification per 4GB of data should not be a significant > cost in itself. I think 4GB is fine. Just there was an idea that in cases when notification of transmission can be known by other means the user space could have skipped reading errqeuee completely, but looks like it still needs to poll. That's fine. > > Thanks for the doc! > > Thanks for reviewing :) > > > > > Acked-by: Alexei Starovoitov > >
Re: [PATCH net-next] virtio-net: invoke zerocopy callback on xmit path if no tx napi
On 2017年08月30日 11:11, Willem de Bruijn wrote: On Tue, Aug 29, 2017 at 9:45 PM, Jason Wang wrote: On 2017年08月30日 03:35, Willem de Bruijn wrote: On Fri, Aug 25, 2017 at 9:03 PM, Willem de Bruijn wrote: On Fri, Aug 25, 2017 at 7:32 PM, Michael S. Tsirkin wrote: On Fri, Aug 25, 2017 at 06:44:36PM -0400, Willem de Bruijn wrote: [...] Incomplete results at this stage, but I do see this correlation between flows. It occurs even while not running out of zerocopy descriptors, which I cannot yet explain. Running two threads in a guest, each with a udp socket, each sending up to 100 datagrams, or until EAGAIN, every msec. Sender A sends 1B datagrams. Sender B sends VHOST_GOODCOPY_LEN, which is enough to trigger zcopy_used in vhost net. A local receive process on the host receives both flows. To avoid a deep copy when looping the packet onto the receive path, changed skb_orphan_frags_rx to always return false (gross hack). The flow with the larger packets is redirected through netem on ifb0: modprobe ifb ip link set dev ifb0 up tc qdisc add dev ifb0 root netem limit $LIMIT rate 1MBit tc qdisc add dev tap0 ingress tc filter add dev tap0 parent : protocol ip \ u32 match ip dport 8000 0x \ action mirred egress redirect dev ifb0 For 10 second run, packet count with various ifb0 queue lengths $LIMIT: no filter rx.A: ~840,000 rx.B: ~840,000 Just to make sure I understand the case here. What did rx.B mean here? I thought all traffic sent by Sender B has been redirected to ifb0? It has been, but the packet still arrives at the destination socket. IFB is a special virtual device that applies traffic shaping and then reinjects it back at the point it was intercept by mirred. rx.B is indeed arrival rate at the receiver, similar to rx.A. I see, then ifb looks pretty fit to the test.
Re: [PATCH net-next] doc: document MSG_ZEROCOPY
On Thu, Aug 31, 2017 at 10:10 PM, Alexei Starovoitov wrote: > On Thu, Aug 31, 2017 at 05:00:13PM -0400, Willem de Bruijn wrote: >> From: Willem de Bruijn >> >> Documentation for this feature was missing from the patchset. >> Copied a lot from the netdev 2.1 paper, addressing some small >> interface changes since then. >> >> Signed-off-by: Willem de Bruijn > ... >> +Notification Batching >> +~ >> + >> +Multiple outstanding packets can be read at once using the recvmmsg >> +call. This is often not needed. In each message the kernel returns not >> +a single value, but a range. It coalesces consecutive notifications >> +while one is outstanding for reception on the error queue. >> + >> +When a new notification is about to be queued, it checks whether the >> +new value extends the range of the notification at the tail of the >> +queue. If so, it drops the new notification packet and instead increases >> +the range upper value of the outstanding notification. > > Would it make sense to mention that max notification range is 32-bit? > So each 4Gbyte of xmit bytes there will be a notification. > In modern 40Gbps NICs it's not a lot. Means that there will be > at least one notification every second. > Or I misread the code? You're right. The doc does mention that the counter and range are 32-bit. I can state more explicitly that that bounds the working set size to 4GB. Do you expect this to be problematic? Processing a single notification per 4GB of data should not be a significant cost in itself. > Thanks for the doc! Thanks for reviewing :) > > Acked-by: Alexei Starovoitov >
Re: [PATCH net-next v5 2/2] tcp_diag: report TCP MD5 signing keys and addresses
On Thu, 2017-08-31 at 09:59 -0700, Ivan Delalande wrote: > Report TCP MD5 (RFC2385) signing keys, addresses and address prefixes to > processes with CAP_NET_ADMIN requesting INET_DIAG_INFO. Currently it is > not possible to retrieve these from the kernel once they have been > configured on sockets. > > Signed-off-by: Ivan Delalande > --- > include/uapi/linux/inet_diag.h | 1 + > include/uapi/linux/tcp.h | 9 > net/ipv4/tcp_diag.c| 109 > ++--- > 3 files changed, 113 insertions(+), 6 deletions(-) Acked-by: Eric Dumazet
Re: [PATCH net-next v5 1/2] inet_diag: allow protocols to provide additional data
On Thu, 2017-08-31 at 09:59 -0700, Ivan Delalande wrote: > Extend inet_diag_handler to allow individual protocols to report > additional data on INET_DIAG_INFO through idiag_get_aux. The size > can be dynamic and is computed by idiag_get_aux_size. > > Signed-off-by: Ivan Delalande > --- > include/linux/inet_diag.h | 7 +++ > net/ipv4/inet_diag.c | 22 ++ > 2 files changed, 25 insertions(+), 4 deletions(-) Acked-by: Eric Dumazet
Re: [PATCH] bnx2x: drop packets where gso_size is too big for hardware
Eric Dumazet writes: > If you had this test in bnx2x_features_check(), packet could be > segmented by core networking stack before reaching bnx2x_start_xmit() by > clearing NETIF_F_GSO_MASK > > -> No drop would be involved. Thanks for the pointer - networking code is all a bit new to me. I'm just struggling at the moment to figure out what the right way to calculate the length. My original patch uses gso_size + hlen, but: - On reflection, while this solves the immediate bug, I'm not 100% sure this is the right thing to be calculating - If it is, then we have the problem that hlen is calculated in a bunch of weird and wonderful ways which make it a nightmare to extract. Yuval (or anyone else who groks the driver properly) - what's the right test to be doing here to make sure we don't write to much data to the card? Regards, Daniel
RE: [RFC PATCH] net: frag limit checks need to use percpu_counter_compare
Best Regards, liujian > -Original Message- > From: Michal Kubecek [mailto:mkube...@suse.cz] > Sent: Friday, September 01, 2017 12:24 AM > To: Jesper Dangaard Brouer > Cc: liujian (CE); netdev@vger.kernel.org; Florian Westphal > Subject: Re: [RFC PATCH] net: frag limit checks need to use > percpu_counter_compare > > On Thu, Aug 31, 2017 at 12:20:19PM +0200, Jesper Dangaard Brouer wrote: > > To: Liujian can you please test this patch? > > I want to understand if using __percpu_counter_compare() solves the > > problem correctness wise (even-though this will be slower than using > > a simple atomic_t on your big system). I have test the patch, it can work. 1. make sure frag_mem_limit reach to thresh ===>FRAG: inuse 0 memory 0 frag_mem_limit 5386864 2. change NIC rx irq's affinity to a fixed CPU 3. iperf -u -c 9.83.1.41 -l 1 -i 1 -t 1000 -P 10 -b 20M And check /proc/net/snmp, there are no ReasmFails. And I think it is a better way that adding some counter sync points as you said. > > Fix bug in fragmentation codes use of the percpu_counter API, that > > cause issues on systems with many CPUs. > > > > The frag_mem_limit() just reads the global counter (fbc->count), > > without considering other CPUs can have upto batch size (130K) that > > haven't been subtracted yet. Due to the 3MBytes lower thresh limit, > > this become dangerous at >=24 CPUs (3*1024*1024/13=24). > > > > The __percpu_counter_compare() does the right thing, and takes into > > account the number of (online) CPUs and batch size, to account for > > this and call __percpu_counter_sum() when needed. > > > > On systems with many CPUs this will unfortunately always result in the > > heavier fully locked __percpu_counter_sum() which touch the > > per_cpu_ptr of all (online) CPUs. > > > > On systems with a smaller number of CPUs this solution is also not > > optimal, because __percpu_counter_compare()/__percpu_counter_sum() > > doesn't help synchronize the global counter. 
> > Florian Westphal have an idea of adding some counter sync points, > > which should help address this issue. > > --- > > include/net/inet_frag.h | 16 ++-- > > net/ipv4/inet_fragment.c |6 +++--- > > 2 files changed, 17 insertions(+), 5 deletions(-) > > > > diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index > > 6fdcd2427776..b586e320783d 100644 > > --- a/include/net/inet_frag.h > > +++ b/include/net/inet_frag.h > > @@ -147,9 +147,21 @@ static inline bool inet_frag_evicting(struct > inet_frag_queue *q) > > */ > > static unsigned int frag_percpu_counter_batch = 13; > > > > -static inline int frag_mem_limit(struct netns_frags *nf) > > +static inline bool frag_mem_over_limit(struct netns_frags *nf, int > > +thresh) > > { > > - return percpu_counter_read(&nf->mem); > > + /* When reading counter here, __percpu_counter_compare() call > > +* will invoke __percpu_counter_sum() when needed. Which > > +* depend on num_online_cpus()*batch size, as each CPU can > > +* potentential can hold a batch count. > > +* > > +* With many CPUs this heavier sum operation will > > +* unfortunately always occur. 
> > +*/ > > + if (__percpu_counter_compare(&nf->mem, thresh, > > +frag_percpu_counter_batch) > 0) > > + return true; > > + else > > + return false; > > } > > > > static inline void sub_frag_mem_limit(struct netns_frags *nf, int i) > > diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index > > 96e95e83cc61..ee2cf56900e6 100644 > > --- a/net/ipv4/inet_fragment.c > > +++ b/net/ipv4/inet_fragment.c > > @@ -120,7 +120,7 @@ static void inet_frag_secret_rebuild(struct > > inet_frags *f) static bool inet_fragq_should_evict(const struct > > inet_frag_queue *q) { > > return q->net->low_thresh == 0 || > > - frag_mem_limit(q->net) >= q->net->low_thresh; > > + frag_mem_over_limit(q->net, q->net->low_thresh); > > } > > > > static unsigned int > > @@ -355,7 +355,7 @@ static struct inet_frag_queue > > *inet_frag_alloc(struct netns_frags *nf, { > > struct inet_frag_queue *q; > > > > - if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) { > > + if (!nf->high_thresh || frag_mem_over_limit(nf, nf->high_thresh)) { > > inet_frag_schedule_worker(f); > > return NULL; > > } > > If we go this way (which would IMHO require some benchmarks to make sure it > doesn't harm performance too much) we can drop the explicit checks for zero > thresholds which were added to work around the unreliability of fast checks of > percpu counters (or at least the second one was by commit > 30759219f562 ("net: disable fragment reassembly if high_thresh is zero"). > > Michal Kubecek > > > @@ -396,7 +396,7 @@ struct inet_frag_queue *inet_frag_find(struct > netns_frags *nf, > > struct inet_frag_queue *q; > > int depth = 0; > > > > - if (frag_mem_limit(nf) > nf->low_thresh) > > +
Re: [PATCH net-next] doc: document MSG_ZEROCOPY
On Thu, Aug 31, 2017 at 05:00:13PM -0400, Willem de Bruijn wrote: > From: Willem de Bruijn > > Documentation for this feature was missing from the patchset. > Copied a lot from the netdev 2.1 paper, addressing some small > interface changes since then. > > Signed-off-by: Willem de Bruijn ... > +Notification Batching > +~ > + > +Multiple outstanding packets can be read at once using the recvmmsg > +call. This is often not needed. In each message the kernel returns not > +a single value, but a range. It coalesces consecutive notifications > +while one is outstanding for reception on the error queue. > + > +When a new notification is about to be queued, it checks whether the > +new value extends the range of the notification at the tail of the > +queue. If so, it drops the new notification packet and instead increases > +the range upper value of the outstanding notification. Would it make sense to mention that max notification range is 32-bit? So each 4Gbyte of xmit bytes there will be a notification. In modern 40Gbps NICs it's not a lot. Means that there will be at least one notification every second. Or I misread the code? Thanks for the doc! Acked-by: Alexei Starovoitov
Re: [PATCH 2/3] security: bpf: Add eBPF LSM hooks and security field to eBPF map
On Thu, Aug 31, 2017 at 01:56:34PM -0700, Chenbo Feng wrote: > From: Chenbo Feng > > Introduce a pointer into struct bpf_map to hold the security information > about the map. The actual security struct varies based on the security > models implemented. Place the LSM hooks before each of the unrestricted > eBPF operations, the map_update_elem and map_delete_elem operations are > checked by security_map_modify. The map_lookup_elem and map_get_next_key > operations are checked by securtiy_map_read. > > Signed-off-by: Chenbo Feng ... > @@ -410,6 +418,10 @@ static int map_lookup_elem(union bpf_attr *attr) > if (IS_ERR(map)) > return PTR_ERR(map); > > + err = security_map_read(map); > + if (err) > + return -EACCES; > + > key = memdup_user(ukey, map->key_size); > if (IS_ERR(key)) { > err = PTR_ERR(key); > @@ -490,6 +502,10 @@ static int map_update_elem(union bpf_attr *attr) > if (IS_ERR(map)) > return PTR_ERR(map); > > + err = security_map_modify(map); I don't feel these extra hooks are really thought through. With such hook you'll disallow map_update for given map. That's it. The key/values etc won't be used in such security decision. In such case you don't need such hooks in update/lookup at all. Only in map_creation and object_get calls where FD can be received. In other words I suggest to follow standard unix practices: Do permissions checks in open() and allow read/write() if FD is valid. Same here. Do permission checks in prog_load/map_create/obj_pin/get and that will be enough to jail bpf subsystem. bpf cmds that need to be fast (like lookup and update) should not have security hooks.
Re: [PATCH v3 net-next 6/7] samples/bpf: Update cgrp2 socket tests
On Thu, Aug 31, 2017 at 03:05:49PM -0700, David Ahern wrote: > Update cgrp2 bpf sock tests to check that device, mark and priority > can all be set on a socket via bpf programs attached to a cgroup. > > Signed-off-by: David Ahern Acked-by: Alexei Starovoitov
Re: [PATCH v3 net-next 7/7] samples/bpf: Update cgroup socket examples to use uid gid helper
On Thu, Aug 31, 2017 at 03:05:50PM -0700, David Ahern wrote: > Signed-off-by: David Ahern Acked-by: Alexei Starovoitov
Re: [PATCH v3 net-next 5/7] samples/bpf: Add option to dump socket settings
On Thu, Aug 31, 2017 at 03:05:48PM -0700, David Ahern wrote: > Add option to dump socket settings. Will be used in the next patch > to verify bpf programs are correctly setting mark, priority and > device based on the cgroup attachment for the program run. > > Signed-off-by: David Ahern Acked-by: Alexei Starovoitov
Re: [PATCH v3 net-next 4/7] samples/bpf: Add detach option to test_cgrp2_sock
On Thu, Aug 31, 2017 at 03:05:47PM -0700, David Ahern wrote: > Add option to detach programs from a cgroup. > > Signed-off-by: David Ahern Acked-by: Alexei Starovoitov
Re: [PATCH v3 net-next 3/7] samples/bpf: Update sock test to allow setting mark and priority
On Thu, Aug 31, 2017 at 03:05:46PM -0700, David Ahern wrote: > Update sock test to set mark and priority on socket create. > > Signed-off-by: David Ahern Acked-by: Alexei Starovoitov
[RFC iproute2 1/2] update headers with CBS API [RFC]
Signed-off-by: Vinicius Costa Gomes --- include/linux/pkt_sched.h | 29 + 1 file changed, 29 insertions(+) diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h index 099bf552..ba6c9a54 100644 --- a/include/linux/pkt_sched.h +++ b/include/linux/pkt_sched.h @@ -871,4 +871,33 @@ struct tc_pie_xstats { __u32 maxq; /* maximum queue size */ __u32 ecn_mark; /* packets marked with ecn*/ }; + +/* CBS */ +/* FIXME: this is only for usage with ndo_setup_tc(), this should be + * in another header someplace else. Is pkt_cls.h the right place? + */ +struct tc_cbs_qopt_offload { + __u8 enable; + __s32 queue; + __s32 hicredit; + __s32 locredit; + __s32 idleslope; + __s32 sendslope; +}; + +struct tc_cbs_qopt { + __s32 hicredit; + __s32 locredit; + __s32 idleslope; + __s32 sendslope; +}; + +enum { + TCA_CBS_UNSPEC, + TCA_CBS_PARMS, + __TCA_CBS_MAX, +}; + +#define TCA_CBS_MAX (__TCA_CBS_MAX - 1) + #endif -- 2.14.1
[RFC iproute2 2/2] tc: Add support for the CBS qdisc
The Credit Based Shaper (CBS) queueing discipline allows bandwidth reservation with sub-milisecond precision. It is defined by the 802.1Q-2014 specification (section 8.6.8.2 and Annex L). The syntax is: tc qdisc add dev DEV parent NODE cbs locredit hicredit sendslope idleslope Signed-off-by: Vinicius Costa Gomes --- tc/Makefile | 1 + tc/q_cbs.c | 134 2 files changed, 135 insertions(+) create mode 100644 tc/q_cbs.c diff --git a/tc/Makefile b/tc/Makefile index a9b4b8e6..f0091217 100644 --- a/tc/Makefile +++ b/tc/Makefile @@ -73,6 +73,7 @@ TCMODULES += q_hhf.o TCMODULES += q_clsact.o TCMODULES += e_bpf.o TCMODULES += f_matchall.o +TCMODULES += q_cbs.o TCSO := ifeq ($(TC_CONFIG_ATM),y) diff --git a/tc/q_cbs.c b/tc/q_cbs.c new file mode 100644 index ..0120e838 --- /dev/null +++ b/tc/q_cbs.c @@ -0,0 +1,134 @@ +/* + * q_tbf.c TBF. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors:Alexey Kuznetsov, + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "utils.h" +#include "tc_util.h" + +static void explain(void) +{ + fprintf(stderr, "Usage: ... 
tbf hicredit BYTES locredit BYTES sendslope BPS idleslope BPS\n"); +} + +static void explain1(const char *arg, const char *val) +{ + fprintf(stderr, "cbs: illegal value for \"%s\": \"%s\"\n", arg, val); +} + +static int cbs_parse_opt(struct qdisc_util *qu, int argc, char **argv, struct nlmsghdr *n) +{ + int ok = 0; + struct tc_cbs_qopt opt = {}; + struct rtattr *tail; + + while (argc > 0) { + if (matches(*argv, "hicredit") == 0) { + NEXT_ARG(); + if (opt.hicredit) { + fprintf(stderr, "cbs: duplicate \"hicredit\" specification\n"); + return -1; + } + if (get_s32(&opt.hicredit, *argv, 0)) { + explain1("hicredit", *argv); + return -1; + } + ok++; + } else if (matches(*argv, "locredit") == 0) { + NEXT_ARG(); + if (opt.locredit) { + fprintf(stderr, "cbs: duplicate \"locredit\" specification\n"); + return -1; + } + if (get_s32(&opt.locredit, *argv, 0)) { + explain1("locredit", *argv); + return -1; + } + ok++; + } else if (matches(*argv, "sendslope") == 0) { + NEXT_ARG(); + if (opt.sendslope) { + fprintf(stderr, "cbs: duplicate \"sendslope\" specification\n"); + return -1; + } + if (get_s32(&opt.sendslope, *argv, 0)) { + explain1("sendslope", *argv); + return -1; + } + ok++; + } else if (matches(*argv, "idleslope") == 0) { + NEXT_ARG(); + if (opt.idleslope) { + fprintf(stderr, "cbs: duplicate \"idleslope\" specification\n"); + return -1; + } + if (get_s32(&opt.idleslope, *argv, 0)) { + explain1("idleslope", *argv); + return -1; + } + ok++; + } else if (strcmp(*argv, "help") == 0) { + explain(); + return -1; + } else { + fprintf(stderr, "cbs: unknown parameter \"%s\"\n", *argv); + explain(); + return -1; + } + argc--; argv++; + } + + tail = NLMSG_TAIL(n); + addattr_l(n, 1024, TCA_OPTIONS, NULL, 0); + addattr_l(n, 2024, TCA_CBS_PARMS, &opt, sizeof(opt)); + tail->rta_len = (void *) NLMSG_TAIL(n) - (void *) tail; + return 0; +} + +static int cbs_print_opt(struct qdisc_util *qu, FILE *f, struct rtattr *opt) +{ + struct rtattr *tb[TCA_TBF_MAX+1]; + struct tc_cbs_qopt *qopt; + + 
if (opt == NULL) + return 0; + + parse_rtattr_nested(tb, TCA_CBS_MAX, opt); + + if (tb[TCA_CBS_PARMS] == NULL) + return -1; + +
[RFC net-next 5/5] samples/tsn: Add script for calculating CBS config
From: Andre Guedes Add a script that takes as input the parameters of the Credit-based shaper used on FQTSS - link rate, max frame size of best effort traffic, idleslope and maximum frame size of the time-sensitive traffic class - for SR classes A and B, and calculates how the CBS qdisc must be configured for each traffic class. For example, if you want to have Class A with a bandwidth of 300 Mbps and Class B of 200 Mbps, and the max frame size of both classes' traffic is 1500 bytes: $ ./calculate_cbs_params.py -A 30 -a 1500 -B 20 -b 1500 would give you the correct cbs qdisc config command lines to be used. This script is just a helper to ease testing of the TSN samples - talker and listener - and shouldn't be taken as highly accurate. Signed-off-by: Andre Guedes Signed-off-by: Jesus Sanchez-Palencia --- samples/tsn/calculate_cbs_params.py | 112 1 file changed, 112 insertions(+) create mode 100755 samples/tsn/calculate_cbs_params.py diff --git a/samples/tsn/calculate_cbs_params.py b/samples/tsn/calculate_cbs_params.py new file mode 100755 index ..9c46210b699f --- /dev/null +++ b/samples/tsn/calculate_cbs_params.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python3 +# +# Copyright (c) 2017, Intel Corporation +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of Intel Corporation nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import math +import sys + +def print_cbs_params_for_class_a(args): +idleslope = args.idleslope_a +sendslope = idleslope - args.link_speed + +# According to 802.1Q-2014 spec, Annex L, hiCredit and +# loCredit for SR class A are calculated following the +# equations L-10 and L-12, respectively. +hicredit = math.ceil(idleslope * args.frame_non_sr / args.link_speed) +locredit = math.ceil(sendslope * args.frame_a / args.link_speed) + +print("Class A --> # tc qdisc replace dev IFACE parent ID cbs " \ + "locredit %d hicredit %d sendslope %d idleslope %d" % \ + (locredit, hicredit, sendslope, idleslope)) + +def print_cbs_params_for_class_b(args): +idleslope = args.idleslope_b +sendslope = idleslope - args.link_speed + +# Annex L doesn't present a straightforward equation to +# calculate hiCredit for Class B so we have to derive it +# based on generic equations presented in that Annex. +# +# L-3 is the primary equation to calculate hiCredit. Section +# L.2 states that the 'maxInterferenceSize' for SR class B +# is the maximum burst size for SR class A plus the +# maxInterferenceSize from SR class A (which is equal to the +# maximum frame from non-SR traffic). 
+# +# The maximum burst size for SR class A equation is shown in +# L-16. Merging L-16 into L-3 we get the resulting equation +# which calculates hiCredit B (refer to section L.3 in case +# you're not familiar with the legend): +# +# hiCredit B = Rb * ( Mo/(Ro - Ra) + Ma/Ro ) +# +hicredit = math.ceil(idleslope * \ + ((args.frame_non_sr / (args.link_speed - args.idleslope_a)) + \ + (args.frame_a / args.link_speed))) + +# loCredit B is calculated following equation L-2. +locredit = math.ceil(sendslope * args.frame_b / args.link_speed) + +print("Class B --> # tc qdisc replace dev IFACE parent ID cbs " \ + "locredit %d hicredit %d sendslope %d idleslope %d" % \ + (locredit, hicredit, sendslope, idleslope)) +
[RFC net-next 4/5] sample: Add TSN Talker and Listener examples
From: Jesus Sanchez-Palencia Add two examples so one can easily test a 'TSN distributed system' running with standard kernel interfaces. Both 'talker' and 'listener' sides are provided, and use a AF_PACKET for Tx / Rx of frames. Running the examples is rather simple. For the talker, just the interface and SO_PRIORITY are expected as parameters: $ ./talker -i enp3s0 -p 3 For the listener, only the interface is needed: $ ./listener -i enp3s0 The multicast MAC address being used is currently hardcoded on both examples for simplicity. Note that the listener side uses a BPF filter so only frames sent to the correct "stream" destination address are received by the socket. If you modify the address used by the talker, you must also adapt the BPF filter otherwise no frames will be received by the socket. The listener example will print the rate of packets reception after every 1 second. This makes it easier to verify if the bandwidth configured for a traffic class is being respected or not. Signed-off-by: Jesus Sanchez-Palencia Signed-off-by: Vinicius Costa Gomes Signed-off-by: Andre Guedes Signed-off-by: Iván Briano --- samples/tsn/listener.c | 254 + samples/tsn/talker.c | 136 ++ 2 files changed, 390 insertions(+) create mode 100644 samples/tsn/listener.c create mode 100644 samples/tsn/talker.c diff --git a/samples/tsn/listener.c b/samples/tsn/listener.c new file mode 100644 index ..2d17bdfbea99 --- /dev/null +++ b/samples/tsn/listener.c @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX_FRAME_SIZE 1500 + +/* XXX: If this address is changed, the BPF filter must be adjusted. */ +static uint8_t multicast_macaddr[] = { 0xBB, 0xAA, 0xBB, 0xAA, 0xBB, 0xAA }; +static char ifname[IFNAMSIZ]; +static uint64_t data_count; +static int arg_count; + +/* + * BPF Filter so we only receive frames from the destination MAC address of our + * SRP stream. This is hardcoded in multicast_macaddr[]. 
+ */ +static struct sock_filter dst_addr_filter[] = { + { 0x20, 0, 0, 00 }, /* Load DST address: first 32bits only */ + { 0x15, 0, 3, 0xbbaabbaa }, /* Compare with first 32bits from MAC */ + { 0x28, 0, 0, 0x0004 }, /* Load DST address: remaining 16bits */ + { 0x15, 0, 1, 0xbbaa }, /* Compare with last 16bits from MAC */ + { 0x06, 0, 0, 0x }, + { 0x06, 0, 0, 00 }, /* Ret 0. Jump here if any mismatches. */ +}; + +/* BPF program */ +static struct sock_fprog bpf = { + .len = 6, /* Number of instructions on BPF filter */ + .filter = dst_addr_filter, +}; + +static struct argp_option options[] = { + {"ifname", 'i', "IFNAME", 0, "Network Interface" }, + { 0 } +}; + +static error_t parser(int key, char *arg, struct argp_state *s) +{ + switch (key) { + case 'i': + strncpy(ifname, arg, sizeof(ifname) - 1); + arg_count++; + break; + case ARGP_KEY_END: + if (arg_count < 1) + argp_failure(s, 1, 0, "Options missing. Check --help"); + break; + } + + return 0; +} + +static struct argp argp = { options, par
[RFC net-next 0/5] TSN: Add qdisc-based config interfaces for traffic shapers
Hi, This patchset is an RFC on a proposal of how the Traffic Control subsystem can be used to offload the configuration of traffic shapers into network devices that provide support for them in HW. Our goal here is to start upstreaming support for features related to the Time-Sensitive Networking (TSN) set of standards into the kernel. As part of this work, we've assessed previous public discussions related to TSN enabling: patches from Henrik Austad (Cisco), the presentation from Eric Mann at Linux Plumbers 2012, patches from Gangfeng Huang (National Instruments) and the current state of the OpenAVNU project (https://github.com/AVnu/OpenAvnu/). Please note that the patches provided as part of this RFC are implementing what is needed only for 802.1Qav (FQTSS) only, but we'd like to take advantage of this discussion and share our WIP ideas for the 802.1Qbv and 802.1Qbu interfaces as well. The current patches are only providing support for HW offload of the configs. Overview Time-sensitive Networking (TSN) is a set of standards that aim to address resources availability for providing bandwidth reservation and bounded latency on Ethernet based LANs. The proposal described here aims to cover mainly what is needed to enable the following standards: 802.1Qat, 802.1Qav, 802.1Qbv and 802.1Qbu. The initial target of this work is the Intel i210 NIC, but other controllers' datasheet were also taken into account, like the Renesas RZ/A1H RZ/A1M group and the Synopsis DesignWare Ethernet QoS controller. Proposal Feature-wise, what is covered here are configuration interfaces for HW implementations of the Credit-Based shaper (CBS, 802.1Qav), Time-Aware shaper (802.1Qbv) and Frame Preemption (802.1Qbu). CBS is a per-queue shaper, while Qbv and Qbu must be configured per port, with the configuration covering all queues. 
Given that these features are related to traffic shaping, and that the traffic control subsystem already provides a queueing discipline that offloads config into the device driver (i.e. mqprio), designing new qdiscs for the specific purpose of offloading the config for each shaper seemed like a good fit. For steering traffic into the correct queues, we use the socket option SO_PRIORITY and then a mechanism to map priority to traffic classes / Tx queues. The qdisc mqprio is currently used in our tests. As for the shapers config interface: * CBS (802.1Qav) This patchset is proposing a new qdisc called 'cbs'. Its 'tc' cmd line is: $ tc qdisc add dev IFACE parent ID cbs locredit N hicredit M sendslope S \ idleslope I Note that the parameters for this qdisc are the ones defined by the 802.1Q-2014 spec, so no hardware specific functionality is exposed here. * Time-aware shaper (802.1Qbv): The idea we are currently exploring is to add a "time-aware", priority based qdisc, that also exposes the Tx queues available and provides a mechanism for mapping priority <-> traffic class <-> Tx queues in a similar fashion as mqprio. We are calling this qdisc 'taprio', and its 'tc' cmd line would be: $ tc qdisc add dev ens4 parent root handle 100 taprio num_tc 4\ map 2 2 1 0 3 3 3 3 3 3 3 3 3 3 3 3 \ queues 0 1 2 3 \ sched-file gates.sched [base-time ] \ [cycle-time ] [extension-time ] is multi-line, with each line being of the following format: Qbv only defines one : "S" for 'SetGates' For example: S 0x01 300 S 0x03 500 This means that there are two intervals, the first will have the gate for traffic class 0 open for 300 nanoseconds, the second will have both traffic classes open for 500 nanoseconds. Additionally, an option to set just one entry of the gate control list will also be provided by 'taprio': $ tc qdisc (...) 
\ sched-row \ [base-time ] [cycle-time ] \ [extension-time ] * Frame Preemption (802.1Qbu): To control even further the latency, it may prove useful to signal which traffic classes are marked as preemptable. For that, 'taprio' provides the preemption command so you set each traffic class as preemptable or not: $ tc qdisc (...) \ preemption 0 1 1 1 * Time-aware shaper + Preemption: As an example of how Qbv and Qbu can be used together, we may specify both the schedule and the preempt-mask, and this way we may also specify the Set-Gates-and-Hold and Set-Gates-and-Release commands as specified in the Qbu spec: $ tc qdisc add dev ens4 parent root handle 100 taprio num_tc 4 \ map 2 2 1 0 3 3 3 3 3 3 3 3 3 3 3 3\ queues 0 1 2 3 \ preemption 0 1 1 1 \ sched-file preempt_gates.sched is multi-line, with each line being of the following format: F
[RFC net-next 1/5] net/sched: Introduce the user API for the CBS shaper
Export the API necessary for configuring the CBS shaper (implemented in the next patch) via the tc tool. Signed-off-by: Vinicius Costa Gomes --- include/uapi/linux/pkt_sched.h | 29 + 1 file changed, 29 insertions(+) diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 099bf5528fed..aa4a3e5421be 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -871,4 +871,33 @@ struct tc_pie_xstats { __u32 maxq; /* maximum queue size */ __u32 ecn_mark; /* packets marked with ecn*/ }; + +/* CBS */ +/* FIXME: this is only for usage with ndo_setup_tc(), this should be + * in another header someplace else. Is pkt_cls.h the right place? + */ +struct tc_cbs_qopt_offload { + u8 enable; + s32 queue; + s32 hicredit; + s32 locredit; + s32 idleslope; + s32 sendslope; +}; + +struct tc_cbs_qopt { + __s32 hicredit; + __s32 locredit; + __s32 idleslope; + __s32 sendslope; +}; + +enum { + TCA_CBS_UNSPEC, + TCA_CBS_PARMS, + __TCA_CBS_MAX, +}; + +#define TCA_CBS_MAX (__TCA_CBS_MAX - 1) + #endif -- 2.14.1
[RFC net-next 2/5] net/sched: Introduce Credit Based Shaper (CBS) qdisc
This queueing discipline implements the shaper algorithm defined by the 802.1Q-2014 Section 8.6.8.2 and detailed in Annex L. It's primary usage is to apply some bandwidth reservation to user defined traffic classes, which are mapped to different queues via the mqprio qdisc. Initially, it only supports offloading the traffic shaping work to supporting controllers. Later, when a software implementation is added, the current dependency on being installed "under" mqprio can be lifted. Signed-off-by: Vinicius Costa Gomes Signed-off-by: Jesus Sanchez-Palencia --- include/linux/netdevice.h | 1 + net/sched/Kconfig | 11 ++ net/sched/Makefile| 1 + net/sched/sch_cbs.c | 286 ++ 4 files changed, 299 insertions(+) create mode 100644 net/sched/sch_cbs.c diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 35de8312e0b5..dd9a2ecd0c03 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -775,6 +775,7 @@ enum tc_setup_type { TC_SETUP_CLSFLOWER, TC_SETUP_CLSMATCHALL, TC_SETUP_CLSBPF, + TC_SETUP_CBS, }; /* These structures hold the attributes of xdp state that are being passed diff --git a/net/sched/Kconfig b/net/sched/Kconfig index e70ed26485a2..c03d86a7775e 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -172,6 +172,17 @@ config NET_SCH_TBF To compile this code as a module, choose M here: the module will be called sch_tbf. +config NET_SCH_CBS + tristate "Credit Based Shaper (CBS)" + ---help--- + Say Y here if you want to use the Credit Based Shaper (CBS) packet + scheduling algorithm. + + See the top of for more details. + + To compile this code as a module, choose M here: the + module will be called sch_cbs. 
+ config NET_SCH_GRED tristate "Generic Random Early Detection (GRED)" ---help--- diff --git a/net/sched/Makefile b/net/sched/Makefile index 7b915d226de7..80c8f92d162d 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -52,6 +52,7 @@ obj-$(CONFIG_NET_SCH_FQ_CODEL)+= sch_fq_codel.o obj-$(CONFIG_NET_SCH_FQ) += sch_fq.o obj-$(CONFIG_NET_SCH_HHF) += sch_hhf.o obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o +obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c new file mode 100644 index ..1c86a9e14150 --- /dev/null +++ b/net/sched/sch_cbs.c @@ -0,0 +1,286 @@ +/* + * net/sched/sch_cbs.c Credit Based Shaper + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors:Vininicius Costa Gomes + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct cbs_sched_data { + struct Qdisc *qdisc; /* Inner qdisc, default - pfifo queue */ + s32 queue; + s32 locredit; + s32 hicredit; + s32 sendslope; + s32 idleslope; +}; + +static int cbs_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + int ret; + + ret = qdisc_enqueue(skb, q->qdisc, to_free); + if (ret != NET_XMIT_SUCCESS) { + if (net_xmit_drop_count(ret)) + qdisc_qstats_drop(sch); + return ret; + } + + qdisc_qstats_backlog_inc(sch, skb); + sch->q.qlen++; + return NET_XMIT_SUCCESS; +} + +static struct sk_buff *cbs_dequeue(struct Qdisc *sch) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + + skb = q->qdisc->ops->peek(q->qdisc); + if (skb) { + skb = qdisc_dequeue_peeked(q->qdisc); + if (unlikely(!skb)) + return NULL; + + qdisc_qstats_backlog_dec(sch, 
skb); + sch->q.qlen--; + qdisc_bstats_update(sch, skb); + + return skb; + } + return NULL; +} + +static void cbs_reset(struct Qdisc *sch) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + + qdisc_reset(q->qdisc); +} + +static const struct nla_policy cbs_policy[TCA_CBS_MAX + 1] = { + [TCA_CBS_PARMS] = { .len = sizeof(struct tc_cbs_qopt) }, +}; + +static int cbs_change(struct Qdisc *sch, struct nlattr *opt) +{ + struct cbs_sched_data *q = qdisc_priv(sch); + struct tc_cbs_qopt_offload cbs = { }; + struct nlattr *tb[TCA_CBS_MAX + 1]; + const struct net_device_ops *ops; + struct tc_cbs_qopt *qopt; + struct net_device *dev; + int err; + +
[RFC net-next 3/5] igb: Add support for CBS offload
From: Andre Guedes This patch adds support for Credit-Based Shaper (CBS) qdisc offload from Traffic Control system. This support enable us to leverage the Forwarding and Queuing for Time-Sensitive Streams (FQTSS) features from Intel i210 Ethernet Controller. FQTSS is the former 802.1Qav standard which was merged into 802.1Q in 2014. It enables traffic prioritization and bandwidth reservation via the Credit-Based Shaper which is implemented in hardware by i210 controller. The patch introduces the igb_setup_tc() function which implements the support for CBS qdisc hardware offload in the IGB driver. CBS offload is the only traffic control offload supported by the driver at the moment. FQTSS transmission mode from i210 controller is automatically enabled by the IGB driver when the CBS is enabled for the first hardware queue. Likewise, FQTSS mode is automatically disabled when CBS is disabled for the last hardware queue. Changing FQTSS mode requires NIC reset. FQTSS feature is supported by i210 controller only. 
Signed-off-by: Andre Guedes Signed-off-by: Jesus Sanchez-Palencia --- drivers/net/ethernet/intel/igb/e1000_defines.h | 23 ++ drivers/net/ethernet/intel/igb/e1000_regs.h| 8 + drivers/net/ethernet/intel/igb/igb.h | 6 + drivers/net/ethernet/intel/igb/igb_main.c | 349 + 4 files changed, 386 insertions(+) diff --git a/drivers/net/ethernet/intel/igb/e1000_defines.h b/drivers/net/ethernet/intel/igb/e1000_defines.h index 1de82f247312..83cabff1e0ab 100644 --- a/drivers/net/ethernet/intel/igb/e1000_defines.h +++ b/drivers/net/ethernet/intel/igb/e1000_defines.h @@ -353,7 +353,18 @@ #define E1000_RXPBS_CFG_TS_EN 0x8000 #define I210_RXPBSIZE_DEFAULT 0x00A2 /* RXPBSIZE default */ +#define I210_RXPBSIZE_MASK 0x003F +#define I210_RXPBSIZE_PB_32KB 0x0020 #define I210_TXPBSIZE_DEFAULT 0x0414 /* TXPBSIZE default */ +#define I210_TXPBSIZE_MASK 0xC0FF +#define I210_TXPBSIZE_PB0_8KB (8 << 0) +#define I210_TXPBSIZE_PB1_8KB (8 << 6) +#define I210_TXPBSIZE_PB2_4KB (4 << 12) +#define I210_TXPBSIZE_PB3_4KB (4 << 18) + +#define I210_DTXMXPKTSZ_DEFAULT0x0098 + +#define I210_SR_QUEUES_NUM 2 /* SerDes Control */ #define E1000_SCTL_DISABLE_SERDES_LOOPBACK 0x0400 @@ -1051,4 +1062,16 @@ #define E1000_VLAPQF_P_VALID(_n) (0x1 << (3 + (_n) * 4)) #define E1000_VLAPQF_QUEUE_MASK0x03 +/* TX Qav Control fields */ +#define E1000_TQAVCTRL_XMIT_MODE BIT(0) +#define E1000_TQAVCTRL_DATAFETCHARBBIT(4) +#define E1000_TQAVCTRL_DATATRANARB BIT(8) + +/* TX Qav Credit Control fields */ +#define E1000_TQAVCC_IDLESLOPE_MASK0x +#define E1000_TQAVCC_QUEUEMODE BIT(31) + +/* Transmit Descriptor Control fields */ +#define E1000_TXDCTL_PRIORITY BIT(27) + #endif diff --git a/drivers/net/ethernet/intel/igb/e1000_regs.h b/drivers/net/ethernet/intel/igb/e1000_regs.h index 58adbf234e07..8eee081d395f 100644 --- a/drivers/net/ethernet/intel/igb/e1000_regs.h +++ b/drivers/net/ethernet/intel/igb/e1000_regs.h @@ -421,6 +421,14 @@ do { \ #define E1000_I210_FLA 0x1201C +#define E1000_I210_DTXMXPKTSZ 0x355C + +#define 
E1000_I210_TXDCTL(_n) (0x0E028 + ((_n) * 0x40)) + +#define E1000_I210_TQAVCTRL0x3570 +#define E1000_I210_TQAVCC(_n) (0x3004 + ((_n) * 0x40)) +#define E1000_I210_TQAVHC(_n) (0x300C + ((_n) * 0x40)) + #define E1000_INVM_DATA_REG(_n)(0x12120 + 4*(_n)) #define E1000_INVM_SIZE64 /* Number of INVM Data Registers */ diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h index 06ffb2bc713e..92845692087a 100644 --- a/drivers/net/ethernet/intel/igb/igb.h +++ b/drivers/net/ethernet/intel/igb/igb.h @@ -281,6 +281,11 @@ struct igb_ring { u16 count; /* number of desc. in the ring */ u8 queue_index; /* logical index of the ring*/ u8 reg_idx; /* physical index of the ring */ + bool cbs_enable;/* indicates if CBS is enabled */ + s32 idleslope; /* idleSlope in kbps */ + s32 sendslope; /* sendSlope in kbps */ + s32 hicredit; /* hiCredit in bytes */ + s32 locredit; /* loCredit in bytes */ /* everything past this point are written often */ u16 next_to_clean; @@ -621,6 +626,7 @@ struct igb_adapter { #define IGB_FLAG_EEE BIT(14) #define IGB_FLAG_VLAN_PROMISC BIT(15) #define IGB_FLAG_RX_LEGACY BIT(16) +#define IGB_FLAG_FQTSS BIT(17) /* Media Auto Sense */ #define IGB_MAS_ENABLE_0 0X0001 diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index fd4a46b03cc8..47cabca0c99a 100644 --- a/drivers/n
Re: [PATCH 31/31] timer: Switch to testing for .function instead of .data
On Thu, Aug 31, 2017 at 4:59 PM, Kees Cook wrote: > On Thu, Aug 31, 2017 at 4:45 PM, Dmitry Torokhov > wrote: >> On Thu, Aug 31, 2017 at 4:29 PM, Kees Cook wrote: >>> In several places, .data is checked for initialization to gate early >>> calls to del_timer_sync(). Checking for .function is equally valid, so >>> switch to this in all callers. >> >> Not seeing the rest of patches it is unclear from the patch >> description why this is needed/wanted. > > The CC list would have been really giant, but here is the first patch > and the earlier series list: > > https://lkml.org/lkml/2017/8/31/904 > https://lkml.org/lkml/2017/8/30/760 > > tl;dr: We're going to switch all struct timer_list callbacks to get > the timer pointer as the argument instead of from the .data field. > This patch is one step in removing open-coded users of the .data > field. > And that is exactly what should have been in the patch description. FWIW for input bits: Acked-by: Dmitry Torokhov Thanks. -- Dmitry
Re: [PATCH 2/3] security: bpf: Add eBPF LSM hooks and security field to eBPF map
On Thu, Aug 31, 2017 at 3:38 PM, Daniel Borkmann wrote: > On 08/31/2017 10:56 PM, Chenbo Feng wrote: >> >> From: Chenbo Feng >> >> Introduce a pointer into struct bpf_map to hold the security information >> about the map. The actual security struct varies based on the security >> models implemented. Place the LSM hooks before each of the unrestricted >> eBPF operations, the map_update_elem and map_delete_elem operations are >> checked by security_map_modify. The map_lookup_elem and map_get_next_key >> operations are checked by securtiy_map_read. >> >> Signed-off-by: Chenbo Feng > > > Against which tree is this by the way, net-next? There are > changes here which require a rebase of your set. > This patch series is rebased on security subsystem since patch 1/3 is a patch for security system I assume. But I am not sure where this specific patch should go in. If I should submit this one to net-next, I will rebase it against net-next and submit again. >> --- >> include/linux/bpf.h | 3 +++ >> kernel/bpf/syscall.c | 28 >> 2 files changed, 31 insertions(+) >> >> diff --git a/include/linux/bpf.h b/include/linux/bpf.h >> index b69e7a5869ff..ca3e6ff7091d 100644 >> --- a/include/linux/bpf.h >> +++ b/include/linux/bpf.h >> @@ -53,6 +53,9 @@ struct bpf_map { >> struct work_struct work; >> atomic_t usercnt; >> struct bpf_map *inner_map_meta; >> +#ifdef CONFIG_SECURITY >> + void *security; >> +#endif >> }; >> >> /* function argument constraints */ >> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c >> index 045646da97cc..b15580bcf3b1 100644 >> --- a/kernel/bpf/syscall.c >> +++ b/kernel/bpf/syscall.c >> @@ -279,6 +279,10 @@ static int map_create(union bpf_attr *attr) >> if (err) >> return -EINVAL; >> >> + err = security_map_create(); > > > Seems a bit limited to me, don't you want to be able to > also differentiate between different map types? Same goes > for security_prog_load() wrt prog types, no? > I don't want to introduce extra complexity into it if not necessary. 
so I only included the thing needed for the selinux implementation for now. But I agree that the map and program type information could be useful when people developing more complex security checks. I will add a union bpf_attr *attr pointer into the security hook functions for future needs. >> + if (err) >> + return -EACCES; >> + >> /* find map type and init map: hashtable vs rbtree vs bloom vs ... >> */ >> map = find_and_alloc_map(attr); >> if (IS_ERR(map)) >> @@ -291,6 +295,10 @@ static int map_create(union bpf_attr *attr) >> if (err) >> goto free_map_nouncharge; >> >> + err = security_post_create(map); >> + if (err < 0) >> + goto free_map; >> + > > > So the hook you implement in patch 3/3 does: > > +static int selinux_bpf_post_create(struct bpf_map *map) > +{ > + struct bpf_security_struct *bpfsec; > + > + bpfsec = kzalloc(sizeof(*bpfsec), GFP_KERNEL); > + if (!bpfsec) > + return -ENOMEM; > + > + bpfsec->sid = current_sid(); > + map->security = bpfsec; > + > + return 0; > +} > > Where do you kfree() bpfsec when the map gets released > normally or in error path? > Will add it in next version. Thanks for point it out. >> err = bpf_map_alloc_id(map); >> if (err) >> goto free_map; >> @@ -410,6 +418,10 @@ static int map_lookup_elem(union bpf_attr *attr) >> if (IS_ERR(map)) >> return PTR_ERR(map); >> >> + err = security_map_read(map); >> + if (err) >> + return -EACCES; > > > How about actually dropping ref? > May bad, thanks again. >> + >> key = memdup_user(ukey, map->key_size); >> if (IS_ERR(key)) { >> err = PTR_ERR(key); >> @@ -490,6 +502,10 @@ static int map_update_elem(union bpf_attr *attr) >> if (IS_ERR(map)) >> return PTR_ERR(map); >> >> + err = security_map_modify(map); >> + if (err) >> + return -EACCES; > > > Ditto ... 
> >> key = memdup_user(ukey, map->key_size); >> if (IS_ERR(key)) { >> err = PTR_ERR(key); >> @@ -573,6 +589,10 @@ static int map_delete_elem(union bpf_attr *attr) >> if (IS_ERR(map)) >> return PTR_ERR(map); >> >> + err = security_map_modify(map); >> + if (err) >> + return -EACCES; > > > Ditto ... > >> key = memdup_user(ukey, map->key_size); >> if (IS_ERR(key)) { >> err = PTR_ERR(key); >> @@ -616,6 +636,10 @@ static int map_get_next_key(union bpf_attr *attr) >> if (IS_ERR(map)) >> return PTR_ERR(map); >> >> + err = security_map_read(map); >> + if (err) >> + return -EACCES; > > > And once again here ... :( > > >> if (ukey) { >> key = memdup_
Re: [PATCH 19/31] timer: Remove open-coded casts for .data and .function
On 08/31/2017 04:29 PM, Kees Cook wrote: > This standardizes the callback and data prototypes in several places that > perform casting, in an effort to remove more open-coded .data and > .function uses in favor of setup_timer(). > > Cc: Samuel Ortiz > Cc: Tyrel Datwyler > Cc: Benjamin Herrenschmidt > Cc: Paul Mackerras > Cc: Michael Ellerman > Cc: "James E.J. Bottomley" > Cc: "Martin K. Petersen" > Cc: netdev@vger.kernel.org > Cc: linux-s...@vger.kernel.org > Cc: linuxppc-...@lists.ozlabs.org > Signed-off-by: Kees Cook > --- > drivers/net/irda/bfin_sir.c | 5 +++-- > drivers/scsi/ibmvscsi/ibmvfc.c | 14 ++ > drivers/scsi/ibmvscsi/ibmvscsi.c | 8 > 3 files changed, 13 insertions(+), 14 deletions(-) Resending from correct email address. For ibmvfc & ibmvscsi portions: Acked-by: Tyrel Datwyler
Re: [PATCH 19/31] timer: Remove open-coded casts for .data and .function
On 08/31/2017 04:29 PM, Kees Cook wrote: > This standardizes the callback and data prototypes in several places that > perform casting, in an effort to remove more open-coded .data and > .function uses in favor of setup_timer(). > > Cc: Samuel Ortiz > Cc: Tyrel Datwyler > Cc: Benjamin Herrenschmidt > Cc: Paul Mackerras > Cc: Michael Ellerman > Cc: "James E.J. Bottomley" > Cc: "Martin K. Petersen" > Cc: netdev@vger.kernel.org > Cc: linux-s...@vger.kernel.org > Cc: linuxppc-...@lists.ozlabs.org > Signed-off-by: Kees Cook > --- > drivers/net/irda/bfin_sir.c | 5 +++-- > drivers/scsi/ibmvscsi/ibmvfc.c | 14 ++ > drivers/scsi/ibmvscsi/ibmvscsi.c | 8 > 3 files changed, 13 insertions(+), 14 deletions(-) For ibmvfc & ibmvscsi portions: Acked-by: Tyrel Datwyler
Re: netdev carrier changes is one even after ethernet link up.
On 08/30/2017 10:53 PM, Bhadram Varka wrote: > Hi, > > > > I have observed that carrier_changes is one even in case of the ethernet > link is up. > > > > After investigating the code below is my observation – > > > > ethernet_driver_probe() > > +--->phy_connect() > > | +--->phy_attach_direct() > > | +---> netif_carrier_off(): which increments > carrier_changes to one. > > +--->register_netdevice() : will the carrier_changes becomes zero here ? > > +--->netif_carrier_off(): not increment the carrier_changes since > __LINK_STATE_NOCARRIER already set. > > > > From ethernet driver open will start the PHY and trigger the > phy_state_machine. > > Phy_state_machine workqueue calling netif_carrier_on() once the link is UP. > > netif_carrier_on() increments the carrier_changes by one. If the call trace is correct, then there is at least two problems here: - phy_connect() does start the PHY machine which means that as soon as it detects a link state of any kind (up or down) it can call netif_carrier_off() respectively netif_carrier_on() - as soon as you call register_netdevice() notifiers run and other parts of the kernel or user-space programs can see an inconsistent link state I would suggest doing the following sequence instead: netif_carrier_off() register_netdevice() phy_connect() Which should result in a consistent link state and carrier value. > > > > After link is UP if we check the carrier_changes sysfs node - it will be > one only. > > > > $ cat /sys/class/net/eth0/carrier_changes > > 1 > > > > After reverting the change - https://lkml.org/lkml/2016/1/9/173 (net: > phy: turn carrier off on phy attach) then I could see the carrier > changes incremented to 2 after Link UP. > > $ cat /sys/class/net/eth0/carrier_changes > > 2 > > > > Thanks, > > Bhadram. > > > This email message is for the sole use of the intended recipient(s) and > may contain confidential information. Any unauthorized review, use, > disclosure or distribution is prohibited. 
If you are not the intended > recipient, please contact the sender by reply email and destroy all > copies of the original message. > -- Florian
Re: [PATCH net-next v5 2/2] tcp_diag: report TCP MD5 signing keys and addresses
On Fri, Sep 01, 2017 at 01:26:33AM +0200, Sabrina Dubroca wrote: > 2017-08-31, 09:59:39 -0700, Ivan Delalande wrote: > > diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c > > index a748c74aa8b7..abbf0edcf6c2 100644 > > --- a/net/ipv4/tcp_diag.c > > +++ b/net/ipv4/tcp_diag.c > [...] > > +static int tcp_diag_get_aux(struct sock *sk, bool net_admin, > > + struct sk_buff *skb) > > +{ > > +#ifdef CONFIG_TCP_MD5SIG > > + if (net_admin) { > > In tcp_diag_get_aux_size() you put a check for sk_fullsock. I don't > see anything preventing you from reaching this with a !fullsock? Currently handler->idiag_get_aux is only called from inet_sk_diag_fill which has a `BUG_ON(!sk_fullsock(sk));`, but I could add another explicit check in that function if you think it's more consistent. Actually, I wasn't sure when adding this idiag_get_aux in v2 if it should be called from inet_twsk_diag_fill, inet_req_diag_fill and inet_csk_diag_fill, or just the last one. I chose that simpler approach for now to avoid duplicating these state checks in the idiag_get_aux defined by protocols and because we didn't need it for INET_DIAG_MD5SIG, but it shouldn't be too hard to change. Do you think this could be useful for other protocols or attributes? Thank you, -- Ivan Delalande Arista Networks
Re: [PATCH v2 net-next 0/2] net: ubuf_info.refcnt conversion
On Thu, 2017-08-31 at 16:48 -0700, Eric Dumazet wrote: > Yet another atomic_t -> refcount_t conversion, split in two patches. > > First patch prepares the automatic conversion done in the second patch. > > Eric Dumazet (2): > net: prepare (struct ubuf_info)->refcnt conversion > net: convert (struct ubuf_info)->refcnt to refcount_t > > drivers/vhost/net.c| 2 +- > include/linux/skbuff.h | 5 +++-- > net/core/skbuff.c | 14 -- > net/ipv4/tcp.c | 2 -- > 4 files changed, 8 insertions(+), 15 deletions(-) > David please ignore this series, I will send a V3 :)
Re: [RFC net-next 0/8] net: dsa: Multi-queue awareness
On Wed, Aug 30, 2017 at 05:18:44PM -0700, Florian Fainelli wrote: > This patch series is sent as reference, especially because the last patch > is trying not to be creating too many layer violations, but clearly there > are a little bit being created here anyways. > > Essentially what I am trying to achieve is that you have a stacked device > which > is multi-queue aware, that applications will be using, and for which they can > control the queue selection (using mq) the way they want. Each of each stacked > network devices are created for each port of the switch (this is what DSA > does). When a skb is submitted from say net_device X, we can derive its port > number and look at the queue_mapping value to determine which port of the > switch and queue we should be sending this to. The information is embedded in > a > tag (4 bytes) and is used by the switch to steer the transmission. > > These stacked devices will actually transmit using a "master" or conduit > network device which has a number of queues as well. In one version of the > hardware that I work with, we have up to 4 ports, each with 8 queues, and the > master device has a total of 32 hardware queues, so a 1:1 mapping is easy. > With > another version of the hardware, same number of ports and queues, but only 16 > hardware queues, so only a 2:1 mapping is possible. > > In order for congestion information to work properly, I need to establish a > mapping, preferably before transmission starts (but reconfiguration while > interfaces are running would be possible too) between these stacked device's > queue and the conduit interface's queue. > > Comments, flames, rotten tomatoes, anything! Right, i think i understand. This works just for traffic between the host and ports. The host can set the egress queue. And i assume the queues are priorities, either absolute or weighted round robin, etc. But this has no effect on traffic going from port to port. 
At some point, I expect you will want to offload TC for that. How will the two interact? Could the TC rules also act on traffic from the host to a port? Would it be simpler in the long run to just implement TC rules? Andrew
Re: [PATCH 31/31] timer: Switch to testing for .function instead of .data
On Thu, Aug 31, 2017 at 4:45 PM, Dmitry Torokhov wrote: > On Thu, Aug 31, 2017 at 4:29 PM, Kees Cook wrote: >> In several places, .data is checked for initialization to gate early >> calls to del_timer_sync(). Checking for .function is equally valid, so >> switch to this in all callers. > > Not seeing the rest of patches it is unclear from the patch > description why this is needed/wanted. The CC list would have been really giant, but here is the first patch and the earlier series list: https://lkml.org/lkml/2017/8/31/904 https://lkml.org/lkml/2017/8/30/760 tl;dr: We're going to switch all struct timer_list callbacks to get the timer pointer as the argument instead of from the .data field. This patch is one step in removing open-coded users of the .data field. -Kees -- Kees Cook Pixel Security
Re: [PATCH v2 net-next 2/2] net: convert (struct ubuf_info)->refcnt to refcount_t
On Thu, Aug 31, 2017 at 7:48 PM, Eric Dumazet wrote: > refcount_t type and corresponding API should be > used instead of atomic_t when the variable is used as > a reference counter. This allows to avoid accidental > refcounter overflows that might lead to use-after-free > situations. > > v2: added the change in drivers/vhost/net.c as spotted > by Willem. > > Signed-off-by: Eric Dumazet Acked-by: Willem de Bruijn
Re: [PATCH v2 net-next 1/2] net: prepare (struct ubuf_info)->refcnt conversion
On Thu, Aug 31, 2017 at 7:48 PM, Eric Dumazet wrote: > In order to convert this atomic_t refcnt to refcount_t, > we need to init the refcount to one to not trigger > a 0 -> 1 transition. > > This also removes one atomic operation in fast path. > > v2: removed dead code in sock_zerocopy_put_abort() > as suggested by Willem. > > Signed-off-by: Eric Dumazet Acked-by: Willem de Bruijn
[PATCH v2 net-next 1/2] net: prepare (struct ubuf_info)->refcnt conversion
In order to convert this atomic_t refcnt to refcount_t, we need to init the refcount to one to not trigger a 0 -> 1 transition. This also removes one atomic operation in fast path. v2: removed dead code in sock_zerocopy_put_abort() as suggested by Willem. Signed-off-by: Eric Dumazet --- net/core/skbuff.c | 10 ++ net/ipv4/tcp.c| 2 -- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 917da73d3ab3b82163cf0a9ee944da09cb5a391f..1a754d7896ceac1eb85e3b13c1422b4d6a88fedf 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -963,7 +963,7 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size) uarg->len = 1; uarg->bytelen = size; uarg->zerocopy = 1; - atomic_set(&uarg->refcnt, 0); + atomic_set(&uarg->refcnt, 1); sock_hold(sk); return uarg; @@ -1005,6 +1005,7 @@ struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size, uarg->len++; uarg->bytelen = bytelen; atomic_set(&sk->sk_zckey, ++next); + sock_zerocopy_get(uarg); return uarg; } } @@ -1102,13 +1103,6 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg) atomic_dec(&sk->sk_zckey); uarg->len--; - /* sock_zerocopy_put expects a ref. Most sockets take one per -* skb, which is zero on abort. tcp_sendmsg holds one extra, to -* avoid an skb send inside the main loop triggering uarg free. -*/ - if (sk->sk_type != SOCK_STREAM) - atomic_inc(&uarg->refcnt); - sock_zerocopy_put(uarg); } } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 21ca2df274c5130a13d31a391a1408d779af34af..fff4d7bc3b8b46174e7bd0a04d7c212307e55cb5 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1190,8 +1190,6 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) goto out_err; } - /* skb may be freed in main loop, keep extra ref on uarg */ - sock_zerocopy_get(uarg); if (!(sk_check_csum_caps(sk) && sk->sk_route_caps & NETIF_F_SG)) uarg->zerocopy = 0; } -- 2.14.1.581.gf28d330327-goog
[PATCH 25/31] net/atm/mpc: Use separate static data field with with static timer
In preparation for changing the timer callback argument to the timer pointer, move to a separate static data variable. Cc: "David S. Miller" Cc: Andrew Morton Cc: Alexey Dobriyan Cc: "Reshetova, Elena" Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook --- net/atm/mpc.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/atm/mpc.c b/net/atm/mpc.c index 5baa31b8500c..fc24a46500ae 100644 --- a/net/atm/mpc.c +++ b/net/atm/mpc.c @@ -95,7 +95,7 @@ static netdev_tx_t mpc_send_packet(struct sk_buff *skb, static int mpoa_event_listener(struct notifier_block *mpoa_notifier, unsigned long event, void *dev); static void mpc_timer_refresh(void); -static void mpc_cache_check(unsigned long checking_time); +static void mpc_cache_check(unsigned long unused); static struct llc_snap_hdr llc_snap_mpoa_ctrl = { 0xaa, 0xaa, 0x03, @@ -121,7 +121,8 @@ static struct notifier_block mpoa_notifier = { struct mpoa_client *mpcs = NULL; /* FIXME */ static struct atm_mpoa_qos *qos_head = NULL; -static DEFINE_TIMER(mpc_timer, NULL); +static DEFINE_TIMER(mpc_timer, mpc_cache_check); +static unsigned long checking_time; static struct mpoa_client *find_mpc_by_itfnum(int itf) @@ -1411,12 +1412,11 @@ static void clean_up(struct k_message *msg, struct mpoa_client *mpc, int action) static void mpc_timer_refresh(void) { mpc_timer.expires = jiffies + (MPC_P2 * HZ); - mpc_timer.data = mpc_timer.expires; - mpc_timer.function = mpc_cache_check; + checking_time = mpc_timer.expires; add_timer(&mpc_timer); } -static void mpc_cache_check(unsigned long checking_time) +static void mpc_cache_check(unsigned long unused) { struct mpoa_client *mpc = mpcs; static unsigned long previous_resolving_check_time; -- 2.7.4
[PATCH v2 net-next 2/2] net: convert (struct ubuf_info)->refcnt to refcount_t
refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. v2: added the change in drivers/vhost/net.c as spotted by Willem. Signed-off-by: Eric Dumazet --- drivers/vhost/net.c| 2 +- include/linux/skbuff.h | 5 +++-- net/core/skbuff.c | 6 +++--- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index ba08b78ed630c429ccf415af69bf27f2433af4f0..8d2bcae53a2ec9ea1c876625e581bcd429abe365 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -533,7 +533,7 @@ static void handle_tx(struct vhost_net *net) ubuf->callback = vhost_zerocopy_callback; ubuf->ctx = nvq->ubufs; ubuf->desc = nvq->upend_idx; - atomic_set(&ubuf->refcnt, 1); + refcount_set(&ubuf->refcnt, 1); msg.msg_control = ubuf; msg.msg_controllen = sizeof(ubuf); ubufs = nvq->ubufs; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7594e19bce622a38dc39c054093c3da15b99b67b..316a92b45351f53709886ee0099cbc83b66f1b15 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -456,7 +457,7 @@ struct ubuf_info { u32 bytelen; }; }; - atomic_t refcnt; + refcount_t refcnt; struct mmpin { struct user_struct *user; @@ -472,7 +473,7 @@ struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size, static inline void sock_zerocopy_get(struct ubuf_info *uarg) { - atomic_inc(&uarg->refcnt); + refcount_inc(&uarg->refcnt); } void sock_zerocopy_put(struct ubuf_info *uarg); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1a754d7896ceac1eb85e3b13c1422b4d6a88fedf..06b9ddca3188442d1d1d2052fc060cf5b1e2a6f4 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -963,7 +963,7 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size) uarg->len = 1; uarg->bytelen = size; uarg->zerocopy = 1; - 
atomic_set(&uarg->refcnt, 1); + refcount_set(&uarg->refcnt, 1); sock_hold(sk); return uarg; @@ -1086,7 +1086,7 @@ EXPORT_SYMBOL_GPL(sock_zerocopy_callback); void sock_zerocopy_put(struct ubuf_info *uarg) { - if (uarg && atomic_dec_and_test(&uarg->refcnt)) { + if (uarg && refcount_dec_and_test(&uarg->refcnt)) { if (uarg->callback) uarg->callback(uarg, uarg->zerocopy); else @@ -1483,7 +1483,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, if (skb_orphan_frags(skb, gfp_mask)) goto nofrags; if (skb_zcopy(skb)) - atomic_inc(&skb_uarg(skb)->refcnt); + refcount_inc(&skb_uarg(skb)->refcnt); for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) skb_frag_ref(skb, i); -- 2.14.1.581.gf28d330327-goog
[PATCH v2 net-next 0/2] net: ubuf_info.refcnt conversion
Yet another atomic_t -> refcount_t conversion, split in two patches. First patch prepares the automatic conversion done in the second patch. Eric Dumazet (2): net: prepare (struct ubuf_info)->refcnt conversion net: convert (struct ubuf_info)->refcnt to refcount_t drivers/vhost/net.c| 2 +- include/linux/skbuff.h | 5 +++-- net/core/skbuff.c | 14 -- net/ipv4/tcp.c | 2 -- 4 files changed, 8 insertions(+), 15 deletions(-) -- 2.14.1.581.gf28d330327-goog
Re: [PATCH 31/31] timer: Switch to testing for .function instead of .data
On Thu, Aug 31, 2017 at 4:29 PM, Kees Cook wrote: > In several places, .data is checked for initialization to gate early > calls to del_timer_sync(). Checking for .function is equally valid, so > switch to this in all callers. Not seeing the rest of patches it is unclear from the patch description why this is needed/wanted. Thanks. -- Dmitry
Re: [PATCH net] ipv4: Don't override return code from ip_route_input_noref()
2017-08-31, 18:11:41 +0200, Stefano Brivio wrote: > After ip_route_input() calls ip_route_input_noref(), another > check on skb_dst() is done, but if this fails, we shouldn't > override the return code from ip_route_input_noref(), as it > could have been more specific (i.e. -EHOSTUNREACH). > > This also saves one call to skb_dst_force_safe() and one to > skb_dst() in case the ip_route_input_noref() check fails. > > Reported-by: Sabrina Dubroca > Fixes: ad65a2f05695 ("ipv4: call dst_hold_safe() properly") That should be instead: Fixes: 9df16efadd2a ("ipv4: call dst_hold_safe() properly") Acked-by: Sabrina Dubroca -- Sabrina
Re: [RFC net-next 1/8] net: dsa: Allow switch drivers to indicate number of RX/TX queues
On Wed, Aug 30, 2017 at 05:18:45PM -0700, Florian Fainelli wrote: > Let switch drivers indicate how many RX and TX queues they support. Some > switches, such as Broadcom Starfighter 2 are designed with 8 egress > queues. Marvell switches also have egress queues. Does the SF2 have ingress queues? Marvell don't as far as I know. So I wonder if num_rx_queues is useful? Do switches in general have ingress queues? Andrew
[PATCH 20/31] net/core: Collapse redundant sk_timer callback data assignments
The core sk_timer initializer can provide the common .data assignment instead of it being set separately in users. Cc: "David S. Miller" Cc: Ralf Baechle Cc: Andrew Hendry Cc: Eric Dumazet Cc: Paolo Abeni Cc: David Howells Cc: Colin Ian King Cc: Ingo Molnar Cc: linzhang Cc: netdev@vger.kernel.org Cc: linux-h...@vger.kernel.org Cc: linux-...@vger.kernel.org Signed-off-by: Kees Cook --- net/core/sock.c | 2 +- net/netrom/nr_timer.c | 1 - net/rose/rose_timer.c | 1 - net/x25/af_x25.c | 1 - net/x25/x25_timer.c | 1 - 5 files changed, 1 insertion(+), 5 deletions(-) diff --git a/net/core/sock.c b/net/core/sock.c index ac2a404c73eb..5281a998ab32 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2623,7 +2623,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk_init_common(sk); sk->sk_send_head= NULL; - init_timer(&sk->sk_timer); + setup_timer(&sk->sk_timer, NULL, (unsigned long)sk); sk->sk_allocation = GFP_KERNEL; sk->sk_rcvbuf = sysctl_rmem_default; diff --git a/net/netrom/nr_timer.c b/net/netrom/nr_timer.c index 94d05806a9a2..f84ce71f1f5f 100644 --- a/net/netrom/nr_timer.c +++ b/net/netrom/nr_timer.c @@ -45,7 +45,6 @@ void nr_init_timers(struct sock *sk) setup_timer(&nr->idletimer, nr_idletimer_expiry, (unsigned long)sk); /* initialized by sock_init_data */ - sk->sk_timer.data = (unsigned long)sk; sk->sk_timer.function = &nr_heartbeat_expiry; } diff --git a/net/rose/rose_timer.c b/net/rose/rose_timer.c index bc5469d6d9cb..6baa415b199a 100644 --- a/net/rose/rose_timer.c +++ b/net/rose/rose_timer.c @@ -36,7 +36,6 @@ void rose_start_heartbeat(struct sock *sk) { del_timer(&sk->sk_timer); - sk->sk_timer.data = (unsigned long)sk; sk->sk_timer.function = &rose_heartbeat_expiry; sk->sk_timer.expires = jiffies + 5 * HZ; diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 5a1a98df3499..a5ac385b9120 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -414,7 +414,6 @@ static void __x25_destroy_socket(struct sock *sk) /* Defer: outstanding buffers */ 
sk->sk_timer.expires = jiffies + 10 * HZ; sk->sk_timer.function = x25_destroy_timer; - sk->sk_timer.data = (unsigned long)sk; add_timer(&sk->sk_timer); } else { /* drop last reference so sock_put will free */ diff --git a/net/x25/x25_timer.c b/net/x25/x25_timer.c index 5c5db1a36399..de5cec41d100 100644 --- a/net/x25/x25_timer.c +++ b/net/x25/x25_timer.c @@ -36,7 +36,6 @@ void x25_init_timers(struct sock *sk) setup_timer(&x25->timer, x25_timer_expiry, (unsigned long)sk); /* initialized by sock_init_data */ - sk->sk_timer.data = (unsigned long)sk; sk->sk_timer.function = &x25_heartbeat_expiry; } -- 2.7.4
[PATCH 19/31] timer: Remove open-coded casts for .data and .function
This standardizes the callback and data prototypes in several places that perform casting, in an effort to remove more open-coded .data and .function uses in favor of setup_timer(). Cc: Samuel Ortiz Cc: Tyrel Datwyler Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: "James E.J. Bottomley" Cc: "Martin K. Petersen" Cc: netdev@vger.kernel.org Cc: linux-s...@vger.kernel.org Cc: linuxppc-...@lists.ozlabs.org Signed-off-by: Kees Cook --- drivers/net/irda/bfin_sir.c | 5 +++-- drivers/scsi/ibmvscsi/ibmvfc.c | 14 ++ drivers/scsi/ibmvscsi/ibmvscsi.c | 8 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/drivers/net/irda/bfin_sir.c b/drivers/net/irda/bfin_sir.c index 3151b580dbd6..c9413bd580a7 100644 --- a/drivers/net/irda/bfin_sir.c +++ b/drivers/net/irda/bfin_sir.c @@ -317,8 +317,9 @@ static void bfin_sir_dma_rx_chars(struct net_device *dev) async_unwrap_char(dev, &self->stats, &self->rx_buff, port->rx_dma_buf.buf[i]); } -void bfin_sir_rx_dma_timeout(struct net_device *dev) +void bfin_sir_rx_dma_timeout(unsigned long data) { + struct net_device *dev = (struct net_device *)data; struct bfin_sir_self *self = netdev_priv(dev); struct bfin_sir_port *port = self->sir_port; int x_pos, pos; @@ -406,7 +407,7 @@ static int bfin_sir_startup(struct bfin_sir_port *port, struct net_device *dev) enable_dma(port->rx_dma_channel); port->rx_dma_timer.data = (unsigned long)(dev); - port->rx_dma_timer.function = (void *)bfin_sir_rx_dma_timeout; + port->rx_dma_timer.function = bfin_sir_rx_dma_timeout; #else diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c index cc4e05be8d4a..1be20688dd1f 100644 --- a/drivers/scsi/ibmvscsi/ibmvfc.c +++ b/drivers/scsi/ibmvscsi/ibmvfc.c @@ -1393,8 +1393,9 @@ static int ibmvfc_map_sg_data(struct scsi_cmnd *scmd, * * Called when an internally generated command times out **/ -static void ibmvfc_timeout(struct ibmvfc_event *evt) +static void ibmvfc_timeout(unsigned long data) { + struct ibmvfc_event *evt 
= (struct ibmvfc_event *)data; struct ibmvfc_host *vhost = evt->vhost; dev_err(vhost->dev, "Command timed out (%p). Resetting connection\n", evt); ibmvfc_reset_host(vhost); @@ -1424,12 +1425,10 @@ static int ibmvfc_send_event(struct ibmvfc_event *evt, BUG(); list_add_tail(&evt->queue, &vhost->sent); - init_timer(&evt->timer); + setup_timer(&evt->timer, ibmvfc_timeout, (unsigned long)evt); if (timeout) { - evt->timer.data = (unsigned long) evt; evt->timer.expires = jiffies + (timeout * HZ); - evt->timer.function = (void (*)(unsigned long))ibmvfc_timeout; add_timer(&evt->timer); } @@ -3696,8 +3695,9 @@ static void ibmvfc_tgt_adisc_cancel_done(struct ibmvfc_event *evt) * out, reset the CRQ. When the ADISC comes back as cancelled, * log back into the target. **/ -static void ibmvfc_adisc_timeout(struct ibmvfc_target *tgt) +static void ibmvfc_adisc_timeout(unsigned long data) { + struct ibmvfc_target *tgt = (struct ibmvfc_target *)data; struct ibmvfc_host *vhost = tgt->vhost; struct ibmvfc_event *evt; struct ibmvfc_tmf *tmf; @@ -3782,9 +3782,7 @@ static void ibmvfc_tgt_adisc(struct ibmvfc_target *tgt) if (timer_pending(&tgt->timer)) mod_timer(&tgt->timer, jiffies + (IBMVFC_ADISC_TIMEOUT * HZ)); else { - tgt->timer.data = (unsigned long) tgt; tgt->timer.expires = jiffies + (IBMVFC_ADISC_TIMEOUT * HZ); - tgt->timer.function = (void (*)(unsigned long))ibmvfc_adisc_timeout; add_timer(&tgt->timer); } @@ -3916,7 +3914,7 @@ static int ibmvfc_alloc_target(struct ibmvfc_host *vhost, u64 scsi_id) tgt->vhost = vhost; tgt->need_login = 1; tgt->cancel_key = vhost->task_set++; - init_timer(&tgt->timer); + setup_timer(&tgt->timer, ibmvfc_adisc_timeout, (unsigned long)tgt); kref_init(&tgt->kref); ibmvfc_init_tgt(tgt, ibmvfc_tgt_implicit_logout); spin_lock_irqsave(vhost->host->host_lock, flags); diff --git a/drivers/scsi/ibmvscsi/ibmvscsi.c b/drivers/scsi/ibmvscsi/ibmvscsi.c index da22b3665cb0..44ae85903a00 100644 --- a/drivers/scsi/ibmvscsi/ibmvscsi.c +++ 
b/drivers/scsi/ibmvscsi/ibmvscsi.c @@ -837,8 +837,9 @@ static void ibmvscsi_reset_host(struct ibmvscsi_host_data *hostdata) * * Called when an internally generated command times out */ -static void ibmvscsi_timeout(struct srp_event_struct *evt_struct) +static void ibmvscsi_timeout(unsigned long data) { + struct srp_event_struct *evt_struct = (struct srp_event_struct *)data; struct ibmvscsi_host_data *hostdata = evt_struct->hostdata; dev_err(hostdata->dev, "Command timed out (%x). Resetting connection\n", @
[PATCH 30/31] appletalk: Remove unneeded synchronization
The use of del_timer_sync() will make sure a timer is not rescheduled. As such, there is no need to add external signals to kill timers. In preparation for switching the timer callback argument to the timer pointer, this drops the .data argument since it doesn't serve a meaningful purpose here. Cc: David Howells Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook --- drivers/net/appletalk/ltpc.c | 6 -- 1 file changed, 6 deletions(-) diff --git a/drivers/net/appletalk/ltpc.c b/drivers/net/appletalk/ltpc.c index e4aa374caa4d..cc3dc9337eae 100644 --- a/drivers/net/appletalk/ltpc.c +++ b/drivers/net/appletalk/ltpc.c @@ -880,14 +880,10 @@ static void ltpc_poll(unsigned long l) } ltpc_poll_counter--; } - - if (!dev) - return; /* we've been downed */ /* poll 20 times per second */ idle(dev); ltpc_timer.expires = jiffies + HZ/20; - add_timer(
[PATCH 31/31] timer: Switch to testing for .function instead of .data
In several places, .data is checked for initialization to gate early calls to del_timer_sync(). Checking for .function is equally valid, so switch to this in all callers. Cc: "Rafael J. Wysocki" Cc: Pavel Machek Cc: Len Brown Cc: Greg Kroah-Hartman Cc: Mike Marciniszyn Cc: Dennis Dalessandro Cc: Doug Ledford Cc: Sean Hefty Cc: Hal Rosenstock Cc: Dmitry Torokhov Cc: Jeff Kirsher Cc: linux...@vger.kernel.org Cc: linux-r...@vger.kernel.org Cc: linux-in...@vger.kernel.org Cc: intel-wired-...@lists.osuosl.org Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook --- drivers/base/power/wakeup.c | 3 +-- drivers/infiniband/hw/hfi1/chip.c | 6 ++ drivers/infiniband/hw/hfi1/init.c | 2 +- drivers/infiniband/hw/qib/qib_iba7220.c | 2 +- drivers/infiniband/hw/qib/qib_iba7322.c | 2 +- drivers/infiniband/hw/qib/qib_init.c| 14 +- drivers/infiniband/hw/qib/qib_mad.c | 2 +- drivers/input/input.c | 5 ++--- drivers/net/ethernet/intel/i40e/i40e_main.c | 2 +- 9 files changed, 15 insertions(+), 23 deletions(-) diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c index 144e6d8fafc8..79a3c1b204af 100644 --- a/drivers/base/power/wakeup.c +++ b/drivers/base/power/wakeup.c @@ -479,8 +479,7 @@ static bool wakeup_source_not_registered(struct wakeup_source *ws) * Use timer struct to check if the given source is initialized * by wakeup_source_add. 
*/ - return ws->timer.function != pm_wakeup_timer_fn || - ws->timer.data != (unsigned long)ws; + return ws->timer.function != pm_wakeup_timer_fn; } /* diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 94b54850ec75..53a6596cd7d6 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -5513,9 +5513,8 @@ static int init_rcverr(struct hfi1_devdata *dd) static void free_rcverr(struct hfi1_devdata *dd) { - if (dd->rcverr_timer.data) + if (dd->rcverr_timer.function) del_timer_sync(&dd->rcverr_timer); - dd->rcverr_timer.data = 0; } static void handle_rxe_err(struct hfi1_devdata *dd, u32 unused, u64 reg) @@ -11992,9 +11991,8 @@ static void free_cntrs(struct hfi1_devdata *dd) struct hfi1_pportdata *ppd; int i; - if (dd->synth_stats_timer.data) + if (dd->synth_stats_timer.function) del_timer_sync(&dd->synth_stats_timer); - dd->synth_stats_timer.data = 0; ppd = (struct hfi1_pportdata *)(dd + 1); for (i = 0; i < dd->num_pports; i++, ppd++) { kfree(ppd->cntrs); diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 4a11d4da4c92..bc2af709c111 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -839,7 +839,7 @@ static void stop_timers(struct hfi1_devdata *dd) for (pidx = 0; pidx < dd->num_pports; ++pidx) { ppd = dd->pport + pidx; - if (ppd->led_override_timer.data) { + if (ppd->led_override_timer.function) { del_timer_sync(&ppd->led_override_timer); atomic_set(&ppd->led_override_timer_active, 0); } diff --git a/drivers/infiniband/hw/qib/qib_iba7220.c b/drivers/infiniband/hw/qib/qib_iba7220.c index b1d512c7ff4b..22fd65fe7193 100644 --- a/drivers/infiniband/hw/qib/qib_iba7220.c +++ b/drivers/infiniband/hw/qib/qib_iba7220.c @@ -1662,7 +1662,7 @@ static void qib_7220_quiet_serdes(struct qib_pportdata *ppd) dd->control | QLOGIC_IB_C_FREEZEMODE); ppd->cpspec->chase_end = 0; - if (ppd->cpspec->chase_timer.data) /* if initted */ + if 
(ppd->cpspec->chase_timer.function) /* if initted */ del_timer_sync(&ppd->cpspec->chase_timer); if (ppd->cpspec->ibsymdelta || ppd->cpspec->iblnkerrdelta || diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c index bb2439fff8fa..471aaf6bcbf2 100644 --- a/drivers/infiniband/hw/qib/qib_iba7322.c +++ b/drivers/infiniband/hw/qib/qib_iba7322.c @@ -2531,7 +2531,7 @@ static void qib_7322_mini_quiet_serdes(struct qib_pportdata *ppd) cancel_delayed_work_sync(&ppd->cpspec->ipg_work); ppd->cpspec->chase_end = 0; - if (ppd->cpspec->chase_timer.data) /* if initted */ + if (ppd->cpspec->chase_timer.function) /* if initted */ del_timer_sync(&ppd->cpspec->chase_timer); /* diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c index 6c16ba1107ba..66fb0318660b 100644 --- a/drivers/infiniband/hw/qib/qib_init.c +++ b/drivers/infiniband/hw/qib/qib_init.c @@ -815,23 +815,19 @@ static void qib_stop_timers(struct qib_devdata *dd) struct qib_pportdata *ppd; int pidx; - if (dd->stats_timer.dat
[PATCH 13/31] timer: Remove meaningless .data/.function assignments
Several timer users needlessly reset their .function/.data fields during their timer callback, but nothing else changes them. Some users do not use their .data field at all. Each instance is removed here. Cc: Krzysztof Halasa Cc: Aditya Shankar Cc: Ganesh Krishna Cc: Greg Kroah-Hartman Cc: Jens Axboe Cc: netdev@vger.kernel.org Cc: linux-wirel...@vger.kernel.org Cc: de...@driverdev.osuosl.org Signed-off-by: Kees Cook --- drivers/block/amiflop.c | 3 +-- drivers/net/wan/hdlc_cisco.c | 2 -- drivers/net/wan/hdlc_fr.c | 2 -- drivers/staging/wilc1000/wilc_wfi_cfgoperations.c | 4 +--- 4 files changed, 2 insertions(+), 9 deletions(-) diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index c4b1cba27178..6680d75bc857 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -323,7 +323,7 @@ static void fd_deselect (int drive) } -static void motor_on_callback(unsigned long nr) +static void motor_on_callback(unsigned long ignored) { if (!(ciaa.pra & DSKRDY) || --on_attempts == 0) { complete_all(&motor_on_completion); @@ -344,7 +344,6 @@ static int fd_motor_on(int nr) fd_select(nr); reinit_completion(&motor_on_completion); - motor_on_timer.data = nr; mod_timer(&motor_on_timer, jiffies + HZ/2); on_attempts = 10; diff --git a/drivers/net/wan/hdlc_cisco.c b/drivers/net/wan/hdlc_cisco.c index c696d42f4502..6c98d85f2773 100644 --- a/drivers/net/wan/hdlc_cisco.c +++ b/drivers/net/wan/hdlc_cisco.c @@ -276,8 +276,6 @@ static void cisco_timer(unsigned long arg) spin_unlock(&st->lock); st->timer.expires = jiffies + st->settings.interval * HZ; - st->timer.function = cisco_timer; - st->timer.data = arg; add_timer(&st->timer); } diff --git a/drivers/net/wan/hdlc_fr.c b/drivers/net/wan/hdlc_fr.c index de42faca076a..7da2424c28a4 100644 --- a/drivers/net/wan/hdlc_fr.c +++ b/drivers/net/wan/hdlc_fr.c @@ -644,8 +644,6 @@ static void fr_timer(unsigned long arg) state(hdlc)->settings.t391 * HZ; } - state(hdlc)->timer.function = fr_timer; - state(hdlc)->timer.data = arg; 
add_timer(&state(hdlc)->timer); } diff --git a/drivers/staging/wilc1000/wilc_wfi_cfgoperations.c b/drivers/staging/wilc1000/wilc_wfi_cfgoperations.c index 68fd5b3b8b2d..2fca2b017093 100644 --- a/drivers/staging/wilc1000/wilc_wfi_cfgoperations.c +++ b/drivers/staging/wilc1000/wilc_wfi_cfgoperations.c @@ -275,7 +275,7 @@ static void update_scan_time(void) last_scanned_shadow[i].time_scan = jiffies; } -static void remove_network_from_shadow(unsigned long arg) +static void remove_network_from_shadow(unsigned long unused) { unsigned long now = jiffies; int i, j; @@ -296,7 +296,6 @@ static void remove_network_from_shadow(unsigned long arg) } if (last_scanned_cnt != 0) { - hAgingTimer.data = arg; mod_timer(&hAgingTimer, jiffies + msecs_to_jiffies(AGING_TIME)); } } @@ -313,7 +312,6 @@ static int is_network_in_shadow(struct network_info *pstrNetworkInfo, int i; if (last_scanned_cnt == 0) { - hAgingTimer.data = (unsigned long)user_void; mod_timer(&hAgingTimer, jiffies + msecs_to_jiffies(AGING_TIME)); state = -1; } else { -- 2.7.4
[RFC PATCH] net: Introduce a socket option to enable picking tx queue based on rx queue.
This patch introduces a new socket option SO_SYMMETRIC_QUEUES that can be used to enable symmetric tx and rx queues on a socket. This option is specifically useful for epoll based multi threaded workloads where each thread handles packets received on a single RX queue . In this model, we have noticed that it helps to send the packets on the same TX queue corresponding to the queue-pair associated with the RX queue specifically when busy poll is enabled with epoll(). Two new fields are added to struct sock_common to cache the last rx ifindex and the rx queue in the receive path of an SKB. __netdev_pick_tx() returns the cached rx queue when this option is enabled and the TX is happening on the same device. Signed-off-by: Sridhar Samudrala --- include/net/request_sock.h| 1 + include/net/sock.h| 17 + include/uapi/asm-generic/socket.h | 2 ++ net/core/dev.c| 8 +++- net/core/sock.c | 10 ++ net/ipv4/tcp_input.c | 1 + net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_minisocks.c | 1 + 8 files changed, 40 insertions(+), 1 deletion(-) diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 23e2205..c3bc12e 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -100,6 +100,7 @@ static inline struct sock *req_to_sk(struct request_sock *req) req_to_sk(req)->sk_prot = sk_listener->sk_prot; sk_node_init(&req_to_sk(req)->sk_node); sk_tx_queue_clear(req_to_sk(req)); + req_to_sk(req)->sk_symmetric_queues = sk_listener->sk_symmetric_queues; req->saved_syn = NULL; refcount_set(&req->rsk_refcnt, 0); diff --git a/include/net/sock.h b/include/net/sock.h index 03a3625..3421809 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -138,11 +138,14 @@ void SOCK_DEBUG(const struct sock *sk, const char *msg, ...) 
* @skc_node: main hash linkage for various protocol lookup tables * @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol * @skc_tx_queue_mapping: tx queue number for this connection + * @skc_rx_queue_mapping: rx queue number for this connection + * @skc_rx_ifindex: rx ifindex for this connection * @skc_flags: place holder for sk_flags * %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE, * %SO_OOBINLINE settings, %SO_TIMESTAMPING settings * @skc_incoming_cpu: record/match cpu processing incoming packets * @skc_refcnt: reference count + * @skc_symmetric_queues: symmetric tx/rx queues * * This is the minimal network layer representation of sockets, the header * for struct sock and struct inet_timewait_sock. @@ -177,6 +180,7 @@ struct sock_common { unsigned char skc_reuseport:1; unsigned char skc_ipv6only:1; unsigned char skc_net_refcnt:1; + unsigned char skc_symmetric_queues:1; int skc_bound_dev_if; union { struct hlist_node skc_bind_node; @@ -214,6 +218,8 @@ struct sock_common { struct hlist_nulls_node skc_nulls_node; }; int skc_tx_queue_mapping; + int skc_rx_queue_mapping; + int skc_rx_ifindex; union { int skc_incoming_cpu; u32 skc_rcv_wnd; @@ -324,6 +330,8 @@ struct sock { #define sk_nulls_node __sk_common.skc_nulls_node #define sk_refcnt __sk_common.skc_refcnt #define sk_tx_queue_mapping__sk_common.skc_tx_queue_mapping +#define sk_rx_queue_mapping__sk_common.skc_rx_queue_mapping +#define sk_rx_ifindex __sk_common.skc_rx_ifindex #define sk_dontcopy_begin __sk_common.skc_dontcopy_begin #define sk_dontcopy_end__sk_common.skc_dontcopy_end @@ -340,6 +348,7 @@ struct sock { #define sk_reuseport __sk_common.skc_reuseport #define sk_ipv6only__sk_common.skc_ipv6only #define sk_net_refcnt __sk_common.skc_net_refcnt +#define sk_symmetric_queues__sk_common.skc_symmetric_queues #define sk_bound_dev_if__sk_common.skc_bound_dev_if #define sk_bind_node __sk_common.skc_bind_node #define sk_prot__sk_common.skc_prot @@ -1676,6 +1685,14 @@ static inline int 
sk_tx_queue_get(const struct sock *sk) return sk ? sk->sk_tx_queue_mapping : -1; } +static inline void sk_mark_rx_queue(struct sock *sk, struct sk_buff *skb) +{ + if (sk->sk_symmetric_queues) { + sk->sk_rx_ifindex = skb->skb_iif; + sk->sk_rx_queue_mapping = skb_get_rx_queue(skb); + } +} + static inline void sk_set_socket(struct sock *sk, struct socket *sock) { sk_tx_queue_clear(sk); diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/so
Re: [PATCH net-next v5 2/2] tcp_diag: report TCP MD5 signing keys and addresses
2017-08-31, 09:59:39 -0700, Ivan Delalande wrote: > diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c > index a748c74aa8b7..abbf0edcf6c2 100644 > --- a/net/ipv4/tcp_diag.c > +++ b/net/ipv4/tcp_diag.c [...] > +static int tcp_diag_get_aux(struct sock *sk, bool net_admin, > + struct sk_buff *skb) > +{ > +#ifdef CONFIG_TCP_MD5SIG > + if (net_admin) { In tcp_diag_get_aux_size() you put a check for sk_fullsock. I don't see anything preventing you from reaching this with a !fullsock? > + struct tcp_md5sig_info *md5sig; > + int err = 0; > + > + rcu_read_lock(); > + md5sig = rcu_dereference(tcp_sk(sk)->md5sig_info); > + if (md5sig) > + err = tcp_diag_put_md5sig(skb, md5sig); > + rcu_read_unlock(); > + if (err < 0) > + return err; > + } > +#endif > + > + return 0; > +} > + > +static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin) > +{ > + size_t size = 0; > + > +#ifdef CONFIG_TCP_MD5SIG > + if (net_admin && sk_fullsock(sk)) { > + const struct tcp_md5sig_info *md5sig; > + const struct tcp_md5sig_key *key; > + size_t md5sig_count = 0; > + > + rcu_read_lock(); > + md5sig = rcu_dereference(tcp_sk(sk)->md5sig_info); > + if (md5sig) { > + hlist_for_each_entry_rcu(key, &md5sig->head, node) > + md5sig_count++; > + } > + rcu_read_unlock(); > + size += nla_total_size(md5sig_count * > +sizeof(struct tcp_diag_md5sig)); > + } > +#endif > + > + return size; > +} > + > static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, > const struct inet_diag_req_v2 *r, struct nlattr *bc) > { -- Sabrina
[PATCH net-next 0/2] netvsc: transparent VF related cleanups
The first gets rid of unnecessary ref counting, and second allows removing hv_netvsc driver even if VF present. Stephen Hemminger (2): netvsc: cleanup datapath switch netvsc: allow driver to be removed even if VF is present drivers/net/hyperv/netvsc_drv.c | 55 - 1 file changed, 16 insertions(+), 39 deletions(-) -- 2.11.0
[PATCH net-next 2/2] netvsc: allow driver to be removed even if VF is present
If VF is attached then can still allow netvsc driver module to be removed. Just have to make sure and do the cleanup. Also, avoid extra rtnl round trip when calling unregister. Signed-off-by: Stephen Hemminger --- drivers/net/hyperv/netvsc_drv.c | 17 + 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 128165e49c9a..97ed4bdc439f 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -1834,9 +1834,6 @@ static int netvsc_register_vf(struct net_device *vf_netdev) netdev_info(ndev, "VF registering: %s\n", vf_netdev->name); - /* Prevent this module from being unloaded while VF is registered */ - try_module_get(THIS_MODULE); - dev_hold(vf_netdev); rcu_assign_pointer(net_device_ctx->vf_netdev, vf_netdev); return NOTIFY_OK; @@ -1880,10 +1877,11 @@ static int netvsc_unregister_vf(struct net_device *vf_netdev) netdev_info(ndev, "VF unregistering: %s\n", vf_netdev->name); + netdev_rx_handler_unregister(vf_netdev); netdev_upper_dev_unlink(vf_netdev, ndev); RCU_INIT_POINTER(net_device_ctx->vf_netdev, NULL); dev_put(vf_netdev); - module_put(THIS_MODULE); + return NOTIFY_OK; } @@ -1987,11 +1985,11 @@ static int netvsc_probe(struct hv_device *dev, static int netvsc_remove(struct hv_device *dev) { - struct net_device *net; struct net_device_context *ndev_ctx; + struct net_device *vf_netdev; + struct net_device *net; net = hv_get_drvdata(dev); - if (net == NULL) { dev_err(&dev->device, "No net device to remove\n"); return 0; @@ -2008,12 +2006,15 @@ static int netvsc_remove(struct hv_device *dev) * removed. Also blocks mtu and channel changes. */ rtnl_lock(); + vf_netdev = rtnl_dereference(ndev_ctx->vf_netdev); + if (vf_netdev) + netvsc_unregister_vf(vf_netdev); + rndis_filter_device_remove(dev, rtnl_dereference(ndev_ctx->nvdev)); + unregister_netdevice(net); rtnl_unlock(); - unregister_netdev(net); - hv_set_drvdata(dev, NULL); free_percpu(ndev_ctx->vf_stats); -- 2.11.0
[PATCH net-next 1/2] netvsc: cleanup datapath switch
Use one routine for datapath up/down. Don't need to reopen the rndis layer. Signed-off-by: Stephen Hemminger --- drivers/net/hyperv/netvsc_drv.c | 38 +++--- 1 file changed, 7 insertions(+), 31 deletions(-) diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index fac44c5c8d0d..128165e49c9a 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -1842,11 +1842,13 @@ static int netvsc_register_vf(struct net_device *vf_netdev) return NOTIFY_OK; } -static int netvsc_vf_up(struct net_device *vf_netdev) +/* VF up/down change detected, schedule to change data path */ +static int netvsc_vf_changed(struct net_device *vf_netdev) { struct net_device_context *net_device_ctx; struct netvsc_device *netvsc_dev; struct net_device *ndev; + bool vf_is_up = netif_running(vf_netdev); ndev = get_netvsc_byref(vf_netdev); if (!ndev) @@ -1857,34 +1859,9 @@ static int netvsc_vf_up(struct net_device *vf_netdev) if (!netvsc_dev) return NOTIFY_DONE; - /* Bump refcount when datapath is acvive - Why? */ - rndis_filter_open(netvsc_dev); - - /* notify the host to switch the data path. */ - netvsc_switch_datapath(ndev, true); - netdev_info(ndev, "Data path switched to VF: %s\n", vf_netdev->name); - - return NOTIFY_OK; -} - -static int netvsc_vf_down(struct net_device *vf_netdev) -{ - struct net_device_context *net_device_ctx; - struct netvsc_device *netvsc_dev; - struct net_device *ndev; - - ndev = get_netvsc_byref(vf_netdev); - if (!ndev) - return NOTIFY_DONE; - - net_device_ctx = netdev_priv(ndev); - netvsc_dev = rtnl_dereference(net_device_ctx->nvdev); - if (!netvsc_dev) - return NOTIFY_DONE; - - netvsc_switch_datapath(ndev, false); - netdev_info(ndev, "Data path switched from VF: %s\n", vf_netdev->name); - rndis_filter_close(netvsc_dev); + netvsc_switch_datapath(ndev, vf_is_up); + netdev_info(ndev, "Data path switched %s VF: %s\n", + vf_is_up ? 
"to" : "from", vf_netdev->name); return NOTIFY_OK; } @@ -2094,9 +2071,8 @@ static int netvsc_netdev_event(struct notifier_block *this, case NETDEV_UNREGISTER: return netvsc_unregister_vf(event_dev); case NETDEV_UP: - return netvsc_vf_up(event_dev); case NETDEV_DOWN: - return netvsc_vf_down(event_dev); + return netvsc_vf_changed(event_dev); default: return NOTIFY_DONE; } -- 2.11.0
Re: [PATCH net-next 2/2] net: convert (struct ubuf_info)->refcnt to refcount_t
On Thu, 2017-08-31 at 18:45 -0400, Willem de Bruijn wrote: > On Thu, Aug 31, 2017 at 4:30 PM, Eric Dumazet wrote: > > refcount_t type and corresponding API should be > > used instead of atomic_t when the variable is used as > > a reference counter. This allows to avoid accidental > > refcounter overflows that might lead to use-after-free > > situations. > > > > Signed-off-by: Eric Dumazet > > --- > > include/linux/skbuff.h | 5 +++-- > > net/core/skbuff.c | 8 > > 2 files changed, 7 insertions(+), 6 deletions(-) > > > > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h > > index > > 7594e19bce622a38dc39c054093c3da15b99b67b..316a92b45351f53709886ee0099cbc83b66f1b15 > > 100644 > > --- a/include/linux/skbuff.h > > +++ b/include/linux/skbuff.h > > @@ -22,6 +22,7 @@ > > #include > > #include > > #include > > +#include > > > > #include > > #include > > @@ -456,7 +457,7 @@ struct ubuf_info { > > u32 bytelen; > > }; > > }; > > - atomic_t refcnt; > > + refcount_t refcnt; > > > > struct mmpin { > > struct user_struct *user; > > @@ -472,7 +473,7 @@ struct ubuf_info *sock_zerocopy_realloc(struct sock > > *sk, size_t size, > > > > static inline void sock_zerocopy_get(struct ubuf_info *uarg) > > { > > - atomic_inc(&uarg->refcnt); > > + refcount_inc(&uarg->refcnt); > > } > > > > void sock_zerocopy_put(struct ubuf_info *uarg); > > diff --git a/net/core/skbuff.c b/net/core/skbuff.c > > index > > 65b9ca3945f8fd2b1bef4aef5dd774be04e5d128..ed86ca9afd9d8d1ac47983acf6006c179285a612 > > 100644 > > --- a/net/core/skbuff.c > > +++ b/net/core/skbuff.c > > @@ -963,7 +963,7 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, > > size_t size) > > uarg->len = 1; > > uarg->bytelen = size; > > uarg->zerocopy = 1; > > - atomic_set(&uarg->refcnt, 1); > > + refcount_set(&uarg->refcnt, 1); > > sock_hold(sk); > > > > return uarg; > > @@ -1086,7 +1086,7 @@ EXPORT_SYMBOL_GPL(sock_zerocopy_callback); > > > > void sock_zerocopy_put(struct ubuf_info *uarg) > > { > > - if (uarg && 
atomic_dec_and_test(&uarg->refcnt)) { > > + if (uarg && refcount_dec_and_test(&uarg->refcnt)) { > > if (uarg->callback) > > uarg->callback(uarg, uarg->zerocopy); > > else > > @@ -1108,7 +1108,7 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg) > > * avoid an skb send inside the main loop triggering uarg > > free. > > */ > > if (sk->sk_type != SOCK_STREAM) > > - atomic_inc(&uarg->refcnt); > > + refcount_inc(&uarg->refcnt); > > This would be a 0 -> 1 transition. It is taken only on the error path from > a socket that does not take an explicit hold at the start of sendmsg. > Arg. > The code is vestigial, really, as the final patchset only includes tcp. > It is safe to leave as is for now, or I can remove it. > OK I probably should remove this chunk in patch 1/2 then. > In 1f8b977ab32d ("sock: enable MSG_ZEROCOPY") I also updated > other users of ubuf_info to initialize refcnt. We probably also want to > convert the call in handle_tx in drivers/vhost/net.c. Thanks ! :)
Re: [PATCH net-next 2/2] net: convert (struct ubuf_info)->refcnt to refcount_t
On Thu, Aug 31, 2017 at 4:30 PM, Eric Dumazet wrote: > refcount_t type and corresponding API should be > used instead of atomic_t when the variable is used as > a reference counter. This allows to avoid accidental > refcounter overflows that might lead to use-after-free > situations. > > Signed-off-by: Eric Dumazet > --- > include/linux/skbuff.h | 5 +++-- > net/core/skbuff.c | 8 > 2 files changed, 7 insertions(+), 6 deletions(-) > > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h > index > 7594e19bce622a38dc39c054093c3da15b99b67b..316a92b45351f53709886ee0099cbc83b66f1b15 > 100644 > --- a/include/linux/skbuff.h > +++ b/include/linux/skbuff.h > @@ -22,6 +22,7 @@ > #include > #include > #include > +#include > > #include > #include > @@ -456,7 +457,7 @@ struct ubuf_info { > u32 bytelen; > }; > }; > - atomic_t refcnt; > + refcount_t refcnt; > > struct mmpin { > struct user_struct *user; > @@ -472,7 +473,7 @@ struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, > size_t size, > > static inline void sock_zerocopy_get(struct ubuf_info *uarg) > { > - atomic_inc(&uarg->refcnt); > + refcount_inc(&uarg->refcnt); > } > > void sock_zerocopy_put(struct ubuf_info *uarg); > diff --git a/net/core/skbuff.c b/net/core/skbuff.c > index > 65b9ca3945f8fd2b1bef4aef5dd774be04e5d128..ed86ca9afd9d8d1ac47983acf6006c179285a612 > 100644 > --- a/net/core/skbuff.c > +++ b/net/core/skbuff.c > @@ -963,7 +963,7 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, > size_t size) > uarg->len = 1; > uarg->bytelen = size; > uarg->zerocopy = 1; > - atomic_set(&uarg->refcnt, 1); > + refcount_set(&uarg->refcnt, 1); > sock_hold(sk); > > return uarg; > @@ -1086,7 +1086,7 @@ EXPORT_SYMBOL_GPL(sock_zerocopy_callback); > > void sock_zerocopy_put(struct ubuf_info *uarg) > { > - if (uarg && atomic_dec_and_test(&uarg->refcnt)) { > + if (uarg && refcount_dec_and_test(&uarg->refcnt)) { > if (uarg->callback) > uarg->callback(uarg, uarg->zerocopy); > else > @@ -1108,7 +1108,7 @@ void 
sock_zerocopy_put_abort(struct ubuf_info *uarg) > * avoid an skb send inside the main loop triggering uarg > free. > */ > if (sk->sk_type != SOCK_STREAM) > - atomic_inc(&uarg->refcnt); > + refcount_inc(&uarg->refcnt); This would be a 0 -> 1 transition. It is taken only on the error path from a socket that does not take an explicit hold at the start of sendmsg. The code is vestigial, really, as the final patchset only includes tcp. It is safe to leave as is for now, or I can remove it. In 1f8b977ab32d ("sock: enable MSG_ZEROCOPY") I also updated other users of ubuf_info to initialize refcnt. We probably also want to convert the call in handle_tx in drivers/vhost/net.c.
Re: [PATCH net-next] bridge: add tracepoint in br_fdb_update
On Thu, 31 Aug 2017 23:50:26 +0200 Jesper Dangaard Brouer wrote: > On Thu, 31 Aug 2017 11:43:25 -0700 (PDT) > David Miller wrote: > > > From: Roopa Prabhu > > Date: Wed, 30 Aug 2017 22:18:13 -0700 > > > > > From: Roopa Prabhu > > > > > > This extends bridge fdb table tracepoints to also cover > > > learned fdb entries in the br_fdb_update path. Note that > > > unlike other tracepoints I have moved this to when the fdb > > > is modified because this is in the datapath and can generate > > > a lot of noise in the trace output. br_fdb_update is also called > > > from added_by_user context in the NTF_USE case which is already > > > traced ..hence the !added_by_user check. > > > > > > Signed-off-by: Roopa Prabhu > > > > Applied. > > > > Let's use dev->name for now and if the tooling can eventually > > do transparent ifindex->name then we can consider redoing > > a bunch of networking tracepoints. > > I agree! :-) > Agreed, but it is yet another case of tracepoints not having a stable ABI. There is no ABI guarantee on tracing, but it still makes for user complaints.
Re: [PATCH 2/3] security: bpf: Add eBPF LSM hooks and security field to eBPF map
On 08/31/2017 10:56 PM, Chenbo Feng wrote: From: Chenbo Feng Introduce a pointer into struct bpf_map to hold the security information about the map. The actual security struct varies based on the security models implemented. Place the LSM hooks before each of the unrestricted eBPF operations, the map_update_elem and map_delete_elem operations are checked by security_map_modify. The map_lookup_elem and map_get_next_key operations are checked by security_map_read. Signed-off-by: Chenbo Feng Against which tree is this by the way, net-next? There are changes here which require a rebase of your set. --- include/linux/bpf.h | 3 +++ kernel/bpf/syscall.c | 28 2 files changed, 31 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b69e7a5869ff..ca3e6ff7091d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -53,6 +53,9 @@ struct bpf_map { struct work_struct work; atomic_t usercnt; struct bpf_map *inner_map_meta; +#ifdef CONFIG_SECURITY + void *security; +#endif }; /* function argument constraints */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 045646da97cc..b15580bcf3b1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -279,6 +279,10 @@ static int map_create(union bpf_attr *attr) if (err) return -EINVAL; + err = security_map_create(); Seems a bit limited to me, don't you want to be able to also differentiate between different map types? Same goes for security_prog_load() wrt prog types, no? + if (err) + return -EACCES; + /* find map type and init map: hashtable vs rbtree vs bloom vs ...
*/ map = find_and_alloc_map(attr); if (IS_ERR(map)) @@ -291,6 +295,10 @@ static int map_create(union bpf_attr *attr) if (err) goto free_map_nouncharge; + err = security_post_create(map); + if (err < 0) + goto free_map; + So the hook you implement in patch 3/3 does: +static int selinux_bpf_post_create(struct bpf_map *map) +{ + struct bpf_security_struct *bpfsec; + + bpfsec = kzalloc(sizeof(*bpfsec), GFP_KERNEL); + if (!bpfsec) + return -ENOMEM; + + bpfsec->sid = current_sid(); + map->security = bpfsec; + + return 0; +} Where do you kfree() bpfsec when the map gets released normally or in error path? err = bpf_map_alloc_id(map); if (err) goto free_map; @@ -410,6 +418,10 @@ static int map_lookup_elem(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); + err = security_map_read(map); + if (err) + return -EACCES; How about actually dropping ref? + key = memdup_user(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); @@ -490,6 +502,10 @@ static int map_update_elem(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); + err = security_map_modify(map); + if (err) + return -EACCES; Ditto ... key = memdup_user(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); @@ -573,6 +589,10 @@ static int map_delete_elem(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); + err = security_map_modify(map); + if (err) + return -EACCES; Ditto ... key = memdup_user(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); @@ -616,6 +636,10 @@ static int map_get_next_key(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); + err = security_map_read(map); + if (err) + return -EACCES; And once again here ... :( if (ukey) { key = memdup_user(ukey, map->key_size); if (IS_ERR(key)) { @@ -935,6 +959,10 @@ static int bpf_prog_load(union bpf_attr *attr) if (CHECK_ATTR(BPF_PROG_LOAD)) return -EINVAL; + err = security_prog_load(); + if (err) + return -EACCES; + if (attr->prog_flags & ~BPF_F_STRICT_ALIGNMENT) return -EINVAL;
Re: [PATCH net-next 1/2] net: prepare (struct ubuf_info)->refcnt conversion
On Thu, Aug 31, 2017 at 4:30 PM, Eric Dumazet wrote: > In order to convert this atomic_t refcnt to refcount_t, > we need to init the refcount to one to not trigger > a 0 -> 1 transition. > > This also removes one atomic operation in fast path. > > Signed-off-by: Eric Dumazet Acked-by: Willem de Bruijn Thanks, Eric.
[PATCH net-next 1/2] flow_dissector: Cleanup control flow
__skb_flow_dissect is riddled with gotos that make discerning the flow, debugging, and extending the capability difficult. This patch reorganizes things so that we only perform goto's after the two main switch statements (no gotos within the cases now). It also eliminates several goto labels so that there are only two labels that can be target for goto. Reported-by: Alexander Popov Signed-off-by: Tom Herbert --- include/net/flow_dissector.h | 9 ++ net/core/flow_dissector.c| 225 --- 2 files changed, 156 insertions(+), 78 deletions(-) diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index e2663e900b0a..c358c3ff6acc 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -19,6 +19,15 @@ struct flow_dissector_key_control { #define FLOW_DIS_FIRST_FRAGBIT(1) #define FLOW_DIS_ENCAPSULATION BIT(2) +enum flow_dissect_ret { + FLOW_DISSECT_RET_OUT_GOOD, + FLOW_DISSECT_RET_OUT_BAD, + FLOW_DISSECT_RET_PROTO_AGAIN, + FLOW_DISSECT_RET_IPPROTO_AGAIN, + FLOW_DISSECT_RET_IPPROTO_AGAIN_EH, + FLOW_DISSECT_RET_CONTINUE, +}; + /** * struct flow_dissector_key_basic: * @thoff: Transport header offset diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index e2eaa1ff948d..5110180a3e96 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -115,12 +115,6 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, } EXPORT_SYMBOL(__skb_flow_get_ports); -enum flow_dissect_ret { - FLOW_DISSECT_RET_OUT_GOOD, - FLOW_DISSECT_RET_OUT_BAD, - FLOW_DISSECT_RET_OUT_PROTO_AGAIN, -}; - static enum flow_dissect_ret __skb_flow_dissect_mpls(const struct sk_buff *skb, struct flow_dissector *flow_dissector, @@ -341,7 +335,7 @@ __skb_flow_dissect_gre(const struct sk_buff *skb, if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) return FLOW_DISSECT_RET_OUT_GOOD; - return FLOW_DISSECT_RET_OUT_PROTO_AGAIN; + return FLOW_DISSECT_RET_PROTO_AGAIN; } static void @@ -431,6 +425,7 @@ bool __skb_flow_dissect(const struct 
sk_buff *skb, struct flow_dissector_key_icmp *key_icmp; struct flow_dissector_key_tags *key_tags; struct flow_dissector_key_vlan *key_vlan; + enum flow_dissect_ret fdret; bool skip_vlan = false; u8 ip_proto = 0; bool ret; @@ -482,14 +477,19 @@ bool __skb_flow_dissect(const struct sk_buff *skb, } proto_again: + fdret = FLOW_DISSECT_RET_CONTINUE; + switch (proto) { case htons(ETH_P_IP): { const struct iphdr *iph; struct iphdr _iph; -ip: + iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); - if (!iph || iph->ihl < 5) - goto out_bad; + if (!iph || iph->ihl < 5) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } + nhoff += iph->ihl * 4; ip_proto = iph->protocol; @@ -509,19 +509,25 @@ bool __skb_flow_dissect(const struct sk_buff *skb, key_control->flags |= FLOW_DIS_IS_FRAGMENT; if (iph->frag_off & htons(IP_OFFSET)) { - goto out_good; + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; } else { key_control->flags |= FLOW_DIS_FIRST_FRAG; - if (!(flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG)) - goto out_good; + if (!(flags & + FLOW_DISSECTOR_F_PARSE_1ST_FRAG)) { + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } } } __skb_flow_dissect_ipv4(skb, flow_dissector, target_container, data, iph); - if (flags & FLOW_DISSECTOR_F_STOP_AT_L3) - goto out_good; + if (flags & FLOW_DISSECTOR_F_STOP_AT_L3) { + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } break; } @@ -529,10 +535,11 @@ bool __skb_flow_dissect(const struct sk_buff *skb, const struct ipv6hdr *iph; struct ipv6hdr _iph; -ipv6: iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); - if (!iph) - goto out_bad; + if (!iph) { + fdret = FLOW_DISSECT_RET_OUT_BAD; + break; + } ip_proto = iph->nex
[PATCH net-next 2/2] flow_dissector: Add limits for encapsulation and EH
In flow dissector there are no limits to the number of nested encapsulations that might be dissected which makes for a nice DOS attack. This patch limits for dissecting nested encapsulations as well as for dissecting over extension headers. Reported-by: Hannes Frederic Sowa Signed-off-by: Tom Herbert --- net/core/flow_dissector.c | 48 --- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 5110180a3e96..1bca748de27d 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -396,6 +396,35 @@ __skb_flow_dissect_ipv6(const struct sk_buff *skb, key_ip->ttl = iph->hop_limit; } +/* Maximum number of nested encapsulations that can be processed in + * __skb_flow_dissect + */ +#define MAX_FLOW_DISSECT_ENCAPS5 + +static bool skb_flow_dissect_encap_allowed(int *num_encaps, unsigned int *flags) +{ + ++*num_encaps; + + if (*num_encaps >= MAX_FLOW_DISSECT_ENCAPS) { + if (*num_encaps == MAX_FLOW_DISSECT_ENCAPS) { + /* Allow one more pass but ignore disregard +* further encapsulations +*/ + *flags |= FLOW_DISSECTOR_F_STOP_AT_ENCAP; + } else { + /* Max encaps reached */ + return false; + } + } + + return true; +} + +/* Maximum number of extension headers can be processed in __skb_flow_dissect + * per IPv6 packet + */ +#define MAX_FLOW_DISSECT_EH5 + /** * __skb_flow_dissect - extract the flow_keys struct and return it * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified @@ -426,6 +455,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector_key_tags *key_tags; struct flow_dissector_key_vlan *key_vlan; enum flow_dissect_ret fdret; + int num_eh, num_encaps = 0; bool skip_vlan = false; u8 ip_proto = 0; bool ret; @@ -714,7 +744,9 @@ bool __skb_flow_dissect(const struct sk_buff *skb, case FLOW_DISSECT_RET_OUT_GOOD: goto out_good; case FLOW_DISSECT_RET_PROTO_AGAIN: - goto proto_again; + if (skb_flow_dissect_encap_allowed(&num_encaps, &flags)) + goto 
proto_again; + goto out_good; case FLOW_DISSECT_RET_CONTINUE: case FLOW_DISSECT_RET_IPPROTO_AGAIN: case FLOW_DISSECT_RET_IPPROTO_AGAIN_EH: @@ -724,6 +756,8 @@ bool __skb_flow_dissect(const struct sk_buff *skb, goto out_bad; } + num_eh = 0; + ip_proto_again: fdret = FLOW_DISSECT_RET_CONTINUE; @@ -844,10 +878,18 @@ bool __skb_flow_dissect(const struct sk_buff *skb, /* Process result of IP proto processing */ switch (fdret) { case FLOW_DISSECT_RET_PROTO_AGAIN: - goto proto_again; + if (skb_flow_dissect_encap_allowed(&num_encaps, &flags)) + goto proto_again; + break; case FLOW_DISSECT_RET_IPPROTO_AGAIN: + if (skb_flow_dissect_encap_allowed(&num_encaps, &flags)) + goto ip_proto_again; + break; case FLOW_DISSECT_RET_IPPROTO_AGAIN_EH: - goto ip_proto_again; + ++num_eh; + if (num_eh <= MAX_FLOW_DISSECT_EH) + goto ip_proto_again; + break; case FLOW_DISSECT_RET_OUT_GOOD: case FLOW_DISSECT_RET_CONTINUE: break; -- 2.11.0
[PATCH net-next 0/2] flow_dissector: Flow dissector fixes
This patch set fixes some basic issues with __skb_flow_dissect function. Items addressed: - Cleanup control flow in the function; in particular eliminate a bunch of goto's and implement a simplified control flow model - Add limits for number of encapsulations and extension headers that can be dissected Tested: Ran normal traffic, GUE, and VXLAN traffic. Tom Herbert (2): flow_dissector: Cleanup control flow flow_dissector: Add limits for encapsulation and EH include/net/flow_dissector.h | 9 ++ net/core/flow_dissector.c| 267 ++- 2 files changed, 198 insertions(+), 78 deletions(-) -- 2.11.0
Re: [PATCH v3 net-next 2/7] bpf: Allow cgroup sock filters to use get_current_uid_gid helper
On 09/01/2017 12:05 AM, David Ahern wrote: Allow BPF programs run on sock create to use the get_current_uid_gid helper. IPv4 and IPv6 sockets are created in a process context so there is always a valid uid/gid Signed-off-by: David Ahern Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann
Re: [PATCH v3 net-next 1/7] bpf: Add mark and priority to sock options that can be set
On 09/01/2017 12:05 AM, David Ahern wrote: Add socket mark and priority to fields that can be set by ebpf program when a socket is created. Signed-off-by: David Ahern Acked-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 2 ++ net/core/filter.c| 26 ++ 2 files changed, 28 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d46cf326b95f..e9c89e20adff 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -758,6 +758,8 @@ struct bpf_sock { __u32 family; __u32 type; __u32 protocol; + __u32 mark; + __u32 priority; }; #define XDP_PACKET_HEADROOM 256 diff --git a/net/core/filter.c b/net/core/filter.c index c6a37fe0285b..f51b9690adf3 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3455,6 +3455,10 @@ static bool sock_filter_is_valid_access(int off, int size, switch (off) { case offsetof(struct bpf_sock, bound_dev_if): break; + case offsetof(struct bpf_sock, mark): + break; + case offsetof(struct bpf_sock, priority): + break; Can also be follow-up, but please do keep this consistent to all the other *_is_valid_access() helpers, meaning: switch (off) { case offsetof(struct bpf_sock, bound_dev_if): case offsetof(struct bpf_sock, mark): case offsetof(struct bpf_sock, priority): break; default: return false; } Rest: Acked-by: Daniel Borkmann
Re: [PATCH 2/3] security: bpf: Add eBPF LSM hooks and security field to eBPF map
On Thu, Aug 31, 2017 at 2:17 PM, Mimi Zohar wrote: > On Thu, 2017-08-31 at 13:56 -0700, Chenbo Feng wrote: >> From: Chenbo Feng >> >> Introduce a pointer into struct bpf_map to hold the security information >> about the map. The actual security struct varies based on the security >> models implemented. Place the LSM hooks before each of the unrestricted >> eBPF operations, the map_update_elem and map_delete_elem operations are >> checked by security_map_modify. The map_lookup_elem and map_get_next_key >> operations are checked by securtiy_map_read. >> >> Signed-off-by: Chenbo Feng >> --- >> include/linux/bpf.h | 3 +++ >> kernel/bpf/syscall.c | 28 >> 2 files changed, 31 insertions(+) >> >> diff --git a/include/linux/bpf.h b/include/linux/bpf.h >> index b69e7a5869ff..ca3e6ff7091d 100644 >> --- a/include/linux/bpf.h >> +++ b/include/linux/bpf.h >> @@ -53,6 +53,9 @@ struct bpf_map { >> struct work_struct work; >> atomic_t usercnt; >> struct bpf_map *inner_map_meta; >> +#ifdef CONFIG_SECURITY >> + void *security; >> +#endif >> }; >> >> /* function argument constraints */ >> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c >> index 045646da97cc..b15580bcf3b1 100644 >> --- a/kernel/bpf/syscall.c >> +++ b/kernel/bpf/syscall.c >> @@ -279,6 +279,10 @@ static int map_create(union bpf_attr *attr) >> if (err) >> return -EINVAL; >> >> + err = security_map_create(); >> + if (err) >> + return -EACCES; > > Any reason not to just return err? > > Mimi > Nope... return err might be better. I will fix this in next version. Thanks Chenbo >> + >> /* find map type and init map: hashtable vs rbtree vs bloom vs ... 
*/ >> map = find_and_alloc_map(attr); >> if (IS_ERR(map)) >> @@ -291,6 +295,10 @@ static int map_create(union bpf_attr *attr) >> if (err) >> goto free_map_nouncharge; >> >> + err = security_post_create(map); >> + if (err < 0) >> + goto free_map; >> + >> err = bpf_map_alloc_id(map); >> if (err) >> goto free_map; >> @@ -410,6 +418,10 @@ static int map_lookup_elem(union bpf_attr *attr) >> if (IS_ERR(map)) >> return PTR_ERR(map); >> >> + err = security_map_read(map); >> + if (err) >> + return -EACCES; >> + >> key = memdup_user(ukey, map->key_size); >> if (IS_ERR(key)) { >> err = PTR_ERR(key); >> @@ -490,6 +502,10 @@ static int map_update_elem(union bpf_attr *attr) >> if (IS_ERR(map)) >> return PTR_ERR(map); >> >> + err = security_map_modify(map); >> + if (err) >> + return -EACCES; >> + >> key = memdup_user(ukey, map->key_size); >> if (IS_ERR(key)) { >> err = PTR_ERR(key); >> @@ -573,6 +589,10 @@ static int map_delete_elem(union bpf_attr *attr) >> if (IS_ERR(map)) >> return PTR_ERR(map); >> >> + err = security_map_modify(map); >> + if (err) >> + return -EACCES; >> + >> key = memdup_user(ukey, map->key_size); >> if (IS_ERR(key)) { >> err = PTR_ERR(key); >> @@ -616,6 +636,10 @@ static int map_get_next_key(union bpf_attr *attr) >> if (IS_ERR(map)) >> return PTR_ERR(map); >> >> + err = security_map_read(map); >> + if (err) >> + return -EACCES; >> + >> if (ukey) { >> key = memdup_user(ukey, map->key_size); >> if (IS_ERR(key)) { >> @@ -935,6 +959,10 @@ static int bpf_prog_load(union bpf_attr *attr) >> if (CHECK_ATTR(BPF_PROG_LOAD)) >> return -EINVAL; >> >> + err = security_prog_load(); >> + if (err) >> + return -EACCES; >> + >> if (attr->prog_flags & ~BPF_F_STRICT_ALIGNMENT) >> return -EINVAL; >> >
[PATCH v3 net-next 3/7] samples/bpf: Update sock test to allow setting mark and priority
Update sock test to set mark and priority on socket create. Signed-off-by: David Ahern --- samples/bpf/test_cgrp2_sock.c | 134 - samples/bpf/test_cgrp2_sock.sh | 2 +- 2 files changed, 119 insertions(+), 17 deletions(-) diff --git a/samples/bpf/test_cgrp2_sock.c b/samples/bpf/test_cgrp2_sock.c index c3cfb23e23b5..681abbe6c85e 100644 --- a/samples/bpf/test_cgrp2_sock.c +++ b/samples/bpf/test_cgrp2_sock.c @@ -19,59 +19,161 @@ #include #include #include +#include #include #include "libbpf.h" char bpf_log_buf[BPF_LOG_BUF_SIZE]; -static int prog_load(int idx) +static int prog_load(__u32 idx, __u32 mark, __u32 prio) { - struct bpf_insn prog[] = { + /* save pointer to context */ + struct bpf_insn prog_start[] = { BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + }; + struct bpf_insn prog_end[] = { + BPF_MOV64_IMM(BPF_REG_0, 1), /* r0 = verdict */ + BPF_EXIT_INSN(), + }; + + /* set sk_bound_dev_if on socket */ + struct bpf_insn prog_dev[] = { BPF_MOV64_IMM(BPF_REG_3, idx), BPF_MOV64_IMM(BPF_REG_2, offsetof(struct bpf_sock, bound_dev_if)), BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3, offsetof(struct bpf_sock, bound_dev_if)), - BPF_MOV64_IMM(BPF_REG_0, 1), /* r0 = verdict */ - BPF_EXIT_INSN(), }; - size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn); - return bpf_load_program(BPF_PROG_TYPE_CGROUP_SOCK, prog, insns_cnt, + /* set mark on socket */ + struct bpf_insn prog_mark[] = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_MOV64_IMM(BPF_REG_3, mark), + BPF_MOV64_IMM(BPF_REG_2, offsetof(struct bpf_sock, mark)), + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3, offsetof(struct bpf_sock, mark)), + }; + + /* set priority on socket */ + struct bpf_insn prog_prio[] = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + BPF_MOV64_IMM(BPF_REG_3, prio), + BPF_MOV64_IMM(BPF_REG_2, offsetof(struct bpf_sock, priority)), + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3, offsetof(struct bpf_sock, priority)), + }; + + struct bpf_insn *prog; + size_t insns_cnt; + void *p; + int ret; + + insns_cnt = sizeof(prog_start) + 
sizeof(prog_end); + if (idx) + insns_cnt += sizeof(prog_dev); + + if (mark) + insns_cnt += sizeof(prog_mark); + + if (prio) + insns_cnt += sizeof(prog_prio); + + p = prog = malloc(insns_cnt); + if (!prog) { + fprintf(stderr, "Failed to allocate memory for instructions\n"); + return EXIT_FAILURE; + } + + memcpy(p, prog_start, sizeof(prog_start)); + p += sizeof(prog_start); + + if (idx) { + memcpy(p, prog_dev, sizeof(prog_dev)); + p += sizeof(prog_dev); + } + + if (mark) { + memcpy(p, prog_mark, sizeof(prog_mark)); + p += sizeof(prog_mark); + } + + if (prio) { + memcpy(p, prog_prio, sizeof(prog_prio)); + p += sizeof(prog_prio); + } + + memcpy(p, prog_end, sizeof(prog_end)); + p += sizeof(prog_end); + + insns_cnt /= sizeof(struct bpf_insn); + + ret = bpf_load_program(BPF_PROG_TYPE_CGROUP_SOCK, prog, insns_cnt, "GPL", 0, bpf_log_buf, BPF_LOG_BUF_SIZE); + + free(prog); + + return ret; } static int usage(const char *argv0) { - printf("Usage: %s cg-path device-index\n", argv0); + printf("Usage: %s -b bind-to-dev -m mark -p prio cg-path\n", argv0); return EXIT_FAILURE; } int main(int argc, char **argv) { + __u32 idx = 0, mark = 0, prio = 0; + const char *cgrp_path = NULL; int cg_fd, prog_fd, ret; - unsigned int idx; + int rc; + + while ((rc = getopt(argc, argv, "b:m:p:")) != -1) { + switch (rc) { + case 'b': + idx = if_nametoindex(optarg); + if (!idx) { + idx = strtoumax(optarg, NULL, 0); + if (!idx) { + printf("Invalid device name\n"); + return EXIT_FAILURE; + } + } + break; + case 'm': + mark = strtoumax(optarg, NULL, 0); + break; + case 'p': + prio = strtoumax(optarg, NULL, 0); + break; + default: + return usage(argv[0]); + } + } - if (argc < 2) + if (optind == argc) return usage(arg
[PATCH v3 net-next 4/7] samples/bpf: Add detach option to test_cgrp2_sock
Add option to detach programs from a cgroup. Signed-off-by: David Ahern --- samples/bpf/test_cgrp2_sock.c | 50 ++- 1 file changed, 35 insertions(+), 15 deletions(-) diff --git a/samples/bpf/test_cgrp2_sock.c b/samples/bpf/test_cgrp2_sock.c index 681abbe6c85e..15396761c5cc 100644 --- a/samples/bpf/test_cgrp2_sock.c +++ b/samples/bpf/test_cgrp2_sock.c @@ -114,7 +114,12 @@ static int prog_load(__u32 idx, __u32 mark, __u32 prio) static int usage(const char *argv0) { - printf("Usage: %s -b bind-to-dev -m mark -p prio cg-path\n", argv0); + printf("Usage:\n"); + printf(" Attach a program\n"); + printf(" %s -b bind-to-dev -m mark -p prio cg-path\n", argv0); + printf("\n"); + printf(" Detach a program\n"); + printf(" %s -d cg-path\n", argv0); return EXIT_FAILURE; } @@ -123,10 +128,14 @@ int main(int argc, char **argv) __u32 idx = 0, mark = 0, prio = 0; const char *cgrp_path = NULL; int cg_fd, prog_fd, ret; + int do_attach = 1; int rc; - while ((rc = getopt(argc, argv, "b:m:p:")) != -1) { + while ((rc = getopt(argc, argv, "db:m:p:")) != -1) { switch (rc) { + case 'd': + do_attach = 0; + break; case 'b': idx = if_nametoindex(optarg); if (!idx) { @@ -157,7 +166,7 @@ int main(int argc, char **argv) return EXIT_FAILURE; } - if (!idx && !mark && !prio) { + if (do_attach && !idx && !mark && !prio) { fprintf(stderr, "One of device, mark or priority must be given\n"); return EXIT_FAILURE; @@ -169,20 +178,31 @@ int main(int argc, char **argv) return EXIT_FAILURE; } - prog_fd = prog_load(idx, mark, prio); - if (prog_fd < 0) { - printf("Failed to load prog: '%s'\n", strerror(errno)); - printf("Output from kernel verifier:\n%s\n---\n", - bpf_log_buf); - return EXIT_FAILURE; - } + if (do_attach) { + prog_fd = prog_load(idx, mark, prio); + if (prog_fd < 0) { + printf("Failed to load prog: '%s'\n", strerror(errno)); + printf("Output from kernel verifier:\n%s\n---\n", + bpf_log_buf); + return EXIT_FAILURE; + } - ret = bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_INET_SOCK_CREATE, 0); - if 
(ret < 0) { - printf("Failed to attach prog to cgroup: '%s'\n", - strerror(errno)); - return EXIT_FAILURE; + ret = bpf_prog_attach(prog_fd, cg_fd, + BPF_CGROUP_INET_SOCK_CREATE, 0); + if (ret < 0) { + printf("Failed to attach prog to cgroup: '%s'\n", + strerror(errno)); + return EXIT_FAILURE; + } + } else { + ret = bpf_prog_detach(cg_fd, BPF_CGROUP_INET_SOCK_CREATE); + if (ret < 0) { + printf("Failed to detach prog from cgroup: '%s'\n", + strerror(errno)); + return EXIT_FAILURE; + } } + close(cg_fd); return EXIT_SUCCESS; } -- 2.1.4
[PATCH v3 net-next 2/7] bpf: Allow cgroup sock filters to use get_current_uid_gid helper
Allow BPF programs run on sock create to use the get_current_uid_gid helper. IPv4 and IPv6 sockets are created in a process context so there is always a valid uid/gid Signed-off-by: David Ahern Acked-by: Alexei Starovoitov --- net/core/filter.c | 16 +++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index f51b9690adf3..9dad3e7e2e10 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3150,6 +3150,20 @@ bpf_base_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * +sock_filter_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + /* inet and inet6 sockets are created in a process +* context so there is always a valid uid/gid +*/ + case BPF_FUNC_get_current_uid_gid: + return &bpf_get_current_uid_gid_proto; + default: + return bpf_base_func_proto(func_id); + } +} + +static const struct bpf_func_proto * sk_filter_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -4233,7 +4247,7 @@ const struct bpf_verifier_ops lwt_xmit_prog_ops = { }; const struct bpf_verifier_ops cg_sock_prog_ops = { - .get_func_proto = bpf_base_func_proto, + .get_func_proto = sock_filter_func_proto, .is_valid_access= sock_filter_is_valid_access, .convert_ctx_access = sock_filter_convert_ctx_access, }; -- 2.1.4
[PATCH v3 net-next 6/7] samples/bpf: Update cgrp2 socket tests
Update cgrp2 bpf sock tests to check that device, mark and priority can all be set on a socket via bpf programs attached to a cgroup. Signed-off-by: David Ahern --- samples/bpf/test_cgrp2_sock.sh | 162 +++-- 1 file changed, 124 insertions(+), 38 deletions(-) diff --git a/samples/bpf/test_cgrp2_sock.sh b/samples/bpf/test_cgrp2_sock.sh index 1153c33e8964..a81f38eef417 100755 --- a/samples/bpf/test_cgrp2_sock.sh +++ b/samples/bpf/test_cgrp2_sock.sh @@ -1,47 +1,133 @@ -#!/bin/bash - -function config_device { - ip netns add at_ns0 - ip link add veth0 type veth peer name veth0b - ip link set veth0b up - ip link set veth0 netns at_ns0 - ip netns exec at_ns0 ip addr add 172.16.1.100/24 dev veth0 - ip netns exec at_ns0 ip addr add 2401:db00::1/64 dev veth0 nodad - ip netns exec at_ns0 ip link set dev veth0 up - ip link add foo type vrf table 1234 - ip link set foo up - ip addr add 172.16.1.101/24 dev veth0b - ip addr add 2401:db00::2/64 dev veth0b nodad - ip link set veth0b master foo +#!/bin/sh + +# Test various socket options that can be set by attaching programs to cgroups. + +CGRP_MNT="/tmp/cgroupv2-test_cgrp2_sock" + + +# +print_result() +{ + local rc=$1 + local status=" OK " + + [ $rc -ne 0 ] && status="FAIL" + + printf "%-50s[%4s]\n" "$2" "$status" } -function attach_bpf { - rm -rf /tmp/cgroupv2 - mkdir -p /tmp/cgroupv2 - mount -t cgroup2 none /tmp/cgroupv2 - mkdir -p /tmp/cgroupv2/foo - test_cgrp2_sock -b foo /tmp/cgroupv2/foo - echo $$ >> /tmp/cgroupv2/foo/cgroup.procs +check_sock() +{ + out=$(test_cgrp2_sock) + echo $out | grep -q "$1" + if [ $? -ne 0 ]; then + print_result 1 "IPv4: $2" + echo "expected: $1" + echo "have: $out" + rc=1 + else + print_result 0 "IPv4: $2" + fi } -function cleanup { - set +ex - ip netns delete at_ns0 - ip link del veth0 - ip link del foo - umount /tmp/cgroupv2 - rm -rf /tmp/cgroupv2 - set -ex +check_sock6() +{ + out=$(test_cgrp2_sock -6) + echo $out | grep -q "$1" + if [ $? 
-ne 0 ]; then + print_result 1 "IPv6: $2" + echo "expected: $1" + echo "have: $out" + rc=1 + else + print_result 0 "IPv6: $2" + fi } -function do_test { - ping -c1 -w1 172.16.1.100 - ping6 -c1 -w1 2401:db00::1 + +# + +cleanup() +{ + echo $$ >> ${CGRP_MNT}/cgroup.procs + rmdir ${CGRP_MNT}/sockopts } +cleanup_and_exit() +{ + local rc=$1 + local msg="$2" + + [ -n "$msg" ] && echo "ERROR: $msg" + + ip li del cgrp2_sock + umount ${CGRP_MNT} + + exit $rc +} + + + +# main + +rc=0 + +ip li add cgrp2_sock type dummy 2>/dev/null + +set -e +mkdir -p ${CGRP_MNT} +mount -t cgroup2 none ${CGRP_MNT} +set +e + + +# make sure we have a known start point cleanup 2>/dev/null -config_device -attach_bpf -do_test -cleanup -echo "*** PASS ***" + +mkdir -p ${CGRP_MNT}/sockopts +[ $? -ne 0 ] && cleanup_and_exit 1 "Failed to create cgroup hierarchy" + + +# set pid into cgroup +echo $$ > ${CGRP_MNT}/sockopts/cgroup.procs + +# no bpf program attached, so socket should show no settings +check_sock "dev , mark 0, priority 0" "No programs attached" +check_sock6 "dev , mark 0, priority 0" "No programs attached" + +# verify device is set +# +test_cgrp2_sock -b cgrp2_sock ${CGRP_MNT}/sockopts +if [ $? -ne 0 ]; then + cleanup_and_exit 1 "Failed to install program to set device" +fi +check_sock "dev cgrp2_sock, mark 0, priority 0" "Device set" +check_sock6 "dev cgrp2_sock, mark 0, priority 0" "Device set" + +# verify mark is set +# +test_cgrp2_sock -m 666 ${CGRP_MNT}/sockopts +if [ $? -ne 0 ]; then + cleanup_and_exit 1 "Failed to install program to set mark" +fi +check_sock "dev , mark 666, priority 0" "Mark set" +check_sock6 "dev , mark 666, priority 0" "Mark set" + +# verify priority is set +# +test_cgrp2_sock -p 123 ${CGRP_MNT}/sockopts +if [ $? 
-ne 0 ]; then + cleanup_and_exit 1 "Failed to install program to set priority" +fi +check_sock "dev , mark 0, priority 123" "Priority set" +check_sock6 "dev , mark 0, priority 123" "Priority set" + +# all 3 at once +# +test_cgrp2_sock -b cgrp2_sock -m 666 -p 123 ${CGRP_MNT}/sockopts +if [ $? -ne 0 ]; then + cleanup_and_exit 1 "Failed to install program to set device, mark and priority" +fi +check_sock "dev cgrp2_sock, mark 666, priority 123" "Priority set" +check_sock6 "dev cgrp2_sock, mark 666, priority 123" "Priority set" + +clea
[PATCH v3 net-next 7/7] samples/bpf: Update cgroup socket examples to use uid gid helper
Signed-off-by: David Ahern --- samples/bpf/sock_flags_kern.c | 5 + samples/bpf/test_cgrp2_sock.c | 12 +++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/samples/bpf/sock_flags_kern.c b/samples/bpf/sock_flags_kern.c index 533dd11a6baa..05dcdf8a4baa 100644 --- a/samples/bpf/sock_flags_kern.c +++ b/samples/bpf/sock_flags_kern.c @@ -9,8 +9,13 @@ SEC("cgroup/sock1") int bpf_prog1(struct bpf_sock *sk) { char fmt[] = "socket: family %d type %d protocol %d\n"; + char fmt2[] = "socket: uid %u gid %u\n"; + __u64 gid_uid = bpf_get_current_uid_gid(); + __u32 uid = gid_uid & 0x; + __u32 gid = gid_uid >> 32; bpf_trace_printk(fmt, sizeof(fmt), sk->family, sk->type, sk->protocol); + bpf_trace_printk(fmt2, sizeof(fmt2), uid, gid); /* block PF_INET6, SOCK_RAW, IPPROTO_ICMPV6 sockets * ie., make ping6 fail diff --git a/samples/bpf/test_cgrp2_sock.c b/samples/bpf/test_cgrp2_sock.c index 5a688837720c..e79594dd629b 100644 --- a/samples/bpf/test_cgrp2_sock.c +++ b/samples/bpf/test_cgrp2_sock.c @@ -46,8 +46,18 @@ static int prog_load(__u32 idx, __u32 mark, __u32 prio) /* set mark on socket */ struct bpf_insn prog_mark[] = { - BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), + /* get uid of process */ + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, +BPF_FUNC_get_current_uid_gid), + BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0x), + + /* if uid is 0, use given mark, else use the uid as the mark */ + BPF_MOV64_REG(BPF_REG_3, BPF_REG_0), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), BPF_MOV64_IMM(BPF_REG_3, mark), + + /* set the mark on the new socket */ + BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), BPF_MOV64_IMM(BPF_REG_2, offsetof(struct bpf_sock, mark)), BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3, offsetof(struct bpf_sock, mark)), }; -- 2.1.4
[PATCH v3 net-next 5/7] samples/bpf: Add option to dump socket settings
Add option to dump socket settings. Will be used in the next patch to verify bpf programs are correctly setting mark, priority and device based on the cgroup attachment for the program run. Signed-off-by: David Ahern --- samples/bpf/test_cgrp2_sock.c | 75 +-- 1 file changed, 73 insertions(+), 2 deletions(-) diff --git a/samples/bpf/test_cgrp2_sock.c b/samples/bpf/test_cgrp2_sock.c index 15396761c5cc..5a688837720c 100644 --- a/samples/bpf/test_cgrp2_sock.c +++ b/samples/bpf/test_cgrp2_sock.c @@ -112,6 +112,70 @@ static int prog_load(__u32 idx, __u32 mark, __u32 prio) return ret; } +static int get_bind_to_device(int sd, char *name, size_t len) +{ + socklen_t optlen = len; + int rc; + + name[0] = '\0'; + rc = getsockopt(sd, SOL_SOCKET, SO_BINDTODEVICE, name, &optlen); + if (rc < 0) + perror("setsockopt(SO_BINDTODEVICE)"); + + return rc; +} + +static unsigned int get_somark(int sd) +{ + unsigned int mark = 0; + socklen_t optlen = sizeof(mark); + int rc; + + rc = getsockopt(sd, SOL_SOCKET, SO_MARK, &mark, &optlen); + if (rc < 0) + perror("getsockopt(SO_MARK)"); + + return mark; +} + +static unsigned int get_priority(int sd) +{ + unsigned int prio = 0; + socklen_t optlen = sizeof(prio); + int rc; + + rc = getsockopt(sd, SOL_SOCKET, SO_PRIORITY, &prio, &optlen); + if (rc < 0) + perror("getsockopt(SO_PRIORITY)"); + + return prio; +} + +static int show_sockopts(int family) +{ + unsigned int mark, prio; + char name[16]; + int sd; + + sd = socket(family, SOCK_DGRAM, 17); + if (sd < 0) { + perror("socket"); + return 1; + } + + if (get_bind_to_device(sd, name, sizeof(name)) < 0) + return 1; + + mark = get_somark(sd); + prio = get_priority(sd); + + close(sd); + + printf("sd %d: dev %s, mark %u, priority %u\n", sd, name, mark, prio); + + return 0; +} + static int usage(const char *argv0) { printf("Usage:\n"); @@ -120,6 +184,9 @@ static int usage(const char *argv0) printf("\n"); printf(" Detach a program\n"); printf(" %s -d cg-path\n", argv0); + printf("\n"); + printf(" Show 
inherited socket settings (mark, priority, and device)\n"); + printf(" %s [-6]\n", argv0); return EXIT_FAILURE; } @@ -128,10 +195,11 @@ int main(int argc, char **argv) __u32 idx = 0, mark = 0, prio = 0; const char *cgrp_path = NULL; int cg_fd, prog_fd, ret; + int family = PF_INET; int do_attach = 1; int rc; - while ((rc = getopt(argc, argv, "db:m:p:")) != -1) { + while ((rc = getopt(argc, argv, "db:m:p:6")) != -1) { switch (rc) { case 'd': do_attach = 0; @@ -152,13 +220,16 @@ int main(int argc, char **argv) case 'p': prio = strtoumax(optarg, NULL, 0); break; + case '6': + family = PF_INET6; + break; default: return usage(argv[0]); } } if (optind == argc) - return usage(argv[0]); + return show_sockopts(family); cgrp_path = argv[optind]; if (!cgrp_path) { -- 2.1.4
[PATCH v3 net-next 1/7] bpf: Add mark and priority to sock options that can be set
Add socket mark and priority to fields that can be set by ebpf program when a socket is created. Signed-off-by: David Ahern Acked-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 2 ++ net/core/filter.c| 26 ++ 2 files changed, 28 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d46cf326b95f..e9c89e20adff 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -758,6 +758,8 @@ struct bpf_sock { __u32 family; __u32 type; __u32 protocol; + __u32 mark; + __u32 priority; }; #define XDP_PACKET_HEADROOM 256 diff --git a/net/core/filter.c b/net/core/filter.c index c6a37fe0285b..f51b9690adf3 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3455,6 +3455,10 @@ static bool sock_filter_is_valid_access(int off, int size, switch (off) { case offsetof(struct bpf_sock, bound_dev_if): break; + case offsetof(struct bpf_sock, mark): + break; + case offsetof(struct bpf_sock, priority): + break; default: return false; } @@ -3958,6 +3962,28 @@ static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, offsetof(struct sock, sk_bound_dev_if)); break; + case offsetof(struct bpf_sock, mark): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_mark) != 4); + + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, + offsetof(struct sock, sk_mark)); + else + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + offsetof(struct sock, sk_mark)); + break; + + case offsetof(struct bpf_sock, priority): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_priority) != 4); + + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, + offsetof(struct sock, sk_priority)); + else + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + offsetof(struct sock, sk_priority)); + break; + case offsetof(struct bpf_sock, family): BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2); -- 2.1.4
[PATCH v3 net-next 0/7] bpf: Add option to set mark and priority in cgroup sock programs
Add option to set mark and priority in addition to bound device for newly created sockets. Also, allow the bpf programs to use the get_current_uid_gid helper meaning socket marks, priority and device can be set based on the uid/gid of the running process. Sample programs are updated to demonstrate the new options. v3 - no changes to Patches 1 and 2 which Alexei acked in previous versions - dropped change related to recursive programs in a cgroup - updated tests per dropped patch v2 - added flag to control recursive behavior as requested by Alexei - added comment to sock_filter_func_proto regarding use of get_current_uid_gid helper - updated test programs for recursive option David Ahern (7): bpf: Add mark and priority to sock options that can be set bpf: Allow cgroup sock filters to use get_current_uid_gid helper samples/bpf: Update sock test to allow setting mark and priority samples/bpf: Add detach option to test_cgrp2_sock samples/bpf: Add option to dump socket settings samples/bpf: Update cgrp2 socket tests samples/bpf: Update cgroup socket examples to use uid gid helper include/uapi/linux/bpf.h | 2 + net/core/filter.c | 42 ++- samples/bpf/sock_flags_kern.c | 5 + samples/bpf/test_cgrp2_sock.c | 255 - samples/bpf/test_cgrp2_sock.sh | 162 -- 5 files changed, 401 insertions(+), 65 deletions(-) -- 2.1.4
Re: [PATCH net-next] bridge: add tracepoint in br_fdb_update
On Thu, 31 Aug 2017 11:43:25 -0700 (PDT) David Miller wrote: > From: Roopa Prabhu > Date: Wed, 30 Aug 2017 22:18:13 -0700 > > > From: Roopa Prabhu > > > > This extends bridge fdb table tracepoints to also cover > > learned fdb entries in the br_fdb_update path. Note that > > unlike other tracepoints I have moved this to when the fdb > > is modified because this is in the datapath and can generate > > a lot of noise in the trace output. br_fdb_update is also called > > from added_by_user context in the NTF_USE case which is already > > traced ..hence the !added_by_user check. > > > > Signed-off-by: Roopa Prabhu > > Applied. > > Let's use dev->name for now and if the tooling can eventually > do transparent ifindex->name then we can consider redoing > a bunch of networking tracepoints. I agree! :-) -- Best regards, Jesper Dangaard Brouer MSc.CS, Principal Kernel Engineer at Red Hat LinkedIn: http://www.linkedin.com/in/brouer
Re: [patch net-next repost 0/8] mlxsw: Add IPv6 host dpipe table
From: Jiri Pirko Date: Thu, 31 Aug 2017 17:59:11 +0200 > This patchset adds IPv6 host dpipe table support. This will provide the > ability to observe the hardware offloaded IPv6 neighbors. Series applied, thanks. I noticed while reviewing this we are pretty much split on how to access neigh->primary_key. Half the code goes "(type *) n->primary_key" and the other half (mostly in ipv6) goes "(type *) &n->primary_key" I know both forms are basically equivalent, but consistency matters :-)
Re: [PATCH net-next] net/ncsi: Define {add, kill}_vid callbacks for !CONFIG_NET_NCSI
On Thu, 2017-08-31 at 08:24 -0700, Vernon Mauery wrote: > +int ncsi_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) > > +{ > > + return -ENOTTY; > > +} > > +int ncsi_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, u16 vid) > > +{ > > + return -ENOTTY; > > +} > > These should be static functions because they are defined in the header > file or you will get multiple symbol definitions. static inline even or you'll get warning about them being unused iirc. Cheers, Ben.
Re: [PATCH net-next 7/8] net: hns3: add vlan filter config of Ports
From: Lipeng Date: Thu, 31 Aug 2017 21:39:08 +0800 > Config the self_define vlan_type as TPID(0x8100) for vlan identification. > When normal port initialize vlan configure, set default vlan id as 0. > > Signed-off-by: Mingguang Qu > Signed-off-by: Lipeng No, that's not what this patch is doing. > @@ -3308,6 +3308,7 @@ static int hclge_add_mac_vlan_tbl(struct hclge_vport > *vport, > mc_desc[1].flag |= cpu_to_le16(HCLGE_CMD_FLAG_NEXT); > hclge_cmd_reuse_desc(&mc_desc[2], false); > mc_desc[2].flag &= cpu_to_le16(~HCLGE_CMD_FLAG_NEXT); > + > memcpy(mc_desc[0].data, req, > sizeof(struct hclge_mac_vlan_tbl_entry)); > ret = hclge_cmd_send(&hdev->hw, mc_desc, 3); All it does is add an empty line.
Re: [PATCH] ath9k: remove cast to void pointer
On Thu, 2017-08-31 at 18:37 +0530, Himanshu Jha wrote: > casting to void pointer from any pointer type and vice-versa is done > implicitly and therefore casting is not needed in such a case. You said you were going to remember to mention the tool and script that did this.
Re: [PATCH 2/3] security: bpf: Add eBPF LSM hooks and security field to eBPF map
On Thu, 2017-08-31 at 13:56 -0700, Chenbo Feng wrote: > From: Chenbo Feng > > Introduce a pointer into struct bpf_map to hold the security information > about the map. The actual security struct varies based on the security > models implemented. Place the LSM hooks before each of the unrestricted > eBPF operations, the map_update_elem and map_delete_elem operations are > checked by security_map_modify. The map_lookup_elem and map_get_next_key > operations are checked by securtiy_map_read. > > Signed-off-by: Chenbo Feng > --- > include/linux/bpf.h | 3 +++ > kernel/bpf/syscall.c | 28 > 2 files changed, 31 insertions(+) > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > index b69e7a5869ff..ca3e6ff7091d 100644 > --- a/include/linux/bpf.h > +++ b/include/linux/bpf.h > @@ -53,6 +53,9 @@ struct bpf_map { > struct work_struct work; > atomic_t usercnt; > struct bpf_map *inner_map_meta; > +#ifdef CONFIG_SECURITY > + void *security; > +#endif > }; > > /* function argument constraints */ > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c > index 045646da97cc..b15580bcf3b1 100644 > --- a/kernel/bpf/syscall.c > +++ b/kernel/bpf/syscall.c > @@ -279,6 +279,10 @@ static int map_create(union bpf_attr *attr) > if (err) > return -EINVAL; > > + err = security_map_create(); > + if (err) > + return -EACCES; Any reason not to just return err? Mimi > + > /* find map type and init map: hashtable vs rbtree vs bloom vs ... 
*/ > map = find_and_alloc_map(attr); > if (IS_ERR(map)) > @@ -291,6 +295,10 @@ static int map_create(union bpf_attr *attr) > if (err) > goto free_map_nouncharge; > > + err = security_post_create(map); > + if (err < 0) > + goto free_map; > + > err = bpf_map_alloc_id(map); > if (err) > goto free_map; > @@ -410,6 +418,10 @@ static int map_lookup_elem(union bpf_attr *attr) > if (IS_ERR(map)) > return PTR_ERR(map); > > + err = security_map_read(map); > + if (err) > + return -EACCES; > + > key = memdup_user(ukey, map->key_size); > if (IS_ERR(key)) { > err = PTR_ERR(key); > @@ -490,6 +502,10 @@ static int map_update_elem(union bpf_attr *attr) > if (IS_ERR(map)) > return PTR_ERR(map); > > + err = security_map_modify(map); > + if (err) > + return -EACCES; > + > key = memdup_user(ukey, map->key_size); > if (IS_ERR(key)) { > err = PTR_ERR(key); > @@ -573,6 +589,10 @@ static int map_delete_elem(union bpf_attr *attr) > if (IS_ERR(map)) > return PTR_ERR(map); > > + err = security_map_modify(map); > + if (err) > + return -EACCES; > + > key = memdup_user(ukey, map->key_size); > if (IS_ERR(key)) { > err = PTR_ERR(key); > @@ -616,6 +636,10 @@ static int map_get_next_key(union bpf_attr *attr) > if (IS_ERR(map)) > return PTR_ERR(map); > > + err = security_map_read(map); > + if (err) > + return -EACCES; > + > if (ukey) { > key = memdup_user(ukey, map->key_size); > if (IS_ERR(key)) { > @@ -935,6 +959,10 @@ static int bpf_prog_load(union bpf_attr *attr) > if (CHECK_ATTR(BPF_PROG_LOAD)) > return -EINVAL; > > + err = security_prog_load(); > + if (err) > + return -EACCES; > + > if (attr->prog_flags & ~BPF_F_STRICT_ALIGNMENT) > return -EINVAL; >
[PATCH net-next] doc: document MSG_ZEROCOPY
From: Willem de Bruijn Documentation for this feature was missing from the patchset. Copied a lot from the netdev 2.1 paper, addressing some small interface changes since then. Signed-off-by: Willem de Bruijn --- Documentation/networking/msg_zerocopy.rst | 254 ++ 1 file changed, 254 insertions(+) create mode 100644 Documentation/networking/msg_zerocopy.rst diff --git a/Documentation/networking/msg_zerocopy.rst b/Documentation/networking/msg_zerocopy.rst new file mode 100644 index ..2e2a3410b947 --- /dev/null +++ b/Documentation/networking/msg_zerocopy.rst @@ -0,0 +1,254 @@ + + +MSG_ZEROCOPY + + +Intro += + +The MSG_ZEROCOPY flag enables copy avoidance for socket send calls. +The feature is currently implemented for TCP sockets. + + +Opportunity and Caveats +--- + +Copying large buffers between user process and kernel can be +expensive. Linux supports various interfaces that eschew copying, +such as sendpage and splice. The MSG_ZEROCOPY flag extends the +underlying copy avoidance mechanism to common socket send calls. + +Copy avoidance is not a free lunch. As implemented, with page pinning, +it replaces per byte copy cost with page accounting and completion +notification overhead. As a result, MSG_ZEROCOPY is generally only +effective at writes over around 10 KB. + +Page pinning also changes system call semantics. It temporarily shares +the buffer between process and network stack. Unlike with copying, the +process cannot immediately overwrite the buffer after system call +return without possibly modifying the data in flight. Kernel integrity +is not affected, but a buggy program can possibly corrupt its own data +stream. + +The kernel returns a notification when it is safe to modify data. +Converting an existing application to MSG_ZEROCOPY is not always as +trivial as just passing the flag, then. + + +More Info +- + +Much of this document was derived from a longer paper presented at +netdev 2.1. 
For more in-depth information see that paper and talk, +the excellent reporting over at LWN.net or read the original code. + + paper, slides, video +https://netdevconf.org/2.1/session.html?debruijn + + LWN article +https://lwn.net/Articles/726917/ + + patchset +[PATCH net-next v4 0/9] socket sendmsg MSG_ZEROCOPY +https://lwn.net/Articles/730010/ +https://www.spinics.net/lists/netdev/msg447552.html + + +Interface += + +Passing the MSG_ZEROCOPY flag is the most obvious step to enable copy +avoidance, but not the only one. + +Socket Setup + + +The kernel is permissive when applications pass undefined flags to the +send system call. By default it simply ignores these. To avoid enabling +copy avoidance mode for legacy processes that accidentally pass this +flag, a process must first signal intent by setting a socket option: + +:: + + if (setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one))) + error(1, errno, "setsockopt zerocopy"); + + +Transmission + + +The change to send (or sendto, sendmsg, sendmmsg) itself is trivial. +Pass the new flag. + +:: + + ret = send(fd, buf, sizeof(buf), MSG_ZEROCOPY); + if (ret != sizeof(buf)) + error(1, errno, "send"); + + +Mixing copy avoidance and copying +~ + +Many workloads have a mixture of large and small buffers. Because copy +avoidance is more expensive than copying for small packets, the +feature is implemented as a flag. It is safe to mix calls with the flag +with those without. + + +Notifications +- + +The kernel has to notify the process when it is safe to reuse a +previously passed buffer. It queues completion notifications on the +socket error queue, akin to the transmit timestamping interface. + +The notification itself is a simple scalar value. Each socket +maintains an internal 32-bit counter. Each send call with MSG_ZEROCOPY +that successfully sends data increments the counter. The counter is +not incremented on failure or if called with length zero. 
+ + +Notification Reception +~~ + +The below snippet demonstrates the API. In the simplest case, each +send syscall is followed by a poll and recvmsg on the error queue. + +Reading from the error queue is always a non-blocking operation. The +poll call is there to block until an error is outstanding. It will set +POLLERR in its output flags. That flag does not have to be set in the +events field: errors are signaled unconditionally. + +:: + + pfd.fd = fd; + pfd.events = 0; + if (poll(&pfd, 1, -1) != 1 || pfd.revents & POLLERR == 0) + error(1, errno, "poll"); + + ret = recvmsg(fd, &msg, MSG_ERRQUEUE); + if (ret == -1) + error(1, errno, "recvmsg"); + + read_notification(msg); + +The example is for demonstration purposes only. In practice, it is more +efficient to not wait for notifications, but
Re: [PATCH v4 4/5] net: stmmac: dwmac-sun8i: choose internal PHY via phy-is-integrated
On Thu, Aug 31, 2017 at 03:18:03PM -0500, Rob Herring wrote: > On Sat, Aug 26, 2017 at 11:20:51PM +0200, Andrew Lunn wrote: > > Hi Corentin > > > > I think we have now all agreed this is an mdio-mux, plus it is also an > > MII mux. We should represent that in device tree. This patchset does > > this. However, as it is now, the mux structure in DT is ignored. All > > it does is search for the phy-is-integrated flags and goes on that. > > > > I made the comment that the device tree representation cannot be > > implemented using an MDIO mux driver, because of driver loading > > issues. However, the core of the MDIO mux code is just a library, > > symbols exported as GPL, free for anything to use. > > > > What i think should happen is the mdio-mux is implemented inside the > > MAC driver, using the mux-core as a library. The device tree structure > > of a mix is then reflected within Linux. The mux switch callback is > > implemented within the MAC driver. So it can reset the MAC when the > > mux is switched. The 'phy-is-integrated' property is then no longer > > needed. > > > > I would suggest a binding something like: > > This is looks better to me, but... > > > emac: ethernet@1c0b000 { > > compatible = "allwinner,sun8i-h3-emac"; > > syscon = <&syscon>; > > reg = <0x01c0b000 0x104>; > > interrupts = ; > > interrupt-names = "macirq"; > > resets = <&ccu RST_BUS_EMAC>; > > reset-names = "stmmaceth"; > > clocks = <&ccu CLK_BUS_EMAC>; > > clock-names = "stmmaceth"; > > #address-cells = <1>; > > #size-cells = <0>; > > > > phy-handle = <&int_mii_phy>; > > phy-mode = "mii"; > > allwinner,leds-active-low; > > > > mdio: mdio { > > #address-cells = <1>; > > #size-cells = <0>; > > } > > Why do you need this node still? Hi Rob It might not be needed, depending on how it is implemented. But: Documentation/devicetree/bindings/net/mdio-mux.txt It is normal for an mdio bus mux to have a phandle back to the parent mdio bus. 
Also, I think the stmmac driver will only instantiate the mdio bus if there is a node for it in the device tree. Andrew
[PATCH 3/3] selinux: bpf: Implement the selinux checks for eBPF object
From: Chenbo Feng Introduce 5 new selinux checks for eBPF object related operations. The check is based on the ownership information of eBPF maps and the capability of creating eBPF object. Signed-off-by: Chenbo Feng --- security/selinux/hooks.c| 54 + security/selinux/include/classmap.h | 2 ++ security/selinux/include/objsec.h | 4 +++ 3 files changed, 60 insertions(+) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 33fd061305c4..39ad7d9f335d 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -85,6 +85,7 @@ #include #include #include +#include #include "avc.h" #include "objsec.h" @@ -6245,6 +6246,52 @@ static void selinux_ib_free_security(void *ib_sec) } #endif +#ifdef CONFIG_BPF_SYSCALL +static int selinux_bpf_map_create(void) +{ + u32 sid = current_sid(); + + return avc_has_perm(sid, sid, SECCLASS_BPF, BPF__MAP_CREATE, NULL); +} + +static int selinux_bpf_map_modify(struct bpf_map *map) +{ + struct bpf_security_struct *bpfsec = map->security; + + return avc_has_perm(current_sid(), bpfsec->sid, SECCLASS_BPF, + BPF__MAP_MODIFY, NULL); +} + +static int selinux_bpf_map_read(struct bpf_map *map) +{ + struct bpf_security_struct *bpfsec = map->security; + + return avc_has_perm(current_sid(), bpfsec->sid, SECCLASS_BPF, + BPF__MAP_READ, NULL); +} + +static int selinux_bpf_prog_load(void) +{ + u32 sid = current_sid(); + + return avc_has_perm(sid, sid, SECCLASS_BPF, BPF__PROG_LOAD, NULL); +} + +static int selinux_bpf_post_create(struct bpf_map *map) +{ + struct bpf_security_struct *bpfsec; + + bpfsec = kzalloc(sizeof(*bpfsec), GFP_KERNEL); + if (!bpfsec) + return -ENOMEM; + + bpfsec->sid = current_sid(); + map->security = bpfsec; + + return 0; +} +#endif + static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(binder_set_context_mgr, selinux_binder_set_context_mgr), LSM_HOOK_INIT(binder_transaction, selinux_binder_transaction), @@ -6465,6 +6512,13 @@ static struct security_hook_list selinux_hooks[] 
__lsm_ro_after_init = { LSM_HOOK_INIT(audit_rule_match, selinux_audit_rule_match), LSM_HOOK_INIT(audit_rule_free, selinux_audit_rule_free), #endif +#ifdef CONFIG_BPF_SYSCALL + LSM_HOOK_INIT(bpf_map_create, selinux_bpf_map_create), + LSM_HOOK_INIT(bpf_map_modify, selinux_bpf_map_modify), + LSM_HOOK_INIT(bpf_map_read, selinux_bpf_map_read), + LSM_HOOK_INIT(bpf_prog_load, selinux_bpf_prog_load), + LSM_HOOK_INIT(bpf_post_create, selinux_bpf_post_create), +#endif }; static __init int selinux_init(void) diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h index b9fe3434b036..83c880fb17b4 100644 --- a/security/selinux/include/classmap.h +++ b/security/selinux/include/classmap.h @@ -235,6 +235,8 @@ struct security_class_mapping secclass_map[] = { { "access", NULL } }, { "infiniband_endport", { "manage_subnet", NULL } }, + { "bpf", + {"map_create", "map_modify", "map_read", "prog_load" } }, { NULL } }; diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h index 6ebc61e370ff..ba564f662b0d 100644 --- a/security/selinux/include/objsec.h +++ b/security/selinux/include/objsec.h @@ -150,6 +150,10 @@ struct pkey_security_struct { u32 sid;/* SID of pkey */ }; +struct bpf_security_struct { + u32 sid;/*SID of bpf obj creater*/ +}; + extern unsigned int selinux_checkreqprot; #endif /* _SELINUX_OBJSEC_H_ */ -- 2.14.1.581.gf28d330327-goog
[PATCH 2/3] security: bpf: Add eBPF LSM hooks and security field to eBPF map
From: Chenbo Feng Introduce a pointer into struct bpf_map to hold the security information about the map. The actual security struct varies based on the security models implemented. Place the LSM hooks before each of the unrestricted eBPF operations, the map_update_elem and map_delete_elem operations are checked by security_map_modify. The map_lookup_elem and map_get_next_key operations are checked by securtiy_map_read. Signed-off-by: Chenbo Feng --- include/linux/bpf.h | 3 +++ kernel/bpf/syscall.c | 28 2 files changed, 31 insertions(+) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b69e7a5869ff..ca3e6ff7091d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -53,6 +53,9 @@ struct bpf_map { struct work_struct work; atomic_t usercnt; struct bpf_map *inner_map_meta; +#ifdef CONFIG_SECURITY + void *security; +#endif }; /* function argument constraints */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 045646da97cc..b15580bcf3b1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -279,6 +279,10 @@ static int map_create(union bpf_attr *attr) if (err) return -EINVAL; + err = security_map_create(); + if (err) + return -EACCES; + /* find map type and init map: hashtable vs rbtree vs bloom vs ... 
*/ map = find_and_alloc_map(attr); if (IS_ERR(map)) @@ -291,6 +295,10 @@ static int map_create(union bpf_attr *attr) if (err) goto free_map_nouncharge; + err = security_post_create(map); + if (err < 0) + goto free_map; + err = bpf_map_alloc_id(map); if (err) goto free_map; @@ -410,6 +418,10 @@ static int map_lookup_elem(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); + err = security_map_read(map); + if (err) + return -EACCES; + key = memdup_user(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); @@ -490,6 +502,10 @@ static int map_update_elem(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); + err = security_map_modify(map); + if (err) + return -EACCES; + key = memdup_user(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); @@ -573,6 +589,10 @@ static int map_delete_elem(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); + err = security_map_modify(map); + if (err) + return -EACCES; + key = memdup_user(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); @@ -616,6 +636,10 @@ static int map_get_next_key(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); + err = security_map_read(map); + if (err) + return -EACCES; + if (ukey) { key = memdup_user(ukey, map->key_size); if (IS_ERR(key)) { @@ -935,6 +959,10 @@ static int bpf_prog_load(union bpf_attr *attr) if (CHECK_ATTR(BPF_PROG_LOAD)) return -EINVAL; + err = security_prog_load(); + if (err) + return -EACCES; + if (attr->prog_flags & ~BPF_F_STRICT_ALIGNMENT) return -EINVAL; -- 2.14.1.581.gf28d330327-goog