[PATCH net] Revert "blackhole_netdev: fix syzkaller reported issue"
This reverts commit b0818f80c8c1bc215bba276bd61c216014fab23b. Started seeing weird behavior after this patch especially in the IPv6 code path. Haven't root caused it, but since this was applied to net branch, taking a precautionary measure to revert it and look / analyze those failures Revert this now and I'll send a better fix after analysing / fixing the weirdness observed. CC: Eric Dumazet CC: Wei Wang CC: David S. Miller Signed-off-by: Mahesh Bandewar --- net/ipv6/addrconf.c | 7 +-- net/ipv6/route.c| 15 +-- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 4c87594d1389..34ccef18b40e 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -6996,7 +6996,7 @@ static struct rtnl_af_ops inet6_ops __read_mostly = { int __init addrconf_init(void) { - struct inet6_dev *idev, *bdev; + struct inet6_dev *idev; int i, err; err = ipv6_addr_label_init(); @@ -7036,14 +7036,10 @@ int __init addrconf_init(void) */ rtnl_lock(); idev = ipv6_add_dev(init_net.loopback_dev); - bdev = ipv6_add_dev(blackhole_netdev); rtnl_unlock(); if (IS_ERR(idev)) { err = PTR_ERR(idev); goto errlo; - } else if (IS_ERR(bdev)) { - err = PTR_ERR(bdev); - goto errlo; } ip6_route_init_special_entries(); @@ -7128,7 +7124,6 @@ void addrconf_cleanup(void) addrconf_ifdown(dev, 1); } addrconf_ifdown(init_net.loopback_dev, 2); - addrconf_ifdown(blackhole_netdev, 2); /* * Check hash table. diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 742120728869..a63ff85fe141 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -155,9 +155,10 @@ void rt6_uncached_list_del(struct rt6_info *rt) static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) { + struct net_device *loopback_dev = net->loopback_dev; int cpu; - if (dev == net->loopback_dev) + if (dev == loopback_dev) return; for_each_possible_cpu(cpu) { @@ -170,7 +171,7 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) struct net_device *rt_dev = rt->dst.dev; if (rt_idev->dev == dev) { - rt->rt6i_idev = in6_dev_get(blackhole_netdev); + rt->rt6i_idev = in6_dev_get(loopback_dev); in6_dev_put(rt_idev); } @@ -385,11 +386,13 @@ static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, { struct rt6_info *rt = (struct rt6_info *)dst; struct inet6_dev *idev = rt->rt6i_idev; + struct net_device *loopback_dev = + dev_net(dev)->loopback_dev; - if (idev && idev->dev != dev_net(dev)->loopback_dev) { - struct inet6_dev *ibdev = in6_dev_get(blackhole_netdev); - if (ibdev) { - rt->rt6i_idev = ibdev; + if (idev && idev->dev != loopback_dev) { + struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev); + if (loopback_idev) { + rt->rt6i_idev = loopback_idev; in6_dev_put(idev); } } -- 2.23.0.700.g56cf767bdb-goog
[PATCHv3 next] blackhole_netdev: fix syzkaller reported issue
83 e0 07 83 c0 03 38 d0 7c 08 84 d2 0f 85 15 RSP: 0018:88806c037038 EFLAGS: 00010207 RAX: dc00 RBX: 888094c5cd60 RCX: c9000b791000 RDX: 0047 RSI: 863e3caa RDI: 023c RBP: 88806c037098 R08: 888063de0600 R09: 88806c037288 R10: fbfff134af07 R11: 89a5783f R12: 0003 R13: 88806c037298 R14: 888094c5cd6f R15: FS: 7fa15e7de700() GS:8880ae80() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: 00625208 CR3: 5e348000 CR4: 001406f0 DR0: DR1: DR2: DR3: DR6: fffe0ff0 DR7: 0400 Fixes: 8d7017fd621d ("blackhole_netdev: use blackhole_netdev to invalidate dst entries") Signed-off-by: Mahesh Bandewar CC: Eric Dumazet CC: Wei Wang --- v1->v2: fixed missing update in ip6_dst_ifdown() v2->v3: added idev cleanup step in addrconf_cleanup() net/ipv6/addrconf.c | 7 ++- net/ipv6/route.c| 15 ++- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 98d82305d6de..2a01404aba78 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -6995,7 +6995,7 @@ static struct rtnl_af_ops inet6_ops __read_mostly = { int __init addrconf_init(void) { - struct inet6_dev *idev; + struct inet6_dev *idev, *bdev; int i, err; err = ipv6_addr_label_init(); @@ -7035,10 +7035,14 @@ int __init addrconf_init(void) */ rtnl_lock(); idev = ipv6_add_dev(init_net.loopback_dev); + bdev = ipv6_add_dev(blackhole_netdev); rtnl_unlock(); if (IS_ERR(idev)) { err = PTR_ERR(idev); goto errlo; + } else if (IS_ERR(bdev)) { + err = PTR_ERR(bdev); + goto errlo; } ip6_route_init_special_entries(); @@ -7123,6 +7127,7 @@ void addrconf_cleanup(void) addrconf_ifdown(dev, 1); } addrconf_ifdown(init_net.loopback_dev, 2); + addrconf_ifdown(blackhole_netdev, 2); /* * Check hash table. diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a63ff85fe141..742120728869 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -155,10 +155,9 @@ void rt6_uncached_list_del(struct rt6_info *rt) static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) { - struct net_device *loopback_dev = net->loopback_dev; int cpu; - if (dev == loopback_dev) + if (dev == net->loopback_dev) return; for_each_possible_cpu(cpu) { @@ -171,7 +170,7 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) struct net_device *rt_dev = rt->dst.dev; if (rt_idev->dev == dev) { - rt->rt6i_idev = in6_dev_get(loopback_dev); + rt->rt6i_idev = in6_dev_get(blackhole_netdev); in6_dev_put(rt_idev); } @@ -386,13 +385,11 @@ static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, { struct rt6_info *rt = (struct rt6_info *)dst; struct inet6_dev *idev = rt->rt6i_idev; - struct net_device *loopback_dev = - dev_net(dev)->loopback_dev; - if (idev && idev->dev != loopback_dev) { - struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev); - if (loopback_idev) { - rt->rt6i_idev = loopback_idev; + if (idev && idev->dev != dev_net(dev)->loopback_dev) { + struct inet6_dev *ibdev = in6_dev_get(blackhole_netdev); + if (ibdev) { + rt->rt6i_idev = ibdev; in6_dev_put(idev); } } -- 2.23.0.700.g56cf767bdb-goog
[PATCHv2 next] blackhole_netdev: fix syzkaller reported issue
83 e0 07 83 c0 03 38 d0 7c 08 84 d2 0f 85 15 RSP: 0018:88806c037038 EFLAGS: 00010207 RAX: dc00 RBX: 888094c5cd60 RCX: c9000b791000 RDX: 0047 RSI: 863e3caa RDI: 023c RBP: 88806c037098 R08: 888063de0600 R09: 88806c037288 R10: fbfff134af07 R11: 89a5783f R12: 0003 R13: 88806c037298 R14: 888094c5cd6f R15: FS: 7fa15e7de700() GS:8880ae80() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: 00625208 CR3: 5e348000 CR4: 001406f0 DR0: DR1: DR2: DR3: DR6: fffe0ff0 DR7: 0400 Fixes: 8d7017fd621d ("blackhole_netdev: use blackhole_netdev to invalidate dst entries") Signed-off-by: Mahesh Bandewar CC: Eric Dumazet --- v1->v2: fixed missing update in ip6_dst_ifdown() net/ipv6/addrconf.c | 6 +- net/ipv6/route.c| 15 ++- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 98d82305d6de..0f216f7cbe3e 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -6995,7 +6995,7 @@ static struct rtnl_af_ops inet6_ops __read_mostly = { int __init addrconf_init(void) { - struct inet6_dev *idev; + struct inet6_dev *idev, *bdev; int i, err; err = ipv6_addr_label_init(); @@ -7035,10 +7035,14 @@ int __init addrconf_init(void) */ rtnl_lock(); idev = ipv6_add_dev(init_net.loopback_dev); + bdev = ipv6_add_dev(blackhole_netdev); rtnl_unlock(); if (IS_ERR(idev)) { err = PTR_ERR(idev); goto errlo; + } else if (IS_ERR(bdev)) { + err = PTR_ERR(bdev); + goto errlo; } ip6_route_init_special_entries(); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a63ff85fe141..742120728869 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -155,10 +155,9 @@ void rt6_uncached_list_del(struct rt6_info *rt) static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) { - struct net_device *loopback_dev = net->loopback_dev; int cpu; - if (dev == loopback_dev) + if (dev == net->loopback_dev) return; for_each_possible_cpu(cpu) { @@ -171,7 +170,7 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) struct net_device *rt_dev = rt->dst.dev; if (rt_idev->dev == dev) { - rt->rt6i_idev = in6_dev_get(loopback_dev); + rt->rt6i_idev = in6_dev_get(blackhole_netdev); in6_dev_put(rt_idev); } @@ -386,13 +385,11 @@ static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, { struct rt6_info *rt = (struct rt6_info *)dst; struct inet6_dev *idev = rt->rt6i_idev; - struct net_device *loopback_dev = - dev_net(dev)->loopback_dev; - if (idev && idev->dev != loopback_dev) { - struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev); - if (loopback_idev) { - rt->rt6i_idev = loopback_idev; + if (idev && idev->dev != dev_net(dev)->loopback_dev) { + struct inet6_dev *ibdev = in6_dev_get(blackhole_netdev); + if (ibdev) { + rt->rt6i_idev = ibdev; in6_dev_put(idev); } } -- 2.23.0.581.g78d2f28ef7-goog
[PATCH next] ipvlan: consolidate TSO flags using NETIF_F_ALL_TSO
This will ensure that any new TSO related flags added (which would be part of ALL_TSO mask and IPvlan driver doesn't need to update every time new flag gets added. Signed-off-by: Mahesh Bandewar Suggested-by: Eric Dumazet --- drivers/net/ipvlan/ipvlan_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index 887bbba4631e..b0ac557f8e60 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -108,8 +108,8 @@ static void ipvlan_port_destroy(struct net_device *dev) #define IPVLAN_FEATURES \ (NETIF_F_SG | NETIF_F_CSUM_MASK | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | \ -NETIF_F_GSO | NETIF_F_TSO | NETIF_F_GSO_ROBUST | \ -NETIF_F_TSO_ECN | NETIF_F_TSO6 | NETIF_F_GRO | NETIF_F_RXCSUM | \ +NETIF_F_GSO | NETIF_F_ALL_TSO | NETIF_F_GSO_ROBUST | \ +NETIF_F_GRO | NETIF_F_RXCSUM | \ NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_STAG_FILTER) #define IPVLAN_STATE_MASK \ -- 2.23.0.581.g78d2f28ef7-goog
[PATCH next] blackhole_netdev: fix syzkaller reported issue
83 e0 07 83 c0 03 38 d0 7c 08 84 d2 0f 85 15 RSP: 0018:88806c037038 EFLAGS: 00010207 RAX: dc00 RBX: 888094c5cd60 RCX: c9000b791000 RDX: 0047 RSI: 863e3caa RDI: 023c RBP: 88806c037098 R08: 888063de0600 R09: 88806c037288 R10: fbfff134af07 R11: 89a5783f R12: 0003 R13: 88806c037298 R14: 888094c5cd6f R15: FS: 7fa15e7de700() GS:8880ae80() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: 00625208 CR3: 5e348000 CR4: 001406f0 DR0: DR1: DR2: DR3: DR6: fffe0ff0 DR7: 0400 Fixes: 8d7017fd621d ("blackhole_netdev: use blackhole_netdev to invalidate dst entries") Signed-off-by: Mahesh Bandewar --- net/ipv6/addrconf.c | 6 +- net/ipv6/route.c| 5 ++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 98d82305d6de..0f216f7cbe3e 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -6995,7 +6995,7 @@ static struct rtnl_af_ops inet6_ops __read_mostly = { int __init addrconf_init(void) { - struct inet6_dev *idev; + struct inet6_dev *idev, *bdev; int i, err; err = ipv6_addr_label_init(); @@ -7035,10 +7035,14 @@ int __init addrconf_init(void) */ rtnl_lock(); idev = ipv6_add_dev(init_net.loopback_dev); + bdev = ipv6_add_dev(blackhole_netdev); rtnl_unlock(); if (IS_ERR(idev)) { err = PTR_ERR(idev); goto errlo; + } else if (IS_ERR(bdev)) { + err = PTR_ERR(bdev); + goto errlo; } ip6_route_init_special_entries(); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a63ff85fe141..ea16e037ec13 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -155,10 +155,9 @@ void rt6_uncached_list_del(struct rt6_info *rt) static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) { - struct net_device *loopback_dev = net->loopback_dev; int cpu; - if (dev == loopback_dev) + if (dev == net->loopback_dev) return; for_each_possible_cpu(cpu) { @@ -171,7 +170,7 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) struct net_device *rt_dev = rt->dst.dev; if (rt_idev->dev == dev) { - rt->rt6i_idev = in6_dev_get(loopback_dev); + rt->rt6i_idev = in6_dev_get(blackhole_netdev); in6_dev_put(rt_idev); } -- 2.23.0.581.g78d2f28ef7-goog
[PATCH next] loopback: fix lockdep splat
dev_init_scheduler() and dev_activate() expect the caller to hold RTNL. Since we don't want blackhole device to be initialized per ns, we are initializing at init. [3.855027] Call Trace: [3.855034] dump_stack+0x67/0x95 [3.855037] lockdep_rcu_suspicious+0xd5/0x110 [3.855044] dev_init_scheduler+0xe3/0x120 [3.855048] ? net_olddevs_init+0x60/0x60 [3.855050] blackhole_netdev_init+0x45/0x6e [3.855052] do_one_initcall+0x6c/0x2fa [3.855058] ? rcu_read_lock_sched_held+0x8c/0xa0 [3.855066] kernel_init_freeable+0x1e5/0x288 [3.855071] ? rest_init+0x260/0x260 [3.855074] kernel_init+0xf/0x180 [3.855076] ? rest_init+0x260/0x260 [3.855078] ret_from_fork+0x24/0x30 Fixes: 4de83b88c66 ("loopback: create blackhole net device similar to loopack.") Reported-by: Geert Uytterhoeven Cc: Eric Dumazet Signed-off-by: Mahesh Bandewar --- drivers/net/loopback.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c index 3b39def5471e..14545a8797a8 100644 --- a/drivers/net/loopback.c +++ b/drivers/net/loopback.c @@ -261,8 +261,10 @@ static int __init blackhole_netdev_init(void) if (!blackhole_netdev) return -ENOMEM; + rtnl_lock(); dev_init_scheduler(blackhole_netdev); dev_activate(blackhole_netdev); + rtnl_unlock(); blackhole_netdev->flags |= IFF_UP | IFF_RUNNING; dev_net_set(blackhole_netdev, &init_net); -- 2.22.0.410.gd8fdbe21b5-goog
[PATCHv3 next 2/3] blackhole_netdev: use blackhole_netdev to invalidate dst entries
Use blackhole_netdev instead of 'lo' device with lower MTU when marking dst "dead". Signed-off-by: Mahesh Bandewar --- v1->v2->v3 no change net/core/dst.c | 2 +- net/ipv4/route.c | 3 +-- net/ipv6/route.c | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/net/core/dst.c b/net/core/dst.c index e46366228eaf..1325316d9eab 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -160,7 +160,7 @@ void dst_dev_put(struct dst_entry *dst) dst->ops->ifdown(dst, dev, true); dst->input = dst_discard; dst->output = dst_discard_out; - dst->dev = dev_net(dst->dev)->loopback_dev; + dst->dev = blackhole_netdev; dev_hold(dst->dev); dev_put(dev); } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index bbd55c7f6b2e..dc1f510a7c81 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1532,7 +1532,6 @@ static void ipv4_dst_destroy(struct dst_entry *dst) void rt_flush_dev(struct net_device *dev) { - struct net *net = dev_net(dev); struct rtable *rt; int cpu; @@ -1543,7 +1542,7 @@ void rt_flush_dev(struct net_device *dev) list_for_each_entry(rt, &ul->head, rt_uncached) { if (rt->dst.dev != dev) continue; - rt->dst.dev = net->loopback_dev; + rt->dst.dev = blackhole_netdev; dev_hold(rt->dst.dev); dev_put(dev); } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 7556275b1cef..39361f57351a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -176,7 +176,7 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) } if (rt_dev == dev) { - rt->dst.dev = loopback_dev; + rt->dst.dev = blackhole_netdev; dev_hold(rt->dst.dev); dev_put(rt_dev); } -- 2.22.0.410.gd8fdbe21b5-goog
[PATCHv3 next 3/3] blackhole_dev: add a selftest
Since this is not really a device with all capabilities, this test ensures that it has *enough* to make it through the data path without causing unwanted side-effects (read crash!). Signed-off-by: Mahesh Bandewar --- v1 -> v2 fixed the conflict resolution in selftests Makefile v2 -> v3 fixed lib/Kconfig.debug tristate text / string. lib/Kconfig.debug | 9 ++ lib/Makefile | 1 + lib/test_blackhole_dev.c | 100 ++ tools/testing/selftests/net/Makefile | 2 +- tools/testing/selftests/net/config| 1 + .../selftests/net/test_blackhole_dev.sh | 11 ++ 6 files changed, 123 insertions(+), 1 deletion(-) create mode 100644 lib/test_blackhole_dev.c create mode 100755 tools/testing/selftests/net/test_blackhole_dev.sh diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index cbdfae379896..99272b5dd980 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1909,6 +1909,15 @@ config TEST_BPF If unsure, say N. +config TEST_BLACKHOLE_DEV + tristate "Test blackhole netdev functionality" + depends on m && NET + help + This builds the "test_blackhole_dev" module that validates the + data path through this blackhole netdev. + + If unsure, say N. + config FIND_BIT_BENCHMARK tristate "Test find_bit functions" help diff --git a/lib/Makefile b/lib/Makefile index dcb558c7554d..6ac44fe2a37f 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -91,6 +91,7 @@ obj-$(CONFIG_TEST_DEBUG_VIRTUAL) += test_debug_virtual.o obj-$(CONFIG_TEST_MEMCAT_P) += test_memcat_p.o obj-$(CONFIG_TEST_OBJAGG) += test_objagg.o obj-$(CONFIG_TEST_STACKINIT) += test_stackinit.o +obj-$(CONFIG_TEST_BLACKHOLE_DEV) += test_blackhole_dev.o obj-$(CONFIG_TEST_LIVEPATCH) += livepatch/ diff --git a/lib/test_blackhole_dev.c b/lib/test_blackhole_dev.c new file mode 100644 index ..4c40580a99a3 --- /dev/null +++ b/lib/test_blackhole_dev.c @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This module tests the blackhole_dev that is created during the + * net subsystem initialization. The test this module performs is + * by injecting an skb into the stack with skb->dev as the + * blackhole_dev and expects kernel to behave in a sane manner + * (in other words, *not crash*)! + * + * Copyright (c) 2018, Mahesh Bandewar + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#define SKB_SIZE 256 +#define HEAD_SIZE (14+40+8)/* Ether + IPv6 + UDP */ +#define TAIL_SIZE 32 /* random tail-room */ + +#define UDP_PORT 1234 + +static int __init test_blackholedev_init(void) +{ + struct ipv6hdr *ip6h; + struct sk_buff *skb; + struct ethhdr *ethh; + struct udphdr *uh; + int data_len; + int ret; + + skb = alloc_skb(SKB_SIZE, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + /* Reserve head-room for the headers */ + skb_reserve(skb, HEAD_SIZE); + + /* Add data to the skb */ + data_len = SKB_SIZE - (HEAD_SIZE + TAIL_SIZE); + memset(__skb_put(skb, data_len), 0xf, data_len); + + /* Add protocol data */ + /* (Transport) UDP */ + uh = (struct udphdr *)skb_push(skb, sizeof(struct udphdr)); + skb_set_transport_header(skb, 0); + uh->source = uh->dest = htons(UDP_PORT); + uh->len = htons(data_len); + uh->check = 0; + /* (Network) IPv6 */ + ip6h = (struct ipv6hdr *)skb_push(skb, sizeof(struct ipv6hdr)); + skb_set_network_header(skb, 0); + ip6h->hop_limit = 32; + ip6h->payload_len = data_len + sizeof(struct udphdr); + ip6h->nexthdr = IPPROTO_UDP; + ip6h->saddr = in6addr_loopback; + ip6h->daddr = in6addr_loopback; + /* Ether */ + ethh = (struct ethhdr *)skb_push(skb, sizeof(struct ethhdr)); + skb_set_mac_header(skb, 0); + + skb->protocol = htons(ETH_P_IPV6); + skb->pkt_type = PACKET_HOST; + skb->dev = blackhole_netdev; + + /* Now attempt to send the packet */ + ret = dev_queue_xmit(skb); + + switch (ret) { + case NET_XMIT_SUCCESS: + pr_warn("dev_queue_xmit() returned NET_XMIT_SUCCESS\n"); + break; + case NET_XMIT_DROP: + pr_warn("dev_queue_xmit() returned NET_XMIT_DROP\n"); + break; + case NET_XMIT_CN: + pr_warn("dev_queue_xmit() returned NET_XMIT_CN\n"); + break; + default: + pr_err("dev_queue_xmit() returned UNKNOWN(%d)\n", ret); + } + + return 0; +} + +static void __exit test_blackholedev_exit(void) +{ + pr_warn("test_blackholedev module terminating.\n"); +} + +module_init(test_blackholedev_init); +mo
[PATCHv3 next 1/3] loopback: create blackhole net device similar to loopack.
Create a blackhole net device that can be used for "dead" dst entries instead of loopback device. This blackhole device differs from loopback in few aspects: (a) It's not per-ns. (b) MTU on this device is ETH_MIN_MTU (c) The xmit function is essentially kfree_skb(). and (d) since it's not registered it won't have ifindex. Lower MTU effectively make the device not pass the MTU check during the route check when a dst associated with the skb is dead. Signed-off-by: Mahesh Bandewar --- v1->v2->v3 no change drivers/net/loopback.c| 76 ++- include/linux/netdevice.h | 2 ++ 2 files changed, 69 insertions(+), 9 deletions(-) diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c index 87d361666cdd..3b39def5471e 100644 --- a/drivers/net/loopback.c +++ b/drivers/net/loopback.c @@ -55,6 +55,13 @@ #include #include +/* blackhole_netdev - a device used for dsts that are marked expired! + * This is global device (instead of per-net-ns) since it's not needed + * to be per-ns and gets initialized at boot time. + */ +struct net_device *blackhole_netdev; +EXPORT_SYMBOL(blackhole_netdev); + /* The higher levels take care of making this non-reentrant (it's * called with bh's disabled). */ @@ -150,12 +157,14 @@ static const struct net_device_ops loopback_ops = { .ndo_set_mac_address = eth_mac_addr, }; -/* The loopback device is special. There is only one instance - * per network namespace. - */ -static void loopback_setup(struct net_device *dev) +static void gen_lo_setup(struct net_device *dev, +unsigned int mtu, +const struct ethtool_ops *eth_ops, +const struct header_ops *hdr_ops, +const struct net_device_ops *dev_ops, +void (*dev_destructor)(struct net_device *dev)) { - dev->mtu= 64 * 1024; + dev->mtu= mtu; dev->hard_header_len= ETH_HLEN; /* 14 */ dev->min_header_len = ETH_HLEN; /* 14 */ dev->addr_len = ETH_ALEN; /* 6*/ @@ -174,11 +183,20 @@ static void loopback_setup(struct net_device *dev) | NETIF_F_NETNS_LOCAL | NETIF_F_VLAN_CHALLENGED | NETIF_F_LOOPBACK; - dev->ethtool_ops= &loopback_ethtool_ops; - dev->header_ops = ð_header_ops; - dev->netdev_ops = &loopback_ops; + dev->ethtool_ops= eth_ops; + dev->header_ops = hdr_ops; + dev->netdev_ops = dev_ops; dev->needs_free_netdev = true; - dev->priv_destructor= loopback_dev_free; + dev->priv_destructor= dev_destructor; +} + +/* The loopback device is special. There is only one instance + * per network namespace. + */ +static void loopback_setup(struct net_device *dev) +{ + gen_lo_setup(dev, (64 * 1024), &loopback_ethtool_ops, ð_header_ops, +&loopback_ops, loopback_dev_free); } /* Setup and register the loopback device. */ @@ -213,3 +231,43 @@ static __net_init int loopback_net_init(struct net *net) struct pernet_operations __net_initdata loopback_net_ops = { .init = loopback_net_init, }; + +/* blackhole netdevice */ +static netdev_tx_t blackhole_netdev_xmit(struct sk_buff *skb, +struct net_device *dev) +{ + kfree_skb(skb); + net_warn_ratelimited("%s(): Dropping skb.\n", __func__); + return NETDEV_TX_OK; +} + +static const struct net_device_ops blackhole_netdev_ops = { + .ndo_start_xmit = blackhole_netdev_xmit, +}; + +/* This is a dst-dummy device used specifically for invalidated + * DSTs and unlike loopback, this is not per-ns. + */ +static void blackhole_netdev_setup(struct net_device *dev) +{ + gen_lo_setup(dev, ETH_MIN_MTU, NULL, NULL, &blackhole_netdev_ops, NULL); +} + +/* Setup and register the blackhole_netdev. */ +static int __init blackhole_netdev_init(void) +{ + blackhole_netdev = alloc_netdev(0, "blackhole_dev", NET_NAME_UNKNOWN, + blackhole_netdev_setup); + if (!blackhole_netdev) + return -ENOMEM; + + dev_init_scheduler(blackhole_netdev); + dev_activate(blackhole_netdev); + + blackhole_netdev->flags |= IFF_UP | IFF_RUNNING; + dev_net_set(blackhole_netdev, &init_net); + + return 0; +} + +device_initcall(blackhole_netdev_init); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index eeacebd7debb..88292953aa6f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4870,4 +4870,6 @@ do { \ #define PTYPE_HASH_SIZE(16) #define PTYPE_HASH_MASK(PTYPE_HASH_SIZE - 1) +extern struct net_device *blackhole_netdev; + #endif /* _LINUX_NETDEVICE_H */ -- 2.22.0.410.gd8fdbe21b5-goog
[PATCHv3 next 0/3] blackhole device to invalidate dst
When we invalidate dst or mark it "dead", we assign 'lo' to dst->dev. First of all this assignment is racy and more over, it has MTU implications. The standard dev MTU is 1500 while the Loopback MTU is 64k. TCP code when dereferencing the dst don't check if the dst is valid or not. TCP when dereferencing a dead-dst while negotiating a new connection, may use dst device which is 'lo' instead of using the correct device. Consider the following scenario: A SYN arrives on an interface and tcp-layer while processing SYNACK finds a dst and associates it with SYNACK skb. Now before skb gets passed to L3 for processing, if that dst gets "dead" (because of the virtual device getting disappeared & then reappeared), the 'lo' gets assigned to that dst (lo MTU = 64k). Let's assume the SYN has ADV_MSS set as 9k while the output device through which this SYNACK is going to go out has standard MTU of 1500. The MTU check during the route check passes since MIN(9K, 64K) is 9k and TCP successfully negotiates 9k MSS. The subsequent data packet; bigger in size gets passed to the device and it won't be marked as GSO since the assumed MTU of the device is 9k. This either crashes the NIC and we have seen fixes that went into drivers to handle this scenario. 8914a595110a ('bnx2x: disable GSO where gso_size is too big for hardware') and 2b16f048729b ('net: create skb_gso_validate_mac_len()') and with those fixes TCP eventually recovers but not before few dropped segments. Well, I'm not a TCP expert and though we have experienced these corner cases in our environment, I could not reproduce this case reliably in my test setup to try this fix myself. However, Michael Chan had a setup where these fixes helped him mitigate the issue and not cause the crash. The idea here is to not alter the data-path with additional locks or smb()/rmb() barriers to avoid racy assignments but to create a new device that has really low MTU that has .ndo_start_xmit essentially a kfree_skb(). Make use of this device instead of 'lo' when marking the dst dead. First patch implements the blackhole device and second patch uses it in IPv4 and IPv6 stack while the third patch is the self test that ensures the sanity of this device. v1->v2 fixed the self-test patch to handle the conflict v2 -> v3 fixed Kconfig text/string. Mahesh Bandewar (3): loopback: create blackhole net device similar to loopack. blackhole_netdev: use blackhole_netdev to invalidate dst entries blackhole_dev: add a selftest drivers/net/loopback.c| 76 +++-- include/linux/netdevice.h | 2 + lib/Kconfig.debug | 9 ++ lib/Makefile | 1 + lib/test_blackhole_dev.c | 100 ++ net/core/dst.c| 2 +- net/ipv4/route.c | 3 +- net/ipv6/route.c | 2 +- tools/testing/selftests/net/Makefile | 2 +- tools/testing/selftests/net/config| 1 + .../selftests/net/test_blackhole_dev.sh | 11 ++ 11 files changed, 195 insertions(+), 14 deletions(-) create mode 100644 lib/test_blackhole_dev.c create mode 100755 tools/testing/selftests/net/test_blackhole_dev.sh -- 2.22.0.410.gd8fdbe21b5-goog
[PATCHv2 next 3/3] blackhole_dev: add a selftest
Since this is not really a device with all capabilities, this test ensures that it has *enough* to make it through the data path without causing unwanted side-effects (read crash!). Signed-off-by: Mahesh Bandewar --- v1 -> v2 fixed the conflict resolution in selftests Makefile lib/Kconfig.debug | 9 ++ lib/Makefile | 1 + lib/test_blackhole_dev.c | 100 ++ tools/testing/selftests/net/Makefile | 2 +- tools/testing/selftests/net/config| 1 + .../selftests/net/test_blackhole_dev.sh | 11 ++ 6 files changed, 123 insertions(+), 1 deletion(-) create mode 100644 lib/test_blackhole_dev.c create mode 100755 tools/testing/selftests/net/test_blackhole_dev.sh diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index cbdfae379896..79d96e1614f5 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1909,6 +1909,15 @@ config TEST_BPF If unsure, say N. +config TEST_BLACKHOLE_DEV + tristate "Test BPF filter functionality" + depends on m && NET + help + This builds the "test_blackhole_dev" module that validates the + data path through this blackhole netdev. + + If unsure, say N. + config FIND_BIT_BENCHMARK tristate "Test find_bit functions" help diff --git a/lib/Makefile b/lib/Makefile index fb7697031a79..c293624d3664 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -91,6 +91,7 @@ obj-$(CONFIG_TEST_DEBUG_VIRTUAL) += test_debug_virtual.o obj-$(CONFIG_TEST_MEMCAT_P) += test_memcat_p.o obj-$(CONFIG_TEST_OBJAGG) += test_objagg.o obj-$(CONFIG_TEST_STACKINIT) += test_stackinit.o +obj-$(CONFIG_TEST_BLACKHOLE_DEV) += test_blackhole_dev.o obj-$(CONFIG_TEST_LIVEPATCH) += livepatch/ diff --git a/lib/test_blackhole_dev.c b/lib/test_blackhole_dev.c new file mode 100644 index ..4c40580a99a3 --- /dev/null +++ b/lib/test_blackhole_dev.c @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This module tests the blackhole_dev that is created during the + * net subsystem initialization. The test this module performs is + * by injecting an skb into the stack with skb->dev as the + * blackhole_dev and expects kernel to behave in a sane manner + * (in other words, *not crash*)! + * + * Copyright (c) 2018, Mahesh Bandewar + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#define SKB_SIZE 256 +#define HEAD_SIZE (14+40+8)/* Ether + IPv6 + UDP */ +#define TAIL_SIZE 32 /* random tail-room */ + +#define UDP_PORT 1234 + +static int __init test_blackholedev_init(void) +{ + struct ipv6hdr *ip6h; + struct sk_buff *skb; + struct ethhdr *ethh; + struct udphdr *uh; + int data_len; + int ret; + + skb = alloc_skb(SKB_SIZE, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + /* Reserve head-room for the headers */ + skb_reserve(skb, HEAD_SIZE); + + /* Add data to the skb */ + data_len = SKB_SIZE - (HEAD_SIZE + TAIL_SIZE); + memset(__skb_put(skb, data_len), 0xf, data_len); + + /* Add protocol data */ + /* (Transport) UDP */ + uh = (struct udphdr *)skb_push(skb, sizeof(struct udphdr)); + skb_set_transport_header(skb, 0); + uh->source = uh->dest = htons(UDP_PORT); + uh->len = htons(data_len); + uh->check = 0; + /* (Network) IPv6 */ + ip6h = (struct ipv6hdr *)skb_push(skb, sizeof(struct ipv6hdr)); + skb_set_network_header(skb, 0); + ip6h->hop_limit = 32; + ip6h->payload_len = data_len + sizeof(struct udphdr); + ip6h->nexthdr = IPPROTO_UDP; + ip6h->saddr = in6addr_loopback; + ip6h->daddr = in6addr_loopback; + /* Ether */ + ethh = (struct ethhdr *)skb_push(skb, sizeof(struct ethhdr)); + skb_set_mac_header(skb, 0); + + skb->protocol = htons(ETH_P_IPV6); + skb->pkt_type = PACKET_HOST; + skb->dev = blackhole_netdev; + + /* Now attempt to send the packet */ + ret = dev_queue_xmit(skb); + + switch (ret) { + case NET_XMIT_SUCCESS: + pr_warn("dev_queue_xmit() returned NET_XMIT_SUCCESS\n"); + break; + case NET_XMIT_DROP: + pr_warn("dev_queue_xmit() returned NET_XMIT_DROP\n"); + break; + case NET_XMIT_CN: + pr_warn("dev_queue_xmit() returned NET_XMIT_CN\n"); + break; + default: + pr_err("dev_queue_xmit() returned UNKNOWN(%d)\n", ret); + } + + return 0; +} + +static void __exit test_blackholedev_exit(void) +{ + pr_warn("test_blackholedev module terminating.\n"); +} + +module_init(test_blackholedev_init); +module_exit(test_blackholedev_exit); + +MODULE_AUTHOR("Mahe
[PATCHv2 next 0/3] blackhole device to invalidate dst
When we invalidate dst or mark it "dead", we assign 'lo' to dst->dev. First of all this assignment is racy and more over, it has MTU implications. The standard dev MTU is 1500 while the Loopback MTU is 64k. TCP code when dereferencing the dst don't check if the dst is valid or not. TCP when dereferencing a dead-dst while negotiating a new connection, may use dst device which is 'lo' instead of using the correct device. Consider the following scenario: A SYN arrives on an interface and tcp-layer while processing SYNACK finds a dst and associates it with SYNACK skb. Now before skb gets passed to L3 for processing, if that dst gets "dead" (because of the virtual device getting disappeared & then reappeared), the 'lo' gets assigned to that dst (lo MTU = 64k). Let's assume the SYN has ADV_MSS set as 9k while the output device through which this SYNACK is going to go out has standard MTU of 1500. The MTU check during the route check passes since MIN(9K, 64K) is 9k and TCP successfully negotiates 9k MSS. The subsequent data packet; bigger in size gets passed to the device and it won't be marked as GSO since the assumed MTU of the device is 9k. This either crashes the NIC and we have seen fixes that went into drivers to handle this scenario. 8914a595110a ('bnx2x: disable GSO where gso_size is too big for hardware') and 2b16f048729b ('net: create skb_gso_validate_mac_len()') and with those fixes TCP eventually recovers but not before few dropped segments. Well, I'm not a TCP expert and though we have experienced these corner cases in our environment, I could not reproduce this case reliably in my test setup to try this fix myself. However, Michael Chan had a setup where these fixes helped him mitigate the issue and not cause the crash. The idea here is to not alter the data-path with additional locks or smb()/rmb() barriers to avoid racy assignments but to create a new device that has really low MTU that has .ndo_start_xmit essentially a kfree_skb(). Make use of this device instead of 'lo' when marking the dst dead. First patch implements the blackhole device and second patch uses it in IPv4 and IPv6 stack while the third patch is the self test that ensures the sanity of this device. v1->v2 fixed the self-test patch to handle the conflict Mahesh Bandewar (3): loopback: create blackhole net device similar to loopack. blackhole_netdev: use blackhole_netdev to invalidate dst entries blackhole_dev: add a selftest drivers/net/loopback.c| 76 +++-- include/linux/netdevice.h | 2 + lib/Kconfig.debug | 9 ++ lib/Makefile | 1 + lib/test_blackhole_dev.c | 100 ++ net/core/dst.c| 2 +- net/ipv4/route.c | 3 +- net/ipv6/route.c | 2 +- tools/testing/selftests/net/Makefile | 2 +- tools/testing/selftests/net/config| 1 + .../selftests/net/test_blackhole_dev.sh | 11 ++ 11 files changed, 195 insertions(+), 14 deletions(-) create mode 100644 lib/test_blackhole_dev.c create mode 100755 tools/testing/selftests/net/test_blackhole_dev.sh -- 2.22.0.410.gd8fdbe21b5-goog
[PATCHv2 next 1/3] loopback: create blackhole net device similar to loopack.
Create a blackhole net device that can be used for "dead" dst entries instead of loopback device. This blackhole device differs from loopback in few aspects: (a) It's not per-ns. (b) MTU on this device is ETH_MIN_MTU (c) The xmit function is essentially kfree_skb(). and (d) since it's not registered it won't have ifindex. Lower MTU effectively make the device not pass the MTU check during the route check when a dst associated with the skb is dead. Signed-off-by: Mahesh Bandewar --- v1->v2 no change drivers/net/loopback.c| 76 ++- include/linux/netdevice.h | 2 ++ 2 files changed, 69 insertions(+), 9 deletions(-) diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c index 87d361666cdd..3b39def5471e 100644 --- a/drivers/net/loopback.c +++ b/drivers/net/loopback.c @@ -55,6 +55,13 @@ #include #include +/* blackhole_netdev - a device used for dsts that are marked expired! + * This is global device (instead of per-net-ns) since it's not needed + * to be per-ns and gets initialized at boot time. + */ +struct net_device *blackhole_netdev; +EXPORT_SYMBOL(blackhole_netdev); + /* The higher levels take care of making this non-reentrant (it's * called with bh's disabled). */ @@ -150,12 +157,14 @@ static const struct net_device_ops loopback_ops = { .ndo_set_mac_address = eth_mac_addr, }; -/* The loopback device is special. There is only one instance - * per network namespace. - */ -static void loopback_setup(struct net_device *dev) +static void gen_lo_setup(struct net_device *dev, +unsigned int mtu, +const struct ethtool_ops *eth_ops, +const struct header_ops *hdr_ops, +const struct net_device_ops *dev_ops, +void (*dev_destructor)(struct net_device *dev)) { - dev->mtu= 64 * 1024; + dev->mtu= mtu; dev->hard_header_len= ETH_HLEN; /* 14 */ dev->min_header_len = ETH_HLEN; /* 14 */ dev->addr_len = ETH_ALEN; /* 6*/ @@ -174,11 +183,20 @@ static void loopback_setup(struct net_device *dev) | NETIF_F_NETNS_LOCAL | NETIF_F_VLAN_CHALLENGED | NETIF_F_LOOPBACK; - dev->ethtool_ops= &loopback_ethtool_ops; - dev->header_ops = ð_header_ops; - dev->netdev_ops = &loopback_ops; + dev->ethtool_ops= eth_ops; + dev->header_ops = hdr_ops; + dev->netdev_ops = dev_ops; dev->needs_free_netdev = true; - dev->priv_destructor= loopback_dev_free; + dev->priv_destructor= dev_destructor; +} + +/* The loopback device is special. There is only one instance + * per network namespace. + */ +static void loopback_setup(struct net_device *dev) +{ + gen_lo_setup(dev, (64 * 1024), &loopback_ethtool_ops, ð_header_ops, +&loopback_ops, loopback_dev_free); } /* Setup and register the loopback device. */ @@ -213,3 +231,43 @@ static __net_init int loopback_net_init(struct net *net) struct pernet_operations __net_initdata loopback_net_ops = { .init = loopback_net_init, }; + +/* blackhole netdevice */ +static netdev_tx_t blackhole_netdev_xmit(struct sk_buff *skb, +struct net_device *dev) +{ + kfree_skb(skb); + net_warn_ratelimited("%s(): Dropping skb.\n", __func__); + return NETDEV_TX_OK; +} + +static const struct net_device_ops blackhole_netdev_ops = { + .ndo_start_xmit = blackhole_netdev_xmit, +}; + +/* This is a dst-dummy device used specifically for invalidated + * DSTs and unlike loopback, this is not per-ns. + */ +static void blackhole_netdev_setup(struct net_device *dev) +{ + gen_lo_setup(dev, ETH_MIN_MTU, NULL, NULL, &blackhole_netdev_ops, NULL); +} + +/* Setup and register the blackhole_netdev. */ +static int __init blackhole_netdev_init(void) +{ + blackhole_netdev = alloc_netdev(0, "blackhole_dev", NET_NAME_UNKNOWN, + blackhole_netdev_setup); + if (!blackhole_netdev) + return -ENOMEM; + + dev_init_scheduler(blackhole_netdev); + dev_activate(blackhole_netdev); + + blackhole_netdev->flags |= IFF_UP | IFF_RUNNING; + dev_net_set(blackhole_netdev, &init_net); + + return 0; +} + +device_initcall(blackhole_netdev_init); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index eeacebd7debb..88292953aa6f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4870,4 +4870,6 @@ do { \ #define PTYPE_HASH_SIZE(16) #define PTYPE_HASH_MASK(PTYPE_HASH_SIZE - 1) +extern struct net_device *blackhole_netdev; + #endif /* _LINUX_NETDEVICE_H */ -- 2.22.0.410.gd8fdbe21b5-goog
[PATCHv2 next 2/3] blackhole_netdev: use blackhole_netdev to invalidate dst entries
Use blackhole_netdev instead of 'lo' device with lower MTU when marking dst "dead". Signed-off-by: Mahesh Bandewar --- v1 -> v2 no change net/core/dst.c | 2 +- net/ipv4/route.c | 3 +-- net/ipv6/route.c | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/net/core/dst.c b/net/core/dst.c index e46366228eaf..1325316d9eab 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -160,7 +160,7 @@ void dst_dev_put(struct dst_entry *dst) dst->ops->ifdown(dst, dev, true); dst->input = dst_discard; dst->output = dst_discard_out; - dst->dev = dev_net(dst->dev)->loopback_dev; + dst->dev = blackhole_netdev; dev_hold(dst->dev); dev_put(dev); } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 59670fafcd26..d61f43feb7fa 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1532,7 +1532,6 @@ static void ipv4_dst_destroy(struct dst_entry *dst) void rt_flush_dev(struct net_device *dev) { - struct net *net = dev_net(dev); struct rtable *rt; int cpu; @@ -1543,7 +1542,7 @@ void rt_flush_dev(struct net_device *dev) list_for_each_entry(rt, &ul->head, rt_uncached) { if (rt->dst.dev != dev) continue; - rt->dst.dev = net->loopback_dev; + rt->dst.dev = blackhole_netdev; dev_hold(rt->dst.dev); dev_put(dev); } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index e7c2824435c6..ff44c6287633 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -176,7 +176,7 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) } if (rt_dev == dev) { - rt->dst.dev = loopback_dev; + rt->dst.dev = blackhole_netdev; dev_hold(rt->dst.dev); dev_put(rt_dev); } -- 2.22.0.410.gd8fdbe21b5-goog
[PATCH next 2/3] blackhole_netdev: use blackhole_netdev to invalidate dst entries
Use blackhole_netdev instead of 'lo' device with lower MTU when marking dst "dead". Signed-off-by: Mahesh Bandewar --- net/core/dst.c | 2 +- net/ipv4/route.c | 3 +-- net/ipv6/route.c | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/net/core/dst.c b/net/core/dst.c index e46366228eaf..1325316d9eab 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -160,7 +160,7 @@ void dst_dev_put(struct dst_entry *dst) dst->ops->ifdown(dst, dev, true); dst->input = dst_discard; dst->output = dst_discard_out; - dst->dev = dev_net(dst->dev)->loopback_dev; + dst->dev = blackhole_netdev; dev_hold(dst->dev); dev_put(dev); } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 66cbe8a7a168..049af0f88307 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1532,7 +1532,6 @@ static void ipv4_dst_destroy(struct dst_entry *dst) void rt_flush_dev(struct net_device *dev) { - struct net *net = dev_net(dev); struct rtable *rt; int cpu; @@ -1543,7 +1542,7 @@ void rt_flush_dev(struct net_device *dev) list_for_each_entry(rt, &ul->head, rt_uncached) { if (rt->dst.dev != dev) continue; - rt->dst.dev = net->loopback_dev; + rt->dst.dev = blackhole_netdev; dev_hold(rt->dst.dev); dev_put(dev); } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index c4d285fe0adc..335bcf812b25 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -176,7 +176,7 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev) } if (rt_dev == dev) { - rt->dst.dev = loopback_dev; + rt->dst.dev = blackhole_netdev; dev_hold(rt->dst.dev); dev_put(rt_dev); } -- 2.22.0.410.gd8fdbe21b5-goog
[PATCH next 3/3] blackhole_dev: add a selftest
Since this is not really a device with all capabilities, this test ensures that it has *enough* to make it through the data path without causing unwanted side-effects (read crash!). Signed-off-by: Mahesh Bandewar --- lib/Kconfig.debug | 9 ++ lib/Makefile | 1 + lib/test_blackhole_dev.c | 100 ++ tools/testing/selftests/net/Makefile | 3 +- tools/testing/selftests/net/config| 1 + .../selftests/net/test_blackhole_dev.sh | 11 ++ 6 files changed, 124 insertions(+), 1 deletion(-) create mode 100644 lib/test_blackhole_dev.c create mode 100755 tools/testing/selftests/net/test_blackhole_dev.sh diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index cbdfae379896..79d96e1614f5 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1909,6 +1909,15 @@ config TEST_BPF If unsure, say N. +config TEST_BLACKHOLE_DEV + tristate "Test BPF filter functionality" + depends on m && NET + help + This builds the "test_blackhole_dev" module that validates the + data path through this blackhole netdev. + + If unsure, say N. + config FIND_BIT_BENCHMARK tristate "Test find_bit functions" help diff --git a/lib/Makefile b/lib/Makefile index fb7697031a79..c293624d3664 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -91,6 +91,7 @@ obj-$(CONFIG_TEST_DEBUG_VIRTUAL) += test_debug_virtual.o obj-$(CONFIG_TEST_MEMCAT_P) += test_memcat_p.o obj-$(CONFIG_TEST_OBJAGG) += test_objagg.o obj-$(CONFIG_TEST_STACKINIT) += test_stackinit.o +obj-$(CONFIG_TEST_BLACKHOLE_DEV) += test_blackhole_dev.o obj-$(CONFIG_TEST_LIVEPATCH) += livepatch/ diff --git a/lib/test_blackhole_dev.c b/lib/test_blackhole_dev.c new file mode 100644 index ..4c40580a99a3 --- /dev/null +++ b/lib/test_blackhole_dev.c @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This module tests the blackhole_dev that is created during the + * net subsystem initialization. The test this module performs is + * by injecting an skb into the stack with skb->dev as the + * blackhole_dev and expects kernel to behave in a sane manner + * (in other words, *not crash*)! + * + * Copyright (c) 2018, Mahesh Bandewar + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#define SKB_SIZE 256 +#define HEAD_SIZE (14+40+8)/* Ether + IPv6 + UDP */ +#define TAIL_SIZE 32 /* random tail-room */ + +#define UDP_PORT 1234 + +static int __init test_blackholedev_init(void) +{ + struct ipv6hdr *ip6h; + struct sk_buff *skb; + struct ethhdr *ethh; + struct udphdr *uh; + int data_len; + int ret; + + skb = alloc_skb(SKB_SIZE, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + /* Reserve head-room for the headers */ + skb_reserve(skb, HEAD_SIZE); + + /* Add data to the skb */ + data_len = SKB_SIZE - (HEAD_SIZE + TAIL_SIZE); + memset(__skb_put(skb, data_len), 0xf, data_len); + + /* Add protocol data */ + /* (Transport) UDP */ + uh = (struct udphdr *)skb_push(skb, sizeof(struct udphdr)); + skb_set_transport_header(skb, 0); + uh->source = uh->dest = htons(UDP_PORT); + uh->len = htons(data_len); + uh->check = 0; + /* (Network) IPv6 */ + ip6h = (struct ipv6hdr *)skb_push(skb, sizeof(struct ipv6hdr)); + skb_set_network_header(skb, 0); + ip6h->hop_limit = 32; + ip6h->payload_len = data_len + sizeof(struct udphdr); + ip6h->nexthdr = IPPROTO_UDP; + ip6h->saddr = in6addr_loopback; + ip6h->daddr = in6addr_loopback; + /* Ether */ + ethh = (struct ethhdr *)skb_push(skb, sizeof(struct ethhdr)); + skb_set_mac_header(skb, 0); + + skb->protocol = htons(ETH_P_IPV6); + skb->pkt_type = PACKET_HOST; + skb->dev = blackhole_netdev; + + /* Now attempt to send the packet */ + ret = dev_queue_xmit(skb); + + switch (ret) { + case NET_XMIT_SUCCESS: + pr_warn("dev_queue_xmit() returned NET_XMIT_SUCCESS\n"); + break; + case NET_XMIT_DROP: + pr_warn("dev_queue_xmit() returned NET_XMIT_DROP\n"); + break; + case NET_XMIT_CN: + pr_warn("dev_queue_xmit() returned NET_XMIT_CN\n"); + break; + default: + pr_err("dev_queue_xmit() returned UNKNOWN(%d)\n", ret); + } + + return 0; +} + +static void __exit test_blackholedev_exit(void) +{ + pr_warn("test_blackholedev module terminating.\n"); +} + +module_init(test_blackholedev_init); +module_exit(test_blackholedev_exit); + +MODULE_AUTHOR("Mahesh Bandewar "); +MODULE_LICENSE("GPL"); diff --git a/too
[PATCH next 1/3] loopback: create blackhole net device similar to loopack.
Create a blackhole net device that can be used for "dead" dst entries instead of loopback device. This blackhole device differs from loopback in few aspects: (a) It's not per-ns. (b) MTU on this device is ETH_MIN_MTU (c) The xmit function is essentially kfree_skb(). and (d) since it's not registered it won't have ifindex. Lower MTU effectively make the device not pass the MTU check during the route check when a dst associated with the skb is dead. Signed-off-by: Mahesh Bandewar --- drivers/net/loopback.c| 76 ++- include/linux/netdevice.h | 2 ++ 2 files changed, 69 insertions(+), 9 deletions(-) diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c index 87d361666cdd..3b39def5471e 100644 --- a/drivers/net/loopback.c +++ b/drivers/net/loopback.c @@ -55,6 +55,13 @@ #include #include +/* blackhole_netdev - a device used for dsts that are marked expired! + * This is global device (instead of per-net-ns) since it's not needed + * to be per-ns and gets initialized at boot time. + */ +struct net_device *blackhole_netdev; +EXPORT_SYMBOL(blackhole_netdev); + /* The higher levels take care of making this non-reentrant (it's * called with bh's disabled). */ @@ -150,12 +157,14 @@ static const struct net_device_ops loopback_ops = { .ndo_set_mac_address = eth_mac_addr, }; -/* The loopback device is special. There is only one instance - * per network namespace. - */ -static void loopback_setup(struct net_device *dev) +static void gen_lo_setup(struct net_device *dev, +unsigned int mtu, +const struct ethtool_ops *eth_ops, +const struct header_ops *hdr_ops, +const struct net_device_ops *dev_ops, +void (*dev_destructor)(struct net_device *dev)) { - dev->mtu= 64 * 1024; + dev->mtu= mtu; dev->hard_header_len= ETH_HLEN; /* 14 */ dev->min_header_len = ETH_HLEN; /* 14 */ dev->addr_len = ETH_ALEN; /* 6*/ @@ -174,11 +183,20 @@ static void loopback_setup(struct net_device *dev) | NETIF_F_NETNS_LOCAL | NETIF_F_VLAN_CHALLENGED | NETIF_F_LOOPBACK; - dev->ethtool_ops= &loopback_ethtool_ops; - dev->header_ops = ð_header_ops; - dev->netdev_ops = &loopback_ops; + dev->ethtool_ops= eth_ops; + dev->header_ops = hdr_ops; + dev->netdev_ops = dev_ops; dev->needs_free_netdev = true; - dev->priv_destructor= loopback_dev_free; + dev->priv_destructor= dev_destructor; +} + +/* The loopback device is special. There is only one instance + * per network namespace. + */ +static void loopback_setup(struct net_device *dev) +{ + gen_lo_setup(dev, (64 * 1024), &loopback_ethtool_ops, ð_header_ops, +&loopback_ops, loopback_dev_free); } /* Setup and register the loopback device. */ @@ -213,3 +231,43 @@ static __net_init int loopback_net_init(struct net *net) struct pernet_operations __net_initdata loopback_net_ops = { .init = loopback_net_init, }; + +/* blackhole netdevice */ +static netdev_tx_t blackhole_netdev_xmit(struct sk_buff *skb, +struct net_device *dev) +{ + kfree_skb(skb); + net_warn_ratelimited("%s(): Dropping skb.\n", __func__); + return NETDEV_TX_OK; +} + +static const struct net_device_ops blackhole_netdev_ops = { + .ndo_start_xmit = blackhole_netdev_xmit, +}; + +/* This is a dst-dummy device used specifically for invalidated + * DSTs and unlike loopback, this is not per-ns. + */ +static void blackhole_netdev_setup(struct net_device *dev) +{ + gen_lo_setup(dev, ETH_MIN_MTU, NULL, NULL, &blackhole_netdev_ops, NULL); +} + +/* Setup and register the blackhole_netdev. */ +static int __init blackhole_netdev_init(void) +{ + blackhole_netdev = alloc_netdev(0, "blackhole_dev", NET_NAME_UNKNOWN, + blackhole_netdev_setup); + if (!blackhole_netdev) + return -ENOMEM; + + dev_init_scheduler(blackhole_netdev); + dev_activate(blackhole_netdev); + + blackhole_netdev->flags |= IFF_UP | IFF_RUNNING; + dev_net_set(blackhole_netdev, &init_net); + + return 0; +} + +device_initcall(blackhole_netdev_init); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index eeacebd7debb..88292953aa6f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4870,4 +4870,6 @@ do { \ #define PTYPE_HASH_SIZE(16) #define PTYPE_HASH_MASK(PTYPE_HASH_SIZE - 1) +extern struct net_device *blackhole_netdev; + #endif /* _LINUX_NETDEVICE_H */ -- 2.22.0.410.gd8fdbe21b5-goog
[PATCH next 0/3] blackhole device to invalidate dst
When we invalidate dst or mark it "dead", we assign 'lo' to dst->dev. First of all this assignment is racy and more over, it has MTU implications. The standard dev MTU is 1500 while the Loopback MTU is 64k. TCP code when dereferencing the dst don't check if the dst is valid or not. TCP when dereferencing a dead-dst while negotiating a new connection, may use dst device which is 'lo' instead of using the correct device. Consider the following scenario: A SYN arrives on an interface and tcp-layer while processing SYNACK finds a dst and associates it with SYNACK skb. Now before skb gets passed to L3 for processing, if that dst gets "dead" (because of the virtual device getting disappeared & then reappeared), the 'lo' gets assigned to that dst (lo MTU = 64k). Let's assume the SYN has ADV_MSS set as 9k while the output device through which this SYNACK is going to go out has standard MTU of 1500. The MTU check during the route check passes since MIN(9K, 64K) is 9k and TCP successfully negotiates 9k MSS. The subsequent data packet; bigger in size gets passed to the device and it won't be marked as GSO since the assumed MTU of the device is 9k. This either crashes the NIC and we have seen fixes that went into drivers to handle this scenario. 8914a595110a ('bnx2x: disable GSO where gso_size is too big for hardware') and 2b16f048729b ('net: create skb_gso_validate_mac_len()') and with those fixes TCP eventually recovers but not before few dropped segments. Well, I'm not a TCP expert and though we have experienced these corner cases in our environment, I could not reproduce this case reliably in my test setup to try this fix myself. However, Michael Chan had a setup where these fixes helped him mitigate the issue and not cause the crash. The idea here is to not alter the data-path with additional locks or smb()/rmb() barriers to avoid racy assignments but to create a new device that has really low MTU that has .ndo_start_xmit essentially a kfree_skb(). Make use of this device instead of 'lo' when marking the dst dead. First patch implements the blackhole device and second patch uses it in IPv4 and IPv6 stack while the third patch is the self test that ensures the sanity of this device. Mahesh Bandewar (3): loopback: create blackhole net device similar to loopack. blackhole_netdev: use blackhole_netdev to invalidate dst entries blackhole_dev: add a selftest drivers/net/loopback.c| 76 +++-- include/linux/netdevice.h | 2 + lib/Kconfig.debug | 9 ++ lib/Makefile | 1 + lib/test_blackhole_dev.c | 100 ++ net/core/dst.c| 2 +- net/ipv4/route.c | 3 +- net/ipv6/route.c | 2 +- tools/testing/selftests/net/Makefile | 3 +- tools/testing/selftests/net/config| 1 + .../selftests/net/test_blackhole_dev.sh | 11 ++ 11 files changed, 196 insertions(+), 14 deletions(-) create mode 100644 lib/test_blackhole_dev.c create mode 100755 tools/testing/selftests/net/test_blackhole_dev.sh -- 2.22.0.410.gd8fdbe21b5-goog
[PATCH iproute2] ip6tunnel: fix 'ip -6 {show|change} dev ' cmds
Inclusion of 'dev' is allowed by the syntax but not handled correctly by the command. It produces no output for show command and falsely successful for change command but does not make any changes. can be verified with the following steps # ip -6 tunnel add ip6tnl1 mode ip6gre local fd::1 remote fd::2 tos inherit ttl 127 encaplimit none # ip -6 tunnel show ip6tnl1 # ip -6 tunnel show dev ip6tnl1 # ip -6 tunnel change dev ip6tnl1 local 2001:1234::1 remote 2001:1234::2 encaplimit none ttl 127 tos inherit allow-localremote # echo $? 0 # ip -6 tunnel show ip6tnl1 Signed-off-by: Mahesh Bandewar --- ip/ip6tunnel.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ip/ip6tunnel.c b/ip/ip6tunnel.c index 999408ed801b..56fd3466ed06 100644 --- a/ip/ip6tunnel.c +++ b/ip/ip6tunnel.c @@ -298,6 +298,8 @@ static int parse_args(int argc, char **argv, int cmd, struct ip6_tnl_parm2 *p) p->link = ll_name_to_index(medium); if (!p->link) return nodev(medium); + else + strlcpy(p->name, medium, sizeof(p->name)); } return 0; } -- 2.22.0.rc1.311.g5d7573a151-goog
[PATCH net] bonding: fix warning message
From: Mahesh Bandewar RX queue config for bonding master could be different from its slave device(s). With the commit 6a9e461f6fe4 ("bonding: pass link-local packets to bonding master also."), the packet is reinjected into stack with skb->dev as bonding master. This potentially triggers the message: "bondX received packet on queue Y, but number of RX queues is Z" whenever the queue that packet is received on is higher than the numrxqueues on bonding master (Y > Z). Fixes: 6a9e461f6fe4 ("bonding: pass link-local packets to bonding master also.") Reported-by: John Sperbeck Signed-off-by: Eric Dumazet Signed-off-by: Mahesh Bandewar --- drivers/net/bonding/bond_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index c05c01a00755..ee28ec9e0aba 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1187,6 +1187,7 @@ static rx_handler_result_t bond_handle_frame(struct sk_buff **pskb) if (nskb) { nskb->dev = bond->dev; + nskb->queue_mapping = 0; netif_rx(nskb); } return RX_HANDLER_PASS; -- 2.19.0.605.g01d371f741-goog
[PATCH net] bonding: avoid possible dead-lock
From: Mahesh Bandewar Syzkaller reported this on a slightly older kernel but it's still applicable to the current kernel - == WARNING: possible circular locking dependency detected 4.18.0-next-20180823+ #46 Not tainted -- syz-executor4/26841 is trying to acquire lock: dd41ef48 ((wq_completion)bond_dev->name){+.+.}, at: flush_workqueue+0x2db/0x1e10 kernel/workqueue.c:2652 but task is already holding lock: 768ab431 (rtnl_mutex){+.+.}, at: rtnl_lock net/core/rtnetlink.c:77 [inline] 768ab431 (rtnl_mutex){+.+.}, at: rtnetlink_rcv_msg+0x412/0xc30 net/core/rtnetlink.c:4708 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (rtnl_mutex){+.+.}: __mutex_lock_common kernel/locking/mutex.c:925 [inline] __mutex_lock+0x171/0x1700 kernel/locking/mutex.c:1073 mutex_lock_nested+0x16/0x20 kernel/locking/mutex.c:1088 rtnl_lock+0x17/0x20 net/core/rtnetlink.c:77 bond_netdev_notify drivers/net/bonding/bond_main.c:1310 [inline] bond_netdev_notify_work+0x44/0xd0 drivers/net/bonding/bond_main.c:1320 process_one_work+0xc73/0x1aa0 kernel/workqueue.c:2153 worker_thread+0x189/0x13c0 kernel/workqueue.c:2296 kthread+0x35a/0x420 kernel/kthread.c:246 ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:415 -> #1 ((work_completion)(&(&nnw->work)->work)){+.+.}: process_one_work+0xc0b/0x1aa0 kernel/workqueue.c:2129 worker_thread+0x189/0x13c0 kernel/workqueue.c:2296 kthread+0x35a/0x420 kernel/kthread.c:246 ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:415 -> #0 ((wq_completion)bond_dev->name){+.+.}: lock_acquire+0x1e4/0x4f0 kernel/locking/lockdep.c:3901 flush_workqueue+0x30a/0x1e10 kernel/workqueue.c:2655 drain_workqueue+0x2a9/0x640 kernel/workqueue.c:2820 destroy_workqueue+0xc6/0x9d0 kernel/workqueue.c:4155 __alloc_workqueue_key+0xef9/0x1190 kernel/workqueue.c:4138 bond_init+0x269/0x940 drivers/net/bonding/bond_main.c:4734 register_netdevice+0x337/0x1100 net/core/dev.c:8410 bond_newlink+0x49/0xa0 drivers/net/bonding/bond_netlink.c:453 rtnl_newlink+0xef4/0x1d50 net/core/rtnetlink.c:3099 rtnetlink_rcv_msg+0x46e/0xc30 net/core/rtnetlink.c:4711 netlink_rcv_skb+0x172/0x440 net/netlink/af_netlink.c:2454 rtnetlink_rcv+0x1c/0x20 net/core/rtnetlink.c:4729 netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline] netlink_unicast+0x5a0/0x760 net/netlink/af_netlink.c:1343 netlink_sendmsg+0xa18/0xfc0 net/netlink/af_netlink.c:1908 sock_sendmsg_nosec net/socket.c:622 [inline] sock_sendmsg+0xd5/0x120 net/socket.c:632 ___sys_sendmsg+0x7fd/0x930 net/socket.c:2115 __sys_sendmsg+0x11d/0x290 net/socket.c:2153 __do_sys_sendmsg net/socket.c:2162 [inline] __se_sys_sendmsg net/socket.c:2160 [inline] __x64_sys_sendmsg+0x78/0xb0 net/socket.c:2160 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe other info that might help us debug this: Chain exists of: (wq_completion)bond_dev->name --> (work_completion)(&(&nnw->work)->work) --> rtnl_mutex Possible unsafe locking scenario: CPU0CPU1 lock(rtnl_mutex); lock((work_completion)(&(&nnw->work)->work)); lock(rtnl_mutex); lock((wq_completion)bond_dev->name); *** DEADLOCK *** 1 lock held by syz-executor4/26841: stack backtrace: CPU: 1 PID: 26841 Comm: syz-executor4 Not tainted 4.18.0-next-20180823+ #46 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1c9/0x2b4 lib/dump_stack.c:113 print_circular_bug.isra.34.cold.55+0x1bd/0x27d kernel/locking/lockdep.c:1222 check_prev_add kernel/locking/lockdep.c:1862 [inline] check_prevs_add kernel/locking/lockdep.c:1975 [inline] validate_chain kernel/locking/lockdep.c:2416 [inline] __lock_acquire+0x3449/0x5020 kernel/locking/lockdep.c:3412 lock_acquire+0x1e4/0x4f0 kernel/locking/lockdep.c:3901 flush_workqueue+0x30a/0x1e10 kernel/workqueue.c:2655 drain_workqueue+0x2a9/0x640 kernel/workqueue.c:2820 destroy_workqueue+0xc6/0x9d0 kernel/workqueue.c:4155 __alloc_workqueue_key+0xef9/0x1190 kernel/workqueue.c:4138 bond_init+0x269/0x940 drivers/net/bonding/bond_main.c:4734 register_netdevice+0x337/0x1100 net/core/dev.c:8410 bond_newlink+0x49/0xa0 drivers/net/bonding/bond_netlink.c:453 rtnl_newlink+0xef4/0x1d50 net/core/rtnetlink.c:3099 rtnetlink_rcv_msg+0x46e/0xc30 net/core/rtnetlink.c:4711 netlink_rcv_skb+0x172/0x440 net/netlink/af_netlink.c:2454 rtnetlink_r
[PATCH net] bonding: pass link-local packets to bonding master also.
From: Mahesh Bandewar Commit b89f04c61efe ("bonding: deliver link-local packets with skb->dev set to link that packets arrived on") changed the behavior of how link-local-multicast packets are processed. The change in the behavior broke some legacy use cases where these packets are expected to arrive on bonding master device also. This patch passes the packet to the stack with the link it arrived on as well as passes to the bonding-master device to preserve the legacy use case. Fixes: b89f04c61efe ("bonding: deliver link-local packets with skb->dev set to link that packets arrived on") Reported-by: Michal Soltys Signed-off-by: Mahesh Bandewar --- drivers/net/bonding/bond_main.c | 21 +++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 9866fa959a19..e3157aed656a 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1178,9 +1178,26 @@ static rx_handler_result_t bond_handle_frame(struct sk_buff **pskb) } } - /* don't change skb->dev for link-local packets */ - if (is_link_local_ether_addr(eth_hdr(skb)->h_dest)) + /* Link-local multicast packets should be passed to the +* stack on the link they arrive as well as pass them to the +* bond-master device. These packets are mostly usable when +* stack receives it with the link on which they arrive +* (e.g. LLDP) they also must be available on master. Some of +* the use cases include (but are not limited to): LLDP agents +* that must be able to operate both on enslaved interfaces as +* well as on bonds themselves; linux bridges that must be able +* to process/pass BPDUs from attached bonds when any kind of +* STP version is enabled on the network. +*/ + if (is_link_local_ether_addr(eth_hdr(skb)->h_dest)) { + struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); + + if (nskb) { + nskb->dev = bond->dev; + netif_rx(nskb); + } return RX_HANDLER_PASS; + } if (bond_should_deliver_exact_match(skb, slave, bond)) return RX_HANDLER_EXACT; -- 2.19.0.444.g18242da7ef-goog
[PATCH iproute2] iproute2: fix use-after-free
From: Mahesh Bandewar A local program using iproute2 lib pointed out the issue and looking at the code it is pretty obvious - a = (struct nlmsghdr *)b; ... free(b); if (a->nlmsg_seq == seq) ... Fixes: 86bf43c7c2fd ("lib/libnetlink: update rtnl_talk to support malloc buff at run time") Signed-off-by: Mahesh Bandewar --- lib/libnetlink.c | 15 +++ 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/lib/libnetlink.c b/lib/libnetlink.c index 928de1dd16d8..016a5f0bcfb6 100644 --- a/lib/libnetlink.c +++ b/lib/libnetlink.c @@ -661,17 +661,24 @@ next: if (l < sizeof(struct nlmsgerr)) { fprintf(stderr, "ERROR truncated\n"); } else if (!err->error) { + unsigned int tmp_seq; + /* check messages from kernel */ nl_dump_ext_ack(h, errfn); - if (answer) + tmp_seq = h->nlmsg_seq; + if (answer) { *answer = (struct nlmsghdr *)buf; - else + } else { free(buf); - if (h->nlmsg_seq == seq) + buf = NULL; + } + if (tmp_seq == seq) { return 0; - else if (i < iovlen) + } else if (i < iovlen) { + free(buf); goto next; + } return 0; } -- 2.19.0.rc2.392.g5ba43deb5a-goog
[PATCHv3 iproute2 0/2] clang + misc changes
From: Mahesh Bandewar The primary theme is to make clang compile the iproute2 package without warnings. Along with this there are two other misc patches in the series. First patch uses the preferred_family when operating with maddr feature. Prior to this patch, it would always open an AF_INET socket irrespective of the family that is preferred via command-line. Second patch mostly adds format attributes to make the c-lang compiler happy and not throw the warning messages. Mahesh Bandewar (2): ipmaddr: use preferred_family when given iproute: make clang happy with iproute2 package include/json_writer.h | 3 +-- ip/iplink_can.c | 19 --- ip/ipmaddr.c | 13 - lib/color.c | 1 + lib/json_print.c | 1 + lib/json_writer.c | 15 +-- misc/ss.c | 3 ++- tc/m_ematch.c | 1 + tc/m_ematch.h | 1 + 9 files changed, 32 insertions(+), 25 deletions(-) -- 2.18.0.1017.ga543ac7ca45-goog
[PATCHv3 iproute2 1/2] ipmaddr: use preferred_family when given
From: Mahesh Bandewar When creating socket() AF_INET is used irrespective of the family that is given at the command-line (with -4, -6, or -0). This change will open the socket with the preferred family. Signed-off-by: Mahesh Bandewar --- ip/ipmaddr.c | 13 - 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/ip/ipmaddr.c b/ip/ipmaddr.c index a48499029e17..abf83784d0df 100644 --- a/ip/ipmaddr.c +++ b/ip/ipmaddr.c @@ -289,6 +289,7 @@ static int multiaddr_list(int argc, char **argv) static int multiaddr_modify(int cmd, int argc, char **argv) { struct ifreq ifr = {}; + int family; int fd; if (cmd == RTM_NEWADDR) @@ -324,7 +325,17 @@ static int multiaddr_modify(int cmd, int argc, char **argv) exit(-1); } - fd = socket(AF_INET, SOCK_DGRAM, 0); + switch (preferred_family) { + case AF_INET6: + case AF_PACKET: + case AF_INET: + family = preferred_family; + break; + default: + family = AF_INET; + } + + fd = socket(family, SOCK_DGRAM, 0); if (fd < 0) { perror("Cannot create socket"); exit(1); -- 2.18.0.1017.ga543ac7ca45-goog
[PATCHv3 iproute2 2/2] iproute: make clang happy
From: Mahesh Bandewar These are primarily fixes for "string is not string literal" warnings / errors (with -Werror -Wformat-nonliteral). This should be a no-op change. I had to replace couple of print helper functions with the code they call as it was becoming harder to eliminate these warnings, however these helpers were used only at couple of places, so no major change as such. Signed-off-by: Mahesh Bandewar --- include/json_writer.h | 3 +-- ip/iplink_can.c | 19 --- lib/color.c | 1 + lib/json_print.c | 1 + lib/json_writer.c | 15 +-- misc/ss.c | 3 ++- tc/m_ematch.c | 1 + tc/m_ematch.h | 1 + 8 files changed, 20 insertions(+), 24 deletions(-) diff --git a/include/json_writer.h b/include/json_writer.h index 9ab88e1dbdd9..0c8831c1136d 100644 --- a/include/json_writer.h +++ b/include/json_writer.h @@ -29,6 +29,7 @@ void jsonw_pretty(json_writer_t *self, bool on); void jsonw_name(json_writer_t *self, const char *name); /* Add value */ +__attribute__((format(printf, 2, 3))) void jsonw_printf(json_writer_t *self, const char *fmt, ...); void jsonw_string(json_writer_t *self, const char *value); void jsonw_bool(json_writer_t *self, bool value); @@ -59,8 +60,6 @@ void jsonw_luint_field(json_writer_t *self, const char *prop, unsigned long int num); void jsonw_lluint_field(json_writer_t *self, const char *prop, unsigned long long int num); -void jsonw_float_field_fmt(json_writer_t *self, const char *prop, - const char *fmt, double val); /* Collections */ void jsonw_start_object(json_writer_t *self); diff --git a/ip/iplink_can.c b/ip/iplink_can.c index 587413da15c4..c0deeb1f1fcf 100644 --- a/ip/iplink_can.c +++ b/ip/iplink_can.c @@ -316,11 +316,14 @@ static void can_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[]) struct can_bittiming *bt = RTA_DATA(tb[IFLA_CAN_BITTIMING]); if (is_json_context()) { + json_writer_t *jw; + open_json_object("bittiming"); print_int(PRINT_ANY, "bitrate", NULL, bt->bitrate); - jsonw_float_field_fmt(get_json_writer(), - "sample_point", "%.3f", - (float) bt->sample_point / 1000.); + jw = get_json_writer(); + jsonw_name(jw, "sample_point"); + jsonw_printf(jw, "%.3f", +(float) bt->sample_point / 1000); print_int(PRINT_ANY, "tq", NULL, bt->tq); print_int(PRINT_ANY, "prop_seg", NULL, bt->prop_seg); print_int(PRINT_ANY, "phase_seg1", @@ -415,12 +418,14 @@ static void can_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[]) RTA_DATA(tb[IFLA_CAN_DATA_BITTIMING]); if (is_json_context()) { + json_writer_t *jw; + open_json_object("data_bittiming"); print_int(PRINT_JSON, "bitrate", NULL, dbt->bitrate); - jsonw_float_field_fmt(get_json_writer(), - "sample_point", - "%.3f", - (float) dbt->sample_point / 1000.); + jw = get_json_writer(); + jsonw_name(jw, "sample_point"); + jsonw_printf(jw, "%.3f", +(float) dbt->sample_point / 1000.); print_int(PRINT_JSON, "tq", NULL, dbt->tq); print_int(PRINT_JSON, "prop_seg", NULL, dbt->prop_seg); print_int(PRINT_JSON, "phase_seg1", diff --git a/lib/color.c b/lib/color.c index eaf69e74d673..e5406294dfc4 100644 --- a/lib/color.c +++ b/lib/color.c @@ -132,6 +132,7 @@ void set_color_palette(void) is_dark_bg = 1; } +__attribute__((format(printf, 3, 4))) int color_fprintf(FILE *fp, enum color_attr attr, const char *fmt, ...) { int ret = 0; diff --git a/lib/json_print.c b/lib/json_print.c index 5dc41bfabfd4..77902824a738 100644 --- a/lib/json_print.c +++ b/lib/json_print.c @@ -100,6 +100,7 @@ void close_json_array(enum output_type type, const char *str) * functions handling different types */ #define _PRINT_FUNC(type_name, type) \ + __attribute__((format(printf, 4, 0))) \ void print_color_##type_name(enum output_type t,
[PATCHv2 iproute2 3/3] iproute: make clang happy with iproute2 package
From: Mahesh Bandewar These are primarily fixes for "string is not string literal" warnings / errors (with -Werror -Wformat-nonliteral). This should be a no-op change. I had to replace couple of print helper functions with the code they call as it was becoming harder to eliminate these warnings, however these helpers were used only at couple of places, so no major change as such. Signed-off-by: Mahesh Bandewar --- include/json_writer.h | 3 +-- ip/iplink_can.c | 19 --- lib/color.c | 1 + lib/json_print.c | 1 + lib/json_writer.c | 15 +-- misc/ss.c | 3 ++- tc/m_ematch.c | 1 + tc/m_ematch.h | 1 + 8 files changed, 20 insertions(+), 24 deletions(-) diff --git a/include/json_writer.h b/include/json_writer.h index 9ab88e1dbdd9..0c8831c1136d 100644 --- a/include/json_writer.h +++ b/include/json_writer.h @@ -29,6 +29,7 @@ void jsonw_pretty(json_writer_t *self, bool on); void jsonw_name(json_writer_t *self, const char *name); /* Add value */ +__attribute__((format(printf, 2, 3))) void jsonw_printf(json_writer_t *self, const char *fmt, ...); void jsonw_string(json_writer_t *self, const char *value); void jsonw_bool(json_writer_t *self, bool value); @@ -59,8 +60,6 @@ void jsonw_luint_field(json_writer_t *self, const char *prop, unsigned long int num); void jsonw_lluint_field(json_writer_t *self, const char *prop, unsigned long long int num); -void jsonw_float_field_fmt(json_writer_t *self, const char *prop, - const char *fmt, double val); /* Collections */ void jsonw_start_object(json_writer_t *self); diff --git a/ip/iplink_can.c b/ip/iplink_can.c index 587413da15c4..c0deeb1f1fcf 100644 --- a/ip/iplink_can.c +++ b/ip/iplink_can.c @@ -316,11 +316,14 @@ static void can_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[]) struct can_bittiming *bt = RTA_DATA(tb[IFLA_CAN_BITTIMING]); if (is_json_context()) { + json_writer_t *jw; + open_json_object("bittiming"); print_int(PRINT_ANY, "bitrate", NULL, bt->bitrate); - jsonw_float_field_fmt(get_json_writer(), - "sample_point", "%.3f", - (float) bt->sample_point / 1000.); + jw = get_json_writer(); + jsonw_name(jw, "sample_point"); + jsonw_printf(jw, "%.3f", +(float) bt->sample_point / 1000); print_int(PRINT_ANY, "tq", NULL, bt->tq); print_int(PRINT_ANY, "prop_seg", NULL, bt->prop_seg); print_int(PRINT_ANY, "phase_seg1", @@ -415,12 +418,14 @@ static void can_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[]) RTA_DATA(tb[IFLA_CAN_DATA_BITTIMING]); if (is_json_context()) { + json_writer_t *jw; + open_json_object("data_bittiming"); print_int(PRINT_JSON, "bitrate", NULL, dbt->bitrate); - jsonw_float_field_fmt(get_json_writer(), - "sample_point", - "%.3f", - (float) dbt->sample_point / 1000.); + jw = get_json_writer(); + jsonw_name(jw, "sample_point"); + jsonw_printf(jw, "%.3f", +(float) dbt->sample_point / 1000.); print_int(PRINT_JSON, "tq", NULL, dbt->tq); print_int(PRINT_JSON, "prop_seg", NULL, dbt->prop_seg); print_int(PRINT_JSON, "phase_seg1", diff --git a/lib/color.c b/lib/color.c index eaf69e74d673..e5406294dfc4 100644 --- a/lib/color.c +++ b/lib/color.c @@ -132,6 +132,7 @@ void set_color_palette(void) is_dark_bg = 1; } +__attribute__((format(printf, 3, 4))) int color_fprintf(FILE *fp, enum color_attr attr, const char *fmt, ...) { int ret = 0; diff --git a/lib/json_print.c b/lib/json_print.c index 5dc41bfabfd4..77902824a738 100644 --- a/lib/json_print.c +++ b/lib/json_print.c @@ -100,6 +100,7 @@ void close_json_array(enum output_type type, const char *str) * functions handling different types */ #define _PRINT_FUNC(type_name, type) \ + __attribute__((format(printf, 4, 0))) \ void print_color_##type_name(enum output_type t,
[PATCHv2 iproute2 0/3] clang + misc changes
From: Mahesh Bandewar The primary theme is to make clang compile the iproute2 package without warnings. Along with this there are two other misc patches in the series. First patch uses the preferred_family when operating with maddr feature. Prior to this patch, it would always open an AF_INET socket irrespective of the family that is preferred via command-line. Second patch just removes extern from the prototype declarations from the m_ematch.h header file. Third patch mostly adds format attributes to make the c-lang compiler happy and not throw the warning messages. Mahesh Bandewar (3): ipmaddr: use preferred_family when given tc: remove extern from prototype declarations iproute: make clang happy with iproute2 package include/json_writer.h | 3 +-- ip/iplink_can.c | 19 --- ip/ipmaddr.c | 13 - lib/color.c | 1 + lib/json_print.c | 1 + lib/json_writer.c | 15 +-- misc/ss.c | 3 ++- tc/m_ematch.c | 1 + tc/m_ematch.h | 15 --- 9 files changed, 39 insertions(+), 32 deletions(-) -- 2.18.0.865.gffc8e1a3cd6-goog
[PATCHv2 iproute2 2/3] tc: remove extern from prototype declarations
From: Mahesh Bandewar Signed-off-by: Mahesh Bandewar --- tc/m_ematch.h | 14 +++--- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tc/m_ematch.h b/tc/m_ematch.h index f634f19164fa..80b02cfad6cc 100644 --- a/tc/m_ematch.h +++ b/tc/m_ematch.h @@ -20,7 +20,7 @@ struct bstr struct bstr *next; }; -extern struct bstr * bstr_alloc(const char *text); +struct bstr * bstr_alloc(const char *text); static inline struct bstr * bstr_new(char *data, unsigned int len) { @@ -51,8 +51,8 @@ static inline struct bstr *bstr_next(struct bstr *b) return b->next; } -extern unsigned long bstrtoul(const struct bstr *b); -extern void bstr_print(FILE *fd, const struct bstr *b, int ascii); +unsigned long bstrtoul(const struct bstr *b); +void bstr_print(FILE *fd, const struct bstr *b, int ascii); struct ematch @@ -79,7 +79,7 @@ static inline struct ematch * new_ematch(struct bstr *args, int inverted) return e; } -extern void print_ematch_tree(const struct ematch *tree); +void print_ematch_tree(const struct ematch *tree); struct ematch_util @@ -107,9 +107,9 @@ static inline int parse_layer(struct bstr *b) return INT_MAX; } -extern int em_parse_error(int err, struct bstr *args, struct bstr *carg, +int em_parse_error(int err, struct bstr *args, struct bstr *carg, struct ematch_util *, char *fmt, ...); -extern int print_ematch(FILE *, const struct rtattr *); -extern int parse_ematch(int *, char ***, int, struct nlmsghdr *); +int print_ematch(FILE *, const struct rtattr *); +int parse_ematch(int *, char ***, int, struct nlmsghdr *); #endif -- 2.18.0.865.gffc8e1a3cd6-goog
[PATCHv2 iproute2 1/3] ipmaddr: use preferred_family when given
From: Mahesh Bandewar When creating socket() AF_INET is used irrespective of the family that is given at the command-line (with -4, -6, or -0). This change will open the socket with the preferred family. Signed-off-by: Mahesh Bandewar --- ip/ipmaddr.c | 13 - 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/ip/ipmaddr.c b/ip/ipmaddr.c index a48499029e17..abf83784d0df 100644 --- a/ip/ipmaddr.c +++ b/ip/ipmaddr.c @@ -289,6 +289,7 @@ static int multiaddr_list(int argc, char **argv) static int multiaddr_modify(int cmd, int argc, char **argv) { struct ifreq ifr = {}; + int family; int fd; if (cmd == RTM_NEWADDR) @@ -324,7 +325,17 @@ static int multiaddr_modify(int cmd, int argc, char **argv) exit(-1); } - fd = socket(AF_INET, SOCK_DGRAM, 0); + switch (preferred_family) { + case AF_INET6: + case AF_PACKET: + case AF_INET: + family = preferred_family; + break; + default: + family = AF_INET; + } + + fd = socket(family, SOCK_DGRAM, 0); if (fd < 0) { perror("Cannot create socket"); exit(1); -- 2.18.0.865.gffc8e1a3cd6-goog
[PATCH iproute2] iproute: make clang happy
From: Mahesh Bandewar These are primarily fixes for "string is not string literal" warnings / errors (with -Werror -Wformat-nonliteral). This should be a no-op change. I had to replace couple of print helper functions with the code they call as it was becoming harder to eliminate these warnings, however these helpers were used only at couple of places, so no major change as such. Signed-off-by: Mahesh Bandewar --- include/json_writer.h | 2 -- ip/iplink_can.c | 19 --- lib/color.c | 1 + lib/json_print.c | 1 + lib/json_writer.c | 15 +-- misc/ss.c | 3 ++- tc/m_ematch.c | 1 + 7 files changed, 18 insertions(+), 24 deletions(-) diff --git a/include/json_writer.h b/include/json_writer.h index 9ab88e1dbdd9..a111cc62e7b2 100644 --- a/include/json_writer.h +++ b/include/json_writer.h @@ -59,8 +59,6 @@ void jsonw_luint_field(json_writer_t *self, const char *prop, unsigned long int num); void jsonw_lluint_field(json_writer_t *self, const char *prop, unsigned long long int num); -void jsonw_float_field_fmt(json_writer_t *self, const char *prop, - const char *fmt, double val); /* Collections */ void jsonw_start_object(json_writer_t *self); diff --git a/ip/iplink_can.c b/ip/iplink_can.c index 587413da15c4..c0deeb1f1fcf 100644 --- a/ip/iplink_can.c +++ b/ip/iplink_can.c @@ -316,11 +316,14 @@ static void can_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[]) struct can_bittiming *bt = RTA_DATA(tb[IFLA_CAN_BITTIMING]); if (is_json_context()) { + json_writer_t *jw; + open_json_object("bittiming"); print_int(PRINT_ANY, "bitrate", NULL, bt->bitrate); - jsonw_float_field_fmt(get_json_writer(), - "sample_point", "%.3f", - (float) bt->sample_point / 1000.); + jw = get_json_writer(); + jsonw_name(jw, "sample_point"); + jsonw_printf(jw, "%.3f", +(float) bt->sample_point / 1000); print_int(PRINT_ANY, "tq", NULL, bt->tq); print_int(PRINT_ANY, "prop_seg", NULL, bt->prop_seg); print_int(PRINT_ANY, "phase_seg1", @@ -415,12 +418,14 @@ static void can_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[]) RTA_DATA(tb[IFLA_CAN_DATA_BITTIMING]); if (is_json_context()) { + json_writer_t *jw; + open_json_object("data_bittiming"); print_int(PRINT_JSON, "bitrate", NULL, dbt->bitrate); - jsonw_float_field_fmt(get_json_writer(), - "sample_point", - "%.3f", - (float) dbt->sample_point / 1000.); + jw = get_json_writer(); + jsonw_name(jw, "sample_point"); + jsonw_printf(jw, "%.3f", +(float) dbt->sample_point / 1000.); print_int(PRINT_JSON, "tq", NULL, dbt->tq); print_int(PRINT_JSON, "prop_seg", NULL, dbt->prop_seg); print_int(PRINT_JSON, "phase_seg1", diff --git a/lib/color.c b/lib/color.c index eaf69e74d673..e5406294dfc4 100644 --- a/lib/color.c +++ b/lib/color.c @@ -132,6 +132,7 @@ void set_color_palette(void) is_dark_bg = 1; } +__attribute__((format(printf, 3, 4))) int color_fprintf(FILE *fp, enum color_attr attr, const char *fmt, ...) { int ret = 0; diff --git a/lib/json_print.c b/lib/json_print.c index 5dc41bfabfd4..77902824a738 100644 --- a/lib/json_print.c +++ b/lib/json_print.c @@ -100,6 +100,7 @@ void close_json_array(enum output_type type, const char *str) * functions handling different types */ #define _PRINT_FUNC(type_name, type) \ + __attribute__((format(printf, 4, 0))) \ void print_color_##type_name(enum output_type t,\ enum color_attr color, \ const char *key, \ diff --git a/lib/json_writer.c b/lib/json_writer.c index aa9ce1c65e51..68890b34ee92 100644 --- a/lib/json_writer.c +++ b/lib/json_writer.c @@ -152,6 +152,7 @@ void jsonw_name(json_writer_t *self, const char *name)
[PATCH iproute2] ipmaddr: use preferred_family when given
From: Mahesh Bandewar When creating socket() AF_INET is used irrespective of the family that is given at the command-line (with -4, -6, or -0). This change will open the socket with the preferred family. Signed-off-by: Mahesh Bandewar --- ip/ipmaddr.c | 13 - 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/ip/ipmaddr.c b/ip/ipmaddr.c index a48499029e17..abf83784d0df 100644 --- a/ip/ipmaddr.c +++ b/ip/ipmaddr.c @@ -289,6 +289,7 @@ static int multiaddr_list(int argc, char **argv) static int multiaddr_modify(int cmd, int argc, char **argv) { struct ifreq ifr = {}; + int family; int fd; if (cmd == RTM_NEWADDR) @@ -324,7 +325,17 @@ static int multiaddr_modify(int cmd, int argc, char **argv) exit(-1); } - fd = socket(AF_INET, SOCK_DGRAM, 0); + switch (preferred_family) { + case AF_INET6: + case AF_PACKET: + case AF_INET: + family = preferred_family; + break; + default: + family = AF_INET; + } + + fd = socket(family, SOCK_DGRAM, 0); if (fd < 0) { perror("Cannot create socket"); exit(1); -- 2.18.0.865.gffc8e1a3cd6-goog
[PATCH net v2] bonding: pass link-local packets to bonding master also.
From: Mahesh Bandewar Commit b89f04c61efe ("bonding: deliver link-local packets with skb->dev set to link that packets arrived on") changed the behavior of how link-local-multicast packets are processed. The change in the behavior broke some legacy use cases where these packets are expected to arrive on bonding master device also. This patch passes the packet to the stack with the link it arrived on as well as passes to the bonding-master device to preserve the legacy use case. Fixes: b89f04c61efe ("bonding: deliver link-local packets with skb->dev set to link that packets arrived on") Reported-by: Michal Soltys Signed-off-by: Mahesh Bandewar --- v2: Added Fixes tag. v1: Initial patch. drivers/net/bonding/bond_main.c | 17 +++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 9a2ea3c1f949..1d3b7d8448f2 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1177,9 +1177,22 @@ static rx_handler_result_t bond_handle_frame(struct sk_buff **pskb) } } - /* don't change skb->dev for link-local packets */ - if (is_link_local_ether_addr(eth_hdr(skb)->h_dest)) + /* Link-local multicast packets should be passed to the +* stack on the link they arrive as well as pass them to the +* bond-master device. These packets are mostly usable when +* stack receives it with the link on which they arrive +* (e.g. LLDP) but there may be some legacy behavior that +* expects these packets to appear on bonding master too. +*/ + if (is_link_local_ether_addr(eth_hdr(skb)->h_dest)) { + struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); + + if (nskb) { + nskb->dev = bond->dev; + netif_rx(nskb); + } return RX_HANDLER_PASS; + } if (bond_should_deliver_exact_match(skb, slave, bond)) return RX_HANDLER_EXACT; -- 2.18.0.203.gfac676dfb9-goog
[PATCH next v2] bonding: pass link-local packets to bonding master also.
From: Mahesh Bandewar Commit b89f04c61efe ("bonding: deliver link-local packets with skb->dev set to link that packets arrived on") changed the behavior of how link-local-multicast packets are processed. The change in the behavior broke some legacy use cases where these packets are expected to arrive on bonding master device also. This patch passes the packet to the stack with the link it arrived on as well as passes to the bonding-master device to preserve the legacy use case. Fixes: b89f04c61efe ("bonding: deliver link-local packets with skb->dev set to link that packets arrived on") Reported-by: Michal Soltys Signed-off-by: Mahesh Bandewar --- v2: Added Fixes tag. v1: Initial patch. drivers/net/bonding/bond_main.c | 17 +++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 9a2ea3c1f949..1d3b7d8448f2 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1177,9 +1177,22 @@ static rx_handler_result_t bond_handle_frame(struct sk_buff **pskb) } } - /* don't change skb->dev for link-local packets */ - if (is_link_local_ether_addr(eth_hdr(skb)->h_dest)) + /* Link-local multicast packets should be passed to the +* stack on the link they arrive as well as pass them to the +* bond-master device. These packets are mostly usable when +* stack receives it with the link on which they arrive +* (e.g. LLDP) but there may be some legacy behavior that +* expects these packets to appear on bonding master too. +*/ + if (is_link_local_ether_addr(eth_hdr(skb)->h_dest)) { + struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); + + if (nskb) { + nskb->dev = bond->dev; + netif_rx(nskb); + } return RX_HANDLER_PASS; + } if (bond_should_deliver_exact_match(skb, slave, bond)) return RX_HANDLER_EXACT; -- 2.18.0.203.gfac676dfb9-goog
[PATCH next] bonding: pass link-local packets to bonding master also.
From: Mahesh Bandewar Commit b89f04c61efe ("bonding: deliver link-local packets with skb->dev set to link that packets arrived on") changed the behavior of how link-local-multicast packets are processed. The change in the behavior broke some legacy use cases where these packets are expected to arrive on bonding master device also. This patch passes the packet to the stack with the link it arrived on as well as passes to the bonding-master device to preserve the legacy use case. Reported-by: Michal Soltys Signed-off-by: Mahesh Bandewar --- drivers/net/bonding/bond_main.c | 17 +++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 9a2ea3c1f949..1d3b7d8448f2 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1177,9 +1177,22 @@ static rx_handler_result_t bond_handle_frame(struct sk_buff **pskb) } } - /* don't change skb->dev for link-local packets */ - if (is_link_local_ether_addr(eth_hdr(skb)->h_dest)) + /* Link-local multicast packets should be passed to the +* stack on the link they arrive as well as pass them to the +* bond-master device. These packets are mostly usable when +* stack receives it with the link on which they arrive +* (e.g. LLDP) but there may be some legacy behavior that +* expects these packets to appear on bonding master too. +*/ + if (is_link_local_ether_addr(eth_hdr(skb)->h_dest)) { + struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); + + if (nskb) { + nskb->dev = bond->dev; + netif_rx(nskb); + } return RX_HANDLER_PASS; + } if (bond_should_deliver_exact_match(skb, slave, bond)) return RX_HANDLER_EXACT; -- 2.18.0.203.gfac676dfb9-goog
[PATCHv4 1/2] capability: introduce sysctl for controlled user-ns capability whitelist
From: Mahesh Bandewar Add a sysctl variable kernel.controlled_userns_caps_whitelist. Capability mask is stored in kernel as kernel_cap_t type (array of u32). This sysctl takes input as comma separated hex u32 words. For simplicity one could see this sysctl to operate on string inputs. However the value is not expected to change that often during the life of a kernel-boot. It makes more sense to use the widely available API instead of bringing another string manipulation for the purpose of making this simpler. The default value set (for kernel.controlled_userns_caps_whitelist) is CAP_FULL_SET indicating that no capability is controlled by default to maintain compatibility with the existing behavior of user-ns. Administrator will have to modify this sysctl to control any capability as such. e.g. to control CAP_NET_RAW the mask need to be changed like - # sysctl -q kernel.controlled_userns_caps_whitelist kernel.controlled_userns_caps_whitelist = 1f, # sysctl -w kernel.controlled_userns_caps_whitelist=1f,dfff kernel.controlled_userns_caps_whitelist = 1f,dfff For bit-to-mask conversion please check include/uapi/linux/capability.h file. Any capabilities that are not part of this mask will be controlled and will not be allowed to processes in controlled user-ns. In above example CAP_NET_RAW will not be available to controlled-user-namespaces. Acked-by: Serge Hallyn Signed-off-by: Mahesh Bandewar --- v4: commit message changes. v3: Added couple of comments as requested by Serge Hallyn v2: Rebase v1: Initial submission Documentation/sysctl/kernel.txt | 21 ++ include/linux/capability.h | 3 +++ kernel/capability.c | 47 + kernel/sysctl.c | 5 + 4 files changed, 76 insertions(+) diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 694968c7523c..6aa1e087afee 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -25,6 +25,7 @@ show up in /proc/sys/kernel: - bootloader_version[ X86 only ] - callhome [ S390 only ] - cap_last_cap +- controlled_userns_caps_whitelist - core_pattern - core_pipe_limit - core_uses_pid @@ -187,6 +188,26 @@ CAP_LAST_CAP from the kernel. == +controlled_userns_caps_whitelist + +Capability mask that is whitelisted for "controlled" user namespaces. +Any capability that is missing from this mask will not be allowed to +any process that is attached to a controlled-userns. e.g. if CAP_NET_RAW +is not part of this mask, then processes running inside any controlled +userns's will not be allowed to perform action that needs CAP_NET_RAW +capability. However, processes that are attached to a parent user-ns +hierarchy that is *not* controlled and has CAP_NET_RAW can continue +performing those actions. User-namespaces are marked "controlled" at +the time of their creation based on the capabilities of the creator. +A process that does not have CAP_SYS_ADMIN will create user-namespaces +that are controlled. + +The value is expressed as two comma separated hex words (u32). This +sysctl is available in init-ns and users with CAP_SYS_ADMIN in init-ns +are allowed to make changes. + +== + core_pattern: core_pattern is used to specify a core dumpfile pattern name. diff --git a/include/linux/capability.h b/include/linux/capability.h index f640dcbc880c..7d79a4689625 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -14,6 +14,7 @@ #define _LINUX_CAPABILITY_H #include +#include #define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3 @@ -248,6 +249,8 @@ extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns); /* audit system wants to get cap info from files as well */ extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); +int proc_douserns_caps_whitelist(struct ctl_table *table, int write, +void __user *buff, size_t *lenp, loff_t *ppos); extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size); diff --git a/kernel/capability.c b/kernel/capability.c index 1e1c0236f55b..4a859b7d4902 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -29,6 +29,8 @@ EXPORT_SYMBOL(__cap_empty_set); int file_caps_enabled = 1; +kernel_cap_t controlled_userns_caps_whitelist = CAP_FULL_SET; + static int __init file_caps_disable(char *str) { file_caps_enabled = 0; @@ -507,3 +509,48 @@ bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns) rcu_read_unlock(); return (ret == 0); } + +/* Controlled-userns capabilities routines */ +#ifdef CONFIG_SYSCTL +int proc_douserns_caps_whitelist(struct ct
[PATCHv4 0/2] capability controlled user-namespaces
From: Mahesh Bandewar TL;DR version - Creating a sandbox environment with namespaces is challenging considering what these sandboxed processes can engage into. e.g. CVE-2017-6074, CVE-2017-7184, CVE-2017-7308 etc. just to name few. Current form of user-namespaces, however, if changed a bit can allow us to create a sandbox environment without locking down user- namespaces. Detailed version Problem --- User-namespaces in the current form have increased the attack surface as any process can acquire capabilities which are not available to them (by default) by performing combination of clone()/unshare()/setns() syscalls. #define _GNU_SOURCE #include #include #include int main(int ac, char **av) { int sock = -1; printf("Attempting to open RAW socket before unshare()...\n"); sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW); if (sock < 0) { perror("socket() SOCK_RAW failed: "); } else { printf("Successfully opened RAW-Sock before unshare().\n"); close(sock); sock = -1; } if (unshare(CLONE_NEWUSER | CLONE_NEWNET) < 0) { perror("unshare() failed: "); return 1; } printf("Attempting to open RAW socket after unshare()...\n"); sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW); if (sock < 0) { perror("socket() SOCK_RAW failed: "); } else { printf("Successfully opened RAW-Sock after unshare().\n"); close(sock); sock = -1; } return 0; } The above example shows how easy it is to acquire NET_RAW capabilities and once acquired, these processes could take benefit of above mentioned or similar issues discovered/undiscovered with malicious intent. Note that this is just an example and the problem/solution is not limited to NET_RAW capability *only*. The easiest fix one can apply here is to lock-down user-namespaces which many of the distros do (i.e. don't allow users to create user namespaces), but unfortunately that prevents everyone from using them. Approach Introduce a notion of 'controlled' user-namespaces. Every process on the host is allowed to create user-namespaces (governed by the limit imposed by per-ns sysctl) however, mark user-namespaces created by sandboxed processes as 'controlled'. Use this 'mark' at the time of capability check in conjunction with a global capability whitelist. If the capability is not whitelisted, processes that belong to controlled user-namespaces will not be allowed. Processes that do not have CAP_SYS_ADMIN in init-ns can *only* create controlled user-namespaces. In other words, user-namespaces created by privileged processes (those which have CAP_SYS_ADMIN in init-ns) are not controlled. A hierarchy underneath any controlled user-ns is always controlled. A global whitelist is list of capabilities governed by a sysctl (kernel.controlled_userns_caps_whitelist) which is available to (privileged) user in init-ns to modify while it's applicable to all controlled user-namespaces on the host irrespective of when that user-ns was created. Marking user-namespaces controlled without modifying the whitelist is equivalent of the current behavior. The default value of whitelist includes all capabilities so that the compatibility is maintained. However it gives admins fine-grained ability to control various capabilities system wide without locking down user-namespaces. Example --- Here is the example that demonstrates the behavior of a kernel that has this patch-set applied. It uses the same c-code from this commit-log and is called acquire_raw.c - (a) The 'root' user has all the capabilities all the time (before and after taking capability). root@vm0:~# id uid=0(root) gid=0(root) groups=0(root) root@vm0:~# sysctl -q kernel.controlled_userns_caps_whitelist kernel.controlled_userns_caps_whitelist = 1f, root@vm0:~# ./acquire_raw Attempting to open RAW socket before unshare()... Successfully opened RAW-Sock before unshare(). Attempting to open RAW socket after unshare()... Successfully opened RAW-Sock after unshare(). root@vm0:~# sysctl -w kernel.controlled_userns_caps_whitelist=1f,dfff kernel.controlled_userns_caps_whitelist = 1f,dfff root@vm0:~# ./acquire_raw Attempting to open RAW socket before unshare()... Successfully opened RAW-Sock before unshare(). Attempting to open RAW socket after unshare()... Successfully opened RAW-Sock after unshare(). (b) Unprivileged user cannot change the mask. mahesh@vm0:~$ id uid=1000(mahesh) gid=1000(mahesh) groups=1000(mahesh),4(adm),24(cdrom),27(sudo),30(dip),46(plugdev),118(lpadmin),128(sambashare) mahesh@vm
[PATCHv4 2/2] userns: control capabilities of some user namespaces
From: Mahesh Bandewar With this new notion of "controlled" user-namespaces, the controlled user-namespaces are marked at the time of their creation while the capabilities of processes that belong to them are controlled using the global mask. Init-user-ns is always uncontrolled and a process that has SYS_ADMIN that belongs to uncontrolled user-ns can create another (child) user- namespace that is uncontrolled. Any other process (that either does not have SYS_ADMIN or belongs to a controlled user-ns) can only create a user-ns that is controlled. global-capability-whitelist (controlled_userns_caps_whitelist) is used at the capability check-time and keeps the semantics for the processes that belong to uncontrolled user-ns as it is. Processes that belong to controlled user-ns however are subjected to different checks- (a) if the capability in question is controlled and process belongs to controlled user-ns, then it's always denied. (b) if the capability in question is NOT controlled then fall back to the traditional check. Acked-by: Serge Hallyn Signed-off-by: Mahesh Bandewar --- v4: Rebase v3: Rebase v2: Don't recalculate user-ns flags for every setns() call. v1: Initial submission. include/linux/capability.h | 4 include/linux/user_namespace.h | 25 + kernel/capability.c| 5 + kernel/user_namespace.c| 4 security/commoncap.c | 8 5 files changed, 46 insertions(+) diff --git a/include/linux/capability.h b/include/linux/capability.h index 7d79a4689625..383f31f066f0 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -251,6 +251,10 @@ extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns); extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); int proc_douserns_caps_whitelist(struct ctl_table *table, int write, void __user *buff, size_t *lenp, loff_t *ppos); +/* Controlled capability is capability that is missing from the capability-mask + * controlled_userns_caps_whitelist controlled via sysctl. + */ +bool is_capability_controlled(int cap); extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size); diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index d6b74b91096b..a5c48684b317 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -32,6 +32,7 @@ struct uid_gid_map { /* 64 bytes -- 1 cache line */ }; #define USERNS_SETGROUPS_ALLOWED 1UL +#define USERNS_CONTROLLED 2UL #define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED @@ -112,6 +113,21 @@ static inline void put_user_ns(struct user_namespace *ns) __put_user_ns(ns); } +/* Controlled user-ns is the one that is created by a process that does not + * have CAP_SYS_ADMIN (or descended from such an user-ns). + * For more details please see the sysctl description of + * controlled_userns_caps_whitelist. + */ +static inline bool is_user_ns_controlled(const struct user_namespace *ns) +{ + return ns->flags & USERNS_CONTROLLED; +} + +static inline void mark_user_ns_controlled(struct user_namespace *ns) +{ + ns->flags |= USERNS_CONTROLLED; +} + struct seq_operations; extern const struct seq_operations proc_uid_seq_operations; extern const struct seq_operations proc_gid_seq_operations; @@ -170,6 +186,15 @@ static inline struct ns_common *ns_get_owner(struct ns_common *ns) { return ERR_PTR(-EPERM); } + +static inline bool is_user_ns_controlled(const struct user_namespace *ns) +{ + return false; +} + +static inline void mark_user_ns_controlled(struct user_namespace *ns) +{ +} #endif #endif /* _LINUX_USER_H */ diff --git a/kernel/capability.c b/kernel/capability.c index 4a859b7d4902..bffe249922de 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -511,6 +511,11 @@ bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns) } /* Controlled-userns capabilities routines */ +bool is_capability_controlled(int cap) +{ + return !cap_raised(controlled_userns_caps_whitelist, cap); +} + #ifdef CONFIG_SYSCTL int proc_douserns_caps_whitelist(struct ctl_table *table, int write, void __user *buff, size_t *lenp, loff_t *ppos) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 246d4d4ce5c7..ca0556d466b6 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -141,6 +141,10 @@ int create_user_ns(struct cred *new) goto fail_keyring; set_cred_user_ns(new, ns); + if (!ns_capable(parent_ns, CAP_SYS_ADMIN) || + is_user_ns_controlled(parent_ns)) + mark_user_ns_controlled(ns); + return 0; fail_keyring: #ifdef CONFIG_PERSISTENT_KEYRINGS diff --git a/security/commoncap.c b/security/commoncap.c index 4f8
[PATCH next 0/2] ipvlan: packet scrub
From: Mahesh Bandewar While crossing namespace boundary IPvlan aggressively scrubs packets. This is creating problems. First thing is that scrubbing changes the packet type in skb meta-data to PACKET_HOST. This causes erroneous packet delivery when dev_forward_skb() has already marked the packet type as OTHER_HOST. On the egress side scrubbing just before calling dev_queue_xmit() creates another set of problems. Scrubbing remove skb->sk so the prio update gets missed and more seriously, socket back-pressure fails making TSQ not function correctly. The first patch in the series just reverts the earlier change which was adding a mac-check, but that is unnecessary if packet_type that dev_forward_skb() has set is honored. The second path removes two of the scrubs which are causing problems described above. Mahesh Bandewar (2): Revert "ipvlan: add L2 check for packets arriving via virtual devices" ipvlan: remove excessive packet scrubbing drivers/net/ipvlan/ipvlan_core.c | 8 ++-- 1 file changed, 2 insertions(+), 6 deletions(-) -- 2.15.1.424.g9478a66081-goog
[PATCH next 1/2] Revert "ipvlan: add L2 check for packets arriving via virtual devices"
From: Mahesh Bandewar This reverts commit 92ff42645028fa6f9b8aa767718457b9264316b4. Even though the check added is not that taxing, it's not really needed. First of all this will be per packet cost and second thing is that the eth_type_trans() already does this correctly. The excessive scrubbing in IPvlan was changing the pkt-type skb metadata of the packet which made it necessary to re-check the mac. The subsequent patch in this series removes the faulty packet-scrub. Signed-off-by: Mahesh Bandewar --- drivers/net/ipvlan/ipvlan_core.c | 4 1 file changed, 4 deletions(-) diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index 0bc7f721b717..9774c96ac7bb 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -322,10 +322,6 @@ static int ipvlan_rcv_frame(struct ipvl_addr *addr, struct sk_buff **pskb, if (dev_forward_skb(ipvlan->dev, skb) == NET_RX_SUCCESS) success = true; } else { - if (!ether_addr_equal_64bits(eth_hdr(skb)->h_dest, -ipvlan->phy_dev->dev_addr)) - skb->pkt_type = PACKET_OTHERHOST; - ret = RX_HANDLER_ANOTHER; success = true; } -- 2.15.1.424.g9478a66081-goog
[PATCH next 2/2] ipvlan: remove excessive packet scrubbing
From: Mahesh Bandewar IPvlan currently scrubs packets at every location where packets may be crossing namespace boundary. Though this is desirable, currently IPvlan does it more than necessary. e.g. packets that are going to take dev_forward_skb() path will get scrubbed so no point in scrubbing them before forwarding. Another side-effect of scrubbing is that pkt-type gets set to PACKET_HOST which overrides what was already been set by the earlier path making erroneous delivery of the packets. Also scrubbing packets just before calling dev_queue_xmit() has detrimental effects since packets lose skb->sk and because of that miss prio updates, incorrect socket back-pressure and would even break TSQ. Fixes: b93dd49c1a35 ('ipvlan: Scrub skb before crossing the namespace boundary') Signed-off-by: Mahesh Bandewar --- drivers/net/ipvlan/ipvlan_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index 9774c96ac7bb..c1f008fe4e1d 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -315,13 +315,13 @@ static int ipvlan_rcv_frame(struct ipvl_addr *addr, struct sk_buff **pskb, *pskb = skb; } - ipvlan_skb_crossing_ns(skb, dev); if (local) { skb->pkt_type = PACKET_HOST; if (dev_forward_skb(ipvlan->dev, skb) == NET_RX_SUCCESS) success = true; } else { + skb->dev = dev; ret = RX_HANDLER_ANOTHER; success = true; } @@ -586,7 +586,7 @@ static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev) return NET_XMIT_SUCCESS; } - ipvlan_skb_crossing_ns(skb, ipvlan->phy_dev); + skb->dev = ipvlan->phy_dev; return dev_queue_xmit(skb); } -- 2.15.1.424.g9478a66081-goog
[PATCH next] ipvlan: add L2 check for packets arriving via virtual devices
From: Mahesh Bandewar Packets that don't have dest mac as the mac of the master device should not be entertained by the IPvlan rx-handler. This is mostly true as the packet path mostly takes care of that, except when the master device is a virtual device. As demonstrated in the following case - ip netns add ns1 ip link add ve1 type veth peer name ve2 ip link add link ve2 name iv1 type ipvlan mode l2 ip link set dev iv1 netns ns1 ip link set ve1 up ip link set ve2 up ip -n ns1 link set iv1 up ip addr add 192.168.10.1/24 dev ve1 ip -n ns1 addr 192.168.10.2/24 dev iv1 ping -c2 192.168.10.2 ip neigh show dev ve1 ip neigh show 192.168.10.2 lladdr dev ve1 ping -c2 192.168.10.2 This patch adds that missing check in the IPvlan rx-handler. Reported-by: Amit Sikka Signed-off-by: Mahesh Bandewar --- drivers/net/ipvlan/ipvlan_core.c | 4 1 file changed, 4 insertions(+) diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index 9774c96ac7bb..0bc7f721b717 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -322,6 +322,10 @@ static int ipvlan_rcv_frame(struct ipvl_addr *addr, struct sk_buff **pskb, if (dev_forward_skb(ipvlan->dev, skb) == NET_RX_SUCCESS) success = true; } else { + if (!ether_addr_equal_64bits(eth_hdr(skb)->h_dest, +ipvlan->phy_dev->dev_addr)) + skb->pkt_type = PACKET_OTHERHOST; + ret = RX_HANDLER_ANOTHER; success = true; } -- 2.15.1.424.g9478a66081-goog
[PATCHv3 0/2] capability controlled user-namespaces
From: Mahesh Bandewar TL;DR version - Creating a sandbox environment with namespaces is challenging considering what these sandboxed processes can engage into. e.g. CVE-2017-6074, CVE-2017-7184, CVE-2017-7308 etc. just to name few. Current form of user-namespaces, however, if changed a bit can allow us to create a sandbox environment without locking down user- namespaces. Detailed version Problem --- User-namespaces in the current form have increased the attack surface as any process can acquire capabilities which are not available to them (by default) by performing combination of clone()/unshare()/setns() syscalls. #define _GNU_SOURCE #include #include #include int main(int ac, char **av) { int sock = -1; printf("Attempting to open RAW socket before unshare()...\n"); sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW); if (sock < 0) { perror("socket() SOCK_RAW failed: "); } else { printf("Successfully opened RAW-Sock before unshare().\n"); close(sock); sock = -1; } if (unshare(CLONE_NEWUSER | CLONE_NEWNET) < 0) { perror("unshare() failed: "); return 1; } printf("Attempting to open RAW socket after unshare()...\n"); sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW); if (sock < 0) { perror("socket() SOCK_RAW failed: "); } else { printf("Successfully opened RAW-Sock after unshare().\n"); close(sock); sock = -1; } return 0; } The above example shows how easy it is to acquire NET_RAW capabilities and once acquired, these processes could take benefit of above mentioned or similar issues discovered/undiscovered with malicious intent. Note that this is just an example and the problem/solution is not limited to NET_RAW capability *only*. The easiest fix one can apply here is to lock-down user-namespaces which many of the distros do (i.e. don't allow users to create user namespaces), but unfortunately that prevents everyone from using them. Approach Introduce a notion of 'controlled' user-namespaces. Every process on the host is allowed to create user-namespaces (governed by the limit imposed by per-ns sysctl) however, mark user-namespaces created by sandboxed processes as 'controlled'. Use this 'mark' at the time of capability check in conjunction with a global capability whitelist. If the capability is not whitelisted, processes that belong to controlled user-namespaces will not be allowed. Once a user-ns is marked as 'controlled'; all its child user- namespaces are marked as 'controlled' too. A global whitelist is list of capabilities governed by the sysctl which is available to (privileged) user in init-ns to modify while it's applicable to all controlled user-namespaces on the host. Marking user-namespaces controlled without modifying the whitelist is equivalent of the current behavior. The default value of whitelist includes all capabilities so that the compatibility is maintained. However it gives admins fine-grained ability to control various capabilities system wide without locking down user-namespaces. Please see individual patches in this series. Mahesh Bandewar (2): capability: introduce sysctl for controlled user-ns capability whitelist userns: control capabilities of some user namespaces Documentation/sysctl/kernel.txt | 21 + include/linux/capability.h | 7 ++ include/linux/user_namespace.h | 25 kernel/capability.c | 52 + kernel/sysctl.c | 5 kernel/user_namespace.c | 4 security/commoncap.c| 8 +++ 7 files changed, 122 insertions(+) -- 2.15.0.531.g2ccb3012c9-goog
[PATCHv3 1/2] capability: introduce sysctl for controlled user-ns capability whitelist
From: Mahesh Bandewar Add a sysctl variable kernel.controlled_userns_caps_whitelist. This takes input as capability mask expressed as two comma separated hex u32 words. The mask, however, is stored in kernel as kernel_cap_t type. Any capabilities that are not part of this mask will be controlled and will not be allowed to processes in controlled user-ns. Acked-by: Serge Hallyn Signed-off-by: Mahesh Bandewar --- v3: Added couple of comments as requested by Serge Hallyn v2: Rebase v1: Initial submission Documentation/sysctl/kernel.txt | 21 ++ include/linux/capability.h | 3 +++ kernel/capability.c | 47 + kernel/sysctl.c | 5 + 4 files changed, 76 insertions(+) diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 694968c7523c..a1d39dbae847 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -25,6 +25,7 @@ show up in /proc/sys/kernel: - bootloader_version[ X86 only ] - callhome [ S390 only ] - cap_last_cap +- controlled_userns_caps_whitelist - core_pattern - core_pipe_limit - core_uses_pid @@ -187,6 +188,26 @@ CAP_LAST_CAP from the kernel. == +controlled_userns_caps_whitelist + +Capability mask that is whitelisted for "controlled" user namespaces. +Any capability that is missing from this mask will not be allowed to +any process that is attached to a controlled-userns. e.g. if CAP_NET_RAW +is not part of this mask, then processes running inside any controlled +userns's will not be allowed to perform action that needs CAP_NET_RAW +capability. However, processes that are attached to a parent user-ns +hierarchy that is *not* controlled and has CAP_NET_RAW can continue +performing those actions. User-namespaces are marked "controlled" at +the time of their creation based on the capabilities of the creator. +A process that does not have CAP_SYS_ADMIN will create user-namespaces +that are controlled. + +The value is expressed as two comma separated hex words (u32). This +sysctl is avaialble in init-ns and users with CAP_SYS_ADMIN in init-ns +are allowed to make changes. + +== + core_pattern: core_pattern is used to specify a core dumpfile pattern name. diff --git a/include/linux/capability.h b/include/linux/capability.h index f640dcbc880c..7d79a4689625 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -14,6 +14,7 @@ #define _LINUX_CAPABILITY_H #include +#include #define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3 @@ -248,6 +249,8 @@ extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns); /* audit system wants to get cap info from files as well */ extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); +int proc_douserns_caps_whitelist(struct ctl_table *table, int write, +void __user *buff, size_t *lenp, loff_t *ppos); extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size); diff --git a/kernel/capability.c b/kernel/capability.c index 1e1c0236f55b..4a859b7d4902 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -29,6 +29,8 @@ EXPORT_SYMBOL(__cap_empty_set); int file_caps_enabled = 1; +kernel_cap_t controlled_userns_caps_whitelist = CAP_FULL_SET; + static int __init file_caps_disable(char *str) { file_caps_enabled = 0; @@ -507,3 +509,48 @@ bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns) rcu_read_unlock(); return (ret == 0); } + +/* Controlled-userns capabilities routines */ +#ifdef CONFIG_SYSCTL +int proc_douserns_caps_whitelist(struct ctl_table *table, int write, +void __user *buff, size_t *lenp, loff_t *ppos) +{ + DECLARE_BITMAP(caps_bitmap, CAP_LAST_CAP); + struct ctl_table caps_table; + char tbuf[NAME_MAX]; + int ret; + + ret = bitmap_from_u32array(caps_bitmap, CAP_LAST_CAP, + controlled_userns_caps_whitelist.cap, + _KERNEL_CAPABILITY_U32S); + if (ret != CAP_LAST_CAP) + return -1; + + scnprintf(tbuf, NAME_MAX, "%*pb", CAP_LAST_CAP, caps_bitmap); + + caps_table.data = tbuf; + caps_table.maxlen = NAME_MAX; + caps_table.mode = table->mode; + ret = proc_dostring(&caps_table, write, buff, lenp, ppos); + if (ret) + return ret; + if (write) { + kernel_cap_t tmp; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = bitmap_parse_user(buff, *lenp, caps_bitmap, CAP_LAST_CAP); + if (ret) +
[PATCHv3 2/2] userns: control capabilities of some user namespaces
From: Mahesh Bandewar With this new notion of "controlled" user-namespaces, the controlled user-namespaces are marked at the time of their creation while the capabilities of processes that belong to them are controlled using the global mask. Init-user-ns is always uncontrolled and a process that has SYS_ADMIN that belongs to uncontrolled user-ns can create another (child) user- namespace that is uncontrolled. Any other process (that either does not have SYS_ADMIN or belongs to a controlled user-ns) can only create a user-ns that is controlled. global-capability-whitelist (controlled_userns_caps_whitelist) is used at the capability check-time and keeps the semantics for the processes that belong to uncontrolled user-ns as it is. Processes that belong to controlled user-ns however are subjected to different checks- (a) if the capability in question is controlled and process belongs to controlled user-ns, then it's always denied. (b) if the capability in question is NOT controlled then fall back to the traditional check. Acked-by: Serge Hallyn Signed-off-by: Mahesh Bandewar --- v3: Rebase v2: Don't recalculate user-ns flags for every setns() call. v1: Initial submission. include/linux/capability.h | 4 include/linux/user_namespace.h | 25 + kernel/capability.c| 5 + kernel/user_namespace.c| 4 security/commoncap.c | 8 5 files changed, 46 insertions(+) diff --git a/include/linux/capability.h b/include/linux/capability.h index 7d79a4689625..383f31f066f0 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -251,6 +251,10 @@ extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns); extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); int proc_douserns_caps_whitelist(struct ctl_table *table, int write, void __user *buff, size_t *lenp, loff_t *ppos); +/* Controlled capability is capability that is missing from the capability-mask + * controlled_userns_caps_whitelist controlled via sysctl. + */ +bool is_capability_controlled(int cap); extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size); diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index d6b74b91096b..a5c48684b317 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -32,6 +32,7 @@ struct uid_gid_map { /* 64 bytes -- 1 cache line */ }; #define USERNS_SETGROUPS_ALLOWED 1UL +#define USERNS_CONTROLLED 2UL #define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED @@ -112,6 +113,21 @@ static inline void put_user_ns(struct user_namespace *ns) __put_user_ns(ns); } +/* Controlled user-ns is the one that is created by a process that does not + * have CAP_SYS_ADMIN (or descended from such an user-ns). + * For more details please see the sysctl description of + * controlled_userns_caps_whitelist. + */ +static inline bool is_user_ns_controlled(const struct user_namespace *ns) +{ + return ns->flags & USERNS_CONTROLLED; +} + +static inline void mark_user_ns_controlled(struct user_namespace *ns) +{ + ns->flags |= USERNS_CONTROLLED; +} + struct seq_operations; extern const struct seq_operations proc_uid_seq_operations; extern const struct seq_operations proc_gid_seq_operations; @@ -170,6 +186,15 @@ static inline struct ns_common *ns_get_owner(struct ns_common *ns) { return ERR_PTR(-EPERM); } + +static inline bool is_user_ns_controlled(const struct user_namespace *ns) +{ + return false; +} + +static inline void mark_user_ns_controlled(struct user_namespace *ns) +{ +} #endif #endif /* _LINUX_USER_H */ diff --git a/kernel/capability.c b/kernel/capability.c index 4a859b7d4902..bffe249922de 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -511,6 +511,11 @@ bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns) } /* Controlled-userns capabilities routines */ +bool is_capability_controlled(int cap) +{ + return !cap_raised(controlled_userns_caps_whitelist, cap); +} + #ifdef CONFIG_SYSCTL int proc_douserns_caps_whitelist(struct ctl_table *table, int write, void __user *buff, size_t *lenp, loff_t *ppos) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 246d4d4ce5c7..ca0556d466b6 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -141,6 +141,10 @@ int create_user_ns(struct cred *new) goto fail_keyring; set_cred_user_ns(new, ns); + if (!ns_capable(parent_ns, CAP_SYS_ADMIN) || + is_user_ns_controlled(parent_ns)) + mark_user_ns_controlled(ns); + return 0; fail_keyring: #ifdef CONFIG_PERSISTENT_KEYRINGS diff --git a/security/commoncap.c b/security/commoncap.c index 4f8e09340956..5454
[PATCHv2 0/2] capability controlled user-namespaces
From: Mahesh Bandewar TL;DR version - Creating a sandbox environment with namespaces is challenging considering what these sandboxed processes can engage into. e.g. CVE-2017-6074, CVE-2017-7184, CVE-2017-7308 etc. just to name few. Current form of user-namespaces, however, if changed a bit can allow us to create a sandbox environment without locking down user- namespaces. Detailed version Problem --- User-namespaces in the current form have increased the attack surface as any process can acquire capabilities which are not available to them (by default) by performing combination of clone()/unshare()/setns() syscalls. #define _GNU_SOURCE #include #include #include int main(int ac, char **av) { int sock = -1; printf("Attempting to open RAW socket before unshare()...\n"); sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW); if (sock < 0) { perror("socket() SOCK_RAW failed: "); } else { printf("Successfully opened RAW-Sock before unshare().\n"); close(sock); sock = -1; } if (unshare(CLONE_NEWUSER | CLONE_NEWNET) < 0) { perror("unshare() failed: "); return 1; } printf("Attempting to open RAW socket after unshare()...\n"); sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW); if (sock < 0) { perror("socket() SOCK_RAW failed: "); } else { printf("Successfully opened RAW-Sock after unshare().\n"); close(sock); sock = -1; } return 0; } The above example shows how easy it is to acquire NET_RAW capabilities and once acquired, these processes could take benefit of above mentioned or similar issues discovered/undiscovered with malicious intent. Note that this is just an example and the problem/solution is not limited to NET_RAW capability *only*. The easiest fix one can apply here is to lock-down user-namespaces which many of the distros do (i.e. don't allow users to create user namespaces), but unfortunately that prevents everyone from using them. Approach Introduce a notion of 'controlled' user-namespaces. Every process on the host is allowed to create user-namespaces (governed by the limit imposed by per-ns sysctl) however, mark user-namespaces created by sandboxed processes as 'controlled'. Use this 'mark' at the time of capability check in conjunction with a global capability whitelist. If the capability is not whitelisted, processes that belong to controlled user-namespaces will not be allowed. Once a user-ns is marked as 'controlled'; all its child user- namespaces are marked as 'controlled' too. A global whitelist is list of capabilities governed by the sysctl which is available to (privileged) user in init-ns to modify while it's applicable to all controlled user-namespaces on the host. Marking user-namespaces controlled without modifying the whitelist is equivalent of the current behavior. The default value of whitelist includes all capabilities so that the compatibility is maintained. However it gives admins fine-grained ability to control various capabilities system wide without locking down user-namespaces. Please see individual patches in this series. Mahesh Bandewar (2): capability: introduce sysctl for controlled user-ns capability whitelist userns: control capabilities of some user namespaces Documentation/sysctl/kernel.txt | 21 + include/linux/capability.h | 4 include/linux/user_namespace.h | 20 kernel/capability.c | 52 + kernel/sysctl.c | 5 kernel/user_namespace.c | 4 security/commoncap.c| 8 +++ 7 files changed, 114 insertions(+) -- 2.15.0.448.gf294e3d99a-goog
[PATCHv2 1/2] capability: introduce sysctl for controlled user-ns capability whitelist
From: Mahesh Bandewar Add a sysctl variable kernel.controlled_userns_caps_whitelist. This takes input as capability mask expressed as two comma separated hex u32 words. The mask, however, is stored in kernel as kernel_cap_t type. Any capabilities that are not part of this mask will be controlled and will not be allowed to processes in controlled user-ns. Signed-off-by: Mahesh Bandewar --- v2: Rebase v1: Initial submission Documentation/sysctl/kernel.txt | 21 ++ include/linux/capability.h | 3 +++ kernel/capability.c | 47 + kernel/sysctl.c | 5 + 4 files changed, 76 insertions(+) diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 694968c7523c..a1d39dbae847 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -25,6 +25,7 @@ show up in /proc/sys/kernel: - bootloader_version[ X86 only ] - callhome [ S390 only ] - cap_last_cap +- controlled_userns_caps_whitelist - core_pattern - core_pipe_limit - core_uses_pid @@ -187,6 +188,26 @@ CAP_LAST_CAP from the kernel. == +controlled_userns_caps_whitelist + +Capability mask that is whitelisted for "controlled" user namespaces. +Any capability that is missing from this mask will not be allowed to +any process that is attached to a controlled-userns. e.g. if CAP_NET_RAW +is not part of this mask, then processes running inside any controlled +userns's will not be allowed to perform action that needs CAP_NET_RAW +capability. However, processes that are attached to a parent user-ns +hierarchy that is *not* controlled and has CAP_NET_RAW can continue +performing those actions. User-namespaces are marked "controlled" at +the time of their creation based on the capabilities of the creator. +A process that does not have CAP_SYS_ADMIN will create user-namespaces +that are controlled. + +The value is expressed as two comma separated hex words (u32). This +sysctl is avaialble in init-ns and users with CAP_SYS_ADMIN in init-ns +are allowed to make changes. + +== + core_pattern: core_pattern is used to specify a core dumpfile pattern name. diff --git a/include/linux/capability.h b/include/linux/capability.h index f640dcbc880c..7d79a4689625 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -14,6 +14,7 @@ #define _LINUX_CAPABILITY_H #include +#include #define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3 @@ -248,6 +249,8 @@ extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns); /* audit system wants to get cap info from files as well */ extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); +int proc_douserns_caps_whitelist(struct ctl_table *table, int write, +void __user *buff, size_t *lenp, loff_t *ppos); extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size); diff --git a/kernel/capability.c b/kernel/capability.c index 1e1c0236f55b..4a859b7d4902 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -29,6 +29,8 @@ EXPORT_SYMBOL(__cap_empty_set); int file_caps_enabled = 1; +kernel_cap_t controlled_userns_caps_whitelist = CAP_FULL_SET; + static int __init file_caps_disable(char *str) { file_caps_enabled = 0; @@ -507,3 +509,48 @@ bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns) rcu_read_unlock(); return (ret == 0); } + +/* Controlled-userns capabilities routines */ +#ifdef CONFIG_SYSCTL +int proc_douserns_caps_whitelist(struct ctl_table *table, int write, +void __user *buff, size_t *lenp, loff_t *ppos) +{ + DECLARE_BITMAP(caps_bitmap, CAP_LAST_CAP); + struct ctl_table caps_table; + char tbuf[NAME_MAX]; + int ret; + + ret = bitmap_from_u32array(caps_bitmap, CAP_LAST_CAP, + controlled_userns_caps_whitelist.cap, + _KERNEL_CAPABILITY_U32S); + if (ret != CAP_LAST_CAP) + return -1; + + scnprintf(tbuf, NAME_MAX, "%*pb", CAP_LAST_CAP, caps_bitmap); + + caps_table.data = tbuf; + caps_table.maxlen = NAME_MAX; + caps_table.mode = table->mode; + ret = proc_dostring(&caps_table, write, buff, lenp, ppos); + if (ret) + return ret; + if (write) { + kernel_cap_t tmp; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = bitmap_parse_user(buff, *lenp, caps_bitmap, CAP_LAST_CAP); + if (ret) + return ret; + + ret = bitmap_to
[PATCHv2 2/2] userns: control capabilities of some user namespaces
From: Mahesh Bandewar With this new notion of "controlled" user-namespaces, the controlled user-namespaces are marked at the time of their creation while the capabilities of processes that belong to them are controlled using the global mask. Init-user-ns is always uncontrolled and a process that has SYS_ADMIN that belongs to uncontrolled user-ns can create another (child) user- namespace that is uncontrolled. Any other process (that either does not have SYS_ADMIN or belongs to a controlled user-ns) can only create a user-ns that is controlled. global-capability-whitelist (controlled_userns_caps_whitelist) is used at the capability check-time and keeps the semantics for the processes that belong to uncontrolled user-ns as it is. Processes that belong to controlled user-ns however are subjected to different checks- (a) if the capability in question is controlled and process belongs to controlled user-ns, then it's always denied. (b) if the capability in question is NOT controlled then fall back to the traditional check. Signed-off-by: Mahesh Bandewar --- v2: Don't recalculate user-ns flags for every setns() call. v1: Initial submission. include/linux/capability.h | 1 + include/linux/user_namespace.h | 20 kernel/capability.c| 5 + kernel/user_namespace.c| 4 security/commoncap.c | 8 5 files changed, 38 insertions(+) diff --git a/include/linux/capability.h b/include/linux/capability.h index 7d79a4689625..a1fd9e460379 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -251,6 +251,7 @@ extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns); extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); int proc_douserns_caps_whitelist(struct ctl_table *table, int write, void __user *buff, size_t *lenp, loff_t *ppos); +bool is_capability_controlled(int cap); extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size); diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 3fe714da7f5a..647f825c7b5f 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -23,6 +23,7 @@ struct uid_gid_map { /* 64 bytes -- 1 cache line */ }; #define USERNS_SETGROUPS_ALLOWED 1UL +#define USERNS_CONTROLLED 2UL #define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED @@ -103,6 +104,16 @@ static inline void put_user_ns(struct user_namespace *ns) __put_user_ns(ns); } +static inline bool is_user_ns_controlled(const struct user_namespace *ns) +{ + return ns->flags & USERNS_CONTROLLED; +} + +static inline void mark_user_ns_controlled(struct user_namespace *ns) +{ + ns->flags |= USERNS_CONTROLLED; +} + struct seq_operations; extern const struct seq_operations proc_uid_seq_operations; extern const struct seq_operations proc_gid_seq_operations; @@ -161,6 +172,15 @@ static inline struct ns_common *ns_get_owner(struct ns_common *ns) { return ERR_PTR(-EPERM); } + +static inline bool is_user_ns_controlled(const struct user_namespace *ns) +{ + return false; +} + +static inline void mark_user_ns_controlled(struct user_namespace *ns) +{ +} #endif #endif /* _LINUX_USER_H */ diff --git a/kernel/capability.c b/kernel/capability.c index 4a859b7d4902..bffe249922de 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -511,6 +511,11 @@ bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns) } /* Controlled-userns capabilities routines */ +bool is_capability_controlled(int cap) +{ + return !cap_raised(controlled_userns_caps_whitelist, cap); +} + #ifdef CONFIG_SYSCTL int proc_douserns_caps_whitelist(struct ctl_table *table, int write, void __user *buff, size_t *lenp, loff_t *ppos) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index c490f1e4313b..600c7dcb9ff7 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -139,6 +139,10 @@ int create_user_ns(struct cred *new) goto fail_keyring; set_cred_user_ns(new, ns); + if (!ns_capable(parent_ns, CAP_SYS_ADMIN) || + is_user_ns_controlled(parent_ns)) + mark_user_ns_controlled(ns); + return 0; fail_keyring: #ifdef CONFIG_PERSISTENT_KEYRINGS diff --git a/security/commoncap.c b/security/commoncap.c index fc46f5b85251..89103f16ac37 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -73,6 +73,14 @@ int cap_capable(const struct cred *cred, struct user_namespace *targ_ns, { struct user_namespace *ns = targ_ns; + /* If the capability is controlled and user-ns that process +* belongs-to is 'controlled' then return EPERM and no need +* to check the user-ns hierarchy. +*/ +
[PATCH resend 0/2] capability controlled user-namespaces
From: Mahesh Bandewar TL;DR version - Creating a sandbox environment with namespaces is challenging considering what these sandboxed processes can engage into. e.g. CVE-2017-6074, CVE-2017-7184, CVE-2017-7308 etc. just to name few. Current form of user-namespaces, however, if changed a bit can allow us to create a sandbox environment without locking down user- namespaces. Detailed version Problem --- User-namespaces in the current form have increased the attack surface as any process can acquire capabilities which are not available to them (by default) by performing combination of clone()/unshare()/setns() syscalls. #define _GNU_SOURCE #include #include #include int main(int ac, char **av) { int sock = -1; printf("Attempting to open RAW socket before unshare()...\n"); sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW); if (sock < 0) { perror("socket() SOCK_RAW failed: "); } else { printf("Successfully opened RAW-Sock before unshare().\n"); close(sock); sock = -1; } if (unshare(CLONE_NEWUSER | CLONE_NEWNET) < 0) { perror("unshare() failed: "); return 1; } printf("Attempting to open RAW socket after unshare()...\n"); sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW); if (sock < 0) { perror("socket() SOCK_RAW failed: "); } else { printf("Successfully opened RAW-Sock after unshare().\n"); close(sock); sock = -1; } return 0; } The above example shows how easy it is to acquire NET_RAW capabilities and once acquired, these processes could take benefit of above mentioned or similar issues discovered/undiscovered with malicious intent. Note that this is just an example and the problem/solution is not limited to NET_RAW capability *only*. The easiest fix one can apply here is to lock-down user-namespaces which many of the distros do (i.e. don't allow users to create user namespaces), but unfortunately that prevents everyone from using them. Approach Introduce a notion of 'controlled' user-namespaces. Every process on the host is allowed to create user-namespaces (governed by the limit imposed by per-ns sysctl) however, mark user-namespaces created by sandboxed processes as 'controlled'. Use this 'mark' at the time of capability check in conjunction with a global capability whitelist. If the capability is not whitelisted, processes that belong to controlled user-namespaces will not be allowed. Once a user-ns is marked as 'controlled'; all its child user- namespaces are marked as 'controlled' too. A global whitelist is list of capabilities governed by the sysctl which is available to (privileged) user in init-ns to modify while it's applicable to all controlled user-namespaces on the host. Marking user-namespaces controlled without modifying the whitelist is equivalent of the current behavior. The default value of whitelist includes all capabilities so that the compatibility is maintained. However it gives admins fine-grained ability to control various capabilities system wide without locking down user-namespaces. Please see individual patches in this series. Mahesh Bandewar (2): capability: introduce sysctl for controlled user-ns capability whitelist userns: control capabilities of some user namespaces Documentation/sysctl/kernel.txt | 21 + include/linux/capability.h | 4 include/linux/user_namespace.h | 20 kernel/capability.c | 52 + kernel/sysctl.c | 5 kernel/user_namespace.c | 3 +++ security/commoncap.c| 8 +++ 7 files changed, 113 insertions(+) -- 2.15.0.403.gc27cc4dac6-goog
[PATCH resend 2/2] userns: control capabilities of some user namespaces
From: Mahesh Bandewar With this new notion of "controlled" user-namespaces, the controlled user-namespaces are marked at the time of their creation while the capabilities of processes that belong to them are controlled using the global mask. Init-user-ns is always uncontrolled and a process that has SYS_ADMIN that belongs to uncontrolled user-ns can create another (child) user- namespace that is uncontrolled. Any other process (that either does not have SYS_ADMIN or belongs to a controlled user-ns) can only create a user-ns that is controlled. global-capability-whitelist (controlled_userns_caps_whitelist) is used at the capability check-time and keeps the semantics for the processes that belong to uncontrolled user-ns as it is. Processes that belong to controlled user-ns however are subjected to different checks- (a) if the capability in question is controlled and process belongs to controlled user-ns, then it's always denied. (b) if the capability in question is NOT controlled then fall back to the traditional check. Signed-off-by: Mahesh Bandewar --- include/linux/capability.h | 1 + include/linux/user_namespace.h | 20 kernel/capability.c| 5 + kernel/user_namespace.c| 3 +++ security/commoncap.c | 8 5 files changed, 37 insertions(+) diff --git a/include/linux/capability.h b/include/linux/capability.h index 6c0b9677c03f..b8c6cac18658 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -250,6 +250,7 @@ extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns); extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); int proc_douserns_caps_whitelist(struct ctl_table *table, int write, void __user *buff, size_t *lenp, loff_t *ppos); +bool is_capability_controlled(int cap); extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size); diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index c18e01252346..e890fe81b47e 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -22,6 +22,7 @@ struct uid_gid_map { /* 64 bytes -- 1 cache line */ }; #define USERNS_SETGROUPS_ALLOWED 1UL +#define USERNS_CONTROLLED 2UL #define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED @@ -102,6 +103,16 @@ static inline void put_user_ns(struct user_namespace *ns) __put_user_ns(ns); } +static inline bool is_user_ns_controlled(const struct user_namespace *ns) +{ + return ns->flags & USERNS_CONTROLLED; +} + +static inline void mark_user_ns_controlled(struct user_namespace *ns) +{ + ns->flags |= USERNS_CONTROLLED; +} + struct seq_operations; extern const struct seq_operations proc_uid_seq_operations; extern const struct seq_operations proc_gid_seq_operations; @@ -160,6 +171,15 @@ static inline struct ns_common *ns_get_owner(struct ns_common *ns) { return ERR_PTR(-EPERM); } + +static inline bool is_user_ns_controlled(const struct user_namespace *ns) +{ + return false; +} + +static inline void mark_user_ns_controlled(struct user_namespace *ns) +{ +} #endif #endif /* _LINUX_USER_H */ diff --git a/kernel/capability.c b/kernel/capability.c index 62dbe3350c1b..40a38cc4ff43 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -510,6 +510,11 @@ bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns) } /* Controlled-userns capabilities routines */ +bool is_capability_controlled(int cap) +{ + return !cap_raised(controlled_userns_caps_whitelist, cap); +} + #ifdef CONFIG_SYSCTL int proc_douserns_caps_whitelist(struct ctl_table *table, int write, void __user *buff, size_t *lenp, loff_t *ppos) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index c490f1e4313b..f393ea5108f0 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -53,6 +53,9 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) cred->cap_effective = CAP_FULL_SET; cred->cap_ambient = CAP_EMPTY_SET; cred->cap_bset = CAP_FULL_SET; + if (!ns_capable(user_ns->parent, CAP_SYS_ADMIN) || + is_user_ns_controlled(user_ns->parent)) + mark_user_ns_controlled(user_ns); #ifdef CONFIG_KEYS key_put(cred->request_key_auth); cred->request_key_auth = NULL; diff --git a/security/commoncap.c b/security/commoncap.c index fc46f5b85251..89103f16ac37 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -73,6 +73,14 @@ int cap_capable(const struct cred *cred, struct user_namespace *targ_ns, { struct user_namespace *ns = targ_ns; + /* If the capability is controlled and user-ns that process +* belongs-to is 'controlled' then return EPE
[PATCH resend 1/2] capability: introduce sysctl for controlled user-ns capability whitelist
From: Mahesh Bandewar Add a sysctl variable kernel.controlled_userns_caps_whitelist. This takes input as capability mask expressed as two comma separated hex u32 words. The mask, however, is stored in kernel as kernel_cap_t type. Any capabilities that are not part of this mask will be controlled and will not be allowed to processes in controlled user-ns. Signed-off-by: Mahesh Bandewar --- Documentation/sysctl/kernel.txt | 21 ++ include/linux/capability.h | 3 +++ kernel/capability.c | 47 + kernel/sysctl.c | 5 + 4 files changed, 76 insertions(+) diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 694968c7523c..a1d39dbae847 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -25,6 +25,7 @@ show up in /proc/sys/kernel: - bootloader_version[ X86 only ] - callhome [ S390 only ] - cap_last_cap +- controlled_userns_caps_whitelist - core_pattern - core_pipe_limit - core_uses_pid @@ -187,6 +188,26 @@ CAP_LAST_CAP from the kernel. == +controlled_userns_caps_whitelist + +Capability mask that is whitelisted for "controlled" user namespaces. +Any capability that is missing from this mask will not be allowed to +any process that is attached to a controlled-userns. e.g. if CAP_NET_RAW +is not part of this mask, then processes running inside any controlled +userns's will not be allowed to perform action that needs CAP_NET_RAW +capability. However, processes that are attached to a parent user-ns +hierarchy that is *not* controlled and has CAP_NET_RAW can continue +performing those actions. User-namespaces are marked "controlled" at +the time of their creation based on the capabilities of the creator. +A process that does not have CAP_SYS_ADMIN will create user-namespaces +that are controlled. + +The value is expressed as two comma separated hex words (u32). This +sysctl is avaialble in init-ns and users with CAP_SYS_ADMIN in init-ns +are allowed to make changes. + +== + core_pattern: core_pattern is used to specify a core dumpfile pattern name. diff --git a/include/linux/capability.h b/include/linux/capability.h index b52e278e4744..6c0b9677c03f 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -13,6 +13,7 @@ #define _LINUX_CAPABILITY_H #include +#include #define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3 @@ -247,6 +248,8 @@ extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns); /* audit system wants to get cap info from files as well */ extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); +int proc_douserns_caps_whitelist(struct ctl_table *table, int write, +void __user *buff, size_t *lenp, loff_t *ppos); extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size); diff --git a/kernel/capability.c b/kernel/capability.c index f97fe77ceb88..62dbe3350c1b 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -28,6 +28,8 @@ EXPORT_SYMBOL(__cap_empty_set); int file_caps_enabled = 1; +kernel_cap_t controlled_userns_caps_whitelist = CAP_FULL_SET; + static int __init file_caps_disable(char *str) { file_caps_enabled = 0; @@ -506,3 +508,48 @@ bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns) rcu_read_unlock(); return (ret == 0); } + +/* Controlled-userns capabilities routines */ +#ifdef CONFIG_SYSCTL +int proc_douserns_caps_whitelist(struct ctl_table *table, int write, +void __user *buff, size_t *lenp, loff_t *ppos) +{ + DECLARE_BITMAP(caps_bitmap, CAP_LAST_CAP); + struct ctl_table caps_table; + char tbuf[NAME_MAX]; + int ret; + + ret = bitmap_from_u32array(caps_bitmap, CAP_LAST_CAP, + controlled_userns_caps_whitelist.cap, + _KERNEL_CAPABILITY_U32S); + if (ret != CAP_LAST_CAP) + return -1; + + scnprintf(tbuf, NAME_MAX, "%*pb", CAP_LAST_CAP, caps_bitmap); + + caps_table.data = tbuf; + caps_table.maxlen = NAME_MAX; + caps_table.mode = table->mode; + ret = proc_dostring(&caps_table, write, buff, lenp, ppos); + if (ret) + return ret; + if (write) { + kernel_cap_t tmp; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = bitmap_parse_user(buff, *lenp, caps_bitmap, CAP_LAST_CAP); + if (ret) + return ret; + + ret = bitmap_to_u32array(tmp.cap, _KERNEL_CAPABILITY_U32S, +
[PATCH] ip/ipvlan: enhance ability to add mode flags to existing modes
From: Mahesh Bandewar IPvlan supported bridge-only functionality prior to commits a190d04db937 ('ipvlan: introduce 'private' attribute for all existing modes.') and fe89aa6b250c ('ipvlan: implement VEPA mode'). These two commits allow to configure the VEPA and private modes now. This patch adds those options in ip command. e.g. bash:~# ip link add link eth0 name ipvl0 type ipvlan mode l2 private -or- bash:~# ip link add link eth0 type ipvl0 type ipvlan mode l2 vepa Also the output will reflect the mode and the mode-flag accordingly. e.g. bash:~# ip -details link show ipvl0 4: ipvl0@eth0: mtu 1500 qdisc ... link/ether 00:1a:11:44:a5:3e brd ff:ff:ff:ff:ff:ff promiscuity 0 ipvlan mode l2 private addrgenmode eui64 numtxqueues 1 ... Signed-off-by: Mahesh Bandewar --- include/uapi/linux/if_link.h | 4 ip/iplink_ipvlan.c | 35 ++- 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index dafe0a6e0421..35bc598566e0 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -463,6 +463,7 @@ enum macsec_validation_type { enum { IFLA_IPVLAN_UNSPEC, IFLA_IPVLAN_MODE, + IFLA_IPVLAN_FLAGS, __IFLA_IPVLAN_MAX }; @@ -475,6 +476,9 @@ enum ipvlan_mode { IPVLAN_MODE_MAX }; +#define IPVLAN_F_PRIVATE 0x01 +#define IPVLAN_F_VEPA 0x02 + /* VXLAN section */ enum { IFLA_VXLAN_UNSPEC, diff --git a/ip/iplink_ipvlan.c b/ip/iplink_ipvlan.c index 9f48309ee030..8889808508fe 100644 --- a/ip/iplink_ipvlan.c +++ b/ip/iplink_ipvlan.c @@ -20,12 +20,21 @@ static void ipvlan_explain(FILE *f) { - fprintf(f, "Usage: ... ipvlan [ mode { l2 | l3 | l3s } ]\n"); + fprintf(f, + "Usage: ... ipvlan [ mode MODE ] [ FLAGS ]\n" + "\n" + "MODE: l3 | l3s | l2\n" + "FLAGS: bridge | private | vepa\n" + "(first values are the defaults if nothing is specified).\n" + ); } static int ipvlan_parse_opt(struct link_util *lu, int argc, char **argv, struct nlmsghdr *n) { + __u16 flags = 0; + bool mflag_given = false; + while (argc > 0) { if (matches(*argv, "mode") == 0) { __u16 mode = 0; @@ -43,6 +52,14 @@ static int ipvlan_parse_opt(struct link_util *lu, int argc, char **argv, return -1; } addattr16(n, 1024, IFLA_IPVLAN_MODE, mode); + } else if (matches(*argv, "private") == 0 && !mflag_given) { + flags |= IPVLAN_F_PRIVATE; + mflag_given = true; + } else if (matches(*argv, "vepa") == 0 && !mflag_given) { + flags |= IPVLAN_F_VEPA; + mflag_given = true; + } else if (matches(*argv, "bridge") == 0 && !mflag_given) { + mflag_given = true; } else if (matches(*argv, "help") == 0) { ipvlan_explain(stderr); return -1; @@ -55,6 +72,7 @@ static int ipvlan_parse_opt(struct link_util *lu, int argc, char **argv, argc--; argv++; } + addattr16(n, 1024, IFLA_IPVLAN_FLAGS, flags); return 0; } @@ -75,6 +93,21 @@ static void ipvlan_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[]) print_string(PRINT_ANY, "mode", " mode %s ", mode_str); } } + if (tb[IFLA_IPVLAN_FLAGS]) { + if (RTA_PAYLOAD(tb[IFLA_IPVLAN_FLAGS]) == sizeof(__u16)) { + __u16 flags = rta_getattr_u16(tb[IFLA_IPVLAN_FLAGS]); + + if (flags & IPVLAN_F_PRIVATE) + print_bool(PRINT_ANY, "private", "private ", + true); + else if (flags & IPVLAN_F_VEPA) + print_bool(PRINT_ANY, "vepa", "vepa ", + true); + else + print_bool(PRINT_ANY, "bridge", "bridge ", + true); + } + } } static void ipvlan_print_help(struct link_util *lu, int argc, char **argv, -- 2.15.0.rc2.357.g7e34df9404-goog
[PATCH next 0/2] add 'private' and 'vepa' attributes to ipvlan modes
From: Mahesh Bandewar IPvlan has always been operating in bridge-mode for its supported modes i.e. if the packets are destined to the adjacent neighbor dev, then IPvlan driver will switch the packet internally without needing the packets to hit the wire or get routed. However, there are situations where this bridge-mode is not needed. e.g. two private processes running inside two namespaces which are having one IPvlan slave each for its namespace but sharing the master. These processes should reach the outside world through the master device but at the same time the bridge function should not work. Currently that's not possible hence the private attribute for the selected mode comes in play. VEPA or 802.1Qbg on the other hand has limited appeal with IPvlan since IPvlan uses the mac-address of the lower device. So packets that are destined to the adjacent neighbor slave-dev will have same src and dest mac. When these packets reach the external switch/router, they will send you the redirect message which the host will have to deal with. Having said that this attribute will have appeal in debugging as IPvlan will not switch / short-circuit packets internally. e.g. using VEPA mode with lower-device in loopback mode will avoid some complicated set-ups that use non-local-bind with some route jugglery. This patch-set implements these attributes for the existing modes that IPvlan has. Please see individual patches for their detailed implementation. A subsequent ip-utils patch is needed and will be sent soon. Mahesh Bandewar (2): ipvlan: introduce 'private' attribute for all existing modes. ipvlan: implement VEPA mode Documentation/networking/ipvlan.txt | 42 + drivers/net/ipvlan/ipvlan.h | 31 drivers/net/ipvlan/ipvlan_core.c| 24 ++- drivers/net/ipvlan/ipvlan_main.c| 47 +++-- include/uapi/linux/if_link.h| 4 5 files changed, 136 insertions(+), 12 deletions(-) -- 2.15.0.rc2.357.g7e34df9404-goog
[PATCH next 2/2] ipvlan: implement VEPA mode
From: Mahesh Bandewar This is very similar to the Macvlan VEPA mode, however, there is some difference. IPvlan uses the mac-address of the lower device, so the VEPA mode has implications of ICMP-redirects for packets destined for its immediate neighbors sharing same master since the packets will have same source and dest mac. The external switch/router will send redirect msg. Having said that, this will be useful tool in terms of debugging since IPvlan will not switch packets within its slaves and rely completely on the external entity as intended in 802.1Qbg. Signed-off-by: Mahesh Bandewar --- Documentation/networking/ipvlan.txt | 12 +++- drivers/net/ipvlan/ipvlan.h | 15 +++ drivers/net/ipvlan/ipvlan_core.c| 17 ++--- drivers/net/ipvlan/ipvlan_main.c| 13 +++-- include/uapi/linux/if_link.h| 1 + 5 files changed, 48 insertions(+), 10 deletions(-) diff --git a/Documentation/networking/ipvlan.txt b/Documentation/networking/ipvlan.txt index bfa91c77a4c9..812ef003e0a8 100644 --- a/Documentation/networking/ipvlan.txt +++ b/Documentation/networking/ipvlan.txt @@ -25,7 +25,7 @@ using IProute2/ip utility. ip link add link name type ipvlan [ mode MODE ] [ FLAGS ] where MODE: l3 (default) | l3s | l2 - FLAGS: bridge (default) | private + FLAGS: bridge (default) | private | vepa e.g. (a) Following will create IPvlan link with eth0 as master in @@ -35,6 +35,8 @@ using IProute2/ip utility. bash# ip link add link eth0 name ipvl0 type ipvlan mode l2 bridge (c) This command will create an IPvlan device in L2 private mode. bash# ip link add link eth0 name ipvlan type ipvlan mode l2 private +(d) This command will create an IPvlan device in L2 vepa mode. + bash# ip link add link eth0 name ipvlan type ipvlan mode l2 vepa 4. Operating modes: @@ -77,6 +79,14 @@ themseleves apart from talking through the master device. If this option is added to the command-line, the port is set in private mode. i.e. port wont allow cross communication between slaves. +5.3 vepa: + If this is added to the command-line, the port is set in VEPA mode. +i.e. port will offload switching functionality to the external entity as +described in 802.1Qbg +Note: VEPA mode in IPvlan has limitations. IPvlan uses the mac-address of the +master-device, so the packets which are emitted in this mode for the adjacent +neighbor will have source and destination mac same. This will make the switch / +router send the redirect message. 6. What to choose (macvlan vs. ipvlan)? These two devices are very similar in many regards and the specific use diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h index 9941851bcc13..5166575a164d 100644 --- a/drivers/net/ipvlan/ipvlan.h +++ b/drivers/net/ipvlan/ipvlan.h @@ -139,6 +139,21 @@ static inline void ipvlan_clear_private(struct ipvl_port *port) port->flags &= ~IPVLAN_F_PRIVATE; } +static inline bool ipvlan_is_vepa(const struct ipvl_port *port) +{ + return !!(port->flags & IPVLAN_F_VEPA); +} + +static inline void ipvlan_mark_vepa(struct ipvl_port *port) +{ + port->flags |= IPVLAN_F_VEPA; +} + +static inline void ipvlan_clear_vepa(struct ipvl_port *port) +{ + port->flags &= ~IPVLAN_F_VEPA; +} + void ipvlan_init_secret(void); unsigned int ipvlan_mac_hash(const unsigned char *addr); rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb); diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index 72fd56de9c00..034ae4c57196 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -514,13 +514,15 @@ static int ipvlan_xmit_mode_l3(struct sk_buff *skb, struct net_device *dev) if (!lyr3h) goto out; - addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true); - if (addr) { - if (ipvlan_is_private(ipvlan->port)) { - consume_skb(skb); - return NET_XMIT_DROP; + if (!ipvlan_is_vepa(ipvlan->port)) { + addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true); + if (addr) { + if (ipvlan_is_private(ipvlan->port)) { + consume_skb(skb); + return NET_XMIT_DROP; + } + return ipvlan_rcv_frame(addr, &skb, true); } - return ipvlan_rcv_frame(addr, &skb, true); } out: ipvlan_skb_crossing_ns(skb, ipvlan->phy_dev); @@ -535,7 +537,8 @@ static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev) void *lyr3h; int addr_type; - if (ether_addr_equal(eth->h_dest, eth->h_source)) { + if (!ipvlan_is_vepa(ipvlan->port)
[PATCH next 1/2] ipvlan: introduce 'private' attribute for all existing modes.
From: Mahesh Bandewar IPvlan has always operated in bridge mode. However there are scenarios where each slave should be able to talk through the master device but not necessarily across each other. Think of an environment where each of a namespace is a private and independant customer. In this scenario the machine which is hosting these namespaces neither want to tell who their neighbor is nor the individual namespaces care to talk to neighbor on short-circuited network path. This patch implements the mode that is very similar to the 'private' mode in macvlan where individual slaves can send and receive traffic through the master device, just that they can not talk among slave devices. Signed-off-by: Mahesh Bandewar --- Documentation/networking/ipvlan.txt | 30 ++--- drivers/net/ipvlan/ipvlan.h | 16 drivers/net/ipvlan/ipvlan_core.c| 15 --- drivers/net/ipvlan/ipvlan_main.c| 38 +++-- include/uapi/linux/if_link.h| 3 +++ 5 files changed, 94 insertions(+), 8 deletions(-) diff --git a/Documentation/networking/ipvlan.txt b/Documentation/networking/ipvlan.txt index 1fe42a874aae..bfa91c77a4c9 100644 --- a/Documentation/networking/ipvlan.txt +++ b/Documentation/networking/ipvlan.txt @@ -22,9 +22,19 @@ The driver can be built into the kernel (CONFIG_IPVLAN=y) or as a module There are no module parameters for this driver and it can be configured using IProute2/ip utility. - ip link add link name type ipvlan mode { l2 | l3 | l3s } +ip link add link name type ipvlan [ mode MODE ] [ FLAGS ] + where + MODE: l3 (default) | l3s | l2 + FLAGS: bridge (default) | private - e.g. ip link add link eth0 name ipvl0 type ipvlan mode l2 +e.g. +(a) Following will create IPvlan link with eth0 as master in +L3 bridge mode + bash# ip link add link eth0 name ipvl0 type ipvlan +(b) This command will create IPvlan link in L2 bridge mode. + bash# ip link add link eth0 name ipvl0 type ipvlan mode l2 bridge +(c) This command will create an IPvlan device in L2 private mode. + bash# ip link add link eth0 name ipvlan type ipvlan mode l2 private 4. Operating modes: @@ -54,7 +64,21 @@ works in this mode and hence it is L3-symmetric (L3s). This will have slightly l performance but that shouldn't matter since you are choosing this mode over plain-L3 mode to make conn-tracking work. -5. What to choose (macvlan vs. ipvlan)? +5. Mode flags: + At this time following mode flags are available + +5.1 bridge: + This is the default option. To configure the IPvlan port in this mode, +user can choose to either add this option on the command-line or don't specify +anything. This is the traditional mode where slaves can cross-talk among +themseleves apart from talking through the master device. + +5.2 private: + If this option is added to the command-line, the port is set in private +mode. i.e. port wont allow cross communication between slaves. + + +6. What to choose (macvlan vs. ipvlan)? These two devices are very similar in many regards and the specific use case could very well define which device to choose. if one of the following situations defines your use case then you can choose to use ipvlan - diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h index ba8173a0b62e..9941851bcc13 100644 --- a/drivers/net/ipvlan/ipvlan.h +++ b/drivers/net/ipvlan/ipvlan.h @@ -96,6 +96,7 @@ struct ipvl_port { struct hlist_head hlhead[IPVLAN_HASH_SIZE]; struct list_headipvlans; u16 mode; + u16 flags; u16 dev_id_start; struct work_struct wq; struct sk_buff_head backlog; @@ -123,6 +124,21 @@ static inline struct ipvl_port *ipvlan_port_get_rtnl(const struct net_device *d) return rtnl_dereference(d->rx_handler_data); } +static inline bool ipvlan_is_private(const struct ipvl_port *port) +{ + return !!(port->flags & IPVLAN_F_PRIVATE); +} + +static inline void ipvlan_mark_private(struct ipvl_port *port) +{ + port->flags |= IPVLAN_F_PRIVATE; +} + +static inline void ipvlan_clear_private(struct ipvl_port *port) +{ + port->flags &= ~IPVLAN_F_PRIVATE; +} + void ipvlan_init_secret(void); unsigned int ipvlan_mac_hash(const unsigned char *addr); rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb); diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index 1f3295e274d0..72fd56de9c00 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -515,9 +515,13 @@ static int ipvlan_xmit_mode_l3(struct sk_buff *skb, struct net_device *dev) goto out; addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type
[PATCH next] ipvlan: always use the current L2 addr of the master
From: Mahesh Bandewar If the underlying master ever changes its L2 (e.g. bonding device), then make sure that the IPvlan slaves always emit packets with the current L2 of the master instead of the stale mac addr which was copied during the device creation. The problem can be seen with following script - #!/bin/bash # Create a vEth pair ip link add dev veth0 type veth peer name veth1 ip link set veth0 up ip link set veth1 up ip link show veth0 ip link show veth1 # Create an IPvlan device on one end of this vEth pair. ip link add link veth0 dev ipvl0 type ipvlan mode l2 ip link show ipvl0 # Change the mac-address of the vEth master. ip link set veth0 address 02:11:22:33:44:55 Fixes: 2ad7bf363841 ("ipvlan: Initial check-in of the IPVLAN driver.") Signed-off-by: Mahesh Bandewar --- drivers/net/ipvlan/ipvlan_main.c | 7 ++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index c74893c1e620..5832091680f4 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -407,7 +407,7 @@ static int ipvlan_hard_header(struct sk_buff *skb, struct net_device *dev, * while the packets use the mac-addr on the physical device. */ return dev_hard_header(skb, phy_dev, type, daddr, - saddr ? : dev->dev_addr, len); + saddr ? : phy_dev->dev_addr, len); } static const struct header_ops ipvlan_header_ops = { @@ -730,6 +730,11 @@ static int ipvlan_device_event(struct notifier_block *unused, ipvlan_adjust_mtu(ipvlan, dev); break; + case NETDEV_CHANGEADDR: + list_for_each_entry(ipvlan, &port->ipvlans, pnode) + ether_addr_copy(ipvlan->dev->dev_addr, dev->dev_addr); + break; + case NETDEV_PRE_TYPE_CHANGE: /* Forbid underlying device to change its type. */ return NOTIFY_BAD; -- 2.15.0.rc0.271.g36b669edcc-goog
[PATCH 1/2] capability: introduce sysctl for controlled user-ns capability whitelist
From: Mahesh Bandewar Add a sysctl variable kernel.controlled_userns_caps_whitelist. This takes input as capability mask expressed as two comma separated hex u32 words. The mask, however, is stored in kernel as kernel_cap_t type. Any capabilities that are not part of this mask will be controlled and will not be allowed to processes in controlled user-ns. Signed-off-by: Mahesh Bandewar --- Documentation/sysctl/kernel.txt | 21 ++ include/linux/capability.h | 3 +++ kernel/capability.c | 47 + kernel/sysctl.c | 5 + 4 files changed, 76 insertions(+) diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 694968c7523c..a1d39dbae847 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -25,6 +25,7 @@ show up in /proc/sys/kernel: - bootloader_version[ X86 only ] - callhome [ S390 only ] - cap_last_cap +- controlled_userns_caps_whitelist - core_pattern - core_pipe_limit - core_uses_pid @@ -187,6 +188,26 @@ CAP_LAST_CAP from the kernel. == +controlled_userns_caps_whitelist + +Capability mask that is whitelisted for "controlled" user namespaces. +Any capability that is missing from this mask will not be allowed to +any process that is attached to a controlled-userns. e.g. if CAP_NET_RAW +is not part of this mask, then processes running inside any controlled +userns's will not be allowed to perform action that needs CAP_NET_RAW +capability. However, processes that are attached to a parent user-ns +hierarchy that is *not* controlled and has CAP_NET_RAW can continue +performing those actions. User-namespaces are marked "controlled" at +the time of their creation based on the capabilities of the creator. +A process that does not have CAP_SYS_ADMIN will create user-namespaces +that are controlled. + +The value is expressed as two comma separated hex words (u32). This +sysctl is avaialble in init-ns and users with CAP_SYS_ADMIN in init-ns +are allowed to make changes. + +== + core_pattern: core_pattern is used to specify a core dumpfile pattern name. diff --git a/include/linux/capability.h b/include/linux/capability.h index b52e278e4744..6c0b9677c03f 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -13,6 +13,7 @@ #define _LINUX_CAPABILITY_H #include +#include #define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3 @@ -247,6 +248,8 @@ extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns); /* audit system wants to get cap info from files as well */ extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); +int proc_douserns_caps_whitelist(struct ctl_table *table, int write, +void __user *buff, size_t *lenp, loff_t *ppos); extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size); diff --git a/kernel/capability.c b/kernel/capability.c index f97fe77ceb88..62dbe3350c1b 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -28,6 +28,8 @@ EXPORT_SYMBOL(__cap_empty_set); int file_caps_enabled = 1; +kernel_cap_t controlled_userns_caps_whitelist = CAP_FULL_SET; + static int __init file_caps_disable(char *str) { file_caps_enabled = 0; @@ -506,3 +508,48 @@ bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns) rcu_read_unlock(); return (ret == 0); } + +/* Controlled-userns capabilities routines */ +#ifdef CONFIG_SYSCTL +int proc_douserns_caps_whitelist(struct ctl_table *table, int write, +void __user *buff, size_t *lenp, loff_t *ppos) +{ + DECLARE_BITMAP(caps_bitmap, CAP_LAST_CAP); + struct ctl_table caps_table; + char tbuf[NAME_MAX]; + int ret; + + ret = bitmap_from_u32array(caps_bitmap, CAP_LAST_CAP, + controlled_userns_caps_whitelist.cap, + _KERNEL_CAPABILITY_U32S); + if (ret != CAP_LAST_CAP) + return -1; + + scnprintf(tbuf, NAME_MAX, "%*pb", CAP_LAST_CAP, caps_bitmap); + + caps_table.data = tbuf; + caps_table.maxlen = NAME_MAX; + caps_table.mode = table->mode; + ret = proc_dostring(&caps_table, write, buff, lenp, ppos); + if (ret) + return ret; + if (write) { + kernel_cap_t tmp; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = bitmap_parse_user(buff, *lenp, caps_bitmap, CAP_LAST_CAP); + if (ret) + return ret; + + ret = bitmap_to_u32array(tmp.cap, _KERNEL_CAPABILITY_U32S, +
[PATCH 0/2] capability controlled user-namespaces
From: Mahesh Bandewar [Same as the previous RFC series sent on 9/21] TL;DR version - Creating a sandbox environment with namespaces is challenging considering what these sandboxed processes can engage into. e.g. CVE-2017-6074, CVE-2017-7184, CVE-2017-7308 etc. just to name few. Current form of user-namespaces, however, if changed a bit can allow us to create a sandbox environment without locking down user- namespaces. Detailed version Problem --- User-namespaces in the current form have increased the attack surface as any process can acquire capabilities which are not available to them (by default) by performing combination of clone()/unshare()/setns() syscalls. #define _GNU_SOURCE #include #include #include int main(int ac, char **av) { int sock = -1; printf("Attempting to open RAW socket before unshare()...\n"); sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW); if (sock < 0) { perror("socket() SOCK_RAW failed: "); } else { printf("Successfully opened RAW-Sock before unshare().\n"); close(sock); sock = -1; } if (unshare(CLONE_NEWUSER | CLONE_NEWNET) < 0) { perror("unshare() failed: "); return 1; } printf("Attempting to open RAW socket after unshare()...\n"); sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW); if (sock < 0) { perror("socket() SOCK_RAW failed: "); } else { printf("Successfully opened RAW-Sock after unshare().\n"); close(sock); sock = -1; } return 0; } The above example shows how easy it is to acquire NET_RAW capabilities and once acquired, these processes could take benefit of above mentioned or similar issues discovered/undiscovered with malicious intent. Note that this is just an example and the problem/solution is not limited to NET_RAW capability *only*. The easiest fix one can apply here is to lock-down user-namespaces which many of the distros do (i.e. don't allow users to create user namespaces), but unfortunately that prevents everyone from using them. Approach Introduce a notion of 'controlled' user-namespaces. Every process on the host is allowed to create user-namespaces (governed by the limit imposed by per-ns sysctl) however, mark user-namespaces created by sandboxed processes as 'controlled'. Use this 'mark' at the time of capability check in conjunction with a global capability whitelist. If the capability is not whitelisted, processes that belong to controlled user-namespaces will not be allowed. Once a user-ns is marked as 'controlled'; all its child user- namespaces are marked as 'controlled' too. A global whitelist is list of capabilities governed by the sysctl which is available to (privileged) user in init-ns to modify while it's applicable to all controlled user-namespaces on the host. Marking user-namespaces controlled without modifying the whitelist is equivalent of the current behavior. The default value of whitelist includes all capabilities so that the compatibility is maintained. However it gives admins fine-grained ability to control various capabilities system wide without locking down user-namespaces. Please see individual patches in this series. Mahesh Bandewar (2): capability: introduce sysctl for controlled user-ns capability whitelist userns: control capabilities of some user namespaces Documentation/sysctl/kernel.txt | 21 + include/linux/capability.h | 4 include/linux/user_namespace.h | 20 kernel/capability.c | 52 + kernel/sysctl.c | 5 kernel/user_namespace.c | 3 +++ security/commoncap.c| 8 +++ 7 files changed, 113 insertions(+) -- 2.14.2.822.g60be5d43e6-goog
[PATCH 2/2] userns: control capabilities of some user namespaces
From: Mahesh Bandewar With this new notion of "controlled" user-namespaces, the controlled user-namespaces are marked at the time of their creation while the capabilities of processes that belong to them are controlled using the global mask. Init-user-ns is always uncontrolled and a process that has SYS_ADMIN that belongs to uncontrolled user-ns can create another (child) user- namespace that is uncontrolled. Any other process (that either does not have SYS_ADMIN or belongs to a controlled user-ns) can only create a user-ns that is controlled. global-capability-whitelist (controlled_userns_caps_whitelist) is used at the capability check-time and keeps the semantics for the processes that belong to uncontrolled user-ns as it is. Processes that belong to controlled user-ns however are subjected to different checks- (a) if the capability in question is controlled and process belongs to controlled user-ns, then it's always denied. (b) if the capability in question is NOT controlled then fall back to the traditional check. Signed-off-by: Mahesh Bandewar --- include/linux/capability.h | 1 + include/linux/user_namespace.h | 20 kernel/capability.c| 5 + kernel/user_namespace.c| 3 +++ security/commoncap.c | 8 5 files changed, 37 insertions(+) diff --git a/include/linux/capability.h b/include/linux/capability.h index 6c0b9677c03f..b8c6cac18658 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -250,6 +250,7 @@ extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns); extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); int proc_douserns_caps_whitelist(struct ctl_table *table, int write, void __user *buff, size_t *lenp, loff_t *ppos); +bool is_capability_controlled(int cap); extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size); diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index c18e01252346..e890fe81b47e 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -22,6 +22,7 @@ struct uid_gid_map { /* 64 bytes -- 1 cache line */ }; #define USERNS_SETGROUPS_ALLOWED 1UL +#define USERNS_CONTROLLED 2UL #define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED @@ -102,6 +103,16 @@ static inline void put_user_ns(struct user_namespace *ns) __put_user_ns(ns); } +static inline bool is_user_ns_controlled(const struct user_namespace *ns) +{ + return ns->flags & USERNS_CONTROLLED; +} + +static inline void mark_user_ns_controlled(struct user_namespace *ns) +{ + ns->flags |= USERNS_CONTROLLED; +} + struct seq_operations; extern const struct seq_operations proc_uid_seq_operations; extern const struct seq_operations proc_gid_seq_operations; @@ -160,6 +171,15 @@ static inline struct ns_common *ns_get_owner(struct ns_common *ns) { return ERR_PTR(-EPERM); } + +static inline bool is_user_ns_controlled(const struct user_namespace *ns) +{ + return false; +} + +static inline void mark_user_ns_controlled(struct user_namespace *ns) +{ +} #endif #endif /* _LINUX_USER_H */ diff --git a/kernel/capability.c b/kernel/capability.c index 62dbe3350c1b..40a38cc4ff43 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -510,6 +510,11 @@ bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns) } /* Controlled-userns capabilities routines */ +bool is_capability_controlled(int cap) +{ + return !cap_raised(controlled_userns_caps_whitelist, cap); +} + #ifdef CONFIG_SYSCTL int proc_douserns_caps_whitelist(struct ctl_table *table, int write, void __user *buff, size_t *lenp, loff_t *ppos) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index c490f1e4313b..f393ea5108f0 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -53,6 +53,9 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) cred->cap_effective = CAP_FULL_SET; cred->cap_ambient = CAP_EMPTY_SET; cred->cap_bset = CAP_FULL_SET; + if (!ns_capable(user_ns->parent, CAP_SYS_ADMIN) || + is_user_ns_controlled(user_ns->parent)) + mark_user_ns_controlled(user_ns); #ifdef CONFIG_KEYS key_put(cred->request_key_auth); cred->request_key_auth = NULL; diff --git a/security/commoncap.c b/security/commoncap.c index 6bf72b175b49..26f41602da10 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -73,6 +73,14 @@ int cap_capable(const struct cred *cred, struct user_namespace *targ_ns, { struct user_namespace *ns = targ_ns; + /* If the capability is controlled and user-ns that process +* belongs-to is 'controlled' then return EPE
[PATCH next] bonding: speed/duplex update at NETDEV_UP event
From: Mahesh Bandewar Some NIC drivers don't have correct speed/duplex settings at the time they send NETDEV_UP notification and that messes up the bonding state. Especially 802.3ad mode which is very sensitive to these settings. In the current implementation we invoke bond_update_speed_duplex() when we receive NETDEV_UP, however, ignore the return value. If the values we get are invalid (UNKNOWN), then slave gets removed from the aggregator with speed and duplex set to UNKNOWN while link is still marked as UP. This patch fixes this scenario. Also 802.3ad mode is sensitive to these conditions while other modes are not, so making sure that it doesn't change the behavior for other modes. Signed-off-by: Mahesh Bandewar --- drivers/net/bonding/bond_main.c | 11 ++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index b7313c1d9dcd..177be373966b 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -3076,7 +3076,16 @@ static int bond_slave_netdev_event(unsigned long event, break; case NETDEV_UP: case NETDEV_CHANGE: - bond_update_speed_duplex(slave); + /* For 802.3ad mode only: +* Getting invalid Speed/Duplex values here will put slave +* in weird state. So mark it as link-down for the time +* being and let link-monitoring (miimon) set it right when +* correct speeds/duplex are available. +*/ + if (bond_update_speed_duplex(slave) && + BOND_MODE(bond) == BOND_MODE_8023AD) + slave->link = BOND_LINK_DOWN; + if (BOND_MODE(bond) == BOND_MODE_8023AD) bond_3ad_adapter_speed_duplex_changed(slave); /* Fallthrough */ -- 2.14.2.822.g60be5d43e6-goog
[RFC PATCH 0/2] capability controlled user-namespaces
From: Mahesh Bandewar TL;DR version - Creating a sandbox environment with namespaces is challenging considering what these sandboxed processes can engage into. e.g. CVE-2017-6074, CVE-2017-7184, CVE-2017-7308 etc. just to name few. Current form of user-namespaces, however, if changed a bit can allow us to create a sandbox environment without locking down user- namespaces. Detailed version Problem --- User-namespaces in the current form have increased the attack surface as any process can acquire capabilities which are not available to them (by default) by performing combination of clone()/unshare()/setns() syscalls. #define _GNU_SOURCE #include #include #include int main(int ac, char **av) { int sock = -1; printf("Attempting to open RAW socket before unshare()...\n"); sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW); if (sock < 0) { perror("socket() SOCK_RAW failed: "); } else { printf("Successfully opened RAW-Sock before unshare().\n"); close(sock); sock = -1; } if (unshare(CLONE_NEWUSER | CLONE_NEWNET) < 0) { perror("unshare() failed: "); return 1; } printf("Attempting to open RAW socket after unshare()...\n"); sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW); if (sock < 0) { perror("socket() SOCK_RAW failed: "); } else { printf("Successfully opened RAW-Sock after unshare().\n"); close(sock); sock = -1; } return 0; } The above example shows how easy it is to acquire NET_RAW capabilities and once acquired, these processes could take benefit of above mentioned or similar issues discovered/undiscovered with malicious intent. Note that this is just an example and the problem/solution is not limited to NET_RAW capability *only*. The easiest fix one can apply here is to lock-down user-namespaces which many of the distros do (i.e. don't allow users to create user namespaces), but unfortunately that prevents everyone from using them. Approach Introduce a notion of 'controlled' user-namespaces. Every process on the host is allowed to create user-namespaces (governed by the limit imposed by per-ns sysctl) however, mark user-namespaces created by sandboxed processes as 'controlled'. Use this 'mark' at the time of capability check in conjunction with a global capability whitelist. If the capability is not whitelisted, processes that belong to controlled user-namespaces will not be allowed. Once a user-ns is marked as 'controlled'; all its child user- namespaces are marked as 'controlled' too. A global whitelist is list of capabilities governed by the sysctl which is available to (privileged) user in init-ns to modify while it's applicable to all controlled user-namespaces on the host. Marking user-namespaces controlled without modifying the whitelist is equivalent of the current behavior. The default value of whitelist includes all capabilities so that the compatibility is maintained. However it gives admins fine-grained ability to control various capabilities system wide without locking down user-namespaces. Please see individual patches in this series. Mahesh Bandewar (2): capability: introduce sysctl for controlled user-ns capability whitelist userns: control capabilities of some user namespaces Documentation/sysctl/kernel.txt | 21 + include/linux/capability.h | 4 include/linux/user_namespace.h | 20 kernel/capability.c | 52 + kernel/sysctl.c | 5 kernel/user_namespace.c | 3 +++ security/commoncap.c| 8 +++ 7 files changed, 113 insertions(+) -- 2.14.1.821.g8fa685d3b7-goog
[RFC PATCH 2/2] userns: control capabilities of some user namespaces
From: Mahesh Bandewar With this new notion of "controlled" user-namespaces, the controlled user-namespaces are marked at the time of their creation while the capabilities of processes that belong to them are controlled using the global mask. Init-user-ns is always uncontrolled and a process that has SYS_ADMIN that belongs to uncontrolled user-ns can create another (child) user- namespace that is uncontrolled. Any other process (that either does not have SYS_ADMIN or belongs to a controlled user-ns) can only create a user-ns that is controlled. global-capability-whitelist (controlled_userns_caps_whitelist) is used at the capability check-time and keeps the semantics for the processes that belong to uncontrolled user-ns as it is. Processes that belong to controlled user-ns however are subjected to different checks- (a) if the capability in question is controlled and process belongs to controlled user-ns, then it's always denied. (b) if the capability in question is NOT controlled then fall back to the traditional check. Signed-off-by: Mahesh Bandewar --- include/linux/capability.h | 1 + include/linux/user_namespace.h | 20 kernel/capability.c| 5 + kernel/user_namespace.c| 3 +++ security/commoncap.c | 8 5 files changed, 37 insertions(+) diff --git a/include/linux/capability.h b/include/linux/capability.h index 6c0b9677c03f..b8c6cac18658 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -250,6 +250,7 @@ extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns); extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); int proc_douserns_caps_whitelist(struct ctl_table *table, int write, void __user *buff, size_t *lenp, loff_t *ppos); +bool is_capability_controlled(int cap); extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size); diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index c18e01252346..e890fe81b47e 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -22,6 +22,7 @@ struct uid_gid_map { /* 64 bytes -- 1 cache line */ }; #define USERNS_SETGROUPS_ALLOWED 1UL +#define USERNS_CONTROLLED 2UL #define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED @@ -102,6 +103,16 @@ static inline void put_user_ns(struct user_namespace *ns) __put_user_ns(ns); } +static inline bool is_user_ns_controlled(const struct user_namespace *ns) +{ + return ns->flags & USERNS_CONTROLLED; +} + +static inline void mark_user_ns_controlled(struct user_namespace *ns) +{ + ns->flags |= USERNS_CONTROLLED; +} + struct seq_operations; extern const struct seq_operations proc_uid_seq_operations; extern const struct seq_operations proc_gid_seq_operations; @@ -160,6 +171,15 @@ static inline struct ns_common *ns_get_owner(struct ns_common *ns) { return ERR_PTR(-EPERM); } + +static inline bool is_user_ns_controlled(const struct user_namespace *ns) +{ + return false; +} + +static inline void mark_user_ns_controlled(struct user_namespace *ns) +{ +} #endif #endif /* _LINUX_USER_H */ diff --git a/kernel/capability.c b/kernel/capability.c index 62dbe3350c1b..40a38cc4ff43 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -510,6 +510,11 @@ bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns) } /* Controlled-userns capabilities routines */ +bool is_capability_controlled(int cap) +{ + return !cap_raised(controlled_userns_caps_whitelist, cap); +} + #ifdef CONFIG_SYSCTL int proc_douserns_caps_whitelist(struct ctl_table *table, int write, void __user *buff, size_t *lenp, loff_t *ppos) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index c490f1e4313b..f393ea5108f0 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -53,6 +53,9 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) cred->cap_effective = CAP_FULL_SET; cred->cap_ambient = CAP_EMPTY_SET; cred->cap_bset = CAP_FULL_SET; + if (!ns_capable(user_ns->parent, CAP_SYS_ADMIN) || + is_user_ns_controlled(user_ns->parent)) + mark_user_ns_controlled(user_ns); #ifdef CONFIG_KEYS key_put(cred->request_key_auth); cred->request_key_auth = NULL; diff --git a/security/commoncap.c b/security/commoncap.c index 6bf72b175b49..26f41602da10 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -73,6 +73,14 @@ int cap_capable(const struct cred *cred, struct user_namespace *targ_ns, { struct user_namespace *ns = targ_ns; + /* If the capability is controlled and user-ns that process +* belongs-to is 'controlled' then return EPE
[RFC PATCH 1/2] capability: introduce sysctl for controlled user-ns capability whitelist
From: Mahesh Bandewar Add a sysctl variable kernel.controlled_userns_caps_whitelist. This takes input as capability mask expressed as two comma separated hex u32 words. The mask, however, is stored in kernel as kernel_cap_t type. Any capabilities that are not part of this mask will be controlled and will not be allowed to processes in controlled user-ns. Signed-off-by: Mahesh Bandewar CC: Serge Hallyn CC: Kees Cook CC: "Eric W. Biederman" --- Documentation/sysctl/kernel.txt | 21 ++ include/linux/capability.h | 3 +++ kernel/capability.c | 47 + kernel/sysctl.c | 5 + 4 files changed, 76 insertions(+) diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index ce61d1fe08ca..ec0d74476f48 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -25,6 +25,7 @@ show up in /proc/sys/kernel: - bootloader_version[ X86 only ] - callhome [ S390 only ] - cap_last_cap +- controlled_userns_caps_whitelist - core_pattern - core_pipe_limit - core_uses_pid @@ -186,6 +187,26 @@ CAP_LAST_CAP from the kernel. == +controlled_userns_caps_whitelist + +Capability mask that is whitelisted for "controlled" user namespaces. +Any capability that is missing from this mask will not be allowed to +any process that is attached to a controlled-userns. e.g. if CAP_NET_RAW +is not part of this mask, then processes running inside any controlled +userns's will not be allowed to perform action that needs CAP_NET_RAW +capability. However, processes that are attached to a parent user-ns +hierarchy that is *not* controlled and has CAP_NET_RAW can continue +performing those actions. User-namespaces are marked "controlled" at +the time of their creation based on the capabilities of the creator. +A process that does not have CAP_SYS_ADMIN will create user-namespaces +that are controlled. + +The value is expressed as two comma separated hex words (u32). This +sysctl is avaialble in init-ns and users with CAP_SYS_ADMIN in init-ns +are allowed to make changes. + +== + core_pattern: core_pattern is used to specify a core dumpfile pattern name. diff --git a/include/linux/capability.h b/include/linux/capability.h index b52e278e4744..6c0b9677c03f 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -13,6 +13,7 @@ #define _LINUX_CAPABILITY_H #include +#include #define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3 @@ -247,6 +248,8 @@ extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns); /* audit system wants to get cap info from files as well */ extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); +int proc_douserns_caps_whitelist(struct ctl_table *table, int write, +void __user *buff, size_t *lenp, loff_t *ppos); extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size); diff --git a/kernel/capability.c b/kernel/capability.c index f97fe77ceb88..62dbe3350c1b 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -28,6 +28,8 @@ EXPORT_SYMBOL(__cap_empty_set); int file_caps_enabled = 1; +kernel_cap_t controlled_userns_caps_whitelist = CAP_FULL_SET; + static int __init file_caps_disable(char *str) { file_caps_enabled = 0; @@ -506,3 +508,48 @@ bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns) rcu_read_unlock(); return (ret == 0); } + +/* Controlled-userns capabilities routines */ +#ifdef CONFIG_SYSCTL +int proc_douserns_caps_whitelist(struct ctl_table *table, int write, +void __user *buff, size_t *lenp, loff_t *ppos) +{ + DECLARE_BITMAP(caps_bitmap, CAP_LAST_CAP); + struct ctl_table caps_table; + char tbuf[NAME_MAX]; + int ret; + + ret = bitmap_from_u32array(caps_bitmap, CAP_LAST_CAP, + controlled_userns_caps_whitelist.cap, + _KERNEL_CAPABILITY_U32S); + if (ret != CAP_LAST_CAP) + return -1; + + scnprintf(tbuf, NAME_MAX, "%*pb", CAP_LAST_CAP, caps_bitmap); + + caps_table.data = tbuf; + caps_table.maxlen = NAME_MAX; + caps_table.mode = table->mode; + ret = proc_dostring(&caps_table, write, buff, lenp, ppos); + if (ret) + return ret; + if (write) { + kernel_cap_t tmp; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = bitmap_parse_user(buff, *lenp, caps_bitmap, CAP_LAST_CAP); + if (ret) + return ret; + + ret =
[PATCH next] neigh: initialize neigh entry correctly during arp processing
From: Mahesh Bandewar If the ARP processing creates a neigh entry, it's immediately marked as STALE without timer and stays that way in that state as long as host do not send traffic to that neighbour. I observed this on hosts which are in IPv6 environment, where there is very little to no IPv4 traffic and neigh-entries are stuck in STALE mode. Ideally, the host should have PROBEd these neighbours before it can send the first packet out. It happens as a result of following call sequence in an environment where host is mostly quiet as far as IPv4 traffic but few connected hosts/gateways are sending ARPs. arp_process() neigh_event_ns() neigh_lookup() neigh_create() neigh_alloc() nud_state=NUD_NONE neigh_update(nud_state=NUD_STALE) In the above scenario, the neighbour entry does not get a chance to get PROBEd as subsequent call to neigh_update() marks this entry STALE. This patch initializes the neigh-entry correctly if it was created as a result of neigh_lookup instead of just updating it in neigh_event_ns() right after creating it. Signed-off-by: Mahesh Bandewar --- net/core/neighbour.c | 10 +++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 16a1a4c4eb57..d8a35db6c43b 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1300,9 +1300,13 @@ struct neighbour *neigh_event_ns(struct neigh_table *tbl, { struct neighbour *neigh = __neigh_lookup(tbl, saddr, dev, lladdr || !dev->addr_len); - if (neigh) - neigh_update(neigh, lladdr, NUD_STALE, -NEIGH_UPDATE_F_OVERRIDE, 0); + if (neigh) { + if (neigh->nud_state & NUD_VALID) + neigh_update(neigh, lladdr, NUD_STALE, +NEIGH_UPDATE_F_OVERRIDE, 0); + else + neigh_event_send(neigh, NULL); + } return neigh; } EXPORT_SYMBOL(neigh_event_ns); -- 2.14.1.480.gb18f417b89-goog
[PATCH 1/3] ipv4: initialize fib_trie prior to register_netdev_notifier call.
From: Mahesh Bandewar Net stack initialization currently initializes fib-trie after the first call to netdevice_notifier() call. In fact fib_trie initialization needs to happen before first rtnl_register(). It does not cause any problem since there are no devices UP at this moment, but trying to bring 'lo' UP at initialization would make this assumption wrong and exposes the issue. Fixes following crash Call Trace: ? alternate_node_alloc+0x76/0xa0 fib_table_insert+0x1b7/0x4b0 fib_magic.isra.17+0xea/0x120 fib_add_ifaddr+0x7b/0x190 fib_netdev_event+0xc0/0x130 register_netdevice_notifier+0x1c1/0x1d0 ip_fib_init+0x72/0x85 ip_rt_init+0x187/0x1e9 ip_init+0xe/0x1a inet_init+0x171/0x26c ? ipv4_offload_init+0x66/0x66 do_one_initcall+0x43/0x160 kernel_init_freeable+0x191/0x219 ? rest_init+0x80/0x80 kernel_init+0xe/0x150 ret_from_fork+0x22/0x30 Code: f6 46 23 04 74 86 4c 89 f7 e8 ae 45 01 00 49 89 c7 4d 85 ff 0f 85 7b ff ff ff 31 db eb 08 4c 89 ff e8 16 47 01 00 48 8b 44 24 38 <45> 8b 6e 14 4d 63 76 74 48 89 04 24 0f 1f 44 00 00 48 83 c4 08 RIP: kmem_cache_alloc+0xcf/0x1c0 RSP: 9b1500017c28 CR2: 0014 Fixes: 7b1a74fdbb9e ("[NETNS]: Refactor fib initialization so it can handle multiple namespaces.") Fixes: 7f9b80529b8a ("[IPV4]: fib hash|trie initialization") Signed-off-by: Mahesh Bandewar --- net/ipv4/fib_frontend.c | 9 + 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 4e678fa892dd..044d2a159a3c 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1334,13 +1334,14 @@ static struct pernet_operations fib_net_ops = { void __init ip_fib_init(void) { - rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL); - rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL); - rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL); + fib_trie_init(); register_pernet_subsys(&fib_net_ops); + register_netdevice_notifier(&fib_netdev_notifier); register_inetaddr_notifier(&fib_inetaddr_notifier); - fib_trie_init(); + rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL); + rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL); + rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL); } -- 2.14.0.rc0.284.gd933b75aa4-goog
[PATCH 1/2] ipv4: initialize fib_trie prior to register_netdev_notifier call.
From: Mahesh Bandewar Net stack initialization currently initializes fib-trie after the first call to netdevice_notifier() call. It does not cause any problem since there are no devices UP at this moment, but trying to bring 'lo' UP at initialization would make this assumption wrong. However changing order solves the issue. Fixes following crash [8.647095] BUG: unable to handle kernel NULL pointer dereference at 0014 [8.654836] IP: kmem_cache_alloc+0xcf/0x1c0 [8.658966] PGD 0 [8.658967] P4D 0 [8.660953] [8.664406] Oops: [#1] SMP [8.667505] Modules linked in: [8.670519] CPU: 4 PID: 1 Comm: swapper/0 Not tainted 4.12.0-smp-DEV #34 [8.677138] Hardware name: Intel Grantley,Wellsburg/Ixion_IT_15, BIOS 2.60.0 06/03/2016 [8.685043] task: 8f26b2b00dc0 task.stack: 9b1500014000 [8.690891] RIP: 0010:kmem_cache_alloc+0xcf/0x1c0 [8.695535] RSP: :9b1500017c28 EFLAGS: 00010246 [8.700696] RAX: 96ed4f47 RBX: RCX: 0200 [8.707743] RDX: 0400 RSI: 014000c0 RDI: [8.714791] RBP: 9b1500017c58 R08: 0001 R09: [8.721837] R10: R11: R12: 014000c0 [8.728886] R13: 014000c0 R14: R15: [8.735932] FS: () GS:8f26bf50() knlGS: [8.743925] CS: 0010 DS: ES: CR0: 80050033 [8.749599] CR2: 0014 CR3: 001d05609000 CR4: 001406e0 [8.756648] Call Trace: [8.759061] ? alternate_node_alloc+0x76/0xa0 [8.763364] fib_table_insert+0x1b7/0x4b0 [8.767323] fib_magic.isra.17+0xea/0x120 [8.771283] fib_add_ifaddr+0x7b/0x190 [8.774983] fib_netdev_event+0xc0/0x130 [8.778857] register_netdevice_notifier+0x1c1/0x1d0 [8.783761] ip_fib_init+0x72/0x85 [8.787120] ip_rt_init+0x187/0x1e9 [8.790564] ip_init+0xe/0x1a [8.793494] inet_init+0x171/0x26c [8.796851] ? ipv4_offload_init+0x66/0x66 [8.800896] do_one_initcall+0x43/0x160 [8.804685] kernel_init_freeable+0x191/0x219 [8.808987] ? rest_init+0x80/0x80 [8.812343] kernel_init+0xe/0x150 [8.815702] ret_from_fork+0x22/0x30 [8.819229] Code: f6 46 23 04 74 86 4c 89 f7 e8 ae 45 01 00 49 89 c7 4d 85 ff 0f 85 7b ff ff ff 31 db eb 08 4c 89 ff e8 16 47 01 00 48 8b 44 24 38 <45> 8b 6e 14 4d 63 76 74 48 89 04 24 0f 1f 44 00 00 48 83 c4 08 [8.837882] RIP: kmem_cache_alloc+0xcf/0x1c0 RSP: 9b1500017c28 [8.843986] CR2: 0014 [8.847288] ---[ end trace 8fc60994c8a367cb ]--- [8.851847] Kernel panic - not syncing: Fatal exception [8.857035] Rebooting in 10 seconds.. Signed-off-by: Mahesh Bandewar --- net/ipv4/fib_frontend.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 4e678fa892dd..ca22ca47019a 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1338,9 +1338,9 @@ void __init ip_fib_init(void) rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL); rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL); + fib_trie_init(); + register_pernet_subsys(&fib_net_ops); register_netdevice_notifier(&fib_netdev_notifier); register_inetaddr_notifier(&fib_inetaddr_notifier); - - fib_trie_init(); } -- 2.13.2.725.g09c95d1e9-goog
[PATCH 2/2] loopback: bringup 'lo' by default at initialization
From: Mahesh Bandewar loopback devices are always brought up right after its initialization including the case of network namespace creation. e.g. ip netns add foo ip -netns foo link set lo up This patch will eliminate the need to do that separately and would bring it up as part of the loopback initialization. Signed-off-by: Mahesh Bandewar --- drivers/net/loopback.c | 6 ++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c index 30612497643c..3216977935d1 100644 --- a/drivers/net/loopback.c +++ b/drivers/net/loopback.c @@ -217,6 +217,12 @@ static __net_init int loopback_net_init(struct net *net) BUG_ON(dev->ifindex != LOOPBACK_IFINDEX); net->loopback_dev = dev; + + /* Set the loopback device UP */ + rtnl_lock(); + dev_open(net->loopback_dev); + rtnl_unlock(); + return 0; out_free_netdev: -- 2.13.2.725.g09c95d1e9-goog
[PATCH 0/2] bring UP loopback device at initialziation
From: Mahesh Bandewar In almost every scenario the loopback device is brought UP after initialization. So there is no point of bringing up the device in DOWN state followed by device UP operation. This change exposed another issue of fib-trie initialization which is corrected in the first path. Mahesh Bandewar (2): ipv4: initialize fib_trie prior to register_netdev_notifier call. loopback: bringup 'lo' by default at initialization drivers/net/loopback.c | 6 ++ net/ipv4/fib_frontend.c | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) -- 2.13.2.725.g09c95d1e9-goog
[PATCH net] ipv6: avoid dad-failures for addresses with NODAD
From: Mahesh Bandewar Every address gets added with TENTATIVE flag even for the addresses with IFA_F_NODAD flag and dad-work is scheduled for them. During this DAD process we realize it's an address with NODAD and complete the process without sending any probe. However the TENTATIVE flags stays on the address for sometime enough to cause misinterpretation when we receive a NS. While processing NS, if the address has TENTATIVE flag, we mark it DADFAILED and endup with an address that was originally configured as NODAD with DADFAILED. We can't avoid scheduling dad_work for addresses with NODAD but we can avoid adding TENTATIVE flag to avoid this racy situation. Signed-off-by: Mahesh Bandewar --- net/ipv6/addrconf.c | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index b09ac38d8dc4..53f2dc092023 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1022,7 +1022,10 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, INIT_HLIST_NODE(&ifa->addr_lst); ifa->scope = scope; ifa->prefix_len = pfxlen; - ifa->flags = flags | IFA_F_TENTATIVE; + ifa->flags = flags; + /* No need to add the TENTATIVE flag for addresses with NODAD */ + if (!(flags & IFA_F_NODAD)) + ifa->flags |= IFA_F_TENTATIVE; ifa->valid_lft = valid_lft; ifa->prefered_lft = prefered_lft; ifa->cstamp = ifa->tstamp = jiffies; -- 2.13.0.rc2.291.g57267f2277-goog
[PATCH] kmod: don't load module unless req process has CAP_SYS_MODULE
From: Mahesh Bandewar A process inside random user-ns should not load a module, which is currently possible. As demonstrated in following scenario - Create namespaces; especially a user-ns and become root inside. $ unshare -rfUp -- unshare -unm -- bash Try to load the bridge module. It should fail and this is expected! # modprobe bridge WARNING: Error inserting stp (/lib/modules/4.11.0-smp-DEV/kernel/net/802/stp.ko): Operation not permitted FATAL: Error inserting bridge (/lib/modules/4.11.0-smp-DEV/kernel/net/bridge/bridge.ko): Operation not permitted Verify bridge module is not loaded. # lsmod | grep bridge # Now try to create a bridge inside this newly created net-ns which would mean bridge module need to be loaded. # ip link add br0 type bridge # echo $? 0 # lsmod | grep bridge bridge110592 0 stp16384 1 bridge llc16384 2 bridge,stp # After this patch - # ip link add br0 type bridge RTNETLINK answers: Operation not supported # echo $? 2 # lsmod | grep bridge # Signed-off-by: Mahesh Bandewar --- kernel/kmod.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/kmod.c b/kernel/kmod.c index 563f97e2be36..ac30157169b7 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -133,6 +133,9 @@ int __request_module(bool wait, const char *fmt, ...) #define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ static int kmod_loop_msg; + if (!capable(CAP_SYS_MODULE)) + return -EPERM; + /* * We don't allow synchronous module loading from async. Module * init may invoke async_synchronize_full() which will end up -- 2.13.0.rc2.291.g57267f2277-goog
[PATCHv2 next] bonding: fix wq initialization for links created via netlink
From: Mahesh Bandewar Earlier patch 4493b81bea ("bonding: initialize work-queues during creation of bond") moved the work-queue initialization from bond_open() to bond_create(). However this caused the link those are created using netlink 'create bond option' (ip link add bondX type bond); create the new trunk without initializing work-queues. Prior to the above mentioned change, ndo_open was in both paths and things worked correctly. The consequence is visible in the report shared by Joe Stringer - I've noticed that this patch breaks bonding within namespaces if you're not careful to perform device cleanup correctly. Here's my repro script, you can run on any net-next with this patch and you'll start seeing some weird behaviour: ip netns add foo ip li add veth0 type veth peer name veth0+ netns foo ip li add veth1 type veth peer name veth1+ netns foo ip netns exec foo ip li add bond0 type bond ip netns exec foo ip li set dev veth0+ master bond0 ip netns exec foo ip li set dev veth1+ master bond0 ip netns exec foo ip addr add dev bond0 192.168.0.1/24 ip netns exec foo ip li set dev bond0 up ip li del dev veth0 ip li del dev veth1 The second to last command segfaults, last command hangs. rtnl is now permanently locked. It's not a problem if you take bond0 down before deleting veths, or delete bond0 before deleting veths. If you delete either end of the veth pair as per above, either inside or outside the namespace, it hits this problem. Here's some kernel logs: [ 1221.801610] bond0: Enslaving veth0+ as an active interface with an up link [ 1224.449581] bond0: Enslaving veth1+ as an active interface with an up link [ 1281.193863] bond0: Releasing backup interface veth0+ [ 1281.193866] bond0: the permanent HWaddr of veth0+ - 16:bf:fb:e0:b8:43 - is still in use by bond0 - set the HWaddr of veth0+ to a different address to avoid conflicts [ 1281.193867] [ cut here ] [ 1281.193873] WARNING: CPU: 0 PID: 2024 at kernel/workqueue.c:1511 __queue_delayed_work+0x13f/0x150 [ 1281.193873] Modules linked in: bonding veth openvswitch nf_nat_ipv6 nf_nat_ipv4 nf_nat autofs4 nfsd auth_rpcgss nfs_acl binfmt_misc nfs lockd grace sunrpc fscache ppdev vmw_balloon coretemp psmouse serio_raw vmwgfx ttm drm_kms_helper vmw_vmci netconsole parport_pc configfs drm i2c_piix4 fb_sys_fops syscopyarea sysfillrect sysimgblt shpchp mac_hid nf_conntrack_ipv6 nf_defrag_ipv6 nf_conntrack_ipv4 nf_defrag_ipv4 nf_conntrack libcrc32c lp parport hid_generic usbhid hid mptspi mptscsih e1000 mptbase ahci libahci [ 1281.193905] CPU: 0 PID: 2024 Comm: ip Tainted: GW 4.10.0-bisect-bond-v0.14 #37 [ 1281.193906] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 09/30/2014 [ 1281.193906] Call Trace: [ 1281.193912] dump_stack+0x63/0x89 [ 1281.193915] __warn+0xd1/0xf0 [ 1281.193917] warn_slowpath_null+0x1d/0x20 [ 1281.193918] __queue_delayed_work+0x13f/0x150 [ 1281.193920] queue_delayed_work_on+0x27/0x40 [ 1281.193929] bond_change_active_slave+0x25b/0x670 [bonding] [ 1281.193932] ? synchronize_rcu_expedited+0x27/0x30 [ 1281.193935] __bond_release_one+0x489/0x510 [bonding] [ 1281.193939] ? addrconf_notify+0x1b7/0xab0 [ 1281.193942] bond_netdev_event+0x2c5/0x2e0 [bonding] [ 1281.193944] ? netconsole_netdev_event+0x124/0x190 [netconsole] [ 1281.193947] notifier_call_chain+0x49/0x70 [ 1281.193948] raw_notifier_call_chain+0x16/0x20 [ 1281.193950] call_netdevice_notifiers_info+0x35/0x60 [ 1281.193951] rollback_registered_many+0x23b/0x3e0 [ 1281.193953] unregister_netdevice_many+0x24/0xd0 [ 1281.193955] rtnl_delete_link+0x3c/0x50 [ 1281.193956] rtnl_dellink+0x8d/0x1b0 [ 1281.193960] rtnetlink_rcv_msg+0x95/0x220 [ 1281.193962] ? __kmalloc_node_track_caller+0x35/0x280 [ 1281.193964] ? __netlink_lookup+0xf1/0x110 [ 1281.193966] ? rtnl_newlink+0x830/0x830 [ 1281.193967] netlink_rcv_skb+0xa7/0xc0 [ 1281.193969] rtnetlink_rcv+0x28/0x30 [ 1281.193970] netlink_unicast+0x15b/0x210 [ 1281.193971] netlink_sendmsg+0x319/0x390 [ 1281.193974] sock_sendmsg+0x38/0x50 [ 1281.193975] ___sys_sendmsg+0x25c/0x270 [ 1281.193978] ? mem_cgroup_commit_charge+0x76/0xf0 [ 1281.193981] ? page_add_new_anon_rmap+0x89/0xc0 [ 1281.193984] ? lru_cache_add_active_or_unevictable+0x35/0xb0 [ 1281.193985] ? __handle_mm_fault+0x4e9/0x1170 [ 1281.193987] __sys_sendmsg+0x45/0x80 [ 1281.193989] SyS_sendmsg+0x12/0x20 [ 1281.193991] do_syscall_64+0x6e/0x180 [ 1281.193993] entry_SYSCALL64_slow_path+0x25/0x25 [ 1281.193995] RIP: 0033:0x7f6ec122f5a0 [ 1281.193995] RSP: 002b:7ffe69e89c48 EFLAGS: 0246 ORIG_RAX: 002e [ 1281.193997] RAX: ffda RBX: 7ffe69e8dd60 RCX: 7f6ec122f5a0 [ 1281.193997] RDX: RSI: 7ffe69e89c90 RDI: 0003 [ 1281.193998] RBP: 7ffe69e89c90 R08: R09: 0003 [ 1281.193999] R10: 7ffe69e89a10 R11: 0246 R12: 000
[PATCH next] bonding: fix wq initialization for links created via netlink
From: Mahesh Bandewar Earlier patch 4493b81bea ("bonding: initialize work-queues during creation of bond") moved the work-queue initialization from bond_open() to bond_create(). However this caused the link those are created using netlink 'create bond option' (ip link add bondX type bond); create the new trunk without initializing work-queues. Prior to the above mentioned change, ndo_open was in both paths and things worked correctly. The consequence is visible in the report shared by Joe Stringer - I've noticed that this patch breaks bonding within namespaces if you're not careful to perform device cleanup correctly. Here's my repro script, you can run on any net-next with this patch and you'll start seeing some weird behaviour: ip netns add foo ip li add veth0 type veth peer name veth0+ netns foo ip li add veth1 type veth peer name veth1+ netns foo ip netns exec foo ip li add bond0 type bond ip netns exec foo ip li set dev veth0+ master bond0 ip netns exec foo ip li set dev veth1+ master bond0 ip netns exec foo ip addr add dev bond0 192.168.0.1/24 ip netns exec foo ip li set dev bond0 up ip li del dev veth0 ip li del dev veth1 The second to last command segfaults, last command hangs. rtnl is now permanently locked. It's not a problem if you take bond0 down before deleting veths, or delete bond0 before deleting veths. If you delete either end of the veth pair as per above, either inside or outside the namespace, it hits this problem. Here's some kernel logs: [ 1221.801610] bond0: Enslaving veth0+ as an active interface with an up link [ 1224.449581] bond0: Enslaving veth1+ as an active interface with an up link [ 1281.193863] bond0: Releasing backup interface veth0+ [ 1281.193866] bond0: the permanent HWaddr of veth0+ - 16:bf:fb:e0:b8:43 - is still in use by bond0 - set the HWaddr of veth0+ to a different address to avoid conflicts [ 1281.193867] [ cut here ] [ 1281.193873] WARNING: CPU: 0 PID: 2024 at kernel/workqueue.c:1511 __queue_delayed_work+0x13f/0x150 [ 1281.193873] Modules linked in: bonding veth openvswitch nf_nat_ipv6 nf_nat_ipv4 nf_nat autofs4 nfsd auth_rpcgss nfs_acl binfmt_misc nfs lockd grace sunrpc fscache ppdev vmw_balloon coretemp psmouse serio_raw vmwgfx ttm drm_kms_helper vmw_vmci netconsole parport_pc configfs drm i2c_piix4 fb_sys_fops syscopyarea sysfillrect sysimgblt shpchp mac_hid nf_conntrack_ipv6 nf_defrag_ipv6 nf_conntrack_ipv4 nf_defrag_ipv4 nf_conntrack libcrc32c lp parport hid_generic usbhid hid mptspi mptscsih e1000 mptbase ahci libahci [ 1281.193905] CPU: 0 PID: 2024 Comm: ip Tainted: GW 4.10.0-bisect-bond-v0.14 #37 [ 1281.193906] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 09/30/2014 [ 1281.193906] Call Trace: [ 1281.193912] dump_stack+0x63/0x89 [ 1281.193915] __warn+0xd1/0xf0 [ 1281.193917] warn_slowpath_null+0x1d/0x20 [ 1281.193918] __queue_delayed_work+0x13f/0x150 [ 1281.193920] queue_delayed_work_on+0x27/0x40 [ 1281.193929] bond_change_active_slave+0x25b/0x670 [bonding] [ 1281.193932] ? synchronize_rcu_expedited+0x27/0x30 [ 1281.193935] __bond_release_one+0x489/0x510 [bonding] [ 1281.193939] ? addrconf_notify+0x1b7/0xab0 [ 1281.193942] bond_netdev_event+0x2c5/0x2e0 [bonding] [ 1281.193944] ? netconsole_netdev_event+0x124/0x190 [netconsole] [ 1281.193947] notifier_call_chain+0x49/0x70 [ 1281.193948] raw_notifier_call_chain+0x16/0x20 [ 1281.193950] call_netdevice_notifiers_info+0x35/0x60 [ 1281.193951] rollback_registered_many+0x23b/0x3e0 [ 1281.193953] unregister_netdevice_many+0x24/0xd0 [ 1281.193955] rtnl_delete_link+0x3c/0x50 [ 1281.193956] rtnl_dellink+0x8d/0x1b0 [ 1281.193960] rtnetlink_rcv_msg+0x95/0x220 [ 1281.193962] ? __kmalloc_node_track_caller+0x35/0x280 [ 1281.193964] ? __netlink_lookup+0xf1/0x110 [ 1281.193966] ? rtnl_newlink+0x830/0x830 [ 1281.193967] netlink_rcv_skb+0xa7/0xc0 [ 1281.193969] rtnetlink_rcv+0x28/0x30 [ 1281.193970] netlink_unicast+0x15b/0x210 [ 1281.193971] netlink_sendmsg+0x319/0x390 [ 1281.193974] sock_sendmsg+0x38/0x50 [ 1281.193975] ___sys_sendmsg+0x25c/0x270 [ 1281.193978] ? mem_cgroup_commit_charge+0x76/0xf0 [ 1281.193981] ? page_add_new_anon_rmap+0x89/0xc0 [ 1281.193984] ? lru_cache_add_active_or_unevictable+0x35/0xb0 [ 1281.193985] ? __handle_mm_fault+0x4e9/0x1170 [ 1281.193987] __sys_sendmsg+0x45/0x80 [ 1281.193989] SyS_sendmsg+0x12/0x20 [ 1281.193991] do_syscall_64+0x6e/0x180 [ 1281.193993] entry_SYSCALL64_slow_path+0x25/0x25 [ 1281.193995] RIP: 0033:0x7f6ec122f5a0 [ 1281.193995] RSP: 002b:7ffe69e89c48 EFLAGS: 0246 ORIG_RAX: 002e [ 1281.193997] RAX: ffda RBX: 7ffe69e8dd60 RCX: 7f6ec122f5a0 [ 1281.193997] RDX: RSI: 7ffe69e89c90 RDI: 0003 [ 1281.193998] RBP: 7ffe69e89c90 R08: R09: 0003 [ 1281.193999] R10: 7ffe69e89a10 R11: 0246 R12: 000
[PATCH next] bonding: handle link transition from FAIL to UP correctly
From: Mahesh Bandewar When link transitions from LINK_FAIL to LINK_UP, the commit phase is not called. This leads to an erroneous state causing slave-link state to get stuck in "going down" state while its speed and duplex are perfectly fine. This issue is a side-effect of splitting link-set into propose and commit phases introduced by de77ecd4ef02 ("bonding: improve link-status update in mii-monitoring") This patch fixes these issues by calling commit phase whenever link state change is proposed. Fixes: de77ecd4ef02 ("bonding: improve link-status update in mii-monitoring") Signed-off-by: Mahesh Bandewar --- drivers/net/bonding/bond_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index aba7352906a5..01e4a69af421 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2064,6 +2064,7 @@ static int bond_miimon_inspect(struct bonding *bond) (bond->params.downdelay - slave->delay) * bond->params.miimon, slave->dev->name); + commit++; continue; } @@ -2098,7 +2099,7 @@ static int bond_miimon_inspect(struct bonding *bond) (bond->params.updelay - slave->delay) * bond->params.miimon, slave->dev->name); - + commit++; continue; } -- 2.12.2.715.g7642488e1d-goog
[PATCH next] bonding: fix active-backup transition
From: Mahesh Bandewar Earlier patch c4adfc822bf5 ("bonding: make speed, duplex setting consistent with link state") made an attempt to keep slave state consistent with speed and duplex settings. Unfortunately link-state transition is used to change the active link especially when used in conjunction with mii-mon. The above mentioned patch broke that logic. Also when speed and duplex settings for a link are updated during a link-event, the link-status should not be changed to invoke correct transition logic. This patch fixes this issue by moving the link-state update outside of the bond_update_speed_duplex() fn and to the places where this fn is called and update link-state selectively. Fixes: c4adfc822bf5 ("bonding: make speed, duplex setting consistent with link state") Signed-off-by: Mahesh Bandewar --- drivers/net/bonding/bond_main.c | 13 + 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 27359dab78a1..535388b15cde 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -378,20 +378,15 @@ static int bond_update_speed_duplex(struct slave *slave) slave->duplex = DUPLEX_UNKNOWN; res = __ethtool_get_link_ksettings(slave_dev, &ecmd); - if (res < 0) { - slave->link = BOND_LINK_DOWN; + if (res < 0) return 1; - } - if (ecmd.base.speed == 0 || ecmd.base.speed == ((__u32)-1)) { - slave->link = BOND_LINK_DOWN; + if (ecmd.base.speed == 0 || ecmd.base.speed == ((__u32)-1)) return 1; - } switch (ecmd.base.duplex) { case DUPLEX_FULL: case DUPLEX_HALF: break; default: - slave->link = BOND_LINK_DOWN; return 1; } @@ -1563,7 +1558,8 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev) new_slave->delay = 0; new_slave->link_failure_count = 0; - bond_update_speed_duplex(new_slave); + if (bond_update_speed_duplex(new_slave)) + new_slave->link = BOND_LINK_DOWN; new_slave->last_rx = jiffies - (msecs_to_jiffies(bond->params.arp_interval) + 1); @@ -2126,6 +2122,7 @@ static void bond_miimon_commit(struct bonding *bond) case BOND_LINK_UP: if (bond_update_speed_duplex(slave)) { + slave->link = BOND_LINK_DOWN; netdev_warn(bond->dev, "failed to get link speed/duplex for %s\n", slave->dev->name); -- 2.12.2.715.g7642488e1d-goog
[PATCH next 4/5] bonding: correctly update link status during mii-commit phase
From: Mahesh Bandewar bond_miimon_commit() marks the link UP after attempting to get the speed and duplex settings for the link. There is a possibility that bond_update_speed_duplex() could fail. This is another place where it could result into an inconsistent bonding link state. With this patch the link will be marked UP only if the speed and duplex values retrieved have sane values and processed further. Signed-off-by: Mahesh Bandewar --- drivers/net/bonding/bond_main.c | 7 ++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index ad317bb63193..6cea964ab70a 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2125,7 +2125,12 @@ static void bond_miimon_commit(struct bonding *bond) continue; case BOND_LINK_UP: - bond_update_speed_duplex(slave); + if (bond_update_speed_duplex(slave)) { + netdev_warn(bond->dev, + "failed to get link speed/duplex for %s\n", + slave->dev->name); + continue; + } bond_set_slave_link_state(slave, BOND_LINK_UP, BOND_SLAVE_NOTIFY_NOW); slave->last_link_up = jiffies; -- 2.12.1.578.ge9c3154ca4-goog
[PATCH next 5/5] bonding: avoid printing while holding a spinlock
From: Mahesh Bandewar Signed-off-by: Mahesh Bandewar --- drivers/net/bonding/bond_3ad.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index 508713b4e533..c5fd4259da33 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -2446,9 +2446,9 @@ void bond_3ad_adapter_speed_duplex_changed(struct slave *slave) spin_lock_bh(&slave->bond->mode_lock); ad_update_actor_keys(port, false); + spin_unlock_bh(&slave->bond->mode_lock); netdev_dbg(slave->bond->dev, "Port %d slave %s changed speed/duplex\n", port->actor_port_number, slave->dev->name); - spin_unlock_bh(&slave->bond->mode_lock); } /** @@ -2492,12 +2492,12 @@ void bond_3ad_handle_link_change(struct slave *slave, char link) agg = __get_first_agg(port); ad_agg_selection_logic(agg, &dummy); + spin_unlock_bh(&slave->bond->mode_lock); + netdev_dbg(slave->bond->dev, "Port %d changed link status to %s\n", port->actor_port_number, link == BOND_LINK_UP ? "UP" : "DOWN"); - spin_unlock_bh(&slave->bond->mode_lock); - /* RTNL is held and mode_lock is released so it's safe * to update slave_array here. */ -- 2.12.1.578.ge9c3154ca4-goog
[PATCH next 2/5] bonding: improve link-status update in mii-monitoring
From: Mahesh Bandewar The primary issue is that mii-inspect phase updates link-state and expects changes to be committed during the mii-commit phase. After the inspect phase if it fails to acquire rtnl-mutex, the commit phase (bond_mii_commit) doesn't get to run. This partially updated state stays and makes the internal-state inconsistent. e.g. setup bond0 => slaves: eth1, eth2 eth1 goes DOWN -> UP mii_monitor() mii-inspect() bond_set_slave_link_state(eth1, UP, DontNotify) rtnl_trylock() <- fails! Next mii-monitor round eth1: No change mii_monitor() mii-inspect() eth1->link == current-status (ethtool_ops->get_link) no-change-detected End result: eth1: Link = BOND_LINK_UP Speed = 0xf [SpeedUnknown] Duplex = 0xff[DuplexUnknown] This doesn't always happen but for some unlucky machines in a large set of machines it creates problems. The fix for this is to avoid making changes during inspect phase and postpone them until acquiring the rtnl-mutex / invoking commit phase. Signed-off-by: Mahesh Bandewar --- drivers/net/bonding/bond_main.c | 18 +- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index ba934020dfaa..85999e479916 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2033,8 +2033,7 @@ static int bond_miimon_inspect(struct bonding *bond) if (link_state) continue; - bond_set_slave_link_state(slave, BOND_LINK_FAIL, - BOND_SLAVE_NOTIFY_LATER); + bond_propose_link_state(slave, BOND_LINK_FAIL); slave->delay = bond->params.downdelay; if (slave->delay) { netdev_info(bond->dev, "link status down for %sinterface %s, disabling it in %d ms\n", @@ -2049,8 +2048,7 @@ static int bond_miimon_inspect(struct bonding *bond) case BOND_LINK_FAIL: if (link_state) { /* recovered before downdelay expired */ - bond_set_slave_link_state(slave, BOND_LINK_UP, - BOND_SLAVE_NOTIFY_LATER); + bond_propose_link_state(slave, BOND_LINK_UP); slave->last_link_up = jiffies; netdev_info(bond->dev, "link status up again after %d ms for interface %s\n", (bond->params.downdelay - slave->delay) * @@ -2072,8 +2070,7 @@ static int bond_miimon_inspect(struct bonding *bond) if (!link_state) continue; - bond_set_slave_link_state(slave, BOND_LINK_BACK, - BOND_SLAVE_NOTIFY_LATER); + bond_propose_link_state(slave, BOND_LINK_BACK); slave->delay = bond->params.updelay; if (slave->delay) { @@ -2086,9 +2083,7 @@ static int bond_miimon_inspect(struct bonding *bond) /*FALLTHRU*/ case BOND_LINK_BACK: if (!link_state) { - bond_set_slave_link_state(slave, - BOND_LINK_DOWN, - BOND_SLAVE_NOTIFY_LATER); + bond_propose_link_state(slave, BOND_LINK_DOWN); netdev_info(bond->dev, "link status down again after %d ms for interface %s\n", (bond->params.updelay - slave->delay) * bond->params.miimon, @@ -2225,6 +2220,8 @@ static void bond_mii_monitor(struct work_struct *work) mii_work.work); bool should_notify_peers = false; unsigned long delay; + struct slave *slave; + struct list_head *iter; delay = msecs_to_jiffies(bond->params.miimon); @@ -2245,6 +2242,9 @@ static void bond_mii_monitor(struct work_struct *work) goto re_arm; } + bond_for_each_slave(bond, slave, iter) { + bond_commit_link_state(slave, BOND_SLAVE_NOTIFY_LATER); + } bond_miimon_commit(bond); rtnl_unlock(); /* might sleep, hold no other locks */ -- 2.12.1.578.ge9c3154ca4-goog
[PATCH next 1/5] bonding: split bond_set_slave_link_state into two parts
From: Mahesh Bandewar Split the function into two (a) propose (b) commit phase without changing the semantics for the original API. Signed-off-by: Mahesh Bandewar --- include/net/bonding.h | 22 +- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/include/net/bonding.h b/include/net/bonding.h index 3c857778a6ca..fb2dd97857c4 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -153,7 +153,8 @@ struct slave { unsigned long last_link_up; unsigned long last_rx; unsigned long target_last_arp_rx[BOND_MAX_ARP_TARGETS]; - s8 link;/* one of BOND_LINK_ */ + s8 link;/* one of BOND_LINK_ */ + s8 link_new_state; /* one of BOND_LINK_ */ s8 new_link; u8 backup:1, /* indicates backup slave. Value corresponds with BOND_STATE_ACTIVE and BOND_STATE_BACKUP */ @@ -504,13 +505,17 @@ static inline bool bond_is_slave_inactive(struct slave *slave) return slave->inactive; } -static inline void bond_set_slave_link_state(struct slave *slave, int state, -bool notify) +static inline void bond_propose_link_state(struct slave *slave, int state) +{ + slave->link_new_state = state; +} + +static inline void bond_commit_link_state(struct slave *slave, bool notify) { - if (slave->link == state) + if (slave->link == slave->link_new_state) return; - slave->link = state; + slave->link = slave->link_new_state; if (notify) { bond_queue_slave_event(slave); bond_lower_state_changed(slave); @@ -523,6 +528,13 @@ static inline void bond_set_slave_link_state(struct slave *slave, int state, } } +static inline void bond_set_slave_link_state(struct slave *slave, int state, +bool notify) +{ + bond_propose_link_state(slave, state); + bond_commit_link_state(slave, notify); +} + static inline void bond_slave_link_notify(struct bonding *bond) { struct list_head *iter; -- 2.12.1.578.ge9c3154ca4-goog
[PATCH next 0/5] link-status fixes for mii-monitoring
From: Mahesh Bandewar The mii monitoring is divided into two phases - inspect and commit. The inspect phase technically should not make any changes to the state and defer it to the commit phase. However detected link state inconsistencies on several machines and discovered that it's the result of some inconsistent update to link states and assumption that you *always* get rtnl-mutex. In reality when trylock() fails to acquire rtnl-mutex, the commit phase is postponed until next mii-mon run. At the next round because of the state change performed in the previous inspect-run, this round does not detect any changes and would skip calling commit phase. This would result in an inconsistent state until next link event happens (if it ever happens). During the the commit phase, it's always assumed that speed and duplex fetch is always successful, but that's always not the case. However the slave state is marked UP irrespective of speed / duplex fetch operation. If the speed / duplex fetch operation results in insane values for either of these two fields, then keeping internal link state UP is not going to provide fruitful results either. Please see into individual patches for more details. Mahesh Bandewar (5): bonding: split bond_set_slave_link_state into two parts bonding: improve link-status update in mii-monitoring bonding: make speed, duplex setting consistent with link state bonding: correctly update link status during mii-commit phase bonding: avoid printing while holding a spinlock drivers/net/bonding/bond_3ad.c | 6 ++--- drivers/net/bonding/bond_main.c | 49 - include/net/bonding.h | 22 +- 3 files changed, 49 insertions(+), 28 deletions(-) -- 2.12.1.578.ge9c3154ca4-goog
[PATCH next 3/5] bonding: make speed, duplex setting consistent with link state
From: Mahesh Bandewar bond_update_speed_duplex() retrieves speed and duplex settings. There is a possibility of failure in retrieving these values but caller has to assume it's always successful. This leads to having inconsistent slave link settings. If these (speed, duplex) values cannot be retrieved, then keeping the link UP causes problems. The updated bond_update_speed_duplex() returns 0 on success if it retrieves sane values for speed and duplex. On failure it returns 1 and marks the link down. Signed-off-by: Mahesh Bandewar --- drivers/net/bonding/bond_main.c | 24 ++-- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 85999e479916..ad317bb63193 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -365,9 +365,10 @@ int bond_set_carrier(struct bonding *bond) /* Get link speed and duplex from the slave's base driver * using ethtool. If for some reason the call fails or the * values are invalid, set speed and duplex to -1, - * and return. + * and return. Return 1 if speed or duplex settings are + * UNKNOWN; 0 otherwise. */ -static void bond_update_speed_duplex(struct slave *slave) +static int bond_update_speed_duplex(struct slave *slave) { struct net_device *slave_dev = slave->dev; struct ethtool_link_ksettings ecmd; @@ -377,24 +378,27 @@ static void bond_update_speed_duplex(struct slave *slave) slave->duplex = DUPLEX_UNKNOWN; res = __ethtool_get_link_ksettings(slave_dev, &ecmd); - if (res < 0) - return; - - if (ecmd.base.speed == 0 || ecmd.base.speed == ((__u32)-1)) - return; - + if (res < 0) { + slave->link = BOND_LINK_DOWN; + return 1; + } + if (ecmd.base.speed == 0 || ecmd.base.speed == ((__u32)-1)) { + slave->link = BOND_LINK_DOWN; + return 1; + } switch (ecmd.base.duplex) { case DUPLEX_FULL: case DUPLEX_HALF: break; default: - return; + slave->link = BOND_LINK_DOWN; + return 1; } slave->speed = ecmd.base.speed; slave->duplex = ecmd.base.duplex; - return; + return 0; } const char *bond_slave_link_status(s8 link) -- 2.12.1.578.ge9c3154ca4-goog
[PATCH next 0/5] bonding: winter cleanup
From: Mahesh Bandewar Few cleanup patches that I have accumulated over some time now. (a) First two patches are basically to move the work-queue initialization from every ndo_open / bond_open operation to once at the beginning while port creation. Work-queue initialization is an unnecessary operation for every 'ifup' operation. However we have some mode-specific work-queues and mode can change anytime after port creation. So the second patch is to ensure the correct work-handler is called based on the mode. (b) Third patch is simple and straightforward that removes hard-coded value that was added into the initial commit and replaces it with the default value configured. (c) The final patch in the series removes the unimplemented "port-moved" state from the LACP state machine. This state is defined but never set so removing from the state machine logic makes code little cleaner. (d) Reduce scope of some global variables to local. Note: None of these patches are making any functional changes. Mahesh Bandewar (5): bonding: restructure arp-monitor bonding: initialize work-queues during creation of bond bonding: remove hardcoded value bonding: remove "port-moved" state that was never implemented bonding: reduce scope of some global variables drivers/net/bonding/bond_3ad.c | 11 +++-- drivers/net/bonding/bond_main.c | 53 ++--- 2 files changed, 37 insertions(+), 27 deletions(-) -- 2.12.0.246.ga2ecc84866-goog
[PATCH next 2/5] bonding: initialize work-queues during creation of bond
From: Mahesh Bandewar Initializing work-queues every time ifup operation performed is unnecessary and can be performed only once when the port is created. Signed-off-by: Mahesh Bandewar --- drivers/net/bonding/bond_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 619f0c65f18a..1329110ed85f 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -3270,8 +3270,6 @@ static int bond_open(struct net_device *bond_dev) } } - bond_work_init_all(bond); - if (bond_is_lb(bond)) { /* bond_alb_initialize must be called before the timer * is started. @@ -4691,6 +4689,8 @@ int bond_create(struct net *net, const char *name) netif_carrier_off(bond_dev); + bond_work_init_all(bond); + rtnl_unlock(); if (res < 0) bond_destructor(bond_dev); -- 2.12.0.246.ga2ecc84866-goog
[PATCH next 1/5] bonding: restructure arp-monitor
From: Mahesh Bandewar In preparation to move the work-queue initialization to port creation from current port_open phase. Work-queue initialization does not make sense every time we do 'ifup/ifdown'. So moving to port creation phase. Arp monitoring work depends on the bonding mode and that is not tied to the port creation and can change anytime during the life after port creation. So this restructuring allows us to move the initialization at creation without losing the ability to arm the correct work for the mode user has selected. Signed-off-by: Mahesh Bandewar --- drivers/net/bonding/bond_main.c | 24 ++-- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 8a4ba8b88e52..619f0c65f18a 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2575,10 +2575,8 @@ static bool bond_time_in_interval(struct bonding *bond, unsigned long last_act, * arp is transmitted to generate traffic. see activebackup_arp_monitor for * arp monitoring in active backup mode. */ -static void bond_loadbalance_arp_mon(struct work_struct *work) +static void bond_loadbalance_arp_mon(struct bonding *bond) { - struct bonding *bond = container_of(work, struct bonding, - arp_work.work); struct slave *slave, *oldcurrent; struct list_head *iter; int do_failover = 0, slave_state_changed = 0; @@ -2916,10 +2914,8 @@ static bool bond_ab_arp_probe(struct bonding *bond) return should_notify_rtnl; } -static void bond_activebackup_arp_mon(struct work_struct *work) +static void bond_activebackup_arp_mon(struct bonding *bond) { - struct bonding *bond = container_of(work, struct bonding, - arp_work.work); bool should_notify_peers = false; bool should_notify_rtnl = false; int delta_in_ticks; @@ -2972,6 +2968,17 @@ static void bond_activebackup_arp_mon(struct work_struct *work) } } +static void bond_arp_monitor(struct work_struct *work) +{ + struct bonding *bond = container_of(work, struct bonding, + arp_work.work); + + if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP) + bond_activebackup_arp_mon(bond); + else + bond_loadbalance_arp_mon(bond); +} + /*-- netdev event handling --*/ /* Change device name */ @@ -3228,10 +3235,7 @@ static void bond_work_init_all(struct bonding *bond) bond_resend_igmp_join_requests_delayed); INIT_DELAYED_WORK(&bond->alb_work, bond_alb_monitor); INIT_DELAYED_WORK(&bond->mii_work, bond_mii_monitor); - if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP) - INIT_DELAYED_WORK(&bond->arp_work, bond_activebackup_arp_mon); - else - INIT_DELAYED_WORK(&bond->arp_work, bond_loadbalance_arp_mon); + INIT_DELAYED_WORK(&bond->arp_work, bond_arp_monitor); INIT_DELAYED_WORK(&bond->ad_work, bond_3ad_state_machine_handler); INIT_DELAYED_WORK(&bond->slave_arr_work, bond_slave_arr_handler); } -- 2.12.0.246.ga2ecc84866-goog
[PATCH next 4/5] bonding: remove "port-moved" state that was never implemented
From: Mahesh Bandewar LACP state-machine defines "port-moved" state when the same ActorSystemID and Port are seen in a LACPDU received on different port. The state is never set since it's not implemented. However the state-machine attempts to clear that state occasionally. LACP state machine is already complicated and since this state is not implemented, removing it's checks makes the state-machine little simpler. Signed-off-by: Mahesh Bandewar --- drivers/net/bonding/bond_3ad.c | 11 +++ 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index edc70ffad660..431926bba9f4 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -1052,8 +1052,7 @@ static void ad_rx_machine(struct lacpdu *lacpdu, struct port *port) port->sm_rx_state = AD_RX_INITIALIZE; port->sm_vars |= AD_PORT_CHURNED; /* check if port is not enabled */ - } else if (!(port->sm_vars & AD_PORT_BEGIN) -&& !port->is_enabled && !(port->sm_vars & AD_PORT_MOVED)) + } else if (!(port->sm_vars & AD_PORT_BEGIN) && !port->is_enabled) port->sm_rx_state = AD_RX_PORT_DISABLED; /* check if new lacpdu arrived */ else if (lacpdu && ((port->sm_rx_state == AD_RX_EXPIRED) || @@ -1081,11 +1080,8 @@ static void ad_rx_machine(struct lacpdu *lacpdu, struct port *port) /* if no lacpdu arrived and no timer is on */ switch (port->sm_rx_state) { case AD_RX_PORT_DISABLED: - if (port->sm_vars & AD_PORT_MOVED) - port->sm_rx_state = AD_RX_INITIALIZE; - else if (port->is_enabled -&& (port->sm_vars -& AD_PORT_LACP_ENABLED)) + if (port->is_enabled && + (port->sm_vars & AD_PORT_LACP_ENABLED)) port->sm_rx_state = AD_RX_EXPIRED; else if (port->is_enabled && ((port->sm_vars @@ -1115,7 +,6 @@ static void ad_rx_machine(struct lacpdu *lacpdu, struct port *port) port->sm_vars &= ~AD_PORT_SELECTED; __record_default(port); port->actor_oper_port_state &= ~AD_STATE_EXPIRED; - port->sm_vars &= ~AD_PORT_MOVED; port->sm_rx_state = AD_RX_PORT_DISABLED; /* Fall Through */ -- 2.12.0.246.ga2ecc84866-goog
[PATCH next 5/5] bonding: reduce scope of some global variables
From: Mahesh Bandewar Many of the bond param variables are declared global while it's not really necessary for these variables to be global. So moving them to the location these are used. Signed-off-by: Mahesh Bandewar --- drivers/net/bonding/bond_main.c | 11 +-- 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 0f9f5ceae80e..ba934020dfaa 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -201,12 +201,6 @@ atomic_t netpoll_block_tx = ATOMIC_INIT(0); unsigned int bond_net_id __read_mostly; -static __be32 arp_target[BOND_MAX_ARP_TARGETS]; -static int arp_ip_count; -static int bond_mode = BOND_MODE_ROUNDROBIN; -static int xmit_hashtype = BOND_XMIT_POLICY_LAYER2; -static int lacp_fast; - /*-- Forward declarations ---*/ static int bond_init(struct net_device *bond_dev); @@ -4254,6 +4248,11 @@ static int bond_check_params(struct bond_params *params) int arp_all_targets_value; u16 ad_actor_sys_prio = 0; u16 ad_user_port_key = 0; + __be32 arp_target[BOND_MAX_ARP_TARGETS]; + int arp_ip_count; + int bond_mode = BOND_MODE_ROUNDROBIN; + int xmit_hashtype = BOND_XMIT_POLICY_LAYER2; + int lacp_fast = 0; int tlb_dynamic_lb = 0; /* Convert string parameters. */ -- 2.12.0.246.ga2ecc84866-goog
[PATCH next 3/5] bonding: remove hardcoded value
From: Mahesh Bandewar Eliminate hard-coded value and use the default that is set. Signed-off-by: Mahesh Bandewar --- drivers/net/bonding/bond_main.c | 14 +- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 1329110ed85f..0f9f5ceae80e 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -4254,6 +4254,7 @@ static int bond_check_params(struct bond_params *params) int arp_all_targets_value; u16 ad_actor_sys_prio = 0; u16 ad_user_port_key = 0; + int tlb_dynamic_lb = 0; /* Convert string parameters. */ if (mode) { @@ -4566,6 +4567,17 @@ static int bond_check_params(struct bond_params *params) } ad_user_port_key = valptr->value; + if (bond_mode == BOND_MODE_TLB) { + bond_opt_initstr(&newval, "default"); + valptr = bond_opt_parse(bond_opt_get(BOND_OPT_TLB_DYNAMIC_LB), + &newval); + if (!valptr) { + pr_err("Error: No tlb_dynamic_lb default value"); + return -EINVAL; + } + tlb_dynamic_lb = valptr->value; + } + if (lp_interval == 0) { pr_warn("Warning: ip_interval must be between 1 and %d, so it was reset to %d\n", INT_MAX, BOND_ALB_DEFAULT_LP_INTERVAL); @@ -4593,7 +4605,7 @@ static int bond_check_params(struct bond_params *params) params->min_links = min_links; params->lp_interval = lp_interval; params->packets_per_slave = packets_per_slave; - params->tlb_dynamic_lb = 1; /* Default value */ + params->tlb_dynamic_lb = tlb_dynamic_lb; params->ad_actor_sys_prio = ad_actor_sys_prio; eth_zero_addr(params->ad_actor_system); params->ad_user_port_key = ad_user_port_key; -- 2.12.0.246.ga2ecc84866-goog
[PATCH next 3/4] bonding: remove hardcoded value
From: Mahesh Bandewar Eliminate hard-coded value and use the default that is set. Signed-off-by: Mahesh Bandewar --- drivers/net/bonding/bond_main.c | 14 +- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index b7a344ffc1de..06ac1783bfdd 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -4253,6 +4253,7 @@ static int bond_check_params(struct bond_params *params) int arp_all_targets_value; u16 ad_actor_sys_prio = 0; u16 ad_user_port_key = 0; + int tlb_dynamic_lb = 0; /* Convert string parameters. */ if (mode) { @@ -4565,6 +4566,17 @@ static int bond_check_params(struct bond_params *params) } ad_user_port_key = valptr->value; + if (bond_mode == BOND_MODE_TLB) { + bond_opt_initstr(&newval, "default"); + valptr = bond_opt_parse(bond_opt_get(BOND_OPT_TLB_DYNAMIC_LB), + &newval); + if (!valptr) { + pr_err("Error: No tlb_dynamic_lb default value"); + return -EINVAL; + } + tlb_dynamic_lb = valptr->value; + } + if (lp_interval == 0) { pr_warn("Warning: ip_interval must be between 1 and %d, so it was reset to %d\n", INT_MAX, BOND_ALB_DEFAULT_LP_INTERVAL); @@ -4592,7 +4604,7 @@ static int bond_check_params(struct bond_params *params) params->min_links = min_links; params->lp_interval = lp_interval; params->packets_per_slave = packets_per_slave; - params->tlb_dynamic_lb = 1; /* Default value */ + params->tlb_dynamic_lb = tlb_dynamic_lb; params->ad_actor_sys_prio = ad_actor_sys_prio; eth_zero_addr(params->ad_actor_system); params->ad_user_port_key = ad_user_port_key; -- 2.11.0.483.g087da7b7c-goog
[PATCH next 1/4] bonding: restructure arp-monitor
From: Mahesh Bandewar In preparation to move the work-queue initialization to port creation from current port_open phase. Work-queue initialization does not make sense every time we do 'ifup/ifdown'. So moving to port creation phase. Arp monitoring work depends on the bonding mode and that is not tied to the port creation and can change anytime during the life after port creation. So this restructuring allows us to move the initialization at creation without losing the ability to arm the correct work for the mode user has selected. Signed-off-by: Mahesh Bandewar --- drivers/net/bonding/bond_main.c | 24 ++-- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 6321f12630c8..4e03807bb0d6 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2575,10 +2575,8 @@ static bool bond_time_in_interval(struct bonding *bond, unsigned long last_act, * arp is transmitted to generate traffic. see activebackup_arp_monitor for * arp monitoring in active backup mode. */ -static void bond_loadbalance_arp_mon(struct work_struct *work) +static void bond_loadbalance_arp_mon(struct bonding *bond) { - struct bonding *bond = container_of(work, struct bonding, - arp_work.work); struct slave *slave, *oldcurrent; struct list_head *iter; int do_failover = 0, slave_state_changed = 0; @@ -2916,10 +2914,8 @@ static bool bond_ab_arp_probe(struct bonding *bond) return should_notify_rtnl; } -static void bond_activebackup_arp_mon(struct work_struct *work) +static void bond_activebackup_arp_mon(struct bonding *bond) { - struct bonding *bond = container_of(work, struct bonding, - arp_work.work); bool should_notify_peers = false; bool should_notify_rtnl = false; int delta_in_ticks; @@ -2972,6 +2968,17 @@ static void bond_activebackup_arp_mon(struct work_struct *work) } } +static void bond_arp_monitor(struct work_struct *work) +{ + struct bonding *bond = container_of(work, struct bonding, + arp_work.work); + + if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP) + bond_activebackup_arp_mon(bond); + else + bond_loadbalance_arp_mon(bond); +} + /*-- netdev event handling --*/ /* Change device name */ @@ -3228,10 +3235,7 @@ static void bond_work_init_all(struct bonding *bond) bond_resend_igmp_join_requests_delayed); INIT_DELAYED_WORK(&bond->alb_work, bond_alb_monitor); INIT_DELAYED_WORK(&bond->mii_work, bond_mii_monitor); - if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP) - INIT_DELAYED_WORK(&bond->arp_work, bond_activebackup_arp_mon); - else - INIT_DELAYED_WORK(&bond->arp_work, bond_loadbalance_arp_mon); + INIT_DELAYED_WORK(&bond->arp_work, bond_arp_monitor); INIT_DELAYED_WORK(&bond->ad_work, bond_3ad_state_machine_handler); INIT_DELAYED_WORK(&bond->slave_arr_work, bond_slave_arr_handler); } -- 2.11.0.483.g087da7b7c-goog
[PATCH next 4/4] bonding: remove "port-moved" state that was never implemented
From: Mahesh Bandewar LACP state-machine defines "port-moved" state when the same ActorSystemID and Port are seen in a LACPDU received on different port. The state is never set since it's not implemented. However the state-machine attempts to clear that state occasionally. LACP state machine is already complicated and since this state is not implemented, removing it's checks makes the state-machine little simpler. Signed-off-by: Mahesh Bandewar --- drivers/net/bonding/bond_3ad.c | 11 +++ 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index edc70ffad660..431926bba9f4 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -1052,8 +1052,7 @@ static void ad_rx_machine(struct lacpdu *lacpdu, struct port *port) port->sm_rx_state = AD_RX_INITIALIZE; port->sm_vars |= AD_PORT_CHURNED; /* check if port is not enabled */ - } else if (!(port->sm_vars & AD_PORT_BEGIN) -&& !port->is_enabled && !(port->sm_vars & AD_PORT_MOVED)) + } else if (!(port->sm_vars & AD_PORT_BEGIN) && !port->is_enabled) port->sm_rx_state = AD_RX_PORT_DISABLED; /* check if new lacpdu arrived */ else if (lacpdu && ((port->sm_rx_state == AD_RX_EXPIRED) || @@ -1081,11 +1080,8 @@ static void ad_rx_machine(struct lacpdu *lacpdu, struct port *port) /* if no lacpdu arrived and no timer is on */ switch (port->sm_rx_state) { case AD_RX_PORT_DISABLED: - if (port->sm_vars & AD_PORT_MOVED) - port->sm_rx_state = AD_RX_INITIALIZE; - else if (port->is_enabled -&& (port->sm_vars -& AD_PORT_LACP_ENABLED)) + if (port->is_enabled && + (port->sm_vars & AD_PORT_LACP_ENABLED)) port->sm_rx_state = AD_RX_EXPIRED; else if (port->is_enabled && ((port->sm_vars @@ -1115,7 +,6 @@ static void ad_rx_machine(struct lacpdu *lacpdu, struct port *port) port->sm_vars &= ~AD_PORT_SELECTED; __record_default(port); port->actor_oper_port_state &= ~AD_STATE_EXPIRED; - port->sm_vars &= ~AD_PORT_MOVED; port->sm_rx_state = AD_RX_PORT_DISABLED; /* Fall Through */ -- 2.11.0.483.g087da7b7c-goog
[PATCH next 2/4] bonding: initialize work-queues during creation of bond
From: Mahesh Bandewar Initializing work-queues every time ifup operation performed is unnecessary and can be performed only once when the port is created. Signed-off-by: Mahesh Bandewar --- drivers/net/bonding/bond_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 4e03807bb0d6..b7a344ffc1de 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -3270,8 +3270,6 @@ static int bond_open(struct net_device *bond_dev) } } - bond_work_init_all(bond); - if (bond_is_lb(bond)) { /* bond_alb_initialize must be called before the timer * is started. @@ -4690,6 +4688,8 @@ int bond_create(struct net *net, const char *name) netif_carrier_off(bond_dev); + bond_work_init_all(bond); + rtnl_unlock(); if (res < 0) bond_destructor(bond_dev); -- 2.11.0.483.g087da7b7c-goog
[PATCH next 0/4] bonding: winter cleanup
From: Mahesh Bandewar Few cleanup patches that I have accumulated over some time now. (a) First two patches are basically to move the work-queue initialization from every ndo_open / bond_open operation to once at the beginning while port creation. Work-queue initialization is an unnecessary operation for every 'ifup' operation. However we have some mode-specific work-queues and mode can change anytime after port creation. So the second patch is to ensure the correct work-handler is called based on the mode. (b) Third patch is simple and straightforward that removes hard-coded value that was added into the initial commit and replaces it with the default value configured. (c) The final patch in the series removes the unimplemented "port-moved" state from the LACP state machine. This state is defined but never set so removing from the state machine logic makes code little cleaner. Note: None of these patches are making any functional changes. Mahesh Bandewar (4): bonding: restructure arp-monitor bonding: initialize work-queues during creation of bond bonding: remove hardcoded value bonding: remove "port-moved" state that was never implemented drivers/net/bonding/bond_3ad.c | 11 +++ drivers/net/bonding/bond_main.c | 42 - 2 files changed, 32 insertions(+), 21 deletions(-) -- 2.11.0.483.g087da7b7c-goog
[PATCH next 0/3] use netdev_is_rx_handler_busy() in few known cases
From: Mahesh Bandewar netdev_rx_handler_register() was recently split into two parts - (a) check if the handler is used, (b) register the new handler, parts. This is helpful in scenarios like bonding where at the time of registration there is too much state to unwind and it should check if the device is free before building that state. IPvlan and macvlan drivers don't have this issue however it can make use of the same check instead of using a device specific check. Mahesh Bandewar (3): net: remove duplicate code. ipvlan: use netdev_is_rx_handler_busy instead of checking specific type macvlan: use netdev_is_rx_handler_busy instead of checking specific type drivers/net/ipvlan/ipvlan_main.c | 4 ++-- drivers/net/macvlan.c| 2 +- net/core/dev.c | 4 +--- 3 files changed, 4 insertions(+), 6 deletions(-) -- 2.11.0.483.g087da7b7c-goog
[PATCH next 3/3] macvlan: use netdev_is_rx_handler_busy instead of checking specific type
From: Mahesh Bandewar netdev_is_rx_handler_busy() check is a superset of netif_is_ipvlan_port() check and hence should be preferred. Signed-off-by: Mahesh Bandewar --- drivers/net/macvlan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 440ab3d8adf7..cbfc1be23a0e 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -1110,7 +1110,7 @@ static int macvlan_port_create(struct net_device *dev) if (dev->type != ARPHRD_ETHER || dev->flags & IFF_LOOPBACK) return -EINVAL; - if (netif_is_ipvlan_port(dev)) + if (netdev_is_rx_handler_busy(dev)) return -EBUSY; port = kzalloc(sizeof(*port), GFP_KERNEL); -- 2.11.0.483.g087da7b7c-goog
[PATCH next 1/3] net: remove duplicate code.
From: Mahesh Bandewar netdev_rx_handler_register() checks to see if the handler is already busy which was recently separated into netdev_is_rx_handler_busy(). So use the same function inside register() to avoid code duplication. Essentially this change should be a no-op Signed-off-by: Mahesh Bandewar --- net/core/dev.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index ad5959e56116..c8f1f67ff16c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3961,9 +3961,7 @@ int netdev_rx_handler_register(struct net_device *dev, rx_handler_func_t *rx_handler, void *rx_handler_data) { - ASSERT_RTNL(); - - if (dev->rx_handler) + if (netdev_is_rx_handler_busy(dev)) return -EBUSY; /* Note: rx_handler_data must be set before rx_handler */ -- 2.11.0.483.g087da7b7c-goog
[PATCH next 2/3] ipvlan: use netdev_is_rx_handler_busy instead of checking specific type
From: Mahesh Bandewar IPvlan checks if the master device is already used by checking a specific device (here it's macvlan device). This is technically not sufficient and it should just ensure the rx_handler is busy or not. This would be a super check that includes macvlan and any other that has already registered rx-handler. Signed-off-by: Mahesh Bandewar --- drivers/net/ipvlan/ipvlan_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index b5c390f0f2b3..95b18f4602cf 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -102,8 +102,8 @@ static int ipvlan_port_create(struct net_device *dev) return -EINVAL; } - if (netif_is_macvlan_port(dev)) { - netdev_err(dev, "Master is a macvlan port.\n"); + if (netdev_is_rx_handler_busy(dev)) { + netdev_err(dev, "Device is already in use.\n"); return -EBUSY; } -- 2.11.0.483.g087da7b7c-goog
[PATCH next] ipvlan: fix dev_id creation corner case.
From: Mahesh Bandewar In the last patch da36e13cf65 ("ipvlan: improvise dev_id generation logic in IPvlan") I missed some part of Dave's suggestion and because of that the dev_id creation could fail in a corner case scenario. This would happen when more or less 64k devices have been already created and several have been deleted. If the devices that are still sticking around are the last n bits from the bitmap. So in this scenario even if lower bits are available, the dev_id search is so narrow that it always fails. Fixes: da36e13cf65 ("ipvlan: improvise dev_id generation logic in IPvlan") CC: David Miller CC: Eric Dumazet Signed-off-by: Mahesh Bandewar --- drivers/net/ipvlan/ipvlan_main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index 92b221a03350..b5c390f0f2b3 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -550,6 +550,9 @@ static int ipvlan_link_new(struct net *src_net, struct net_device *dev, err = ida_simple_get(&port->ida, port->dev_id_start, 0xFFFE, GFP_KERNEL); if (err < 0) + err = ida_simple_get(&port->ida, 0x1, port->dev_id_start, +GFP_KERNEL); + if (err < 0) goto destroy_ipvlan_port; dev->dev_id = err; /* Increment id-base to the next slot for the future assignment */ -- 2.11.0.483.g087da7b7c-goog
[PATCH next v2] ipvlan: improvise dev_id generation logic in IPvlan
From: Mahesh Bandewar The patch 009146d117b ("ipvlan: assign unique dev-id for each slave device.") used ida_simple_get() to generate dev_ids assigned to the slave devices. However (Eric has pointed out that) there is a shortcoming with that approach as it always uses the first available ID. This becomes a problem when a slave gets deleted and a new slave gets added. The ID gets reassigned causing the new slave to get the same link-local address. This side-effect is undesirable. This patch adds a per-port variable that keeps track of the IDs assigned and used as the stat-base for the IDR api. This base will be wrapped around when it reaches the MAX (0xFFFE) value possibly on a busy system where slaves are added and deleted routinely. Fixes: 009146d117b ("ipvlan: assign unique dev-id for each slave device.") Signed-off-by: Mahesh Bandewar CC: Eric Dumazet CC: David Miller --- v1 -> v2 Implemented Dave's comments and reinstated IDR API. drivers/net/ipvlan/ipvlan.h | 1 + drivers/net/ipvlan/ipvlan_main.c | 13 - 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h index 0a9068fdee0f..406ae4ff0ae8 100644 --- a/drivers/net/ipvlan/ipvlan.h +++ b/drivers/net/ipvlan/ipvlan.h @@ -94,6 +94,7 @@ struct ipvl_port { struct hlist_head hlhead[IPVLAN_HASH_SIZE]; struct list_headipvlans; u16 mode; + u16 dev_id_start; struct work_struct wq; struct sk_buff_head backlog; int count; diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index ce7ca6a5aa8a..aca2885cf0b4 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -120,6 +120,7 @@ static int ipvlan_port_create(struct net_device *dev) skb_queue_head_init(&port->backlog); INIT_WORK(&port->wq, ipvlan_process_multicast); ida_init(&port->ida); + port->dev_id_start = 1; err = netdev_rx_handler_register(dev, ipvlan_handle_frame, port); if (err) @@ -535,15 +536,25 @@ static int ipvlan_link_new(struct net *src_net, struct net_device *dev, ipvlan_adjust_mtu(ipvlan, phy_dev); INIT_LIST_HEAD(&ipvlan->addrs); + /* If the port-id base is at the MAX value, then wrap it around and +* begin from 0x1 again. This may be due to a busy system where lots +* of slaves are getting created and deleted. +*/ + if (port->dev_id_start == 0xFFFE) + port->dev_id_start = 0x1; + /* Since L2 address is shared among all IPvlan slaves including * master, use unique 16 bit dev-ids to diffentiate among them. * Assign IDs between 0x1 and 0xFFFE (used by the master) to each * slave link [see addrconf_ifid_eui48()]. */ - err = ida_simple_get(&port->ida, 1, 0xFFFE, GFP_KERNEL); + err = ida_simple_get(&port->ida, port->dev_id_start, 0xFFFE, +GFP_KERNEL); if (err < 0) goto destroy_ipvlan_port; dev->dev_id = err; + /* Increment id-base to the next slot for the future assignment */ + port->dev_id_start = err + 1; /* TODO Probably put random address here to be presented to the * world but keep using the physical-dev address for the outgoing -- 2.11.0.390.gc69c2f50cf-goog
[PATCH next v1] ipvlan: don't use IDR for generating dev_id
From: Mahesh Bandewar The patch 009146d117b ("ipvlan: assign unique dev-id for each slave device.") used ida_simple_get() to generate dev_ids assigned to the slave devices. However (Eric has pointed out that) there is a shortcoming with that approach as it always uses the first available ID. This becomes a problem when a slave gets deleted and a new slave gets added. The ID gets reassigned causing the new slave to get the same link-local address. This side-effect is undesirable. This patch replaces IDR logic with a simple per-port variable that keeps incrementing and wraps around when the MAX (0xFFFE) is reached. The only downside is that this is an inefficient (n^2) search if there are 64k (or close to 64k) slaves in the system, the dev-id search takes time. However having these many devices in the system has it's own challenges. Fixes: 009146d117b ("ipvlan: assign unique dev-id for each slave device.") Signed-off-by: Mahesh Bandewar CC: Eric Dumazet --- drivers/net/ipvlan/ipvlan.h | 2 +- drivers/net/ipvlan/ipvlan_main.c | 69 ++-- 2 files changed, 61 insertions(+), 10 deletions(-) diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h index 0a9068fdee0f..535f254324ba 100644 --- a/drivers/net/ipvlan/ipvlan.h +++ b/drivers/net/ipvlan/ipvlan.h @@ -94,10 +94,10 @@ struct ipvl_port { struct hlist_head hlhead[IPVLAN_HASH_SIZE]; struct list_headipvlans; u16 mode; + u16 dev_id_base; struct work_struct wq; struct sk_buff_head backlog; int count; - struct ida ida; }; struct ipvl_skb_cb { diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index ce7ca6a5aa8a..691662e02abb 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -119,7 +119,7 @@ static int ipvlan_port_create(struct net_device *dev) skb_queue_head_init(&port->backlog); INIT_WORK(&port->wq, ipvlan_process_multicast); - ida_init(&port->ida); + port->dev_id_base = 1; err = netdev_rx_handler_register(dev, ipvlan_handle_frame, port); if (err) @@ -151,7 +151,6 @@ static void ipvlan_port_destroy(struct net_device *dev) dev_put(skb->dev); kfree_skb(skb); } - ida_destroy(&port->ida); kfree(port); } @@ -496,6 +495,60 @@ static int ipvlan_nl_fillinfo(struct sk_buff *skb, return ret; } +static u16 ipvlan_get_dev_id(struct ipvl_port *port) +{ + struct ipvl_dev *ipvlan; + u16 tid, dev_id = 0; + bool found, exhausted = false, second_round = false; + + /* Idea here is to get the next available ID while avoiding +* reuse of already used IDs. Per port variable (dev_id_base) +* is used to get the next available ID. +* +* Possibilities are from 0x1 to 0xfffe so finding next available +* ID is not be that difficult. However if the system is up for a +* long time and these slave devices are getting added / deleted +* routinely and it reaches the MAX, then only the assignment will +* attempt to recycle previously used IDs starting from 0x1 (Provided +* these are not currently in use). +* +* In case of a failure, dev-id search will return 0. All IDs in-use +* case is very inefficient (n^2 search) but having close to 64k +* slaves has it's own system limitations. +*/ + while (!exhausted) { + tid = port->dev_id_base++; + + if (tid == 0xFFFE) { + /* Reached MAX; restart from 1 */ + port->dev_id_base = 1; + if (second_round) + exhausted = true; + else + second_round = true; + continue; + } + /* Should not be same as the masters' (if present) */ + if (tid == port->dev->dev_id) + continue; + /* Make sure that it's already not assigned to any of the +* existing slaves. +*/ + found = false; + list_for_each_entry(ipvlan, &port->ipvlans, pnode) { + if (ipvlan->dev->dev_id == tid) { + found = true; + break; + } + } + if (!found) { + dev_id = tid; + break; + } + } + return dev_id; +} + static int ipvlan_link_new(struct net *src_net, struct net_device *dev, struct nlattr *tb[], str
[RFC PATCH next] ipv6: do not send RTM_DELADDR for tentative addresses
From: Mahesh Bandewar RTM_NEWADDR notification is sent when IFA_F_TENTATIVE is cleared from the address. So if the address is added and deleted before DAD probes completes, the RTM_DELADDR will be sent for which there was no RTM_NEWADDR causing asymmetry in notification. However if the same logic is used while sending RTM_DELADDR notification, this asymmetry can be avoided. Signed-off-by: Mahesh Bandewar CC: Hideaki YOSHIFUJI CC: Patrick McHardy CC: Hannes Frederic Sowa --- net/ipv6/addrconf.c | 7 +++ 1 file changed, 7 insertions(+) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index c1e124bc8e1e..ac9bd5620f81 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4888,6 +4888,13 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) struct net *net = dev_net(ifa->idev->dev); int err = -ENOBUFS; + /* Don't send DELADDR notification for TENTATIVE address, +* since NEWADDR notification is sent only after removing +* TENTATIVE flag. +*/ + if (ifa->flags & IFA_F_TENTATIVE && event == RTM_DELADDR) + return; + skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC); if (!skb) goto errout; -- 2.11.0.390.gc69c2f50cf-goog
[PATCH next v1] ipvlan: assign unique dev-id for each slave device.
From: Mahesh Bandewar IPvlan setup uses one mac-address (of master). The IPv6 link-local addresses are derived using the mac-address on the link. Lack of dev-ids makes these link-local addresses same for all slaves including that of master device. dev-ids are necessary to add differentiation when L2 address is shared. Signed-off-by: Mahesh Bandewar --- drivers/net/ipvlan/ipvlan.h | 1 + drivers/net/ipvlan/ipvlan_main.c | 17 - 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h index dbfbb33ac66c..0a9068fdee0f 100644 --- a/drivers/net/ipvlan/ipvlan.h +++ b/drivers/net/ipvlan/ipvlan.h @@ -97,6 +97,7 @@ struct ipvl_port { struct work_struct wq; struct sk_buff_head backlog; int count; + struct ida ida; }; struct ipvl_skb_cb { diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index 975f9ddb9908..ce7ca6a5aa8a 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -119,6 +119,7 @@ static int ipvlan_port_create(struct net_device *dev) skb_queue_head_init(&port->backlog); INIT_WORK(&port->wq, ipvlan_process_multicast); + ida_init(&port->ida); err = netdev_rx_handler_register(dev, ipvlan_handle_frame, port); if (err) @@ -150,6 +151,7 @@ static void ipvlan_port_destroy(struct net_device *dev) dev_put(skb->dev); kfree_skb(skb); } + ida_destroy(&port->ida); kfree(port); } @@ -533,6 +535,16 @@ static int ipvlan_link_new(struct net *src_net, struct net_device *dev, ipvlan_adjust_mtu(ipvlan, phy_dev); INIT_LIST_HEAD(&ipvlan->addrs); + /* Since L2 address is shared among all IPvlan slaves including +* master, use unique 16 bit dev-ids to diffentiate among them. +* Assign IDs between 0x1 and 0xFFFE (used by the master) to each +* slave link [see addrconf_ifid_eui48()]. +*/ + err = ida_simple_get(&port->ida, 1, 0xFFFE, GFP_KERNEL); + if (err < 0) + goto destroy_ipvlan_port; + dev->dev_id = err; + /* TODO Probably put random address here to be presented to the * world but keep using the physical-dev address for the outgoing * packets. @@ -543,7 +555,7 @@ static int ipvlan_link_new(struct net *src_net, struct net_device *dev, err = register_netdevice(dev); if (err < 0) - goto destroy_ipvlan_port; + goto remove_ida; err = netdev_upper_dev_link(phy_dev, dev); if (err) { @@ -562,6 +574,8 @@ static int ipvlan_link_new(struct net *src_net, struct net_device *dev, netdev_upper_dev_unlink(phy_dev, dev); unregister_netdev: unregister_netdevice(dev); +remove_ida: + ida_simple_remove(&port->ida, dev->dev_id); destroy_ipvlan_port: if (create) ipvlan_port_destroy(phy_dev); @@ -579,6 +593,7 @@ static void ipvlan_link_delete(struct net_device *dev, struct list_head *head) kfree_rcu(addr, rcu); } + ida_simple_remove(&ipvlan->port->ida, dev->dev_id); list_del_rcu(&ipvlan->pnode); unregister_netdevice_queue(dev, head); netdev_upper_dev_unlink(ipvlan->phy_dev, dev); -- 2.11.0.390.gc69c2f50cf-goog