[PATCH net] Revert "blackhole_netdev: fix syzkaller reported issue"

2019-10-16 Thread Mahesh Bandewar
This reverts commit b0818f80c8c1bc215bba276bd61c216014fab23b.

Started seeing weird behavior after this patch, especially in
the IPv6 code path. I haven't root-caused it yet, but since this was
applied to the net branch, I'm taking the precautionary measure of
reverting it so that those failures can be analyzed.

Revert this now; I'll send a better fix after analysing and fixing
the weirdness observed.

CC: Eric Dumazet 
CC: Wei Wang 
CC: David S. Miller 
Signed-off-by: Mahesh Bandewar 
---
 net/ipv6/addrconf.c |  7 +--
 net/ipv6/route.c| 15 +--
 2 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 4c87594d1389..34ccef18b40e 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -6996,7 +6996,7 @@ static struct rtnl_af_ops inet6_ops __read_mostly = {
 
 int __init addrconf_init(void)
 {
-   struct inet6_dev *idev, *bdev;
+   struct inet6_dev *idev;
int i, err;
 
err = ipv6_addr_label_init();
@@ -7036,14 +7036,10 @@ int __init addrconf_init(void)
 */
rtnl_lock();
idev = ipv6_add_dev(init_net.loopback_dev);
-   bdev = ipv6_add_dev(blackhole_netdev);
rtnl_unlock();
if (IS_ERR(idev)) {
err = PTR_ERR(idev);
goto errlo;
-   } else if (IS_ERR(bdev)) {
-   err = PTR_ERR(bdev);
-   goto errlo;
}
 
ip6_route_init_special_entries();
@@ -7128,7 +7124,6 @@ void addrconf_cleanup(void)
addrconf_ifdown(dev, 1);
}
addrconf_ifdown(init_net.loopback_dev, 2);
-   addrconf_ifdown(blackhole_netdev, 2);
 
/*
 *  Check hash table.
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 742120728869..a63ff85fe141 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -155,9 +155,10 @@ void rt6_uncached_list_del(struct rt6_info *rt)
 
 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device 
*dev)
 {
+   struct net_device *loopback_dev = net->loopback_dev;
int cpu;
 
-   if (dev == net->loopback_dev)
+   if (dev == loopback_dev)
return;
 
for_each_possible_cpu(cpu) {
@@ -170,7 +171,7 @@ static void rt6_uncached_list_flush_dev(struct net *net, 
struct net_device *dev)
struct net_device *rt_dev = rt->dst.dev;
 
if (rt_idev->dev == dev) {
-   rt->rt6i_idev = in6_dev_get(blackhole_netdev);
+   rt->rt6i_idev = in6_dev_get(loopback_dev);
in6_dev_put(rt_idev);
}
 
@@ -385,11 +386,13 @@ static void ip6_dst_ifdown(struct dst_entry *dst, struct 
net_device *dev,
 {
struct rt6_info *rt = (struct rt6_info *)dst;
struct inet6_dev *idev = rt->rt6i_idev;
+   struct net_device *loopback_dev =
+   dev_net(dev)->loopback_dev;
 
-   if (idev && idev->dev != dev_net(dev)->loopback_dev) {
-   struct inet6_dev *ibdev = in6_dev_get(blackhole_netdev);
-   if (ibdev) {
-   rt->rt6i_idev = ibdev;
+   if (idev && idev->dev != loopback_dev) {
+   struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
+   if (loopback_idev) {
+   rt->rt6i_idev = loopback_idev;
in6_dev_put(idev);
}
}
-- 
2.23.0.700.g56cf767bdb-goog



[PATCHv3 next] blackhole_netdev: fix syzkaller reported issue

2019-10-11 Thread Mahesh Bandewar
83 
e0 07 83 c0 03 38 d0 7c 08 84 d2 0f 85 15
RSP: 0018:88806c037038 EFLAGS: 00010207
RAX: dc00 RBX: 888094c5cd60 RCX: c9000b791000
RDX: 0047 RSI: 863e3caa RDI: 023c
RBP: 88806c037098 R08: 888063de0600 R09: 88806c037288
R10: fbfff134af07 R11: 89a5783f R12: 0003
R13: 88806c037298 R14: 888094c5cd6f R15: 
FS:  7fa15e7de700() GS:8880ae80() knlGS:
CS:  0010 DS:  ES:  CR0: 80050033
CR2: 00625208 CR3: 5e348000 CR4: 001406f0
DR0:  DR1:  DR2: 
DR3:  DR6: fffe0ff0 DR7: 0400

Fixes: 8d7017fd621d ("blackhole_netdev: use blackhole_netdev to invalidate dst 
entries")
Signed-off-by: Mahesh Bandewar 
CC: Eric Dumazet 
CC: Wei Wang 
---
v1->v2:
   fixed missing update in ip6_dst_ifdown()
v2->v3:
   added idev cleanup step in addrconf_cleanup()

 net/ipv6/addrconf.c |  7 ++-
 net/ipv6/route.c| 15 ++-
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 98d82305d6de..2a01404aba78 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -6995,7 +6995,7 @@ static struct rtnl_af_ops inet6_ops __read_mostly = {
 
 int __init addrconf_init(void)
 {
-   struct inet6_dev *idev;
+   struct inet6_dev *idev, *bdev;
int i, err;
 
err = ipv6_addr_label_init();
@@ -7035,10 +7035,14 @@ int __init addrconf_init(void)
 */
rtnl_lock();
idev = ipv6_add_dev(init_net.loopback_dev);
+   bdev = ipv6_add_dev(blackhole_netdev);
rtnl_unlock();
if (IS_ERR(idev)) {
err = PTR_ERR(idev);
goto errlo;
+   } else if (IS_ERR(bdev)) {
+   err = PTR_ERR(bdev);
+   goto errlo;
}
 
ip6_route_init_special_entries();
@@ -7123,6 +7127,7 @@ void addrconf_cleanup(void)
addrconf_ifdown(dev, 1);
}
addrconf_ifdown(init_net.loopback_dev, 2);
+   addrconf_ifdown(blackhole_netdev, 2);
 
/*
 *  Check hash table.
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index a63ff85fe141..742120728869 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -155,10 +155,9 @@ void rt6_uncached_list_del(struct rt6_info *rt)
 
 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device 
*dev)
 {
-   struct net_device *loopback_dev = net->loopback_dev;
int cpu;
 
-   if (dev == loopback_dev)
+   if (dev == net->loopback_dev)
return;
 
for_each_possible_cpu(cpu) {
@@ -171,7 +170,7 @@ static void rt6_uncached_list_flush_dev(struct net *net, 
struct net_device *dev)
struct net_device *rt_dev = rt->dst.dev;
 
if (rt_idev->dev == dev) {
-   rt->rt6i_idev = in6_dev_get(loopback_dev);
+   rt->rt6i_idev = in6_dev_get(blackhole_netdev);
in6_dev_put(rt_idev);
}
 
@@ -386,13 +385,11 @@ static void ip6_dst_ifdown(struct dst_entry *dst, struct 
net_device *dev,
 {
struct rt6_info *rt = (struct rt6_info *)dst;
struct inet6_dev *idev = rt->rt6i_idev;
-   struct net_device *loopback_dev =
-   dev_net(dev)->loopback_dev;
 
-   if (idev && idev->dev != loopback_dev) {
-   struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
-   if (loopback_idev) {
-   rt->rt6i_idev = loopback_idev;
+   if (idev && idev->dev != dev_net(dev)->loopback_dev) {
+   struct inet6_dev *ibdev = in6_dev_get(blackhole_netdev);
+   if (ibdev) {
+   rt->rt6i_idev = ibdev;
in6_dev_put(idev);
}
}
-- 
2.23.0.700.g56cf767bdb-goog



[PATCHv2 next] blackhole_netdev: fix syzkaller reported issue

2019-10-10 Thread Mahesh Bandewar
83 
e0 07 83 c0 03 38 d0 7c 08 84 d2 0f 85 15
RSP: 0018:88806c037038 EFLAGS: 00010207
RAX: dc00 RBX: 888094c5cd60 RCX: c9000b791000
RDX: 0047 RSI: 863e3caa RDI: 023c
RBP: 88806c037098 R08: 888063de0600 R09: 88806c037288
R10: fbfff134af07 R11: 89a5783f R12: 0003
R13: 88806c037298 R14: 888094c5cd6f R15: 
FS:  7fa15e7de700() GS:8880ae80() knlGS:
CS:  0010 DS:  ES:  CR0: 80050033
CR2: 00625208 CR3: 5e348000 CR4: 001406f0
DR0:  DR1:  DR2: 
DR3:  DR6: fffe0ff0 DR7: 0400

Fixes: 8d7017fd621d ("blackhole_netdev: use blackhole_netdev to invalidate dst 
entries")
Signed-off-by: Mahesh Bandewar 
CC: Eric Dumazet 
---
v1->v2:
   fixed missing update in ip6_dst_ifdown()

 net/ipv6/addrconf.c |  6 +-
 net/ipv6/route.c| 15 ++-
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 98d82305d6de..0f216f7cbe3e 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -6995,7 +6995,7 @@ static struct rtnl_af_ops inet6_ops __read_mostly = {
 
 int __init addrconf_init(void)
 {
-   struct inet6_dev *idev;
+   struct inet6_dev *idev, *bdev;
int i, err;
 
err = ipv6_addr_label_init();
@@ -7035,10 +7035,14 @@ int __init addrconf_init(void)
 */
rtnl_lock();
idev = ipv6_add_dev(init_net.loopback_dev);
+   bdev = ipv6_add_dev(blackhole_netdev);
rtnl_unlock();
if (IS_ERR(idev)) {
err = PTR_ERR(idev);
goto errlo;
+   } else if (IS_ERR(bdev)) {
+   err = PTR_ERR(bdev);
+   goto errlo;
}
 
ip6_route_init_special_entries();
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index a63ff85fe141..742120728869 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -155,10 +155,9 @@ void rt6_uncached_list_del(struct rt6_info *rt)
 
 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device 
*dev)
 {
-   struct net_device *loopback_dev = net->loopback_dev;
int cpu;
 
-   if (dev == loopback_dev)
+   if (dev == net->loopback_dev)
return;
 
for_each_possible_cpu(cpu) {
@@ -171,7 +170,7 @@ static void rt6_uncached_list_flush_dev(struct net *net, 
struct net_device *dev)
struct net_device *rt_dev = rt->dst.dev;
 
if (rt_idev->dev == dev) {
-   rt->rt6i_idev = in6_dev_get(loopback_dev);
+   rt->rt6i_idev = in6_dev_get(blackhole_netdev);
in6_dev_put(rt_idev);
}
 
@@ -386,13 +385,11 @@ static void ip6_dst_ifdown(struct dst_entry *dst, struct 
net_device *dev,
 {
struct rt6_info *rt = (struct rt6_info *)dst;
struct inet6_dev *idev = rt->rt6i_idev;
-   struct net_device *loopback_dev =
-   dev_net(dev)->loopback_dev;
 
-   if (idev && idev->dev != loopback_dev) {
-   struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
-   if (loopback_idev) {
-   rt->rt6i_idev = loopback_idev;
+   if (idev && idev->dev != dev_net(dev)->loopback_dev) {
+   struct inet6_dev *ibdev = in6_dev_get(blackhole_netdev);
+   if (ibdev) {
+   rt->rt6i_idev = ibdev;
in6_dev_put(idev);
}
}
-- 
2.23.0.581.g78d2f28ef7-goog



[PATCH next] ipvlan: consolidate TSO flags using NETIF_F_ALL_TSO

2019-10-09 Thread Mahesh Bandewar
This will ensure that any new TSO related flags that get added (which
would be part of the ALL_TSO mask) are picked up automatically, so the
IPvlan driver doesn't need to be updated every time a new flag is added.

Signed-off-by: Mahesh Bandewar 
Suggested-by: Eric Dumazet 
---
 drivers/net/ipvlan/ipvlan_main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index 887bbba4631e..b0ac557f8e60 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -108,8 +108,8 @@ static void ipvlan_port_destroy(struct net_device *dev)
 
 #define IPVLAN_FEATURES \
(NETIF_F_SG | NETIF_F_CSUM_MASK | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | \
-NETIF_F_GSO | NETIF_F_TSO | NETIF_F_GSO_ROBUST | \
-NETIF_F_TSO_ECN | NETIF_F_TSO6 | NETIF_F_GRO | NETIF_F_RXCSUM | \
+NETIF_F_GSO | NETIF_F_ALL_TSO | NETIF_F_GSO_ROBUST | \
+NETIF_F_GRO | NETIF_F_RXCSUM | \
 NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_STAG_FILTER)
 
 #define IPVLAN_STATE_MASK \
-- 
2.23.0.581.g78d2f28ef7-goog



[PATCH next] blackhole_netdev: fix syzkaller reported issue

2019-10-09 Thread Mahesh Bandewar
83 
e0 07 83 c0 03 38 d0 7c 08 84 d2 0f 85 15
RSP: 0018:88806c037038 EFLAGS: 00010207
RAX: dc00 RBX: 888094c5cd60 RCX: c9000b791000
RDX: 0047 RSI: 863e3caa RDI: 023c
RBP: 88806c037098 R08: 888063de0600 R09: 88806c037288
R10: fbfff134af07 R11: 89a5783f R12: 0003
R13: 88806c037298 R14: 888094c5cd6f R15: 
FS:  7fa15e7de700() GS:8880ae80() knlGS:
CS:  0010 DS:  ES:  CR0: 80050033
CR2: 00625208 CR3: 5e348000 CR4: 001406f0
DR0:  DR1:  DR2: 
DR3:  DR6: fffe0ff0 DR7: 0400

Fixes: 8d7017fd621d ("blackhole_netdev: use blackhole_netdev to invalidate dst 
entries")
Signed-off-by: Mahesh Bandewar 
---
 net/ipv6/addrconf.c | 6 +-
 net/ipv6/route.c| 5 ++---
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 98d82305d6de..0f216f7cbe3e 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -6995,7 +6995,7 @@ static struct rtnl_af_ops inet6_ops __read_mostly = {
 
 int __init addrconf_init(void)
 {
-   struct inet6_dev *idev;
+   struct inet6_dev *idev, *bdev;
int i, err;
 
err = ipv6_addr_label_init();
@@ -7035,10 +7035,14 @@ int __init addrconf_init(void)
 */
rtnl_lock();
idev = ipv6_add_dev(init_net.loopback_dev);
+   bdev = ipv6_add_dev(blackhole_netdev);
rtnl_unlock();
if (IS_ERR(idev)) {
err = PTR_ERR(idev);
goto errlo;
+   } else if (IS_ERR(bdev)) {
+   err = PTR_ERR(bdev);
+   goto errlo;
}
 
ip6_route_init_special_entries();
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index a63ff85fe141..ea16e037ec13 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -155,10 +155,9 @@ void rt6_uncached_list_del(struct rt6_info *rt)
 
 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device 
*dev)
 {
-   struct net_device *loopback_dev = net->loopback_dev;
int cpu;
 
-   if (dev == loopback_dev)
+   if (dev == net->loopback_dev)
return;
 
for_each_possible_cpu(cpu) {
@@ -171,7 +170,7 @@ static void rt6_uncached_list_flush_dev(struct net *net, 
struct net_device *dev)
struct net_device *rt_dev = rt->dst.dev;
 
if (rt_idev->dev == dev) {
-   rt->rt6i_idev = in6_dev_get(loopback_dev);
+   rt->rt6i_idev = in6_dev_get(blackhole_netdev);
in6_dev_put(rt_idev);
}
 
-- 
2.23.0.581.g78d2f28ef7-goog



[PATCH next] loopback: fix lockdep splat

2019-07-02 Thread Mahesh Bandewar
dev_init_scheduler() and dev_activate() expect the caller to
hold RTNL. Since we don't want the blackhole device to be initialized
per ns, we initialize it once at init time, so take RTNL around these calls.

[3.855027] Call Trace:
[3.855034]  dump_stack+0x67/0x95
[3.855037]  lockdep_rcu_suspicious+0xd5/0x110
[3.855044]  dev_init_scheduler+0xe3/0x120
[3.855048]  ? net_olddevs_init+0x60/0x60
[3.855050]  blackhole_netdev_init+0x45/0x6e
[3.855052]  do_one_initcall+0x6c/0x2fa
[3.855058]  ? rcu_read_lock_sched_held+0x8c/0xa0
[3.855066]  kernel_init_freeable+0x1e5/0x288
[3.855071]  ? rest_init+0x260/0x260
[3.855074]  kernel_init+0xf/0x180
[3.855076]  ? rest_init+0x260/0x260
[3.855078]  ret_from_fork+0x24/0x30

Fixes: 4de83b88c66 ("loopback: create blackhole net device similar to loopack.")
Reported-by: Geert Uytterhoeven 
Cc: Eric Dumazet 
Signed-off-by: Mahesh Bandewar 
---
 drivers/net/loopback.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index 3b39def5471e..14545a8797a8 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -261,8 +261,10 @@ static int __init blackhole_netdev_init(void)
if (!blackhole_netdev)
return -ENOMEM;
 
+   rtnl_lock();
dev_init_scheduler(blackhole_netdev);
dev_activate(blackhole_netdev);
+   rtnl_unlock();
 
blackhole_netdev->flags |= IFF_UP | IFF_RUNNING;
dev_net_set(blackhole_netdev, &init_net);
-- 
2.22.0.410.gd8fdbe21b5-goog



[PATCHv3 next 2/3] blackhole_netdev: use blackhole_netdev to invalidate dst entries

2019-07-01 Thread Mahesh Bandewar
Use blackhole_netdev, which has a lower MTU, instead of the 'lo' device
when marking a dst "dead".

Signed-off-by: Mahesh Bandewar 
---
v1->v2->v3
  no change

 net/core/dst.c   | 2 +-
 net/ipv4/route.c | 3 +--
 net/ipv6/route.c | 2 +-
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/net/core/dst.c b/net/core/dst.c
index e46366228eaf..1325316d9eab 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -160,7 +160,7 @@ void dst_dev_put(struct dst_entry *dst)
dst->ops->ifdown(dst, dev, true);
dst->input = dst_discard;
dst->output = dst_discard_out;
-   dst->dev = dev_net(dst->dev)->loopback_dev;
+   dst->dev = blackhole_netdev;
dev_hold(dst->dev);
dev_put(dev);
 }
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index bbd55c7f6b2e..dc1f510a7c81 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1532,7 +1532,6 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
 
 void rt_flush_dev(struct net_device *dev)
 {
-   struct net *net = dev_net(dev);
struct rtable *rt;
int cpu;
 
@@ -1543,7 +1542,7 @@ void rt_flush_dev(struct net_device *dev)
list_for_each_entry(rt, &ul->head, rt_uncached) {
if (rt->dst.dev != dev)
continue;
-   rt->dst.dev = net->loopback_dev;
+   rt->dst.dev = blackhole_netdev;
dev_hold(rt->dst.dev);
dev_put(dev);
}
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 7556275b1cef..39361f57351a 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -176,7 +176,7 @@ static void rt6_uncached_list_flush_dev(struct net *net, 
struct net_device *dev)
}
 
if (rt_dev == dev) {
-   rt->dst.dev = loopback_dev;
+   rt->dst.dev = blackhole_netdev;
dev_hold(rt->dst.dev);
dev_put(rt_dev);
}
-- 
2.22.0.410.gd8fdbe21b5-goog



[PATCHv3 next 3/3] blackhole_dev: add a selftest

2019-07-01 Thread Mahesh Bandewar
Since this is not really a device with all capabilities, this test
ensures that it has *enough* to make it through the data path
without causing unwanted side-effects (read crash!).

Signed-off-by: Mahesh Bandewar 
---
v1 -> v2
  fixed the conflict resolution in selftests Makefile
v2 -> v3
  fixed lib/Kconfig.debug tristate text / string.

 lib/Kconfig.debug |   9 ++
 lib/Makefile  |   1 +
 lib/test_blackhole_dev.c  | 100 ++
 tools/testing/selftests/net/Makefile  |   2 +-
 tools/testing/selftests/net/config|   1 +
 .../selftests/net/test_blackhole_dev.sh   |  11 ++
 6 files changed, 123 insertions(+), 1 deletion(-)
 create mode 100644 lib/test_blackhole_dev.c
 create mode 100755 tools/testing/selftests/net/test_blackhole_dev.sh

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index cbdfae379896..99272b5dd980 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1909,6 +1909,15 @@ config TEST_BPF
 
  If unsure, say N.
 
+config TEST_BLACKHOLE_DEV
+   tristate "Test blackhole netdev functionality"
+   depends on m && NET
+   help
+ This builds the "test_blackhole_dev" module that validates the
+ data path through this blackhole netdev.
+
+ If unsure, say N.
+
 config FIND_BIT_BENCHMARK
tristate "Test find_bit functions"
help
diff --git a/lib/Makefile b/lib/Makefile
index dcb558c7554d..6ac44fe2a37f 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -91,6 +91,7 @@ obj-$(CONFIG_TEST_DEBUG_VIRTUAL) += test_debug_virtual.o
 obj-$(CONFIG_TEST_MEMCAT_P) += test_memcat_p.o
 obj-$(CONFIG_TEST_OBJAGG) += test_objagg.o
 obj-$(CONFIG_TEST_STACKINIT) += test_stackinit.o
+obj-$(CONFIG_TEST_BLACKHOLE_DEV) += test_blackhole_dev.o
 
 obj-$(CONFIG_TEST_LIVEPATCH) += livepatch/
 
diff --git a/lib/test_blackhole_dev.c b/lib/test_blackhole_dev.c
new file mode 100644
index ..4c40580a99a3
--- /dev/null
+++ b/lib/test_blackhole_dev.c
@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This module tests the blackhole_dev that is created during the
+ * net subsystem initialization. The test this module performs is
+ * by injecting an skb into the stack with skb->dev as the
+ * blackhole_dev and expects kernel to behave in a sane manner
+ * (in other words, *not crash*)!
+ *
+ * Copyright (c) 2018, Mahesh Bandewar 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#define SKB_SIZE  256
+#define HEAD_SIZE (14+40+8)/* Ether + IPv6 + UDP */
+#define TAIL_SIZE 32   /* random tail-room */
+
+#define UDP_PORT 1234
+
+static int __init test_blackholedev_init(void)
+{
+   struct ipv6hdr *ip6h;
+   struct sk_buff *skb;
+   struct ethhdr *ethh;
+   struct udphdr *uh;
+   int data_len;
+   int ret;
+
+   skb = alloc_skb(SKB_SIZE, GFP_KERNEL);
+   if (!skb)
+   return -ENOMEM;
+
+   /* Reserve head-room for the headers */
+   skb_reserve(skb, HEAD_SIZE);
+
+   /* Add data to the skb */
+   data_len = SKB_SIZE - (HEAD_SIZE + TAIL_SIZE);
+   memset(__skb_put(skb, data_len), 0xf, data_len);
+
+   /* Add protocol data */
+   /* (Transport) UDP */
+   uh = (struct udphdr *)skb_push(skb, sizeof(struct udphdr));
+   skb_set_transport_header(skb, 0);
+   uh->source = uh->dest = htons(UDP_PORT);
+   uh->len = htons(data_len);
+   uh->check = 0;
+   /* (Network) IPv6 */
+   ip6h = (struct ipv6hdr *)skb_push(skb, sizeof(struct ipv6hdr));
+   skb_set_network_header(skb, 0);
+   ip6h->hop_limit = 32;
+   ip6h->payload_len = data_len + sizeof(struct udphdr);
+   ip6h->nexthdr = IPPROTO_UDP;
+   ip6h->saddr = in6addr_loopback;
+   ip6h->daddr = in6addr_loopback;
+   /* Ether */
+   ethh = (struct ethhdr *)skb_push(skb, sizeof(struct ethhdr));
+   skb_set_mac_header(skb, 0);
+
+   skb->protocol = htons(ETH_P_IPV6);
+   skb->pkt_type = PACKET_HOST;
+   skb->dev = blackhole_netdev;
+
+   /* Now attempt to send the packet */
+   ret = dev_queue_xmit(skb);
+
+   switch (ret) {
+   case NET_XMIT_SUCCESS:
+   pr_warn("dev_queue_xmit() returned NET_XMIT_SUCCESS\n");
+   break;
+   case NET_XMIT_DROP:
+   pr_warn("dev_queue_xmit() returned NET_XMIT_DROP\n");
+   break;
+   case NET_XMIT_CN:
+   pr_warn("dev_queue_xmit() returned NET_XMIT_CN\n");
+   break;
+   default:
+   pr_err("dev_queue_xmit() returned UNKNOWN(%d)\n", ret);
+   }
+
+   return 0;
+}
+
+static void __exit test_blackholedev_exit(void)
+{
+   pr_warn("test_blackholedev module terminating.\n");
+}
+
+module_init(test_blackholedev_init);
+mo

[PATCHv3 next 1/3] loopback: create blackhole net device similar to loopack.

2019-07-01 Thread Mahesh Bandewar
Create a blackhole net device that can be used for "dead"
dst entries instead of the loopback device. This blackhole device differs
from loopback in a few aspects: (a) it's not per-ns; (b) the MTU on this
device is ETH_MIN_MTU; (c) the xmit function is essentially kfree_skb();
and (d) since it's not registered, it won't have an ifindex.

The lower MTU effectively makes the device fail the MTU check during
the route check when a dst associated with the skb is dead.

Signed-off-by: Mahesh Bandewar 
---
v1->v2->v3
  no change

 drivers/net/loopback.c| 76 ++-
 include/linux/netdevice.h |  2 ++
 2 files changed, 69 insertions(+), 9 deletions(-)

diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index 87d361666cdd..3b39def5471e 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -55,6 +55,13 @@
 #include 
 #include 
 
+/* blackhole_netdev - a device used for dsts that are marked expired!
+ * This is global device (instead of per-net-ns) since it's not needed
+ * to be per-ns and gets initialized at boot time.
+ */
+struct net_device *blackhole_netdev;
+EXPORT_SYMBOL(blackhole_netdev);
+
 /* The higher levels take care of making this non-reentrant (it's
  * called with bh's disabled).
  */
@@ -150,12 +157,14 @@ static const struct net_device_ops loopback_ops = {
.ndo_set_mac_address = eth_mac_addr,
 };
 
-/* The loopback device is special. There is only one instance
- * per network namespace.
- */
-static void loopback_setup(struct net_device *dev)
+static void gen_lo_setup(struct net_device *dev,
+unsigned int mtu,
+const struct ethtool_ops *eth_ops,
+const struct header_ops *hdr_ops,
+const struct net_device_ops *dev_ops,
+void (*dev_destructor)(struct net_device *dev))
 {
-   dev->mtu= 64 * 1024;
+   dev->mtu= mtu;
dev->hard_header_len= ETH_HLEN; /* 14   */
dev->min_header_len = ETH_HLEN; /* 14   */
dev->addr_len   = ETH_ALEN; /* 6*/
@@ -174,11 +183,20 @@ static void loopback_setup(struct net_device *dev)
| NETIF_F_NETNS_LOCAL
| NETIF_F_VLAN_CHALLENGED
| NETIF_F_LOOPBACK;
-   dev->ethtool_ops= &loopback_ethtool_ops;
-   dev->header_ops = &eth_header_ops;
-   dev->netdev_ops = &loopback_ops;
+   dev->ethtool_ops= eth_ops;
+   dev->header_ops = hdr_ops;
+   dev->netdev_ops = dev_ops;
dev->needs_free_netdev  = true;
-   dev->priv_destructor= loopback_dev_free;
+   dev->priv_destructor= dev_destructor;
+}
+
+/* The loopback device is special. There is only one instance
+ * per network namespace.
+ */
+static void loopback_setup(struct net_device *dev)
+{
+   gen_lo_setup(dev, (64 * 1024), &loopback_ethtool_ops, &eth_header_ops,
+&loopback_ops, loopback_dev_free);
 }
 
 /* Setup and register the loopback device. */
@@ -213,3 +231,43 @@ static __net_init int loopback_net_init(struct net *net)
 struct pernet_operations __net_initdata loopback_net_ops = {
.init = loopback_net_init,
 };
+
+/* blackhole netdevice */
+static netdev_tx_t blackhole_netdev_xmit(struct sk_buff *skb,
+struct net_device *dev)
+{
+   kfree_skb(skb);
+   net_warn_ratelimited("%s(): Dropping skb.\n", __func__);
+   return NETDEV_TX_OK;
+}
+
+static const struct net_device_ops blackhole_netdev_ops = {
+   .ndo_start_xmit = blackhole_netdev_xmit,
+};
+
+/* This is a dst-dummy device used specifically for invalidated
+ * DSTs and unlike loopback, this is not per-ns.
+ */
+static void blackhole_netdev_setup(struct net_device *dev)
+{
+   gen_lo_setup(dev, ETH_MIN_MTU, NULL, NULL, &blackhole_netdev_ops, NULL);
+}
+
+/* Setup and register the blackhole_netdev. */
+static int __init blackhole_netdev_init(void)
+{
+   blackhole_netdev = alloc_netdev(0, "blackhole_dev", NET_NAME_UNKNOWN,
+   blackhole_netdev_setup);
+   if (!blackhole_netdev)
+   return -ENOMEM;
+
+   dev_init_scheduler(blackhole_netdev);
+   dev_activate(blackhole_netdev);
+
+   blackhole_netdev->flags |= IFF_UP | IFF_RUNNING;
+   dev_net_set(blackhole_netdev, &init_net);
+
+   return 0;
+}
+
+device_initcall(blackhole_netdev_init);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index eeacebd7debb..88292953aa6f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4870,4 +4870,6 @@ do {  
\
 #define PTYPE_HASH_SIZE(16)
 #define PTYPE_HASH_MASK(PTYPE_HASH_SIZE - 1)
 
+extern struct net_device *blackhole_netdev;
+
 #endif /* _LINUX_NETDEVICE_H */
-- 
2.22.0.410.gd8fdbe21b5-goog



[PATCHv3 next 0/3] blackhole device to invalidate dst

2019-07-01 Thread Mahesh Bandewar
When we invalidate a dst or mark it "dead", we assign 'lo' to
dst->dev. First of all, this assignment is racy and, moreover,
it has MTU implications.

The standard dev MTU is 1500 while the loopback MTU is 64k. The TCP
code, when dereferencing the dst, doesn't check whether the dst is valid
or not. TCP, when dereferencing a dead dst while negotiating a
new connection, may use the dst device, which is 'lo', instead of
using the correct device. Consider the following scenario:

A SYN arrives on an interface, and the TCP layer, while processing
the SYNACK, finds a dst and associates it with the SYNACK skb. Now, before
the skb gets passed to L3 for processing, if that dst becomes "dead"
(because the virtual device disappeared and then reappeared),
'lo' gets assigned to that dst (lo MTU = 64k). Let's assume
the SYN has ADV_MSS set to 9k while the output device through
which this SYNACK is going to go out has the standard MTU of 1500.
The MTU check during the route check passes since MIN(9K, 64K)
is 9k, and TCP successfully negotiates a 9k MSS. A subsequent
data packet, bigger in size, gets passed to the device and it
won't be marked as GSO since the assumed MTU of the device is
9k.

This can crash the NIC, and we have seen fixes that went
into drivers to handle this scenario: 8914a595110a ('bnx2x:
disable GSO where gso_size is too big for hardware') and
2b16f048729b ('net: create skb_gso_validate_mac_len()'). With
those fixes TCP eventually recovers, but not before a
few dropped segments.

Well, I'm not a TCP expert, and though we have experienced
these corner cases in our environment, I could not reproduce
this case reliably in my test setup to try this fix myself.
However, Michael Chan had a setup
where these fixes helped him mitigate the issue and avoid
the crash.

The idea here is not to alter the data path with additional
locks or smp_mb()/rmb() barriers to avoid racy assignments, but
to create a new device that has a really low MTU and whose
.ndo_start_xmit is essentially a kfree_skb(). Make use of this
device instead of 'lo' when marking the dst dead.

First patch implements the blackhole device and second
patch uses it in IPv4 and IPv6 stack while the third patch
is the self test that ensures the sanity of this device.

v1->v2
  fixed the self-test patch to handle the conflict

v2 -> v3
  fixed Kconfig text/string.

Mahesh Bandewar (3):
  loopback: create blackhole net device similar to loopack.
  blackhole_netdev: use blackhole_netdev to invalidate dst entries
  blackhole_dev: add a selftest

 drivers/net/loopback.c|  76 +++--
 include/linux/netdevice.h |   2 +
 lib/Kconfig.debug |   9 ++
 lib/Makefile  |   1 +
 lib/test_blackhole_dev.c  | 100 ++
 net/core/dst.c|   2 +-
 net/ipv4/route.c  |   3 +-
 net/ipv6/route.c  |   2 +-
 tools/testing/selftests/net/Makefile  |   2 +-
 tools/testing/selftests/net/config|   1 +
 .../selftests/net/test_blackhole_dev.sh   |  11 ++
 11 files changed, 195 insertions(+), 14 deletions(-)
 create mode 100644 lib/test_blackhole_dev.c
 create mode 100755 tools/testing/selftests/net/test_blackhole_dev.sh

-- 
2.22.0.410.gd8fdbe21b5-goog



[PATCHv2 next 3/3] blackhole_dev: add a selftest

2019-06-27 Thread Mahesh Bandewar
Since this is not really a device with all capabilities, this test
ensures that it has *enough* to make it through the data path
without causing unwanted side-effects (read crash!).

Signed-off-by: Mahesh Bandewar 
---
v1 -> v2
  fixed the conflict resolution in selftests Makefile

 lib/Kconfig.debug |   9 ++
 lib/Makefile  |   1 +
 lib/test_blackhole_dev.c  | 100 ++
 tools/testing/selftests/net/Makefile  |   2 +-
 tools/testing/selftests/net/config|   1 +
 .../selftests/net/test_blackhole_dev.sh   |  11 ++
 6 files changed, 123 insertions(+), 1 deletion(-)
 create mode 100644 lib/test_blackhole_dev.c
 create mode 100755 tools/testing/selftests/net/test_blackhole_dev.sh

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index cbdfae379896..79d96e1614f5 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1909,6 +1909,15 @@ config TEST_BPF
 
  If unsure, say N.
 
+config TEST_BLACKHOLE_DEV
+   tristate "Test BPF filter functionality"
+   depends on m && NET
+   help
+ This builds the "test_blackhole_dev" module that validates the
+ data path through this blackhole netdev.
+
+ If unsure, say N.
+
 config FIND_BIT_BENCHMARK
tristate "Test find_bit functions"
help
diff --git a/lib/Makefile b/lib/Makefile
index fb7697031a79..c293624d3664 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -91,6 +91,7 @@ obj-$(CONFIG_TEST_DEBUG_VIRTUAL) += test_debug_virtual.o
 obj-$(CONFIG_TEST_MEMCAT_P) += test_memcat_p.o
 obj-$(CONFIG_TEST_OBJAGG) += test_objagg.o
 obj-$(CONFIG_TEST_STACKINIT) += test_stackinit.o
+obj-$(CONFIG_TEST_BLACKHOLE_DEV) += test_blackhole_dev.o
 
 obj-$(CONFIG_TEST_LIVEPATCH) += livepatch/
 
diff --git a/lib/test_blackhole_dev.c b/lib/test_blackhole_dev.c
new file mode 100644
index ..4c40580a99a3
--- /dev/null
+++ b/lib/test_blackhole_dev.c
@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This module tests the blackhole_dev that is created during the
+ * net subsystem initialization. The test this module performs is
+ * by injecting an skb into the stack with skb->dev as the
+ * blackhole_dev and expects kernel to behave in a sane manner
+ * (in other words, *not crash*)!
+ *
+ * Copyright (c) 2018, Mahesh Bandewar 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#define SKB_SIZE  256
+#define HEAD_SIZE (14+40+8)/* Ether + IPv6 + UDP */
+#define TAIL_SIZE 32   /* random tail-room */
+
+#define UDP_PORT 1234
+
+static int __init test_blackholedev_init(void)
+{
+   struct ipv6hdr *ip6h;
+   struct sk_buff *skb;
+   struct ethhdr *ethh;
+   struct udphdr *uh;
+   int data_len;
+   int ret;
+
+   skb = alloc_skb(SKB_SIZE, GFP_KERNEL);
+   if (!skb)
+   return -ENOMEM;
+
+   /* Reserve head-room for the headers */
+   skb_reserve(skb, HEAD_SIZE);
+
+   /* Add data to the skb */
+   data_len = SKB_SIZE - (HEAD_SIZE + TAIL_SIZE);
+   memset(__skb_put(skb, data_len), 0xf, data_len);
+
+   /* Add protocol data */
+   /* (Transport) UDP */
+   uh = (struct udphdr *)skb_push(skb, sizeof(struct udphdr));
+   skb_set_transport_header(skb, 0);
+   uh->source = uh->dest = htons(UDP_PORT);
+   uh->len = htons(data_len);
+   uh->check = 0;
+   /* (Network) IPv6 */
+   ip6h = (struct ipv6hdr *)skb_push(skb, sizeof(struct ipv6hdr));
+   skb_set_network_header(skb, 0);
+   ip6h->hop_limit = 32;
+   ip6h->payload_len = data_len + sizeof(struct udphdr);
+   ip6h->nexthdr = IPPROTO_UDP;
+   ip6h->saddr = in6addr_loopback;
+   ip6h->daddr = in6addr_loopback;
+   /* Ether */
+   ethh = (struct ethhdr *)skb_push(skb, sizeof(struct ethhdr));
+   skb_set_mac_header(skb, 0);
+
+   skb->protocol = htons(ETH_P_IPV6);
+   skb->pkt_type = PACKET_HOST;
+   skb->dev = blackhole_netdev;
+
+   /* Now attempt to send the packet */
+   ret = dev_queue_xmit(skb);
+
+   switch (ret) {
+   case NET_XMIT_SUCCESS:
+   pr_warn("dev_queue_xmit() returned NET_XMIT_SUCCESS\n");
+   break;
+   case NET_XMIT_DROP:
+   pr_warn("dev_queue_xmit() returned NET_XMIT_DROP\n");
+   break;
+   case NET_XMIT_CN:
+   pr_warn("dev_queue_xmit() returned NET_XMIT_CN\n");
+   break;
+   default:
+   pr_err("dev_queue_xmit() returned UNKNOWN(%d)\n", ret);
+   }
+
+   return 0;
+}
+
+static void __exit test_blackholedev_exit(void)
+{
+   pr_warn("test_blackholedev module terminating.\n");
+}
+
+module_init(test_blackholedev_init);
+module_exit(test_blackholedev_exit);
+
+MODULE_AUTHOR("Mahe

[PATCHv2 next 0/3] blackhole device to invalidate dst

2019-06-27 Thread Mahesh Bandewar
When we invalidate a dst or mark it "dead", we assign 'lo' to
dst->dev. First of all, this assignment is racy and, moreover,
it has MTU implications.

The standard dev MTU is 1500 while the loopback MTU is 64k. The TCP
code, when dereferencing the dst, doesn't check whether the dst is valid
or not. TCP, when dereferencing a dead dst while negotiating a
new connection, may use the dst device, which is 'lo', instead of
using the correct device. Consider the following scenario:

A SYN arrives on an interface and the TCP layer, while processing
the SYNACK, finds a dst and associates it with the SYNACK skb. If
that dst becomes "dead" before the skb gets passed to L3 for
processing (because the virtual device disappeared and then
reappeared), 'lo' gets assigned to that dst (lo MTU = 64k). Let's
assume the SYN has ADV_MSS set to 9k while the output device
through which this SYNACK is going to go out has the standard MTU
of 1500. The MTU check during the route check passes since
MIN(9k, 64k) is 9k, and TCP successfully negotiates a 9k MSS. The
subsequent data packet, which is bigger in size, gets passed to
the device and it won't be marked as GSO since the assumed MTU of
the device is 9k.

This either crashes the NIC or, with the fixes we have seen go
into drivers to handle this scenario - 8914a595110a ('bnx2x:
disable GSO where gso_size is too big for hardware') and
2b16f048729b ('net: create skb_gso_validate_mac_len()') -
TCP eventually recovers, but not before a few segments are
dropped.

Well, I'm not a TCP expert and though we have experienced
these corner cases in our environment, I could not reproduce 
this case reliably in my test setup to try this fix myself.
However, Michael Chan  had a setup
where these fixes helped him mitigate the issue and not cause
the crash.

The idea here is not to alter the data path with additional
locks or smp_mb()/smp_rmb() barriers to avoid racy assignments,
but to create a new device that has a really low MTU and whose
.ndo_start_xmit is essentially a kfree_skb(). Make use of this
device instead of 'lo' when marking the dst dead.

The first patch implements the blackhole device, the second
patch uses it in the IPv4 and IPv6 stacks, and the third patch
is the selftest that ensures the sanity of this device.

v1->v2
  fixed the self-test patch to handle the conflict

Mahesh Bandewar (3):
  loopback: create blackhole net device similar to loopack.
  blackhole_netdev: use blackhole_netdev to invalidate dst entries
  blackhole_dev: add a selftest

 drivers/net/loopback.c|  76 +++--
 include/linux/netdevice.h |   2 +
 lib/Kconfig.debug |   9 ++
 lib/Makefile  |   1 +
 lib/test_blackhole_dev.c  | 100 ++
 net/core/dst.c|   2 +-
 net/ipv4/route.c  |   3 +-
 net/ipv6/route.c  |   2 +-
 tools/testing/selftests/net/Makefile  |   2 +-
 tools/testing/selftests/net/config|   1 +
 .../selftests/net/test_blackhole_dev.sh   |  11 ++
 11 files changed, 195 insertions(+), 14 deletions(-)
 create mode 100644 lib/test_blackhole_dev.c
 create mode 100755 tools/testing/selftests/net/test_blackhole_dev.sh

-- 
2.22.0.410.gd8fdbe21b5-goog



[PATCHv2 next 1/3] loopback: create blackhole net device similar to loopack.

2019-06-27 Thread Mahesh Bandewar
Create a blackhole net device that can be used for "dead"
dst entries instead of the loopback device. This blackhole device
differs from loopback in a few aspects: (a) it's not per-ns, (b) the
MTU on this device is ETH_MIN_MTU, (c) the xmit function is essentially
kfree_skb(), and (d) since it's not registered it won't have an ifindex.

The lower MTU effectively makes the device fail the MTU check during
the route check when the dst associated with the skb is dead.

Signed-off-by: Mahesh Bandewar 
---
v1->v2
  no change

 drivers/net/loopback.c| 76 ++-
 include/linux/netdevice.h |  2 ++
 2 files changed, 69 insertions(+), 9 deletions(-)

diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index 87d361666cdd..3b39def5471e 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -55,6 +55,13 @@
 #include 
 #include 
 
+/* blackhole_netdev - a device used for dsts that are marked expired!
+ * This is global device (instead of per-net-ns) since it's not needed
+ * to be per-ns and gets initialized at boot time.
+ */
+struct net_device *blackhole_netdev;
+EXPORT_SYMBOL(blackhole_netdev);
+
 /* The higher levels take care of making this non-reentrant (it's
  * called with bh's disabled).
  */
@@ -150,12 +157,14 @@ static const struct net_device_ops loopback_ops = {
.ndo_set_mac_address = eth_mac_addr,
 };
 
-/* The loopback device is special. There is only one instance
- * per network namespace.
- */
-static void loopback_setup(struct net_device *dev)
+static void gen_lo_setup(struct net_device *dev,
+unsigned int mtu,
+const struct ethtool_ops *eth_ops,
+const struct header_ops *hdr_ops,
+const struct net_device_ops *dev_ops,
+void (*dev_destructor)(struct net_device *dev))
 {
-   dev->mtu= 64 * 1024;
+   dev->mtu= mtu;
dev->hard_header_len= ETH_HLEN; /* 14   */
dev->min_header_len = ETH_HLEN; /* 14   */
dev->addr_len   = ETH_ALEN; /* 6*/
@@ -174,11 +183,20 @@ static void loopback_setup(struct net_device *dev)
| NETIF_F_NETNS_LOCAL
| NETIF_F_VLAN_CHALLENGED
| NETIF_F_LOOPBACK;
-   dev->ethtool_ops= &loopback_ethtool_ops;
-   dev->header_ops = &eth_header_ops;
-   dev->netdev_ops = &loopback_ops;
+   dev->ethtool_ops= eth_ops;
+   dev->header_ops = hdr_ops;
+   dev->netdev_ops = dev_ops;
dev->needs_free_netdev  = true;
-   dev->priv_destructor= loopback_dev_free;
+   dev->priv_destructor= dev_destructor;
+}
+
+/* The loopback device is special. There is only one instance
+ * per network namespace.
+ */
+static void loopback_setup(struct net_device *dev)
+{
+   gen_lo_setup(dev, (64 * 1024), &loopback_ethtool_ops, &eth_header_ops,
+&loopback_ops, loopback_dev_free);
 }
 
 /* Setup and register the loopback device. */
@@ -213,3 +231,43 @@ static __net_init int loopback_net_init(struct net *net)
 struct pernet_operations __net_initdata loopback_net_ops = {
.init = loopback_net_init,
 };
+
+/* blackhole netdevice */
+static netdev_tx_t blackhole_netdev_xmit(struct sk_buff *skb,
+struct net_device *dev)
+{
+   kfree_skb(skb);
+   net_warn_ratelimited("%s(): Dropping skb.\n", __func__);
+   return NETDEV_TX_OK;
+}
+
+static const struct net_device_ops blackhole_netdev_ops = {
+   .ndo_start_xmit = blackhole_netdev_xmit,
+};
+
+/* This is a dst-dummy device used specifically for invalidated
+ * DSTs and unlike loopback, this is not per-ns.
+ */
+static void blackhole_netdev_setup(struct net_device *dev)
+{
+   gen_lo_setup(dev, ETH_MIN_MTU, NULL, NULL, &blackhole_netdev_ops, NULL);
+}
+
+/* Setup and register the blackhole_netdev. */
+static int __init blackhole_netdev_init(void)
+{
+   blackhole_netdev = alloc_netdev(0, "blackhole_dev", NET_NAME_UNKNOWN,
+   blackhole_netdev_setup);
+   if (!blackhole_netdev)
+   return -ENOMEM;
+
+   dev_init_scheduler(blackhole_netdev);
+   dev_activate(blackhole_netdev);
+
+   blackhole_netdev->flags |= IFF_UP | IFF_RUNNING;
+   dev_net_set(blackhole_netdev, &init_net);
+
+   return 0;
+}
+
+device_initcall(blackhole_netdev_init);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index eeacebd7debb..88292953aa6f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4870,4 +4870,6 @@ do {  
\
 #define PTYPE_HASH_SIZE(16)
 #define PTYPE_HASH_MASK(PTYPE_HASH_SIZE - 1)
 
+extern struct net_device *blackhole_netdev;
+
 #endif /* _LINUX_NETDEVICE_H */
-- 
2.22.0.410.gd8fdbe21b5-goog



[PATCHv2 next 2/3] blackhole_netdev: use blackhole_netdev to invalidate dst entries

2019-06-27 Thread Mahesh Bandewar
Use blackhole_netdev, which has a lower MTU, instead of the 'lo' device
when marking a dst "dead".

Signed-off-by: Mahesh Bandewar 
---
v1 -> v2
  no change

 net/core/dst.c   | 2 +-
 net/ipv4/route.c | 3 +--
 net/ipv6/route.c | 2 +-
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/net/core/dst.c b/net/core/dst.c
index e46366228eaf..1325316d9eab 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -160,7 +160,7 @@ void dst_dev_put(struct dst_entry *dst)
dst->ops->ifdown(dst, dev, true);
dst->input = dst_discard;
dst->output = dst_discard_out;
-   dst->dev = dev_net(dst->dev)->loopback_dev;
+   dst->dev = blackhole_netdev;
dev_hold(dst->dev);
dev_put(dev);
 }
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 59670fafcd26..d61f43feb7fa 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1532,7 +1532,6 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
 
 void rt_flush_dev(struct net_device *dev)
 {
-   struct net *net = dev_net(dev);
struct rtable *rt;
int cpu;
 
@@ -1543,7 +1542,7 @@ void rt_flush_dev(struct net_device *dev)
list_for_each_entry(rt, &ul->head, rt_uncached) {
if (rt->dst.dev != dev)
continue;
-   rt->dst.dev = net->loopback_dev;
+   rt->dst.dev = blackhole_netdev;
dev_hold(rt->dst.dev);
dev_put(dev);
}
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index e7c2824435c6..ff44c6287633 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -176,7 +176,7 @@ static void rt6_uncached_list_flush_dev(struct net *net, 
struct net_device *dev)
}
 
if (rt_dev == dev) {
-   rt->dst.dev = loopback_dev;
+   rt->dst.dev = blackhole_netdev;
dev_hold(rt->dst.dev);
dev_put(rt_dev);
}
-- 
2.22.0.410.gd8fdbe21b5-goog



[PATCH next 2/3] blackhole_netdev: use blackhole_netdev to invalidate dst entries

2019-06-21 Thread Mahesh Bandewar
Use blackhole_netdev, which has a lower MTU, instead of the 'lo' device
when marking a dst "dead".

Signed-off-by: Mahesh Bandewar 
---
 net/core/dst.c   | 2 +-
 net/ipv4/route.c | 3 +--
 net/ipv6/route.c | 2 +-
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/net/core/dst.c b/net/core/dst.c
index e46366228eaf..1325316d9eab 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -160,7 +160,7 @@ void dst_dev_put(struct dst_entry *dst)
dst->ops->ifdown(dst, dev, true);
dst->input = dst_discard;
dst->output = dst_discard_out;
-   dst->dev = dev_net(dst->dev)->loopback_dev;
+   dst->dev = blackhole_netdev;
dev_hold(dst->dev);
dev_put(dev);
 }
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 66cbe8a7a168..049af0f88307 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1532,7 +1532,6 @@ static void ipv4_dst_destroy(struct dst_entry *dst)
 
 void rt_flush_dev(struct net_device *dev)
 {
-   struct net *net = dev_net(dev);
struct rtable *rt;
int cpu;
 
@@ -1543,7 +1542,7 @@ void rt_flush_dev(struct net_device *dev)
list_for_each_entry(rt, &ul->head, rt_uncached) {
if (rt->dst.dev != dev)
continue;
-   rt->dst.dev = net->loopback_dev;
+   rt->dst.dev = blackhole_netdev;
dev_hold(rt->dst.dev);
dev_put(dev);
}
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index c4d285fe0adc..335bcf812b25 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -176,7 +176,7 @@ static void rt6_uncached_list_flush_dev(struct net *net, 
struct net_device *dev)
}
 
if (rt_dev == dev) {
-   rt->dst.dev = loopback_dev;
+   rt->dst.dev = blackhole_netdev;
dev_hold(rt->dst.dev);
dev_put(rt_dev);
}
-- 
2.22.0.410.gd8fdbe21b5-goog



[PATCH next 3/3] blackhole_dev: add a selftest

2019-06-21 Thread Mahesh Bandewar
Since this is not really a device with all capabilities, this test
ensures that it has *enough* to make it through the data path
without causing unwanted side effects (read: a crash!).

Signed-off-by: Mahesh Bandewar 
---
 lib/Kconfig.debug |   9 ++
 lib/Makefile  |   1 +
 lib/test_blackhole_dev.c  | 100 ++
 tools/testing/selftests/net/Makefile  |   3 +-
 tools/testing/selftests/net/config|   1 +
 .../selftests/net/test_blackhole_dev.sh   |  11 ++
 6 files changed, 124 insertions(+), 1 deletion(-)
 create mode 100644 lib/test_blackhole_dev.c
 create mode 100755 tools/testing/selftests/net/test_blackhole_dev.sh

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index cbdfae379896..79d96e1614f5 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1909,6 +1909,15 @@ config TEST_BPF
 
  If unsure, say N.
 
+config TEST_BLACKHOLE_DEV
+   tristate "Test BPF filter functionality"
+   depends on m && NET
+   help
+ This builds the "test_blackhole_dev" module that validates the
+ data path through this blackhole netdev.
+
+ If unsure, say N.
+
 config FIND_BIT_BENCHMARK
tristate "Test find_bit functions"
help
diff --git a/lib/Makefile b/lib/Makefile
index fb7697031a79..c293624d3664 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -91,6 +91,7 @@ obj-$(CONFIG_TEST_DEBUG_VIRTUAL) += test_debug_virtual.o
 obj-$(CONFIG_TEST_MEMCAT_P) += test_memcat_p.o
 obj-$(CONFIG_TEST_OBJAGG) += test_objagg.o
 obj-$(CONFIG_TEST_STACKINIT) += test_stackinit.o
+obj-$(CONFIG_TEST_BLACKHOLE_DEV) += test_blackhole_dev.o
 
 obj-$(CONFIG_TEST_LIVEPATCH) += livepatch/
 
diff --git a/lib/test_blackhole_dev.c b/lib/test_blackhole_dev.c
new file mode 100644
index ..4c40580a99a3
--- /dev/null
+++ b/lib/test_blackhole_dev.c
@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This module tests the blackhole_dev that is created during the
+ * net subsystem initialization. The test this module performs is
+ * by injecting an skb into the stack with skb->dev as the
+ * blackhole_dev and expects kernel to behave in a sane manner
+ * (in other words, *not crash*)!
+ *
+ * Copyright (c) 2018, Mahesh Bandewar 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#define SKB_SIZE  256
+#define HEAD_SIZE (14+40+8)/* Ether + IPv6 + UDP */
+#define TAIL_SIZE 32   /* random tail-room */
+
+#define UDP_PORT 1234
+
+static int __init test_blackholedev_init(void)
+{
+   struct ipv6hdr *ip6h;
+   struct sk_buff *skb;
+   struct ethhdr *ethh;
+   struct udphdr *uh;
+   int data_len;
+   int ret;
+
+   skb = alloc_skb(SKB_SIZE, GFP_KERNEL);
+   if (!skb)
+   return -ENOMEM;
+
+   /* Reserve head-room for the headers */
+   skb_reserve(skb, HEAD_SIZE);
+
+   /* Add data to the skb */
+   data_len = SKB_SIZE - (HEAD_SIZE + TAIL_SIZE);
+   memset(__skb_put(skb, data_len), 0xf, data_len);
+
+   /* Add protocol data */
+   /* (Transport) UDP */
+   uh = (struct udphdr *)skb_push(skb, sizeof(struct udphdr));
+   skb_set_transport_header(skb, 0);
+   uh->source = uh->dest = htons(UDP_PORT);
+   uh->len = htons(data_len);
+   uh->check = 0;
+   /* (Network) IPv6 */
+   ip6h = (struct ipv6hdr *)skb_push(skb, sizeof(struct ipv6hdr));
+   skb_set_network_header(skb, 0);
+   ip6h->hop_limit = 32;
+   ip6h->payload_len = data_len + sizeof(struct udphdr);
+   ip6h->nexthdr = IPPROTO_UDP;
+   ip6h->saddr = in6addr_loopback;
+   ip6h->daddr = in6addr_loopback;
+   /* Ether */
+   ethh = (struct ethhdr *)skb_push(skb, sizeof(struct ethhdr));
+   skb_set_mac_header(skb, 0);
+
+   skb->protocol = htons(ETH_P_IPV6);
+   skb->pkt_type = PACKET_HOST;
+   skb->dev = blackhole_netdev;
+
+   /* Now attempt to send the packet */
+   ret = dev_queue_xmit(skb);
+
+   switch (ret) {
+   case NET_XMIT_SUCCESS:
+   pr_warn("dev_queue_xmit() returned NET_XMIT_SUCCESS\n");
+   break;
+   case NET_XMIT_DROP:
+   pr_warn("dev_queue_xmit() returned NET_XMIT_DROP\n");
+   break;
+   case NET_XMIT_CN:
+   pr_warn("dev_queue_xmit() returned NET_XMIT_CN\n");
+   break;
+   default:
+   pr_err("dev_queue_xmit() returned UNKNOWN(%d)\n", ret);
+   }
+
+   return 0;
+}
+
+static void __exit test_blackholedev_exit(void)
+{
+   pr_warn("test_blackholedev module terminating.\n");
+}
+
+module_init(test_blackholedev_init);
+module_exit(test_blackholedev_exit);
+
+MODULE_AUTHOR("Mahesh Bandewar ");
+MODULE_LICENSE("GPL");
diff --git a/too

[PATCH next 1/3] loopback: create blackhole net device similar to loopack.

2019-06-21 Thread Mahesh Bandewar
Create a blackhole net device that can be used for "dead"
dst entries instead of the loopback device. This blackhole device
differs from loopback in a few aspects: (a) it's not per-ns, (b) the
MTU on this device is ETH_MIN_MTU, (c) the xmit function is essentially
kfree_skb(), and (d) since it's not registered it won't have an ifindex.

The lower MTU effectively makes the device fail the MTU check during
the route check when the dst associated with the skb is dead.

Signed-off-by: Mahesh Bandewar 
---
 drivers/net/loopback.c| 76 ++-
 include/linux/netdevice.h |  2 ++
 2 files changed, 69 insertions(+), 9 deletions(-)

diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index 87d361666cdd..3b39def5471e 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -55,6 +55,13 @@
 #include 
 #include 
 
+/* blackhole_netdev - a device used for dsts that are marked expired!
+ * This is global device (instead of per-net-ns) since it's not needed
+ * to be per-ns and gets initialized at boot time.
+ */
+struct net_device *blackhole_netdev;
+EXPORT_SYMBOL(blackhole_netdev);
+
 /* The higher levels take care of making this non-reentrant (it's
  * called with bh's disabled).
  */
@@ -150,12 +157,14 @@ static const struct net_device_ops loopback_ops = {
.ndo_set_mac_address = eth_mac_addr,
 };
 
-/* The loopback device is special. There is only one instance
- * per network namespace.
- */
-static void loopback_setup(struct net_device *dev)
+static void gen_lo_setup(struct net_device *dev,
+unsigned int mtu,
+const struct ethtool_ops *eth_ops,
+const struct header_ops *hdr_ops,
+const struct net_device_ops *dev_ops,
+void (*dev_destructor)(struct net_device *dev))
 {
-   dev->mtu= 64 * 1024;
+   dev->mtu= mtu;
dev->hard_header_len= ETH_HLEN; /* 14   */
dev->min_header_len = ETH_HLEN; /* 14   */
dev->addr_len   = ETH_ALEN; /* 6*/
@@ -174,11 +183,20 @@ static void loopback_setup(struct net_device *dev)
| NETIF_F_NETNS_LOCAL
| NETIF_F_VLAN_CHALLENGED
| NETIF_F_LOOPBACK;
-   dev->ethtool_ops= &loopback_ethtool_ops;
-   dev->header_ops = &eth_header_ops;
-   dev->netdev_ops = &loopback_ops;
+   dev->ethtool_ops= eth_ops;
+   dev->header_ops = hdr_ops;
+   dev->netdev_ops = dev_ops;
dev->needs_free_netdev  = true;
-   dev->priv_destructor= loopback_dev_free;
+   dev->priv_destructor= dev_destructor;
+}
+
+/* The loopback device is special. There is only one instance
+ * per network namespace.
+ */
+static void loopback_setup(struct net_device *dev)
+{
+   gen_lo_setup(dev, (64 * 1024), &loopback_ethtool_ops, &eth_header_ops,
+&loopback_ops, loopback_dev_free);
 }
 
 /* Setup and register the loopback device. */
@@ -213,3 +231,43 @@ static __net_init int loopback_net_init(struct net *net)
 struct pernet_operations __net_initdata loopback_net_ops = {
.init = loopback_net_init,
 };
+
+/* blackhole netdevice */
+static netdev_tx_t blackhole_netdev_xmit(struct sk_buff *skb,
+struct net_device *dev)
+{
+   kfree_skb(skb);
+   net_warn_ratelimited("%s(): Dropping skb.\n", __func__);
+   return NETDEV_TX_OK;
+}
+
+static const struct net_device_ops blackhole_netdev_ops = {
+   .ndo_start_xmit = blackhole_netdev_xmit,
+};
+
+/* This is a dst-dummy device used specifically for invalidated
+ * DSTs and unlike loopback, this is not per-ns.
+ */
+static void blackhole_netdev_setup(struct net_device *dev)
+{
+   gen_lo_setup(dev, ETH_MIN_MTU, NULL, NULL, &blackhole_netdev_ops, NULL);
+}
+
+/* Setup and register the blackhole_netdev. */
+static int __init blackhole_netdev_init(void)
+{
+   blackhole_netdev = alloc_netdev(0, "blackhole_dev", NET_NAME_UNKNOWN,
+   blackhole_netdev_setup);
+   if (!blackhole_netdev)
+   return -ENOMEM;
+
+   dev_init_scheduler(blackhole_netdev);
+   dev_activate(blackhole_netdev);
+
+   blackhole_netdev->flags |= IFF_UP | IFF_RUNNING;
+   dev_net_set(blackhole_netdev, &init_net);
+
+   return 0;
+}
+
+device_initcall(blackhole_netdev_init);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index eeacebd7debb..88292953aa6f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4870,4 +4870,6 @@ do {  
\
 #define PTYPE_HASH_SIZE(16)
 #define PTYPE_HASH_MASK(PTYPE_HASH_SIZE - 1)
 
+extern struct net_device *blackhole_netdev;
+
 #endif /* _LINUX_NETDEVICE_H */
-- 
2.22.0.410.gd8fdbe21b5-goog



[PATCH next 0/3] blackhole device to invalidate dst

2019-06-21 Thread Mahesh Bandewar
When we invalidate a dst or mark it "dead", we assign 'lo' to
dst->dev. First of all, this assignment is racy and, moreover,
it has MTU implications.

The standard dev MTU is 1500 while the loopback MTU is 64k. The
TCP code, when dereferencing the dst, doesn't check whether the
dst is valid. When TCP dereferences a dead dst while negotiating
a new connection, it may use the dst device, which is 'lo',
instead of the correct device. Consider the following scenario:

A SYN arrives on an interface and the TCP layer, while processing
the SYNACK, finds a dst and associates it with the SYNACK skb. If
that dst becomes "dead" before the skb gets passed to L3 for
processing (because the virtual device disappeared and then
reappeared), 'lo' gets assigned to that dst (lo MTU = 64k). Let's
assume the SYN has ADV_MSS set to 9k while the output device
through which this SYNACK is going to go out has the standard MTU
of 1500. The MTU check during the route check passes since
MIN(9k, 64k) is 9k, and TCP successfully negotiates a 9k MSS. The
subsequent data packet, which is bigger in size, gets passed to
the device and it won't be marked as GSO since the assumed MTU of
the device is 9k.

This either crashes the NIC or, with the fixes we have seen go
into drivers to handle this scenario - 8914a595110a ('bnx2x:
disable GSO where gso_size is too big for hardware') and
2b16f048729b ('net: create skb_gso_validate_mac_len()') -
TCP eventually recovers, but not before a few segments are
dropped.

Well, I'm not a TCP expert and though we have experienced
these corner cases in our environment, I could not reproduce 
this case reliably in my test setup to try this fix myself.
However, Michael Chan  had a setup
where these fixes helped him mitigate the issue and not cause
the crash.

The idea here is not to alter the data path with additional
locks or smp_mb()/smp_rmb() barriers to avoid racy assignments,
but to create a new device that has a really low MTU and whose
.ndo_start_xmit is essentially a kfree_skb(). Make use of this
device instead of 'lo' when marking the dst dead.

The first patch implements the blackhole device, the second
patch uses it in the IPv4 and IPv6 stacks, and the third patch
is the selftest that ensures the sanity of this device.

Mahesh Bandewar (3):
  loopback: create blackhole net device similar to loopack.
  blackhole_netdev: use blackhole_netdev to invalidate dst entries
  blackhole_dev: add a selftest

 drivers/net/loopback.c|  76 +++--
 include/linux/netdevice.h |   2 +
 lib/Kconfig.debug |   9 ++
 lib/Makefile  |   1 +
 lib/test_blackhole_dev.c  | 100 ++
 net/core/dst.c|   2 +-
 net/ipv4/route.c  |   3 +-
 net/ipv6/route.c  |   2 +-
 tools/testing/selftests/net/Makefile  |   3 +-
 tools/testing/selftests/net/config|   1 +
 .../selftests/net/test_blackhole_dev.sh   |  11 ++
 11 files changed, 196 insertions(+), 14 deletions(-)
 create mode 100644 lib/test_blackhole_dev.c
 create mode 100755 tools/testing/selftests/net/test_blackhole_dev.sh

-- 
2.22.0.410.gd8fdbe21b5-goog



[PATCH iproute2] ip6tunnel: fix 'ip -6 {show|change} dev ' cmds

2019-06-06 Thread Mahesh Bandewar
Inclusion of 'dev' is allowed by the syntax but not handled
correctly by the command. It produces no output for the show
command, and the change command falsely reports success but
does not make any changes.

This can be verified with the following steps:
  # ip -6 tunnel add ip6tnl1 mode ip6gre local fd::1 remote fd::2 tos inherit 
ttl 127 encaplimit none
  # ip -6 tunnel show ip6tnl1
  
  # ip -6 tunnel show dev ip6tnl1
  
  # ip -6 tunnel change dev ip6tnl1 local 2001:1234::1 remote 2001:1234::2 
encaplimit none ttl 127 tos inherit allow-localremote
  # echo $?
  0
  # ip -6 tunnel show ip6tnl1
  

Signed-off-by: Mahesh Bandewar 
---
 ip/ip6tunnel.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ip/ip6tunnel.c b/ip/ip6tunnel.c
index 999408ed801b..56fd3466ed06 100644
--- a/ip/ip6tunnel.c
+++ b/ip/ip6tunnel.c
@@ -298,6 +298,8 @@ static int parse_args(int argc, char **argv, int cmd, 
struct ip6_tnl_parm2 *p)
p->link = ll_name_to_index(medium);
if (!p->link)
return nodev(medium);
+   else
+   strlcpy(p->name, medium, sizeof(p->name));
}
return 0;
 }
-- 
2.22.0.rc1.311.g5d7573a151-goog



[PATCH net] bonding: fix warning message

2018-10-02 Thread Mahesh Bandewar
From: Mahesh Bandewar 

RX queue config for bonding master could be different from its slave
device(s). With the commit 6a9e461f6fe4 ("bonding: pass link-local
packets to bonding master also."), the packet is reinjected into stack
with skb->dev as bonding master. This potentially triggers the
message:

   "bondX received packet on queue Y, but number of RX queues is Z"

whenever the queue that packet is received on is higher than the
numrxqueues on bonding master (Y > Z).

Fixes: 6a9e461f6fe4 ("bonding: pass link-local packets to bonding master also.")
Reported-by: John Sperbeck 
Signed-off-by: Eric Dumazet 
Signed-off-by: Mahesh Bandewar 
---
 drivers/net/bonding/bond_main.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index c05c01a00755..ee28ec9e0aba 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1187,6 +1187,7 @@ static rx_handler_result_t bond_handle_frame(struct 
sk_buff **pskb)
 
if (nskb) {
nskb->dev = bond->dev;
+   nskb->queue_mapping = 0;
netif_rx(nskb);
}
return RX_HANDLER_PASS;
-- 
2.19.0.605.g01d371f741-goog



[PATCH net] bonding: avoid possible dead-lock

2018-09-24 Thread Mahesh Bandewar
From: Mahesh Bandewar 

Syzkaller reported this on a slightly older kernel but it's still
applicable to the current kernel -

==
WARNING: possible circular locking dependency detected
4.18.0-next-20180823+ #46 Not tainted
--
syz-executor4/26841 is trying to acquire lock:
dd41ef48 ((wq_completion)bond_dev->name){+.+.}, at: 
flush_workqueue+0x2db/0x1e10 kernel/workqueue.c:2652

but task is already holding lock:
768ab431 (rtnl_mutex){+.+.}, at: rtnl_lock net/core/rtnetlink.c:77 
[inline]
768ab431 (rtnl_mutex){+.+.}, at: rtnetlink_rcv_msg+0x412/0xc30 
net/core/rtnetlink.c:4708

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #2 (rtnl_mutex){+.+.}:
   __mutex_lock_common kernel/locking/mutex.c:925 [inline]
   __mutex_lock+0x171/0x1700 kernel/locking/mutex.c:1073
   mutex_lock_nested+0x16/0x20 kernel/locking/mutex.c:1088
   rtnl_lock+0x17/0x20 net/core/rtnetlink.c:77
   bond_netdev_notify drivers/net/bonding/bond_main.c:1310 [inline]
   bond_netdev_notify_work+0x44/0xd0 drivers/net/bonding/bond_main.c:1320
   process_one_work+0xc73/0x1aa0 kernel/workqueue.c:2153
   worker_thread+0x189/0x13c0 kernel/workqueue.c:2296
   kthread+0x35a/0x420 kernel/kthread.c:246
   ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:415

-> #1 ((work_completion)(&(&nnw->work)->work)){+.+.}:
   process_one_work+0xc0b/0x1aa0 kernel/workqueue.c:2129
   worker_thread+0x189/0x13c0 kernel/workqueue.c:2296
   kthread+0x35a/0x420 kernel/kthread.c:246
   ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:415

-> #0 ((wq_completion)bond_dev->name){+.+.}:
   lock_acquire+0x1e4/0x4f0 kernel/locking/lockdep.c:3901
   flush_workqueue+0x30a/0x1e10 kernel/workqueue.c:2655
   drain_workqueue+0x2a9/0x640 kernel/workqueue.c:2820
   destroy_workqueue+0xc6/0x9d0 kernel/workqueue.c:4155
   __alloc_workqueue_key+0xef9/0x1190 kernel/workqueue.c:4138
   bond_init+0x269/0x940 drivers/net/bonding/bond_main.c:4734
   register_netdevice+0x337/0x1100 net/core/dev.c:8410
   bond_newlink+0x49/0xa0 drivers/net/bonding/bond_netlink.c:453
   rtnl_newlink+0xef4/0x1d50 net/core/rtnetlink.c:3099
   rtnetlink_rcv_msg+0x46e/0xc30 net/core/rtnetlink.c:4711
   netlink_rcv_skb+0x172/0x440 net/netlink/af_netlink.c:2454
   rtnetlink_rcv+0x1c/0x20 net/core/rtnetlink.c:4729
   netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline]
   netlink_unicast+0x5a0/0x760 net/netlink/af_netlink.c:1343
   netlink_sendmsg+0xa18/0xfc0 net/netlink/af_netlink.c:1908
   sock_sendmsg_nosec net/socket.c:622 [inline]
   sock_sendmsg+0xd5/0x120 net/socket.c:632
   ___sys_sendmsg+0x7fd/0x930 net/socket.c:2115
   __sys_sendmsg+0x11d/0x290 net/socket.c:2153
   __do_sys_sendmsg net/socket.c:2162 [inline]
   __se_sys_sendmsg net/socket.c:2160 [inline]
   __x64_sys_sendmsg+0x78/0xb0 net/socket.c:2160
   do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290
   entry_SYSCALL_64_after_hwframe+0x49/0xbe

other info that might help us debug this:

Chain exists of:
  (wq_completion)bond_dev->name --> (work_completion)(&(&nnw->work)->work) --> 
rtnl_mutex

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(rtnl_mutex);
                               lock((work_completion)(&(&nnw->work)->work));
                               lock(rtnl_mutex);
  lock((wq_completion)bond_dev->name);

 *** DEADLOCK ***

1 lock held by syz-executor4/26841:

stack backtrace:
CPU: 1 PID: 26841 Comm: syz-executor4 Not tainted 4.18.0-next-20180823+ #46
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 
01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x1c9/0x2b4 lib/dump_stack.c:113
 print_circular_bug.isra.34.cold.55+0x1bd/0x27d kernel/locking/lockdep.c:1222
 check_prev_add kernel/locking/lockdep.c:1862 [inline]
 check_prevs_add kernel/locking/lockdep.c:1975 [inline]
 validate_chain kernel/locking/lockdep.c:2416 [inline]
 __lock_acquire+0x3449/0x5020 kernel/locking/lockdep.c:3412
 lock_acquire+0x1e4/0x4f0 kernel/locking/lockdep.c:3901
 flush_workqueue+0x30a/0x1e10 kernel/workqueue.c:2655
 drain_workqueue+0x2a9/0x640 kernel/workqueue.c:2820
 destroy_workqueue+0xc6/0x9d0 kernel/workqueue.c:4155
 __alloc_workqueue_key+0xef9/0x1190 kernel/workqueue.c:4138
 bond_init+0x269/0x940 drivers/net/bonding/bond_main.c:4734
 register_netdevice+0x337/0x1100 net/core/dev.c:8410
 bond_newlink+0x49/0xa0 drivers/net/bonding/bond_netlink.c:453
 rtnl_newlink+0xef4/0x1d50 net/core/rtnetlink.c:3099
 rtnetlink_rcv_msg+0x46e/0xc30 net/core/rtnetlink.c:4711
 netlink_rcv_skb+0x172/0x440 net/netlink/af_netlink.c:2454
 rtnetlink_r

[PATCH net] bonding: pass link-local packets to bonding master also.

2018-09-24 Thread Mahesh Bandewar
From: Mahesh Bandewar 

Commit b89f04c61efe ("bonding: deliver link-local packets with
skb->dev set to link that packets arrived on") changed the behavior
of how link-local-multicast packets are processed. The change in
the behavior broke some legacy use cases where these packets are
expected to arrive on bonding master device also.

This patch passes the packet to the stack with the link it arrived
on as well as passes to the bonding-master device to preserve the
legacy use case.

Fixes: b89f04c61efe ("bonding: deliver link-local packets with skb->dev set to 
link that packets arrived on")
Reported-by: Michal Soltys 
Signed-off-by: Mahesh Bandewar 
---
 drivers/net/bonding/bond_main.c | 21 +++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 9866fa959a19..e3157aed656a 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1178,9 +1178,26 @@ static rx_handler_result_t bond_handle_frame(struct 
sk_buff **pskb)
}
}
 
-   /* don't change skb->dev for link-local packets */
-   if (is_link_local_ether_addr(eth_hdr(skb)->h_dest))
+   /* Link-local multicast packets should be passed to the
+* stack on the link they arrive as well as pass them to the
+* bond-master device. These packets are mostly usable when
+* stack receives it with the link on which they arrive
+* (e.g. LLDP) they also must be available on master. Some of
+* the use cases include (but are not limited to): LLDP agents
+* that must be able to operate both on enslaved interfaces as
+* well as on bonds themselves; linux bridges that must be able
+* to process/pass BPDUs from attached bonds when any kind of
+* STP version is enabled on the network.
+*/
+   if (is_link_local_ether_addr(eth_hdr(skb)->h_dest)) {
+   struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
+
+   if (nskb) {
+   nskb->dev = bond->dev;
+   netif_rx(nskb);
+   }
return RX_HANDLER_PASS;
+   }
if (bond_should_deliver_exact_match(skb, slave, bond))
return RX_HANDLER_EXACT;
 
-- 
2.19.0.444.g18242da7ef-goog



[PATCH iproute2] iproute2: fix use-after-free

2018-09-12 Thread Mahesh Bandewar
From: Mahesh Bandewar 

A local program using iproute2 lib pointed out the issue and looking
at the code it is pretty obvious -

a = (struct nlmsghdr *)b;
...
free(b);
if (a->nlmsg_seq == seq)
...

Fixes: 86bf43c7c2fd ("lib/libnetlink: update rtnl_talk to support malloc buff 
at run time")
Signed-off-by: Mahesh Bandewar 
---
 lib/libnetlink.c | 15 +++
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/lib/libnetlink.c b/lib/libnetlink.c
index 928de1dd16d8..016a5f0bcfb6 100644
--- a/lib/libnetlink.c
+++ b/lib/libnetlink.c
@@ -661,17 +661,24 @@ next:
if (l < sizeof(struct nlmsgerr)) {
fprintf(stderr, "ERROR truncated\n");
} else if (!err->error) {
+   unsigned int tmp_seq;
+
/* check messages from kernel */
nl_dump_ext_ack(h, errfn);
 
-   if (answer)
+   tmp_seq = h->nlmsg_seq;
+   if (answer) {
*answer = (struct nlmsghdr 
*)buf;
-   else
+   } else {
free(buf);
-   if (h->nlmsg_seq == seq)
+   buf = NULL;
+   }
+   if (tmp_seq == seq) {
return 0;
-   else if (i < iovlen)
+   } else if (i < iovlen) {
+   free(buf);
goto next;
+   }
return 0;
}
 
-- 
2.19.0.rc2.392.g5ba43deb5a-goog



[PATCHv3 iproute2 0/2] clang + misc changes

2018-08-22 Thread Mahesh Bandewar
From: Mahesh Bandewar 

The primary theme is to make clang compile the iproute2 package without
warnings. Along with this there are two other misc patches in the series.

First patch uses the preferred_family when operating with maddr feature.
Prior to this patch, it would always open an AF_INET socket irrespective
of the family that is preferred via command-line. 

Second patch mostly adds format attributes to keep the clang compiler
happy so that it no longer emits the warning messages.

Mahesh Bandewar (2):
  ipmaddr: use preferred_family when given
  iproute: make clang happy with iproute2 package

 include/json_writer.h |  3 +--
 ip/iplink_can.c   | 19 ---
 ip/ipmaddr.c  | 13 -
 lib/color.c   |  1 +
 lib/json_print.c  |  1 +
 lib/json_writer.c | 15 +--
 misc/ss.c |  3 ++-
 tc/m_ematch.c |  1 +
 tc/m_ematch.h |  1 +
 9 files changed, 32 insertions(+), 25 deletions(-)

-- 
2.18.0.1017.ga543ac7ca45-goog



[PATCHv3 iproute2 1/2] ipmaddr: use preferred_family when given

2018-08-22 Thread Mahesh Bandewar
From: Mahesh Bandewar 

When creating socket() AF_INET is used irrespective of the family
that is given at the command-line (with -4, -6, or -0). This change
will open the socket with the preferred family.

Signed-off-by: Mahesh Bandewar 
---
 ip/ipmaddr.c | 13 -
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/ip/ipmaddr.c b/ip/ipmaddr.c
index a48499029e17..abf83784d0df 100644
--- a/ip/ipmaddr.c
+++ b/ip/ipmaddr.c
@@ -289,6 +289,7 @@ static int multiaddr_list(int argc, char **argv)
 static int multiaddr_modify(int cmd, int argc, char **argv)
 {
struct ifreq ifr = {};
+   int family;
int fd;
 
if (cmd == RTM_NEWADDR)
@@ -324,7 +325,17 @@ static int multiaddr_modify(int cmd, int argc, char **argv)
exit(-1);
}
 
-   fd = socket(AF_INET, SOCK_DGRAM, 0);
+   switch (preferred_family) {
+   case AF_INET6:
+   case AF_PACKET:
+   case AF_INET:
+   family = preferred_family;
+   break;
+   default:
+   family = AF_INET;
+   }
+
+   fd = socket(family, SOCK_DGRAM, 0);
if (fd < 0) {
perror("Cannot create socket");
exit(1);
-- 
2.18.0.1017.ga543ac7ca45-goog



[PATCHv3 iproute2 2/2] iproute: make clang happy

2018-08-22 Thread Mahesh Bandewar
From: Mahesh Bandewar 

These are primarily fixes for "string is not string literal" warnings
/ errors (with -Werror -Wformat-nonliteral). This should be a no-op
change. I had to replace a couple of print helper functions with the
code they call, as it was becoming harder to eliminate these warnings;
however, these helpers were used in only a couple of places, so there
is no major change as such.

Signed-off-by: Mahesh Bandewar 
---
 include/json_writer.h |  3 +--
 ip/iplink_can.c   | 19 ---
 lib/color.c   |  1 +
 lib/json_print.c  |  1 +
 lib/json_writer.c | 15 +--
 misc/ss.c |  3 ++-
 tc/m_ematch.c |  1 +
 tc/m_ematch.h |  1 +
 8 files changed, 20 insertions(+), 24 deletions(-)

diff --git a/include/json_writer.h b/include/json_writer.h
index 9ab88e1dbdd9..0c8831c1136d 100644
--- a/include/json_writer.h
+++ b/include/json_writer.h
@@ -29,6 +29,7 @@ void jsonw_pretty(json_writer_t *self, bool on);
 void jsonw_name(json_writer_t *self, const char *name);
 
 /* Add value  */
+__attribute__((format(printf, 2, 3)))
 void jsonw_printf(json_writer_t *self, const char *fmt, ...);
 void jsonw_string(json_writer_t *self, const char *value);
 void jsonw_bool(json_writer_t *self, bool value);
@@ -59,8 +60,6 @@ void jsonw_luint_field(json_writer_t *self, const char *prop,
unsigned long int num);
 void jsonw_lluint_field(json_writer_t *self, const char *prop,
unsigned long long int num);
-void jsonw_float_field_fmt(json_writer_t *self, const char *prop,
-  const char *fmt, double val);
 
 /* Collections */
 void jsonw_start_object(json_writer_t *self);
diff --git a/ip/iplink_can.c b/ip/iplink_can.c
index 587413da15c4..c0deeb1f1fcf 100644
--- a/ip/iplink_can.c
+++ b/ip/iplink_can.c
@@ -316,11 +316,14 @@ static void can_print_opt(struct link_util *lu, FILE *f, 
struct rtattr *tb[])
struct can_bittiming *bt = RTA_DATA(tb[IFLA_CAN_BITTIMING]);
 
if (is_json_context()) {
+   json_writer_t *jw;
+
open_json_object("bittiming");
print_int(PRINT_ANY, "bitrate", NULL, bt->bitrate);
-   jsonw_float_field_fmt(get_json_writer(),
- "sample_point", "%.3f",
- (float) bt->sample_point / 1000.);
+   jw = get_json_writer();
+   jsonw_name(jw, "sample_point");
+   jsonw_printf(jw, "%.3f",
+(float) bt->sample_point / 1000);
print_int(PRINT_ANY, "tq", NULL, bt->tq);
print_int(PRINT_ANY, "prop_seg", NULL, bt->prop_seg);
print_int(PRINT_ANY, "phase_seg1",
@@ -415,12 +418,14 @@ static void can_print_opt(struct link_util *lu, FILE *f, 
struct rtattr *tb[])
RTA_DATA(tb[IFLA_CAN_DATA_BITTIMING]);
 
if (is_json_context()) {
+   json_writer_t *jw;
+
open_json_object("data_bittiming");
print_int(PRINT_JSON, "bitrate", NULL, dbt->bitrate);
-   jsonw_float_field_fmt(get_json_writer(),
- "sample_point",
- "%.3f",
- (float) dbt->sample_point / 
1000.);
+   jw = get_json_writer();
+   jsonw_name(jw, "sample_point");
+   jsonw_printf(jw, "%.3f",
+(float) dbt->sample_point / 1000.);
print_int(PRINT_JSON, "tq", NULL, dbt->tq);
print_int(PRINT_JSON, "prop_seg", NULL, dbt->prop_seg);
print_int(PRINT_JSON, "phase_seg1",
diff --git a/lib/color.c b/lib/color.c
index eaf69e74d673..e5406294dfc4 100644
--- a/lib/color.c
+++ b/lib/color.c
@@ -132,6 +132,7 @@ void set_color_palette(void)
is_dark_bg = 1;
 }
 
+__attribute__((format(printf, 3, 4)))
 int color_fprintf(FILE *fp, enum color_attr attr, const char *fmt, ...)
 {
int ret = 0;
diff --git a/lib/json_print.c b/lib/json_print.c
index 5dc41bfabfd4..77902824a738 100644
--- a/lib/json_print.c
+++ b/lib/json_print.c
@@ -100,6 +100,7 @@ void close_json_array(enum output_type type, const char 
*str)
  * functions handling different types
  */
 #define _PRINT_FUNC(type_name, type)   \
+   __attribute__((format(printf, 4, 0)))   \
void print_color_##type_name(enum output_type t,   

[PATCHv2 iproute2 3/3] iproute: make clang happy with iproute2 package

2018-08-21 Thread Mahesh Bandewar
From: Mahesh Bandewar 

These are primarily fixes for "string is not string literal" warnings
/ errors (with -Werror -Wformat-nonliteral). This should be a no-op
change. I had to replace a couple of print helper functions with the
code they call, as it was becoming harder to eliminate these warnings;
however, these helpers were used in only a couple of places, so there
is no major change as such.

Signed-off-by: Mahesh Bandewar 
---
 include/json_writer.h |  3 +--
 ip/iplink_can.c   | 19 ---
 lib/color.c   |  1 +
 lib/json_print.c  |  1 +
 lib/json_writer.c | 15 +--
 misc/ss.c |  3 ++-
 tc/m_ematch.c |  1 +
 tc/m_ematch.h |  1 +
 8 files changed, 20 insertions(+), 24 deletions(-)

diff --git a/include/json_writer.h b/include/json_writer.h
index 9ab88e1dbdd9..0c8831c1136d 100644
--- a/include/json_writer.h
+++ b/include/json_writer.h
@@ -29,6 +29,7 @@ void jsonw_pretty(json_writer_t *self, bool on);
 void jsonw_name(json_writer_t *self, const char *name);
 
 /* Add value  */
+__attribute__((format(printf, 2, 3)))
 void jsonw_printf(json_writer_t *self, const char *fmt, ...);
 void jsonw_string(json_writer_t *self, const char *value);
 void jsonw_bool(json_writer_t *self, bool value);
@@ -59,8 +60,6 @@ void jsonw_luint_field(json_writer_t *self, const char *prop,
unsigned long int num);
 void jsonw_lluint_field(json_writer_t *self, const char *prop,
unsigned long long int num);
-void jsonw_float_field_fmt(json_writer_t *self, const char *prop,
-  const char *fmt, double val);
 
 /* Collections */
 void jsonw_start_object(json_writer_t *self);
diff --git a/ip/iplink_can.c b/ip/iplink_can.c
index 587413da15c4..c0deeb1f1fcf 100644
--- a/ip/iplink_can.c
+++ b/ip/iplink_can.c
@@ -316,11 +316,14 @@ static void can_print_opt(struct link_util *lu, FILE *f, 
struct rtattr *tb[])
struct can_bittiming *bt = RTA_DATA(tb[IFLA_CAN_BITTIMING]);
 
if (is_json_context()) {
+   json_writer_t *jw;
+
open_json_object("bittiming");
print_int(PRINT_ANY, "bitrate", NULL, bt->bitrate);
-   jsonw_float_field_fmt(get_json_writer(),
- "sample_point", "%.3f",
- (float) bt->sample_point / 1000.);
+   jw = get_json_writer();
+   jsonw_name(jw, "sample_point");
+   jsonw_printf(jw, "%.3f",
+(float) bt->sample_point / 1000);
print_int(PRINT_ANY, "tq", NULL, bt->tq);
print_int(PRINT_ANY, "prop_seg", NULL, bt->prop_seg);
print_int(PRINT_ANY, "phase_seg1",
@@ -415,12 +418,14 @@ static void can_print_opt(struct link_util *lu, FILE *f, 
struct rtattr *tb[])
RTA_DATA(tb[IFLA_CAN_DATA_BITTIMING]);
 
if (is_json_context()) {
+   json_writer_t *jw;
+
open_json_object("data_bittiming");
print_int(PRINT_JSON, "bitrate", NULL, dbt->bitrate);
-   jsonw_float_field_fmt(get_json_writer(),
- "sample_point",
- "%.3f",
- (float) dbt->sample_point / 
1000.);
+   jw = get_json_writer();
+   jsonw_name(jw, "sample_point");
+   jsonw_printf(jw, "%.3f",
+(float) dbt->sample_point / 1000.);
print_int(PRINT_JSON, "tq", NULL, dbt->tq);
print_int(PRINT_JSON, "prop_seg", NULL, dbt->prop_seg);
print_int(PRINT_JSON, "phase_seg1",
diff --git a/lib/color.c b/lib/color.c
index eaf69e74d673..e5406294dfc4 100644
--- a/lib/color.c
+++ b/lib/color.c
@@ -132,6 +132,7 @@ void set_color_palette(void)
is_dark_bg = 1;
 }
 
+__attribute__((format(printf, 3, 4)))
 int color_fprintf(FILE *fp, enum color_attr attr, const char *fmt, ...)
 {
int ret = 0;
diff --git a/lib/json_print.c b/lib/json_print.c
index 5dc41bfabfd4..77902824a738 100644
--- a/lib/json_print.c
+++ b/lib/json_print.c
@@ -100,6 +100,7 @@ void close_json_array(enum output_type type, const char 
*str)
  * functions handling different types
  */
 #define _PRINT_FUNC(type_name, type)   \
+   __attribute__((format(printf, 4, 0)))   \
void print_color_##type_name(enum output_type t,   

[PATCHv2 iproute2 0/3] clang + misc changes

2018-08-21 Thread Mahesh Bandewar
From: Mahesh Bandewar 

The primary theme is to make clang compile the iproute2 package without
warnings. Along with this there are two other misc patches in the series.

First patch uses the preferred_family when operating with maddr feature.
Prior to this patch, it would always open an AF_INET socket irrespective
of the family that is preferred via command-line. 

Second patch just removes extern from the prototype declarations from
the m_ematch.h header file.

Third patch mostly adds format attributes to keep the clang compiler
happy so that it no longer emits the warning messages.

Mahesh Bandewar (3):
  ipmaddr: use preferred_family when given
  tc: remove extern from prototype declarations
  iproute: make clang happy with iproute2 package

 include/json_writer.h |  3 +--
 ip/iplink_can.c   | 19 ---
 ip/ipmaddr.c  | 13 -
 lib/color.c   |  1 +
 lib/json_print.c  |  1 +
 lib/json_writer.c | 15 +--
 misc/ss.c |  3 ++-
 tc/m_ematch.c |  1 +
 tc/m_ematch.h | 15 ---
 9 files changed, 39 insertions(+), 32 deletions(-)

-- 
2.18.0.865.gffc8e1a3cd6-goog



[PATCHv2 iproute2 2/3] tc: remove extern from prototype declarations

2018-08-21 Thread Mahesh Bandewar
From: Mahesh Bandewar 

Signed-off-by: Mahesh Bandewar 
---
 tc/m_ematch.h | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/tc/m_ematch.h b/tc/m_ematch.h
index f634f19164fa..80b02cfad6cc 100644
--- a/tc/m_ematch.h
+++ b/tc/m_ematch.h
@@ -20,7 +20,7 @@ struct bstr
struct bstr *next;
 };
 
-extern struct bstr * bstr_alloc(const char *text);
+struct bstr * bstr_alloc(const char *text);
 
 static inline struct bstr * bstr_new(char *data, unsigned int len)
 {
@@ -51,8 +51,8 @@ static inline struct bstr *bstr_next(struct bstr *b)
return b->next;
 }
 
-extern unsigned long bstrtoul(const struct bstr *b);
-extern void bstr_print(FILE *fd, const struct bstr *b, int ascii);
+unsigned long bstrtoul(const struct bstr *b);
+void bstr_print(FILE *fd, const struct bstr *b, int ascii);
 
 
 struct ematch
@@ -79,7 +79,7 @@ static inline struct ematch * new_ematch(struct bstr *args, 
int inverted)
return e;
 }
 
-extern void print_ematch_tree(const struct ematch *tree);
+void print_ematch_tree(const struct ematch *tree);
 
 
 struct ematch_util
@@ -107,9 +107,9 @@ static inline int parse_layer(struct bstr *b)
return INT_MAX;
 }
 
-extern int em_parse_error(int err, struct bstr *args, struct bstr *carg,
+int em_parse_error(int err, struct bstr *args, struct bstr *carg,
   struct ematch_util *, char *fmt, ...);
-extern int print_ematch(FILE *, const struct rtattr *);
-extern int parse_ematch(int *, char ***, int, struct nlmsghdr *);
+int print_ematch(FILE *, const struct rtattr *);
+int parse_ematch(int *, char ***, int, struct nlmsghdr *);
 
 #endif
-- 
2.18.0.865.gffc8e1a3cd6-goog



[PATCHv2 iproute2 1/3] ipmaddr: use preferred_family when given

2018-08-21 Thread Mahesh Bandewar
From: Mahesh Bandewar 

When creating socket() AF_INET is used irrespective of the family
that is given at the command-line (with -4, -6, or -0). This change
will open the socket with the preferred family.

Signed-off-by: Mahesh Bandewar 
---
 ip/ipmaddr.c | 13 -
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/ip/ipmaddr.c b/ip/ipmaddr.c
index a48499029e17..abf83784d0df 100644
--- a/ip/ipmaddr.c
+++ b/ip/ipmaddr.c
@@ -289,6 +289,7 @@ static int multiaddr_list(int argc, char **argv)
 static int multiaddr_modify(int cmd, int argc, char **argv)
 {
struct ifreq ifr = {};
+   int family;
int fd;
 
if (cmd == RTM_NEWADDR)
@@ -324,7 +325,17 @@ static int multiaddr_modify(int cmd, int argc, char **argv)
exit(-1);
}
 
-   fd = socket(AF_INET, SOCK_DGRAM, 0);
+   switch (preferred_family) {
+   case AF_INET6:
+   case AF_PACKET:
+   case AF_INET:
+   family = preferred_family;
+   break;
+   default:
+   family = AF_INET;
+   }
+
+   fd = socket(family, SOCK_DGRAM, 0);
if (fd < 0) {
perror("Cannot create socket");
exit(1);
-- 
2.18.0.865.gffc8e1a3cd6-goog



[PATCH iproute2] iproute: make clang happy

2018-08-20 Thread Mahesh Bandewar
From: Mahesh Bandewar 

These are primarily fixes for "string is not string literal" warnings
/ errors (with -Werror -Wformat-nonliteral). This should be a no-op
change. I had to replace a couple of print helper functions with the
code they call, as it was becoming harder to eliminate these warnings;
however, these helpers were used in only a couple of places, so there
is no major change as such.

Signed-off-by: Mahesh Bandewar 
---
 include/json_writer.h |  2 --
 ip/iplink_can.c   | 19 ---
 lib/color.c   |  1 +
 lib/json_print.c  |  1 +
 lib/json_writer.c | 15 +--
 misc/ss.c |  3 ++-
 tc/m_ematch.c |  1 +
 7 files changed, 18 insertions(+), 24 deletions(-)

diff --git a/include/json_writer.h b/include/json_writer.h
index 9ab88e1dbdd9..a111cc62e7b2 100644
--- a/include/json_writer.h
+++ b/include/json_writer.h
@@ -59,8 +59,6 @@ void jsonw_luint_field(json_writer_t *self, const char *prop,
unsigned long int num);
 void jsonw_lluint_field(json_writer_t *self, const char *prop,
unsigned long long int num);
-void jsonw_float_field_fmt(json_writer_t *self, const char *prop,
-  const char *fmt, double val);
 
 /* Collections */
 void jsonw_start_object(json_writer_t *self);
diff --git a/ip/iplink_can.c b/ip/iplink_can.c
index 587413da15c4..c0deeb1f1fcf 100644
--- a/ip/iplink_can.c
+++ b/ip/iplink_can.c
@@ -316,11 +316,14 @@ static void can_print_opt(struct link_util *lu, FILE *f, 
struct rtattr *tb[])
struct can_bittiming *bt = RTA_DATA(tb[IFLA_CAN_BITTIMING]);
 
if (is_json_context()) {
+   json_writer_t *jw;
+
open_json_object("bittiming");
print_int(PRINT_ANY, "bitrate", NULL, bt->bitrate);
-   jsonw_float_field_fmt(get_json_writer(),
- "sample_point", "%.3f",
- (float) bt->sample_point / 1000.);
+   jw = get_json_writer();
+   jsonw_name(jw, "sample_point");
+   jsonw_printf(jw, "%.3f",
+(float) bt->sample_point / 1000);
print_int(PRINT_ANY, "tq", NULL, bt->tq);
print_int(PRINT_ANY, "prop_seg", NULL, bt->prop_seg);
print_int(PRINT_ANY, "phase_seg1",
@@ -415,12 +418,14 @@ static void can_print_opt(struct link_util *lu, FILE *f, 
struct rtattr *tb[])
RTA_DATA(tb[IFLA_CAN_DATA_BITTIMING]);
 
if (is_json_context()) {
+   json_writer_t *jw;
+
open_json_object("data_bittiming");
print_int(PRINT_JSON, "bitrate", NULL, dbt->bitrate);
-   jsonw_float_field_fmt(get_json_writer(),
- "sample_point",
- "%.3f",
- (float) dbt->sample_point / 
1000.);
+   jw = get_json_writer();
+   jsonw_name(jw, "sample_point");
+   jsonw_printf(jw, "%.3f",
+(float) dbt->sample_point / 1000.);
print_int(PRINT_JSON, "tq", NULL, dbt->tq);
print_int(PRINT_JSON, "prop_seg", NULL, dbt->prop_seg);
print_int(PRINT_JSON, "phase_seg1",
diff --git a/lib/color.c b/lib/color.c
index eaf69e74d673..e5406294dfc4 100644
--- a/lib/color.c
+++ b/lib/color.c
@@ -132,6 +132,7 @@ void set_color_palette(void)
is_dark_bg = 1;
 }
 
+__attribute__((format(printf, 3, 4)))
 int color_fprintf(FILE *fp, enum color_attr attr, const char *fmt, ...)
 {
int ret = 0;
diff --git a/lib/json_print.c b/lib/json_print.c
index 5dc41bfabfd4..77902824a738 100644
--- a/lib/json_print.c
+++ b/lib/json_print.c
@@ -100,6 +100,7 @@ void close_json_array(enum output_type type, const char 
*str)
  * functions handling different types
  */
 #define _PRINT_FUNC(type_name, type)   \
+   __attribute__((format(printf, 4, 0)))   \
void print_color_##type_name(enum output_type t,\
 enum color_attr color, \
 const char *key,   \
diff --git a/lib/json_writer.c b/lib/json_writer.c
index aa9ce1c65e51..68890b34ee92 100644
--- a/lib/json_writer.c
+++ b/lib/json_writer.c
@@ -152,6 +152,7 @@ void jsonw_name(json_writer_t *self, const char *name)
   

[PATCH iproute2] ipmaddr: use preferred_family when given

2018-08-15 Thread Mahesh Bandewar
From: Mahesh Bandewar 

When creating socket() AF_INET is used irrespective of the family
that is given at the command-line (with -4, -6, or -0). This change
will open the socket with the preferred family.

Signed-off-by: Mahesh Bandewar 
---
 ip/ipmaddr.c | 13 -
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/ip/ipmaddr.c b/ip/ipmaddr.c
index a48499029e17..abf83784d0df 100644
--- a/ip/ipmaddr.c
+++ b/ip/ipmaddr.c
@@ -289,6 +289,7 @@ static int multiaddr_list(int argc, char **argv)
 static int multiaddr_modify(int cmd, int argc, char **argv)
 {
struct ifreq ifr = {};
+   int family;
int fd;
 
if (cmd == RTM_NEWADDR)
@@ -324,7 +325,17 @@ static int multiaddr_modify(int cmd, int argc, char **argv)
exit(-1);
}
 
-   fd = socket(AF_INET, SOCK_DGRAM, 0);
+   switch (preferred_family) {
+   case AF_INET6:
+   case AF_PACKET:
+   case AF_INET:
+   family = preferred_family;
+   break;
+   default:
+   family = AF_INET;
+   }
+
+   fd = socket(family, SOCK_DGRAM, 0);
if (fd < 0) {
perror("Cannot create socket");
exit(1);
-- 
2.18.0.865.gffc8e1a3cd6-goog



[PATCH net v2] bonding: pass link-local packets to bonding master also.

2018-07-18 Thread Mahesh Bandewar
From: Mahesh Bandewar 

Commit b89f04c61efe ("bonding: deliver link-local packets with
skb->dev set to link that packets arrived on") changed the behavior
of how link-local-multicast packets are processed. The change in
the behavior broke some legacy use cases where these packets are
expected to arrive on bonding master device also.

This patch passes the packet to the stack with the link it arrived
on as well as passes to the bonding-master device to preserve the
legacy use case.

Fixes: b89f04c61efe ("bonding: deliver link-local packets with skb->dev set to 
link that packets arrived on")
Reported-by: Michal Soltys 
Signed-off-by: Mahesh Bandewar 
---
v2: Added Fixes tag.
v1: Initial patch.
 drivers/net/bonding/bond_main.c | 17 +++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 9a2ea3c1f949..1d3b7d8448f2 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1177,9 +1177,22 @@ static rx_handler_result_t bond_handle_frame(struct 
sk_buff **pskb)
}
}
 
-   /* don't change skb->dev for link-local packets */
-   if (is_link_local_ether_addr(eth_hdr(skb)->h_dest))
+   /* Link-local multicast packets should be passed to the
+* stack on the link they arrive as well as pass them to the
+* bond-master device. These packets are mostly usable when
+* stack receives it with the link on which they arrive
+* (e.g. LLDP) but there may be some legacy behavior that
+* expects these packets to appear on bonding master too.
+*/
+   if (is_link_local_ether_addr(eth_hdr(skb)->h_dest)) {
+   struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
+
+   if (nskb) {
+   nskb->dev = bond->dev;
+   netif_rx(nskb);
+   }
return RX_HANDLER_PASS;
+   }
if (bond_should_deliver_exact_match(skb, slave, bond))
return RX_HANDLER_EXACT;
 
-- 
2.18.0.203.gfac676dfb9-goog



[PATCH next v2] bonding: pass link-local packets to bonding master also.

2018-07-18 Thread Mahesh Bandewar
From: Mahesh Bandewar 

Commit b89f04c61efe ("bonding: deliver link-local packets with
skb->dev set to link that packets arrived on") changed the behavior
of how link-local-multicast packets are processed. The change in
the behavior broke some legacy use cases where these packets are
expected to arrive on bonding master device also.

This patch passes the packet to the stack with the link it arrived
on as well as passes to the bonding-master device to preserve the
legacy use case.

Fixes: b89f04c61efe ("bonding: deliver link-local packets with skb->dev set to 
link that packets arrived on")
Reported-by: Michal Soltys 
Signed-off-by: Mahesh Bandewar 
---
v2: Added Fixes tag.
v1: Initial patch.
 drivers/net/bonding/bond_main.c | 17 +++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 9a2ea3c1f949..1d3b7d8448f2 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1177,9 +1177,22 @@ static rx_handler_result_t bond_handle_frame(struct 
sk_buff **pskb)
}
}
 
-   /* don't change skb->dev for link-local packets */
-   if (is_link_local_ether_addr(eth_hdr(skb)->h_dest))
+   /* Link-local multicast packets should be passed to the
+* stack on the link they arrive as well as pass them to the
+* bond-master device. These packets are mostly usable when
+* stack receives it with the link on which they arrive
+* (e.g. LLDP) but there may be some legacy behavior that
+* expects these packets to appear on bonding master too.
+*/
+   if (is_link_local_ether_addr(eth_hdr(skb)->h_dest)) {
+   struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
+
+   if (nskb) {
+   nskb->dev = bond->dev;
+   netif_rx(nskb);
+   }
return RX_HANDLER_PASS;
+   }
if (bond_should_deliver_exact_match(skb, slave, bond))
return RX_HANDLER_EXACT;
 
-- 
2.18.0.203.gfac676dfb9-goog



[PATCH next] bonding: pass link-local packets to bonding master also.

2018-07-15 Thread Mahesh Bandewar
From: Mahesh Bandewar 

Commit b89f04c61efe ("bonding: deliver link-local packets with
skb->dev set to link that packets arrived on") changed the behavior
of how link-local-multicast packets are processed. The change in
the behavior broke some legacy use cases where these packets are
expected to arrive on bonding master device also.

This patch passes the packet to the stack with the link it arrived
on as well as passes to the bonding-master device to preserve the
legacy use case.

Reported-by: Michal Soltys 
Signed-off-by: Mahesh Bandewar 
---
 drivers/net/bonding/bond_main.c | 17 +++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 9a2ea3c1f949..1d3b7d8448f2 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1177,9 +1177,22 @@ static rx_handler_result_t bond_handle_frame(struct 
sk_buff **pskb)
}
}
 
-   /* don't change skb->dev for link-local packets */
-   if (is_link_local_ether_addr(eth_hdr(skb)->h_dest))
+   /* Link-local multicast packets should be passed to the
+* stack on the link they arrive as well as pass them to the
+* bond-master device. These packets are mostly usable when
+* stack receives it with the link on which they arrive
+* (e.g. LLDP) but there may be some legacy behavior that
+* expects these packets to appear on bonding master too.
+*/
+   if (is_link_local_ether_addr(eth_hdr(skb)->h_dest)) {
+   struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
+
+   if (nskb) {
+   nskb->dev = bond->dev;
+   netif_rx(nskb);
+   }
return RX_HANDLER_PASS;
+   }
if (bond_should_deliver_exact_match(skb, slave, bond))
return RX_HANDLER_EXACT;
 
-- 
2.18.0.203.gfac676dfb9-goog



[PATCHv4 1/2] capability: introduce sysctl for controlled user-ns capability whitelist

2018-01-02 Thread Mahesh Bandewar
From: Mahesh Bandewar 

Add a sysctl variable kernel.controlled_userns_caps_whitelist. Capability
mask is stored in kernel as kernel_cap_t type (array of u32). This sysctl
takes input as comma separated hex u32 words. For simplicity one could
see this sysctl to operate on string inputs. However the value is not
expected to change that often during the life of a kernel-boot. It makes
more sense to use the widely available API instead of bringing another
string manipulation for the purpose of making this simpler.

The default value set (for kernel.controlled_userns_caps_whitelist) is
CAP_FULL_SET indicating that no capability is controlled by default to
maintain compatibility with the existing behavior of user-ns. Administrator
will have to modify this sysctl to control any capability as such. e.g. to
control CAP_NET_RAW the mask need to be changed like -

  # sysctl -q kernel.controlled_userns_caps_whitelist
  kernel.controlled_userns_caps_whitelist = 1f,ffffffff
  # sysctl -w kernel.controlled_userns_caps_whitelist=1f,ffffdfff
  kernel.controlled_userns_caps_whitelist = 1f,ffffdfff

For bit-to-mask conversion please check include/uapi/linux/capability.h
file.

Any capabilities that are not part of this mask will be controlled and
will not be allowed to processes in controlled user-ns. In above example
CAP_NET_RAW will not be available to controlled-user-namespaces.

Acked-by: Serge Hallyn 
Signed-off-by: Mahesh Bandewar 
---
v4:
  commit message changes.
v3:
  Added couple of comments as requested by Serge Hallyn
v2:
  Rebase
v1:
  Initial submission

 Documentation/sysctl/kernel.txt | 21 ++
 include/linux/capability.h  |  3 +++
 kernel/capability.c | 47 +
 kernel/sysctl.c |  5 +
 4 files changed, 76 insertions(+)

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 694968c7523c..6aa1e087afee 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -25,6 +25,7 @@ show up in /proc/sys/kernel:
 - bootloader_version[ X86 only ]
 - callhome  [ S390 only ]
 - cap_last_cap
+- controlled_userns_caps_whitelist
 - core_pattern
 - core_pipe_limit
 - core_uses_pid
@@ -187,6 +188,26 @@ CAP_LAST_CAP from the kernel.
 
 ==
 
+controlled_userns_caps_whitelist
+
+Capability mask that is whitelisted for "controlled" user namespaces.
+Any capability that is missing from this mask will not be allowed to
+any process that is attached to a controlled-userns. e.g. if CAP_NET_RAW
+is not part of this mask, then processes running inside any controlled
+userns's will not be allowed to perform action that needs CAP_NET_RAW
+capability. However, processes that are attached to a parent user-ns
+hierarchy that is *not* controlled and has CAP_NET_RAW can continue
+performing those actions. User-namespaces are marked "controlled" at
+the time of their creation based on the capabilities of the creator.
+A process that does not have CAP_SYS_ADMIN will create user-namespaces
+that are controlled.
+
+The value is expressed as two comma separated hex words (u32). This
+sysctl is available in init-ns and users with CAP_SYS_ADMIN in init-ns
+are allowed to make changes.
+
+==
+
 core_pattern:
 
 core_pattern is used to specify a core dumpfile pattern name.
diff --git a/include/linux/capability.h b/include/linux/capability.h
index f640dcbc880c..7d79a4689625 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -14,6 +14,7 @@
 #define _LINUX_CAPABILITY_H
 
 #include 
+#include 
 
 
 #define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3
@@ -248,6 +249,8 @@ extern bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns);
 
 /* audit system wants to get cap info from files as well */
 extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct 
cpu_vfs_cap_data *cpu_caps);
+int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
+void __user *buff, size_t *lenp, loff_t *ppos);
 
 extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t 
size);
 
diff --git a/kernel/capability.c b/kernel/capability.c
index 1e1c0236f55b..4a859b7d4902 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -29,6 +29,8 @@ EXPORT_SYMBOL(__cap_empty_set);
 
 int file_caps_enabled = 1;
 
+kernel_cap_t controlled_userns_caps_whitelist = CAP_FULL_SET;
+
 static int __init file_caps_disable(char *str)
 {
file_caps_enabled = 0;
@@ -507,3 +509,48 @@ bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns)
rcu_read_unlock();
return (ret == 0);
 }
+
+/* Controlled-userns capabilities routines */
+#ifdef CONFIG_SYSCTL
+int proc_douserns_caps_whitelist(struct ct

[PATCHv4 0/2] capability controlled user-namespaces

2018-01-02 Thread Mahesh Bandewar
From: Mahesh Bandewar 

TL;DR version
-
Creating a sandbox environment with namespaces is challenging
considering what these sandboxed processes can engage in, e.g.
CVE-2017-6074, CVE-2017-7184, CVE-2017-7308, just to name a few.
The current form of user-namespaces, however, if changed a bit, can
allow us to create a sandbox environment without locking down
user-namespaces.

Detailed version


Problem
---
User-namespaces in their current form have increased the attack surface, as
any process can acquire capabilities which are not available to it (by
default) by performing a combination of clone()/unshare()/setns() syscalls.

#define _GNU_SOURCE
#include <stdio.h>
#include <sched.h>
#include <netinet/in.h>

/*
 * Demonstration program: try to open an AF_INET6 raw socket, then call
 * unshare(CLONE_NEWUSER | CLONE_NEWNET) and try again, printing the
 * outcome of each attempt so the two can be compared.
 *
 * The command-line arguments (ac, av) are not used.
 *
 * Returns 1 if unshare() fails, 0 otherwise. A failed socket() call is
 * only reported via perror(); it does not change the exit status.
 */
int main(int ac, char **av)
{
int sock = -1;

/* First attempt: raw socket in the namespaces the process started in. */
printf("Attempting to open RAW socket before unshare()...\n");
sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW);
if (sock < 0) {
perror("socket() SOCK_RAW failed: ");
} else {
printf("Successfully opened RAW-Sock before unshare().\n");
/* Close the socket and reset the descriptor before the second attempt. */
close(sock);
sock = -1;
}

/* Move into a new user namespace and a new network namespace. */
if (unshare(CLONE_NEWUSER | CLONE_NEWNET) < 0) {
perror("unshare() failed: ");
return 1;
}

/* Second attempt: the same socket() call, now made after unshare(). */
printf("Attempting to open RAW socket after unshare()...\n");
sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW);
if (sock < 0) {
perror("socket() SOCK_RAW failed: ");
} else {
printf("Successfully opened RAW-Sock after unshare().\n");
close(sock);
sock = -1;
}

return 0;
}

The above example shows how easy it is to acquire NET_RAW capabilities
and once acquired, these processes could take benefit of above mentioned
or similar issues discovered/undiscovered with malicious intent. Note
that this is just an example and the problem/solution is not limited
to NET_RAW capability *only*. 

The easiest fix one can apply here is to lock-down user-namespaces which
many of the distros do (i.e. don't allow users to create user namespaces),
but unfortunately that prevents everyone from using them.

Approach

Introduce a notion of 'controlled' user-namespaces. Every process on
the host is allowed to create user-namespaces (governed by the limit
imposed by per-ns sysctl) however, mark user-namespaces created by
sandboxed processes as 'controlled'. Use this 'mark' at the time of
capability check in conjunction with a global capability whitelist.
If the capability is not whitelisted, processes that belong to 
controlled user-namespaces will not be allowed.

Processes that do not have CAP_SYS_ADMIN in init-ns can *only* create
controlled user-namespaces. In other words, user-namespaces created by
privileged processes (those which have CAP_SYS_ADMIN in init-ns) are
not controlled. A hierarchy underneath any controlled user-ns is always
controlled.

The global whitelist is a list of capabilities governed by a sysctl
(kernel.controlled_userns_caps_whitelist). It can be modified by a
(privileged) user in init-ns, and it applies to all controlled
user-namespaces on the host irrespective of when each user-ns was
created.

Marking user-namespaces controlled without modifying the whitelist is
equivalent to the current behavior. The default value of the whitelist includes
all capabilities so that the compatibility is maintained. However it gives
admins fine-grained ability to control various capabilities system wide
without locking down user-namespaces.

Example
---
Here is the example that demonstrates the behavior of a kernel that has
this patch-set applied. It uses the same c-code from this commit-log and
is called acquire_raw.c -

(a) The 'root' user has all the capabilities all the time (before and
after taking capability).

root@vm0:~# id
uid=0(root) gid=0(root) groups=0(root)

root@vm0:~# sysctl -q kernel.controlled_userns_caps_whitelist 
kernel.controlled_userns_caps_whitelist = 1f,

root@vm0:~# ./acquire_raw 
Attempting to open RAW socket before unshare()...
Successfully opened RAW-Sock before unshare().
Attempting to open RAW socket after unshare()...
Successfully opened RAW-Sock after unshare().

root@vm0:~# sysctl -w kernel.controlled_userns_caps_whitelist=1f,dfff
kernel.controlled_userns_caps_whitelist = 1f,dfff

root@vm0:~# ./acquire_raw 
Attempting to open RAW socket before unshare()...
Successfully opened RAW-Sock before unshare().
Attempting to open RAW socket after unshare()...
Successfully opened RAW-Sock after unshare().

(b) Unprivileged user cannot change the mask.

mahesh@vm0:~$ id
uid=1000(mahesh) gid=1000(mahesh)

groups=1000(mahesh),4(adm),24(cdrom),27(sudo),30(dip),46(plugdev),118(lpadmin),128(sambashare)

mahesh@vm

[PATCHv4 2/2] userns: control capabilities of some user namespaces

2018-01-02 Thread Mahesh Bandewar
From: Mahesh Bandewar 

With this new notion of "controlled" user-namespaces, the controlled
user-namespaces are marked at the time of their creation while the
capabilities of processes that belong to them are controlled using the
global mask.

Init-user-ns is always uncontrolled and a process that has SYS_ADMIN
that belongs to uncontrolled user-ns can create another (child) user-
namespace that is uncontrolled. Any other process (that either does
not have SYS_ADMIN or belongs to a controlled user-ns) can only
create a user-ns that is controlled.

global-capability-whitelist (controlled_userns_caps_whitelist) is used
at the capability check-time and keeps the semantics for the processes
that belong to uncontrolled user-ns as it is. Processes that belong to
controlled user-ns however are subjected to different checks-

   (a) if the capability in question is controlled and process belongs
   to controlled user-ns, then it's always denied.
   (b) if the capability in question is NOT controlled then fall back
   to the traditional check.

Acked-by: Serge Hallyn 
Signed-off-by: Mahesh Bandewar 
---
v4:
  Rebase
v3:
  Rebase
v2:
  Don't recalculate user-ns flags for every setns() call.
v1:
  Initial submission.

 include/linux/capability.h |  4 
 include/linux/user_namespace.h | 25 +
 kernel/capability.c|  5 +
 kernel/user_namespace.c|  4 
 security/commoncap.c   |  8 
 5 files changed, 46 insertions(+)

diff --git a/include/linux/capability.h b/include/linux/capability.h
index 7d79a4689625..383f31f066f0 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -251,6 +251,10 @@ extern bool ptracer_capable(struct task_struct *tsk, 
struct user_namespace *ns);
 extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct 
cpu_vfs_cap_data *cpu_caps);
 int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
 void __user *buff, size_t *lenp, loff_t *ppos);
+/* Controlled capability is capability that is missing from the capability-mask
+ * controlled_userns_caps_whitelist controlled via sysctl.
+ */
+bool is_capability_controlled(int cap);
 
 extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t 
size);
 
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index d6b74b91096b..a5c48684b317 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -32,6 +32,7 @@ struct uid_gid_map { /* 64 bytes -- 1 cache line */
 };
 
 #define USERNS_SETGROUPS_ALLOWED 1UL
+#define USERNS_CONTROLLED   2UL
 
 #define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED
 
@@ -112,6 +113,21 @@ static inline void put_user_ns(struct user_namespace *ns)
__put_user_ns(ns);
 }
 
+/* Controlled user-ns is the one that is created by a process that does not
+ * have CAP_SYS_ADMIN (or descended from such an user-ns).
+ * For more details please see the sysctl description of
+ * controlled_userns_caps_whitelist.
+ */
+static inline bool is_user_ns_controlled(const struct user_namespace *ns)
+{
+   return ns->flags & USERNS_CONTROLLED;
+}
+
+static inline void mark_user_ns_controlled(struct user_namespace *ns)
+{
+   ns->flags |= USERNS_CONTROLLED;
+}
+
 struct seq_operations;
 extern const struct seq_operations proc_uid_seq_operations;
 extern const struct seq_operations proc_gid_seq_operations;
@@ -170,6 +186,15 @@ static inline struct ns_common *ns_get_owner(struct 
ns_common *ns)
 {
return ERR_PTR(-EPERM);
 }
+
+static inline bool is_user_ns_controlled(const struct user_namespace *ns)
+{
+   return false;
+}
+
+static inline void mark_user_ns_controlled(struct user_namespace *ns)
+{
+}
 #endif
 
 #endif /* _LINUX_USER_H */
diff --git a/kernel/capability.c b/kernel/capability.c
index 4a859b7d4902..bffe249922de 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -511,6 +511,11 @@ bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns)
 }
 
 /* Controlled-userns capabilities routines */
+bool is_capability_controlled(int cap)
+{
+   return !cap_raised(controlled_userns_caps_whitelist, cap);
+}
+
 #ifdef CONFIG_SYSCTL
 int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
 void __user *buff, size_t *lenp, loff_t *ppos)
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 246d4d4ce5c7..ca0556d466b6 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -141,6 +141,10 @@ int create_user_ns(struct cred *new)
goto fail_keyring;
 
set_cred_user_ns(new, ns);
+   if (!ns_capable(parent_ns, CAP_SYS_ADMIN) ||
+   is_user_ns_controlled(parent_ns))
+   mark_user_ns_controlled(ns);
+
return 0;
 fail_keyring:
 #ifdef CONFIG_PERSISTENT_KEYRINGS
diff --git a/security/commoncap.c b/security/commoncap.c
index 4f8

[PATCH next 0/2] ipvlan: packet scrub

2017-12-13 Thread Mahesh Bandewar
From: Mahesh Bandewar 

While crossing namespace boundary IPvlan aggressively scrubs packets.
This is creating problems. First thing is that scrubbing changes the 
packet type in skb meta-data to PACKET_HOST. This causes erroneous
packet delivery when dev_forward_skb() has already marked the packet
type as OTHER_HOST.

On the egress side, scrubbing just before calling dev_queue_xmit()
creates another set of problems. Scrubbing removes skb->sk, so the
prio update gets missed and, more seriously, socket back-pressure
fails, making TSQ not function correctly.

The first patch in the series just reverts the earlier change which
was adding a mac-check, but that is unnecessary if the packet_type that
dev_forward_skb() has set is honored. The second patch removes two of
the scrubs which are causing the problems described above.


Mahesh Bandewar (2):
  Revert "ipvlan: add L2 check for packets arriving via virtual devices"
  ipvlan: remove excessive packet scrubbing

 drivers/net/ipvlan/ipvlan_core.c | 8 ++--
 1 file changed, 2 insertions(+), 6 deletions(-)

-- 
2.15.1.424.g9478a66081-goog



[PATCH next 1/2] Revert "ipvlan: add L2 check for packets arriving via virtual devices"

2017-12-13 Thread Mahesh Bandewar
From: Mahesh Bandewar 

This reverts commit 92ff42645028fa6f9b8aa767718457b9264316b4.

Even though the check added is not that taxing, it's not really needed.
First of all, it is a per-packet cost, and secondly,
eth_type_trans() already does this correctly. The excessive scrubbing
in IPvlan was changing the pkt-type skb metadata of the packet which
made it necessary to re-check the mac. The subsequent patch in this
series removes the faulty packet-scrub.

Signed-off-by: Mahesh Bandewar 
---
 drivers/net/ipvlan/ipvlan_core.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c
index 0bc7f721b717..9774c96ac7bb 100644
--- a/drivers/net/ipvlan/ipvlan_core.c
+++ b/drivers/net/ipvlan/ipvlan_core.c
@@ -322,10 +322,6 @@ static int ipvlan_rcv_frame(struct ipvl_addr *addr, struct 
sk_buff **pskb,
if (dev_forward_skb(ipvlan->dev, skb) == NET_RX_SUCCESS)
success = true;
} else {
-   if (!ether_addr_equal_64bits(eth_hdr(skb)->h_dest,
-ipvlan->phy_dev->dev_addr))
-   skb->pkt_type = PACKET_OTHERHOST;
-
ret = RX_HANDLER_ANOTHER;
success = true;
}
-- 
2.15.1.424.g9478a66081-goog



[PATCH next 2/2] ipvlan: remove excessive packet scrubbing

2017-12-13 Thread Mahesh Bandewar
From: Mahesh Bandewar 

IPvlan currently scrubs packets at every location where packets may be
crossing namespace boundary. Though this is desirable, currently IPvlan
does it more than necessary. e.g. packets that are going to take
dev_forward_skb() path will get scrubbed so no point in scrubbing them
before forwarding. Another side-effect of scrubbing is that pkt-type gets
set to PACKET_HOST, which overrides what had already been set by the
earlier path and causes erroneous delivery of the packets.

Also, scrubbing packets just before calling dev_queue_xmit() has detrimental
effects since packets lose skb->sk and, because of that, miss prio updates,
get incorrect socket back-pressure and would even break TSQ.

Fixes: b93dd49c1a35 ('ipvlan: Scrub skb before crossing the namespace boundary')
Signed-off-by: Mahesh Bandewar 
---
 drivers/net/ipvlan/ipvlan_core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c
index 9774c96ac7bb..c1f008fe4e1d 100644
--- a/drivers/net/ipvlan/ipvlan_core.c
+++ b/drivers/net/ipvlan/ipvlan_core.c
@@ -315,13 +315,13 @@ static int ipvlan_rcv_frame(struct ipvl_addr *addr, 
struct sk_buff **pskb,
 
*pskb = skb;
}
-   ipvlan_skb_crossing_ns(skb, dev);
 
if (local) {
skb->pkt_type = PACKET_HOST;
if (dev_forward_skb(ipvlan->dev, skb) == NET_RX_SUCCESS)
success = true;
} else {
+   skb->dev = dev;
ret = RX_HANDLER_ANOTHER;
success = true;
}
@@ -586,7 +586,7 @@ static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct 
net_device *dev)
return NET_XMIT_SUCCESS;
}
 
-   ipvlan_skb_crossing_ns(skb, ipvlan->phy_dev);
+   skb->dev = ipvlan->phy_dev;
return dev_queue_xmit(skb);
 }
 
-- 
2.15.1.424.g9478a66081-goog



[PATCH next] ipvlan: add L2 check for packets arriving via virtual devices

2017-12-07 Thread Mahesh Bandewar
From: Mahesh Bandewar 

Packets that don't have dest mac as the mac of the master device should
not be entertained by the IPvlan rx-handler. This is mostly true as the
packet path mostly takes care of that, except when the master device is
a virtual device. As demonstrated in the following case -

  ip netns add ns1
  ip link add ve1 type veth peer name ve2
  ip link add link ve2 name iv1 type ipvlan mode l2
  ip link set dev iv1 netns ns1
  ip link set ve1 up
  ip link set ve2 up
  ip -n ns1 link set iv1 up
  ip addr add 192.168.10.1/24 dev ve1
  ip -n ns1 addr add 192.168.10.2/24 dev iv1
  ping -c2 192.168.10.2
  <works>
  ip neigh show dev ve1
  ip neigh change 192.168.10.2 lladdr <random-mac> dev ve1
  ping -c2 192.168.10.2
  <still works, although it should not>

This patch adds that missing check in the IPvlan rx-handler.

Reported-by: Amit Sikka 
Signed-off-by: Mahesh Bandewar 
---
 drivers/net/ipvlan/ipvlan_core.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c
index 9774c96ac7bb..0bc7f721b717 100644
--- a/drivers/net/ipvlan/ipvlan_core.c
+++ b/drivers/net/ipvlan/ipvlan_core.c
@@ -322,6 +322,10 @@ static int ipvlan_rcv_frame(struct ipvl_addr *addr, struct 
sk_buff **pskb,
if (dev_forward_skb(ipvlan->dev, skb) == NET_RX_SUCCESS)
success = true;
} else {
+   if (!ether_addr_equal_64bits(eth_hdr(skb)->h_dest,
+ipvlan->phy_dev->dev_addr))
+   skb->pkt_type = PACKET_OTHERHOST;
+
ret = RX_HANDLER_ANOTHER;
success = true;
}
-- 
2.15.1.424.g9478a66081-goog



[PATCHv3 0/2] capability controlled user-namespaces

2017-12-05 Thread Mahesh Bandewar
From: Mahesh Bandewar 

TL;DR version
-
Creating a sandbox environment with namespaces is challenging
considering what these sandboxed processes can engage in, e.g.
CVE-2017-6074, CVE-2017-7184, CVE-2017-7308 etc., to name just a few.
Current form of user-namespaces, however, if changed a bit can allow
us to create a sandbox environment without locking down user-
namespaces.

Detailed version


Problem
---
User-namespaces in their current form have increased the attack surface, as
any process can acquire capabilities which are not available to it (by
default) by performing a combination of clone()/unshare()/setns() syscalls.

#define _GNU_SOURCE
#include <stdio.h>
#include <sched.h>
#include <netinet/in.h>

int main(int ac, char **av)
{
int sock = -1;

printf("Attempting to open RAW socket before unshare()...\n");
sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW);
if (sock < 0) {
perror("socket() SOCK_RAW failed: ");
} else {
printf("Successfully opened RAW-Sock before unshare().\n");
close(sock);
sock = -1;
}

if (unshare(CLONE_NEWUSER | CLONE_NEWNET) < 0) {
perror("unshare() failed: ");
return 1;
}

printf("Attempting to open RAW socket after unshare()...\n");
sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW);
if (sock < 0) {
perror("socket() SOCK_RAW failed: ");
} else {
printf("Successfully opened RAW-Sock after unshare().\n");
close(sock);
sock = -1;
}

return 0;
}

The above example shows how easy it is to acquire NET_RAW capabilities
and once acquired, these processes could take benefit of above mentioned
or similar issues discovered/undiscovered with malicious intent. Note
that this is just an example and the problem/solution is not limited
to NET_RAW capability *only*. 

The easiest fix one can apply here is to lock-down user-namespaces which
many of the distros do (i.e. don't allow users to create user namespaces),
but unfortunately that prevents everyone from using them.

Approach

Introduce a notion of 'controlled' user-namespaces. Every process on
the host is allowed to create user-namespaces (governed by the limit
imposed by per-ns sysctl) however, mark user-namespaces created by
sandboxed processes as 'controlled'. Use this 'mark' at the time of
capability check in conjunction with a global capability whitelist.
If the capability is not whitelisted, processes that belong to 
controlled user-namespaces will not be allowed.

Once a user-ns is marked as 'controlled'; all its child user-
namespaces are marked as 'controlled' too.

The global whitelist is a list of capabilities governed by the
sysctl, which a (privileged) user in init-ns can modify, and
it applies to all controlled user-namespaces on the host.

Marking user-namespaces controlled without modifying the whitelist is
equivalent to the current behavior. The default value of the whitelist includes
all capabilities so that the compatibility is maintained. However it gives
admins fine-grained ability to control various capabilities system wide
without locking down user-namespaces.

Please see individual patches in this series.

Mahesh Bandewar (2):
  capability: introduce sysctl for controlled user-ns capability whitelist
  userns: control capabilities of some user namespaces

 Documentation/sysctl/kernel.txt | 21 +
 include/linux/capability.h  |  7 ++
 include/linux/user_namespace.h  | 25 
 kernel/capability.c | 52 +
 kernel/sysctl.c |  5 
 kernel/user_namespace.c |  4 
 security/commoncap.c|  8 +++
 7 files changed, 122 insertions(+)

-- 
2.15.0.531.g2ccb3012c9-goog



[PATCHv3 1/2] capability: introduce sysctl for controlled user-ns capability whitelist

2017-12-05 Thread Mahesh Bandewar
From: Mahesh Bandewar 

Add a sysctl variable kernel.controlled_userns_caps_whitelist. This
takes input as capability mask expressed as two comma separated hex
u32 words. The mask, however, is stored in kernel as kernel_cap_t type.

Any capabilities that are not part of this mask will be controlled and
will not be allowed to processes in controlled user-ns.

Acked-by: Serge Hallyn 
Signed-off-by: Mahesh Bandewar 
---
v3:
  Added couple of comments as requested by Serge Hallyn
v2:
  Rebase
v1:
  Initial submission

 Documentation/sysctl/kernel.txt | 21 ++
 include/linux/capability.h  |  3 +++
 kernel/capability.c | 47 +
 kernel/sysctl.c |  5 +
 4 files changed, 76 insertions(+)

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 694968c7523c..a1d39dbae847 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -25,6 +25,7 @@ show up in /proc/sys/kernel:
 - bootloader_version[ X86 only ]
 - callhome  [ S390 only ]
 - cap_last_cap
+- controlled_userns_caps_whitelist
 - core_pattern
 - core_pipe_limit
 - core_uses_pid
@@ -187,6 +188,26 @@ CAP_LAST_CAP from the kernel.
 
 ==
 
+controlled_userns_caps_whitelist
+
+Capability mask that is whitelisted for "controlled" user namespaces.
+Any capability that is missing from this mask will not be allowed to
+any process that is attached to a controlled-userns. e.g. if CAP_NET_RAW
+is not part of this mask, then processes running inside any controlled
+userns's will not be allowed to perform action that needs CAP_NET_RAW
+capability. However, processes that are attached to a parent user-ns
+hierarchy that is *not* controlled and has CAP_NET_RAW can continue
+performing those actions. User-namespaces are marked "controlled" at
+the time of their creation based on the capabilities of the creator.
+A process that does not have CAP_SYS_ADMIN will create user-namespaces
+that are controlled.
+
+The value is expressed as two comma separated hex words (u32). This
+sysctl is available in init-ns and users with CAP_SYS_ADMIN in init-ns
+are allowed to make changes.
+
+==
+
 core_pattern:
 
 core_pattern is used to specify a core dumpfile pattern name.
diff --git a/include/linux/capability.h b/include/linux/capability.h
index f640dcbc880c..7d79a4689625 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -14,6 +14,7 @@
 #define _LINUX_CAPABILITY_H
 
 #include 
+#include 
 
 
 #define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3
@@ -248,6 +249,8 @@ extern bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns);
 
 /* audit system wants to get cap info from files as well */
 extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct 
cpu_vfs_cap_data *cpu_caps);
+int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
+void __user *buff, size_t *lenp, loff_t *ppos);
 
 extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t 
size);
 
diff --git a/kernel/capability.c b/kernel/capability.c
index 1e1c0236f55b..4a859b7d4902 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -29,6 +29,8 @@ EXPORT_SYMBOL(__cap_empty_set);
 
 int file_caps_enabled = 1;
 
+kernel_cap_t controlled_userns_caps_whitelist = CAP_FULL_SET;
+
 static int __init file_caps_disable(char *str)
 {
file_caps_enabled = 0;
@@ -507,3 +509,48 @@ bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns)
rcu_read_unlock();
return (ret == 0);
 }
+
+/* Controlled-userns capabilities routines */
+#ifdef CONFIG_SYSCTL
+int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
+void __user *buff, size_t *lenp, loff_t *ppos)
+{
+   DECLARE_BITMAP(caps_bitmap, CAP_LAST_CAP);
+   struct ctl_table caps_table;
+   char tbuf[NAME_MAX];
+   int ret;
+
+   ret = bitmap_from_u32array(caps_bitmap, CAP_LAST_CAP,
+  controlled_userns_caps_whitelist.cap,
+  _KERNEL_CAPABILITY_U32S);
+   if (ret != CAP_LAST_CAP)
+   return -1;
+
+   scnprintf(tbuf, NAME_MAX, "%*pb", CAP_LAST_CAP, caps_bitmap);
+
+   caps_table.data = tbuf;
+   caps_table.maxlen = NAME_MAX;
+   caps_table.mode = table->mode;
+   ret = proc_dostring(&caps_table, write, buff, lenp, ppos);
+   if (ret)
+   return ret;
+   if (write) {
+   kernel_cap_t tmp;
+
+   if (!capable(CAP_SYS_ADMIN))
+   return -EPERM;
+
+   ret = bitmap_parse_user(buff, *lenp, caps_bitmap, CAP_LAST_CAP);
+   if (ret)
+  

[PATCHv3 2/2] userns: control capabilities of some user namespaces

2017-12-05 Thread Mahesh Bandewar
From: Mahesh Bandewar 

With this new notion of "controlled" user-namespaces, the controlled
user-namespaces are marked at the time of their creation while the
capabilities of processes that belong to them are controlled using the
global mask.

Init-user-ns is always uncontrolled and a process that has SYS_ADMIN
that belongs to uncontrolled user-ns can create another (child) user-
namespace that is uncontrolled. Any other process (that either does
not have SYS_ADMIN or belongs to a controlled user-ns) can only
create a user-ns that is controlled.

global-capability-whitelist (controlled_userns_caps_whitelist) is used
at the capability check-time and keeps the semantics for the processes
that belong to uncontrolled user-ns as it is. Processes that belong to
controlled user-ns however are subjected to different checks-

   (a) if the capability in question is controlled and process belongs
   to controlled user-ns, then it's always denied.
   (b) if the capability in question is NOT controlled then fall back
   to the traditional check.

Acked-by: Serge Hallyn 
Signed-off-by: Mahesh Bandewar 
---
v3:
  Rebase
v2:
  Don't recalculate user-ns flags for every setns() call.
v1:
  Initial submission.

 include/linux/capability.h |  4 
 include/linux/user_namespace.h | 25 +
 kernel/capability.c|  5 +
 kernel/user_namespace.c|  4 
 security/commoncap.c   |  8 
 5 files changed, 46 insertions(+)

diff --git a/include/linux/capability.h b/include/linux/capability.h
index 7d79a4689625..383f31f066f0 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -251,6 +251,10 @@ extern bool ptracer_capable(struct task_struct *tsk, 
struct user_namespace *ns);
 extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct 
cpu_vfs_cap_data *cpu_caps);
 int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
 void __user *buff, size_t *lenp, loff_t *ppos);
+/* Controlled capability is capability that is missing from the capability-mask
+ * controlled_userns_caps_whitelist controlled via sysctl.
+ */
+bool is_capability_controlled(int cap);
 
 extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t 
size);
 
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index d6b74b91096b..a5c48684b317 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -32,6 +32,7 @@ struct uid_gid_map { /* 64 bytes -- 1 cache line */
 };
 
 #define USERNS_SETGROUPS_ALLOWED 1UL
+#define USERNS_CONTROLLED   2UL
 
 #define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED
 
@@ -112,6 +113,21 @@ static inline void put_user_ns(struct user_namespace *ns)
__put_user_ns(ns);
 }
 
+/* Controlled user-ns is the one that is created by a process that does not
+ * have CAP_SYS_ADMIN (or descended from such an user-ns).
+ * For more details please see the sysctl description of
+ * controlled_userns_caps_whitelist.
+ */
+static inline bool is_user_ns_controlled(const struct user_namespace *ns)
+{
+   return ns->flags & USERNS_CONTROLLED;
+}
+
+static inline void mark_user_ns_controlled(struct user_namespace *ns)
+{
+   ns->flags |= USERNS_CONTROLLED;
+}
+
 struct seq_operations;
 extern const struct seq_operations proc_uid_seq_operations;
 extern const struct seq_operations proc_gid_seq_operations;
@@ -170,6 +186,15 @@ static inline struct ns_common *ns_get_owner(struct 
ns_common *ns)
 {
return ERR_PTR(-EPERM);
 }
+
+static inline bool is_user_ns_controlled(const struct user_namespace *ns)
+{
+   return false;
+}
+
+static inline void mark_user_ns_controlled(struct user_namespace *ns)
+{
+}
 #endif
 
 #endif /* _LINUX_USER_H */
diff --git a/kernel/capability.c b/kernel/capability.c
index 4a859b7d4902..bffe249922de 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -511,6 +511,11 @@ bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns)
 }
 
 /* Controlled-userns capabilities routines */
+bool is_capability_controlled(int cap)
+{
+   return !cap_raised(controlled_userns_caps_whitelist, cap);
+}
+
 #ifdef CONFIG_SYSCTL
 int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
 void __user *buff, size_t *lenp, loff_t *ppos)
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 246d4d4ce5c7..ca0556d466b6 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -141,6 +141,10 @@ int create_user_ns(struct cred *new)
goto fail_keyring;
 
set_cred_user_ns(new, ns);
+   if (!ns_capable(parent_ns, CAP_SYS_ADMIN) ||
+   is_user_ns_controlled(parent_ns))
+   mark_user_ns_controlled(ns);
+
return 0;
 fail_keyring:
 #ifdef CONFIG_PERSISTENT_KEYRINGS
diff --git a/security/commoncap.c b/security/commoncap.c
index 4f8e09340956..5454

[PATCHv2 0/2] capability controlled user-namespaces

2017-11-09 Thread Mahesh Bandewar
From: Mahesh Bandewar 

TL;DR version
-
Creating a sandbox environment with namespaces is challenging
considering what these sandboxed processes can engage in, e.g.
CVE-2017-6074, CVE-2017-7184, CVE-2017-7308 etc., to name just a few.
Current form of user-namespaces, however, if changed a bit can allow
us to create a sandbox environment without locking down user-
namespaces.

Detailed version


Problem
---
User-namespaces in their current form have increased the attack surface, as
any process can acquire capabilities which are not available to it (by
default) by performing a combination of clone()/unshare()/setns() syscalls.

#define _GNU_SOURCE
#include <stdio.h>
#include <sched.h>
#include <netinet/in.h>

int main(int ac, char **av)
{
int sock = -1;

printf("Attempting to open RAW socket before unshare()...\n");
sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW);
if (sock < 0) {
perror("socket() SOCK_RAW failed: ");
} else {
printf("Successfully opened RAW-Sock before unshare().\n");
close(sock);
sock = -1;
}

if (unshare(CLONE_NEWUSER | CLONE_NEWNET) < 0) {
perror("unshare() failed: ");
return 1;
}

printf("Attempting to open RAW socket after unshare()...\n");
sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW);
if (sock < 0) {
perror("socket() SOCK_RAW failed: ");
} else {
printf("Successfully opened RAW-Sock after unshare().\n");
close(sock);
sock = -1;
}

return 0;
}

The above example shows how easy it is to acquire NET_RAW capabilities
and once acquired, these processes could take benefit of above mentioned
or similar issues discovered/undiscovered with malicious intent. Note
that this is just an example and the problem/solution is not limited
to NET_RAW capability *only*. 

The easiest fix one can apply here is to lock-down user-namespaces which
many of the distros do (i.e. don't allow users to create user namespaces),
but unfortunately that prevents everyone from using them.

Approach

Introduce a notion of 'controlled' user-namespaces. Every process on
the host is allowed to create user-namespaces (governed by the limit
imposed by per-ns sysctl) however, mark user-namespaces created by
sandboxed processes as 'controlled'. Use this 'mark' at the time of
capability check in conjunction with a global capability whitelist.
If the capability is not whitelisted, processes that belong to 
controlled user-namespaces will not be allowed.

Once a user-ns is marked as 'controlled'; all its child user-
namespaces are marked as 'controlled' too.

The global whitelist is a list of capabilities governed by the
sysctl, which a (privileged) user in init-ns can modify, and
it applies to all controlled user-namespaces on the host.

Marking user-namespaces controlled without modifying the whitelist is
equivalent to the current behavior. The default value of the whitelist includes
all capabilities so that the compatibility is maintained. However it gives
admins fine-grained ability to control various capabilities system wide
without locking down user-namespaces.

Please see individual patches in this series.

Mahesh Bandewar (2):
  capability: introduce sysctl for controlled user-ns capability whitelist
  userns: control capabilities of some user namespaces

 Documentation/sysctl/kernel.txt | 21 +
 include/linux/capability.h  |  4 
 include/linux/user_namespace.h  | 20 
 kernel/capability.c | 52 +
 kernel/sysctl.c |  5 
 kernel/user_namespace.c |  4 
 security/commoncap.c|  8 +++
 7 files changed, 114 insertions(+)

-- 
2.15.0.448.gf294e3d99a-goog



[PATCHv2 1/2] capability: introduce sysctl for controlled user-ns capability whitelist

2017-11-09 Thread Mahesh Bandewar
From: Mahesh Bandewar 

Add a sysctl variable kernel.controlled_userns_caps_whitelist. This
takes input as capability mask expressed as two comma separated hex
u32 words. The mask, however, is stored in kernel as kernel_cap_t type.

Any capabilities that are not part of this mask will be controlled and
will not be allowed to processes in controlled user-ns.

Signed-off-by: Mahesh Bandewar 
---
v2:
  Rebase
v1:
  Initial submission

 Documentation/sysctl/kernel.txt | 21 ++
 include/linux/capability.h  |  3 +++
 kernel/capability.c | 47 +
 kernel/sysctl.c |  5 +
 4 files changed, 76 insertions(+)

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 694968c7523c..a1d39dbae847 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -25,6 +25,7 @@ show up in /proc/sys/kernel:
 - bootloader_version[ X86 only ]
 - callhome  [ S390 only ]
 - cap_last_cap
+- controlled_userns_caps_whitelist
 - core_pattern
 - core_pipe_limit
 - core_uses_pid
@@ -187,6 +188,26 @@ CAP_LAST_CAP from the kernel.
 
 ==
 
+controlled_userns_caps_whitelist
+
+Capability mask that is whitelisted for "controlled" user namespaces.
+Any capability that is missing from this mask will not be allowed to
+any process that is attached to a controlled-userns. e.g. if CAP_NET_RAW
+is not part of this mask, then processes running inside any controlled
+userns's will not be allowed to perform action that needs CAP_NET_RAW
+capability. However, processes that are attached to a parent user-ns
+hierarchy that is *not* controlled and has CAP_NET_RAW can continue
+performing those actions. User-namespaces are marked "controlled" at
+the time of their creation based on the capabilities of the creator.
+A process that does not have CAP_SYS_ADMIN will create user-namespaces
+that are controlled.
+
+The value is expressed as two comma separated hex words (u32). This
+sysctl is available in init-ns and users with CAP_SYS_ADMIN in init-ns
+are allowed to make changes.
+
+==
+
 core_pattern:
 
 core_pattern is used to specify a core dumpfile pattern name.
diff --git a/include/linux/capability.h b/include/linux/capability.h
index f640dcbc880c..7d79a4689625 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -14,6 +14,7 @@
 #define _LINUX_CAPABILITY_H
 
 #include 
+#include 
 
 
 #define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3
@@ -248,6 +249,8 @@ extern bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns);
 
 /* audit system wants to get cap info from files as well */
 extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct 
cpu_vfs_cap_data *cpu_caps);
+int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
+void __user *buff, size_t *lenp, loff_t *ppos);
 
 extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t 
size);
 
diff --git a/kernel/capability.c b/kernel/capability.c
index 1e1c0236f55b..4a859b7d4902 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -29,6 +29,8 @@ EXPORT_SYMBOL(__cap_empty_set);
 
 int file_caps_enabled = 1;
 
+kernel_cap_t controlled_userns_caps_whitelist = CAP_FULL_SET;
+
 static int __init file_caps_disable(char *str)
 {
file_caps_enabled = 0;
@@ -507,3 +509,48 @@ bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns)
rcu_read_unlock();
return (ret == 0);
 }
+
+/* Controlled-userns capabilities routines */
+#ifdef CONFIG_SYSCTL
+int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
+void __user *buff, size_t *lenp, loff_t *ppos)
+{
+   DECLARE_BITMAP(caps_bitmap, CAP_LAST_CAP);
+   struct ctl_table caps_table;
+   char tbuf[NAME_MAX];
+   int ret;
+
+   ret = bitmap_from_u32array(caps_bitmap, CAP_LAST_CAP,
+  controlled_userns_caps_whitelist.cap,
+  _KERNEL_CAPABILITY_U32S);
+   if (ret != CAP_LAST_CAP)
+   return -1;
+
+   scnprintf(tbuf, NAME_MAX, "%*pb", CAP_LAST_CAP, caps_bitmap);
+
+   caps_table.data = tbuf;
+   caps_table.maxlen = NAME_MAX;
+   caps_table.mode = table->mode;
+   ret = proc_dostring(&caps_table, write, buff, lenp, ppos);
+   if (ret)
+   return ret;
+   if (write) {
+   kernel_cap_t tmp;
+
+   if (!capable(CAP_SYS_ADMIN))
+   return -EPERM;
+
+   ret = bitmap_parse_user(buff, *lenp, caps_bitmap, CAP_LAST_CAP);
+   if (ret)
+   return ret;
+
+   ret = bitmap_to

[PATCHv2 2/2] userns: control capabilities of some user namespaces

2017-11-09 Thread Mahesh Bandewar
From: Mahesh Bandewar 

With this new notion of "controlled" user-namespaces, the controlled
user-namespaces are marked at the time of their creation while the
capabilities of processes that belong to them are controlled using the
global mask.

Init-user-ns is always uncontrolled and a process that has SYS_ADMIN
that belongs to uncontrolled user-ns can create another (child) user-
namespace that is uncontrolled. Any other process (that either does
not have SYS_ADMIN or belongs to a controlled user-ns) can only
create a user-ns that is controlled.

global-capability-whitelist (controlled_userns_caps_whitelist) is used
at the capability check-time and keeps the semantics for the processes
that belong to uncontrolled user-ns as it is. Processes that belong to
controlled user-ns however are subjected to different checks-

   (a) if the capability in question is controlled and process belongs
   to controlled user-ns, then it's always denied.
   (b) if the capability in question is NOT controlled then fall back
   to the traditional check.

Signed-off-by: Mahesh Bandewar 
---
v2:
  Don't recalculate user-ns flags for every setns() call.
v1:
  Initial submission.

 include/linux/capability.h |  1 +
 include/linux/user_namespace.h | 20 
 kernel/capability.c|  5 +
 kernel/user_namespace.c|  4 
 security/commoncap.c   |  8 
 5 files changed, 38 insertions(+)

diff --git a/include/linux/capability.h b/include/linux/capability.h
index 7d79a4689625..a1fd9e460379 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -251,6 +251,7 @@ extern bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns);
 extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct 
cpu_vfs_cap_data *cpu_caps);
 int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
 void __user *buff, size_t *lenp, loff_t *ppos);
+bool is_capability_controlled(int cap);
 
 extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t 
size);
 
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 3fe714da7f5a..647f825c7b5f 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -23,6 +23,7 @@ struct uid_gid_map {  /* 64 bytes -- 1 cache line */
 };
 
 #define USERNS_SETGROUPS_ALLOWED 1UL
+#define USERNS_CONTROLLED   2UL
 
 #define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED
 
@@ -103,6 +104,16 @@ static inline void put_user_ns(struct user_namespace *ns)
__put_user_ns(ns);
 }
 
+static inline bool is_user_ns_controlled(const struct user_namespace *ns)
+{
+   return ns->flags & USERNS_CONTROLLED;
+}
+
+static inline void mark_user_ns_controlled(struct user_namespace *ns)
+{
+   ns->flags |= USERNS_CONTROLLED;
+}
+
 struct seq_operations;
 extern const struct seq_operations proc_uid_seq_operations;
 extern const struct seq_operations proc_gid_seq_operations;
@@ -161,6 +172,15 @@ static inline struct ns_common *ns_get_owner(struct 
ns_common *ns)
 {
return ERR_PTR(-EPERM);
 }
+
+static inline bool is_user_ns_controlled(const struct user_namespace *ns)
+{
+   return false;
+}
+
+static inline void mark_user_ns_controlled(struct user_namespace *ns)
+{
+}
 #endif
 
 #endif /* _LINUX_USER_H */
diff --git a/kernel/capability.c b/kernel/capability.c
index 4a859b7d4902..bffe249922de 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -511,6 +511,11 @@ bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns)
 }
 
 /* Controlled-userns capabilities routines */
+bool is_capability_controlled(int cap)
+{
+   return !cap_raised(controlled_userns_caps_whitelist, cap);
+}
+
 #ifdef CONFIG_SYSCTL
 int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
 void __user *buff, size_t *lenp, loff_t *ppos)
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index c490f1e4313b..600c7dcb9ff7 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -139,6 +139,10 @@ int create_user_ns(struct cred *new)
goto fail_keyring;
 
set_cred_user_ns(new, ns);
+   if (!ns_capable(parent_ns, CAP_SYS_ADMIN) ||
+   is_user_ns_controlled(parent_ns))
+   mark_user_ns_controlled(ns);
+
return 0;
 fail_keyring:
 #ifdef CONFIG_PERSISTENT_KEYRINGS
diff --git a/security/commoncap.c b/security/commoncap.c
index fc46f5b85251..89103f16ac37 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -73,6 +73,14 @@ int cap_capable(const struct cred *cred, struct 
user_namespace *targ_ns,
 {
struct user_namespace *ns = targ_ns;
 
+   /* If the capability is controlled and user-ns that process
+* belongs-to is 'controlled' then return EPERM and no need
+* to check the user-ns hierarchy.
+*/
+  

[PATCH resend 0/2] capability controlled user-namespaces

2017-11-02 Thread Mahesh Bandewar
From: Mahesh Bandewar 

TL;DR version
-
Creating a sandbox environment with namespaces is challenging
considering what these sandboxed processes can engage into. e.g.
CVE-2017-6074, CVE-2017-7184, CVE-2017-7308 etc., just to name a few.
Current form of user-namespaces, however, if changed a bit can allow
us to create a sandbox environment without locking down user-
namespaces.

Detailed version


Problem
---
User-namespaces in the current form have increased the attack surface as
any process can acquire capabilities which are not available to it (by
default) by performing a combination of clone()/unshare()/setns() syscalls.

#define _GNU_SOURCE
#include <stdio.h>
#include <sched.h>
#include <netinet/in.h>

int main(int ac, char **av)
{
int sock = -1;

printf("Attempting to open RAW socket before unshare()...\n");
sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW);
if (sock < 0) {
perror("socket() SOCK_RAW failed: ");
} else {
printf("Successfully opened RAW-Sock before unshare().\n");
close(sock);
sock = -1;
}

if (unshare(CLONE_NEWUSER | CLONE_NEWNET) < 0) {
perror("unshare() failed: ");
return 1;
}

printf("Attempting to open RAW socket after unshare()...\n");
sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW);
if (sock < 0) {
perror("socket() SOCK_RAW failed: ");
} else {
printf("Successfully opened RAW-Sock after unshare().\n");
close(sock);
sock = -1;
}

return 0;
}

The above example shows how easy it is to acquire the NET_RAW capability;
once it is acquired, such a process could, with malicious intent, exploit
the above-mentioned or similar issues (discovered or undiscovered). Note
that this is just an example and the problem/solution is not limited
to the NET_RAW capability *only*.

The easiest fix one can apply here is to lock-down user-namespaces which
many of the distros do (i.e. don't allow users to create user namespaces),
but unfortunately that prevents everyone from using them.

Approach

Introduce a notion of 'controlled' user-namespaces. Every process on
the host is allowed to create user-namespaces (governed by the limit
imposed by per-ns sysctl) however, mark user-namespaces created by
sandboxed processes as 'controlled'. Use this 'mark' at the time of
capability check in conjunction with a global capability whitelist.
If a capability is not whitelisted, processes that belong to
controlled user-namespaces will not be allowed to exercise it.

Once a user-ns is marked as 'controlled'; all its child user-
namespaces are marked as 'controlled' too.

The global whitelist is a list of capabilities governed by a sysctl,
which only a (privileged) user in init-ns can modify, while it is
applicable to all controlled user-namespaces on the host.

Marking user-namespaces controlled without modifying the whitelist is
equivalent to the current behavior. The default value of whitelist includes
all capabilities so that the compatibility is maintained. However it gives
admins fine-grained ability to control various capabilities system wide
without locking down user-namespaces.

Please see individual patches in this series.

Mahesh Bandewar (2):
  capability: introduce sysctl for controlled user-ns capability
whitelist
  userns: control capabilities of some user namespaces

 Documentation/sysctl/kernel.txt | 21 +
 include/linux/capability.h  |  4 
 include/linux/user_namespace.h  | 20 
 kernel/capability.c | 52 +
 kernel/sysctl.c |  5 
 kernel/user_namespace.c |  3 +++
 security/commoncap.c|  8 +++
 7 files changed, 113 insertions(+)

-- 
2.15.0.403.gc27cc4dac6-goog



[PATCH resend 2/2] userns: control capabilities of some user namespaces

2017-11-02 Thread Mahesh Bandewar
From: Mahesh Bandewar 

With this new notion of "controlled" user-namespaces, the controlled
user-namespaces are marked at the time of their creation while the
capabilities of processes that belong to them are controlled using the
global mask.

Init-user-ns is always uncontrolled and a process that has SYS_ADMIN
that belongs to uncontrolled user-ns can create another (child) user-
namespace that is uncontrolled. Any other process (that either does
not have SYS_ADMIN or belongs to a controlled user-ns) can only
create a user-ns that is controlled.

global-capability-whitelist (controlled_userns_caps_whitelist) is used
at the capability check-time and keeps the semantics for the processes
that belong to uncontrolled user-ns as it is. Processes that belong to
controlled user-ns however are subjected to different checks-

   (a) if the capability in question is controlled and process belongs
   to controlled user-ns, then it's always denied.
   (b) if the capability in question is NOT controlled then fall back
   to the traditional check.

Signed-off-by: Mahesh Bandewar 
---
 include/linux/capability.h |  1 +
 include/linux/user_namespace.h | 20 
 kernel/capability.c|  5 +
 kernel/user_namespace.c|  3 +++
 security/commoncap.c   |  8 
 5 files changed, 37 insertions(+)

diff --git a/include/linux/capability.h b/include/linux/capability.h
index 6c0b9677c03f..b8c6cac18658 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -250,6 +250,7 @@ extern bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns);
 extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct 
cpu_vfs_cap_data *cpu_caps);
 int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
 void __user *buff, size_t *lenp, loff_t *ppos);
+bool is_capability_controlled(int cap);
 
 extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t 
size);
 
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index c18e01252346..e890fe81b47e 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -22,6 +22,7 @@ struct uid_gid_map {  /* 64 bytes -- 1 cache line */
 };
 
 #define USERNS_SETGROUPS_ALLOWED 1UL
+#define USERNS_CONTROLLED   2UL
 
 #define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED
 
@@ -102,6 +103,16 @@ static inline void put_user_ns(struct user_namespace *ns)
__put_user_ns(ns);
 }
 
+static inline bool is_user_ns_controlled(const struct user_namespace *ns)
+{
+   return ns->flags & USERNS_CONTROLLED;
+}
+
+static inline void mark_user_ns_controlled(struct user_namespace *ns)
+{
+   ns->flags |= USERNS_CONTROLLED;
+}
+
 struct seq_operations;
 extern const struct seq_operations proc_uid_seq_operations;
 extern const struct seq_operations proc_gid_seq_operations;
@@ -160,6 +171,15 @@ static inline struct ns_common *ns_get_owner(struct 
ns_common *ns)
 {
return ERR_PTR(-EPERM);
 }
+
+static inline bool is_user_ns_controlled(const struct user_namespace *ns)
+{
+   return false;
+}
+
+static inline void mark_user_ns_controlled(struct user_namespace *ns)
+{
+}
 #endif
 
 #endif /* _LINUX_USER_H */
diff --git a/kernel/capability.c b/kernel/capability.c
index 62dbe3350c1b..40a38cc4ff43 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -510,6 +510,11 @@ bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns)
 }
 
 /* Controlled-userns capabilities routines */
+bool is_capability_controlled(int cap)
+{
+   return !cap_raised(controlled_userns_caps_whitelist, cap);
+}
+
 #ifdef CONFIG_SYSCTL
 int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
 void __user *buff, size_t *lenp, loff_t *ppos)
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index c490f1e4313b..f393ea5108f0 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -53,6 +53,9 @@ static void set_cred_user_ns(struct cred *cred, struct 
user_namespace *user_ns)
cred->cap_effective = CAP_FULL_SET;
cred->cap_ambient = CAP_EMPTY_SET;
cred->cap_bset = CAP_FULL_SET;
+   if (!ns_capable(user_ns->parent, CAP_SYS_ADMIN) ||
+   is_user_ns_controlled(user_ns->parent))
+   mark_user_ns_controlled(user_ns);
 #ifdef CONFIG_KEYS
key_put(cred->request_key_auth);
cred->request_key_auth = NULL;
diff --git a/security/commoncap.c b/security/commoncap.c
index fc46f5b85251..89103f16ac37 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -73,6 +73,14 @@ int cap_capable(const struct cred *cred, struct 
user_namespace *targ_ns,
 {
struct user_namespace *ns = targ_ns;
 
+   /* If the capability is controlled and user-ns that process
+* belongs-to is 'controlled' then return EPE

[PATCH resend 1/2] capability: introduce sysctl for controlled user-ns capability whitelist

2017-11-02 Thread Mahesh Bandewar
From: Mahesh Bandewar 

Add a sysctl variable kernel.controlled_userns_caps_whitelist. This
takes input as capability mask expressed as two comma separated hex
u32 words. The mask, however, is stored in kernel as kernel_cap_t type.

Any capabilities that are not part of this mask will be controlled and
will not be allowed to processes in controlled user-ns.

Signed-off-by: Mahesh Bandewar 
---
 Documentation/sysctl/kernel.txt | 21 ++
 include/linux/capability.h  |  3 +++
 kernel/capability.c | 47 +
 kernel/sysctl.c |  5 +
 4 files changed, 76 insertions(+)

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 694968c7523c..a1d39dbae847 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -25,6 +25,7 @@ show up in /proc/sys/kernel:
 - bootloader_version[ X86 only ]
 - callhome  [ S390 only ]
 - cap_last_cap
+- controlled_userns_caps_whitelist
 - core_pattern
 - core_pipe_limit
 - core_uses_pid
@@ -187,6 +188,26 @@ CAP_LAST_CAP from the kernel.
 
 ==
 
+controlled_userns_caps_whitelist
+
+Capability mask that is whitelisted for "controlled" user namespaces.
+Any capability that is missing from this mask will not be allowed to
+any process that is attached to a controlled-userns. e.g. if CAP_NET_RAW
+is not part of this mask, then processes running inside any controlled
+userns's will not be allowed to perform action that needs CAP_NET_RAW
+capability. However, processes that are attached to a parent user-ns
+hierarchy that is *not* controlled and has CAP_NET_RAW can continue
+performing those actions. User-namespaces are marked "controlled" at
+the time of their creation based on the capabilities of the creator.
+A process that does not have CAP_SYS_ADMIN will create user-namespaces
+that are controlled.
+
+The value is expressed as two comma separated hex words (u32). This
+sysctl is available in init-ns and users with CAP_SYS_ADMIN in init-ns
+are allowed to make changes.
+
+==
+
 core_pattern:
 
 core_pattern is used to specify a core dumpfile pattern name.
diff --git a/include/linux/capability.h b/include/linux/capability.h
index b52e278e4744..6c0b9677c03f 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -13,6 +13,7 @@
 #define _LINUX_CAPABILITY_H
 
 #include 
+#include 
 
 
 #define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3
@@ -247,6 +248,8 @@ extern bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns);
 
 /* audit system wants to get cap info from files as well */
 extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct 
cpu_vfs_cap_data *cpu_caps);
+int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
+void __user *buff, size_t *lenp, loff_t *ppos);
 
 extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t 
size);
 
diff --git a/kernel/capability.c b/kernel/capability.c
index f97fe77ceb88..62dbe3350c1b 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -28,6 +28,8 @@ EXPORT_SYMBOL(__cap_empty_set);
 
 int file_caps_enabled = 1;
 
+kernel_cap_t controlled_userns_caps_whitelist = CAP_FULL_SET;
+
 static int __init file_caps_disable(char *str)
 {
file_caps_enabled = 0;
@@ -506,3 +508,48 @@ bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns)
rcu_read_unlock();
return (ret == 0);
 }
+
+/* Controlled-userns capabilities routines */
+#ifdef CONFIG_SYSCTL
+int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
+void __user *buff, size_t *lenp, loff_t *ppos)
+{
+   DECLARE_BITMAP(caps_bitmap, CAP_LAST_CAP);
+   struct ctl_table caps_table;
+   char tbuf[NAME_MAX];
+   int ret;
+
+   ret = bitmap_from_u32array(caps_bitmap, CAP_LAST_CAP,
+  controlled_userns_caps_whitelist.cap,
+  _KERNEL_CAPABILITY_U32S);
+   if (ret != CAP_LAST_CAP)
+   return -1;
+
+   scnprintf(tbuf, NAME_MAX, "%*pb", CAP_LAST_CAP, caps_bitmap);
+
+   caps_table.data = tbuf;
+   caps_table.maxlen = NAME_MAX;
+   caps_table.mode = table->mode;
+   ret = proc_dostring(&caps_table, write, buff, lenp, ppos);
+   if (ret)
+   return ret;
+   if (write) {
+   kernel_cap_t tmp;
+
+   if (!capable(CAP_SYS_ADMIN))
+   return -EPERM;
+
+   ret = bitmap_parse_user(buff, *lenp, caps_bitmap, CAP_LAST_CAP);
+   if (ret)
+   return ret;
+
+   ret = bitmap_to_u32array(tmp.cap, _KERNEL_CAPABILITY_U32S,
+   

[PATCH] ip/ipvlan: enhance ability to add mode flags to existing modes

2017-10-30 Thread Mahesh Bandewar
From: Mahesh Bandewar 

IPvlan supported bridge-only functionality prior to commits
a190d04db937 ('ipvlan: introduce 'private' attribute for all
existing modes.') and fe89aa6b250c ('ipvlan: implement VEPA mode').
These two commits allow to configure the VEPA and private modes now.
This patch adds those options in ip command.

e.g.
  bash:~# ip link add link eth0 name ipvl0 type ipvlan mode l2 private
  -or-
  bash:~# ip link add link eth0 name ipvl0 type ipvlan mode l2 vepa

Also the output will reflect the mode and the mode-flag accordingly.
e.g.
  bash:~# ip -details link show ipvl0
  4: ipvl0@eth0:  mtu 1500 qdisc ...
 link/ether 00:1a:11:44:a5:3e brd ff:ff:ff:ff:ff:ff promiscuity 0
 ipvlan  mode l2 private addrgenmode eui64 numtxqueues 1 ...

Signed-off-by: Mahesh Bandewar 
---
 include/uapi/linux/if_link.h |  4 
 ip/iplink_ipvlan.c   | 35 ++-
 2 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index dafe0a6e0421..35bc598566e0 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -463,6 +463,7 @@ enum macsec_validation_type {
 enum {
IFLA_IPVLAN_UNSPEC,
IFLA_IPVLAN_MODE,
+   IFLA_IPVLAN_FLAGS,
__IFLA_IPVLAN_MAX
 };
 
@@ -475,6 +476,9 @@ enum ipvlan_mode {
IPVLAN_MODE_MAX
 };
 
+#define IPVLAN_F_PRIVATE   0x01
+#define IPVLAN_F_VEPA  0x02
+
 /* VXLAN section */
 enum {
IFLA_VXLAN_UNSPEC,
diff --git a/ip/iplink_ipvlan.c b/ip/iplink_ipvlan.c
index 9f48309ee030..8889808508fe 100644
--- a/ip/iplink_ipvlan.c
+++ b/ip/iplink_ipvlan.c
@@ -20,12 +20,21 @@
 
 static void ipvlan_explain(FILE *f)
 {
-   fprintf(f, "Usage: ... ipvlan [ mode { l2 | l3  | l3s } ]\n");
+   fprintf(f,
+   "Usage: ... ipvlan [ mode MODE ] [ FLAGS ]\n"
+   "\n"
+   "MODE: l3 | l3s | l2\n"
+   "FLAGS: bridge | private | vepa\n"
+   "(first values are the defaults if nothing is specified).\n"
+   );
 }
 
 static int ipvlan_parse_opt(struct link_util *lu, int argc, char **argv,
struct nlmsghdr *n)
 {
+   __u16 flags = 0;
+   bool mflag_given = false;
+
while (argc > 0) {
if (matches(*argv, "mode") == 0) {
__u16 mode = 0;
@@ -43,6 +52,14 @@ static int ipvlan_parse_opt(struct link_util *lu, int argc, 
char **argv,
return -1;
}
addattr16(n, 1024, IFLA_IPVLAN_MODE, mode);
+   } else if (matches(*argv, "private") == 0 && !mflag_given) {
+   flags |= IPVLAN_F_PRIVATE;
+   mflag_given = true;
+   } else if (matches(*argv, "vepa") == 0 && !mflag_given) {
+   flags |= IPVLAN_F_VEPA;
+   mflag_given = true;
+   } else if (matches(*argv, "bridge") == 0 && !mflag_given) {
+   mflag_given = true;
} else if (matches(*argv, "help") == 0) {
ipvlan_explain(stderr);
return -1;
@@ -55,6 +72,7 @@ static int ipvlan_parse_opt(struct link_util *lu, int argc, 
char **argv,
argc--;
argv++;
}
+   addattr16(n, 1024, IFLA_IPVLAN_FLAGS, flags);
 
return 0;
 }
@@ -75,6 +93,21 @@ static void ipvlan_print_opt(struct link_util *lu, FILE *f, 
struct rtattr *tb[])
print_string(PRINT_ANY, "mode", " mode %s ", mode_str);
}
}
+   if (tb[IFLA_IPVLAN_FLAGS]) {
+   if (RTA_PAYLOAD(tb[IFLA_IPVLAN_FLAGS]) == sizeof(__u16)) {
+   __u16 flags = rta_getattr_u16(tb[IFLA_IPVLAN_FLAGS]);
+
+   if (flags & IPVLAN_F_PRIVATE)
+   print_bool(PRINT_ANY, "private", "private ",
+  true);
+   else if (flags & IPVLAN_F_VEPA)
+   print_bool(PRINT_ANY, "vepa", "vepa ",
+  true);
+   else
+   print_bool(PRINT_ANY, "bridge", "bridge ",
+  true);
+   }
+   }
 }
 
 static void ipvlan_print_help(struct link_util *lu, int argc, char **argv,
-- 
2.15.0.rc2.357.g7e34df9404-goog



[PATCH next 0/2] add 'private' and 'vepa' attributes to ipvlan modes

2017-10-26 Thread Mahesh Bandewar
From: Mahesh Bandewar 

IPvlan has always been operating in bridge-mode for its supported modes i.e.
if the packets are destined to the adjacent neighbor dev, then IPvlan driver
will switch the packet internally without needing the packets to hit the
wire or get routed. However, there are situations where this bridge-mode is
not needed. e.g. two private processes running inside two namespaces which
are having one IPvlan slave each for its namespace but sharing the master. These
processes should reach the outside world through the master device but at
the same time the bridge function should not work. Currently that's not
possible hence the private attribute for the selected mode comes in play.

VEPA or 802.1Qbg on the other hand has limited appeal with IPvlan since IPvlan
uses the mac-address of the lower device. So packets that are destined to 
the adjacent neighbor slave-dev will have same src and dest mac. When these
packets reach the external switch/router, they will send you the redirect
message which the host will have to deal with. Having said that this attribute 
will have appeal in debugging as IPvlan will not switch / short-circuit 
packets internally. e.g. using VEPA mode with lower-device in loopback mode
will avoid some complicated set-ups that use non-local-bind with some route
jugglery.

This patch-set implements these attributes for the existing modes that
IPvlan has. Please see individual patches for their detailed implementation.
A subsequent ip-utils patch is needed and will be sent soon.

Mahesh Bandewar (2):
  ipvlan: introduce 'private' attribute for all existing modes.
  ipvlan: implement VEPA mode

 Documentation/networking/ipvlan.txt | 42 +
 drivers/net/ipvlan/ipvlan.h | 31 
 drivers/net/ipvlan/ipvlan_core.c| 24 ++-
 drivers/net/ipvlan/ipvlan_main.c| 47 +++--
 include/uapi/linux/if_link.h|  4 
 5 files changed, 136 insertions(+), 12 deletions(-)

-- 
2.15.0.rc2.357.g7e34df9404-goog



[PATCH next 2/2] ipvlan: implement VEPA mode

2017-10-26 Thread Mahesh Bandewar
From: Mahesh Bandewar 

This is very similar to the Macvlan VEPA mode, however, there is some
difference. IPvlan uses the mac-address of the lower device, so the VEPA
mode has implications of ICMP-redirects for packets destined for its
immediate neighbors sharing same master since the packets will have same
source and dest mac. The external switch/router will send redirect msg.

Having said that, this will be useful tool in terms of debugging
since IPvlan will not switch packets within its slaves and rely completely
on the external entity as intended in 802.1Qbg.

Signed-off-by: Mahesh Bandewar 
---
 Documentation/networking/ipvlan.txt | 12 +++-
 drivers/net/ipvlan/ipvlan.h | 15 +++
 drivers/net/ipvlan/ipvlan_core.c| 17 ++---
 drivers/net/ipvlan/ipvlan_main.c| 13 +++--
 include/uapi/linux/if_link.h|  1 +
 5 files changed, 48 insertions(+), 10 deletions(-)

diff --git a/Documentation/networking/ipvlan.txt 
b/Documentation/networking/ipvlan.txt
index bfa91c77a4c9..812ef003e0a8 100644
--- a/Documentation/networking/ipvlan.txt
+++ b/Documentation/networking/ipvlan.txt
@@ -25,7 +25,7 @@ using IProute2/ip utility.
 ip link add link  name  type ipvlan [ mode MODE ] [ FLAGS ]
where
  MODE: l3 (default) | l3s | l2
- FLAGS: bridge (default) | private
+ FLAGS: bridge (default) | private | vepa
 
 e.g.
 (a) Following will create IPvlan link with eth0 as master in
@@ -35,6 +35,8 @@ using IProute2/ip utility.
   bash# ip link add link eth0 name ipvl0 type ipvlan mode l2 bridge
 (c) This command will create an IPvlan device in L2 private mode.
   bash# ip link add link eth0 name ipvlan type ipvlan mode l2 private
+(d) This command will create an IPvlan device in L2 vepa mode.
+  bash# ip link add link eth0 name ipvlan type ipvlan mode l2 vepa
 
 
 4. Operating modes:
@@ -77,6 +79,14 @@ themseleves apart from talking through the master device.
If this option is added to the command-line, the port is set in private
 mode. i.e. port wont allow cross communication between slaves.
 
+5.3 vepa:
+   If this is added to the command-line, the port is set in VEPA mode.
+i.e. port will offload switching functionality to the external entity as
+described in 802.1Qbg
+Note: VEPA mode in IPvlan has limitations. IPvlan uses the mac-address of the
+master-device, so the packets which are emitted in this mode for the adjacent
+neighbor will have source and destination mac same. This will make the switch /
+router send the redirect message.
 
 6. What to choose (macvlan vs. ipvlan)?
These two devices are very similar in many regards and the specific use
diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h
index 9941851bcc13..5166575a164d 100644
--- a/drivers/net/ipvlan/ipvlan.h
+++ b/drivers/net/ipvlan/ipvlan.h
@@ -139,6 +139,21 @@ static inline void ipvlan_clear_private(struct ipvl_port 
*port)
port->flags &= ~IPVLAN_F_PRIVATE;
 }
 
+static inline bool ipvlan_is_vepa(const struct ipvl_port *port)
+{
+   return !!(port->flags & IPVLAN_F_VEPA);
+}
+
+static inline void ipvlan_mark_vepa(struct ipvl_port *port)
+{
+   port->flags |= IPVLAN_F_VEPA;
+}
+
+static inline void ipvlan_clear_vepa(struct ipvl_port *port)
+{
+   port->flags &= ~IPVLAN_F_VEPA;
+}
+
 void ipvlan_init_secret(void);
 unsigned int ipvlan_mac_hash(const unsigned char *addr);
 rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb);
diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c
index 72fd56de9c00..034ae4c57196 100644
--- a/drivers/net/ipvlan/ipvlan_core.c
+++ b/drivers/net/ipvlan/ipvlan_core.c
@@ -514,13 +514,15 @@ static int ipvlan_xmit_mode_l3(struct sk_buff *skb, 
struct net_device *dev)
if (!lyr3h)
goto out;
 
-   addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true);
-   if (addr) {
-   if (ipvlan_is_private(ipvlan->port)) {
-   consume_skb(skb);
-   return NET_XMIT_DROP;
+   if (!ipvlan_is_vepa(ipvlan->port)) {
+   addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true);
+   if (addr) {
+   if (ipvlan_is_private(ipvlan->port)) {
+   consume_skb(skb);
+   return NET_XMIT_DROP;
+   }
+   return ipvlan_rcv_frame(addr, &skb, true);
}
-   return ipvlan_rcv_frame(addr, &skb, true);
}
 out:
ipvlan_skb_crossing_ns(skb, ipvlan->phy_dev);
@@ -535,7 +537,8 @@ static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct 
net_device *dev)
void *lyr3h;
int addr_type;
 
-   if (ether_addr_equal(eth->h_dest, eth->h_source)) {
+   if (!ipvlan_is_vepa(ipvlan->port)

[PATCH next 1/2] ipvlan: introduce 'private' attribute for all existing modes.

2017-10-26 Thread Mahesh Bandewar
From: Mahesh Bandewar 

IPvlan has always operated in bridge mode. However there are scenarios
where each slave should be able to talk through the master device but
not necessarily across each other. Think of an environment where each
namespace is a private and independent customer. In this scenario
the machine which is hosting these namespaces neither wants to tell who
their neighbor is nor do the individual namespaces care to talk to a
neighbor on a short-circuited network path.

This patch implements the mode that is very similar to the 'private' mode
in macvlan where individual slaves can send and receive traffic through
the master device, just that they can not talk among slave devices.

Signed-off-by: Mahesh Bandewar 
---
 Documentation/networking/ipvlan.txt | 30 ++---
 drivers/net/ipvlan/ipvlan.h | 16 
 drivers/net/ipvlan/ipvlan_core.c| 15 ---
 drivers/net/ipvlan/ipvlan_main.c| 38 +++--
 include/uapi/linux/if_link.h|  3 +++
 5 files changed, 94 insertions(+), 8 deletions(-)

diff --git a/Documentation/networking/ipvlan.txt 
b/Documentation/networking/ipvlan.txt
index 1fe42a874aae..bfa91c77a4c9 100644
--- a/Documentation/networking/ipvlan.txt
+++ b/Documentation/networking/ipvlan.txt
@@ -22,9 +22,19 @@ The driver can be built into the kernel (CONFIG_IPVLAN=y) or 
as a module
There are no module parameters for this driver and it can be configured
 using IProute2/ip utility.
 
-   ip link add link  name  type ipvlan mode { l2 | 
l3 | l3s }
+ip link add link  name  type ipvlan [ mode MODE ] [ FLAGS ]
+   where
+ MODE: l3 (default) | l3s | l2
+ FLAGS: bridge (default) | private
 
-   e.g. ip link add link eth0 name ipvl0 type ipvlan mode l2
+e.g.
+(a) Following will create IPvlan link with eth0 as master in
+L3 bridge mode
+  bash# ip link add link eth0 name ipvl0 type ipvlan
+(b) This command will create IPvlan link in L2 bridge mode.
+  bash# ip link add link eth0 name ipvl0 type ipvlan mode l2 bridge
+(c) This command will create an IPvlan device in L2 private mode.
+  bash# ip link add link eth0 name ipvlan type ipvlan mode l2 private
 
 
 4. Operating modes:
@@ -54,7 +64,21 @@ works in this mode and hence it is L3-symmetric (L3s). This 
will have slightly l
 performance but that shouldn't matter since you are choosing this mode over 
plain-L3
 mode to make conn-tracking work.
 
-5. What to choose (macvlan vs. ipvlan)?
+5. Mode flags:
+   At this time following mode flags are available
+
+5.1 bridge:
+   This is the default option. To configure the IPvlan port in this mode,
+user can choose to either add this option on the command-line or don't specify
+anything. This is the traditional mode where slaves can cross-talk among
+themselves apart from talking through the master device.
+
+5.2 private:
+   If this option is added to the command-line, the port is set in private
+mode. i.e. port won't allow cross communication between slaves.
+
+
+6. What to choose (macvlan vs. ipvlan)?
These two devices are very similar in many regards and the specific use
 case could very well define which device to choose. if one of the following
 situations defines your use case then you can choose to use ipvlan -
diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h
index ba8173a0b62e..9941851bcc13 100644
--- a/drivers/net/ipvlan/ipvlan.h
+++ b/drivers/net/ipvlan/ipvlan.h
@@ -96,6 +96,7 @@ struct ipvl_port {
struct hlist_head   hlhead[IPVLAN_HASH_SIZE];
struct list_headipvlans;
u16 mode;
+   u16 flags;
u16 dev_id_start;
struct work_struct  wq;
struct sk_buff_head backlog;
@@ -123,6 +124,21 @@ static inline struct ipvl_port *ipvlan_port_get_rtnl(const 
struct net_device *d)
return rtnl_dereference(d->rx_handler_data);
 }
 
+static inline bool ipvlan_is_private(const struct ipvl_port *port)
+{
+   return !!(port->flags & IPVLAN_F_PRIVATE);
+}
+
+static inline void ipvlan_mark_private(struct ipvl_port *port)
+{
+   port->flags |= IPVLAN_F_PRIVATE;
+}
+
+static inline void ipvlan_clear_private(struct ipvl_port *port)
+{
+   port->flags &= ~IPVLAN_F_PRIVATE;
+}
+
 void ipvlan_init_secret(void);
 unsigned int ipvlan_mac_hash(const unsigned char *addr);
 rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb);
diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c
index 1f3295e274d0..72fd56de9c00 100644
--- a/drivers/net/ipvlan/ipvlan_core.c
+++ b/drivers/net/ipvlan/ipvlan_core.c
@@ -515,9 +515,13 @@ static int ipvlan_xmit_mode_l3(struct sk_buff *skb, struct 
net_device *dev)
goto out;
 
addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type

[PATCH next] ipvlan: always use the current L2 addr of the master

2017-10-11 Thread Mahesh Bandewar
From: Mahesh Bandewar 

If the underlying master ever changes its L2 (e.g. bonding device),
then make sure that the IPvlan slaves always emit packets with the
current L2 of the master instead of the stale mac addr which was
copied during the device creation. The problem can be seen with
following script -

  #!/bin/bash
  # Create a vEth pair
  ip link add dev veth0 type veth peer name veth1
  ip link set veth0 up
  ip link set veth1 up
  ip link show veth0
  ip link show veth1
  # Create an IPvlan device on one end of this vEth pair.
  ip link add link veth0 dev ipvl0 type ipvlan mode l2
  ip link show ipvl0
  # Change the mac-address of the vEth master.
  ip link set veth0 address 02:11:22:33:44:55

Fixes: 2ad7bf363841 ("ipvlan: Initial check-in of the IPVLAN driver.")
Signed-off-by: Mahesh Bandewar 
---
 drivers/net/ipvlan/ipvlan_main.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index c74893c1e620..5832091680f4 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -407,7 +407,7 @@ static int ipvlan_hard_header(struct sk_buff *skb, struct 
net_device *dev,
 * while the packets use the mac-addr on the physical device.
 */
return dev_hard_header(skb, phy_dev, type, daddr,
-  saddr ? : dev->dev_addr, len);
+  saddr ? : phy_dev->dev_addr, len);
 }
 
 static const struct header_ops ipvlan_header_ops = {
@@ -730,6 +730,11 @@ static int ipvlan_device_event(struct notifier_block 
*unused,
ipvlan_adjust_mtu(ipvlan, dev);
break;
 
+   case NETDEV_CHANGEADDR:
+   list_for_each_entry(ipvlan, &port->ipvlans, pnode)
+   ether_addr_copy(ipvlan->dev->dev_addr, dev->dev_addr);
+   break;
+
case NETDEV_PRE_TYPE_CHANGE:
/* Forbid underlying device to change its type. */
return NOTIFY_BAD;
-- 
2.15.0.rc0.271.g36b669edcc-goog



[PATCH 1/2] capability: introduce sysctl for controlled user-ns capability whitelist

2017-09-29 Thread Mahesh Bandewar
From: Mahesh Bandewar 

Add a sysctl variable kernel.controlled_userns_caps_whitelist. This
takes input as capability mask expressed as two comma separated hex
u32 words. The mask, however, is stored in kernel as kernel_cap_t type.

Any capabilities that are not part of this mask will be controlled and
will not be allowed to processes in controlled user-ns.

Signed-off-by: Mahesh Bandewar 
---
 Documentation/sysctl/kernel.txt | 21 ++
 include/linux/capability.h  |  3 +++
 kernel/capability.c | 47 +
 kernel/sysctl.c |  5 +
 4 files changed, 76 insertions(+)

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 694968c7523c..a1d39dbae847 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -25,6 +25,7 @@ show up in /proc/sys/kernel:
 - bootloader_version[ X86 only ]
 - callhome  [ S390 only ]
 - cap_last_cap
+- controlled_userns_caps_whitelist
 - core_pattern
 - core_pipe_limit
 - core_uses_pid
@@ -187,6 +188,26 @@ CAP_LAST_CAP from the kernel.
 
 ==
 
+controlled_userns_caps_whitelist
+
+Capability mask that is whitelisted for "controlled" user namespaces.
+Any capability that is missing from this mask will not be allowed to
+any process that is attached to a controlled-userns. e.g. if CAP_NET_RAW
+is not part of this mask, then processes running inside any controlled
+userns's will not be allowed to perform action that needs CAP_NET_RAW
+capability. However, processes that are attached to a parent user-ns
+hierarchy that is *not* controlled and has CAP_NET_RAW can continue
+performing those actions. User-namespaces are marked "controlled" at
+the time of their creation based on the capabilities of the creator.
+A process that does not have CAP_SYS_ADMIN will create user-namespaces
+that are controlled.
+
+The value is expressed as two comma separated hex words (u32). This
+sysctl is available in init-ns and users with CAP_SYS_ADMIN in init-ns
+are allowed to make changes.
+
+==
+
 core_pattern:
 
 core_pattern is used to specify a core dumpfile pattern name.
diff --git a/include/linux/capability.h b/include/linux/capability.h
index b52e278e4744..6c0b9677c03f 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -13,6 +13,7 @@
 #define _LINUX_CAPABILITY_H
 
 #include 
+#include 
 
 
 #define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3
@@ -247,6 +248,8 @@ extern bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns);
 
 /* audit system wants to get cap info from files as well */
 extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct 
cpu_vfs_cap_data *cpu_caps);
+int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
+void __user *buff, size_t *lenp, loff_t *ppos);
 
 extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t 
size);
 
diff --git a/kernel/capability.c b/kernel/capability.c
index f97fe77ceb88..62dbe3350c1b 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -28,6 +28,8 @@ EXPORT_SYMBOL(__cap_empty_set);
 
 int file_caps_enabled = 1;
 
+kernel_cap_t controlled_userns_caps_whitelist = CAP_FULL_SET;
+
 static int __init file_caps_disable(char *str)
 {
file_caps_enabled = 0;
@@ -506,3 +508,48 @@ bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns)
rcu_read_unlock();
return (ret == 0);
 }
+
+/* Controlled-userns capabilities routines */
+#ifdef CONFIG_SYSCTL
+int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
+void __user *buff, size_t *lenp, loff_t *ppos)
+{
+   DECLARE_BITMAP(caps_bitmap, CAP_LAST_CAP);
+   struct ctl_table caps_table;
+   char tbuf[NAME_MAX];
+   int ret;
+
+   ret = bitmap_from_u32array(caps_bitmap, CAP_LAST_CAP,
+  controlled_userns_caps_whitelist.cap,
+  _KERNEL_CAPABILITY_U32S);
+   if (ret != CAP_LAST_CAP)
+   return -1;
+
+   scnprintf(tbuf, NAME_MAX, "%*pb", CAP_LAST_CAP, caps_bitmap);
+
+   caps_table.data = tbuf;
+   caps_table.maxlen = NAME_MAX;
+   caps_table.mode = table->mode;
+   ret = proc_dostring(&caps_table, write, buff, lenp, ppos);
+   if (ret)
+   return ret;
+   if (write) {
+   kernel_cap_t tmp;
+
+   if (!capable(CAP_SYS_ADMIN))
+   return -EPERM;
+
+   ret = bitmap_parse_user(buff, *lenp, caps_bitmap, CAP_LAST_CAP);
+   if (ret)
+   return ret;
+
+   ret = bitmap_to_u32array(tmp.cap, _KERNEL_CAPABILITY_U32S,
+   

[PATCH 0/2] capability controlled user-namespaces

2017-09-29 Thread Mahesh Bandewar
From: Mahesh Bandewar 

[Same as the previous RFC series sent on 9/21]

TL;DR version
-
Creating a sandbox environment with namespaces is challenging
considering what these sandboxed processes can engage in, e.g.
CVE-2017-6074, CVE-2017-7184, CVE-2017-7308 etc., just to name a few.
The current form of user-namespaces, however, if changed a bit, can allow
us to create a sandbox environment without locking down user-
namespaces.

Detailed version


Problem
---
User-namespaces in the current form have increased the attack surface as
any process can acquire capabilities which are not available to them (by
default) by performing combination of clone()/unshare()/setns() syscalls.

#define _GNU_SOURCE
#include <stdio.h>
#include <sched.h>
#include <netinet/in.h>

int main(int ac, char **av)
{
int sock = -1;

printf("Attempting to open RAW socket before unshare()...\n");
sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW);
if (sock < 0) {
perror("socket() SOCK_RAW failed: ");
} else {
printf("Successfully opened RAW-Sock before unshare().\n");
close(sock);
sock = -1;
}

if (unshare(CLONE_NEWUSER | CLONE_NEWNET) < 0) {
perror("unshare() failed: ");
return 1;
}

printf("Attempting to open RAW socket after unshare()...\n");
sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW);
if (sock < 0) {
perror("socket() SOCK_RAW failed: ");
} else {
printf("Successfully opened RAW-Sock after unshare().\n");
close(sock);
sock = -1;
}

return 0;
}

The above example shows how easy it is to acquire NET_RAW capabilities
and once acquired, these processes could take advantage of the above-mentioned
or similar issues (discovered or undiscovered) with malicious intent. Note
that this is just an example and the problem/solution is not limited
to NET_RAW capability *only*.

The easiest fix one can apply here is to lock-down user-namespaces which
many of the distros do (i.e. don't allow users to create user namespaces),
but unfortunately that prevents everyone from using them.

Approach

Introduce a notion of 'controlled' user-namespaces. Every process on
the host is allowed to create user-namespaces (governed by the limit
imposed by per-ns sysctl) however, mark user-namespaces created by
sandboxed processes as 'controlled'. Use this 'mark' at the time of
capability check in conjunction with a global capability whitelist.
If the capability is not whitelisted, processes that belong to 
controlled user-namespaces will not be allowed.

Once a user-ns is marked as 'controlled'; all its child user-
namespaces are marked as 'controlled' too.

A global whitelist is a list of capabilities governed by the
sysctl which is available to (privileged) user in init-ns to modify
while it's applicable to all controlled user-namespaces on the host.

Marking user-namespaces controlled without modifying the whitelist is
equivalent of the current behavior. The default value of whitelist includes
all capabilities so that the compatibility is maintained. However it gives
admins fine-grained ability to control various capabilities system wide
without locking down user-namespaces.

Please see individual patches in this series.

Mahesh Bandewar (2):
  capability: introduce sysctl for controlled user-ns capability
whitelist
  userns: control capabilities of some user namespaces

 Documentation/sysctl/kernel.txt | 21 +
 include/linux/capability.h  |  4 
 include/linux/user_namespace.h  | 20 
 kernel/capability.c | 52 +
 kernel/sysctl.c |  5 
 kernel/user_namespace.c |  3 +++
 security/commoncap.c|  8 +++
 7 files changed, 113 insertions(+)

-- 
2.14.2.822.g60be5d43e6-goog



[PATCH 2/2] userns: control capabilities of some user namespaces

2017-09-29 Thread Mahesh Bandewar
From: Mahesh Bandewar 

With this new notion of "controlled" user-namespaces, the controlled
user-namespaces are marked at the time of their creation while the
capabilities of processes that belong to them are controlled using the
global mask.

Init-user-ns is always uncontrolled and a process that has SYS_ADMIN
that belongs to uncontrolled user-ns can create another (child) user-
namespace that is uncontrolled. Any other process (that either does
not have SYS_ADMIN or belongs to a controlled user-ns) can only
create a user-ns that is controlled.

global-capability-whitelist (controlled_userns_caps_whitelist) is used
at the capability check-time and keeps the semantics for the processes
that belong to uncontrolled user-ns as it is. Processes that belong to
controlled user-ns however are subjected to different checks-

   (a) if the capability in question is controlled and process belongs
   to controlled user-ns, then it's always denied.
   (b) if the capability in question is NOT controlled then fall back
   to the traditional check.

Signed-off-by: Mahesh Bandewar 
---
 include/linux/capability.h |  1 +
 include/linux/user_namespace.h | 20 
 kernel/capability.c|  5 +
 kernel/user_namespace.c|  3 +++
 security/commoncap.c   |  8 
 5 files changed, 37 insertions(+)

diff --git a/include/linux/capability.h b/include/linux/capability.h
index 6c0b9677c03f..b8c6cac18658 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -250,6 +250,7 @@ extern bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns);
 extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct 
cpu_vfs_cap_data *cpu_caps);
 int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
 void __user *buff, size_t *lenp, loff_t *ppos);
+bool is_capability_controlled(int cap);
 
 extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t 
size);
 
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index c18e01252346..e890fe81b47e 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -22,6 +22,7 @@ struct uid_gid_map {  /* 64 bytes -- 1 cache line */
 };
 
 #define USERNS_SETGROUPS_ALLOWED 1UL
+#define USERNS_CONTROLLED   2UL
 
 #define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED
 
@@ -102,6 +103,16 @@ static inline void put_user_ns(struct user_namespace *ns)
__put_user_ns(ns);
 }
 
+static inline bool is_user_ns_controlled(const struct user_namespace *ns)
+{
+   return ns->flags & USERNS_CONTROLLED;
+}
+
+static inline void mark_user_ns_controlled(struct user_namespace *ns)
+{
+   ns->flags |= USERNS_CONTROLLED;
+}
+
 struct seq_operations;
 extern const struct seq_operations proc_uid_seq_operations;
 extern const struct seq_operations proc_gid_seq_operations;
@@ -160,6 +171,15 @@ static inline struct ns_common *ns_get_owner(struct 
ns_common *ns)
 {
return ERR_PTR(-EPERM);
 }
+
+static inline bool is_user_ns_controlled(const struct user_namespace *ns)
+{
+   return false;
+}
+
+static inline void mark_user_ns_controlled(struct user_namespace *ns)
+{
+}
 #endif
 
 #endif /* _LINUX_USER_H */
diff --git a/kernel/capability.c b/kernel/capability.c
index 62dbe3350c1b..40a38cc4ff43 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -510,6 +510,11 @@ bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns)
 }
 
 /* Controlled-userns capabilities routines */
+bool is_capability_controlled(int cap)
+{
+   return !cap_raised(controlled_userns_caps_whitelist, cap);
+}
+
 #ifdef CONFIG_SYSCTL
 int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
 void __user *buff, size_t *lenp, loff_t *ppos)
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index c490f1e4313b..f393ea5108f0 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -53,6 +53,9 @@ static void set_cred_user_ns(struct cred *cred, struct 
user_namespace *user_ns)
cred->cap_effective = CAP_FULL_SET;
cred->cap_ambient = CAP_EMPTY_SET;
cred->cap_bset = CAP_FULL_SET;
+   if (!ns_capable(user_ns->parent, CAP_SYS_ADMIN) ||
+   is_user_ns_controlled(user_ns->parent))
+   mark_user_ns_controlled(user_ns);
 #ifdef CONFIG_KEYS
key_put(cred->request_key_auth);
cred->request_key_auth = NULL;
diff --git a/security/commoncap.c b/security/commoncap.c
index 6bf72b175b49..26f41602da10 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -73,6 +73,14 @@ int cap_capable(const struct cred *cred, struct 
user_namespace *targ_ns,
 {
struct user_namespace *ns = targ_ns;
 
+   /* If the capability is controlled and user-ns that process
+* belongs-to is 'controlled' then return EPE

[PATCH next] bonding: speed/duplex update at NETDEV_UP event

2017-09-27 Thread Mahesh Bandewar
From: Mahesh Bandewar 

Some NIC drivers don't have correct speed/duplex settings at the
time they send NETDEV_UP notification and that messes up the
bonding state. Especially 802.3ad mode which is very sensitive
to these settings. In the current implementation we invoke
bond_update_speed_duplex() when we receive NETDEV_UP, however,
ignore the return value. If the values we get are invalid
(UNKNOWN), then slave gets removed from the aggregator with
speed and duplex set to UNKNOWN while link is still marked as UP.

This patch fixes this scenario. Also 802.3ad mode is sensitive to
these conditions while other modes are not, so making sure that it
doesn't change the behavior for other modes.

Signed-off-by: Mahesh Bandewar 
---
 drivers/net/bonding/bond_main.c | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index b7313c1d9dcd..177be373966b 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3076,7 +3076,16 @@ static int bond_slave_netdev_event(unsigned long event,
break;
case NETDEV_UP:
case NETDEV_CHANGE:
-   bond_update_speed_duplex(slave);
+   /* For 802.3ad mode only:
+* Getting invalid Speed/Duplex values here will put slave
+* in weird state. So mark it as link-down for the time
+* being and let link-monitoring (miimon) set it right when
+* correct speeds/duplex are available.
+*/
+   if (bond_update_speed_duplex(slave) &&
+   BOND_MODE(bond) == BOND_MODE_8023AD)
+   slave->link = BOND_LINK_DOWN;
+
if (BOND_MODE(bond) == BOND_MODE_8023AD)
bond_3ad_adapter_speed_duplex_changed(slave);
/* Fallthrough */
-- 
2.14.2.822.g60be5d43e6-goog



[RFC PATCH 0/2] capability controlled user-namespaces

2017-09-21 Thread Mahesh Bandewar
From: Mahesh Bandewar 

TL;DR version
-
Creating a sandbox environment with namespaces is challenging
considering what these sandboxed processes can engage in, e.g.
CVE-2017-6074, CVE-2017-7184, CVE-2017-7308 etc., just to name a few.
The current form of user-namespaces, however, if changed a bit, can allow
us to create a sandbox environment without locking down user-
namespaces.

Detailed version


Problem
---
User-namespaces in the current form have increased the attack surface as
any process can acquire capabilities which are not available to them (by
default) by performing combination of clone()/unshare()/setns() syscalls.

#define _GNU_SOURCE
#include <stdio.h>
#include <sched.h>
#include <netinet/in.h>

int main(int ac, char **av)
{
int sock = -1;

printf("Attempting to open RAW socket before unshare()...\n");
sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW);
if (sock < 0) {
perror("socket() SOCK_RAW failed: ");
} else {
printf("Successfully opened RAW-Sock before unshare().\n");
close(sock);
sock = -1;
}

if (unshare(CLONE_NEWUSER | CLONE_NEWNET) < 0) {
perror("unshare() failed: ");
return 1;
}

printf("Attempting to open RAW socket after unshare()...\n");
sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW);
if (sock < 0) {
perror("socket() SOCK_RAW failed: ");
} else {
printf("Successfully opened RAW-Sock after unshare().\n");
close(sock);
sock = -1;
}

return 0;
}

The above example shows how easy it is to acquire NET_RAW capabilities
and once acquired, these processes could take advantage of the above-mentioned
or similar issues (discovered or undiscovered) with malicious intent. Note
that this is just an example and the problem/solution is not limited
to NET_RAW capability *only*.

The easiest fix one can apply here is to lock-down user-namespaces which
many of the distros do (i.e. don't allow users to create user namespaces),
but unfortunately that prevents everyone from using them.

Approach

Introduce a notion of 'controlled' user-namespaces. Every process on
the host is allowed to create user-namespaces (governed by the limit
imposed by per-ns sysctl) however, mark user-namespaces created by
sandboxed processes as 'controlled'. Use this 'mark' at the time of
capability check in conjunction with a global capability whitelist.
If the capability is not whitelisted, processes that belong to 
controlled user-namespaces will not be allowed.

Once a user-ns is marked as 'controlled'; all its child user-
namespaces are marked as 'controlled' too.

A global whitelist is a list of capabilities governed by the
sysctl which is available to (privileged) user in init-ns to modify
while it's applicable to all controlled user-namespaces on the host.

Marking user-namespaces controlled without modifying the whitelist is
equivalent of the current behavior. The default value of whitelist includes
all capabilities so that the compatibility is maintained. However it gives
admins fine-grained ability to control various capabilities system wide
without locking down user-namespaces.

Please see individual patches in this series.

Mahesh Bandewar (2):
  capability: introduce sysctl for controlled user-ns capability
whitelist
  userns: control capabilities of some user namespaces

 Documentation/sysctl/kernel.txt | 21 +
 include/linux/capability.h  |  4 
 include/linux/user_namespace.h  | 20 
 kernel/capability.c | 52 +
 kernel/sysctl.c |  5 
 kernel/user_namespace.c |  3 +++
 security/commoncap.c|  8 +++
 7 files changed, 113 insertions(+)

-- 
2.14.1.821.g8fa685d3b7-goog



[RFC PATCH 2/2] userns: control capabilities of some user namespaces

2017-09-21 Thread Mahesh Bandewar
From: Mahesh Bandewar 

With this new notion of "controlled" user-namespaces, the controlled
user-namespaces are marked at the time of their creation while the
capabilities of processes that belong to them are controlled using the
global mask.

Init-user-ns is always uncontrolled and a process that has SYS_ADMIN
that belongs to uncontrolled user-ns can create another (child) user-
namespace that is uncontrolled. Any other process (that either does
not have SYS_ADMIN or belongs to a controlled user-ns) can only
create a user-ns that is controlled.

global-capability-whitelist (controlled_userns_caps_whitelist) is used
at the capability check-time and keeps the semantics for the processes
that belong to uncontrolled user-ns as it is. Processes that belong to
controlled user-ns however are subjected to different checks-

   (a) if the capability in question is controlled and process belongs
   to controlled user-ns, then it's always denied.
   (b) if the capability in question is NOT controlled then fall back
   to the traditional check.

Signed-off-by: Mahesh Bandewar 
---
 include/linux/capability.h |  1 +
 include/linux/user_namespace.h | 20 
 kernel/capability.c|  5 +
 kernel/user_namespace.c|  3 +++
 security/commoncap.c   |  8 
 5 files changed, 37 insertions(+)

diff --git a/include/linux/capability.h b/include/linux/capability.h
index 6c0b9677c03f..b8c6cac18658 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -250,6 +250,7 @@ extern bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns);
 extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct 
cpu_vfs_cap_data *cpu_caps);
 int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
 void __user *buff, size_t *lenp, loff_t *ppos);
+bool is_capability_controlled(int cap);
 
 extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t 
size);
 
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index c18e01252346..e890fe81b47e 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -22,6 +22,7 @@ struct uid_gid_map {  /* 64 bytes -- 1 cache line */
 };
 
 #define USERNS_SETGROUPS_ALLOWED 1UL
+#define USERNS_CONTROLLED   2UL
 
 #define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED
 
@@ -102,6 +103,16 @@ static inline void put_user_ns(struct user_namespace *ns)
__put_user_ns(ns);
 }
 
+static inline bool is_user_ns_controlled(const struct user_namespace *ns)
+{
+   return ns->flags & USERNS_CONTROLLED;
+}
+
+static inline void mark_user_ns_controlled(struct user_namespace *ns)
+{
+   ns->flags |= USERNS_CONTROLLED;
+}
+
 struct seq_operations;
 extern const struct seq_operations proc_uid_seq_operations;
 extern const struct seq_operations proc_gid_seq_operations;
@@ -160,6 +171,15 @@ static inline struct ns_common *ns_get_owner(struct 
ns_common *ns)
 {
return ERR_PTR(-EPERM);
 }
+
+static inline bool is_user_ns_controlled(const struct user_namespace *ns)
+{
+   return false;
+}
+
+static inline void mark_user_ns_controlled(struct user_namespace *ns)
+{
+}
 #endif
 
 #endif /* _LINUX_USER_H */
diff --git a/kernel/capability.c b/kernel/capability.c
index 62dbe3350c1b..40a38cc4ff43 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -510,6 +510,11 @@ bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns)
 }
 
 /* Controlled-userns capabilities routines */
+bool is_capability_controlled(int cap)
+{
+   return !cap_raised(controlled_userns_caps_whitelist, cap);
+}
+
 #ifdef CONFIG_SYSCTL
 int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
 void __user *buff, size_t *lenp, loff_t *ppos)
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index c490f1e4313b..f393ea5108f0 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -53,6 +53,9 @@ static void set_cred_user_ns(struct cred *cred, struct 
user_namespace *user_ns)
cred->cap_effective = CAP_FULL_SET;
cred->cap_ambient = CAP_EMPTY_SET;
cred->cap_bset = CAP_FULL_SET;
+   if (!ns_capable(user_ns->parent, CAP_SYS_ADMIN) ||
+   is_user_ns_controlled(user_ns->parent))
+   mark_user_ns_controlled(user_ns);
 #ifdef CONFIG_KEYS
key_put(cred->request_key_auth);
cred->request_key_auth = NULL;
diff --git a/security/commoncap.c b/security/commoncap.c
index 6bf72b175b49..26f41602da10 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -73,6 +73,14 @@ int cap_capable(const struct cred *cred, struct 
user_namespace *targ_ns,
 {
struct user_namespace *ns = targ_ns;
 
+   /* If the capability is controlled and user-ns that process
+* belongs-to is 'controlled' then return EPE

[RFC PATCH 1/2] capability: introduce sysctl for controlled user-ns capability whitelist

2017-09-21 Thread Mahesh Bandewar
From: Mahesh Bandewar 

Add a sysctl variable kernel.controlled_userns_caps_whitelist. This
takes input as capability mask expressed as two comma separated hex
u32 words. The mask, however, is stored in kernel as kernel_cap_t type.

Any capabilities that are not part of this mask will be controlled and
will not be allowed to processes in controlled user-ns.

Signed-off-by: Mahesh Bandewar 
CC: Serge Hallyn 
CC: Kees Cook 
CC: "Eric W. Biederman" 

---
 Documentation/sysctl/kernel.txt | 21 ++
 include/linux/capability.h  |  3 +++
 kernel/capability.c | 47 +
 kernel/sysctl.c |  5 +
 4 files changed, 76 insertions(+)

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index ce61d1fe08ca..ec0d74476f48 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -25,6 +25,7 @@ show up in /proc/sys/kernel:
 - bootloader_version[ X86 only ]
 - callhome  [ S390 only ]
 - cap_last_cap
+- controlled_userns_caps_whitelist
 - core_pattern
 - core_pipe_limit
 - core_uses_pid
@@ -186,6 +187,26 @@ CAP_LAST_CAP from the kernel.
 
 ==
 
+controlled_userns_caps_whitelist
+
+Capability mask that is whitelisted for "controlled" user namespaces.
+Any capability that is missing from this mask will not be allowed to
+any process that is attached to a controlled-userns. e.g. if CAP_NET_RAW
+is not part of this mask, then processes running inside any controlled
+userns's will not be allowed to perform action that needs CAP_NET_RAW
+capability. However, processes that are attached to a parent user-ns
+hierarchy that is *not* controlled and has CAP_NET_RAW can continue
+performing those actions. User-namespaces are marked "controlled" at
+the time of their creation based on the capabilities of the creator.
+A process that does not have CAP_SYS_ADMIN will create user-namespaces
+that are controlled.
+
+The value is expressed as two comma separated hex words (u32). This
+sysctl is available in init-ns and users with CAP_SYS_ADMIN in init-ns
+are allowed to make changes.
+
+==
+
 core_pattern:
 
 core_pattern is used to specify a core dumpfile pattern name.
diff --git a/include/linux/capability.h b/include/linux/capability.h
index b52e278e4744..6c0b9677c03f 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -13,6 +13,7 @@
 #define _LINUX_CAPABILITY_H
 
 #include 
+#include 
 
 
 #define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3
@@ -247,6 +248,8 @@ extern bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns);
 
 /* audit system wants to get cap info from files as well */
 extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct 
cpu_vfs_cap_data *cpu_caps);
+int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
+void __user *buff, size_t *lenp, loff_t *ppos);
 
 extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t 
size);
 
diff --git a/kernel/capability.c b/kernel/capability.c
index f97fe77ceb88..62dbe3350c1b 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -28,6 +28,8 @@ EXPORT_SYMBOL(__cap_empty_set);
 
 int file_caps_enabled = 1;
 
+kernel_cap_t controlled_userns_caps_whitelist = CAP_FULL_SET;
+
 static int __init file_caps_disable(char *str)
 {
file_caps_enabled = 0;
@@ -506,3 +508,48 @@ bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns)
rcu_read_unlock();
return (ret == 0);
 }
+
+/* Controlled-userns capabilities routines */
+#ifdef CONFIG_SYSCTL
+int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
+void __user *buff, size_t *lenp, loff_t *ppos)
+{
+   DECLARE_BITMAP(caps_bitmap, CAP_LAST_CAP);
+   struct ctl_table caps_table;
+   char tbuf[NAME_MAX];
+   int ret;
+
+   ret = bitmap_from_u32array(caps_bitmap, CAP_LAST_CAP,
+  controlled_userns_caps_whitelist.cap,
+  _KERNEL_CAPABILITY_U32S);
+   if (ret != CAP_LAST_CAP)
+   return -1;
+
+   scnprintf(tbuf, NAME_MAX, "%*pb", CAP_LAST_CAP, caps_bitmap);
+
+   caps_table.data = tbuf;
+   caps_table.maxlen = NAME_MAX;
+   caps_table.mode = table->mode;
+   ret = proc_dostring(&caps_table, write, buff, lenp, ppos);
+   if (ret)
+   return ret;
+   if (write) {
+   kernel_cap_t tmp;
+
+   if (!capable(CAP_SYS_ADMIN))
+   return -EPERM;
+
+   ret = bitmap_parse_user(buff, *lenp, caps_bitmap, CAP_LAST_CAP);
+   if (ret)
+   return ret;
+
+   ret =

[PATCH next] neigh: initialize neigh entry correctly during arp processing

2017-08-16 Thread Mahesh Bandewar
From: Mahesh Bandewar 

If the ARP processing creates a neigh entry, it's immediately marked
as STALE without a timer and stays in that state as long as the
host does not send traffic to that neighbour.

I observed this on hosts which are in IPv6 environment, where there is
very little to no IPv4 traffic and neigh-entries are stuck in STALE
mode. Ideally, the host should have PROBEd these neighbours before it
can send the first packet out.

It happens as a result of following call sequence in an environment
where host is mostly quiet as far as IPv4 traffic but few connected
hosts/gateways are sending ARPs.

   arp_process()
 neigh_event_ns()
   neigh_lookup()
 neigh_create()
   neigh_alloc()
 nud_state=NUD_NONE
 neigh_update(nud_state=NUD_STALE)

In the above scenario, the neighbour entry does not get a chance to get
PROBEd as the subsequent call to neigh_update() marks this entry STALE.
This patch initializes the neigh-entry correctly if it was created as a
result of neigh_lookup instead of just updating it in neigh_event_ns()
right after creating it.

Signed-off-by: Mahesh Bandewar 
---
 net/core/neighbour.c | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 16a1a4c4eb57..d8a35db6c43b 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1300,9 +1300,13 @@ struct neighbour *neigh_event_ns(struct neigh_table *tbl,
 {
struct neighbour *neigh = __neigh_lookup(tbl, saddr, dev,
 lladdr || !dev->addr_len);
-   if (neigh)
-   neigh_update(neigh, lladdr, NUD_STALE,
-NEIGH_UPDATE_F_OVERRIDE, 0);
+   if (neigh) {
+   if (neigh->nud_state & NUD_VALID)
+   neigh_update(neigh, lladdr, NUD_STALE,
+NEIGH_UPDATE_F_OVERRIDE, 0);
+   else
+   neigh_event_send(neigh, NULL);
+   }
return neigh;
 }
 EXPORT_SYMBOL(neigh_event_ns);
-- 
2.14.1.480.gb18f417b89-goog



[PATCH 1/3] ipv4: initialize fib_trie prior to register_netdev_notifier call.

2017-07-19 Thread Mahesh Bandewar
From: Mahesh Bandewar 

Net stack initialization currently initializes fib-trie after the
first call to register_netdevice_notifier(). In fact, fib_trie
initialization needs to happen before the first rtnl_register(). This does
not cause any problem today since there are no devices UP at this moment,
but trying to bring 'lo' UP at initialization makes this assumption wrong
and exposes the issue.

Fixes following crash

 Call Trace:
  ? alternate_node_alloc+0x76/0xa0
  fib_table_insert+0x1b7/0x4b0
  fib_magic.isra.17+0xea/0x120
  fib_add_ifaddr+0x7b/0x190
  fib_netdev_event+0xc0/0x130
  register_netdevice_notifier+0x1c1/0x1d0
  ip_fib_init+0x72/0x85
  ip_rt_init+0x187/0x1e9
  ip_init+0xe/0x1a
  inet_init+0x171/0x26c
  ? ipv4_offload_init+0x66/0x66
  do_one_initcall+0x43/0x160
  kernel_init_freeable+0x191/0x219
  ? rest_init+0x80/0x80
  kernel_init+0xe/0x150
  ret_from_fork+0x22/0x30
 Code: f6 46 23 04 74 86 4c 89 f7 e8 ae 45 01 00 49 89 c7 4d 85 ff 0f 85 7b ff 
ff ff 31 db eb 08 4c 89 ff e8 16 47 01 00 48 8b 44 24 38 <45> 8b 6e 14 4d 63 76 
74 48 89 04 24 0f 1f 44 00 00 48 83 c4 08
 RIP: kmem_cache_alloc+0xcf/0x1c0 RSP: 9b1500017c28
 CR2: 0014

Fixes: 7b1a74fdbb9e ("[NETNS]: Refactor fib initialization so it can handle 
multiple namespaces.")
Fixes: 7f9b80529b8a ("[IPV4]: fib hash|trie initialization")

Signed-off-by: Mahesh Bandewar 
---
 net/ipv4/fib_frontend.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 4e678fa892dd..044d2a159a3c 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1334,13 +1334,14 @@ static struct pernet_operations fib_net_ops = {
 
 void __init ip_fib_init(void)
 {
-   rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL);
-   rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL);
-   rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL);
+   fib_trie_init();
 
register_pernet_subsys(&fib_net_ops);
+
register_netdevice_notifier(&fib_netdev_notifier);
register_inetaddr_notifier(&fib_inetaddr_notifier);
 
-   fib_trie_init();
+   rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL);
+   rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL);
+   rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL);
 }
-- 
2.14.0.rc0.284.gd933b75aa4-goog



[PATCH 1/2] ipv4: initialize fib_trie prior to register_netdev_notifier call.

2017-07-04 Thread Mahesh Bandewar
From: Mahesh Bandewar 

Net stack initialization currently initializes fib-trie after the
first call to register_netdevice_notifier(). This does not cause any
problem since there are no devices UP at this moment, but trying to bring
'lo' UP at initialization would make this assumption wrong. Changing the
order solves the issue.

Fixes following crash

[8.647095] BUG: unable to handle kernel NULL pointer dereference at 
0014
[8.654836] IP: kmem_cache_alloc+0xcf/0x1c0
[8.658966] PGD 0
[8.658967] P4D 0
[8.660953]
[8.664406] Oops:  [#1] SMP
[8.667505] Modules linked in:
[8.670519] CPU: 4 PID: 1 Comm: swapper/0 Not tainted 4.12.0-smp-DEV #34
[8.677138] Hardware name: Intel Grantley,Wellsburg/Ixion_IT_15, BIOS 2.60.0 
06/03/2016
[8.685043] task: 8f26b2b00dc0 task.stack: 9b1500014000
[8.690891] RIP: 0010:kmem_cache_alloc+0xcf/0x1c0
[8.695535] RSP: :9b1500017c28 EFLAGS: 00010246
[8.700696] RAX: 96ed4f47 RBX:  RCX: 0200
[8.707743] RDX: 0400 RSI: 014000c0 RDI: 
[8.714791] RBP: 9b1500017c58 R08: 0001 R09: 
[8.721837] R10:  R11:  R12: 014000c0
[8.728886] R13: 014000c0 R14:  R15: 
[8.735932] FS:  () GS:8f26bf50() 
knlGS:
[8.743925] CS:  0010 DS:  ES:  CR0: 80050033
[8.749599] CR2: 0014 CR3: 001d05609000 CR4: 001406e0
[8.756648] Call Trace:
[8.759061]  ? alternate_node_alloc+0x76/0xa0
[8.763364]  fib_table_insert+0x1b7/0x4b0
[8.767323]  fib_magic.isra.17+0xea/0x120
[8.771283]  fib_add_ifaddr+0x7b/0x190
[8.774983]  fib_netdev_event+0xc0/0x130
[8.778857]  register_netdevice_notifier+0x1c1/0x1d0
[8.783761]  ip_fib_init+0x72/0x85
[8.787120]  ip_rt_init+0x187/0x1e9
[8.790564]  ip_init+0xe/0x1a
[8.793494]  inet_init+0x171/0x26c
[8.796851]  ? ipv4_offload_init+0x66/0x66
[8.800896]  do_one_initcall+0x43/0x160
[8.804685]  kernel_init_freeable+0x191/0x219
[8.808987]  ? rest_init+0x80/0x80
[8.812343]  kernel_init+0xe/0x150
[8.815702]  ret_from_fork+0x22/0x30
[8.819229] Code: f6 46 23 04 74 86 4c 89 f7 e8 ae 45 01 00 49 89 c7 4d 85 
ff 0f 85 7b ff ff ff 31 db eb 08 4c 89 ff e8 16 47 01 00 48 8b 44 24 38 <45> 8b 
6e 14 4d 63 76 74 48 89 04 24 0f 1f 44 00 00 48 83 c4 08
[8.837882] RIP: kmem_cache_alloc+0xcf/0x1c0 RSP: 9b1500017c28
[8.843986] CR2: 0014
[8.847288] ---[ end trace 8fc60994c8a367cb ]---
[8.851847] Kernel panic - not syncing: Fatal exception
[8.857035] Rebooting in 10 seconds..

Signed-off-by: Mahesh Bandewar 
---
 net/ipv4/fib_frontend.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 4e678fa892dd..ca22ca47019a 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1338,9 +1338,9 @@ void __init ip_fib_init(void)
rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL);
rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL);
 
+   fib_trie_init();
+
register_pernet_subsys(&fib_net_ops);
register_netdevice_notifier(&fib_netdev_notifier);
register_inetaddr_notifier(&fib_inetaddr_notifier);
-
-   fib_trie_init();
 }
-- 
2.13.2.725.g09c95d1e9-goog



[PATCH 2/2] loopback: bringup 'lo' by default at initialization

2017-07-04 Thread Mahesh Bandewar
From: Mahesh Bandewar 

Loopback devices are always brought up right after their initialization,
including in the case of network namespace creation, e.g.

ip netns add foo
ip -netns foo link set lo up

This patch will eliminate the need to do that separately and would bring
it up as part of the loopback initialization.

Signed-off-by: Mahesh Bandewar 
---
 drivers/net/loopback.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index 30612497643c..3216977935d1 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -217,6 +217,12 @@ static __net_init int loopback_net_init(struct net *net)
 
BUG_ON(dev->ifindex != LOOPBACK_IFINDEX);
net->loopback_dev = dev;
+
+   /* Set the loopback device UP */
+   rtnl_lock();
+   dev_open(net->loopback_dev);
+   rtnl_unlock();
+
return 0;
 
 out_free_netdev:
-- 
2.13.2.725.g09c95d1e9-goog



[PATCH 0/2] bring UP loopback device at initialization

2017-07-04 Thread Mahesh Bandewar
From: Mahesh Bandewar 

In almost every scenario the loopback device is brought UP after
initialization. So there is no point in creating the device in the
DOWN state and following it with a device UP operation. This change
exposed another issue with fib-trie initialization, which is corrected
in the first patch.

Mahesh Bandewar (2):
  ipv4: initialize fib_trie prior to register_netdev_notifier call.
  loopback: bringup 'lo' by default at initialization

 drivers/net/loopback.c  | 6 ++
 net/ipv4/fib_frontend.c | 4 ++--
 2 files changed, 8 insertions(+), 2 deletions(-)

-- 
2.13.2.725.g09c95d1e9-goog



[PATCH net] ipv6: avoid dad-failures for addresses with NODAD

2017-05-12 Thread Mahesh Bandewar
From: Mahesh Bandewar 

Every address gets added with the TENTATIVE flag, even addresses with the
IFA_F_NODAD flag, and dad-work is scheduled for them. During this DAD process
we realize it's an address with NODAD and complete the process without
sending any probe. However, the TENTATIVE flag stays on the address long
enough to cause misinterpretation when we receive an NS. While processing
the NS, if the address has the TENTATIVE flag, we mark it DADFAILED and end
up with an address that was originally configured as NODAD marked
DADFAILED.

We can't avoid scheduling dad_work for addresses with NODAD but we can
avoid adding TENTATIVE flag to avoid this racy situation.

Signed-off-by: Mahesh Bandewar 
---
 net/ipv6/addrconf.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index b09ac38d8dc4..53f2dc092023 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1022,7 +1022,10 @@ ipv6_add_addr(struct inet6_dev *idev, const struct 
in6_addr *addr,
INIT_HLIST_NODE(&ifa->addr_lst);
ifa->scope = scope;
ifa->prefix_len = pfxlen;
-   ifa->flags = flags | IFA_F_TENTATIVE;
+   ifa->flags = flags;
+   /* No need to add the TENTATIVE flag for addresses with NODAD */
+   if (!(flags & IFA_F_NODAD))
+   ifa->flags |= IFA_F_TENTATIVE;
ifa->valid_lft = valid_lft;
ifa->prefered_lft = prefered_lft;
ifa->cstamp = ifa->tstamp = jiffies;
-- 
2.13.0.rc2.291.g57267f2277-goog



[PATCH] kmod: don't load module unless req process has CAP_SYS_MODULE

2017-05-12 Thread Mahesh Bandewar
From: Mahesh Bandewar 

A process inside an arbitrary user-ns should not be able to load a module,
which is currently possible, as demonstrated in the following scenario -

  Create namespaces; especially a user-ns and become root inside.
  $ unshare -rfUp -- unshare -unm -- bash

  Try to load the bridge module. It should fail and this is expected!
  #  modprobe bridge
  WARNING: Error inserting stp 
(/lib/modules/4.11.0-smp-DEV/kernel/net/802/stp.ko): Operation not permitted
  FATAL: Error inserting bridge 
(/lib/modules/4.11.0-smp-DEV/kernel/net/bridge/bridge.ko): Operation not 
permitted

  Verify bridge module is not loaded.
  # lsmod | grep bridge
  #

  Now try to create a bridge inside this newly created net-ns which would
  mean bridge module need to be loaded.
  # ip link add br0 type bridge
  # echo $?
  0
  # lsmod | grep bridge
  bridge110592  0
  stp16384  1 bridge
  llc16384  2 bridge,stp
  #

  After this patch -
  # ip link add br0 type bridge
  RTNETLINK answers: Operation not supported
  # echo $?
  2
  # lsmod | grep bridge
  #

Signed-off-by: Mahesh Bandewar 
---
 kernel/kmod.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/kmod.c b/kernel/kmod.c
index 563f97e2be36..ac30157169b7 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -133,6 +133,9 @@ int __request_module(bool wait, const char *fmt, ...)
 #define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
static int kmod_loop_msg;
 
+   if (!capable(CAP_SYS_MODULE))
+   return -EPERM;
+
/*
 * We don't allow synchronous module loading from async.  Module
 * init may invoke async_synchronize_full() which will end up
-- 
2.13.0.rc2.291.g57267f2277-goog



[PATCHv2 next] bonding: fix wq initialization for links created via netlink

2017-04-20 Thread Mahesh Bandewar
From: Mahesh Bandewar 

Earlier patch 4493b81bea ("bonding: initialize work-queues during
creation of bond") moved the work-queue initialization from bond_open()
to bond_create(). However, this caused links that are created using the
netlink 'create bond' option (ip link add bondX type bond) to create the
new trunk without initializing work-queues. Prior to the above-mentioned
change, ndo_open was in both paths and things worked correctly. The
consequence is visible in the report shared by Joe Stringer -

I've noticed that this patch breaks bonding within namespaces if
you're not careful to perform device cleanup correctly.

Here's my repro script, you can run on any net-next with this patch
and you'll start seeing some weird behaviour:

ip netns add foo
ip li add veth0 type veth peer name veth0+ netns foo
ip li add veth1 type veth peer name veth1+ netns foo
ip netns exec foo ip li add bond0 type bond
ip netns exec foo ip li set dev veth0+ master bond0
ip netns exec foo ip li set dev veth1+ master bond0
ip netns exec foo ip addr add dev bond0 192.168.0.1/24
ip netns exec foo ip li set dev bond0 up
ip li del dev veth0
ip li del dev veth1

The second to last command segfaults, last command hangs. rtnl is now
permanently locked. It's not a problem if you take bond0 down before
deleting veths, or delete bond0 before deleting veths. If you delete
either end of the veth pair as per above, either inside or outside the
namespace, it hits this problem.

Here's some kernel logs:
[ 1221.801610] bond0: Enslaving veth0+ as an active interface with an up link
[ 1224.449581] bond0: Enslaving veth1+ as an active interface with an up link
[ 1281.193863] bond0: Releasing backup interface veth0+
[ 1281.193866] bond0: the permanent HWaddr of veth0+ -
16:bf:fb:e0:b8:43 - is still in use by bond0 - set the HWaddr of
veth0+ to a different address to avoid conflicts
[ 1281.193867] [ cut here ]
[ 1281.193873] WARNING: CPU: 0 PID: 2024 at kernel/workqueue.c:1511
__queue_delayed_work+0x13f/0x150
[ 1281.193873] Modules linked in: bonding veth openvswitch nf_nat_ipv6
nf_nat_ipv4 nf_nat autofs4 nfsd auth_rpcgss nfs_acl binfmt_misc nfs
lockd grace sunrpc fscache ppdev vmw_balloon coretemp psmouse
serio_raw vmwgfx ttm drm_kms_helper vmw_vmci netconsole parport_pc
configfs drm i2c_piix4 fb_sys_fops syscopyarea sysfillrect sysimgblt
shpchp mac_hid nf_conntrack_ipv6 nf_defrag_ipv6 nf_conntrack_ipv4
nf_defrag_ipv4 nf_conntrack libcrc32c lp parport hid_generic usbhid
hid mptspi mptscsih e1000 mptbase ahci libahci
[ 1281.193905] CPU: 0 PID: 2024 Comm: ip Tainted: GW
4.10.0-bisect-bond-v0.14 #37
[ 1281.193906] Hardware name: VMware, Inc. VMware Virtual
Platform/440BX Desktop Reference Platform, BIOS 6.00 09/30/2014
[ 1281.193906] Call Trace:
[ 1281.193912]  dump_stack+0x63/0x89
[ 1281.193915]  __warn+0xd1/0xf0
[ 1281.193917]  warn_slowpath_null+0x1d/0x20
[ 1281.193918]  __queue_delayed_work+0x13f/0x150
[ 1281.193920]  queue_delayed_work_on+0x27/0x40
[ 1281.193929]  bond_change_active_slave+0x25b/0x670 [bonding]
[ 1281.193932]  ? synchronize_rcu_expedited+0x27/0x30
[ 1281.193935]  __bond_release_one+0x489/0x510 [bonding]
[ 1281.193939]  ? addrconf_notify+0x1b7/0xab0
[ 1281.193942]  bond_netdev_event+0x2c5/0x2e0 [bonding]
[ 1281.193944]  ? netconsole_netdev_event+0x124/0x190 [netconsole]
[ 1281.193947]  notifier_call_chain+0x49/0x70
[ 1281.193948]  raw_notifier_call_chain+0x16/0x20
[ 1281.193950]  call_netdevice_notifiers_info+0x35/0x60
[ 1281.193951]  rollback_registered_many+0x23b/0x3e0
[ 1281.193953]  unregister_netdevice_many+0x24/0xd0
[ 1281.193955]  rtnl_delete_link+0x3c/0x50
[ 1281.193956]  rtnl_dellink+0x8d/0x1b0
[ 1281.193960]  rtnetlink_rcv_msg+0x95/0x220
[ 1281.193962]  ? __kmalloc_node_track_caller+0x35/0x280
[ 1281.193964]  ? __netlink_lookup+0xf1/0x110
[ 1281.193966]  ? rtnl_newlink+0x830/0x830
[ 1281.193967]  netlink_rcv_skb+0xa7/0xc0
[ 1281.193969]  rtnetlink_rcv+0x28/0x30
[ 1281.193970]  netlink_unicast+0x15b/0x210
[ 1281.193971]  netlink_sendmsg+0x319/0x390
[ 1281.193974]  sock_sendmsg+0x38/0x50
[ 1281.193975]  ___sys_sendmsg+0x25c/0x270
[ 1281.193978]  ? mem_cgroup_commit_charge+0x76/0xf0
[ 1281.193981]  ? page_add_new_anon_rmap+0x89/0xc0
[ 1281.193984]  ? lru_cache_add_active_or_unevictable+0x35/0xb0
[ 1281.193985]  ? __handle_mm_fault+0x4e9/0x1170
[ 1281.193987]  __sys_sendmsg+0x45/0x80
[ 1281.193989]  SyS_sendmsg+0x12/0x20
[ 1281.193991]  do_syscall_64+0x6e/0x180
[ 1281.193993]  entry_SYSCALL64_slow_path+0x25/0x25
[ 1281.193995] RIP: 0033:0x7f6ec122f5a0
[ 1281.193995] RSP: 002b:7ffe69e89c48 EFLAGS: 0246 ORIG_RAX:
002e
[ 1281.193997] RAX: ffda RBX: 7ffe69e8dd60 RCX: 7f6ec122f5a0
[ 1281.193997] RDX:  RSI: 7ffe69e89c90 RDI: 0003
[ 1281.193998] RBP: 7ffe69e89c90 R08:  R09: 0003
[ 1281.193999] R10: 7ffe69e89a10 R11: 0246 R12: 000

[PATCH next] bonding: fix wq initialization for links created via netlink

2017-04-19 Thread Mahesh Bandewar
From: Mahesh Bandewar 

Earlier patch 4493b81bea ("bonding: initialize work-queues during
creation of bond") moved the work-queue initialization from bond_open()
to bond_create(). However, this caused links that are created using the
netlink 'create bond' option (ip link add bondX type bond) to create the
new trunk without initializing work-queues. Prior to the above-mentioned
change, ndo_open was in both paths and things worked correctly. The
consequence is visible in the report shared by Joe Stringer -

I've noticed that this patch breaks bonding within namespaces if
you're not careful to perform device cleanup correctly.

Here's my repro script, you can run on any net-next with this patch
and you'll start seeing some weird behaviour:

ip netns add foo
ip li add veth0 type veth peer name veth0+ netns foo
ip li add veth1 type veth peer name veth1+ netns foo
ip netns exec foo ip li add bond0 type bond
ip netns exec foo ip li set dev veth0+ master bond0
ip netns exec foo ip li set dev veth1+ master bond0
ip netns exec foo ip addr add dev bond0 192.168.0.1/24
ip netns exec foo ip li set dev bond0 up
ip li del dev veth0
ip li del dev veth1

The second to last command segfaults, last command hangs. rtnl is now
permanently locked. It's not a problem if you take bond0 down before
deleting veths, or delete bond0 before deleting veths. If you delete
either end of the veth pair as per above, either inside or outside the
namespace, it hits this problem.

Here's some kernel logs:
[ 1221.801610] bond0: Enslaving veth0+ as an active interface with an up link
[ 1224.449581] bond0: Enslaving veth1+ as an active interface with an up link
[ 1281.193863] bond0: Releasing backup interface veth0+
[ 1281.193866] bond0: the permanent HWaddr of veth0+ -
16:bf:fb:e0:b8:43 - is still in use by bond0 - set the HWaddr of
veth0+ to a different address to avoid conflicts
[ 1281.193867] [ cut here ]
[ 1281.193873] WARNING: CPU: 0 PID: 2024 at kernel/workqueue.c:1511
__queue_delayed_work+0x13f/0x150
[ 1281.193873] Modules linked in: bonding veth openvswitch nf_nat_ipv6
nf_nat_ipv4 nf_nat autofs4 nfsd auth_rpcgss nfs_acl binfmt_misc nfs
lockd grace sunrpc fscache ppdev vmw_balloon coretemp psmouse
serio_raw vmwgfx ttm drm_kms_helper vmw_vmci netconsole parport_pc
configfs drm i2c_piix4 fb_sys_fops syscopyarea sysfillrect sysimgblt
shpchp mac_hid nf_conntrack_ipv6 nf_defrag_ipv6 nf_conntrack_ipv4
nf_defrag_ipv4 nf_conntrack libcrc32c lp parport hid_generic usbhid
hid mptspi mptscsih e1000 mptbase ahci libahci
[ 1281.193905] CPU: 0 PID: 2024 Comm: ip Tainted: GW
4.10.0-bisect-bond-v0.14 #37
[ 1281.193906] Hardware name: VMware, Inc. VMware Virtual
Platform/440BX Desktop Reference Platform, BIOS 6.00 09/30/2014
[ 1281.193906] Call Trace:
[ 1281.193912]  dump_stack+0x63/0x89
[ 1281.193915]  __warn+0xd1/0xf0
[ 1281.193917]  warn_slowpath_null+0x1d/0x20
[ 1281.193918]  __queue_delayed_work+0x13f/0x150
[ 1281.193920]  queue_delayed_work_on+0x27/0x40
[ 1281.193929]  bond_change_active_slave+0x25b/0x670 [bonding]
[ 1281.193932]  ? synchronize_rcu_expedited+0x27/0x30
[ 1281.193935]  __bond_release_one+0x489/0x510 [bonding]
[ 1281.193939]  ? addrconf_notify+0x1b7/0xab0
[ 1281.193942]  bond_netdev_event+0x2c5/0x2e0 [bonding]
[ 1281.193944]  ? netconsole_netdev_event+0x124/0x190 [netconsole]
[ 1281.193947]  notifier_call_chain+0x49/0x70
[ 1281.193948]  raw_notifier_call_chain+0x16/0x20
[ 1281.193950]  call_netdevice_notifiers_info+0x35/0x60
[ 1281.193951]  rollback_registered_many+0x23b/0x3e0
[ 1281.193953]  unregister_netdevice_many+0x24/0xd0
[ 1281.193955]  rtnl_delete_link+0x3c/0x50
[ 1281.193956]  rtnl_dellink+0x8d/0x1b0
[ 1281.193960]  rtnetlink_rcv_msg+0x95/0x220
[ 1281.193962]  ? __kmalloc_node_track_caller+0x35/0x280
[ 1281.193964]  ? __netlink_lookup+0xf1/0x110
[ 1281.193966]  ? rtnl_newlink+0x830/0x830
[ 1281.193967]  netlink_rcv_skb+0xa7/0xc0
[ 1281.193969]  rtnetlink_rcv+0x28/0x30
[ 1281.193970]  netlink_unicast+0x15b/0x210
[ 1281.193971]  netlink_sendmsg+0x319/0x390
[ 1281.193974]  sock_sendmsg+0x38/0x50
[ 1281.193975]  ___sys_sendmsg+0x25c/0x270
[ 1281.193978]  ? mem_cgroup_commit_charge+0x76/0xf0
[ 1281.193981]  ? page_add_new_anon_rmap+0x89/0xc0
[ 1281.193984]  ? lru_cache_add_active_or_unevictable+0x35/0xb0
[ 1281.193985]  ? __handle_mm_fault+0x4e9/0x1170
[ 1281.193987]  __sys_sendmsg+0x45/0x80
[ 1281.193989]  SyS_sendmsg+0x12/0x20
[ 1281.193991]  do_syscall_64+0x6e/0x180
[ 1281.193993]  entry_SYSCALL64_slow_path+0x25/0x25
[ 1281.193995] RIP: 0033:0x7f6ec122f5a0
[ 1281.193995] RSP: 002b:7ffe69e89c48 EFLAGS: 0246 ORIG_RAX:
002e
[ 1281.193997] RAX: ffda RBX: 7ffe69e8dd60 RCX: 7f6ec122f5a0
[ 1281.193997] RDX:  RSI: 7ffe69e89c90 RDI: 0003
[ 1281.193998] RBP: 7ffe69e89c90 R08:  R09: 0003
[ 1281.193999] R10: 7ffe69e89a10 R11: 0246 R12: 000

[PATCH next] bonding: handle link transition from FAIL to UP correctly

2017-04-11 Thread Mahesh Bandewar
From: Mahesh Bandewar 

When link transitions from LINK_FAIL to LINK_UP, the commit phase is
not called. This leads to an erroneous state causing slave-link state to
get stuck in "going down" state while its speed and duplex are perfectly
fine. This issue is a side-effect of splitting link-set into propose and
commit phases introduced by de77ecd4ef02 ("bonding: improve link-status
update in mii-monitoring")

This patch fixes these issues by calling commit phase whenever link
state change is proposed.

Fixes: de77ecd4ef02 ("bonding: improve link-status update in mii-monitoring")
Signed-off-by: Mahesh Bandewar 
---
 drivers/net/bonding/bond_main.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index aba7352906a5..01e4a69af421 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -2064,6 +2064,7 @@ static int bond_miimon_inspect(struct bonding *bond)
(bond->params.downdelay - 
slave->delay) *
bond->params.miimon,
slave->dev->name);
+   commit++;
continue;
}
 
@@ -2098,7 +2099,7 @@ static int bond_miimon_inspect(struct bonding *bond)
(bond->params.updelay - 
slave->delay) *
bond->params.miimon,
slave->dev->name);
-
+   commit++;
continue;
}
 
-- 
2.12.2.715.g7642488e1d-goog



[PATCH next] bonding: fix active-backup transition

2017-04-03 Thread Mahesh Bandewar
From: Mahesh Bandewar 

Earlier patch c4adfc822bf5 ("bonding: make speed, duplex setting
consistent with link state") made an attempt to keep slave state
consistent with speed and duplex settings. Unfortunately link-state
transition is used to change the active link especially when used
in conjunction with mii-mon. The above mentioned patch broke that
logic. Also, when speed and duplex settings for a link are updated
during a link-event, the link-status should not be changed, so that
the correct transition logic is invoked.

This patch fixes this issue by moving the link-state update outside
of the bond_update_speed_duplex() fn and to the places where this fn
is called and update link-state selectively.

Fixes: c4adfc822bf5 ("bonding: make speed, duplex setting consistent
with link state")
Signed-off-by: Mahesh Bandewar 
---
 drivers/net/bonding/bond_main.c | 13 +
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 27359dab78a1..535388b15cde 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -378,20 +378,15 @@ static int bond_update_speed_duplex(struct slave *slave)
slave->duplex = DUPLEX_UNKNOWN;
 
res = __ethtool_get_link_ksettings(slave_dev, &ecmd);
-   if (res < 0) {
-   slave->link = BOND_LINK_DOWN;
+   if (res < 0)
return 1;
-   }
-   if (ecmd.base.speed == 0 || ecmd.base.speed == ((__u32)-1)) {
-   slave->link = BOND_LINK_DOWN;
+   if (ecmd.base.speed == 0 || ecmd.base.speed == ((__u32)-1))
return 1;
-   }
switch (ecmd.base.duplex) {
case DUPLEX_FULL:
case DUPLEX_HALF:
break;
default:
-   slave->link = BOND_LINK_DOWN;
return 1;
}
 
@@ -1563,7 +1558,8 @@ int bond_enslave(struct net_device *bond_dev, struct 
net_device *slave_dev)
new_slave->delay = 0;
new_slave->link_failure_count = 0;
 
-   bond_update_speed_duplex(new_slave);
+   if (bond_update_speed_duplex(new_slave))
+   new_slave->link = BOND_LINK_DOWN;
 
new_slave->last_rx = jiffies -
(msecs_to_jiffies(bond->params.arp_interval) + 1);
@@ -2126,6 +2122,7 @@ static void bond_miimon_commit(struct bonding *bond)
 
case BOND_LINK_UP:
if (bond_update_speed_duplex(slave)) {
+   slave->link = BOND_LINK_DOWN;
netdev_warn(bond->dev,
"failed to get link speed/duplex 
for %s\n",
slave->dev->name);
-- 
2.12.2.715.g7642488e1d-goog



[PATCH next 4/5] bonding: correctly update link status during mii-commit phase

2017-03-27 Thread Mahesh Bandewar
From: Mahesh Bandewar 

bond_miimon_commit() marks the link UP after attempting to get the speed
and duplex settings for the link. There is a possibility that
bond_update_speed_duplex() could fail. This is another place where it
could result in an inconsistent bonding link state.

With this patch the link will be marked UP, and processed further, only
if the speed and duplex values retrieved are sane.

Signed-off-by: Mahesh Bandewar 
---
 drivers/net/bonding/bond_main.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index ad317bb63193..6cea964ab70a 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -2125,7 +2125,12 @@ static void bond_miimon_commit(struct bonding *bond)
continue;
 
case BOND_LINK_UP:
-   bond_update_speed_duplex(slave);
+   if (bond_update_speed_duplex(slave)) {
+   netdev_warn(bond->dev,
+   "failed to get link speed/duplex 
for %s\n",
+   slave->dev->name);
+   continue;
+   }
bond_set_slave_link_state(slave, BOND_LINK_UP,
  BOND_SLAVE_NOTIFY_NOW);
slave->last_link_up = jiffies;
-- 
2.12.1.578.ge9c3154ca4-goog



[PATCH next 5/5] bonding: avoid printing while holding a spinlock

2017-03-27 Thread Mahesh Bandewar
From: Mahesh Bandewar 

Signed-off-by: Mahesh Bandewar 
---
 drivers/net/bonding/bond_3ad.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
index 508713b4e533..c5fd4259da33 100644
--- a/drivers/net/bonding/bond_3ad.c
+++ b/drivers/net/bonding/bond_3ad.c
@@ -2446,9 +2446,9 @@ void bond_3ad_adapter_speed_duplex_changed(struct slave 
*slave)
 
spin_lock_bh(&slave->bond->mode_lock);
ad_update_actor_keys(port, false);
+   spin_unlock_bh(&slave->bond->mode_lock);
netdev_dbg(slave->bond->dev, "Port %d slave %s changed speed/duplex\n",
   port->actor_port_number, slave->dev->name);
-   spin_unlock_bh(&slave->bond->mode_lock);
 }
 
 /**
@@ -2492,12 +2492,12 @@ void bond_3ad_handle_link_change(struct slave *slave, 
char link)
agg = __get_first_agg(port);
ad_agg_selection_logic(agg, &dummy);
 
+   spin_unlock_bh(&slave->bond->mode_lock);
+
netdev_dbg(slave->bond->dev, "Port %d changed link status to %s\n",
   port->actor_port_number,
   link == BOND_LINK_UP ? "UP" : "DOWN");
 
-   spin_unlock_bh(&slave->bond->mode_lock);
-
/* RTNL is held and mode_lock is released so it's safe
 * to update slave_array here.
 */
-- 
2.12.1.578.ge9c3154ca4-goog



[PATCH next 2/5] bonding: improve link-status update in mii-monitoring

2017-03-27 Thread Mahesh Bandewar
From: Mahesh Bandewar 

The primary issue is that the mii-inspect phase updates link-state and
expects changes to be committed during the mii-commit phase. If, after
the inspect phase, it fails to acquire the rtnl-mutex, the commit
phase (bond_miimon_commit) doesn't get to run. This partially updated
state stays and makes the internal state inconsistent.

e.g. setup bond0 => slaves: eth1, eth2
eth1 goes DOWN -> UP
   mii_monitor()
mii-inspect()
bond_set_slave_link_state(eth1, UP, DontNotify)
rtnl_trylock() <- fails!

Next mii-monitor round
eth1: No change
   mii_monitor()
mii-inspect()
eth1->link == current-status (ethtool_ops->get_link)
no-change-detected

End result:
eth1:
  Link = BOND_LINK_UP
  Speed = 0xf  [SpeedUnknown]
  Duplex = 0xff[DuplexUnknown]

This doesn't always happen but for some unlucky machines in a large set
of machines it creates problems.

The fix for this is to avoid making changes during inspect phase and
postpone them until acquiring the rtnl-mutex / invoking commit phase.

Signed-off-by: Mahesh Bandewar 
---
 drivers/net/bonding/bond_main.c | 18 +-
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index ba934020dfaa..85999e479916 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -2033,8 +2033,7 @@ static int bond_miimon_inspect(struct bonding *bond)
if (link_state)
continue;
 
-   bond_set_slave_link_state(slave, BOND_LINK_FAIL,
- BOND_SLAVE_NOTIFY_LATER);
+   bond_propose_link_state(slave, BOND_LINK_FAIL);
slave->delay = bond->params.downdelay;
if (slave->delay) {
netdev_info(bond->dev, "link status down for 
%sinterface %s, disabling it in %d ms\n",
@@ -2049,8 +2048,7 @@ static int bond_miimon_inspect(struct bonding *bond)
case BOND_LINK_FAIL:
if (link_state) {
/* recovered before downdelay expired */
-   bond_set_slave_link_state(slave, BOND_LINK_UP,
- 
BOND_SLAVE_NOTIFY_LATER);
+   bond_propose_link_state(slave, BOND_LINK_UP);
slave->last_link_up = jiffies;
netdev_info(bond->dev, "link status up again 
after %d ms for interface %s\n",
(bond->params.downdelay - 
slave->delay) *
@@ -2072,8 +2070,7 @@ static int bond_miimon_inspect(struct bonding *bond)
if (!link_state)
continue;
 
-   bond_set_slave_link_state(slave, BOND_LINK_BACK,
- BOND_SLAVE_NOTIFY_LATER);
+   bond_propose_link_state(slave, BOND_LINK_BACK);
slave->delay = bond->params.updelay;
 
if (slave->delay) {
@@ -2086,9 +2083,7 @@ static int bond_miimon_inspect(struct bonding *bond)
/*FALLTHRU*/
case BOND_LINK_BACK:
if (!link_state) {
-   bond_set_slave_link_state(slave,
- BOND_LINK_DOWN,
- 
BOND_SLAVE_NOTIFY_LATER);
+   bond_propose_link_state(slave, BOND_LINK_DOWN);
netdev_info(bond->dev, "link status down again 
after %d ms for interface %s\n",
(bond->params.updelay - 
slave->delay) *
bond->params.miimon,
@@ -2225,6 +2220,8 @@ static void bond_mii_monitor(struct work_struct *work)
mii_work.work);
bool should_notify_peers = false;
unsigned long delay;
+   struct slave *slave;
+   struct list_head *iter;
 
delay = msecs_to_jiffies(bond->params.miimon);
 
@@ -2245,6 +2242,9 @@ static void bond_mii_monitor(struct work_struct *work)
goto re_arm;
}
 
+   bond_for_each_slave(bond, slave, iter) {
+   bond_commit_link_state(slave, BOND_SLAVE_NOTIFY_LATER);
+   }
bond_miimon_commit(bond);
 
rtnl_unlock();  /* might sleep, hold no other locks */
-- 
2.12.1.578.ge9c3154ca4-goog



[PATCH next 1/5] bonding: split bond_set_slave_link_state into two parts

2017-03-27 Thread Mahesh Bandewar
From: Mahesh Bandewar 

Split the function into two (a) propose (b) commit phase without
changing the semantics for the original API.

Signed-off-by: Mahesh Bandewar 
---
 include/net/bonding.h | 22 +-
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/include/net/bonding.h b/include/net/bonding.h
index 3c857778a6ca..fb2dd97857c4 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -153,7 +153,8 @@ struct slave {
unsigned long last_link_up;
unsigned long last_rx;
unsigned long target_last_arp_rx[BOND_MAX_ARP_TARGETS];
-   s8 link;/* one of BOND_LINK_ */
+   s8 link;/* one of BOND_LINK_ */
+   s8 link_new_state;  /* one of BOND_LINK_ */
s8 new_link;
u8 backup:1,   /* indicates backup slave. Value corresponds with
  BOND_STATE_ACTIVE and BOND_STATE_BACKUP */
@@ -504,13 +505,17 @@ static inline bool bond_is_slave_inactive(struct slave 
*slave)
return slave->inactive;
 }
 
-static inline void bond_set_slave_link_state(struct slave *slave, int state,
-bool notify)
+static inline void bond_propose_link_state(struct slave *slave, int state)
+{
+   slave->link_new_state = state;
+}
+
+static inline void bond_commit_link_state(struct slave *slave, bool notify)
 {
-   if (slave->link == state)
+   if (slave->link == slave->link_new_state)
return;
 
-   slave->link = state;
+   slave->link = slave->link_new_state;
if (notify) {
bond_queue_slave_event(slave);
bond_lower_state_changed(slave);
@@ -523,6 +528,13 @@ static inline void bond_set_slave_link_state(struct slave 
*slave, int state,
}
 }
 
+static inline void bond_set_slave_link_state(struct slave *slave, int state,
+bool notify)
+{
+   bond_propose_link_state(slave, state);
+   bond_commit_link_state(slave, notify);
+}
+
 static inline void bond_slave_link_notify(struct bonding *bond)
 {
struct list_head *iter;
-- 
2.12.1.578.ge9c3154ca4-goog



[PATCH next 0/5] link-status fixes for mii-monitoring

2017-03-27 Thread Mahesh Bandewar
From: Mahesh Bandewar 

The mii monitoring is divided into two phases - inspect and commit. The
inspect phase technically should not make any changes to the state and
should defer them to the commit phase. However, we detected link state
inconsistencies on several machines and discovered that they are the
result of inconsistent updates to link states and the assumption that
you *always* get the rtnl-mutex. In reality, when trylock() fails to
acquire the rtnl-mutex, the commit phase is postponed until the next
mii-mon run. At the next round, because of the state change performed in
the previous inspect run, no change is detected and the commit phase is
skipped. This results in an inconsistent state until the next link event
happens (if it ever happens).

During the commit phase, it's always assumed that the speed and duplex
fetch is successful, but that's not always the case. However, the
slave state is marked UP irrespective of the speed / duplex fetch operation.
If the speed / duplex fetch operation results in insane values for either
of these two fields, then keeping the internal link state UP is not going to
provide fruitful results either.

Please see the individual patches for more details.

Mahesh Bandewar (5):
  bonding: split bond_set_slave_link_state into two parts
  bonding: improve link-status update in mii-monitoring
  bonding: make speed, duplex setting consistent with link state
  bonding: correctly update link status during mii-commit phase
  bonding: avoid printing while holding a spinlock

 drivers/net/bonding/bond_3ad.c  |  6 ++---
 drivers/net/bonding/bond_main.c | 49 -
 include/net/bonding.h   | 22 +-
 3 files changed, 49 insertions(+), 28 deletions(-)

-- 
2.12.1.578.ge9c3154ca4-goog



[PATCH next 3/5] bonding: make speed, duplex setting consistent with link state

2017-03-27 Thread Mahesh Bandewar
From: Mahesh Bandewar 

bond_update_speed_duplex() retrieves speed and duplex settings. There
is a possibility of failure in retrieving these values, but the caller has
to assume it's always successful. This leads to inconsistent
slave link settings. If these (speed, duplex) values cannot be
retrieved, then keeping the link UP causes problems.

The updated bond_update_speed_duplex() returns 0 on success if it
retrieves sane values for speed and duplex. On failure it returns 1
and marks the link down.

Signed-off-by: Mahesh Bandewar 
---
 drivers/net/bonding/bond_main.c | 24 ++--
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 85999e479916..ad317bb63193 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -365,9 +365,10 @@ int bond_set_carrier(struct bonding *bond)
 /* Get link speed and duplex from the slave's base driver
  * using ethtool. If for some reason the call fails or the
  * values are invalid, set speed and duplex to -1,
- * and return.
+ * and return. Return 1 if speed or duplex settings are
+ * UNKNOWN; 0 otherwise.
  */
-static void bond_update_speed_duplex(struct slave *slave)
+static int bond_update_speed_duplex(struct slave *slave)
 {
struct net_device *slave_dev = slave->dev;
struct ethtool_link_ksettings ecmd;
@@ -377,24 +378,27 @@ static void bond_update_speed_duplex(struct slave *slave)
slave->duplex = DUPLEX_UNKNOWN;
 
res = __ethtool_get_link_ksettings(slave_dev, &ecmd);
-   if (res < 0)
-   return;
-
-   if (ecmd.base.speed == 0 || ecmd.base.speed == ((__u32)-1))
-   return;
-
+   if (res < 0) {
+   slave->link = BOND_LINK_DOWN;
+   return 1;
+   }
+   if (ecmd.base.speed == 0 || ecmd.base.speed == ((__u32)-1)) {
+   slave->link = BOND_LINK_DOWN;
+   return 1;
+   }
switch (ecmd.base.duplex) {
case DUPLEX_FULL:
case DUPLEX_HALF:
break;
default:
-   return;
+   slave->link = BOND_LINK_DOWN;
+   return 1;
}
 
slave->speed = ecmd.base.speed;
slave->duplex = ecmd.base.duplex;
 
-   return;
+   return 0;
 }
 
 const char *bond_slave_link_status(s8 link)
-- 
2.12.1.578.ge9c3154ca4-goog



[PATCH next 0/5] bonding: winter cleanup

2017-03-08 Thread Mahesh Bandewar
From: Mahesh Bandewar 

Few cleanup patches that I have accumulated over some time now.

(a) The first two patches move the work-queue initialization
from every ndo_open / bond_open operation to once at port creation.
Work-queue initialization is an unnecessary operation
for every 'ifup' operation. However, we have some mode-specific work-queues
and the mode can change anytime after port creation, so the arp-monitor is
restructured (patch 1) to ensure the correct work-handler is called based
on the mode.

(b) Third patch is simple and straightforward that removes hard-coded value
that was added into the initial commit and replaces it with the default
value configured.

(c) The fourth patch in the series removes the unimplemented "port-moved" state
from the LACP state machine. This state is defined but never set, so
removing it from the state machine logic makes the code a little cleaner.

(d) Reduce scope of some global variables to local.

Note: None of these patches are making any functional changes.

Mahesh Bandewar (5):
  bonding: restructure arp-monitor
  bonding: initialize work-queues during creation of bond
  bonding: remove hardcoded value
  bonding: remove "port-moved" state that was never implemented
  bonding: reduce scope of some global variables

 drivers/net/bonding/bond_3ad.c  | 11 +++--
 drivers/net/bonding/bond_main.c | 53 ++---
 2 files changed, 37 insertions(+), 27 deletions(-)

-- 
2.12.0.246.ga2ecc84866-goog



[PATCH next 2/5] bonding: initialize work-queues during creation of bond

2017-03-08 Thread Mahesh Bandewar
From: Mahesh Bandewar 

Initializing work-queues every time an ifup operation is performed is
unnecessary; it can be done only once, when the port is created.

Signed-off-by: Mahesh Bandewar 
---
 drivers/net/bonding/bond_main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 619f0c65f18a..1329110ed85f 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3270,8 +3270,6 @@ static int bond_open(struct net_device *bond_dev)
}
}
 
-   bond_work_init_all(bond);
-
if (bond_is_lb(bond)) {
/* bond_alb_initialize must be called before the timer
 * is started.
@@ -4691,6 +4689,8 @@ int bond_create(struct net *net, const char *name)
 
netif_carrier_off(bond_dev);
 
+   bond_work_init_all(bond);
+
rtnl_unlock();
if (res < 0)
bond_destructor(bond_dev);
-- 
2.12.0.246.ga2ecc84866-goog



[PATCH next 1/5] bonding: restructure arp-monitor

2017-03-08 Thread Mahesh Bandewar
From: Mahesh Bandewar 

This is in preparation for moving the work-queue initialization from the
current port_open phase to port creation. Work-queue initialization does not
make sense every time we do 'ifup/ifdown', so it is moved to the port
creation phase.

Arp monitoring work depends on the bonding mode and that is not tied
to the port creation and can change anytime during the life after port
creation. So this restructuring allows us to move the initialization
at creation without losing the ability to arm the correct work for the
mode user has selected.

Signed-off-by: Mahesh Bandewar 
---
 drivers/net/bonding/bond_main.c | 24 ++--
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 8a4ba8b88e52..619f0c65f18a 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -2575,10 +2575,8 @@ static bool bond_time_in_interval(struct bonding *bond, 
unsigned long last_act,
  * arp is transmitted to generate traffic. see activebackup_arp_monitor for
  * arp monitoring in active backup mode.
  */
-static void bond_loadbalance_arp_mon(struct work_struct *work)
+static void bond_loadbalance_arp_mon(struct bonding *bond)
 {
-   struct bonding *bond = container_of(work, struct bonding,
-   arp_work.work);
struct slave *slave, *oldcurrent;
struct list_head *iter;
int do_failover = 0, slave_state_changed = 0;
@@ -2916,10 +2914,8 @@ static bool bond_ab_arp_probe(struct bonding *bond)
return should_notify_rtnl;
 }
 
-static void bond_activebackup_arp_mon(struct work_struct *work)
+static void bond_activebackup_arp_mon(struct bonding *bond)
 {
-   struct bonding *bond = container_of(work, struct bonding,
-   arp_work.work);
bool should_notify_peers = false;
bool should_notify_rtnl = false;
int delta_in_ticks;
@@ -2972,6 +2968,17 @@ static void bond_activebackup_arp_mon(struct work_struct 
*work)
}
 }
 
+static void bond_arp_monitor(struct work_struct *work)
+{
+   struct bonding *bond = container_of(work, struct bonding,
+   arp_work.work);
+
+   if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP)
+   bond_activebackup_arp_mon(bond);
+   else
+   bond_loadbalance_arp_mon(bond);
+}
+
 /*-- netdev event handling --*/
 
 /* Change device name */
@@ -3228,10 +3235,7 @@ static void bond_work_init_all(struct bonding *bond)
  bond_resend_igmp_join_requests_delayed);
INIT_DELAYED_WORK(&bond->alb_work, bond_alb_monitor);
INIT_DELAYED_WORK(&bond->mii_work, bond_mii_monitor);
-   if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP)
-   INIT_DELAYED_WORK(&bond->arp_work, bond_activebackup_arp_mon);
-   else
-   INIT_DELAYED_WORK(&bond->arp_work, bond_loadbalance_arp_mon);
+   INIT_DELAYED_WORK(&bond->arp_work, bond_arp_monitor);
INIT_DELAYED_WORK(&bond->ad_work, bond_3ad_state_machine_handler);
INIT_DELAYED_WORK(&bond->slave_arr_work, bond_slave_arr_handler);
 }
-- 
2.12.0.246.ga2ecc84866-goog



[PATCH next 4/5] bonding: remove "port-moved" state that was never implemented

2017-03-08 Thread Mahesh Bandewar
From: Mahesh Bandewar 

LACP state-machine defines "port-moved" state when the same ActorSystemID
and Port are seen in a LACPDU received on different port. The state is
never set since it's not implemented. However the state-machine attempts
to clear that state occasionally. The LACP state machine is already complicated
and since this state is not implemented, removing its checks makes the
state machine a little simpler.

Signed-off-by: Mahesh Bandewar 
---
 drivers/net/bonding/bond_3ad.c | 11 +++
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
index edc70ffad660..431926bba9f4 100644
--- a/drivers/net/bonding/bond_3ad.c
+++ b/drivers/net/bonding/bond_3ad.c
@@ -1052,8 +1052,7 @@ static void ad_rx_machine(struct lacpdu *lacpdu, struct 
port *port)
port->sm_rx_state = AD_RX_INITIALIZE;
port->sm_vars |= AD_PORT_CHURNED;
/* check if port is not enabled */
-   } else if (!(port->sm_vars & AD_PORT_BEGIN)
-&& !port->is_enabled && !(port->sm_vars & AD_PORT_MOVED))
+   } else if (!(port->sm_vars & AD_PORT_BEGIN) && !port->is_enabled)
port->sm_rx_state = AD_RX_PORT_DISABLED;
/* check if new lacpdu arrived */
else if (lacpdu && ((port->sm_rx_state == AD_RX_EXPIRED) ||
@@ -1081,11 +1080,8 @@ static void ad_rx_machine(struct lacpdu *lacpdu, struct 
port *port)
/* if no lacpdu arrived and no timer is on */
switch (port->sm_rx_state) {
case AD_RX_PORT_DISABLED:
-   if (port->sm_vars & AD_PORT_MOVED)
-   port->sm_rx_state = AD_RX_INITIALIZE;
-   else if (port->is_enabled
-&& (port->sm_vars
-& AD_PORT_LACP_ENABLED))
+   if (port->is_enabled &&
+   (port->sm_vars & AD_PORT_LACP_ENABLED))
port->sm_rx_state = AD_RX_EXPIRED;
else if (port->is_enabled
 && ((port->sm_vars
@@ -1115,7 +,6 @@ static void ad_rx_machine(struct lacpdu *lacpdu, struct 
port *port)
port->sm_vars &= ~AD_PORT_SELECTED;
__record_default(port);
port->actor_oper_port_state &= ~AD_STATE_EXPIRED;
-   port->sm_vars &= ~AD_PORT_MOVED;
port->sm_rx_state = AD_RX_PORT_DISABLED;
 
/* Fall Through */
-- 
2.12.0.246.ga2ecc84866-goog



[PATCH next 5/5] bonding: reduce scope of some global variables

2017-03-08 Thread Mahesh Bandewar
From: Mahesh Bandewar 

Many of the bond param variables are declared global while it's not
really necessary for these variables to be global. So move them to
the location where they are used.

Signed-off-by: Mahesh Bandewar 
---
 drivers/net/bonding/bond_main.c | 11 +--
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 0f9f5ceae80e..ba934020dfaa 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -201,12 +201,6 @@ atomic_t netpoll_block_tx = ATOMIC_INIT(0);
 
 unsigned int bond_net_id __read_mostly;
 
-static __be32 arp_target[BOND_MAX_ARP_TARGETS];
-static int arp_ip_count;
-static int bond_mode   = BOND_MODE_ROUNDROBIN;
-static int xmit_hashtype = BOND_XMIT_POLICY_LAYER2;
-static int lacp_fast;
-
 /*-- Forward declarations ---*/
 
 static int bond_init(struct net_device *bond_dev);
@@ -4254,6 +4248,11 @@ static int bond_check_params(struct bond_params *params)
int arp_all_targets_value;
u16 ad_actor_sys_prio = 0;
u16 ad_user_port_key = 0;
+   __be32 arp_target[BOND_MAX_ARP_TARGETS];
+   int arp_ip_count;
+   int bond_mode   = BOND_MODE_ROUNDROBIN;
+   int xmit_hashtype = BOND_XMIT_POLICY_LAYER2;
+   int lacp_fast = 0;
int tlb_dynamic_lb = 0;
 
/* Convert string parameters. */
-- 
2.12.0.246.ga2ecc84866-goog



[PATCH next 3/5] bonding: remove hardcoded value

2017-03-08 Thread Mahesh Bandewar
From: Mahesh Bandewar 

Eliminate hard-coded value and use the default that is set.

Signed-off-by: Mahesh Bandewar 
---
 drivers/net/bonding/bond_main.c | 14 +-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 1329110ed85f..0f9f5ceae80e 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4254,6 +4254,7 @@ static int bond_check_params(struct bond_params *params)
int arp_all_targets_value;
u16 ad_actor_sys_prio = 0;
u16 ad_user_port_key = 0;
+   int tlb_dynamic_lb = 0;
 
/* Convert string parameters. */
if (mode) {
@@ -4566,6 +4567,17 @@ static int bond_check_params(struct bond_params *params)
}
ad_user_port_key = valptr->value;
 
+   if (bond_mode == BOND_MODE_TLB) {
+   bond_opt_initstr(&newval, "default");
+   valptr = bond_opt_parse(bond_opt_get(BOND_OPT_TLB_DYNAMIC_LB),
+   &newval);
+   if (!valptr) {
+   pr_err("Error: No tlb_dynamic_lb default value");
+   return -EINVAL;
+   }
+   tlb_dynamic_lb = valptr->value;
+   }
+
if (lp_interval == 0) {
pr_warn("Warning: ip_interval must be between 1 and %d, so it 
was reset to %d\n",
INT_MAX, BOND_ALB_DEFAULT_LP_INTERVAL);
@@ -4593,7 +4605,7 @@ static int bond_check_params(struct bond_params *params)
params->min_links = min_links;
params->lp_interval = lp_interval;
params->packets_per_slave = packets_per_slave;
-   params->tlb_dynamic_lb = 1; /* Default value */
+   params->tlb_dynamic_lb = tlb_dynamic_lb;
params->ad_actor_sys_prio = ad_actor_sys_prio;
eth_zero_addr(params->ad_actor_system);
params->ad_user_port_key = ad_user_port_key;
-- 
2.12.0.246.ga2ecc84866-goog



[PATCH next 3/4] bonding: remove hardcoded value

2017-02-21 Thread Mahesh Bandewar
From: Mahesh Bandewar 

Eliminate hard-coded value and use the default that is set.

Signed-off-by: Mahesh Bandewar 
---
 drivers/net/bonding/bond_main.c | 14 +-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index b7a344ffc1de..06ac1783bfdd 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4253,6 +4253,7 @@ static int bond_check_params(struct bond_params *params)
int arp_all_targets_value;
u16 ad_actor_sys_prio = 0;
u16 ad_user_port_key = 0;
+   int tlb_dynamic_lb = 0;
 
/* Convert string parameters. */
if (mode) {
@@ -4565,6 +4566,17 @@ static int bond_check_params(struct bond_params *params)
}
ad_user_port_key = valptr->value;
 
+   if (bond_mode == BOND_MODE_TLB) {
+   bond_opt_initstr(&newval, "default");
+   valptr = bond_opt_parse(bond_opt_get(BOND_OPT_TLB_DYNAMIC_LB),
+   &newval);
+   if (!valptr) {
+   pr_err("Error: No tlb_dynamic_lb default value");
+   return -EINVAL;
+   }
+   tlb_dynamic_lb = valptr->value;
+   }
+
if (lp_interval == 0) {
pr_warn("Warning: ip_interval must be between 1 and %d, so it 
was reset to %d\n",
INT_MAX, BOND_ALB_DEFAULT_LP_INTERVAL);
@@ -4592,7 +4604,7 @@ static int bond_check_params(struct bond_params *params)
params->min_links = min_links;
params->lp_interval = lp_interval;
params->packets_per_slave = packets_per_slave;
-   params->tlb_dynamic_lb = 1; /* Default value */
+   params->tlb_dynamic_lb = tlb_dynamic_lb;
params->ad_actor_sys_prio = ad_actor_sys_prio;
eth_zero_addr(params->ad_actor_system);
params->ad_user_port_key = ad_user_port_key;
-- 
2.11.0.483.g087da7b7c-goog



[PATCH next 1/4] bonding: restructure arp-monitor

2017-02-21 Thread Mahesh Bandewar
From: Mahesh Bandewar 

This is in preparation for moving the work-queue initialization from the
current port_open phase to port creation. Work-queue initialization does not
make sense every time we do 'ifup/ifdown', so it is moved to the port
creation phase.

Arp monitoring work depends on the bonding mode and that is not tied
to the port creation and can change anytime during the life after port
creation. So this restructuring allows us to move the initialization
at creation without losing the ability to arm the correct work for the
mode user has selected.

Signed-off-by: Mahesh Bandewar 
---
 drivers/net/bonding/bond_main.c | 24 ++--
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 6321f12630c8..4e03807bb0d6 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -2575,10 +2575,8 @@ static bool bond_time_in_interval(struct bonding *bond, 
unsigned long last_act,
  * arp is transmitted to generate traffic. see activebackup_arp_monitor for
  * arp monitoring in active backup mode.
  */
-static void bond_loadbalance_arp_mon(struct work_struct *work)
+static void bond_loadbalance_arp_mon(struct bonding *bond)
 {
-   struct bonding *bond = container_of(work, struct bonding,
-   arp_work.work);
struct slave *slave, *oldcurrent;
struct list_head *iter;
int do_failover = 0, slave_state_changed = 0;
@@ -2916,10 +2914,8 @@ static bool bond_ab_arp_probe(struct bonding *bond)
return should_notify_rtnl;
 }
 
-static void bond_activebackup_arp_mon(struct work_struct *work)
+static void bond_activebackup_arp_mon(struct bonding *bond)
 {
-   struct bonding *bond = container_of(work, struct bonding,
-   arp_work.work);
bool should_notify_peers = false;
bool should_notify_rtnl = false;
int delta_in_ticks;
@@ -2972,6 +2968,17 @@ static void bond_activebackup_arp_mon(struct work_struct 
*work)
}
 }
 
+static void bond_arp_monitor(struct work_struct *work)
+{
+   struct bonding *bond = container_of(work, struct bonding,
+   arp_work.work);
+
+   if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP)
+   bond_activebackup_arp_mon(bond);
+   else
+   bond_loadbalance_arp_mon(bond);
+}
+
 /*-- netdev event handling --*/
 
 /* Change device name */
@@ -3228,10 +3235,7 @@ static void bond_work_init_all(struct bonding *bond)
  bond_resend_igmp_join_requests_delayed);
INIT_DELAYED_WORK(&bond->alb_work, bond_alb_monitor);
INIT_DELAYED_WORK(&bond->mii_work, bond_mii_monitor);
-   if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP)
-   INIT_DELAYED_WORK(&bond->arp_work, bond_activebackup_arp_mon);
-   else
-   INIT_DELAYED_WORK(&bond->arp_work, bond_loadbalance_arp_mon);
+   INIT_DELAYED_WORK(&bond->arp_work, bond_arp_monitor);
INIT_DELAYED_WORK(&bond->ad_work, bond_3ad_state_machine_handler);
INIT_DELAYED_WORK(&bond->slave_arr_work, bond_slave_arr_handler);
 }
-- 
2.11.0.483.g087da7b7c-goog



[PATCH next 4/4] bonding: remove "port-moved" state that was never implemented

2017-02-21 Thread Mahesh Bandewar
From: Mahesh Bandewar 

LACP state-machine defines "port-moved" state when the same ActorSystemID
and Port are seen in a LACPDU received on different port. The state is
never set since it's not implemented. However the state-machine attempts
to clear that state occasionally. The LACP state machine is already complicated
and since this state is not implemented, removing its checks makes the
state machine a little simpler.

Signed-off-by: Mahesh Bandewar 
---
 drivers/net/bonding/bond_3ad.c | 11 +++
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
index edc70ffad660..431926bba9f4 100644
--- a/drivers/net/bonding/bond_3ad.c
+++ b/drivers/net/bonding/bond_3ad.c
@@ -1052,8 +1052,7 @@ static void ad_rx_machine(struct lacpdu *lacpdu, struct 
port *port)
port->sm_rx_state = AD_RX_INITIALIZE;
port->sm_vars |= AD_PORT_CHURNED;
/* check if port is not enabled */
-   } else if (!(port->sm_vars & AD_PORT_BEGIN)
-&& !port->is_enabled && !(port->sm_vars & AD_PORT_MOVED))
+   } else if (!(port->sm_vars & AD_PORT_BEGIN) && !port->is_enabled)
port->sm_rx_state = AD_RX_PORT_DISABLED;
/* check if new lacpdu arrived */
else if (lacpdu && ((port->sm_rx_state == AD_RX_EXPIRED) ||
@@ -1081,11 +1080,8 @@ static void ad_rx_machine(struct lacpdu *lacpdu, struct 
port *port)
/* if no lacpdu arrived and no timer is on */
switch (port->sm_rx_state) {
case AD_RX_PORT_DISABLED:
-   if (port->sm_vars & AD_PORT_MOVED)
-   port->sm_rx_state = AD_RX_INITIALIZE;
-   else if (port->is_enabled
-&& (port->sm_vars
-& AD_PORT_LACP_ENABLED))
+   if (port->is_enabled &&
+   (port->sm_vars & AD_PORT_LACP_ENABLED))
port->sm_rx_state = AD_RX_EXPIRED;
else if (port->is_enabled
 && ((port->sm_vars
@@ -1115,7 +,6 @@ static void ad_rx_machine(struct lacpdu *lacpdu, struct 
port *port)
port->sm_vars &= ~AD_PORT_SELECTED;
__record_default(port);
port->actor_oper_port_state &= ~AD_STATE_EXPIRED;
-   port->sm_vars &= ~AD_PORT_MOVED;
port->sm_rx_state = AD_RX_PORT_DISABLED;
 
/* Fall Through */
-- 
2.11.0.483.g087da7b7c-goog



[PATCH next 2/4] bonding: initialize work-queues during creation of bond

2017-02-21 Thread Mahesh Bandewar
From: Mahesh Bandewar 

Initializing work-queues every time an ifup operation is performed is
unnecessary; it can be done only once, when the port is created.

Signed-off-by: Mahesh Bandewar 
---
 drivers/net/bonding/bond_main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 4e03807bb0d6..b7a344ffc1de 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3270,8 +3270,6 @@ static int bond_open(struct net_device *bond_dev)
}
}
 
-   bond_work_init_all(bond);
-
if (bond_is_lb(bond)) {
/* bond_alb_initialize must be called before the timer
 * is started.
@@ -4690,6 +4688,8 @@ int bond_create(struct net *net, const char *name)
 
netif_carrier_off(bond_dev);
 
+   bond_work_init_all(bond);
+
rtnl_unlock();
if (res < 0)
bond_destructor(bond_dev);
-- 
2.11.0.483.g087da7b7c-goog



[PATCH next 0/4] bonding: winter cleanup

2017-02-21 Thread Mahesh Bandewar
From: Mahesh Bandewar 

Few cleanup patches that I have accumulated over some time now.

(a) The first two patches move the work-queue initialization
from every ndo_open / bond_open operation to once at port creation.
Work-queue initialization is an unnecessary operation
for every 'ifup' operation. However, we have some mode-specific work-queues
and the mode can change anytime after port creation, so the arp-monitor is
restructured (patch 1) to ensure the correct work-handler is called based
on the mode.

(b) Third patch is simple and straightforward that removes hard-coded value
that was added into the initial commit and replaces it with the default
value configured.

(c) The final patch in the series removes the unimplemented "port-moved" state
from the LACP state machine. This state is defined but never set, so
removing it from the state machine logic makes the code a little cleaner.

Note: None of these patches are making any functional changes.

Mahesh Bandewar (4):
  bonding: restructure arp-monitor
  bonding: initialize work-queues during creation of bond
  bonding: remove hardcoded value
  bonding: remove "port-moved" state that was never implemented

 drivers/net/bonding/bond_3ad.c  | 11 +++
 drivers/net/bonding/bond_main.c | 42 -
 2 files changed, 32 insertions(+), 21 deletions(-)

-- 
2.11.0.483.g087da7b7c-goog



[PATCH next 0/3] use netdev_is_rx_handler_busy() in few known cases

2017-01-18 Thread Mahesh Bandewar
From: Mahesh Bandewar 

netdev_rx_handler_register() was recently split into two parts - (a) check
if the handler is used, and (b) register the new handler. This is
helpful in scenarios like bonding where at the time of registration there
is too much state to unwind and it should check if the device is free
before building that state. The IPvlan and macvlan drivers don't have this
issue; however, they can make use of the same check instead of using a
device-specific check.

Mahesh Bandewar (3):
  net: remove duplicate code.
  ipvlan: use netdev_is_rx_handler_busy instead of checking specific
type
  macvlan: use netdev_is_rx_handler_busy instead of checking specific
type

 drivers/net/ipvlan/ipvlan_main.c | 4 ++--
 drivers/net/macvlan.c| 2 +-
 net/core/dev.c   | 4 +---
 3 files changed, 4 insertions(+), 6 deletions(-)

-- 
2.11.0.483.g087da7b7c-goog



[PATCH next 3/3] macvlan: use netdev_is_rx_handler_busy instead of checking specific type

2017-01-18 Thread Mahesh Bandewar
From: Mahesh Bandewar 

netdev_is_rx_handler_busy() check is a superset of netif_is_ipvlan_port()
check and hence should be preferred.

Signed-off-by: Mahesh Bandewar 
---
 drivers/net/macvlan.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 440ab3d8adf7..cbfc1be23a0e 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -1110,7 +1110,7 @@ static int macvlan_port_create(struct net_device *dev)
if (dev->type != ARPHRD_ETHER || dev->flags & IFF_LOOPBACK)
return -EINVAL;
 
-   if (netif_is_ipvlan_port(dev))
+   if (netdev_is_rx_handler_busy(dev))
return -EBUSY;
 
port = kzalloc(sizeof(*port), GFP_KERNEL);
-- 
2.11.0.483.g087da7b7c-goog



[PATCH next 1/3] net: remove duplicate code.

2017-01-18 Thread Mahesh Bandewar
From: Mahesh Bandewar 

netdev_rx_handler_register() checks to see if the handler is already
busy which was recently separated into netdev_is_rx_handler_busy(). So
use the same function inside register() to avoid code duplication.
Essentially this change should be a no-op.

Signed-off-by: Mahesh Bandewar 
---
 net/core/dev.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index ad5959e56116..c8f1f67ff16c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3961,9 +3961,7 @@ int netdev_rx_handler_register(struct net_device *dev,
   rx_handler_func_t *rx_handler,
   void *rx_handler_data)
 {
-   ASSERT_RTNL();
-
-   if (dev->rx_handler)
+   if (netdev_is_rx_handler_busy(dev))
return -EBUSY;
 
/* Note: rx_handler_data must be set before rx_handler */
-- 
2.11.0.483.g087da7b7c-goog



[PATCH next 2/3] ipvlan: use netdev_is_rx_handler_busy instead of checking specific type

2017-01-18 Thread Mahesh Bandewar
From: Mahesh Bandewar 

IPvlan checks if the master device is already used by checking for a
specific device type (here it's a macvlan device). This is technically not
sufficient; it should just check whether the rx_handler is busy or not.
This would be a superset check that covers macvlan and any other device that
has already registered an rx-handler.

Signed-off-by: Mahesh Bandewar 
---
 drivers/net/ipvlan/ipvlan_main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index b5c390f0f2b3..95b18f4602cf 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -102,8 +102,8 @@ static int ipvlan_port_create(struct net_device *dev)
return -EINVAL;
}
 
-   if (netif_is_macvlan_port(dev)) {
-   netdev_err(dev, "Master is a macvlan port.\n");
+   if (netdev_is_rx_handler_busy(dev)) {
+   netdev_err(dev, "Device is already in use.\n");
return -EBUSY;
}
 
-- 
2.11.0.483.g087da7b7c-goog



[PATCH next] ipvlan: fix dev_id creation corner case.

2017-01-13 Thread Mahesh Bandewar
From: Mahesh Bandewar 

In the last patch da36e13cf65 ("ipvlan: improvise dev_id generation
logic in IPvlan") I missed some part of Dave's suggestion and because
of that the dev_id creation could fail in a corner case scenario. This
would happen when roughly 64k devices have already been created and
several have been deleted, and the devices that are still around
occupy the last n bits of the bitmap. In this scenario, even if lower
bits are available, the dev_id search is so narrow that it always fails.

Fixes: da36e13cf65 ("ipvlan: improvise dev_id generation logic in IPvlan")
CC: David Miller 
CC: Eric Dumazet 
Signed-off-by: Mahesh Bandewar 
---
 drivers/net/ipvlan/ipvlan_main.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index 92b221a03350..b5c390f0f2b3 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -550,6 +550,9 @@ static int ipvlan_link_new(struct net *src_net, struct 
net_device *dev,
err = ida_simple_get(&port->ida, port->dev_id_start, 0xFFFE,
 GFP_KERNEL);
if (err < 0)
+   err = ida_simple_get(&port->ida, 0x1, port->dev_id_start,
+GFP_KERNEL);
+   if (err < 0)
goto destroy_ipvlan_port;
dev->dev_id = err;
/* Increment id-base to the next slot for the future assignment */
-- 
2.11.0.483.g087da7b7c-goog



[PATCH next v2] ipvlan: improvise dev_id generation logic in IPvlan

2017-01-09 Thread Mahesh Bandewar
From: Mahesh Bandewar 

The patch 009146d117b ("ipvlan: assign unique dev-id for each slave
device.") used ida_simple_get() to generate dev_ids assigned to the
slave devices. However (Eric has pointed out that) there is a shortcoming
with that approach as it always uses the first available ID. This
becomes a problem when a slave gets deleted and a new slave gets added.
The ID gets reassigned causing the new slave to get the same link-local
address. This side-effect is undesirable.

This patch adds a per-port variable that keeps track of the IDs
assigned and is used as the start-base for the IDR API. This base will
be wrapped around when it reaches the MAX (0xFFFE) value, possibly on a
busy system where slaves are added and deleted routinely.

Fixes: 009146d117b ("ipvlan: assign unique dev-id for each slave device.")
Signed-off-by: Mahesh Bandewar 
CC: Eric Dumazet 
CC: David Miller 
---
v1 -> v2
  Implemented Dave's comments and reinstated IDR API.

 drivers/net/ipvlan/ipvlan.h  |  1 +
 drivers/net/ipvlan/ipvlan_main.c | 13 -
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h
index 0a9068fdee0f..406ae4ff0ae8 100644
--- a/drivers/net/ipvlan/ipvlan.h
+++ b/drivers/net/ipvlan/ipvlan.h
@@ -94,6 +94,7 @@ struct ipvl_port {
struct hlist_head   hlhead[IPVLAN_HASH_SIZE];
struct list_headipvlans;
u16 mode;
+   u16 dev_id_start;
struct work_struct  wq;
struct sk_buff_head backlog;
int count;
diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index ce7ca6a5aa8a..aca2885cf0b4 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -120,6 +120,7 @@ static int ipvlan_port_create(struct net_device *dev)
skb_queue_head_init(&port->backlog);
INIT_WORK(&port->wq, ipvlan_process_multicast);
ida_init(&port->ida);
+   port->dev_id_start = 1;
 
err = netdev_rx_handler_register(dev, ipvlan_handle_frame, port);
if (err)
@@ -535,15 +536,25 @@ static int ipvlan_link_new(struct net *src_net, struct 
net_device *dev,
ipvlan_adjust_mtu(ipvlan, phy_dev);
INIT_LIST_HEAD(&ipvlan->addrs);
 
+   /* If the port-id base is at the MAX value, then wrap it around and
+* begin from 0x1 again. This may be due to a busy system where lots
+* of slaves are getting created and deleted.
+*/
+   if (port->dev_id_start == 0xFFFE)
+   port->dev_id_start = 0x1;
+
/* Since L2 address is shared among all IPvlan slaves including
 * master, use unique 16 bit dev-ids to diffentiate among them.
 * Assign IDs between 0x1 and 0xFFFE (used by the master) to each
 * slave link [see addrconf_ifid_eui48()].
 */
-   err = ida_simple_get(&port->ida, 1, 0xFFFE, GFP_KERNEL);
+   err = ida_simple_get(&port->ida, port->dev_id_start, 0xFFFE,
+GFP_KERNEL);
if (err < 0)
goto destroy_ipvlan_port;
dev->dev_id = err;
+   /* Increment id-base to the next slot for the future assignment */
+   port->dev_id_start = err + 1;
 
/* TODO Probably put random address here to be presented to the
 * world but keep using the physical-dev address for the outgoing
-- 
2.11.0.390.gc69c2f50cf-goog



[PATCH next v1] ipvlan: don't use IDR for generating dev_id

2017-01-06 Thread Mahesh Bandewar
From: Mahesh Bandewar 

The patch 009146d117b ("ipvlan: assign unique dev-id for each slave
device.") used ida_simple_get() to generate dev_ids assigned to the
slave devices. However (Eric has pointed out that) there is a shortcoming
with that approach as it always uses the first available ID. This
becomes a problem when a slave gets deleted and a new slave gets added.
The ID gets reassigned causing the new slave to get the same link-local
address. This side-effect is undesirable.

This patch replaces IDR logic with a simple per-port variable that keeps
incrementing and wraps around when the MAX (0xFFFE) is reached. The
only downside is that this is an inefficient (n^2) search: if there are
64k (or close to 64k) slaves in the system, the dev-id search takes time.
However, having this many devices in the system has its own challenges.

Fixes: 009146d117b ("ipvlan: assign unique dev-id for each slave device.")
Signed-off-by: Mahesh Bandewar 
CC: Eric Dumazet 
---
 drivers/net/ipvlan/ipvlan.h  |  2 +-
 drivers/net/ipvlan/ipvlan_main.c | 69 ++--
 2 files changed, 61 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h
index 0a9068fdee0f..535f254324ba 100644
--- a/drivers/net/ipvlan/ipvlan.h
+++ b/drivers/net/ipvlan/ipvlan.h
@@ -94,10 +94,10 @@ struct ipvl_port {
struct hlist_head   hlhead[IPVLAN_HASH_SIZE];
struct list_headipvlans;
u16 mode;
+   u16 dev_id_base;
struct work_struct  wq;
struct sk_buff_head backlog;
int count;
-   struct ida  ida;
 };
 
 struct ipvl_skb_cb {
diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index ce7ca6a5aa8a..691662e02abb 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -119,7 +119,7 @@ static int ipvlan_port_create(struct net_device *dev)
 
skb_queue_head_init(&port->backlog);
INIT_WORK(&port->wq, ipvlan_process_multicast);
-   ida_init(&port->ida);
+   port->dev_id_base = 1;
 
err = netdev_rx_handler_register(dev, ipvlan_handle_frame, port);
if (err)
@@ -151,7 +151,6 @@ static void ipvlan_port_destroy(struct net_device *dev)
dev_put(skb->dev);
kfree_skb(skb);
}
-   ida_destroy(&port->ida);
kfree(port);
 }
 
@@ -496,6 +495,60 @@ static int ipvlan_nl_fillinfo(struct sk_buff *skb,
return ret;
 }
 
+static u16 ipvlan_get_dev_id(struct ipvl_port *port)
+{
+   struct ipvl_dev *ipvlan;
+   u16 tid, dev_id = 0;
+   bool found, exhausted = false, second_round = false;
+
+   /* Idea here is to get the next available ID while avoiding
+* reuse of already used IDs. Per port variable (dev_id_base)
+* is used to get the next available ID.
+*
+* Possibilities are from 0x1 to 0xfffe so finding next available
+* ID is not be that difficult. However if the system is up for a
+* long time and these slave devices are getting added / deleted
+* routinely and it reaches the MAX, then only the assignment will
+* attempt to recycle previously used IDs starting from 0x1 (Provided
+* these are not currently in use).
+*
+* In case of a failure, dev-id search will return 0. All IDs in-use
+* case is very inefficient (n^2 search) but having close to 64k
+* slaves has it's own system limitations.
+*/
+   while (!exhausted) {
+   tid = port->dev_id_base++;
+
+   if (tid == 0xFFFE) {
+   /* Reached MAX; restart from 1 */
+   port->dev_id_base = 1;
+   if (second_round)
+   exhausted = true;
+   else
+   second_round = true;
+   continue;
+   }
+   /* Should not be same as the masters' (if present) */
+   if (tid == port->dev->dev_id)
+   continue;
+   /* Make sure that it's already not assigned to any of the
+* existing slaves.
+*/
+   found = false;
+   list_for_each_entry(ipvlan, &port->ipvlans, pnode) {
+   if (ipvlan->dev->dev_id == tid) {
+   found = true;
+   break;
+   }
+   }
+   if (!found) {
+   dev_id = tid;
+   break;
+   }
+   }
+   return dev_id;
+}
+
 static int ipvlan_link_new(struct net *src_net, struct net_device *dev,
   struct nlattr *tb[], str

[RFC PATCH next] ipv6: do not send RTM_DELADDR for tentative addresses

2017-01-04 Thread Mahesh Bandewar
From: Mahesh Bandewar 

RTM_NEWADDR notification is sent when IFA_F_TENTATIVE is cleared from
the address. So if the address is added and deleted before DAD probes
complete, an RTM_DELADDR will be sent for which there was no
RTM_NEWADDR, causing asymmetry in notification. However, if the same
logic is used while sending the RTM_DELADDR notification, this asymmetry
can be avoided.

Signed-off-by: Mahesh Bandewar 
CC: Hideaki YOSHIFUJI 
CC: Patrick McHardy 
CC: Hannes Frederic Sowa 
---
 net/ipv6/addrconf.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index c1e124bc8e1e..ac9bd5620f81 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4888,6 +4888,13 @@ static void inet6_ifa_notify(int event, struct 
inet6_ifaddr *ifa)
struct net *net = dev_net(ifa->idev->dev);
int err = -ENOBUFS;
 
+   /* Don't send DELADDR notification for TENTATIVE address,
+* since NEWADDR notification is sent only after removing
+* TENTATIVE flag.
+*/
+   if (ifa->flags & IFA_F_TENTATIVE && event == RTM_DELADDR)
+   return;
+
skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC);
if (!skb)
goto errout;
-- 
2.11.0.390.gc69c2f50cf-goog



[PATCH next v1] ipvlan: assign unique dev-id for each slave device.

2017-01-03 Thread Mahesh Bandewar
From: Mahesh Bandewar 

IPvlan setup uses one mac-address (of master). The IPv6 link-local
addresses are derived using the mac-address on the link. Lack of
dev-ids makes these link-local addresses the same for all slaves,
including that of the master device. dev-ids are necessary to add
differentiation when the L2 address is shared.

Signed-off-by: Mahesh Bandewar 
---
 drivers/net/ipvlan/ipvlan.h  |  1 +
 drivers/net/ipvlan/ipvlan_main.c | 17 -
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h
index dbfbb33ac66c..0a9068fdee0f 100644
--- a/drivers/net/ipvlan/ipvlan.h
+++ b/drivers/net/ipvlan/ipvlan.h
@@ -97,6 +97,7 @@ struct ipvl_port {
struct work_struct  wq;
struct sk_buff_head backlog;
int count;
+   struct ida  ida;
 };
 
 struct ipvl_skb_cb {
diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index 975f9ddb9908..ce7ca6a5aa8a 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -119,6 +119,7 @@ static int ipvlan_port_create(struct net_device *dev)
 
skb_queue_head_init(&port->backlog);
INIT_WORK(&port->wq, ipvlan_process_multicast);
+   ida_init(&port->ida);
 
err = netdev_rx_handler_register(dev, ipvlan_handle_frame, port);
if (err)
@@ -150,6 +151,7 @@ static void ipvlan_port_destroy(struct net_device *dev)
dev_put(skb->dev);
kfree_skb(skb);
}
+   ida_destroy(&port->ida);
kfree(port);
 }
 
@@ -533,6 +535,16 @@ static int ipvlan_link_new(struct net *src_net, struct 
net_device *dev,
ipvlan_adjust_mtu(ipvlan, phy_dev);
INIT_LIST_HEAD(&ipvlan->addrs);
 
+   /* Since L2 address is shared among all IPvlan slaves including
+* master, use unique 16 bit dev-ids to diffentiate among them.
+* Assign IDs between 0x1 and 0xFFFE (used by the master) to each
+* slave link [see addrconf_ifid_eui48()].
+*/
+   err = ida_simple_get(&port->ida, 1, 0xFFFE, GFP_KERNEL);
+   if (err < 0)
+   goto destroy_ipvlan_port;
+   dev->dev_id = err;
+
/* TODO Probably put random address here to be presented to the
 * world but keep using the physical-dev address for the outgoing
 * packets.
@@ -543,7 +555,7 @@ static int ipvlan_link_new(struct net *src_net, struct 
net_device *dev,
 
err = register_netdevice(dev);
if (err < 0)
-   goto destroy_ipvlan_port;
+   goto remove_ida;
 
err = netdev_upper_dev_link(phy_dev, dev);
if (err) {
@@ -562,6 +574,8 @@ static int ipvlan_link_new(struct net *src_net, struct 
net_device *dev,
netdev_upper_dev_unlink(phy_dev, dev);
 unregister_netdev:
unregister_netdevice(dev);
+remove_ida:
+   ida_simple_remove(&port->ida, dev->dev_id);
 destroy_ipvlan_port:
if (create)
ipvlan_port_destroy(phy_dev);
@@ -579,6 +593,7 @@ static void ipvlan_link_delete(struct net_device *dev, 
struct list_head *head)
kfree_rcu(addr, rcu);
}
 
+   ida_simple_remove(&ipvlan->port->ida, dev->dev_id);
list_del_rcu(&ipvlan->pnode);
unregister_netdevice_queue(dev, head);
netdev_upper_dev_unlink(ipvlan->phy_dev, dev);
-- 
2.11.0.390.gc69c2f50cf-goog



  1   2   3   >