[PATCH net] bonding: fix warning message

2018-10-02 Thread Mahesh Bandewar
From: Mahesh Bandewar 

RX queue config for bonding master could be different from its slave
device(s). With the commit 6a9e461f6fe4 ("bonding: pass link-local
packets to bonding master also."), the packet is reinjected into stack
with skb->dev as bonding master. This potentially triggers the
message:

   "bondX received packet on queue Y, but number of RX queues is Z"

whenever the queue that the packet is received on is not lower than the
numrxqueues on the bonding master (Y >= Z; queue indices are zero-based).

Fixes: 6a9e461f6fe4 ("bonding: pass link-local packets to bonding master also.")
Reported-by: John Sperbeck 
Signed-off-by: Eric Dumazet 
Signed-off-by: Mahesh Bandewar 
---
 drivers/net/bonding/bond_main.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index c05c01a00755..ee28ec9e0aba 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1187,6 +1187,7 @@ static rx_handler_result_t bond_handle_frame(struct 
sk_buff **pskb)
 
if (nskb) {
nskb->dev = bond->dev;
+   nskb->queue_mapping = 0;
netif_rx(nskb);
}
return RX_HANDLER_PASS;
-- 
2.19.0.605.g01d371f741-goog



[PATCH net] bonding: avoid possible dead-lock

2018-09-24 Thread Mahesh Bandewar
From: Mahesh Bandewar 

Syzkaller reported this on a slightly older kernel but it's still
applicable to the current kernel -

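======================================================
WARNING: possible circular locking dependency detected
4.18.0-next-20180823+ #46 Not tainted
------------------------------------------------------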
syz-executor4/26841 is trying to acquire lock:
dd41ef48 ((wq_completion)bond_dev->name){+.+.}, at: 
flush_workqueue+0x2db/0x1e10 kernel/workqueue.c:2652

but task is already holding lock:
768ab431 (rtnl_mutex){+.+.}, at: rtnl_lock net/core/rtnetlink.c:77 
[inline]
768ab431 (rtnl_mutex){+.+.}, at: rtnetlink_rcv_msg+0x412/0xc30 
net/core/rtnetlink.c:4708

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #2 (rtnl_mutex){+.+.}:
   __mutex_lock_common kernel/locking/mutex.c:925 [inline]
   __mutex_lock+0x171/0x1700 kernel/locking/mutex.c:1073
   mutex_lock_nested+0x16/0x20 kernel/locking/mutex.c:1088
   rtnl_lock+0x17/0x20 net/core/rtnetlink.c:77
   bond_netdev_notify drivers/net/bonding/bond_main.c:1310 [inline]
   bond_netdev_notify_work+0x44/0xd0 drivers/net/bonding/bond_main.c:1320
   process_one_work+0xc73/0x1aa0 kernel/workqueue.c:2153
   worker_thread+0x189/0x13c0 kernel/workqueue.c:2296
   kthread+0x35a/0x420 kernel/kthread.c:246
   ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:415

-> #1 ((work_completion)(&(&nnw->work)->work)){+.+.}:
   process_one_work+0xc0b/0x1aa0 kernel/workqueue.c:2129
   worker_thread+0x189/0x13c0 kernel/workqueue.c:2296
   kthread+0x35a/0x420 kernel/kthread.c:246
   ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:415

-> #0 ((wq_completion)bond_dev->name){+.+.}:
   lock_acquire+0x1e4/0x4f0 kernel/locking/lockdep.c:3901
   flush_workqueue+0x30a/0x1e10 kernel/workqueue.c:2655
   drain_workqueue+0x2a9/0x640 kernel/workqueue.c:2820
   destroy_workqueue+0xc6/0x9d0 kernel/workqueue.c:4155
   __alloc_workqueue_key+0xef9/0x1190 kernel/workqueue.c:4138
   bond_init+0x269/0x940 drivers/net/bonding/bond_main.c:4734
   register_netdevice+0x337/0x1100 net/core/dev.c:8410
   bond_newlink+0x49/0xa0 drivers/net/bonding/bond_netlink.c:453
   rtnl_newlink+0xef4/0x1d50 net/core/rtnetlink.c:3099
   rtnetlink_rcv_msg+0x46e/0xc30 net/core/rtnetlink.c:4711
   netlink_rcv_skb+0x172/0x440 net/netlink/af_netlink.c:2454
   rtnetlink_rcv+0x1c/0x20 net/core/rtnetlink.c:4729
   netlink_unicast_kernel net/netlink/af_netlink.c:1317 [inline]
   netlink_unicast+0x5a0/0x760 net/netlink/af_netlink.c:1343
   netlink_sendmsg+0xa18/0xfc0 net/netlink/af_netlink.c:1908
   sock_sendmsg_nosec net/socket.c:622 [inline]
   sock_sendmsg+0xd5/0x120 net/socket.c:632
   ___sys_sendmsg+0x7fd/0x930 net/socket.c:2115
   __sys_sendmsg+0x11d/0x290 net/socket.c:2153
   __do_sys_sendmsg net/socket.c:2162 [inline]
   __se_sys_sendmsg net/socket.c:2160 [inline]
   __x64_sys_sendmsg+0x78/0xb0 net/socket.c:2160
   do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290
   entry_SYSCALL_64_after_hwframe+0x49/0xbe

other info that might help us debug this:

Chain exists of:
  (wq_completion)bond_dev->name --> (work_completion)(&(&nnw->work)->work) --> rtnl_mutex

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(rtnl_mutex);
                               lock((work_completion)(&(&nnw->work)->work));
                               lock(rtnl_mutex);
  lock((wq_completion)bond_dev->name);

 *** DEADLOCK ***

1 lock held by syz-executor4/26841:

stack backtrace:
CPU: 1 PID: 26841 Comm: syz-executor4 Not tainted 4.18.0-next-20180823+ #46
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 
01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x1c9/0x2b4 lib/dump_stack.c:113
 print_circular_bug.isra.34.cold.55+0x1bd/0x27d kernel/locking/lockdep.c:1222
 check_prev_add kernel/locking/lockdep.c:1862 [inline]
 check_prevs_add kernel/locking/lockdep.c:1975 [inline]
 validate_chain kernel/locking/lockdep.c:2416 [inline]
 __lock_acquire+0x3449/0x5020 kernel/locking/lockdep.c:3412
 lock_acquire+0x1e4/0x4f0 kernel/locking/lockdep.c:3901
 flush_workqueue+0x30a/0x1e10 kernel/workqueue.c:2655
 drain_workqueue+0x2a9/0x640 kernel/workqueue.c:2820
 destroy_workqueue+0xc6/0x9d0 kernel/workqueue.c:4155
 __alloc_workqueue_key+0xef9/0x1190 kernel/workqueue.c:4138
 bond_init+0x269/0x940 drivers/net/bonding/bond_main.c:4734
 register_netdevice+0x337/0x1100 net/core/dev.c:8410
 bond_newlink+0x49/0xa0 drivers/net/bonding/bond_netlink.c:453
 rtnl_newlink+0xef4/0x1d50 net/core/rtnetlink.c:3099
 rtnetlink_rcv_msg+0x46e/0xc30 net/core/rtnetlink.c:4711
 netlink_rcv_skb+0x172/0x440 net/netlink/af_netlink.c:2454
 rtnetlink_rcv+0x1c/0x20 net/core/rtnetlink.c:4729

[PATCH net] bonding: pass link-local packets to bonding master also.

2018-09-24 Thread Mahesh Bandewar
From: Mahesh Bandewar 

Commit b89f04c61efe ("bonding: deliver link-local packets with
skb->dev set to link that packets arrived on") changed the behavior
of how link-local-multicast packets are processed. The change in
the behavior broke some legacy use cases where these packets are
expected to arrive on bonding master device also.

This patch passes the packet to the stack with the link it arrived
on as well as passes to the bonding-master device to preserve the
legacy use case.

Fixes: b89f04c61efe ("bonding: deliver link-local packets with skb->dev set to 
link that packets arrived on")
Reported-by: Michal Soltys 
Signed-off-by: Mahesh Bandewar 
---
 drivers/net/bonding/bond_main.c | 21 +++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 9866fa959a19..e3157aed656a 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1178,9 +1178,26 @@ static rx_handler_result_t bond_handle_frame(struct 
sk_buff **pskb)
}
}
 
-   /* don't change skb->dev for link-local packets */
-   if (is_link_local_ether_addr(eth_hdr(skb)->h_dest))
+   /* Link-local multicast packets should be passed to the
+* stack on the link they arrive as well as pass them to the
+* bond-master device. These packets are mostly usable when
+* stack receives it with the link on which they arrive
+* (e.g. LLDP) they also must be available on master. Some of
+* the use cases include (but are not limited to): LLDP agents
+* that must be able to operate both on enslaved interfaces as
+* well as on bonds themselves; linux bridges that must be able
+* to process/pass BPDUs from attached bonds when any kind of
+* STP version is enabled on the network.
+*/
+   if (is_link_local_ether_addr(eth_hdr(skb)->h_dest)) {
+   struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
+
+   if (nskb) {
+   nskb->dev = bond->dev;
+   netif_rx(nskb);
+   }
return RX_HANDLER_PASS;
+   }
if (bond_should_deliver_exact_match(skb, slave, bond))
return RX_HANDLER_EXACT;
 
-- 
2.19.0.444.g18242da7ef-goog



[PATCH iproute2] iproute2: fix use-after-free

2018-09-12 Thread Mahesh Bandewar
From: Mahesh Bandewar 

A local program using iproute2 lib pointed out the issue and looking
at the code it is pretty obvious -

a = (struct nlmsghdr *)b;
...
free(b);
if (a->nlmsg_seq == seq)
...

Fixes: 86bf43c7c2fd ("lib/libnetlink: update rtnl_talk to support malloc buff 
at run time")
Signed-off-by: Mahesh Bandewar 
---
 lib/libnetlink.c | 15 +++
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/lib/libnetlink.c b/lib/libnetlink.c
index 928de1dd16d8..016a5f0bcfb6 100644
--- a/lib/libnetlink.c
+++ b/lib/libnetlink.c
@@ -661,17 +661,24 @@ next:
if (l < sizeof(struct nlmsgerr)) {
fprintf(stderr, "ERROR truncated\n");
} else if (!err->error) {
+   unsigned int tmp_seq;
+
/* check messages from kernel */
nl_dump_ext_ack(h, errfn);
 
-   if (answer)
+   tmp_seq = h->nlmsg_seq;
+   if (answer) {
*answer = (struct nlmsghdr 
*)buf;
-   else
+   } else {
free(buf);
-   if (h->nlmsg_seq == seq)
+   buf = NULL;
+   }
+   if (tmp_seq == seq) {
return 0;
-   else if (i < iovlen)
+   } else if (i < iovlen) {
+   free(buf);
goto next;
+   }
return 0;
}
 
-- 
2.19.0.rc2.392.g5ba43deb5a-goog



[PATCHv3 iproute2 0/2] clang + misc changes

2018-08-22 Thread Mahesh Bandewar
From: Mahesh Bandewar 

The primary theme is to make clang compile the iproute2 package without
warnings. Along with this there are two other misc patches in the series.

First patch uses the preferred_family when operating with maddr feature.
Prior to this patch, it would always open an AF_INET socket irrespective
of the family that is preferred via command-line. 

Second patch mostly adds format attributes to make the clang compiler
happy and not throw the warning messages.

Mahesh Bandewar (2):
  ipmaddr: use preferred_family when given
  iproute: make clang happy with iproute2 package

 include/json_writer.h |  3 +--
 ip/iplink_can.c   | 19 ---
 ip/ipmaddr.c  | 13 -
 lib/color.c   |  1 +
 lib/json_print.c  |  1 +
 lib/json_writer.c | 15 +--
 misc/ss.c |  3 ++-
 tc/m_ematch.c |  1 +
 tc/m_ematch.h |  1 +
 9 files changed, 32 insertions(+), 25 deletions(-)

-- 
2.18.0.1017.ga543ac7ca45-goog



[PATCHv3 iproute2 1/2] ipmaddr: use preferred_family when given

2018-08-22 Thread Mahesh Bandewar
From: Mahesh Bandewar 

When creating socket() AF_INET is used irrespective of the family
that is given at the command-line (with -4, -6, or -0). This change
will open the socket with the preferred family.

Signed-off-by: Mahesh Bandewar 
---
 ip/ipmaddr.c | 13 -
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/ip/ipmaddr.c b/ip/ipmaddr.c
index a48499029e17..abf83784d0df 100644
--- a/ip/ipmaddr.c
+++ b/ip/ipmaddr.c
@@ -289,6 +289,7 @@ static int multiaddr_list(int argc, char **argv)
 static int multiaddr_modify(int cmd, int argc, char **argv)
 {
struct ifreq ifr = {};
+   int family;
int fd;
 
if (cmd == RTM_NEWADDR)
@@ -324,7 +325,17 @@ static int multiaddr_modify(int cmd, int argc, char **argv)
exit(-1);
}
 
-   fd = socket(AF_INET, SOCK_DGRAM, 0);
+   switch (preferred_family) {
+   case AF_INET6:
+   case AF_PACKET:
+   case AF_INET:
+   family = preferred_family;
+   break;
+   default:
+   family = AF_INET;
+   }
+
+   fd = socket(family, SOCK_DGRAM, 0);
if (fd < 0) {
perror("Cannot create socket");
exit(1);
-- 
2.18.0.1017.ga543ac7ca45-goog



[PATCHv3 iproute2 2/2] iproute: make clang happy

2018-08-22 Thread Mahesh Bandewar
From: Mahesh Bandewar 

These are primarily fixes for "format string is not a string literal"
warnings / errors (with -Werror -Wformat-nonliteral). This should be a
no-op change. I had to replace a couple of print helper functions with
the code they call, as it was becoming harder to eliminate these
warnings; however, these helpers were used in only a couple of places,
so this is not a major change as such.

Signed-off-by: Mahesh Bandewar 
---
 include/json_writer.h |  3 +--
 ip/iplink_can.c   | 19 ---
 lib/color.c   |  1 +
 lib/json_print.c  |  1 +
 lib/json_writer.c | 15 +--
 misc/ss.c |  3 ++-
 tc/m_ematch.c |  1 +
 tc/m_ematch.h |  1 +
 8 files changed, 20 insertions(+), 24 deletions(-)

diff --git a/include/json_writer.h b/include/json_writer.h
index 9ab88e1dbdd9..0c8831c1136d 100644
--- a/include/json_writer.h
+++ b/include/json_writer.h
@@ -29,6 +29,7 @@ void jsonw_pretty(json_writer_t *self, bool on);
 void jsonw_name(json_writer_t *self, const char *name);
 
 /* Add value  */
+__attribute__((format(printf, 2, 3)))
 void jsonw_printf(json_writer_t *self, const char *fmt, ...);
 void jsonw_string(json_writer_t *self, const char *value);
 void jsonw_bool(json_writer_t *self, bool value);
@@ -59,8 +60,6 @@ void jsonw_luint_field(json_writer_t *self, const char *prop,
unsigned long int num);
 void jsonw_lluint_field(json_writer_t *self, const char *prop,
unsigned long long int num);
-void jsonw_float_field_fmt(json_writer_t *self, const char *prop,
-  const char *fmt, double val);
 
 /* Collections */
 void jsonw_start_object(json_writer_t *self);
diff --git a/ip/iplink_can.c b/ip/iplink_can.c
index 587413da15c4..c0deeb1f1fcf 100644
--- a/ip/iplink_can.c
+++ b/ip/iplink_can.c
@@ -316,11 +316,14 @@ static void can_print_opt(struct link_util *lu, FILE *f, 
struct rtattr *tb[])
struct can_bittiming *bt = RTA_DATA(tb[IFLA_CAN_BITTIMING]);
 
if (is_json_context()) {
+   json_writer_t *jw;
+
open_json_object("bittiming");
print_int(PRINT_ANY, "bitrate", NULL, bt->bitrate);
-   jsonw_float_field_fmt(get_json_writer(),
- "sample_point", "%.3f",
- (float) bt->sample_point / 1000.);
+   jw = get_json_writer();
+   jsonw_name(jw, "sample_point");
+   jsonw_printf(jw, "%.3f",
+(float) bt->sample_point / 1000);
print_int(PRINT_ANY, "tq", NULL, bt->tq);
print_int(PRINT_ANY, "prop_seg", NULL, bt->prop_seg);
print_int(PRINT_ANY, "phase_seg1",
@@ -415,12 +418,14 @@ static void can_print_opt(struct link_util *lu, FILE *f, 
struct rtattr *tb[])
RTA_DATA(tb[IFLA_CAN_DATA_BITTIMING]);
 
if (is_json_context()) {
+   json_writer_t *jw;
+
open_json_object("data_bittiming");
print_int(PRINT_JSON, "bitrate", NULL, dbt->bitrate);
-   jsonw_float_field_fmt(get_json_writer(),
- "sample_point",
- "%.3f",
- (float) dbt->sample_point / 
1000.);
+   jw = get_json_writer();
+   jsonw_name(jw, "sample_point");
+   jsonw_printf(jw, "%.3f",
+(float) dbt->sample_point / 1000.);
print_int(PRINT_JSON, "tq", NULL, dbt->tq);
print_int(PRINT_JSON, "prop_seg", NULL, dbt->prop_seg);
print_int(PRINT_JSON, "phase_seg1",
diff --git a/lib/color.c b/lib/color.c
index eaf69e74d673..e5406294dfc4 100644
--- a/lib/color.c
+++ b/lib/color.c
@@ -132,6 +132,7 @@ void set_color_palette(void)
is_dark_bg = 1;
 }
 
+__attribute__((format(printf, 3, 4)))
 int color_fprintf(FILE *fp, enum color_attr attr, const char *fmt, ...)
 {
int ret = 0;
diff --git a/lib/json_print.c b/lib/json_print.c
index 5dc41bfabfd4..77902824a738 100644
--- a/lib/json_print.c
+++ b/lib/json_print.c
@@ -100,6 +100,7 @@ void close_json_array(enum output_type type, const char 
*str)
  * functions handling different types
  */
 #define _PRINT_FUNC(type_name, type)   \
+   __attribute__((format(printf, 4, 0)))   \
void print_color_##type_name(enum output_type t,   

[PATCHv2 iproute2 3/3] iproute: make clang happy with iproute2 package

2018-08-21 Thread Mahesh Bandewar
From: Mahesh Bandewar 

These are primarily fixes for "format string is not a string literal"
warnings / errors (with -Werror -Wformat-nonliteral). This should be a
no-op change. I had to replace a couple of print helper functions with
the code they call, as it was becoming harder to eliminate these
warnings; however, these helpers were used in only a couple of places,
so this is not a major change as such.

Signed-off-by: Mahesh Bandewar 
---
 include/json_writer.h |  3 +--
 ip/iplink_can.c   | 19 ---
 lib/color.c   |  1 +
 lib/json_print.c  |  1 +
 lib/json_writer.c | 15 +--
 misc/ss.c |  3 ++-
 tc/m_ematch.c |  1 +
 tc/m_ematch.h |  1 +
 8 files changed, 20 insertions(+), 24 deletions(-)

diff --git a/include/json_writer.h b/include/json_writer.h
index 9ab88e1dbdd9..0c8831c1136d 100644
--- a/include/json_writer.h
+++ b/include/json_writer.h
@@ -29,6 +29,7 @@ void jsonw_pretty(json_writer_t *self, bool on);
 void jsonw_name(json_writer_t *self, const char *name);
 
 /* Add value  */
+__attribute__((format(printf, 2, 3)))
 void jsonw_printf(json_writer_t *self, const char *fmt, ...);
 void jsonw_string(json_writer_t *self, const char *value);
 void jsonw_bool(json_writer_t *self, bool value);
@@ -59,8 +60,6 @@ void jsonw_luint_field(json_writer_t *self, const char *prop,
unsigned long int num);
 void jsonw_lluint_field(json_writer_t *self, const char *prop,
unsigned long long int num);
-void jsonw_float_field_fmt(json_writer_t *self, const char *prop,
-  const char *fmt, double val);
 
 /* Collections */
 void jsonw_start_object(json_writer_t *self);
diff --git a/ip/iplink_can.c b/ip/iplink_can.c
index 587413da15c4..c0deeb1f1fcf 100644
--- a/ip/iplink_can.c
+++ b/ip/iplink_can.c
@@ -316,11 +316,14 @@ static void can_print_opt(struct link_util *lu, FILE *f, 
struct rtattr *tb[])
struct can_bittiming *bt = RTA_DATA(tb[IFLA_CAN_BITTIMING]);
 
if (is_json_context()) {
+   json_writer_t *jw;
+
open_json_object("bittiming");
print_int(PRINT_ANY, "bitrate", NULL, bt->bitrate);
-   jsonw_float_field_fmt(get_json_writer(),
- "sample_point", "%.3f",
- (float) bt->sample_point / 1000.);
+   jw = get_json_writer();
+   jsonw_name(jw, "sample_point");
+   jsonw_printf(jw, "%.3f",
+(float) bt->sample_point / 1000);
print_int(PRINT_ANY, "tq", NULL, bt->tq);
print_int(PRINT_ANY, "prop_seg", NULL, bt->prop_seg);
print_int(PRINT_ANY, "phase_seg1",
@@ -415,12 +418,14 @@ static void can_print_opt(struct link_util *lu, FILE *f, 
struct rtattr *tb[])
RTA_DATA(tb[IFLA_CAN_DATA_BITTIMING]);
 
if (is_json_context()) {
+   json_writer_t *jw;
+
open_json_object("data_bittiming");
print_int(PRINT_JSON, "bitrate", NULL, dbt->bitrate);
-   jsonw_float_field_fmt(get_json_writer(),
- "sample_point",
- "%.3f",
- (float) dbt->sample_point / 
1000.);
+   jw = get_json_writer();
+   jsonw_name(jw, "sample_point");
+   jsonw_printf(jw, "%.3f",
+(float) dbt->sample_point / 1000.);
print_int(PRINT_JSON, "tq", NULL, dbt->tq);
print_int(PRINT_JSON, "prop_seg", NULL, dbt->prop_seg);
print_int(PRINT_JSON, "phase_seg1",
diff --git a/lib/color.c b/lib/color.c
index eaf69e74d673..e5406294dfc4 100644
--- a/lib/color.c
+++ b/lib/color.c
@@ -132,6 +132,7 @@ void set_color_palette(void)
is_dark_bg = 1;
 }
 
+__attribute__((format(printf, 3, 4)))
 int color_fprintf(FILE *fp, enum color_attr attr, const char *fmt, ...)
 {
int ret = 0;
diff --git a/lib/json_print.c b/lib/json_print.c
index 5dc41bfabfd4..77902824a738 100644
--- a/lib/json_print.c
+++ b/lib/json_print.c
@@ -100,6 +100,7 @@ void close_json_array(enum output_type type, const char 
*str)
  * functions handling different types
  */
 #define _PRINT_FUNC(type_name, type)   \
+   __attribute__((format(printf, 4, 0)))   \
void print_color_##type_name(enum output_type t,   

[PATCHv2 iproute2 0/3] clang + misc changes

2018-08-21 Thread Mahesh Bandewar
From: Mahesh Bandewar 

The primary theme is to make clang compile the iproute2 package without
warnings. Along with this there are two other misc patches in the series.

First patch uses the preferred_family when operating with maddr feature.
Prior to this patch, it would always open an AF_INET socket irrespective
of the family that is preferred via command-line. 

Second patch just removes extern from the prototype declarations from
the m_ematch.h header file.

Third patch mostly adds format attributes to make the clang compiler
happy and not throw the warning messages.

Mahesh Bandewar (3):
  ipmaddr: use preferred_family when given
  tc: remove extern from prototype declarations
  iproute: make clang happy with iproute2 package

 include/json_writer.h |  3 +--
 ip/iplink_can.c   | 19 ---
 ip/ipmaddr.c  | 13 -
 lib/color.c   |  1 +
 lib/json_print.c  |  1 +
 lib/json_writer.c | 15 +--
 misc/ss.c |  3 ++-
 tc/m_ematch.c |  1 +
 tc/m_ematch.h | 15 ---
 9 files changed, 39 insertions(+), 32 deletions(-)

-- 
2.18.0.865.gffc8e1a3cd6-goog



[PATCHv2 iproute2 2/3] tc: remove extern from prototype declarations

2018-08-21 Thread Mahesh Bandewar
From: Mahesh Bandewar 

Signed-off-by: Mahesh Bandewar 
---
 tc/m_ematch.h | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/tc/m_ematch.h b/tc/m_ematch.h
index f634f19164fa..80b02cfad6cc 100644
--- a/tc/m_ematch.h
+++ b/tc/m_ematch.h
@@ -20,7 +20,7 @@ struct bstr
struct bstr *next;
 };
 
-extern struct bstr * bstr_alloc(const char *text);
+struct bstr * bstr_alloc(const char *text);
 
 static inline struct bstr * bstr_new(char *data, unsigned int len)
 {
@@ -51,8 +51,8 @@ static inline struct bstr *bstr_next(struct bstr *b)
return b->next;
 }
 
-extern unsigned long bstrtoul(const struct bstr *b);
-extern void bstr_print(FILE *fd, const struct bstr *b, int ascii);
+unsigned long bstrtoul(const struct bstr *b);
+void bstr_print(FILE *fd, const struct bstr *b, int ascii);
 
 
 struct ematch
@@ -79,7 +79,7 @@ static inline struct ematch * new_ematch(struct bstr *args, 
int inverted)
return e;
 }
 
-extern void print_ematch_tree(const struct ematch *tree);
+void print_ematch_tree(const struct ematch *tree);
 
 
 struct ematch_util
@@ -107,9 +107,9 @@ static inline int parse_layer(struct bstr *b)
return INT_MAX;
 }
 
-extern int em_parse_error(int err, struct bstr *args, struct bstr *carg,
+int em_parse_error(int err, struct bstr *args, struct bstr *carg,
   struct ematch_util *, char *fmt, ...);
-extern int print_ematch(FILE *, const struct rtattr *);
-extern int parse_ematch(int *, char ***, int, struct nlmsghdr *);
+int print_ematch(FILE *, const struct rtattr *);
+int parse_ematch(int *, char ***, int, struct nlmsghdr *);
 
 #endif
-- 
2.18.0.865.gffc8e1a3cd6-goog



[PATCHv2 iproute2 1/3] ipmaddr: use preferred_family when given

2018-08-21 Thread Mahesh Bandewar
From: Mahesh Bandewar 

When creating socket() AF_INET is used irrespective of the family
that is given at the command-line (with -4, -6, or -0). This change
will open the socket with the preferred family.

Signed-off-by: Mahesh Bandewar 
---
 ip/ipmaddr.c | 13 -
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/ip/ipmaddr.c b/ip/ipmaddr.c
index a48499029e17..abf83784d0df 100644
--- a/ip/ipmaddr.c
+++ b/ip/ipmaddr.c
@@ -289,6 +289,7 @@ static int multiaddr_list(int argc, char **argv)
 static int multiaddr_modify(int cmd, int argc, char **argv)
 {
struct ifreq ifr = {};
+   int family;
int fd;
 
if (cmd == RTM_NEWADDR)
@@ -324,7 +325,17 @@ static int multiaddr_modify(int cmd, int argc, char **argv)
exit(-1);
}
 
-   fd = socket(AF_INET, SOCK_DGRAM, 0);
+   switch (preferred_family) {
+   case AF_INET6:
+   case AF_PACKET:
+   case AF_INET:
+   family = preferred_family;
+   break;
+   default:
+   family = AF_INET;
+   }
+
+   fd = socket(family, SOCK_DGRAM, 0);
if (fd < 0) {
perror("Cannot create socket");
exit(1);
-- 
2.18.0.865.gffc8e1a3cd6-goog



[PATCH iproute2] iproute: make clang happy

2018-08-20 Thread Mahesh Bandewar
From: Mahesh Bandewar 

These are primarily fixes for "format string is not a string literal"
warnings / errors (with -Werror -Wformat-nonliteral). This should be a
no-op change. I had to replace a couple of print helper functions with
the code they call, as it was becoming harder to eliminate these
warnings; however, these helpers were used in only a couple of places,
so this is not a major change as such.

Signed-off-by: Mahesh Bandewar 
---
 include/json_writer.h |  2 --
 ip/iplink_can.c   | 19 ---
 lib/color.c   |  1 +
 lib/json_print.c  |  1 +
 lib/json_writer.c | 15 +--
 misc/ss.c |  3 ++-
 tc/m_ematch.c |  1 +
 7 files changed, 18 insertions(+), 24 deletions(-)

diff --git a/include/json_writer.h b/include/json_writer.h
index 9ab88e1dbdd9..a111cc62e7b2 100644
--- a/include/json_writer.h
+++ b/include/json_writer.h
@@ -59,8 +59,6 @@ void jsonw_luint_field(json_writer_t *self, const char *prop,
unsigned long int num);
 void jsonw_lluint_field(json_writer_t *self, const char *prop,
unsigned long long int num);
-void jsonw_float_field_fmt(json_writer_t *self, const char *prop,
-  const char *fmt, double val);
 
 /* Collections */
 void jsonw_start_object(json_writer_t *self);
diff --git a/ip/iplink_can.c b/ip/iplink_can.c
index 587413da15c4..c0deeb1f1fcf 100644
--- a/ip/iplink_can.c
+++ b/ip/iplink_can.c
@@ -316,11 +316,14 @@ static void can_print_opt(struct link_util *lu, FILE *f, 
struct rtattr *tb[])
struct can_bittiming *bt = RTA_DATA(tb[IFLA_CAN_BITTIMING]);
 
if (is_json_context()) {
+   json_writer_t *jw;
+
open_json_object("bittiming");
print_int(PRINT_ANY, "bitrate", NULL, bt->bitrate);
-   jsonw_float_field_fmt(get_json_writer(),
- "sample_point", "%.3f",
- (float) bt->sample_point / 1000.);
+   jw = get_json_writer();
+   jsonw_name(jw, "sample_point");
+   jsonw_printf(jw, "%.3f",
+(float) bt->sample_point / 1000);
print_int(PRINT_ANY, "tq", NULL, bt->tq);
print_int(PRINT_ANY, "prop_seg", NULL, bt->prop_seg);
print_int(PRINT_ANY, "phase_seg1",
@@ -415,12 +418,14 @@ static void can_print_opt(struct link_util *lu, FILE *f, 
struct rtattr *tb[])
RTA_DATA(tb[IFLA_CAN_DATA_BITTIMING]);
 
if (is_json_context()) {
+   json_writer_t *jw;
+
open_json_object("data_bittiming");
print_int(PRINT_JSON, "bitrate", NULL, dbt->bitrate);
-   jsonw_float_field_fmt(get_json_writer(),
- "sample_point",
- "%.3f",
- (float) dbt->sample_point / 
1000.);
+   jw = get_json_writer();
+   jsonw_name(jw, "sample_point");
+   jsonw_printf(jw, "%.3f",
+(float) dbt->sample_point / 1000.);
print_int(PRINT_JSON, "tq", NULL, dbt->tq);
print_int(PRINT_JSON, "prop_seg", NULL, dbt->prop_seg);
print_int(PRINT_JSON, "phase_seg1",
diff --git a/lib/color.c b/lib/color.c
index eaf69e74d673..e5406294dfc4 100644
--- a/lib/color.c
+++ b/lib/color.c
@@ -132,6 +132,7 @@ void set_color_palette(void)
is_dark_bg = 1;
 }
 
+__attribute__((format(printf, 3, 4)))
 int color_fprintf(FILE *fp, enum color_attr attr, const char *fmt, ...)
 {
int ret = 0;
diff --git a/lib/json_print.c b/lib/json_print.c
index 5dc41bfabfd4..77902824a738 100644
--- a/lib/json_print.c
+++ b/lib/json_print.c
@@ -100,6 +100,7 @@ void close_json_array(enum output_type type, const char 
*str)
  * functions handling different types
  */
 #define _PRINT_FUNC(type_name, type)   \
+   __attribute__((format(printf, 4, 0)))   \
void print_color_##type_name(enum output_type t,\
 enum color_attr color, \
 const char *key,   \
diff --git a/lib/json_writer.c b/lib/json_writer.c
index aa9ce1c65e51..68890b34ee92 100644
--- a/lib/json_writer.c
+++ b/lib/json_writer.c
@@ -152,6 +152,7 @@ void jsonw_name(json_writer_t *self, const char *name)
   

[PATCH iproute2] ipmaddr: use preferred_family when given

2018-08-15 Thread Mahesh Bandewar
From: Mahesh Bandewar 

When creating socket() AF_INET is used irrespective of the family
that is given at the command-line (with -4, -6, or -0). This change
will open the socket with the preferred family.

Signed-off-by: Mahesh Bandewar 
---
 ip/ipmaddr.c | 13 -
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/ip/ipmaddr.c b/ip/ipmaddr.c
index a48499029e17..abf83784d0df 100644
--- a/ip/ipmaddr.c
+++ b/ip/ipmaddr.c
@@ -289,6 +289,7 @@ static int multiaddr_list(int argc, char **argv)
 static int multiaddr_modify(int cmd, int argc, char **argv)
 {
struct ifreq ifr = {};
+   int family;
int fd;
 
if (cmd == RTM_NEWADDR)
@@ -324,7 +325,17 @@ static int multiaddr_modify(int cmd, int argc, char **argv)
exit(-1);
}
 
-   fd = socket(AF_INET, SOCK_DGRAM, 0);
+   switch (preferred_family) {
+   case AF_INET6:
+   case AF_PACKET:
+   case AF_INET:
+   family = preferred_family;
+   break;
+   default:
+   family = AF_INET;
+   }
+
+   fd = socket(family, SOCK_DGRAM, 0);
if (fd < 0) {
perror("Cannot create socket");
exit(1);
-- 
2.18.0.865.gffc8e1a3cd6-goog



[PATCH net v2] bonding: pass link-local packets to bonding master also.

2018-07-18 Thread Mahesh Bandewar
From: Mahesh Bandewar 

Commit b89f04c61efe ("bonding: deliver link-local packets with
skb->dev set to link that packets arrived on") changed the behavior
of how link-local-multicast packets are processed. The change in
the behavior broke some legacy use cases where these packets are
expected to arrive on bonding master device also.

This patch passes the packet to the stack with the link it arrived
on as well as passes to the bonding-master device to preserve the
legacy use case.

Fixes: b89f04c61efe ("bonding: deliver link-local packets with skb->dev set to 
link that packets arrived on")
Reported-by: Michal Soltys 
Signed-off-by: Mahesh Bandewar 
---
v2: Added Fixes tag.
v1: Initial patch.
 drivers/net/bonding/bond_main.c | 17 +++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 9a2ea3c1f949..1d3b7d8448f2 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1177,9 +1177,22 @@ static rx_handler_result_t bond_handle_frame(struct 
sk_buff **pskb)
}
}
 
-   /* don't change skb->dev for link-local packets */
-   if (is_link_local_ether_addr(eth_hdr(skb)->h_dest))
+   /* Link-local multicast packets should be passed to the
+* stack on the link they arrive as well as pass them to the
+* bond-master device. These packets are mostly usable when
+* stack receives it with the link on which they arrive
+* (e.g. LLDP) but there may be some legacy behavior that
+* expects these packets to appear on bonding master too.
+*/
+   if (is_link_local_ether_addr(eth_hdr(skb)->h_dest)) {
+   struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
+
+   if (nskb) {
+   nskb->dev = bond->dev;
+   netif_rx(nskb);
+   }
return RX_HANDLER_PASS;
+   }
if (bond_should_deliver_exact_match(skb, slave, bond))
return RX_HANDLER_EXACT;
 
-- 
2.18.0.203.gfac676dfb9-goog



[PATCH next v2] bonding: pass link-local packets to bonding master also.

2018-07-18 Thread Mahesh Bandewar
From: Mahesh Bandewar 

Commit b89f04c61efe ("bonding: deliver link-local packets with
skb->dev set to link that packets arrived on") changed the behavior
of how link-local-multicast packets are processed. The change in
the behavior broke some legacy use cases where these packets are
expected to arrive on bonding master device also.

This patch passes the packet to the stack with the link it arrived
on as well as passes to the bonding-master device to preserve the
legacy use case.

Fixes: b89f04c61efe ("bonding: deliver link-local packets with skb->dev set to 
link that packets arrived on")
Reported-by: Michal Soltys 
Signed-off-by: Mahesh Bandewar 
---
v2: Added Fixes tag.
v1: Initial patch.
 drivers/net/bonding/bond_main.c | 17 +++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 9a2ea3c1f949..1d3b7d8448f2 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1177,9 +1177,22 @@ static rx_handler_result_t bond_handle_frame(struct 
sk_buff **pskb)
}
}
 
-   /* don't change skb->dev for link-local packets */
-   if (is_link_local_ether_addr(eth_hdr(skb)->h_dest))
+   /* Link-local multicast packets should be passed to the
+* stack on the link they arrive as well as pass them to the
+* bond-master device. These packets are mostly usable when
+* stack receives it with the link on which they arrive
+* (e.g. LLDP) but there may be some legacy behavior that
+* expects these packets to appear on bonding master too.
+*/
+   if (is_link_local_ether_addr(eth_hdr(skb)->h_dest)) {
+   struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
+
+   if (nskb) {
+   nskb->dev = bond->dev;
+   netif_rx(nskb);
+   }
return RX_HANDLER_PASS;
+   }
if (bond_should_deliver_exact_match(skb, slave, bond))
return RX_HANDLER_EXACT;
 
-- 
2.18.0.203.gfac676dfb9-goog



[PATCH next] bonding: pass link-local packets to bonding master also.

2018-07-15 Thread Mahesh Bandewar
From: Mahesh Bandewar 

Commit b89f04c61efe ("bonding: deliver link-local packets with
skb->dev set to link that packets arrived on") changed the behavior
of how link-local-multicast packets are processed. The change in
the behavior broke some legacy use cases where these packets are
expected to arrive on bonding master device also.

This patch passes the packet to the stack with the link it arrived
on as well as passes to the bonding-master device to preserve the
legacy use case.

Reported-by: Michal Soltys 
Signed-off-by: Mahesh Bandewar 
---
 drivers/net/bonding/bond_main.c | 17 +++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 9a2ea3c1f949..1d3b7d8448f2 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1177,9 +1177,22 @@ static rx_handler_result_t bond_handle_frame(struct 
sk_buff **pskb)
}
}
 
-   /* don't change skb->dev for link-local packets */
-   if (is_link_local_ether_addr(eth_hdr(skb)->h_dest))
+   /* Link-local multicast packets should be passed to the
+* stack on the link they arrive as well as pass them to the
+* bond-master device. These packets are mostly usable when
+* stack receives it with the link on which they arrive
+* (e.g. LLDP) but there may be some legacy behavior that
+* expects these packets to appear on bonding master too.
+*/
+   if (is_link_local_ether_addr(eth_hdr(skb)->h_dest)) {
+   struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
+
+   if (nskb) {
+   nskb->dev = bond->dev;
+   netif_rx(nskb);
+   }
return RX_HANDLER_PASS;
+   }
if (bond_should_deliver_exact_match(skb, slave, bond))
return RX_HANDLER_EXACT;
 
-- 
2.18.0.203.gfac676dfb9-goog



[PATCHv4 1/2] capability: introduce sysctl for controlled user-ns capability whitelist

2018-01-02 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

Add a sysctl variable kernel.controlled_userns_caps_whitelist. Capability
mask is stored in kernel as kernel_cap_t type (array of u32). This sysctl
takes input as comma separated hex u32 words. For simplicity one could
see this sysctl to operate on string inputs. However the value is not
expected to change that often during the life of a kernel-boot. It makes
more sense to use the widely available API instead of bringing another
string manipulation for the purpose of making this simpler.

The default value set (for kernel.controlled_userns_caps_whitelist) is
CAP_FULL_SET indicating that no capability is controlled by default to
maintain compatibility with the existing behavior of user-ns. Administrator
will have to modify this sysctl to control any capability as such. e.g. to
control CAP_NET_RAW the mask need to be changed like -

  # sysctl -q kernel.controlled_userns_caps_whitelist
  kernel.controlled_userns_caps_whitelist = 1f,ffffffff
  # sysctl -w kernel.controlled_userns_caps_whitelist=1f,ffffdfff
  kernel.controlled_userns_caps_whitelist = 1f,ffffdfff

For bit-to-mask conversion please check include/uapi/linux/capability.h
file.

Any capabilities that are not part of this mask will be controlled and
will not be allowed to processes in controlled user-ns. In above example
CAP_NET_RAW will not be available to controlled-user-namespaces.

Acked-by: Serge Hallyn <se...@hallyn.com>
Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
v4:
  commit message changes.
v3:
  Added couple of comments as requested by Serge Hallyn
v2:
  Rebase
v1:
  Initial submission

 Documentation/sysctl/kernel.txt | 21 ++
 include/linux/capability.h  |  3 +++
 kernel/capability.c | 47 +
 kernel/sysctl.c |  5 +
 4 files changed, 76 insertions(+)

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 694968c7523c..6aa1e087afee 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -25,6 +25,7 @@ show up in /proc/sys/kernel:
 - bootloader_version[ X86 only ]
 - callhome  [ S390 only ]
 - cap_last_cap
+- controlled_userns_caps_whitelist
 - core_pattern
 - core_pipe_limit
 - core_uses_pid
@@ -187,6 +188,26 @@ CAP_LAST_CAP from the kernel.
 
 ==
 
+controlled_userns_caps_whitelist
+
+Capability mask that is whitelisted for "controlled" user namespaces.
+Any capability that is missing from this mask will not be allowed to
+any process that is attached to a controlled-userns. e.g. if CAP_NET_RAW
+is not part of this mask, then processes running inside any controlled
+userns's will not be allowed to perform action that needs CAP_NET_RAW
+capability. However, processes that are attached to a parent user-ns
+hierarchy that is *not* controlled and has CAP_NET_RAW can continue
+performing those actions. User-namespaces are marked "controlled" at
+the time of their creation based on the capabilities of the creator.
+A process that does not have CAP_SYS_ADMIN will create user-namespaces
+that are controlled.
+
+The value is expressed as two comma separated hex words (u32). This
+sysctl is available in init-ns and users with CAP_SYS_ADMIN in init-ns
+are allowed to make changes.
+
+==
+
 core_pattern:
 
 core_pattern is used to specify a core dumpfile pattern name.
diff --git a/include/linux/capability.h b/include/linux/capability.h
index f640dcbc880c..7d79a4689625 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -14,6 +14,7 @@
 #define _LINUX_CAPABILITY_H
 
 #include 
+#include 
 
 
 #define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3
@@ -248,6 +249,8 @@ extern bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns);
 
 /* audit system wants to get cap info from files as well */
 extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct 
cpu_vfs_cap_data *cpu_caps);
+int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
+void __user *buff, size_t *lenp, loff_t *ppos);
 
 extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t 
size);
 
diff --git a/kernel/capability.c b/kernel/capability.c
index 1e1c0236f55b..4a859b7d4902 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -29,6 +29,8 @@ EXPORT_SYMBOL(__cap_empty_set);
 
 int file_caps_enabled = 1;
 
+kernel_cap_t controlled_userns_caps_whitelist = CAP_FULL_SET;
+
 static int __init file_caps_disable(char *str)
 {
file_caps_enabled = 0;
@@ -507,3 +509,48 @@ bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns)
rcu_read_unlock();
return (ret == 0);
 }
+
+/* Controlled-userns capabilities routines */
+#ifdef CONFIG_SYSCTL
+i

[PATCHv4 0/2] capability controlled user-namespaces

2018-01-02 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

TL;DR version
-
Creating a sandbox environment with namespaces is challenging
considering what these sandboxed processes can engage into. e.g.
CVE-2017-6074, CVE-2017-7184, CVE-2017-7308 etc. just to name few.
Current form of user-namespaces, however, if changed a bit can allow
us to create a sandbox environment without locking down user-
namespaces.

Detailed version


Problem
---
User-namespaces in the current form have increased the attack surface as
any process can acquire capabilities which are not available to them (by
default) by performing combination of clone()/unshare()/setns() syscalls.

#define _GNU_SOURCE
#include <stdio.h>
#include <sched.h>
#include <netinet/in.h>

int main(int ac, char **av)
{
int sock = -1;

printf("Attempting to open RAW socket before unshare()...\n");
sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW);
if (sock < 0) {
perror("socket() SOCK_RAW failed: ");
} else {
printf("Successfully opened RAW-Sock before unshare().\n");
close(sock);
sock = -1;
}

if (unshare(CLONE_NEWUSER | CLONE_NEWNET) < 0) {
perror("unshare() failed: ");
return 1;
}

printf("Attempting to open RAW socket after unshare()...\n");
sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW);
if (sock < 0) {
perror("socket() SOCK_RAW failed: ");
} else {
printf("Successfully opened RAW-Sock after unshare().\n");
close(sock);
sock = -1;
}

return 0;
}

The above example shows how easy it is to acquire NET_RAW capabilities
and once acquired, these processes could take benefit of above mentioned
or similar issues discovered/undiscovered with malicious intent. Note
that this is just an example and the problem/solution is not limited
to NET_RAW capability *only*. 

The easiest fix one can apply here is to lock-down user-namespaces which
many of the distros do (i.e. don't allow users to create user namespaces),
but unfortunately that prevents everyone from using them.

Approach

Introduce a notion of 'controlled' user-namespaces. Every process on
the host is allowed to create user-namespaces (governed by the limit
imposed by per-ns sysctl) however, mark user-namespaces created by
sandboxed processes as 'controlled'. Use this 'mark' at the time of
capability check in conjunction with a global capability whitelist.
If the capability is not whitelisted, processes that belong to 
controlled user-namespaces will not be allowed.

Processes that do not have CAP_SYS_ADMIN in init-ns can *only* create
controlled user-namespaces. In other words, user-namespaces created by
privileged processes (those which have CAP_SYS_ADMIN in init-ns) are
not controlled. A hierarchy underneath any controlled user-ns is always
controlled.

A global whitelist is list of capabilities governed by a sysctl
(kernel.controlled_userns_caps_whitelist) which is available to
(privileged) user in init-ns to modify while it's applicable to all
controlled user-namespaces on the host irrespective of when that user-ns
was created.

Marking user-namespaces controlled without modifying the whitelist is
equivalent of the current behavior. The default value of whitelist includes
all capabilities so that the compatibility is maintained. However it gives
admins fine-grained ability to control various capabilities system wide
without locking down user-namespaces.

Example
---
Here is the example that demonstrates the behavior of a kernel that has
this patch-set applied. It uses the same c-code from this commit-log and
is called acquire_raw.c -

(a) The 'root' user has all the capabilities all the time (before and
after taking capability).

root@vm0:~# id
uid=0(root) gid=0(root) groups=0(root)

root@vm0:~# sysctl -q kernel.controlled_userns_caps_whitelist 
kernel.controlled_userns_caps_whitelist = 1f,ffffffff

root@vm0:~# ./acquire_raw 
Attempting to open RAW socket before unshare()...
Successfully opened RAW-Sock before unshare().
Attempting to open RAW socket after unshare()...
Successfully opened RAW-Sock after unshare().

root@vm0:~# sysctl -w kernel.controlled_userns_caps_whitelist=1f,ffffdfff
kernel.controlled_userns_caps_whitelist = 1f,ffffdfff

root@vm0:~# ./acquire_raw 
Attempting to open RAW socket before unshare()...
Successfully opened RAW-Sock before unshare().
Attempting to open RAW socket after unshare()...
Successfully opened RAW-Sock after unshare().

(b) Unprivileged user cannot change the mask.

mahesh@vm0:~$ id
uid=1000(mahesh) gid=1000(mahesh)

groups=1000(mahesh),4(adm),24(cdrom),27(sudo),30(dip),46(plugdev),118(lpadmin),128(sambashare)

mahesh@vm0:~$ sysctl -q kernel.contro

[PATCHv4 2/2] userns: control capabilities of some user namespaces

2018-01-02 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

With this new notion of "controlled" user-namespaces, the controlled
user-namespaces are marked at the time of their creation while the
capabilities of processes that belong to them are controlled using the
global mask.

Init-user-ns is always uncontrolled and a process that has SYS_ADMIN
that belongs to uncontrolled user-ns can create another (child) user-
namespace that is uncontrolled. Any other process (that either does
not have SYS_ADMIN or belongs to a controlled user-ns) can only
create a user-ns that is controlled.

global-capability-whitelist (controlled_userns_caps_whitelist) is used
at the capability check-time and keeps the semantics for the processes
that belong to uncontrolled user-ns as it is. Processes that belong to
controlled user-ns however are subjected to different checks-

   (a) if the capability in question is controlled and process belongs
   to controlled user-ns, then it's always denied.
   (b) if the capability in question is NOT controlled then fall back
   to the traditional check.

Acked-by: Serge Hallyn <se...@hallyn.com>
Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
v4:
  Rebase
v3:
  Rebase
v2:
  Don't recalculate user-ns flags for every setns() call.
v1:
  Initial submission.

 include/linux/capability.h |  4 
 include/linux/user_namespace.h | 25 +
 kernel/capability.c|  5 +
 kernel/user_namespace.c|  4 
 security/commoncap.c   |  8 
 5 files changed, 46 insertions(+)

diff --git a/include/linux/capability.h b/include/linux/capability.h
index 7d79a4689625..383f31f066f0 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -251,6 +251,10 @@ extern bool ptracer_capable(struct task_struct *tsk, 
struct user_namespace *ns);
 extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct 
cpu_vfs_cap_data *cpu_caps);
 int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
 void __user *buff, size_t *lenp, loff_t *ppos);
+/* Controlled capability is capability that is missing from the capability-mask
+ * controlled_userns_caps_whitelist controlled via sysctl.
+ */
+bool is_capability_controlled(int cap);
 
 extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t 
size);
 
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index d6b74b91096b..a5c48684b317 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -32,6 +32,7 @@ struct uid_gid_map { /* 64 bytes -- 1 cache line */
 };
 
 #define USERNS_SETGROUPS_ALLOWED 1UL
+#define USERNS_CONTROLLED   2UL
 
 #define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED
 
@@ -112,6 +113,21 @@ static inline void put_user_ns(struct user_namespace *ns)
__put_user_ns(ns);
 }
 
+/* Controlled user-ns is the one that is created by a process that does not
+ * have CAP_SYS_ADMIN (or descended from such an user-ns).
+ * For more details please see the sysctl description of
+ * controlled_userns_caps_whitelist.
+ */
+static inline bool is_user_ns_controlled(const struct user_namespace *ns)
+{
+   return ns->flags & USERNS_CONTROLLED;
+}
+
+static inline void mark_user_ns_controlled(struct user_namespace *ns)
+{
+   ns->flags |= USERNS_CONTROLLED;
+}
+
 struct seq_operations;
 extern const struct seq_operations proc_uid_seq_operations;
 extern const struct seq_operations proc_gid_seq_operations;
@@ -170,6 +186,15 @@ static inline struct ns_common *ns_get_owner(struct 
ns_common *ns)
 {
return ERR_PTR(-EPERM);
 }
+
+static inline bool is_user_ns_controlled(const struct user_namespace *ns)
+{
+   return false;
+}
+
+static inline void mark_user_ns_controlled(struct user_namespace *ns)
+{
+}
 #endif
 
 #endif /* _LINUX_USER_H */
diff --git a/kernel/capability.c b/kernel/capability.c
index 4a859b7d4902..bffe249922de 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -511,6 +511,11 @@ bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns)
 }
 
 /* Controlled-userns capabilities routines */
+bool is_capability_controlled(int cap)
+{
+   return !cap_raised(controlled_userns_caps_whitelist, cap);
+}
+
 #ifdef CONFIG_SYSCTL
 int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
 void __user *buff, size_t *lenp, loff_t *ppos)
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 246d4d4ce5c7..ca0556d466b6 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -141,6 +141,10 @@ int create_user_ns(struct cred *new)
goto fail_keyring;
 
set_cred_user_ns(new, ns);
+   if (!ns_capable(parent_ns, CAP_SYS_ADMIN) ||
+   is_user_ns_controlled(parent_ns))
+   mark_user_ns_controlled(ns);
+
return 0;
 fail_keyring:
 #ifdef CONFIG_PERSISTENT_KEYRIN

[PATCH next 1/2] Revert "ipvlan: add L2 check for packets arriving via virtual devices"

2017-12-13 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

This reverts commit 92ff42645028fa6f9b8aa767718457b9264316b4.

Even though the check added is not that taxing, it's not really needed.
First of all this will be per packet cost and second thing is that the
eth_type_trans() already does this correctly. The excessive scrubbing
in IPvlan was changing the pkt-type skb metadata of the packet which
made it necessary to re-check the mac. The subsequent patch in this
series removes the faulty packet-scrub.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 drivers/net/ipvlan/ipvlan_core.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c
index 0bc7f721b717..9774c96ac7bb 100644
--- a/drivers/net/ipvlan/ipvlan_core.c
+++ b/drivers/net/ipvlan/ipvlan_core.c
@@ -322,10 +322,6 @@ static int ipvlan_rcv_frame(struct ipvl_addr *addr, struct 
sk_buff **pskb,
if (dev_forward_skb(ipvlan->dev, skb) == NET_RX_SUCCESS)
success = true;
} else {
-   if (!ether_addr_equal_64bits(eth_hdr(skb)->h_dest,
-ipvlan->phy_dev->dev_addr))
-   skb->pkt_type = PACKET_OTHERHOST;
-
ret = RX_HANDLER_ANOTHER;
success = true;
}
-- 
2.15.1.424.g9478a66081-goog



[PATCH next 0/2] ipvlan: packet scrub

2017-12-13 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

While crossing namespace boundary IPvlan aggressively scrubs packets.
This is creating problems. First thing is that scrubbing changes the 
packet type in skb meta-data to PACKET_HOST. This causes erroneous
packet delivery when dev_forward_skb() has already marked the packet
type as OTHER_HOST.

On the egress side scrubbing just before calling dev_queue_xmit()
creates another set of problems. Scrubbing removes skb->sk so the
prio update gets missed and more seriously, socket back-pressure
fails making TSQ not function correctly.

The first patch in the series just reverts the earlier change which
was adding a mac-check, but that is unnecessary if packet_type that
dev_forward_skb() has set is honored. The second patch removes two of
the scrubs which are causing problems described above.


Mahesh Bandewar (2):
  Revert "ipvlan: add L2 check for packets arriving via virtual devices"
  ipvlan: remove excessive packet scrubbing

 drivers/net/ipvlan/ipvlan_core.c | 8 ++--
 1 file changed, 2 insertions(+), 6 deletions(-)

-- 
2.15.1.424.g9478a66081-goog



[PATCH next 2/2] ipvlan: remove excessive packet scrubbing

2017-12-13 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

IPvlan currently scrubs packets at every location where packets may be
crossing namespace boundary. Though this is desirable, currently IPvlan
does it more than necessary. e.g. packets that are going to take
dev_forward_skb() path will get scrubbed so no point in scrubbing them
before forwarding. Another side-effect of scrubbing is that pkt-type gets
set to PACKET_HOST, which overrides what had already been set by the
earlier path, causing erroneous delivery of the packets.

Also scrubbing packets just before calling dev_queue_xmit() has detrimental
effects since packets lose skb->sk and because of that miss prio updates,
incorrect socket back-pressure and would even break TSQ.

Fixes: b93dd49c1a35 ('ipvlan: Scrub skb before crossing the namespace boundary')
Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 drivers/net/ipvlan/ipvlan_core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c
index 9774c96ac7bb..c1f008fe4e1d 100644
--- a/drivers/net/ipvlan/ipvlan_core.c
+++ b/drivers/net/ipvlan/ipvlan_core.c
@@ -315,13 +315,13 @@ static int ipvlan_rcv_frame(struct ipvl_addr *addr, 
struct sk_buff **pskb,
 
*pskb = skb;
}
-   ipvlan_skb_crossing_ns(skb, dev);
 
if (local) {
skb->pkt_type = PACKET_HOST;
if (dev_forward_skb(ipvlan->dev, skb) == NET_RX_SUCCESS)
success = true;
} else {
+   skb->dev = dev;
ret = RX_HANDLER_ANOTHER;
success = true;
}
@@ -586,7 +586,7 @@ static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct 
net_device *dev)
return NET_XMIT_SUCCESS;
}
 
-   ipvlan_skb_crossing_ns(skb, ipvlan->phy_dev);
+   skb->dev = ipvlan->phy_dev;
return dev_queue_xmit(skb);
 }
 
-- 
2.15.1.424.g9478a66081-goog



[PATCH next] ipvlan: add L2 check for packets arriving via virtual devices

2017-12-07 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

Packets that don't have dest mac as the mac of the master device should
not be entertained by the IPvlan rx-handler. This is mostly true as the
packet path mostly takes care of that, except when the master device is
a virtual device. As demonstrated in the following case -

  ip netns add ns1
  ip link add ve1 type veth peer name ve2
  ip link add link ve2 name iv1 type ipvlan mode l2
  ip link set dev iv1 netns ns1
  ip link set ve1 up
  ip link set ve2 up
  ip -n ns1 link set iv1 up
  ip addr add 192.168.10.1/24 dev ve1
  ip -n ns1 addr add 192.168.10.2/24 dev iv1
  ping -c2 192.168.10.2
  <Works!>
  ip neigh show dev ve1
  ip neigh show 192.168.10.2 lladdr <random> dev ve1
  ping -c2 192.168.10.2
  <Still works! Wrong!!>

This patch adds that missing check in the IPvlan rx-handler.

Reported-by: Amit Sikka <amit.si...@ericsson.com>
Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 drivers/net/ipvlan/ipvlan_core.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c
index 9774c96ac7bb..0bc7f721b717 100644
--- a/drivers/net/ipvlan/ipvlan_core.c
+++ b/drivers/net/ipvlan/ipvlan_core.c
@@ -322,6 +322,10 @@ static int ipvlan_rcv_frame(struct ipvl_addr *addr, struct 
sk_buff **pskb,
if (dev_forward_skb(ipvlan->dev, skb) == NET_RX_SUCCESS)
success = true;
} else {
+   if (!ether_addr_equal_64bits(eth_hdr(skb)->h_dest,
+ipvlan->phy_dev->dev_addr))
+   skb->pkt_type = PACKET_OTHERHOST;
+
ret = RX_HANDLER_ANOTHER;
success = true;
}
-- 
2.15.1.424.g9478a66081-goog



[PATCHv3 0/2] capability controlled user-namespaces

2017-12-05 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

TL;DR version
-
Creating a sandbox environment with namespaces is challenging
considering what these sandboxed processes can engage into. e.g.
CVE-2017-6074, CVE-2017-7184, CVE-2017-7308 etc. just to name few.
Current form of user-namespaces, however, if changed a bit can allow
us to create a sandbox environment without locking down user-
namespaces.

Detailed version


Problem
---
User-namespaces in the current form have increased the attack surface as
any process can acquire capabilities which are not available to them (by
default) by performing combination of clone()/unshare()/setns() syscalls.

#define _GNU_SOURCE
#include <stdio.h>
#include <sched.h>
#include <netinet/in.h>

int main(int ac, char **av)
{
int sock = -1;

printf("Attempting to open RAW socket before unshare()...\n");
sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW);
if (sock < 0) {
perror("socket() SOCK_RAW failed: ");
} else {
printf("Successfully opened RAW-Sock before unshare().\n");
close(sock);
sock = -1;
}

if (unshare(CLONE_NEWUSER | CLONE_NEWNET) < 0) {
perror("unshare() failed: ");
return 1;
}

printf("Attempting to open RAW socket after unshare()...\n");
sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW);
if (sock < 0) {
perror("socket() SOCK_RAW failed: ");
} else {
printf("Successfully opened RAW-Sock after unshare().\n");
close(sock);
sock = -1;
}

return 0;
}

The above example shows how easy it is to acquire NET_RAW capabilities
and once acquired, these processes could take benefit of above mentioned
or similar issues discovered/undiscovered with malicious intent. Note
that this is just an example and the problem/solution is not limited
to NET_RAW capability *only*. 

The easiest fix one can apply here is to lock-down user-namespaces which
many of the distros do (i.e. don't allow users to create user namespaces),
but unfortunately that prevents everyone from using them.

Approach

Introduce a notion of 'controlled' user-namespaces. Every process on
the host is allowed to create user-namespaces (governed by the limit
imposed by per-ns sysctl) however, mark user-namespaces created by
sandboxed processes as 'controlled'. Use this 'mark' at the time of
capability check in conjunction with a global capability whitelist.
If the capability is not whitelisted, processes that belong to 
controlled user-namespaces will not be allowed.

Once a user-ns is marked as 'controlled'; all its child user-
namespaces are marked as 'controlled' too.

A global whitelist is list of capabilities governed by the
sysctl which is available to (privileged) user in init-ns to modify
while it's applicable to all controlled user-namespaces on the host.

Marking user-namespaces controlled without modifying the whitelist is
equivalent of the current behavior. The default value of whitelist includes
all capabilities so that the compatibility is maintained. However it gives
admins fine-grained ability to control various capabilities system wide
without locking down user-namespaces.

Please see individual patches in this series.

Mahesh Bandewar (2):
  capability: introduce sysctl for controlled user-ns capability whitelist
  userns: control capabilities of some user namespaces

 Documentation/sysctl/kernel.txt | 21 +
 include/linux/capability.h  |  7 ++
 include/linux/user_namespace.h  | 25 
 kernel/capability.c | 52 +
 kernel/sysctl.c |  5 
 kernel/user_namespace.c |  4 
 security/commoncap.c|  8 +++
 7 files changed, 122 insertions(+)

-- 
2.15.0.531.g2ccb3012c9-goog



[PATCHv3 1/2] capability: introduce sysctl for controlled user-ns capability whitelist

2017-12-05 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

Add a sysctl variable kernel.controlled_userns_caps_whitelist. This
takes input as capability mask expressed as two comma separated hex
u32 words. The mask, however, is stored in kernel as kernel_cap_t type.

Any capabilities that are not part of this mask will be controlled and
will not be allowed to processes in controlled user-ns.

Acked-by: Serge Hallyn <se...@hallyn.com>
Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
v3:
  Added couple of comments as requested by Serge Hallyn
v2:
  Rebase
v1:
  Initial submission

 Documentation/sysctl/kernel.txt | 21 ++
 include/linux/capability.h  |  3 +++
 kernel/capability.c | 47 +
 kernel/sysctl.c |  5 +
 4 files changed, 76 insertions(+)

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 694968c7523c..a1d39dbae847 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -25,6 +25,7 @@ show up in /proc/sys/kernel:
 - bootloader_version[ X86 only ]
 - callhome  [ S390 only ]
 - cap_last_cap
+- controlled_userns_caps_whitelist
 - core_pattern
 - core_pipe_limit
 - core_uses_pid
@@ -187,6 +188,26 @@ CAP_LAST_CAP from the kernel.
 
 ==
 
+controlled_userns_caps_whitelist
+
+Capability mask that is whitelisted for "controlled" user namespaces.
+Any capability that is missing from this mask will not be allowed to
+any process that is attached to a controlled-userns. e.g. if CAP_NET_RAW
+is not part of this mask, then processes running inside any controlled
+userns's will not be allowed to perform action that needs CAP_NET_RAW
+capability. However, processes that are attached to a parent user-ns
+hierarchy that is *not* controlled and has CAP_NET_RAW can continue
+performing those actions. User-namespaces are marked "controlled" at
+the time of their creation based on the capabilities of the creator.
+A process that does not have CAP_SYS_ADMIN will create user-namespaces
+that are controlled.
+
+The value is expressed as two comma separated hex words (u32). This
+sysctl is available in init-ns and users with CAP_SYS_ADMIN in init-ns
+are allowed to make changes.
+
+==
+
 core_pattern:
 
 core_pattern is used to specify a core dumpfile pattern name.
diff --git a/include/linux/capability.h b/include/linux/capability.h
index f640dcbc880c..7d79a4689625 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -14,6 +14,7 @@
 #define _LINUX_CAPABILITY_H
 
 #include 
+#include 
 
 
 #define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3
@@ -248,6 +249,8 @@ extern bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns);
 
 /* audit system wants to get cap info from files as well */
 extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct 
cpu_vfs_cap_data *cpu_caps);
+int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
+void __user *buff, size_t *lenp, loff_t *ppos);
 
 extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t 
size);
 
diff --git a/kernel/capability.c b/kernel/capability.c
index 1e1c0236f55b..4a859b7d4902 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -29,6 +29,8 @@ EXPORT_SYMBOL(__cap_empty_set);
 
 int file_caps_enabled = 1;
 
+kernel_cap_t controlled_userns_caps_whitelist = CAP_FULL_SET;
+
 static int __init file_caps_disable(char *str)
 {
file_caps_enabled = 0;
@@ -507,3 +509,48 @@ bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns)
rcu_read_unlock();
return (ret == 0);
 }
+
+/* Controlled-userns capabilities routines */
+#ifdef CONFIG_SYSCTL
+int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
+void __user *buff, size_t *lenp, loff_t *ppos)
+{
+   DECLARE_BITMAP(caps_bitmap, CAP_LAST_CAP);
+   struct ctl_table caps_table;
+   char tbuf[NAME_MAX];
+   int ret;
+
+   ret = bitmap_from_u32array(caps_bitmap, CAP_LAST_CAP,
+  controlled_userns_caps_whitelist.cap,
+  _KERNEL_CAPABILITY_U32S);
+   if (ret != CAP_LAST_CAP)
+   return -1;
+
+   scnprintf(tbuf, NAME_MAX, "%*pb", CAP_LAST_CAP, caps_bitmap);
+
+   caps_table.data = tbuf;
+   caps_table.maxlen = NAME_MAX;
+   caps_table.mode = table->mode;
+   ret = proc_dostring(&caps_table, write, buff, lenp, ppos);
+   if (ret)
+   return ret;
+   if (write) {
+   kernel_cap_t tmp;
+
+   if (!capable(CAP_SYS_ADMIN))
+   return -EPERM;
+
+   ret = bitmap_parse_user(buff, *lenp, 

[PATCHv2 0/2] capability controlled user-namespaces

2017-11-09 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

TL;DR version
-
Creating a sandbox environment with namespaces is challenging
considering what these sandboxed processes can engage into. e.g.
CVE-2017-6074, CVE-2017-7184, CVE-2017-7308 etc. just to name few.
Current form of user-namespaces, however, if changed a bit can allow
us to create a sandbox environment without locking down user-
namespaces.

Detailed version


Problem
---
User-namespaces in the current form have increased the attack surface as
any process can acquire capabilities which are not available to them (by
default) by performing combination of clone()/unshare()/setns() syscalls.

#define _GNU_SOURCE
#include <stdio.h>
#include <sched.h>
#include <netinet/in.h>

int main(int ac, char **av)
{
int sock = -1;

printf("Attempting to open RAW socket before unshare()...\n");
sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW);
if (sock < 0) {
perror("socket() SOCK_RAW failed: ");
} else {
printf("Successfully opened RAW-Sock before unshare().\n");
close(sock);
sock = -1;
}

if (unshare(CLONE_NEWUSER | CLONE_NEWNET) < 0) {
perror("unshare() failed: ");
return 1;
}

printf("Attempting to open RAW socket after unshare()...\n");
sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW);
if (sock < 0) {
perror("socket() SOCK_RAW failed: ");
} else {
printf("Successfully opened RAW-Sock after unshare().\n");
close(sock);
sock = -1;
}

return 0;
}

The above example shows how easy it is to acquire NET_RAW capabilities
and once acquired, these processes could take benefit of above mentioned
or similar issues discovered/undiscovered with malicious intent. Note
that this is just an example and the problem/solution is not limited
to NET_RAW capability *only*. 

The easiest fix one can apply here is to lock-down user-namespaces which
many of the distros do (i.e. don't allow users to create user namespaces),
but unfortunately that prevents everyone from using them.

Approach

Introduce a notion of 'controlled' user-namespaces. Every process on
the host is allowed to create user-namespaces (governed by the limit
imposed by per-ns sysctl) however, mark user-namespaces created by
sandboxed processes as 'controlled'. Use this 'mark' at the time of
capability check in conjunction with a global capability whitelist.
If the capability is not whitelisted, processes that belong to 
controlled user-namespaces will not be allowed.

Once a user-ns is marked as 'controlled'; all its child user-
namespaces are marked as 'controlled' too.

A global whitelist is list of capabilities governed by the
sysctl which is available to (privileged) user in init-ns to modify
while it's applicable to all controlled user-namespaces on the host.

Marking user-namespaces controlled without modifying the whitelist is
equivalent of the current behavior. The default value of whitelist includes
all capabilities so that the compatibility is maintained. However it gives
admins fine-grained ability to control various capabilities system wide
without locking down user-namespaces.

Please see individual patches in this series.

Mahesh Bandewar (2):
  capability: introduce sysctl for controlled user-ns capability whitelist
  userns: control capabilities of some user namespaces

 Documentation/sysctl/kernel.txt | 21 +
 include/linux/capability.h  |  4 
 include/linux/user_namespace.h  | 20 
 kernel/capability.c | 52 +
 kernel/sysctl.c |  5 
 kernel/user_namespace.c |  4 
 security/commoncap.c|  8 +++
 7 files changed, 114 insertions(+)

-- 
2.15.0.448.gf294e3d99a-goog



[PATCHv2 1/2] capability: introduce sysctl for controlled user-ns capability whitelist

2017-11-09 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

Add a sysctl variable kernel.controlled_userns_caps_whitelist. This
takes input as capability mask expressed as two comma separated hex
u32 words. The mask, however, is stored in kernel as kernel_cap_t type.

Any capabilities that are not part of this mask will be controlled and
will not be allowed to processes in controlled user-ns.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
v2:
  Rebase
v1:
  Initial submission

 Documentation/sysctl/kernel.txt | 21 ++
 include/linux/capability.h  |  3 +++
 kernel/capability.c | 47 +
 kernel/sysctl.c |  5 +
 4 files changed, 76 insertions(+)

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 694968c7523c..a1d39dbae847 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -25,6 +25,7 @@ show up in /proc/sys/kernel:
 - bootloader_version[ X86 only ]
 - callhome  [ S390 only ]
 - cap_last_cap
+- controlled_userns_caps_whitelist
 - core_pattern
 - core_pipe_limit
 - core_uses_pid
@@ -187,6 +188,26 @@ CAP_LAST_CAP from the kernel.
 
 ==
 
+controlled_userns_caps_whitelist
+
+Capability mask that is whitelisted for "controlled" user namespaces.
+Any capability that is missing from this mask will not be allowed to
+any process that is attached to a controlled-userns. e.g. if CAP_NET_RAW
+is not part of this mask, then processes running inside any controlled
+userns's will not be allowed to perform action that needs CAP_NET_RAW
+capability. However, processes that are attached to a parent user-ns
+hierarchy that is *not* controlled and has CAP_NET_RAW can continue
+performing those actions. User-namespaces are marked "controlled" at
+the time of their creation based on the capabilities of the creator.
+A process that does not have CAP_SYS_ADMIN will create user-namespaces
+that are controlled.
+
+The value is expressed as two comma separated hex words (u32). This
+sysctl is available in init-ns and users with CAP_SYS_ADMIN in init-ns
+are allowed to make changes.
+
+==
+
 core_pattern:
 
 core_pattern is used to specify a core dumpfile pattern name.
diff --git a/include/linux/capability.h b/include/linux/capability.h
index f640dcbc880c..7d79a4689625 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -14,6 +14,7 @@
 #define _LINUX_CAPABILITY_H
 
 #include 
+#include 
 
 
 #define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3
@@ -248,6 +249,8 @@ extern bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns);
 
 /* audit system wants to get cap info from files as well */
 extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct 
cpu_vfs_cap_data *cpu_caps);
+int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
+void __user *buff, size_t *lenp, loff_t *ppos);
 
 extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t 
size);
 
diff --git a/kernel/capability.c b/kernel/capability.c
index 1e1c0236f55b..4a859b7d4902 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -29,6 +29,8 @@ EXPORT_SYMBOL(__cap_empty_set);
 
 int file_caps_enabled = 1;
 
+kernel_cap_t controlled_userns_caps_whitelist = CAP_FULL_SET;
+
 static int __init file_caps_disable(char *str)
 {
file_caps_enabled = 0;
@@ -507,3 +509,48 @@ bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns)
rcu_read_unlock();
return (ret == 0);
 }
+
+/* Controlled-userns capabilities routines */
+#ifdef CONFIG_SYSCTL
+int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
+void __user *buff, size_t *lenp, loff_t *ppos)
+{
+   DECLARE_BITMAP(caps_bitmap, CAP_LAST_CAP);
+   struct ctl_table caps_table;
+   char tbuf[NAME_MAX];
+   int ret;
+
+   ret = bitmap_from_u32array(caps_bitmap, CAP_LAST_CAP,
+  controlled_userns_caps_whitelist.cap,
+  _KERNEL_CAPABILITY_U32S);
+   if (ret != CAP_LAST_CAP)
+   return -1;
+
+   scnprintf(tbuf, NAME_MAX, "%*pb", CAP_LAST_CAP, caps_bitmap);
+
+   caps_table.data = tbuf;
+   caps_table.maxlen = NAME_MAX;
+   caps_table.mode = table->mode;
+   ret = proc_dostring(_table, write, buff, lenp, ppos);
+   if (ret)
+   return ret;
+   if (write) {
+   kernel_cap_t tmp;
+
+   if (!capable(CAP_SYS_ADMIN))
+   return -EPERM;
+
+   ret = bitmap_parse_user(buff, *lenp, caps_bitmap, CAP_LAST_CAP);
+   if (ret)
+   return ret;
+
+   ret = bitmap

[PATCHv2 2/2] userns: control capabilities of some user namespaces

2017-11-09 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

With this new notion of "controlled" user-namespaces, the controlled
user-namespaces are marked at the time of their creation while the
capabilities of processes that belong to them are controlled using the
global mask.

Init-user-ns is always uncontrolled and a process that has SYS_ADMIN
that belongs to uncontrolled user-ns can create another (child) user-
namespace that is uncontrolled. Any other process (that either does
not have SYS_ADMIN or belongs to a controlled user-ns) can only
create a user-ns that is controlled.

global-capability-whitelist (controlled_userns_caps_whitelist) is used
at the capability check-time and keeps the semantics for the processes
that belong to uncontrolled user-ns as it is. Processes that belong to
controlled user-ns however are subjected to different checks-

   (a) if the capability in question is controlled and process belongs
   to controlled user-ns, then it's always denied.
   (b) if the capability in question is NOT controlled then fall back
   to the traditional check.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
v2:
  Don't recalculate user-ns flags for every setns() call.
v1:
  Initial submission.

 include/linux/capability.h |  1 +
 include/linux/user_namespace.h | 20 
 kernel/capability.c|  5 +
 kernel/user_namespace.c|  4 
 security/commoncap.c   |  8 
 5 files changed, 38 insertions(+)

diff --git a/include/linux/capability.h b/include/linux/capability.h
index 7d79a4689625..a1fd9e460379 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -251,6 +251,7 @@ extern bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns);
 extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct 
cpu_vfs_cap_data *cpu_caps);
 int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
 void __user *buff, size_t *lenp, loff_t *ppos);
+bool is_capability_controlled(int cap);
 
 extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t 
size);
 
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 3fe714da7f5a..647f825c7b5f 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -23,6 +23,7 @@ struct uid_gid_map {  /* 64 bytes -- 1 cache line */
 };
 
 #define USERNS_SETGROUPS_ALLOWED 1UL
+#define USERNS_CONTROLLED   2UL
 
 #define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED
 
@@ -103,6 +104,16 @@ static inline void put_user_ns(struct user_namespace *ns)
__put_user_ns(ns);
 }
 
+static inline bool is_user_ns_controlled(const struct user_namespace *ns)
+{
+   return ns->flags & USERNS_CONTROLLED;
+}
+
+static inline void mark_user_ns_controlled(struct user_namespace *ns)
+{
+   ns->flags |= USERNS_CONTROLLED;
+}
+
 struct seq_operations;
 extern const struct seq_operations proc_uid_seq_operations;
 extern const struct seq_operations proc_gid_seq_operations;
@@ -161,6 +172,15 @@ static inline struct ns_common *ns_get_owner(struct 
ns_common *ns)
 {
return ERR_PTR(-EPERM);
 }
+
+static inline bool is_user_ns_controlled(const struct user_namespace *ns)
+{
+   return false;
+}
+
+static inline void mark_user_ns_controlled(struct user_namespace *ns)
+{
+}
 #endif
 
 #endif /* _LINUX_USER_H */
diff --git a/kernel/capability.c b/kernel/capability.c
index 4a859b7d4902..bffe249922de 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -511,6 +511,11 @@ bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns)
 }
 
 /* Controlled-userns capabilities routines */
+bool is_capability_controlled(int cap)
+{
+   return !cap_raised(controlled_userns_caps_whitelist, cap);
+}
+
 #ifdef CONFIG_SYSCTL
 int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
 void __user *buff, size_t *lenp, loff_t *ppos)
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index c490f1e4313b..600c7dcb9ff7 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -139,6 +139,10 @@ int create_user_ns(struct cred *new)
goto fail_keyring;
 
set_cred_user_ns(new, ns);
+   if (!ns_capable(parent_ns, CAP_SYS_ADMIN) ||
+   is_user_ns_controlled(parent_ns))
+   mark_user_ns_controlled(ns);
+
return 0;
 fail_keyring:
 #ifdef CONFIG_PERSISTENT_KEYRINGS
diff --git a/security/commoncap.c b/security/commoncap.c
index fc46f5b85251..89103f16ac37 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -73,6 +73,14 @@ int cap_capable(const struct cred *cred, struct 
user_namespace *targ_ns,
 {
struct user_namespace *ns = targ_ns;
 
+   /* If the capability is controlled and user-ns that process
+* belongs-to is 'controlled' then return EPERM and no need
+

[PATCH resend 0/2] capability controlled user-namespaces

2017-11-02 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

TL;DR version
-
Creating a sandbox environment with namespaces is challenging
considering what these sandboxed processes can engage in, e.g.
CVE-2017-6074, CVE-2017-7184, CVE-2017-7308 etc., just to name a few.
The current form of user-namespaces, however, if changed a bit, can allow
us to create a sandbox environment without locking down user-
namespaces.

Detailed version


Problem
---
User-namespaces in the current form have increased the attack surface as
any process can acquire capabilities which are not available to it (by
default) by performing a combination of clone()/unshare()/setns() syscalls.

#define _GNU_SOURCE
#include <stdio.h>
#include <sched.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>

/*
 * Demonstration program: try to open a raw IPv6 socket, then move the
 * process into a new user + network namespace with unshare() and try
 * again.  Each attempt reports success or failure on stdout/stderr so
 * the two results can be compared.
 *
 * Returns 0 when both attempts were made (regardless of their outcome)
 * and 1 when unshare() itself failed.
 */
int main(int ac, char **av)
{
	int sock = -1;

	printf("Attempting to open RAW socket before unshare()...\n");
	sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW);
	if (sock < 0) {
		/* perror() appends ": <error text>" itself, so no trailing colon. */
		perror("socket() SOCK_RAW failed");
	} else {
		printf("Successfully opened RAW-Sock before unshare().\n");
		close(sock);
		sock = -1;
	}

	/* Detach into a fresh user namespace and network namespace. */
	if (unshare(CLONE_NEWUSER | CLONE_NEWNET) < 0) {
		perror("unshare() failed");
		return 1;
	}

	printf("Attempting to open RAW socket after unshare()...\n");
	sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW);
	if (sock < 0) {
		perror("socket() SOCK_RAW failed");
	} else {
		printf("Successfully opened RAW-Sock after unshare().\n");
		close(sock);
		sock = -1;
	}

	return 0;
}

The above example shows how easy it is to acquire NET_RAW capabilities
and, once acquired, these processes could take advantage of the above-mentioned
or similar issues (discovered or undiscovered) with malicious intent. Note
that this is just an example and the problem/solution is not limited
to the NET_RAW capability *only*.

The easiest fix one can apply here is to lock-down user-namespaces which
many of the distros do (i.e. don't allow users to create user namespaces),
but unfortunately that prevents everyone from using them.

Approach

Introduce a notion of 'controlled' user-namespaces. Every process on
the host is allowed to create user-namespaces (governed by the limit
imposed by per-ns sysctl) however, mark user-namespaces created by
sandboxed processes as 'controlled'. Use this 'mark' at the time of
capability check in conjunction with a global capability whitelist.
If the capability is not whitelisted, processes that belong to
controlled user-namespaces will not be allowed to use it.

Once a user-ns is marked as 'controlled'; all its child user-
namespaces are marked as 'controlled' too.

A global whitelist is a list of capabilities governed by a
sysctl, which a (privileged) user in init-ns can modify and
which applies to all controlled user-namespaces on the host.

Marking user-namespaces controlled without modifying the whitelist is
equivalent to the current behavior. The default value of whitelist includes
all capabilities so that the compatibility is maintained. However it gives
admins fine-grained ability to control various capabilities system wide
without locking down user-namespaces.

Please see individual patches in this series.

Mahesh Bandewar (2):
  capability: introduce sysctl for controlled user-ns capability
whitelist
  userns: control capabilities of some user namespaces

 Documentation/sysctl/kernel.txt | 21 +
 include/linux/capability.h  |  4 
 include/linux/user_namespace.h  | 20 
 kernel/capability.c | 52 +
 kernel/sysctl.c |  5 
 kernel/user_namespace.c |  3 +++
 security/commoncap.c|  8 +++
 7 files changed, 113 insertions(+)

-- 
2.15.0.403.gc27cc4dac6-goog



[PATCH resend 2/2] userns: control capabilities of some user namespaces

2017-11-02 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

With this new notion of "controlled" user-namespaces, the controlled
user-namespaces are marked at the time of their creation while the
capabilities of processes that belong to them are controlled using the
global mask.

Init-user-ns is always uncontrolled and a process that has SYS_ADMIN
that belongs to uncontrolled user-ns can create another (child) user-
namespace that is uncontrolled. Any other process (that either does
not have SYS_ADMIN or belongs to a controlled user-ns) can only
create a user-ns that is controlled.

global-capability-whitelist (controlled_userns_caps_whitelist) is used
at the capability check-time and keeps the semantics for the processes
that belong to uncontrolled user-ns as it is. Processes that belong to
controlled user-ns however are subjected to different checks-

   (a) if the capability in question is controlled and process belongs
   to controlled user-ns, then it's always denied.
   (b) if the capability in question is NOT controlled then fall back
   to the traditional check.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 include/linux/capability.h |  1 +
 include/linux/user_namespace.h | 20 
 kernel/capability.c|  5 +
 kernel/user_namespace.c|  3 +++
 security/commoncap.c   |  8 
 5 files changed, 37 insertions(+)

diff --git a/include/linux/capability.h b/include/linux/capability.h
index 6c0b9677c03f..b8c6cac18658 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -250,6 +250,7 @@ extern bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns);
 extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct 
cpu_vfs_cap_data *cpu_caps);
 int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
 void __user *buff, size_t *lenp, loff_t *ppos);
+bool is_capability_controlled(int cap);
 
 extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t 
size);
 
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index c18e01252346..e890fe81b47e 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -22,6 +22,7 @@ struct uid_gid_map {  /* 64 bytes -- 1 cache line */
 };
 
 #define USERNS_SETGROUPS_ALLOWED 1UL
+#define USERNS_CONTROLLED   2UL
 
 #define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED
 
@@ -102,6 +103,16 @@ static inline void put_user_ns(struct user_namespace *ns)
__put_user_ns(ns);
 }
 
+static inline bool is_user_ns_controlled(const struct user_namespace *ns)
+{
+   return ns->flags & USERNS_CONTROLLED;
+}
+
+static inline void mark_user_ns_controlled(struct user_namespace *ns)
+{
+   ns->flags |= USERNS_CONTROLLED;
+}
+
 struct seq_operations;
 extern const struct seq_operations proc_uid_seq_operations;
 extern const struct seq_operations proc_gid_seq_operations;
@@ -160,6 +171,15 @@ static inline struct ns_common *ns_get_owner(struct 
ns_common *ns)
 {
return ERR_PTR(-EPERM);
 }
+
+static inline bool is_user_ns_controlled(const struct user_namespace *ns)
+{
+   return false;
+}
+
+static inline void mark_user_ns_controlled(struct user_namespace *ns)
+{
+}
 #endif
 
 #endif /* _LINUX_USER_H */
diff --git a/kernel/capability.c b/kernel/capability.c
index 62dbe3350c1b..40a38cc4ff43 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -510,6 +510,11 @@ bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns)
 }
 
 /* Controlled-userns capabilities routines */
+bool is_capability_controlled(int cap)
+{
+   return !cap_raised(controlled_userns_caps_whitelist, cap);
+}
+
 #ifdef CONFIG_SYSCTL
 int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
 void __user *buff, size_t *lenp, loff_t *ppos)
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index c490f1e4313b..f393ea5108f0 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -53,6 +53,9 @@ static void set_cred_user_ns(struct cred *cred, struct 
user_namespace *user_ns)
cred->cap_effective = CAP_FULL_SET;
cred->cap_ambient = CAP_EMPTY_SET;
cred->cap_bset = CAP_FULL_SET;
+   if (!ns_capable(user_ns->parent, CAP_SYS_ADMIN) ||
+   is_user_ns_controlled(user_ns->parent))
+   mark_user_ns_controlled(user_ns);
 #ifdef CONFIG_KEYS
key_put(cred->request_key_auth);
cred->request_key_auth = NULL;
diff --git a/security/commoncap.c b/security/commoncap.c
index fc46f5b85251..89103f16ac37 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -73,6 +73,14 @@ int cap_capable(const struct cred *cred, struct 
user_namespace *targ_ns,
 {
struct user_namespace *ns = targ_ns;
 
+   /* If the capability is controlled and user-ns that process
+* belongs-to is 'controll

[PATCH resend 1/2] capability: introduce sysctl for controlled user-ns capability whitelist

2017-11-02 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

Add a sysctl variable kernel.controlled_userns_caps_whitelist. This
takes input as capability mask expressed as two comma separated hex
u32 words. The mask, however, is stored in kernel as kernel_cap_t type.

Any capabilities that are not part of this mask will be controlled and
will not be allowed to processes in controlled user-ns.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 Documentation/sysctl/kernel.txt | 21 ++
 include/linux/capability.h  |  3 +++
 kernel/capability.c | 47 +
 kernel/sysctl.c |  5 +
 4 files changed, 76 insertions(+)

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 694968c7523c..a1d39dbae847 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -25,6 +25,7 @@ show up in /proc/sys/kernel:
 - bootloader_version[ X86 only ]
 - callhome  [ S390 only ]
 - cap_last_cap
+- controlled_userns_caps_whitelist
 - core_pattern
 - core_pipe_limit
 - core_uses_pid
@@ -187,6 +188,26 @@ CAP_LAST_CAP from the kernel.
 
 ==
 
+controlled_userns_caps_whitelist
+
+Capability mask that is whitelisted for "controlled" user namespaces.
+Any capability that is missing from this mask will not be allowed to
+any process that is attached to a controlled-userns. e.g. if CAP_NET_RAW
+is not part of this mask, then processes running inside any controlled
+userns's will not be allowed to perform action that needs CAP_NET_RAW
+capability. However, processes that are attached to a parent user-ns
+hierarchy that is *not* controlled and has CAP_NET_RAW can continue
+performing those actions. User-namespaces are marked "controlled" at
+the time of their creation based on the capabilities of the creator.
+A process that does not have CAP_SYS_ADMIN will create user-namespaces
+that are controlled.
+
+The value is expressed as two comma separated hex words (u32). This
+sysctl is available in init-ns and users with CAP_SYS_ADMIN in init-ns
+are allowed to make changes.
+
+==
+
 core_pattern:
 
 core_pattern is used to specify a core dumpfile pattern name.
diff --git a/include/linux/capability.h b/include/linux/capability.h
index b52e278e4744..6c0b9677c03f 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -13,6 +13,7 @@
 #define _LINUX_CAPABILITY_H
 
 #include 
+#include 
 
 
 #define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3
@@ -247,6 +248,8 @@ extern bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns);
 
 /* audit system wants to get cap info from files as well */
 extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct 
cpu_vfs_cap_data *cpu_caps);
+int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
+void __user *buff, size_t *lenp, loff_t *ppos);
 
 extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t 
size);
 
diff --git a/kernel/capability.c b/kernel/capability.c
index f97fe77ceb88..62dbe3350c1b 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -28,6 +28,8 @@ EXPORT_SYMBOL(__cap_empty_set);
 
 int file_caps_enabled = 1;
 
+kernel_cap_t controlled_userns_caps_whitelist = CAP_FULL_SET;
+
 static int __init file_caps_disable(char *str)
 {
file_caps_enabled = 0;
@@ -506,3 +508,48 @@ bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns)
rcu_read_unlock();
return (ret == 0);
 }
+
+/* Controlled-userns capabilities routines */
+#ifdef CONFIG_SYSCTL
+int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
+void __user *buff, size_t *lenp, loff_t *ppos)
+{
+   DECLARE_BITMAP(caps_bitmap, CAP_LAST_CAP);
+   struct ctl_table caps_table;
+   char tbuf[NAME_MAX];
+   int ret;
+
+   ret = bitmap_from_u32array(caps_bitmap, CAP_LAST_CAP,
+  controlled_userns_caps_whitelist.cap,
+  _KERNEL_CAPABILITY_U32S);
+   if (ret != CAP_LAST_CAP)
+   return -1;
+
+   scnprintf(tbuf, NAME_MAX, "%*pb", CAP_LAST_CAP, caps_bitmap);
+
+   caps_table.data = tbuf;
+   caps_table.maxlen = NAME_MAX;
+   caps_table.mode = table->mode;
+   ret = proc_dostring(_table, write, buff, lenp, ppos);
+   if (ret)
+   return ret;
+   if (write) {
+   kernel_cap_t tmp;
+
+   if (!capable(CAP_SYS_ADMIN))
+   return -EPERM;
+
+   ret = bitmap_parse_user(buff, *lenp, caps_bitmap, CAP_LAST_CAP);
+   if (ret)
+   return ret;
+
+   ret = bitmap

[PATCH] ip/ipvlan: enhance ability to add mode flags to existing modes

2017-10-30 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

IPvlan supported bridge-only functionality prior to commits
a190d04db937 ('ipvlan: introduce 'private' attribute for all
existing modes.') and fe89aa6b250c ('ipvlan: implement VEPA mode').
These two commits allow to configure the VEPA and private modes now.
This patch adds those options in ip command.

e.g.
  bash:~# ip link add link eth0 name ipvl0 type ipvlan mode l2 private
  -or-
  bash:~# ip link add link eth0 name ipvl0 type ipvlan mode l2 vepa

Also the output will reflect the mode and the mode-flag accordingly.
e.g.
  bash:~# ip -details link show ipvl0
  4: ipvl0@eth0: <BROADCAST,MULTICAST> mtu 1500 qdisc ...
 link/ether 00:1a:11:44:a5:3e brd ff:ff:ff:ff:ff:ff promiscuity 0
 ipvlan  mode l2 private addrgenmode eui64 numtxqueues 1 ...

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 include/uapi/linux/if_link.h |  4 
 ip/iplink_ipvlan.c   | 35 ++-
 2 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index dafe0a6e0421..35bc598566e0 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -463,6 +463,7 @@ enum macsec_validation_type {
 enum {
IFLA_IPVLAN_UNSPEC,
IFLA_IPVLAN_MODE,
+   IFLA_IPVLAN_FLAGS,
__IFLA_IPVLAN_MAX
 };
 
@@ -475,6 +476,9 @@ enum ipvlan_mode {
IPVLAN_MODE_MAX
 };
 
+#define IPVLAN_F_PRIVATE   0x01
+#define IPVLAN_F_VEPA  0x02
+
 /* VXLAN section */
 enum {
IFLA_VXLAN_UNSPEC,
diff --git a/ip/iplink_ipvlan.c b/ip/iplink_ipvlan.c
index 9f48309ee030..8889808508fe 100644
--- a/ip/iplink_ipvlan.c
+++ b/ip/iplink_ipvlan.c
@@ -20,12 +20,21 @@
 
 static void ipvlan_explain(FILE *f)
 {
-   fprintf(f, "Usage: ... ipvlan [ mode { l2 | l3  | l3s } ]\n");
+   fprintf(f,
+   "Usage: ... ipvlan [ mode MODE ] [ FLAGS ]\n"
+   "\n"
+   "MODE: l3 | l3s | l2\n"
+   "FLAGS: bridge | private | vepa\n"
+   "(first values are the defaults if nothing is specified).\n"
+   );
 }
 
 static int ipvlan_parse_opt(struct link_util *lu, int argc, char **argv,
struct nlmsghdr *n)
 {
+   __u16 flags = 0;
+   bool mflag_given = false;
+
while (argc > 0) {
if (matches(*argv, "mode") == 0) {
__u16 mode = 0;
@@ -43,6 +52,14 @@ static int ipvlan_parse_opt(struct link_util *lu, int argc, 
char **argv,
return -1;
}
addattr16(n, 1024, IFLA_IPVLAN_MODE, mode);
+   } else if (matches(*argv, "private") == 0 && !mflag_given) {
+   flags |= IPVLAN_F_PRIVATE;
+   mflag_given = true;
+   } else if (matches(*argv, "vepa") == 0 && !mflag_given) {
+   flags |= IPVLAN_F_VEPA;
+   mflag_given = true;
+   } else if (matches(*argv, "bridge") == 0 && !mflag_given) {
+   mflag_given = true;
} else if (matches(*argv, "help") == 0) {
ipvlan_explain(stderr);
return -1;
@@ -55,6 +72,7 @@ static int ipvlan_parse_opt(struct link_util *lu, int argc, 
char **argv,
argc--;
argv++;
}
+   addattr16(n, 1024, IFLA_IPVLAN_FLAGS, flags);
 
return 0;
 }
@@ -75,6 +93,21 @@ static void ipvlan_print_opt(struct link_util *lu, FILE *f, 
struct rtattr *tb[])
print_string(PRINT_ANY, "mode", " mode %s ", mode_str);
}
}
+   if (tb[IFLA_IPVLAN_FLAGS]) {
+   if (RTA_PAYLOAD(tb[IFLA_IPVLAN_FLAGS]) == sizeof(__u16)) {
+   __u16 flags = rta_getattr_u16(tb[IFLA_IPVLAN_FLAGS]);
+
+   if (flags & IPVLAN_F_PRIVATE)
+   print_bool(PRINT_ANY, "private", "private ",
+  true);
+   else if (flags & IPVLAN_F_VEPA)
+   print_bool(PRINT_ANY, "vepa", "vepa ",
+  true);
+   else
+   print_bool(PRINT_ANY, "bridge", "bridge ",
+  true);
+   }
+   }
 }
 
 static void ipvlan_print_help(struct link_util *lu, int argc, char **argv,
-- 
2.15.0.rc2.357.g7e34df9404-goog



[PATCH next 0/2] add 'private' and 'vepa' attributes to ipvlan modes

2017-10-26 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

IPvlan has always been operating in bridge-mode for its supported modes i.e.
if the packets are destined to the adjacent neighbor dev, then IPvlan driver
will switch the packet internally without needing the packets to hit the
wire or get routed. However, there are situations where this bridge-mode is
not needed. e.g. two private processes running inside two namespaces which
are having one IPvlan slave each for its namespace but sharing the master. These
processes should reach the outside world through the master device but at
the same time the bridge function should not work. Currently that's not
possible, hence the private attribute for the selected mode comes into play.

VEPA or 802.1Qbg on the other hand has limited appeal with IPvlan since IPvlan
uses the mac-address of the lower device. So packets that are destined to 
the adjacent neighbor slave-dev will have same src and dest mac. When these
packets reach the external switch/router, they will send you the redirect
message which the host will have to deal with. Having said that this attribute 
will have appeal in debugging as IPvlan will not switch / short-circuit 
packets internally. e.g. using VEPA mode with lower-device in loopback mode
will avoid some complicated set-ups that use non-local-bind with some route
jugglery.

This patch-set implements these attributes for the existing modes that
IPvlan has. Please see individual patches for their detailed implementation.
A subsequent ip-utils patch is needed and will be sent soon.

Mahesh Bandewar (2):
  ipvlan: introduce 'private' attribute for all existing modes.
  ipvlan: implement VEPA mode

 Documentation/networking/ipvlan.txt | 42 +
 drivers/net/ipvlan/ipvlan.h | 31 
 drivers/net/ipvlan/ipvlan_core.c| 24 ++-
 drivers/net/ipvlan/ipvlan_main.c| 47 +++--
 include/uapi/linux/if_link.h|  4 
 5 files changed, 136 insertions(+), 12 deletions(-)

-- 
2.15.0.rc2.357.g7e34df9404-goog



[PATCH next 2/2] ipvlan: implement VEPA mode

2017-10-26 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

This is very similar to the Macvlan VEPA mode, however, there is some
difference. IPvlan uses the mac-address of the lower device, so the VEPA
mode has implications of ICMP-redirects for packets destined for its
immediate neighbors sharing same master since the packets will have same
source and dest mac. The external switch/router will send redirect msg.

Having said that, this will be a useful tool for debugging
since IPvlan will not switch packets among its slaves and will rely completely
on the external entity, as intended in 802.1Qbg.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 Documentation/networking/ipvlan.txt | 12 +++-
 drivers/net/ipvlan/ipvlan.h | 15 +++
 drivers/net/ipvlan/ipvlan_core.c| 17 ++---
 drivers/net/ipvlan/ipvlan_main.c| 13 +++--
 include/uapi/linux/if_link.h|  1 +
 5 files changed, 48 insertions(+), 10 deletions(-)

diff --git a/Documentation/networking/ipvlan.txt 
b/Documentation/networking/ipvlan.txt
index bfa91c77a4c9..812ef003e0a8 100644
--- a/Documentation/networking/ipvlan.txt
+++ b/Documentation/networking/ipvlan.txt
@@ -25,7 +25,7 @@ using IProute2/ip utility.
 ip link add link  name  type ipvlan [ mode MODE ] [ FLAGS ]
where
  MODE: l3 (default) | l3s | l2
- FLAGS: bridge (default) | private
+ FLAGS: bridge (default) | private | vepa
 
 e.g.
 (a) Following will create IPvlan link with eth0 as master in
@@ -35,6 +35,8 @@ using IProute2/ip utility.
   bash# ip link add link eth0 name ipvl0 type ipvlan mode l2 bridge
 (c) This command will create an IPvlan device in L2 private mode.
   bash# ip link add link eth0 name ipvlan type ipvlan mode l2 private
+(d) This command will create an IPvlan device in L2 vepa mode.
+  bash# ip link add link eth0 name ipvlan type ipvlan mode l2 vepa
 
 
 4. Operating modes:
@@ -77,6 +79,14 @@ themseleves apart from talking through the master device.
If this option is added to the command-line, the port is set in private
 mode. i.e. port wont allow cross communication between slaves.
 
+5.3 vepa:
+   If this is added to the command-line, the port is set in VEPA mode.
+i.e. port will offload switching functionality to the external entity as
+described in 802.1Qbg
+Note: VEPA mode in IPvlan has limitations. IPvlan uses the mac-address of the
+master-device, so the packets which are emitted in this mode for the adjacent
+neighbor will have source and destination mac same. This will make the switch /
+router send the redirect message.
 
 6. What to choose (macvlan vs. ipvlan)?
These two devices are very similar in many regards and the specific use
diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h
index 9941851bcc13..5166575a164d 100644
--- a/drivers/net/ipvlan/ipvlan.h
+++ b/drivers/net/ipvlan/ipvlan.h
@@ -139,6 +139,21 @@ static inline void ipvlan_clear_private(struct ipvl_port 
*port)
port->flags &= ~IPVLAN_F_PRIVATE;
 }
 
+static inline bool ipvlan_is_vepa(const struct ipvl_port *port)
+{
+   return !!(port->flags & IPVLAN_F_VEPA);
+}
+
+static inline void ipvlan_mark_vepa(struct ipvl_port *port)
+{
+   port->flags |= IPVLAN_F_VEPA;
+}
+
+static inline void ipvlan_clear_vepa(struct ipvl_port *port)
+{
+   port->flags &= ~IPVLAN_F_VEPA;
+}
+
 void ipvlan_init_secret(void);
 unsigned int ipvlan_mac_hash(const unsigned char *addr);
 rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb);
diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c
index 72fd56de9c00..034ae4c57196 100644
--- a/drivers/net/ipvlan/ipvlan_core.c
+++ b/drivers/net/ipvlan/ipvlan_core.c
@@ -514,13 +514,15 @@ static int ipvlan_xmit_mode_l3(struct sk_buff *skb, 
struct net_device *dev)
if (!lyr3h)
goto out;
 
-   addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true);
-   if (addr) {
-   if (ipvlan_is_private(ipvlan->port)) {
-   consume_skb(skb);
-   return NET_XMIT_DROP;
+   if (!ipvlan_is_vepa(ipvlan->port)) {
+   addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true);
+   if (addr) {
+   if (ipvlan_is_private(ipvlan->port)) {
+   consume_skb(skb);
+   return NET_XMIT_DROP;
+   }
+   return ipvlan_rcv_frame(addr, , true);
}
-   return ipvlan_rcv_frame(addr, , true);
}
 out:
ipvlan_skb_crossing_ns(skb, ipvlan->phy_dev);
@@ -535,7 +537,8 @@ static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct 
net_device *dev)
void *lyr3h;
int addr_type;
 
-   if (ether_addr_equal(eth->h_dest, eth->h_source)) {
+   if (!ipvlan_is_ve

[PATCH next 1/2] ipvlan: introduce 'private' attribute for all existing modes.

2017-10-26 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

IPvlan has always operated in bridge mode. However there are scenarios
where each slave should be able to talk through the master device but
not necessarily across each other. Think of an environment where each
namespace is a private and independent customer. In this scenario
the machine hosting these namespaces does not want to reveal who
the neighbors are, nor do the individual namespaces care to talk to a
neighbor over a short-circuited network path.

This patch implements the mode that is very similar to the 'private' mode
in macvlan where individual slaves can send and receive traffic through
the master device, just that they can not talk among slave devices.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 Documentation/networking/ipvlan.txt | 30 ++---
 drivers/net/ipvlan/ipvlan.h | 16 
 drivers/net/ipvlan/ipvlan_core.c| 15 ---
 drivers/net/ipvlan/ipvlan_main.c| 38 +++--
 include/uapi/linux/if_link.h|  3 +++
 5 files changed, 94 insertions(+), 8 deletions(-)

diff --git a/Documentation/networking/ipvlan.txt 
b/Documentation/networking/ipvlan.txt
index 1fe42a874aae..bfa91c77a4c9 100644
--- a/Documentation/networking/ipvlan.txt
+++ b/Documentation/networking/ipvlan.txt
@@ -22,9 +22,19 @@ The driver can be built into the kernel (CONFIG_IPVLAN=y) or 
as a module
There are no module parameters for this driver and it can be configured
 using IProute2/ip utility.
 
-   ip link add link  name  type ipvlan mode { l2 | 
l3 | l3s }
+ip link add link  name  type ipvlan [ mode MODE ] [ FLAGS ]
+   where
+ MODE: l3 (default) | l3s | l2
+ FLAGS: bridge (default) | private
 
-   e.g. ip link add link eth0 name ipvl0 type ipvlan mode l2
+e.g.
+(a) Following will create IPvlan link with eth0 as master in
+L3 bridge mode
+  bash# ip link add link eth0 name ipvl0 type ipvlan
+(b) This command will create IPvlan link in L2 bridge mode.
+  bash# ip link add link eth0 name ipvl0 type ipvlan mode l2 bridge
+(c) This command will create an IPvlan device in L2 private mode.
+  bash# ip link add link eth0 name ipvlan type ipvlan mode l2 private
 
 
 4. Operating modes:
@@ -54,7 +64,21 @@ works in this mode and hence it is L3-symmetric (L3s). This 
will have slightly l
 performance but that shouldn't matter since you are choosing this mode over 
plain-L3
 mode to make conn-tracking work.
 
-5. What to choose (macvlan vs. ipvlan)?
+5. Mode flags:
+   At this time following mode flags are available
+
+5.1 bridge:
+   This is the default option. To configure the IPvlan port in this mode,
+user can choose to either add this option on the command-line or don't specify
+anything. This is the traditional mode where slaves can cross-talk among
+themselves apart from talking through the master device.
+
+5.2 private:
+   If this option is added to the command-line, the port is set in private
+mode, i.e. the port won't allow cross communication between slaves.
+
+
+6. What to choose (macvlan vs. ipvlan)?
These two devices are very similar in many regards and the specific use
 case could very well define which device to choose. if one of the following
 situations defines your use case then you can choose to use ipvlan -
diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h
index ba8173a0b62e..9941851bcc13 100644
--- a/drivers/net/ipvlan/ipvlan.h
+++ b/drivers/net/ipvlan/ipvlan.h
@@ -96,6 +96,7 @@ struct ipvl_port {
struct hlist_head   hlhead[IPVLAN_HASH_SIZE];
struct list_headipvlans;
u16 mode;
+   u16 flags;
u16 dev_id_start;
struct work_struct  wq;
struct sk_buff_head backlog;
@@ -123,6 +124,21 @@ static inline struct ipvl_port *ipvlan_port_get_rtnl(const 
struct net_device *d)
return rtnl_dereference(d->rx_handler_data);
 }
 
+static inline bool ipvlan_is_private(const struct ipvl_port *port)
+{
+   return !!(port->flags & IPVLAN_F_PRIVATE);
+}
+
+static inline void ipvlan_mark_private(struct ipvl_port *port)
+{
+   port->flags |= IPVLAN_F_PRIVATE;
+}
+
+static inline void ipvlan_clear_private(struct ipvl_port *port)
+{
+   port->flags &= ~IPVLAN_F_PRIVATE;
+}
+
 void ipvlan_init_secret(void);
 unsigned int ipvlan_mac_hash(const unsigned char *addr);
 rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb);
diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c
index 1f3295e274d0..72fd56de9c00 100644
--- a/drivers/net/ipvlan/ipvlan_core.c
+++ b/drivers/net/ipvlan/ipvlan_core.c
@@ -515,9 +515,13 @@ static int ipvlan_xmit_mode_l3(struct sk_buff *skb, struct 
net_device *dev)
goto out;
 
addr = ipvlan_addr_lookup(ipvl

[PATCH next] ipvlan: always use the current L2 addr of the master

2017-10-11 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

If the underlying master ever changes its L2 (e.g. bonding device),
then make sure that the IPvlan slaves always emit packets with the
current L2 of the master instead of the stale mac addr which was
copied during the device creation. The problem can be seen with
following script -

  #!/bin/bash
  # Create a vEth pair
  ip link add dev veth0 type veth peer name veth1
  ip link set veth0 up
  ip link set veth1 up
  ip link show veth0
  ip link show veth1
  # Create an IPvlan device on one end of this vEth pair.
  ip link add link veth0 dev ipvl0 type ipvlan mode l2
  ip link show ipvl0
  # Change the mac-address of the vEth master.
  ip link set veth0 address 02:11:22:33:44:55

Fixes: 2ad7bf363841 ("ipvlan: Initial check-in of the IPVLAN driver.")
Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 drivers/net/ipvlan/ipvlan_main.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index c74893c1e620..5832091680f4 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -407,7 +407,7 @@ static int ipvlan_hard_header(struct sk_buff *skb, struct 
net_device *dev,
 * while the packets use the mac-addr on the physical device.
 */
return dev_hard_header(skb, phy_dev, type, daddr,
-  saddr ? : dev->dev_addr, len);
+  saddr ? : phy_dev->dev_addr, len);
 }
 
 static const struct header_ops ipvlan_header_ops = {
@@ -730,6 +730,11 @@ static int ipvlan_device_event(struct notifier_block 
*unused,
ipvlan_adjust_mtu(ipvlan, dev);
break;
 
+   case NETDEV_CHANGEADDR:
+   list_for_each_entry(ipvlan, &port->ipvlans, pnode)
+   ether_addr_copy(ipvlan->dev->dev_addr, dev->dev_addr);
+   break;
+
case NETDEV_PRE_TYPE_CHANGE:
/* Forbid underlying device to change its type. */
return NOTIFY_BAD;
-- 
2.15.0.rc0.271.g36b669edcc-goog



[PATCH 1/2] capability: introduce sysctl for controlled user-ns capability whitelist

2017-09-29 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

Add a sysctl variable kernel.controlled_userns_caps_whitelist. This
takes input as capability mask expressed as two comma separated hex
u32 words. The mask, however, is stored in kernel as kernel_cap_t type.

Any capabilities that are not part of this mask will be controlled and
will not be allowed to processes in controlled user-ns.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 Documentation/sysctl/kernel.txt | 21 ++
 include/linux/capability.h  |  3 +++
 kernel/capability.c | 47 +
 kernel/sysctl.c |  5 +
 4 files changed, 76 insertions(+)

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 694968c7523c..a1d39dbae847 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -25,6 +25,7 @@ show up in /proc/sys/kernel:
 - bootloader_version[ X86 only ]
 - callhome  [ S390 only ]
 - cap_last_cap
+- controlled_userns_caps_whitelist
 - core_pattern
 - core_pipe_limit
 - core_uses_pid
@@ -187,6 +188,26 @@ CAP_LAST_CAP from the kernel.
 
 ==
 
+controlled_userns_caps_whitelist
+
+Capability mask that is whitelisted for "controlled" user namespaces.
+Any capability that is missing from this mask will not be allowed to
+any process that is attached to a controlled-userns. e.g. if CAP_NET_RAW
+is not part of this mask, then processes running inside any controlled
+userns's will not be allowed to perform action that needs CAP_NET_RAW
+capability. However, processes that are attached to a parent user-ns
+hierarchy that is *not* controlled and has CAP_NET_RAW can continue
+performing those actions. User-namespaces are marked "controlled" at
+the time of their creation based on the capabilities of the creator.
+A process that does not have CAP_SYS_ADMIN will create user-namespaces
+that are controlled.
+
+The value is expressed as two comma separated hex words (u32). This
+sysctl is available in init-ns and users with CAP_SYS_ADMIN in init-ns
+are allowed to make changes.
+
+==
+
 core_pattern:
 
 core_pattern is used to specify a core dumpfile pattern name.
diff --git a/include/linux/capability.h b/include/linux/capability.h
index b52e278e4744..6c0b9677c03f 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -13,6 +13,7 @@
 #define _LINUX_CAPABILITY_H
 
 #include 
+#include 
 
 
 #define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3
@@ -247,6 +248,8 @@ extern bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns);
 
 /* audit system wants to get cap info from files as well */
 extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct 
cpu_vfs_cap_data *cpu_caps);
+int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
+void __user *buff, size_t *lenp, loff_t *ppos);
 
 extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t 
size);
 
diff --git a/kernel/capability.c b/kernel/capability.c
index f97fe77ceb88..62dbe3350c1b 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -28,6 +28,8 @@ EXPORT_SYMBOL(__cap_empty_set);
 
 int file_caps_enabled = 1;
 
+kernel_cap_t controlled_userns_caps_whitelist = CAP_FULL_SET;
+
 static int __init file_caps_disable(char *str)
 {
file_caps_enabled = 0;
@@ -506,3 +508,48 @@ bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns)
rcu_read_unlock();
return (ret == 0);
 }
+
+/* Controlled-userns capabilities routines */
+#ifdef CONFIG_SYSCTL
+int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
+void __user *buff, size_t *lenp, loff_t *ppos)
+{
+   DECLARE_BITMAP(caps_bitmap, CAP_LAST_CAP);
+   struct ctl_table caps_table;
+   char tbuf[NAME_MAX];
+   int ret;
+
+   ret = bitmap_from_u32array(caps_bitmap, CAP_LAST_CAP,
+  controlled_userns_caps_whitelist.cap,
+  _KERNEL_CAPABILITY_U32S);
+   if (ret != CAP_LAST_CAP)
+   return -1;
+
+   scnprintf(tbuf, NAME_MAX, "%*pb", CAP_LAST_CAP, caps_bitmap);
+
+   caps_table.data = tbuf;
+   caps_table.maxlen = NAME_MAX;
+   caps_table.mode = table->mode;
+   ret = proc_dostring(&caps_table, write, buff, lenp, ppos);
+   if (ret)
+   return ret;
+   if (write) {
+   kernel_cap_t tmp;
+
+   if (!capable(CAP_SYS_ADMIN))
+   return -EPERM;
+
+   ret = bitmap_parse_user(buff, *lenp, caps_bitmap, CAP_LAST_CAP);
+   if (ret)
+   return ret;
+
+   ret = bitmap

[PATCH 0/2] capability controlled user-namespaces

2017-09-29 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

[Same as the previous RFC series sent on 9/21]

TL;DR version
-
Creating a sandbox environment with namespaces is challenging
considering what these sandboxed processes can engage in, e.g.
CVE-2017-6074, CVE-2017-7184, CVE-2017-7308 etc., just to name a few.
Current form of user-namespaces, however, if changed a bit can allow
us to create a sandbox environment without locking down user-
namespaces.

Detailed version


Problem
---
User-namespaces in the current form have increased the attack surface as
any process can acquire capabilities which are not available to them (by
default) by performing combination of clone()/unshare()/setns() syscalls.

#define _GNU_SOURCE
#include <stdio.h>
#include <sched.h>
#include <netinet/in.h>

int main(int ac, char **av)
{
int sock = -1;

printf("Attempting to open RAW socket before unshare()...\n");
sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW);
if (sock < 0) {
perror("socket() SOCK_RAW failed: ");
} else {
printf("Successfully opened RAW-Sock before unshare().\n");
close(sock);
sock = -1;
}

if (unshare(CLONE_NEWUSER | CLONE_NEWNET) < 0) {
perror("unshare() failed: ");
return 1;
}

printf("Attempting to open RAW socket after unshare()...\n");
sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW);
if (sock < 0) {
perror("socket() SOCK_RAW failed: ");
} else {
printf("Successfully opened RAW-Sock after unshare().\n");
close(sock);
sock = -1;
}

return 0;
}

The above example shows how easy it is to acquire NET_RAW capabilities
and once acquired, these processes could take benefit of above mentioned
or similar issues discovered/undiscovered with malicious intent. Note
that this is just an example and the problem/solution is not limited
to NET_RAW capability *only*. 

The easiest fix one can apply here is to lock-down user-namespaces which
many of the distros do (i.e. don't allow users to create user namespaces),
but unfortunately that prevents everyone from using them.

Approach

Introduce a notion of 'controlled' user-namespaces. Every process on
the host is allowed to create user-namespaces (governed by the limit
imposed by per-ns sysctl) however, mark user-namespaces created by
sandboxed processes as 'controlled'. Use this 'mark' at the time of
capability check in conjunction with a global capability whitelist.
If the capability is not whitelisted, processes that belong to 
controlled user-namespaces will not be allowed.

Once a user-ns is marked as 'controlled'; all its child user-
namespaces are marked as 'controlled' too.

A global whitelist is a list of capabilities governed by the
sysctl which is available to (privileged) user in init-ns to modify
while it's applicable to all controlled user-namespaces on the host.

Marking user-namespaces controlled without modifying the whitelist is
equivalent of the current behavior. The default value of whitelist includes
all capabilities so that the compatibility is maintained. However it gives
admins fine-grained ability to control various capabilities system wide
without locking down user-namespaces.

Please see individual patches in this series.

Mahesh Bandewar (2):
  capability: introduce sysctl for controlled user-ns capability
whitelist
  userns: control capabilities of some user namespaces

 Documentation/sysctl/kernel.txt | 21 +
 include/linux/capability.h  |  4 
 include/linux/user_namespace.h  | 20 
 kernel/capability.c | 52 +
 kernel/sysctl.c |  5 
 kernel/user_namespace.c |  3 +++
 security/commoncap.c|  8 +++
 7 files changed, 113 insertions(+)

-- 
2.14.2.822.g60be5d43e6-goog



[PATCH 2/2] userns: control capabilities of some user namespaces

2017-09-29 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

With this new notion of "controlled" user-namespaces, the controlled
user-namespaces are marked at the time of their creation while the
capabilities of processes that belong to them are controlled using the
global mask.

Init-user-ns is always uncontrolled and a process that has SYS_ADMIN
that belongs to uncontrolled user-ns can create another (child) user-
namespace that is uncontrolled. Any other process (that either does
not have SYS_ADMIN or belongs to a controlled user-ns) can only
create a user-ns that is controlled.

global-capability-whitelist (controlled_userns_caps_whitelist) is used
at the capability check-time and keeps the semantics for the processes
that belong to uncontrolled user-ns as it is. Processes that belong to
controlled user-ns however are subjected to different checks-

   (a) if the capability in question is controlled and process belongs
   to controlled user-ns, then it's always denied.
   (b) if the capability in question is NOT controlled then fall back
   to the traditional check.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 include/linux/capability.h |  1 +
 include/linux/user_namespace.h | 20 
 kernel/capability.c|  5 +
 kernel/user_namespace.c|  3 +++
 security/commoncap.c   |  8 
 5 files changed, 37 insertions(+)

diff --git a/include/linux/capability.h b/include/linux/capability.h
index 6c0b9677c03f..b8c6cac18658 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -250,6 +250,7 @@ extern bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns);
 extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct 
cpu_vfs_cap_data *cpu_caps);
 int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
 void __user *buff, size_t *lenp, loff_t *ppos);
+bool is_capability_controlled(int cap);
 
 extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t 
size);
 
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index c18e01252346..e890fe81b47e 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -22,6 +22,7 @@ struct uid_gid_map {  /* 64 bytes -- 1 cache line */
 };
 
 #define USERNS_SETGROUPS_ALLOWED 1UL
+#define USERNS_CONTROLLED   2UL
 
 #define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED
 
@@ -102,6 +103,16 @@ static inline void put_user_ns(struct user_namespace *ns)
__put_user_ns(ns);
 }
 
+static inline bool is_user_ns_controlled(const struct user_namespace *ns)
+{
+   return ns->flags & USERNS_CONTROLLED;
+}
+
+static inline void mark_user_ns_controlled(struct user_namespace *ns)
+{
+   ns->flags |= USERNS_CONTROLLED;
+}
+
 struct seq_operations;
 extern const struct seq_operations proc_uid_seq_operations;
 extern const struct seq_operations proc_gid_seq_operations;
@@ -160,6 +171,15 @@ static inline struct ns_common *ns_get_owner(struct 
ns_common *ns)
 {
return ERR_PTR(-EPERM);
 }
+
+static inline bool is_user_ns_controlled(const struct user_namespace *ns)
+{
+   return false;
+}
+
+static inline void mark_user_ns_controlled(struct user_namespace *ns)
+{
+}
 #endif
 
 #endif /* _LINUX_USER_H */
diff --git a/kernel/capability.c b/kernel/capability.c
index 62dbe3350c1b..40a38cc4ff43 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -510,6 +510,11 @@ bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns)
 }
 
 /* Controlled-userns capabilities routines */
+bool is_capability_controlled(int cap)
+{
+   return !cap_raised(controlled_userns_caps_whitelist, cap);
+}
+
 #ifdef CONFIG_SYSCTL
 int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
 void __user *buff, size_t *lenp, loff_t *ppos)
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index c490f1e4313b..f393ea5108f0 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -53,6 +53,9 @@ static void set_cred_user_ns(struct cred *cred, struct 
user_namespace *user_ns)
cred->cap_effective = CAP_FULL_SET;
cred->cap_ambient = CAP_EMPTY_SET;
cred->cap_bset = CAP_FULL_SET;
+   if (!ns_capable(user_ns->parent, CAP_SYS_ADMIN) ||
+   is_user_ns_controlled(user_ns->parent))
+   mark_user_ns_controlled(user_ns);
 #ifdef CONFIG_KEYS
key_put(cred->request_key_auth);
cred->request_key_auth = NULL;
diff --git a/security/commoncap.c b/security/commoncap.c
index 6bf72b175b49..26f41602da10 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -73,6 +73,14 @@ int cap_capable(const struct cred *cred, struct 
user_namespace *targ_ns,
 {
struct user_namespace *ns = targ_ns;
 
+   /* If the capability is controlled and user-ns that process
+* belongs-to is 'controll

[PATCH next] bonding: speed/duplex update at NETDEV_UP event

2017-09-27 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

Some NIC drivers don't have correct speed/duplex settings at the
time they send NETDEV_UP notification and that messes up the
bonding state. Especially 802.3ad mode which is very sensitive
to these settings. In the current implementation we invoke
bond_update_speed_duplex() when we receive NETDEV_UP, however,
ignore the return value. If the values we get are invalid
(UNKNOWN), then slave gets removed from the aggregator with
speed and duplex set to UNKNOWN while link is still marked as UP.

This patch fixes this scenario. Also 802.3ad mode is sensitive to
these conditions while other modes are not, so making sure that it
doesn't change the behavior for other modes.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 drivers/net/bonding/bond_main.c | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index b7313c1d9dcd..177be373966b 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3076,7 +3076,16 @@ static int bond_slave_netdev_event(unsigned long event,
break;
case NETDEV_UP:
case NETDEV_CHANGE:
-   bond_update_speed_duplex(slave);
+   /* For 802.3ad mode only:
+* Getting invalid Speed/Duplex values here will put slave
+* in weird state. So mark it as link-down for the time
+* being and let link-monitoring (miimon) set it right when
+* correct speeds/duplex are available.
+*/
+   if (bond_update_speed_duplex(slave) &&
+   BOND_MODE(bond) == BOND_MODE_8023AD)
+   slave->link = BOND_LINK_DOWN;
+
if (BOND_MODE(bond) == BOND_MODE_8023AD)
bond_3ad_adapter_speed_duplex_changed(slave);
/* Fallthrough */
-- 
2.14.2.822.g60be5d43e6-goog



[RFC PATCH 0/2] capability controlled user-namespaces

2017-09-21 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

TL;DR version
-
Creating a sandbox environment with namespaces is challenging
considering what these sandboxed processes can engage in, e.g.
CVE-2017-6074, CVE-2017-7184, CVE-2017-7308 etc., just to name a few.
Current form of user-namespaces, however, if changed a bit can allow
us to create a sandbox environment without locking down user-
namespaces.

Detailed version


Problem
---
User-namespaces in the current form have increased the attack surface as
any process can acquire capabilities which are not available to them (by
default) by performing combination of clone()/unshare()/setns() syscalls.

#define _GNU_SOURCE
#include <stdio.h>
#include <sched.h>
#include <netinet/in.h>

int main(int ac, char **av)
{
int sock = -1;

printf("Attempting to open RAW socket before unshare()...\n");
sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW);
if (sock < 0) {
perror("socket() SOCK_RAW failed: ");
} else {
printf("Successfully opened RAW-Sock before unshare().\n");
close(sock);
sock = -1;
}

if (unshare(CLONE_NEWUSER | CLONE_NEWNET) < 0) {
perror("unshare() failed: ");
return 1;
}

printf("Attempting to open RAW socket after unshare()...\n");
sock = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW);
if (sock < 0) {
perror("socket() SOCK_RAW failed: ");
} else {
printf("Successfully opened RAW-Sock after unshare().\n");
close(sock);
sock = -1;
}

return 0;
}

The above example shows how easy it is to acquire NET_RAW capabilities
and once acquired, these processes could take benefit of above mentioned
or similar issues discovered/undiscovered with malicious intent. Note
that this is just an example and the problem/solution is not limited
to NET_RAW capability *only*. 

The easiest fix one can apply here is to lock-down user-namespaces which
many of the distros do (i.e. don't allow users to create user namespaces),
but unfortunately that prevents everyone from using them.

Approach

Introduce a notion of 'controlled' user-namespaces. Every process on
the host is allowed to create user-namespaces (governed by the limit
imposed by per-ns sysctl) however, mark user-namespaces created by
sandboxed processes as 'controlled'. Use this 'mark' at the time of
capability check in conjunction with a global capability whitelist.
If the capability is not whitelisted, processes that belong to 
controlled user-namespaces will not be allowed.

Once a user-ns is marked as 'controlled'; all its child user-
namespaces are marked as 'controlled' too.

A global whitelist is list of capabilities governed by the
sysctl which is available to (privileged) user in init-ns to modify
while it's applicable to all controlled user-namespaces on the host.

Marking user-namespaces controlled without modifying the whitelist is
equivalent of the current behavior. The default value of whitelist includes
all capabilities so that the compatibility is maintained. However it gives
admins fine-grained ability to control various capabilities system wide
without locking down user-namespaces.

Please see individual patches in this series.

Mahesh Bandewar (2):
  capability: introduce sysctl for controlled user-ns capability
whitelist
  userns: control capabilities of some user namespaces

 Documentation/sysctl/kernel.txt | 21 +
 include/linux/capability.h  |  4 
 include/linux/user_namespace.h  | 20 
 kernel/capability.c | 52 +
 kernel/sysctl.c |  5 
 kernel/user_namespace.c |  3 +++
 security/commoncap.c|  8 +++
 7 files changed, 113 insertions(+)

-- 
2.14.1.821.g8fa685d3b7-goog



[RFC PATCH 2/2] userns: control capabilities of some user namespaces

2017-09-21 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

With this new notion of "controlled" user-namespaces, the controlled
user-namespaces are marked at the time of their creation while the
capabilities of processes that belong to them are controlled using the
global mask.

Init-user-ns is always uncontrolled and a process that has SYS_ADMIN
that belongs to uncontrolled user-ns can create another (child) user-
namespace that is uncontrolled. Any other process (that either does
not have SYS_ADMIN or belongs to a controlled user-ns) can only
create a user-ns that is controlled.

global-capability-whitelist (controlled_userns_caps_whitelist) is used
at the capability check-time and keeps the semantics for the processes
that belong to uncontrolled user-ns as it is. Processes that belong to
controlled user-ns however are subjected to different checks-

   (a) if the capability in question is controlled and process belongs
   to controlled user-ns, then it's always denied.
   (b) if the capability in question is NOT controlled then fall back
   to the traditional check.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 include/linux/capability.h |  1 +
 include/linux/user_namespace.h | 20 
 kernel/capability.c|  5 +
 kernel/user_namespace.c|  3 +++
 security/commoncap.c   |  8 
 5 files changed, 37 insertions(+)

diff --git a/include/linux/capability.h b/include/linux/capability.h
index 6c0b9677c03f..b8c6cac18658 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -250,6 +250,7 @@ extern bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns);
 extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct 
cpu_vfs_cap_data *cpu_caps);
 int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
 void __user *buff, size_t *lenp, loff_t *ppos);
+bool is_capability_controlled(int cap);
 
 extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t 
size);
 
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index c18e01252346..e890fe81b47e 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -22,6 +22,7 @@ struct uid_gid_map {  /* 64 bytes -- 1 cache line */
 };
 
 #define USERNS_SETGROUPS_ALLOWED 1UL
+#define USERNS_CONTROLLED   2UL
 
 #define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED
 
@@ -102,6 +103,16 @@ static inline void put_user_ns(struct user_namespace *ns)
__put_user_ns(ns);
 }
 
+static inline bool is_user_ns_controlled(const struct user_namespace *ns)
+{
+   return ns->flags & USERNS_CONTROLLED;
+}
+
+static inline void mark_user_ns_controlled(struct user_namespace *ns)
+{
+   ns->flags |= USERNS_CONTROLLED;
+}
+
 struct seq_operations;
 extern const struct seq_operations proc_uid_seq_operations;
 extern const struct seq_operations proc_gid_seq_operations;
@@ -160,6 +171,15 @@ static inline struct ns_common *ns_get_owner(struct 
ns_common *ns)
 {
return ERR_PTR(-EPERM);
 }
+
+static inline bool is_user_ns_controlled(const struct user_namespace *ns)
+{
+   return false;
+}
+
+static inline void mark_user_ns_controlled(struct user_namespace *ns)
+{
+}
 #endif
 
 #endif /* _LINUX_USER_H */
diff --git a/kernel/capability.c b/kernel/capability.c
index 62dbe3350c1b..40a38cc4ff43 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -510,6 +510,11 @@ bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns)
 }
 
 /* Controlled-userns capabilities routines */
+bool is_capability_controlled(int cap)
+{
+   return !cap_raised(controlled_userns_caps_whitelist, cap);
+}
+
 #ifdef CONFIG_SYSCTL
 int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
 void __user *buff, size_t *lenp, loff_t *ppos)
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index c490f1e4313b..f393ea5108f0 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -53,6 +53,9 @@ static void set_cred_user_ns(struct cred *cred, struct 
user_namespace *user_ns)
cred->cap_effective = CAP_FULL_SET;
cred->cap_ambient = CAP_EMPTY_SET;
cred->cap_bset = CAP_FULL_SET;
+   if (!ns_capable(user_ns->parent, CAP_SYS_ADMIN) ||
+   is_user_ns_controlled(user_ns->parent))
+   mark_user_ns_controlled(user_ns);
 #ifdef CONFIG_KEYS
key_put(cred->request_key_auth);
cred->request_key_auth = NULL;
diff --git a/security/commoncap.c b/security/commoncap.c
index 6bf72b175b49..26f41602da10 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -73,6 +73,14 @@ int cap_capable(const struct cred *cred, struct 
user_namespace *targ_ns,
 {
struct user_namespace *ns = targ_ns;
 
+   /* If the capability is controlled and user-ns that process
+* belongs-to is 'controll

[RFC PATCH 1/2] capability: introduce sysctl for controlled user-ns capability whitelist

2017-09-21 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

Add a sysctl variable kernel.controlled_userns_caps_whitelist. This
takes input as capability mask expressed as two comma separated hex
u32 words. The mask, however, is stored in kernel as kernel_cap_t type.

Any capabilities that are not part of this mask will be controlled and
will not be allowed to processes in controlled user-ns.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
CC: Serge Hallyn <se...@hallyn.com>
CC: Kees Cook <keesc...@chromium.org>
CC: "Eric W. Biederman" <ebied...@xmission.com>

---
 Documentation/sysctl/kernel.txt | 21 ++
 include/linux/capability.h  |  3 +++
 kernel/capability.c | 47 +
 kernel/sysctl.c |  5 +
 4 files changed, 76 insertions(+)

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index ce61d1fe08ca..ec0d74476f48 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -25,6 +25,7 @@ show up in /proc/sys/kernel:
 - bootloader_version[ X86 only ]
 - callhome  [ S390 only ]
 - cap_last_cap
+- controlled_userns_caps_whitelist
 - core_pattern
 - core_pipe_limit
 - core_uses_pid
@@ -186,6 +187,26 @@ CAP_LAST_CAP from the kernel.
 
 ==
 
+controlled_userns_caps_whitelist
+
+Capability mask that is whitelisted for "controlled" user namespaces.
+Any capability that is missing from this mask will not be allowed to
+any process that is attached to a controlled-userns. e.g. if CAP_NET_RAW
+is not part of this mask, then processes running inside any controlled
+userns's will not be allowed to perform action that needs CAP_NET_RAW
+capability. However, processes that are attached to a parent user-ns
+hierarchy that is *not* controlled and has CAP_NET_RAW can continue
+performing those actions. User-namespaces are marked "controlled" at
+the time of their creation based on the capabilities of the creator.
+A process that does not have CAP_SYS_ADMIN will create user-namespaces
+that are controlled.
+
+The value is expressed as two comma separated hex words (u32). This
+sysctl is available in init-ns and users with CAP_SYS_ADMIN in init-ns
+are allowed to make changes.
+
+==
+
 core_pattern:
 
 core_pattern is used to specify a core dumpfile pattern name.
diff --git a/include/linux/capability.h b/include/linux/capability.h
index b52e278e4744..6c0b9677c03f 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -13,6 +13,7 @@
 #define _LINUX_CAPABILITY_H
 
 #include 
+#include 
 
 
 #define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3
@@ -247,6 +248,8 @@ extern bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns);
 
 /* audit system wants to get cap info from files as well */
 extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct 
cpu_vfs_cap_data *cpu_caps);
+int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
+void __user *buff, size_t *lenp, loff_t *ppos);
 
 extern int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t 
size);
 
diff --git a/kernel/capability.c b/kernel/capability.c
index f97fe77ceb88..62dbe3350c1b 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -28,6 +28,8 @@ EXPORT_SYMBOL(__cap_empty_set);
 
 int file_caps_enabled = 1;
 
+kernel_cap_t controlled_userns_caps_whitelist = CAP_FULL_SET;
+
 static int __init file_caps_disable(char *str)
 {
file_caps_enabled = 0;
@@ -506,3 +508,48 @@ bool ptracer_capable(struct task_struct *tsk, struct 
user_namespace *ns)
rcu_read_unlock();
return (ret == 0);
 }
+
+/* Controlled-userns capabilities routines */
+#ifdef CONFIG_SYSCTL
+int proc_douserns_caps_whitelist(struct ctl_table *table, int write,
+void __user *buff, size_t *lenp, loff_t *ppos)
+{
+   DECLARE_BITMAP(caps_bitmap, CAP_LAST_CAP);
+   struct ctl_table caps_table;
+   char tbuf[NAME_MAX];
+   int ret;
+
+   ret = bitmap_from_u32array(caps_bitmap, CAP_LAST_CAP,
+  controlled_userns_caps_whitelist.cap,
+  _KERNEL_CAPABILITY_U32S);
+   if (ret != CAP_LAST_CAP)
+   return -1;
+
+   scnprintf(tbuf, NAME_MAX, "%*pb", CAP_LAST_CAP, caps_bitmap);
+
+   caps_table.data = tbuf;
+   caps_table.maxlen = NAME_MAX;
+   caps_table.mode = table->mode;
+   ret = proc_dostring(_table, write, buff, lenp, ppos);
+   if (ret)
+   return ret;
+   if (write) {
+   kernel_cap_t tmp;
+
+   if (!capable(CAP_SYS_ADMIN))
+   return -EPERM;
+
+   ret = bitmap_parse_user(buff, *lenp, caps_bitmap, C

[PATCH next] neigh: initialize neigh entry correctly during arp processing

2017-08-16 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

If the ARP processing creates a neigh entry, it's immediately marked
as STALE without a timer and stays in that state as long as the
host does not send traffic to that neighbour.

I observed this on hosts which are in IPv6 environment, where there is
very little to no IPv4 traffic and neigh-entries are stuck in STALE
mode. Ideally, the host should have PROBEd these neighbours before it
can send the first packet out.

It happens as a result of following call sequence in an environment
where host is mostly quiet as far as IPv4 traffic but few connected
hosts/gateways are sending ARPs.

   arp_process()
 neigh_event_ns()
   neigh_lookup()
 neigh_create()
   neigh_alloc()
 nud_state=NUD_NONE
 neigh_update(nud_state=NUD_STALE)

In the above scenario, the neighbour entry does not get a chance to get
PROBEd as the subsequent call to neigh_update() marks this entry STALE.
This patch initializes the neigh-entry correctly if it was created as a
result of neigh_lookup instead of just updating it in neigh_event_ns()
right after creating it.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 net/core/neighbour.c | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 16a1a4c4eb57..d8a35db6c43b 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1300,9 +1300,13 @@ struct neighbour *neigh_event_ns(struct neigh_table *tbl,
 {
struct neighbour *neigh = __neigh_lookup(tbl, saddr, dev,
 lladdr || !dev->addr_len);
-   if (neigh)
-   neigh_update(neigh, lladdr, NUD_STALE,
-NEIGH_UPDATE_F_OVERRIDE, 0);
+   if (neigh) {
+   if (neigh->nud_state & NUD_VALID)
+   neigh_update(neigh, lladdr, NUD_STALE,
+NEIGH_UPDATE_F_OVERRIDE, 0);
+   else
+   neigh_event_send(neigh, NULL);
+   }
return neigh;
 }
 EXPORT_SYMBOL(neigh_event_ns);
-- 
2.14.1.480.gb18f417b89-goog



[PATCH 1/3] ipv4: initialize fib_trie prior to register_netdev_notifier call.

2017-07-19 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

Net stack initialization currently initializes fib-trie after the
first call to netdevice_notifier(). In fact, fib_trie initialization
needs to happen before the first rtnl_register(). It does not cause any problem
since there are no devices UP at this moment, but trying to bring 'lo'
UP at initialization would make this assumption wrong and expose the issue.

Fixes following crash

 Call Trace:
  ? alternate_node_alloc+0x76/0xa0
  fib_table_insert+0x1b7/0x4b0
  fib_magic.isra.17+0xea/0x120
  fib_add_ifaddr+0x7b/0x190
  fib_netdev_event+0xc0/0x130
  register_netdevice_notifier+0x1c1/0x1d0
  ip_fib_init+0x72/0x85
  ip_rt_init+0x187/0x1e9
  ip_init+0xe/0x1a
  inet_init+0x171/0x26c
  ? ipv4_offload_init+0x66/0x66
  do_one_initcall+0x43/0x160
  kernel_init_freeable+0x191/0x219
  ? rest_init+0x80/0x80
  kernel_init+0xe/0x150
  ret_from_fork+0x22/0x30
 Code: f6 46 23 04 74 86 4c 89 f7 e8 ae 45 01 00 49 89 c7 4d 85 ff 0f 85 7b ff 
ff ff 31 db eb 08 4c 89 ff e8 16 47 01 00 48 8b 44 24 38 <45> 8b 6e 14 4d 63 76 
74 48 89 04 24 0f 1f 44 00 00 48 83 c4 08
 RIP: kmem_cache_alloc+0xcf/0x1c0 RSP: 9b1500017c28
 CR2: 0014

Fixes: 7b1a74fdbb9e ("[NETNS]: Refactor fib initialization so it can handle 
multiple namespaces.")
Fixes: 7f9b80529b8a ("[IPV4]: fib hash|trie initialization")

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 net/ipv4/fib_frontend.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 4e678fa892dd..044d2a159a3c 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1334,13 +1334,14 @@ static struct pernet_operations fib_net_ops = {
 
 void __init ip_fib_init(void)
 {
-   rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL);
-   rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL);
-   rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL);
+   fib_trie_init();
 
register_pernet_subsys(&fib_net_ops);
+
register_netdevice_notifier(&fib_netdev_notifier);
register_inetaddr_notifier(&fib_inetaddr_notifier);
 
-   fib_trie_init();
+   rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL);
+   rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL);
+   rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL);
 }
-- 
2.14.0.rc0.284.gd933b75aa4-goog



[PATCH 1/2] ipv4: initialize fib_trie prior to register_netdev_notifier call.

2017-07-04 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

Net stack initialization currently initializes fib-trie after the
first call to netdevice_notifier(). It does not cause any problem
since there are no devices UP at this moment, but trying to bring 'lo'
UP at initialization would make this assumption wrong. However, changing
the order solves the issue.

Fixes following crash

[8.647095] BUG: unable to handle kernel NULL pointer dereference at 
0014
[8.654836] IP: kmem_cache_alloc+0xcf/0x1c0
[8.658966] PGD 0
[8.658967] P4D 0
[8.660953]
[8.664406] Oops:  [#1] SMP
[8.667505] Modules linked in:
[8.670519] CPU: 4 PID: 1 Comm: swapper/0 Not tainted 4.12.0-smp-DEV #34
[8.677138] Hardware name: Intel Grantley,Wellsburg/Ixion_IT_15, BIOS 2.60.0 
06/03/2016
[8.685043] task: 8f26b2b00dc0 task.stack: 9b1500014000
[8.690891] RIP: 0010:kmem_cache_alloc+0xcf/0x1c0
[8.695535] RSP: :9b1500017c28 EFLAGS: 00010246
[8.700696] RAX: 96ed4f47 RBX:  RCX: 0200
[8.707743] RDX: 0400 RSI: 014000c0 RDI: 
[8.714791] RBP: 9b1500017c58 R08: 0001 R09: 
[8.721837] R10:  R11:  R12: 014000c0
[8.728886] R13: 014000c0 R14:  R15: 
[8.735932] FS:  () GS:8f26bf50() 
knlGS:
[8.743925] CS:  0010 DS:  ES:  CR0: 80050033
[8.749599] CR2: 0014 CR3: 001d05609000 CR4: 001406e0
[8.756648] Call Trace:
[8.759061]  ? alternate_node_alloc+0x76/0xa0
[8.763364]  fib_table_insert+0x1b7/0x4b0
[8.767323]  fib_magic.isra.17+0xea/0x120
[8.771283]  fib_add_ifaddr+0x7b/0x190
[8.774983]  fib_netdev_event+0xc0/0x130
[8.778857]  register_netdevice_notifier+0x1c1/0x1d0
[8.783761]  ip_fib_init+0x72/0x85
[8.787120]  ip_rt_init+0x187/0x1e9
[8.790564]  ip_init+0xe/0x1a
[8.793494]  inet_init+0x171/0x26c
[8.796851]  ? ipv4_offload_init+0x66/0x66
[8.800896]  do_one_initcall+0x43/0x160
[8.804685]  kernel_init_freeable+0x191/0x219
[8.808987]  ? rest_init+0x80/0x80
[8.812343]  kernel_init+0xe/0x150
[8.815702]  ret_from_fork+0x22/0x30
[8.819229] Code: f6 46 23 04 74 86 4c 89 f7 e8 ae 45 01 00 49 89 c7 4d 85 
ff 0f 85 7b ff ff ff 31 db eb 08 4c 89 ff e8 16 47 01 00 48 8b 44 24 38 <45> 8b 
6e 14 4d 63 76 74 48 89 04 24 0f 1f 44 00 00 48 83 c4 08
[8.837882] RIP: kmem_cache_alloc+0xcf/0x1c0 RSP: 9b1500017c28
[8.843986] CR2: 0014
[8.847288] ---[ end trace 8fc60994c8a367cb ]---
[8.851847] Kernel panic - not syncing: Fatal exception
[8.857035] Rebooting in 10 seconds..

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 net/ipv4/fib_frontend.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 4e678fa892dd..ca22ca47019a 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1338,9 +1338,9 @@ void __init ip_fib_init(void)
rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL);
rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL);
 
+   fib_trie_init();
+
register_pernet_subsys(&fib_net_ops);
register_netdevice_notifier(&fib_netdev_notifier);
register_inetaddr_notifier(&fib_inetaddr_notifier);
-
-   fib_trie_init();
 }
-- 
2.13.2.725.g09c95d1e9-goog



[PATCH 2/2] loopback: bringup 'lo' by default at initialization

2017-07-04 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

Loopback devices are always brought up right after their initialization,
including the case of network namespace creation, e.g.

ip netns add foo
ip -netns foo link set lo up

This patch will eliminate the need to do that separately and would bring
it up as part of the loopback initialization.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 drivers/net/loopback.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index 30612497643c..3216977935d1 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -217,6 +217,12 @@ static __net_init int loopback_net_init(struct net *net)
 
BUG_ON(dev->ifindex != LOOPBACK_IFINDEX);
net->loopback_dev = dev;
+
+   /* Set the loopback device UP */
+   rtnl_lock();
+   dev_open(net->loopback_dev);
+   rtnl_unlock();
+
return 0;
 
 out_free_netdev:
-- 
2.13.2.725.g09c95d1e9-goog



[PATCH 0/2] bring UP loopback device at initialization

2017-07-04 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

In almost every scenario the loopback device is brought UP after
initialization. So there is no point in bringing up the device in
DOWN state followed by a device UP operation. This change exposed
another issue of fib-trie initialization which is corrected in the
first patch.

Mahesh Bandewar (2):
  ipv4: initialize fib_trie prior to register_netdev_notifier call.
  loopback: bringup 'lo' by default at initialization

 drivers/net/loopback.c  | 6 ++
 net/ipv4/fib_frontend.c | 4 ++--
 2 files changed, 8 insertions(+), 2 deletions(-)

-- 
2.13.2.725.g09c95d1e9-goog



[PATCH net] ipv6: avoid dad-failures for addresses with NODAD

2017-05-12 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

Every address gets added with the TENTATIVE flag, even the addresses with
the IFA_F_NODAD flag, and dad-work is scheduled for them. During this DAD process
we realize it's an address with NODAD and complete the process without
sending any probe. However, the TENTATIVE flag stays on the
address long enough to cause misinterpretation when we receive an NS.
While processing an NS, if the address has the TENTATIVE flag, we mark it DADFAILED
and end up with an address that was originally configured as NODAD with
DADFAILED.

We can't avoid scheduling dad_work for addresses with NODAD but we can
avoid adding TENTATIVE flag to avoid this racy situation.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 net/ipv6/addrconf.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index b09ac38d8dc4..53f2dc092023 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1022,7 +1022,10 @@ ipv6_add_addr(struct inet6_dev *idev, const struct 
in6_addr *addr,
INIT_HLIST_NODE(&ifa->addr_lst);
ifa->scope = scope;
ifa->prefix_len = pfxlen;
-   ifa->flags = flags | IFA_F_TENTATIVE;
+   ifa->flags = flags;
+   /* No need to add the TENTATIVE flag for addresses with NODAD */
+   if (!(flags & IFA_F_NODAD))
+   ifa->flags |= IFA_F_TENTATIVE;
ifa->valid_lft = valid_lft;
ifa->prefered_lft = prefered_lft;
ifa->cstamp = ifa->tstamp = jiffies;
-- 
2.13.0.rc2.291.g57267f2277-goog



[PATCH] kmod: don't load module unless req process has CAP_SYS_MODULE

2017-05-12 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

A process inside a random user-ns should not be able to load a module, which is
currently possible, as demonstrated in the following scenario -

  Create namespaces; especially a user-ns and become root inside.
  $ unshare -rfUp -- unshare -unm -- bash

  Try to load the bridge module. It should fail and this is expected!
  #  modprobe bridge
  WARNING: Error inserting stp 
(/lib/modules/4.11.0-smp-DEV/kernel/net/802/stp.ko): Operation not permitted
  FATAL: Error inserting bridge 
(/lib/modules/4.11.0-smp-DEV/kernel/net/bridge/bridge.ko): Operation not 
permitted

  Verify bridge module is not loaded.
  # lsmod | grep bridge
  #

  Now try to create a bridge inside this newly created net-ns which would
  mean bridge module need to be loaded.
  # ip link add br0 type bridge
  # echo $?
  0
  # lsmod | grep bridge
  bridge110592  0
  stp16384  1 bridge
  llc16384  2 bridge,stp
  #

  After this patch -
  # ip link add br0 type bridge
  RTNETLINK answers: Operation not supported
  # echo $?
  2
  # lsmod | grep bridge
  #

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 kernel/kmod.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/kmod.c b/kernel/kmod.c
index 563f97e2be36..ac30157169b7 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -133,6 +133,9 @@ int __request_module(bool wait, const char *fmt, ...)
 #define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
static int kmod_loop_msg;
 
+   if (!capable(CAP_SYS_MODULE))
+   return -EPERM;
+
/*
 * We don't allow synchronous module loading from async.  Module
 * init may invoke async_synchronize_full() which will end up
-- 
2.13.0.rc2.291.g57267f2277-goog



[PATCHv2 next] bonding: fix wq initialization for links created via netlink

2017-04-20 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

Earlier patch 4493b81bea ("bonding: initialize work-queues during
creation of bond") moved the work-queue initialization from bond_open()
to bond_create(). However, this caused links that are created using the
netlink 'create bond option' (ip link add bondX type bond) to create the
new trunk without initializing work-queues. Prior to the above mentioned
change, ndo_open was in both paths and things worked correctly. The
consequence is visible in the report shared by Joe Stringer -

I've noticed that this patch breaks bonding within namespaces if
you're not careful to perform device cleanup correctly.

Here's my repro script, you can run on any net-next with this patch
and you'll start seeing some weird behaviour:

ip netns add foo
ip li add veth0 type veth peer name veth0+ netns foo
ip li add veth1 type veth peer name veth1+ netns foo
ip netns exec foo ip li add bond0 type bond
ip netns exec foo ip li set dev veth0+ master bond0
ip netns exec foo ip li set dev veth1+ master bond0
ip netns exec foo ip addr add dev bond0 192.168.0.1/24
ip netns exec foo ip li set dev bond0 up
ip li del dev veth0
ip li del dev veth1

The second to last command segfaults, last command hangs. rtnl is now
permanently locked. It's not a problem if you take bond0 down before
deleting veths, or delete bond0 before deleting veths. If you delete
either end of the veth pair as per above, either inside or outside the
namespace, it hits this problem.

Here's some kernel logs:
[ 1221.801610] bond0: Enslaving veth0+ as an active interface with an up link
[ 1224.449581] bond0: Enslaving veth1+ as an active interface with an up link
[ 1281.193863] bond0: Releasing backup interface veth0+
[ 1281.193866] bond0: the permanent HWaddr of veth0+ -
16:bf:fb:e0:b8:43 - is still in use by bond0 - set the HWaddr of
veth0+ to a different address to avoid conflicts
[ 1281.193867] [ cut here ]
[ 1281.193873] WARNING: CPU: 0 PID: 2024 at kernel/workqueue.c:1511
__queue_delayed_work+0x13f/0x150
[ 1281.193873] Modules linked in: bonding veth openvswitch nf_nat_ipv6
nf_nat_ipv4 nf_nat autofs4 nfsd auth_rpcgss nfs_acl binfmt_misc nfs
lockd grace sunrpc fscache ppdev vmw_balloon coretemp psmouse
serio_raw vmwgfx ttm drm_kms_helper vmw_vmci netconsole parport_pc
configfs drm i2c_piix4 fb_sys_fops syscopyarea sysfillrect sysimgblt
shpchp mac_hid nf_conntrack_ipv6 nf_defrag_ipv6 nf_conntrack_ipv4
nf_defrag_ipv4 nf_conntrack libcrc32c lp parport hid_generic usbhid
hid mptspi mptscsih e1000 mptbase ahci libahci
[ 1281.193905] CPU: 0 PID: 2024 Comm: ip Tainted: GW
4.10.0-bisect-bond-v0.14 #37
[ 1281.193906] Hardware name: VMware, Inc. VMware Virtual
Platform/440BX Desktop Reference Platform, BIOS 6.00 09/30/2014
[ 1281.193906] Call Trace:
[ 1281.193912]  dump_stack+0x63/0x89
[ 1281.193915]  __warn+0xd1/0xf0
[ 1281.193917]  warn_slowpath_null+0x1d/0x20
[ 1281.193918]  __queue_delayed_work+0x13f/0x150
[ 1281.193920]  queue_delayed_work_on+0x27/0x40
[ 1281.193929]  bond_change_active_slave+0x25b/0x670 [bonding]
[ 1281.193932]  ? synchronize_rcu_expedited+0x27/0x30
[ 1281.193935]  __bond_release_one+0x489/0x510 [bonding]
[ 1281.193939]  ? addrconf_notify+0x1b7/0xab0
[ 1281.193942]  bond_netdev_event+0x2c5/0x2e0 [bonding]
[ 1281.193944]  ? netconsole_netdev_event+0x124/0x190 [netconsole]
[ 1281.193947]  notifier_call_chain+0x49/0x70
[ 1281.193948]  raw_notifier_call_chain+0x16/0x20
[ 1281.193950]  call_netdevice_notifiers_info+0x35/0x60
[ 1281.193951]  rollback_registered_many+0x23b/0x3e0
[ 1281.193953]  unregister_netdevice_many+0x24/0xd0
[ 1281.193955]  rtnl_delete_link+0x3c/0x50
[ 1281.193956]  rtnl_dellink+0x8d/0x1b0
[ 1281.193960]  rtnetlink_rcv_msg+0x95/0x220
[ 1281.193962]  ? __kmalloc_node_track_caller+0x35/0x280
[ 1281.193964]  ? __netlink_lookup+0xf1/0x110
[ 1281.193966]  ? rtnl_newlink+0x830/0x830
[ 1281.193967]  netlink_rcv_skb+0xa7/0xc0
[ 1281.193969]  rtnetlink_rcv+0x28/0x30
[ 1281.193970]  netlink_unicast+0x15b/0x210
[ 1281.193971]  netlink_sendmsg+0x319/0x390
[ 1281.193974]  sock_sendmsg+0x38/0x50
[ 1281.193975]  ___sys_sendmsg+0x25c/0x270
[ 1281.193978]  ? mem_cgroup_commit_charge+0x76/0xf0
[ 1281.193981]  ? page_add_new_anon_rmap+0x89/0xc0
[ 1281.193984]  ? lru_cache_add_active_or_unevictable+0x35/0xb0
[ 1281.193985]  ? __handle_mm_fault+0x4e9/0x1170
[ 1281.193987]  __sys_sendmsg+0x45/0x80
[ 1281.193989]  SyS_sendmsg+0x12/0x20
[ 1281.193991]  do_syscall_64+0x6e/0x180
[ 1281.193993]  entry_SYSCALL64_slow_path+0x25/0x25
[ 1281.193995] RIP: 0033:0x7f6ec122f5a0
[ 1281.193995] RSP: 002b:7ffe69e89c48 EFLAGS: 0246 ORIG_RAX:
002e
[ 1281.193997] RAX: ffda RBX: 7ffe69e8dd60 RCX: 7f6ec122f5a0
[ 1281.193997] RDX:  RSI: 7ffe69e89c90 RDI: 0003
[ 1281.193998] RBP: 7ffe69e89c90 R08:  R09: 0003
[ 1281.193999] R10: 7ffe69e89a10 R11: 0246 R12: 58f14b9f
[ 

[PATCH next] bonding: fix wq initialization for links created via netlink

2017-04-19 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

Earlier patch 4493b81bea ("bonding: initialize work-queues during
creation of bond") moved the work-queue initialization from bond_open()
to bond_create(). However, this caused links that are created using the
netlink 'create bond option' (ip link add bondX type bond) to create the
new trunk without initializing work-queues. Prior to the above mentioned
change, ndo_open was in both paths and things worked correctly. The
consequence is visible in the report shared by Joe Stringer -

I've noticed that this patch breaks bonding within namespaces if
you're not careful to perform device cleanup correctly.

Here's my repro script, you can run on any net-next with this patch
and you'll start seeing some weird behaviour:

ip netns add foo
ip li add veth0 type veth peer name veth0+ netns foo
ip li add veth1 type veth peer name veth1+ netns foo
ip netns exec foo ip li add bond0 type bond
ip netns exec foo ip li set dev veth0+ master bond0
ip netns exec foo ip li set dev veth1+ master bond0
ip netns exec foo ip addr add dev bond0 192.168.0.1/24
ip netns exec foo ip li set dev bond0 up
ip li del dev veth0
ip li del dev veth1

The second to last command segfaults, last command hangs. rtnl is now
permanently locked. It's not a problem if you take bond0 down before
deleting veths, or delete bond0 before deleting veths. If you delete
either end of the veth pair as per above, either inside or outside the
namespace, it hits this problem.

Here's some kernel logs:
[ 1221.801610] bond0: Enslaving veth0+ as an active interface with an up link
[ 1224.449581] bond0: Enslaving veth1+ as an active interface with an up link
[ 1281.193863] bond0: Releasing backup interface veth0+
[ 1281.193866] bond0: the permanent HWaddr of veth0+ -
16:bf:fb:e0:b8:43 - is still in use by bond0 - set the HWaddr of
veth0+ to a different address to avoid conflicts
[ 1281.193867] [ cut here ]
[ 1281.193873] WARNING: CPU: 0 PID: 2024 at kernel/workqueue.c:1511
__queue_delayed_work+0x13f/0x150
[ 1281.193873] Modules linked in: bonding veth openvswitch nf_nat_ipv6
nf_nat_ipv4 nf_nat autofs4 nfsd auth_rpcgss nfs_acl binfmt_misc nfs
lockd grace sunrpc fscache ppdev vmw_balloon coretemp psmouse
serio_raw vmwgfx ttm drm_kms_helper vmw_vmci netconsole parport_pc
configfs drm i2c_piix4 fb_sys_fops syscopyarea sysfillrect sysimgblt
shpchp mac_hid nf_conntrack_ipv6 nf_defrag_ipv6 nf_conntrack_ipv4
nf_defrag_ipv4 nf_conntrack libcrc32c lp parport hid_generic usbhid
hid mptspi mptscsih e1000 mptbase ahci libahci
[ 1281.193905] CPU: 0 PID: 2024 Comm: ip Tainted: GW
4.10.0-bisect-bond-v0.14 #37
[ 1281.193906] Hardware name: VMware, Inc. VMware Virtual
Platform/440BX Desktop Reference Platform, BIOS 6.00 09/30/2014
[ 1281.193906] Call Trace:
[ 1281.193912]  dump_stack+0x63/0x89
[ 1281.193915]  __warn+0xd1/0xf0
[ 1281.193917]  warn_slowpath_null+0x1d/0x20
[ 1281.193918]  __queue_delayed_work+0x13f/0x150
[ 1281.193920]  queue_delayed_work_on+0x27/0x40
[ 1281.193929]  bond_change_active_slave+0x25b/0x670 [bonding]
[ 1281.193932]  ? synchronize_rcu_expedited+0x27/0x30
[ 1281.193935]  __bond_release_one+0x489/0x510 [bonding]
[ 1281.193939]  ? addrconf_notify+0x1b7/0xab0
[ 1281.193942]  bond_netdev_event+0x2c5/0x2e0 [bonding]
[ 1281.193944]  ? netconsole_netdev_event+0x124/0x190 [netconsole]
[ 1281.193947]  notifier_call_chain+0x49/0x70
[ 1281.193948]  raw_notifier_call_chain+0x16/0x20
[ 1281.193950]  call_netdevice_notifiers_info+0x35/0x60
[ 1281.193951]  rollback_registered_many+0x23b/0x3e0
[ 1281.193953]  unregister_netdevice_many+0x24/0xd0
[ 1281.193955]  rtnl_delete_link+0x3c/0x50
[ 1281.193956]  rtnl_dellink+0x8d/0x1b0
[ 1281.193960]  rtnetlink_rcv_msg+0x95/0x220
[ 1281.193962]  ? __kmalloc_node_track_caller+0x35/0x280
[ 1281.193964]  ? __netlink_lookup+0xf1/0x110
[ 1281.193966]  ? rtnl_newlink+0x830/0x830
[ 1281.193967]  netlink_rcv_skb+0xa7/0xc0
[ 1281.193969]  rtnetlink_rcv+0x28/0x30
[ 1281.193970]  netlink_unicast+0x15b/0x210
[ 1281.193971]  netlink_sendmsg+0x319/0x390
[ 1281.193974]  sock_sendmsg+0x38/0x50
[ 1281.193975]  ___sys_sendmsg+0x25c/0x270
[ 1281.193978]  ? mem_cgroup_commit_charge+0x76/0xf0
[ 1281.193981]  ? page_add_new_anon_rmap+0x89/0xc0
[ 1281.193984]  ? lru_cache_add_active_or_unevictable+0x35/0xb0
[ 1281.193985]  ? __handle_mm_fault+0x4e9/0x1170
[ 1281.193987]  __sys_sendmsg+0x45/0x80
[ 1281.193989]  SyS_sendmsg+0x12/0x20
[ 1281.193991]  do_syscall_64+0x6e/0x180
[ 1281.193993]  entry_SYSCALL64_slow_path+0x25/0x25
[ 1281.193995] RIP: 0033:0x7f6ec122f5a0
[ 1281.193995] RSP: 002b:7ffe69e89c48 EFLAGS: 0246 ORIG_RAX:
002e
[ 1281.193997] RAX: ffda RBX: 7ffe69e8dd60 RCX: 7f6ec122f5a0
[ 1281.193997] RDX:  RSI: 7ffe69e89c90 RDI: 0003
[ 1281.193998] RBP: 7ffe69e89c90 R08:  R09: 0003
[ 1281.193999] R10: 7ffe69e89a10 R11: 0246 R12: 58f14b9f
[ 

[PATCH next] bonding: handle link transition from FAIL to UP correctly

2017-04-11 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

When link transitions from LINK_FAIL to LINK_UP, the commit phase is
not called. This leads to an erroneous state causing slave-link state to
get stuck in "going down" state while its speed and duplex are perfectly
fine. This issue is a side-effect of splitting link-set into propose and
commit phases introduced by de77ecd4ef02 ("bonding: improve link-status
update in mii-monitoring")

This patch fixes these issues by calling commit phase whenever link
state change is proposed.

Fixes: de77ecd4ef02 ("bonding: improve link-status update in mii-monitoring")
Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 drivers/net/bonding/bond_main.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index aba7352906a5..01e4a69af421 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -2064,6 +2064,7 @@ static int bond_miimon_inspect(struct bonding *bond)
(bond->params.downdelay - 
slave->delay) *
bond->params.miimon,
slave->dev->name);
+   commit++;
continue;
}
 
@@ -2098,7 +2099,7 @@ static int bond_miimon_inspect(struct bonding *bond)
(bond->params.updelay - 
slave->delay) *
bond->params.miimon,
slave->dev->name);
-
+   commit++;
continue;
}
 
-- 
2.12.2.715.g7642488e1d-goog



[PATCH next] bonding: fix active-backup transition

2017-04-03 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

Earlier patch c4adfc822bf5 ("bonding: make speed, duplex setting
consistent with link state") made an attempt to keep slave state
consistent with speed and duplex settings. Unfortunately link-state
transition is used to change the active link especially when used
in conjunction with mii-mon. The above mentioned patch broke that
logic. Also, when speed and duplex settings for a link are updated
during a link-event, the link-status should not be changed, so that
the correct transition logic is invoked.

This patch fixes this issue by moving the link-state update outside
of the bond_update_speed_duplex() fn and to the places where this fn
is called and update link-state selectively.

Fixes: c4adfc822bf5 ("bonding: make speed, duplex setting consistent
with link state")
Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 drivers/net/bonding/bond_main.c | 13 +
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 27359dab78a1..535388b15cde 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -378,20 +378,15 @@ static int bond_update_speed_duplex(struct slave *slave)
slave->duplex = DUPLEX_UNKNOWN;
 
res = __ethtool_get_link_ksettings(slave_dev, &ecmd);
-   if (res < 0) {
-   slave->link = BOND_LINK_DOWN;
+   if (res < 0)
return 1;
-   }
-   if (ecmd.base.speed == 0 || ecmd.base.speed == ((__u32)-1)) {
-   slave->link = BOND_LINK_DOWN;
+   if (ecmd.base.speed == 0 || ecmd.base.speed == ((__u32)-1))
return 1;
-   }
switch (ecmd.base.duplex) {
case DUPLEX_FULL:
case DUPLEX_HALF:
break;
default:
-   slave->link = BOND_LINK_DOWN;
return 1;
}
 
@@ -1563,7 +1558,8 @@ int bond_enslave(struct net_device *bond_dev, struct 
net_device *slave_dev)
new_slave->delay = 0;
new_slave->link_failure_count = 0;
 
-   bond_update_speed_duplex(new_slave);
+   if (bond_update_speed_duplex(new_slave))
+   new_slave->link = BOND_LINK_DOWN;
 
new_slave->last_rx = jiffies -
(msecs_to_jiffies(bond->params.arp_interval) + 1);
@@ -2126,6 +2122,7 @@ static void bond_miimon_commit(struct bonding *bond)
 
case BOND_LINK_UP:
if (bond_update_speed_duplex(slave)) {
+   slave->link = BOND_LINK_DOWN;
netdev_warn(bond->dev,
"failed to get link speed/duplex 
for %s\n",
slave->dev->name);
-- 
2.12.2.715.g7642488e1d-goog



[PATCH next 4/5] bonding: correctly update link status during mii-commit phase

2017-03-27 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

bond_miimon_commit() marks the link UP after attempting to get the speed
and duplex settings for the link. There is a possibility that
bond_update_speed_duplex() could fail. This is another place where it
could result in an inconsistent bonding link state.

With this patch the link will be marked UP only if the speed and duplex
values retrieved have sane values and processed further.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 drivers/net/bonding/bond_main.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index ad317bb63193..6cea964ab70a 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -2125,7 +2125,12 @@ static void bond_miimon_commit(struct bonding *bond)
continue;
 
case BOND_LINK_UP:
-   bond_update_speed_duplex(slave);
+   if (bond_update_speed_duplex(slave)) {
+   netdev_warn(bond->dev,
+   "failed to get link speed/duplex 
for %s\n",
+   slave->dev->name);
+   continue;
+   }
bond_set_slave_link_state(slave, BOND_LINK_UP,
  BOND_SLAVE_NOTIFY_NOW);
slave->last_link_up = jiffies;
-- 
2.12.1.578.ge9c3154ca4-goog



[PATCH next 5/5] bonding: avoid printing while holding a spinlock

2017-03-27 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 drivers/net/bonding/bond_3ad.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
index 508713b4e533..c5fd4259da33 100644
--- a/drivers/net/bonding/bond_3ad.c
+++ b/drivers/net/bonding/bond_3ad.c
@@ -2446,9 +2446,9 @@ void bond_3ad_adapter_speed_duplex_changed(struct slave 
*slave)
 
spin_lock_bh(>bond->mode_lock);
ad_update_actor_keys(port, false);
+   spin_unlock_bh(>bond->mode_lock);
netdev_dbg(slave->bond->dev, "Port %d slave %s changed speed/duplex\n",
   port->actor_port_number, slave->dev->name);
-   spin_unlock_bh(>bond->mode_lock);
 }
 
 /**
@@ -2492,12 +2492,12 @@ void bond_3ad_handle_link_change(struct slave *slave, 
char link)
agg = __get_first_agg(port);
ad_agg_selection_logic(agg, );
 
+   spin_unlock_bh(>bond->mode_lock);
+
netdev_dbg(slave->bond->dev, "Port %d changed link status to %s\n",
   port->actor_port_number,
   link == BOND_LINK_UP ? "UP" : "DOWN");
 
-   spin_unlock_bh(>bond->mode_lock);
-
/* RTNL is held and mode_lock is released so it's safe
 * to update slave_array here.
 */
-- 
2.12.1.578.ge9c3154ca4-goog



[PATCH next 2/5] bonding: improve link-status update in mii-monitoring

2017-03-27 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

The primary issue is that mii-inspect phase updates link-state and
expects changes to be committed during the mii-commit phase. After
the inspect phase if it fails to acquire rtnl-mutex, the commit
phase (bond_mii_commit) doesn't get to run. This partially updated
state stays and makes the internal-state inconsistent.

e.g. setup bond0 => slaves: eth1, eth2
eth1 goes DOWN -> UP
   mii_monitor()
mii-inspect()
bond_set_slave_link_state(eth1, UP, DontNotify)
rtnl_trylock() <- fails!

Next mii-monitor round
eth1: No change
   mii_monitor()
mii-inspect()
eth1->link == current-status (ethtool_ops->get_link)
no-change-detected

End result:
eth1:
  Link = BOND_LINK_UP
  Speed = 0xf  [SpeedUnknown]
  Duplex = 0xff[DuplexUnknown]

This doesn't always happen but for some unlucky machines in a large set
of machines it creates problems.

The fix for this is to avoid making changes during inspect phase and
postpone them until acquiring the rtnl-mutex / invoking commit phase.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 drivers/net/bonding/bond_main.c | 18 +-
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index ba934020dfaa..85999e479916 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -2033,8 +2033,7 @@ static int bond_miimon_inspect(struct bonding *bond)
if (link_state)
continue;
 
-   bond_set_slave_link_state(slave, BOND_LINK_FAIL,
- BOND_SLAVE_NOTIFY_LATER);
+   bond_propose_link_state(slave, BOND_LINK_FAIL);
slave->delay = bond->params.downdelay;
if (slave->delay) {
netdev_info(bond->dev, "link status down for 
%sinterface %s, disabling it in %d ms\n",
@@ -2049,8 +2048,7 @@ static int bond_miimon_inspect(struct bonding *bond)
case BOND_LINK_FAIL:
if (link_state) {
/* recovered before downdelay expired */
-   bond_set_slave_link_state(slave, BOND_LINK_UP,
- 
BOND_SLAVE_NOTIFY_LATER);
+   bond_propose_link_state(slave, BOND_LINK_UP);
slave->last_link_up = jiffies;
netdev_info(bond->dev, "link status up again 
after %d ms for interface %s\n",
(bond->params.downdelay - 
slave->delay) *
@@ -2072,8 +2070,7 @@ static int bond_miimon_inspect(struct bonding *bond)
if (!link_state)
continue;
 
-   bond_set_slave_link_state(slave, BOND_LINK_BACK,
- BOND_SLAVE_NOTIFY_LATER);
+   bond_propose_link_state(slave, BOND_LINK_BACK);
slave->delay = bond->params.updelay;
 
if (slave->delay) {
@@ -2086,9 +2083,7 @@ static int bond_miimon_inspect(struct bonding *bond)
/*FALLTHRU*/
case BOND_LINK_BACK:
if (!link_state) {
-   bond_set_slave_link_state(slave,
- BOND_LINK_DOWN,
- 
BOND_SLAVE_NOTIFY_LATER);
+   bond_propose_link_state(slave, BOND_LINK_DOWN);
netdev_info(bond->dev, "link status down again 
after %d ms for interface %s\n",
(bond->params.updelay - 
slave->delay) *
bond->params.miimon,
@@ -2225,6 +2220,8 @@ static void bond_mii_monitor(struct work_struct *work)
mii_work.work);
bool should_notify_peers = false;
unsigned long delay;
+   struct slave *slave;
+   struct list_head *iter;
 
delay = msecs_to_jiffies(bond->params.miimon);
 
@@ -2245,6 +2242,9 @@ static void bond_mii_monitor(struct work_struct *work)
goto re_arm;
}
 
+   bond_for_each_slave(bond, slave, iter) {
+   bond_commit_link_state(slave, BOND_SLAVE_NOTIFY_LATER);
+   }
bond_miimon_commit(bond);
 
rtnl_unlock();  /* might sleep, hold no other locks */
-- 
2.12.1.578.ge9c3154ca4-goog



[PATCH next 1/5] bonding: split bond_set_slave_link_state into two parts

2017-03-27 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

Split the function into two (a) propose (b) commit phase without
changing the semantics for the original API.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 include/net/bonding.h | 22 +-
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/include/net/bonding.h b/include/net/bonding.h
index 3c857778a6ca..fb2dd97857c4 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -153,7 +153,8 @@ struct slave {
unsigned long last_link_up;
unsigned long last_rx;
unsigned long target_last_arp_rx[BOND_MAX_ARP_TARGETS];
-   s8 link;/* one of BOND_LINK_ */
+   s8 link;/* one of BOND_LINK_ */
+   s8 link_new_state;  /* one of BOND_LINK_ */
s8 new_link;
u8 backup:1,   /* indicates backup slave. Value corresponds with
  BOND_STATE_ACTIVE and BOND_STATE_BACKUP */
@@ -504,13 +505,17 @@ static inline bool bond_is_slave_inactive(struct slave 
*slave)
return slave->inactive;
 }
 
-static inline void bond_set_slave_link_state(struct slave *slave, int state,
-bool notify)
+static inline void bond_propose_link_state(struct slave *slave, int state)
+{
+   slave->link_new_state = state;
+}
+
+static inline void bond_commit_link_state(struct slave *slave, bool notify)
 {
-   if (slave->link == state)
+   if (slave->link == slave->link_new_state)
return;
 
-   slave->link = state;
+   slave->link = slave->link_new_state;
if (notify) {
bond_queue_slave_event(slave);
bond_lower_state_changed(slave);
@@ -523,6 +528,13 @@ static inline void bond_set_slave_link_state(struct slave 
*slave, int state,
}
 }
 
+static inline void bond_set_slave_link_state(struct slave *slave, int state,
+bool notify)
+{
+   bond_propose_link_state(slave, state);
+   bond_commit_link_state(slave, notify);
+}
+
 static inline void bond_slave_link_notify(struct bonding *bond)
 {
struct list_head *iter;
-- 
2.12.1.578.ge9c3154ca4-goog



[PATCH next 0/5] link-status fixes for mii-monitoring

2017-03-27 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

The mii monitoring is divided into two phases - inspect and commit. The
inspect phase technically should not make any changes to the state and
should defer them to the commit phase. However, we detected link-state
inconsistencies on several machines and discovered that they are the
result of inconsistent updates to link states and the assumption that you
*always* get rtnl-mutex. In reality when trylock() fails to acquire
rtnl-mutex, the
commit phase is postponed until next mii-mon run. At the next round
because of the state change performed in the previous inspect-run, this
round does not detect any changes and would skip calling commit phase.
This would result in an inconsistent state until next link event happens
(if it ever happens).

During the commit phase, it's assumed that the speed and duplex fetch
is always successful, but that's not always the case. However the
slave state is marked UP irrespective of speed / duplex fetch operation.
If the speed / duplex fetch operation results in insane values for either
of these two fields, then keeping internal link state UP is not going to
provide fruitful results either.

Please see into individual patches for more details.

Mahesh Bandewar (5):
  bonding: split bond_set_slave_link_state into two parts
  bonding: improve link-status update in mii-monitoring
  bonding: make speed, duplex setting consistent with link state
  bonding: correctly update link status during mii-commit phase
  bonding: avoid printing while holding a spinlock

 drivers/net/bonding/bond_3ad.c  |  6 ++---
 drivers/net/bonding/bond_main.c | 49 -
 include/net/bonding.h   | 22 +-
 3 files changed, 49 insertions(+), 28 deletions(-)

-- 
2.12.1.578.ge9c3154ca4-goog



[PATCH next 3/5] bonding: make speed, duplex setting consistent with link state

2017-03-27 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

bond_update_speed_duplex() retrieves speed and duplex settings. There
is a possibility of failure in retrieving these values but caller has
to assume it's always successful. This leads to having inconsistent
slave link settings. If these (speed, duplex) values cannot be
retrieved, then keeping the link UP causes problems.

The updated bond_update_speed_duplex() returns 0 on success if it
retrieves sane values for speed and duplex. On failure it returns 1
and marks the link down.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 drivers/net/bonding/bond_main.c | 24 ++--
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 85999e479916..ad317bb63193 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -365,9 +365,10 @@ int bond_set_carrier(struct bonding *bond)
 /* Get link speed and duplex from the slave's base driver
  * using ethtool. If for some reason the call fails or the
  * values are invalid, set speed and duplex to -1,
- * and return.
+ * and return. Return 1 if speed or duplex settings are
+ * UNKNOWN; 0 otherwise.
  */
-static void bond_update_speed_duplex(struct slave *slave)
+static int bond_update_speed_duplex(struct slave *slave)
 {
struct net_device *slave_dev = slave->dev;
struct ethtool_link_ksettings ecmd;
@@ -377,24 +378,27 @@ static void bond_update_speed_duplex(struct slave *slave)
slave->duplex = DUPLEX_UNKNOWN;
 
res = __ethtool_get_link_ksettings(slave_dev, );
-   if (res < 0)
-   return;
-
-   if (ecmd.base.speed == 0 || ecmd.base.speed == ((__u32)-1))
-   return;
-
+   if (res < 0) {
+   slave->link = BOND_LINK_DOWN;
+   return 1;
+   }
+   if (ecmd.base.speed == 0 || ecmd.base.speed == ((__u32)-1)) {
+   slave->link = BOND_LINK_DOWN;
+   return 1;
+   }
switch (ecmd.base.duplex) {
case DUPLEX_FULL:
case DUPLEX_HALF:
break;
default:
-   return;
+   slave->link = BOND_LINK_DOWN;
+   return 1;
}
 
slave->speed = ecmd.base.speed;
slave->duplex = ecmd.base.duplex;
 
-   return;
+   return 0;
 }
 
 const char *bond_slave_link_status(s8 link)
-- 
2.12.1.578.ge9c3154ca4-goog



[PATCH next 0/5] bonding: winter cleanup

2017-03-08 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

Few cleanup patches that I have accumulated over some time now.

(a) First two patches are basically to move the work-queue initialization
from every ndo_open / bond_open operation to once at the beginning, during
port creation. Work-queue initialization is an unnecessary operation
for every 'ifup' operation. However we have some mode-specific work-queues
and mode can change anytime after port creation. So the second patch is
to ensure the correct work-handler is called based on the mode.

(b) Third patch is simple and straightforward that removes hard-coded value
that was added into the initial commit and replaces it with the default
value configured.

(c) The fourth patch in the series removes the unimplemented "port-moved" state
from the LACP state machine. This state is defined but never set, so
removing it from the state machine logic makes the code a little cleaner.

(d) The final patch reduces the scope of some global variables to local.

Note: None of these patches are making any functional changes.

Mahesh Bandewar (5):
  bonding: restructure arp-monitor
  bonding: initialize work-queues during creation of bond
  bonding: remove hardcoded value
  bonding: remove "port-moved" state that was never implemented
  bonding: reduce scope of some global variables

 drivers/net/bonding/bond_3ad.c  | 11 +++--
 drivers/net/bonding/bond_main.c | 53 ++---
 2 files changed, 37 insertions(+), 27 deletions(-)

-- 
2.12.0.246.ga2ecc84866-goog



[PATCH next 2/5] bonding: initialize work-queues during creation of bond

2017-03-08 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

Initializing work-queues every time an ifup operation is performed is
unnecessary; it can be done just once, when the port is created.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 drivers/net/bonding/bond_main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 619f0c65f18a..1329110ed85f 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3270,8 +3270,6 @@ static int bond_open(struct net_device *bond_dev)
}
}
 
-   bond_work_init_all(bond);
-
if (bond_is_lb(bond)) {
/* bond_alb_initialize must be called before the timer
 * is started.
@@ -4691,6 +4689,8 @@ int bond_create(struct net *net, const char *name)
 
netif_carrier_off(bond_dev);
 
+   bond_work_init_all(bond);
+
rtnl_unlock();
if (res < 0)
bond_destructor(bond_dev);
-- 
2.12.0.246.ga2ecc84866-goog



[PATCH next 1/5] bonding: restructure arp-monitor

2017-03-08 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

This is in preparation for moving the work-queue initialization from the
current port_open phase to port creation. Work-queue initialization does
not make sense every time we do 'ifup/ifdown', so move it to the port
creation phase.

Arp monitoring work depends on the bonding mode and that is not tied
to the port creation and can change anytime during the life after port
creation. So this restructuring allows us to move the initialization
at creation without losing the ability to arm the correct work for the
mode user has selected.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 drivers/net/bonding/bond_main.c | 24 ++--
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 8a4ba8b88e52..619f0c65f18a 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -2575,10 +2575,8 @@ static bool bond_time_in_interval(struct bonding *bond, 
unsigned long last_act,
  * arp is transmitted to generate traffic. see activebackup_arp_monitor for
  * arp monitoring in active backup mode.
  */
-static void bond_loadbalance_arp_mon(struct work_struct *work)
+static void bond_loadbalance_arp_mon(struct bonding *bond)
 {
-   struct bonding *bond = container_of(work, struct bonding,
-   arp_work.work);
struct slave *slave, *oldcurrent;
struct list_head *iter;
int do_failover = 0, slave_state_changed = 0;
@@ -2916,10 +2914,8 @@ static bool bond_ab_arp_probe(struct bonding *bond)
return should_notify_rtnl;
 }
 
-static void bond_activebackup_arp_mon(struct work_struct *work)
+static void bond_activebackup_arp_mon(struct bonding *bond)
 {
-   struct bonding *bond = container_of(work, struct bonding,
-   arp_work.work);
bool should_notify_peers = false;
bool should_notify_rtnl = false;
int delta_in_ticks;
@@ -2972,6 +2968,17 @@ static void bond_activebackup_arp_mon(struct work_struct 
*work)
}
 }
 
+static void bond_arp_monitor(struct work_struct *work)
+{
+   struct bonding *bond = container_of(work, struct bonding,
+   arp_work.work);
+
+   if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP)
+   bond_activebackup_arp_mon(bond);
+   else
+   bond_loadbalance_arp_mon(bond);
+}
+
 /*-- netdev event handling --*/
 
 /* Change device name */
@@ -3228,10 +3235,7 @@ static void bond_work_init_all(struct bonding *bond)
  bond_resend_igmp_join_requests_delayed);
INIT_DELAYED_WORK(>alb_work, bond_alb_monitor);
INIT_DELAYED_WORK(>mii_work, bond_mii_monitor);
-   if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP)
-   INIT_DELAYED_WORK(>arp_work, bond_activebackup_arp_mon);
-   else
-   INIT_DELAYED_WORK(>arp_work, bond_loadbalance_arp_mon);
+   INIT_DELAYED_WORK(>arp_work, bond_arp_monitor);
INIT_DELAYED_WORK(>ad_work, bond_3ad_state_machine_handler);
INIT_DELAYED_WORK(>slave_arr_work, bond_slave_arr_handler);
 }
-- 
2.12.0.246.ga2ecc84866-goog



[PATCH next 4/5] bonding: remove "port-moved" state that was never implemented

2017-03-08 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

LACP state-machine defines "port-moved" state when the same ActorSystemID
and Port are seen in a LACPDU received on different port. The state is
never set since it's not implemented. However the state-machine attempts
to clear that state occasionally. LACP state machine is already complicated
and since this state is not implemented, removing its checks makes the
state-machine a little simpler.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 drivers/net/bonding/bond_3ad.c | 11 +++
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
index edc70ffad660..431926bba9f4 100644
--- a/drivers/net/bonding/bond_3ad.c
+++ b/drivers/net/bonding/bond_3ad.c
@@ -1052,8 +1052,7 @@ static void ad_rx_machine(struct lacpdu *lacpdu, struct 
port *port)
port->sm_rx_state = AD_RX_INITIALIZE;
port->sm_vars |= AD_PORT_CHURNED;
/* check if port is not enabled */
-   } else if (!(port->sm_vars & AD_PORT_BEGIN)
-&& !port->is_enabled && !(port->sm_vars & AD_PORT_MOVED))
+   } else if (!(port->sm_vars & AD_PORT_BEGIN) && !port->is_enabled)
port->sm_rx_state = AD_RX_PORT_DISABLED;
/* check if new lacpdu arrived */
else if (lacpdu && ((port->sm_rx_state == AD_RX_EXPIRED) ||
@@ -1081,11 +1080,8 @@ static void ad_rx_machine(struct lacpdu *lacpdu, struct 
port *port)
/* if no lacpdu arrived and no timer is on */
switch (port->sm_rx_state) {
case AD_RX_PORT_DISABLED:
-   if (port->sm_vars & AD_PORT_MOVED)
-   port->sm_rx_state = AD_RX_INITIALIZE;
-   else if (port->is_enabled
-&& (port->sm_vars
-& AD_PORT_LACP_ENABLED))
+   if (port->is_enabled &&
+   (port->sm_vars & AD_PORT_LACP_ENABLED))
port->sm_rx_state = AD_RX_EXPIRED;
else if (port->is_enabled
 && ((port->sm_vars
@@ -1115,7 +,6 @@ static void ad_rx_machine(struct lacpdu *lacpdu, struct 
port *port)
port->sm_vars &= ~AD_PORT_SELECTED;
__record_default(port);
port->actor_oper_port_state &= ~AD_STATE_EXPIRED;
-   port->sm_vars &= ~AD_PORT_MOVED;
port->sm_rx_state = AD_RX_PORT_DISABLED;
 
/* Fall Through */
-- 
2.12.0.246.ga2ecc84866-goog



[PATCH next 5/5] bonding: reduce scope of some global variables

2017-03-08 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

Many of the bond param variables are declared global while it's not
really necessary for these variables to be global. So moving them to
the location these are used.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 drivers/net/bonding/bond_main.c | 11 +--
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 0f9f5ceae80e..ba934020dfaa 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -201,12 +201,6 @@ atomic_t netpoll_block_tx = ATOMIC_INIT(0);
 
 unsigned int bond_net_id __read_mostly;
 
-static __be32 arp_target[BOND_MAX_ARP_TARGETS];
-static int arp_ip_count;
-static int bond_mode   = BOND_MODE_ROUNDROBIN;
-static int xmit_hashtype = BOND_XMIT_POLICY_LAYER2;
-static int lacp_fast;
-
 /*-- Forward declarations ---*/
 
 static int bond_init(struct net_device *bond_dev);
@@ -4254,6 +4248,11 @@ static int bond_check_params(struct bond_params *params)
int arp_all_targets_value;
u16 ad_actor_sys_prio = 0;
u16 ad_user_port_key = 0;
+   __be32 arp_target[BOND_MAX_ARP_TARGETS];
+   int arp_ip_count;
+   int bond_mode   = BOND_MODE_ROUNDROBIN;
+   int xmit_hashtype = BOND_XMIT_POLICY_LAYER2;
+   int lacp_fast = 0;
int tlb_dynamic_lb = 0;
 
/* Convert string parameters. */
-- 
2.12.0.246.ga2ecc84866-goog



[PATCH next 3/5] bonding: remove hardcoded value

2017-03-08 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

Eliminate hard-coded value and use the default that is set.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 drivers/net/bonding/bond_main.c | 14 +-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 1329110ed85f..0f9f5ceae80e 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4254,6 +4254,7 @@ static int bond_check_params(struct bond_params *params)
int arp_all_targets_value;
u16 ad_actor_sys_prio = 0;
u16 ad_user_port_key = 0;
+   int tlb_dynamic_lb = 0;
 
/* Convert string parameters. */
if (mode) {
@@ -4566,6 +4567,17 @@ static int bond_check_params(struct bond_params *params)
}
ad_user_port_key = valptr->value;
 
+   if (bond_mode == BOND_MODE_TLB) {
+   bond_opt_initstr(, "default");
+   valptr = bond_opt_parse(bond_opt_get(BOND_OPT_TLB_DYNAMIC_LB),
+   );
+   if (!valptr) {
+   pr_err("Error: No tlb_dynamic_lb default value");
+   return -EINVAL;
+   }
+   tlb_dynamic_lb = valptr->value;
+   }
+
if (lp_interval == 0) {
pr_warn("Warning: ip_interval must be between 1 and %d, so it 
was reset to %d\n",
INT_MAX, BOND_ALB_DEFAULT_LP_INTERVAL);
@@ -4593,7 +4605,7 @@ static int bond_check_params(struct bond_params *params)
params->min_links = min_links;
params->lp_interval = lp_interval;
params->packets_per_slave = packets_per_slave;
-   params->tlb_dynamic_lb = 1; /* Default value */
+   params->tlb_dynamic_lb = tlb_dynamic_lb;
params->ad_actor_sys_prio = ad_actor_sys_prio;
eth_zero_addr(params->ad_actor_system);
params->ad_user_port_key = ad_user_port_key;
-- 
2.12.0.246.ga2ecc84866-goog



[PATCH next 3/4] bonding: remove hardcoded value

2017-02-21 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

Eliminate hard-coded value and use the default that is set.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 drivers/net/bonding/bond_main.c | 14 +-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index b7a344ffc1de..06ac1783bfdd 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4253,6 +4253,7 @@ static int bond_check_params(struct bond_params *params)
int arp_all_targets_value;
u16 ad_actor_sys_prio = 0;
u16 ad_user_port_key = 0;
+   int tlb_dynamic_lb = 0;
 
/* Convert string parameters. */
if (mode) {
@@ -4565,6 +4566,17 @@ static int bond_check_params(struct bond_params *params)
}
ad_user_port_key = valptr->value;
 
+   if (bond_mode == BOND_MODE_TLB) {
+   bond_opt_initstr(, "default");
+   valptr = bond_opt_parse(bond_opt_get(BOND_OPT_TLB_DYNAMIC_LB),
+   );
+   if (!valptr) {
+   pr_err("Error: No tlb_dynamic_lb default value");
+   return -EINVAL;
+   }
+   tlb_dynamic_lb = valptr->value;
+   }
+
if (lp_interval == 0) {
pr_warn("Warning: ip_interval must be between 1 and %d, so it 
was reset to %d\n",
INT_MAX, BOND_ALB_DEFAULT_LP_INTERVAL);
@@ -4592,7 +4604,7 @@ static int bond_check_params(struct bond_params *params)
params->min_links = min_links;
params->lp_interval = lp_interval;
params->packets_per_slave = packets_per_slave;
-   params->tlb_dynamic_lb = 1; /* Default value */
+   params->tlb_dynamic_lb = tlb_dynamic_lb;
params->ad_actor_sys_prio = ad_actor_sys_prio;
eth_zero_addr(params->ad_actor_system);
params->ad_user_port_key = ad_user_port_key;
-- 
2.11.0.483.g087da7b7c-goog



[PATCH next 1/4] bonding: restructure arp-monitor

2017-02-21 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

This is in preparation for moving the work-queue initialization from the
current port_open phase to port creation. Work-queue initialization does
not make sense every time we do 'ifup/ifdown', so move it to the port
creation phase.

Arp monitoring work depends on the bonding mode and that is not tied
to the port creation and can change anytime during the life after port
creation. So this restructuring allows us to move the initialization
at creation without losing the ability to arm the correct work for the
mode user has selected.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 drivers/net/bonding/bond_main.c | 24 ++--
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 6321f12630c8..4e03807bb0d6 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -2575,10 +2575,8 @@ static bool bond_time_in_interval(struct bonding *bond, 
unsigned long last_act,
  * arp is transmitted to generate traffic. see activebackup_arp_monitor for
  * arp monitoring in active backup mode.
  */
-static void bond_loadbalance_arp_mon(struct work_struct *work)
+static void bond_loadbalance_arp_mon(struct bonding *bond)
 {
-   struct bonding *bond = container_of(work, struct bonding,
-   arp_work.work);
struct slave *slave, *oldcurrent;
struct list_head *iter;
int do_failover = 0, slave_state_changed = 0;
@@ -2916,10 +2914,8 @@ static bool bond_ab_arp_probe(struct bonding *bond)
return should_notify_rtnl;
 }
 
-static void bond_activebackup_arp_mon(struct work_struct *work)
+static void bond_activebackup_arp_mon(struct bonding *bond)
 {
-   struct bonding *bond = container_of(work, struct bonding,
-   arp_work.work);
bool should_notify_peers = false;
bool should_notify_rtnl = false;
int delta_in_ticks;
@@ -2972,6 +2968,17 @@ static void bond_activebackup_arp_mon(struct work_struct 
*work)
}
 }
 
+static void bond_arp_monitor(struct work_struct *work)
+{
+   struct bonding *bond = container_of(work, struct bonding,
+   arp_work.work);
+
+   if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP)
+   bond_activebackup_arp_mon(bond);
+   else
+   bond_loadbalance_arp_mon(bond);
+}
+
 /*-- netdev event handling --*/
 
 /* Change device name */
@@ -3228,10 +3235,7 @@ static void bond_work_init_all(struct bonding *bond)
  bond_resend_igmp_join_requests_delayed);
INIT_DELAYED_WORK(>alb_work, bond_alb_monitor);
INIT_DELAYED_WORK(>mii_work, bond_mii_monitor);
-   if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP)
-   INIT_DELAYED_WORK(>arp_work, bond_activebackup_arp_mon);
-   else
-   INIT_DELAYED_WORK(>arp_work, bond_loadbalance_arp_mon);
+   INIT_DELAYED_WORK(>arp_work, bond_arp_monitor);
INIT_DELAYED_WORK(>ad_work, bond_3ad_state_machine_handler);
INIT_DELAYED_WORK(>slave_arr_work, bond_slave_arr_handler);
 }
-- 
2.11.0.483.g087da7b7c-goog



[PATCH next 4/4] bonding: remove "port-moved" state that was never implemented

2017-02-21 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

LACP state-machine defines "port-moved" state when the same ActorSystemID
and Port are seen in a LACPDU received on different port. The state is
never set since it's not implemented. However the state-machine attempts
to clear that state occasionally. LACP state machine is already complicated
and since this state is not implemented, removing its checks makes the
state-machine a little simpler.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 drivers/net/bonding/bond_3ad.c | 11 +++
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
index edc70ffad660..431926bba9f4 100644
--- a/drivers/net/bonding/bond_3ad.c
+++ b/drivers/net/bonding/bond_3ad.c
@@ -1052,8 +1052,7 @@ static void ad_rx_machine(struct lacpdu *lacpdu, struct 
port *port)
port->sm_rx_state = AD_RX_INITIALIZE;
port->sm_vars |= AD_PORT_CHURNED;
/* check if port is not enabled */
-   } else if (!(port->sm_vars & AD_PORT_BEGIN)
-&& !port->is_enabled && !(port->sm_vars & AD_PORT_MOVED))
+   } else if (!(port->sm_vars & AD_PORT_BEGIN) && !port->is_enabled)
port->sm_rx_state = AD_RX_PORT_DISABLED;
/* check if new lacpdu arrived */
else if (lacpdu && ((port->sm_rx_state == AD_RX_EXPIRED) ||
@@ -1081,11 +1080,8 @@ static void ad_rx_machine(struct lacpdu *lacpdu, struct 
port *port)
/* if no lacpdu arrived and no timer is on */
switch (port->sm_rx_state) {
case AD_RX_PORT_DISABLED:
-   if (port->sm_vars & AD_PORT_MOVED)
-   port->sm_rx_state = AD_RX_INITIALIZE;
-   else if (port->is_enabled
-&& (port->sm_vars
-& AD_PORT_LACP_ENABLED))
+   if (port->is_enabled &&
+   (port->sm_vars & AD_PORT_LACP_ENABLED))
port->sm_rx_state = AD_RX_EXPIRED;
else if (port->is_enabled
 && ((port->sm_vars
@@ -1115,7 +,6 @@ static void ad_rx_machine(struct lacpdu *lacpdu, struct 
port *port)
port->sm_vars &= ~AD_PORT_SELECTED;
__record_default(port);
port->actor_oper_port_state &= ~AD_STATE_EXPIRED;
-   port->sm_vars &= ~AD_PORT_MOVED;
port->sm_rx_state = AD_RX_PORT_DISABLED;
 
/* Fall Through */
-- 
2.11.0.483.g087da7b7c-goog



[PATCH next 2/4] bonding: initialize work-queues during creation of bond

2017-02-21 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

Initializing work-queues every time an ifup operation is performed is
unnecessary; it can be done just once, when the port is created.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 drivers/net/bonding/bond_main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 4e03807bb0d6..b7a344ffc1de 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3270,8 +3270,6 @@ static int bond_open(struct net_device *bond_dev)
}
}
 
-   bond_work_init_all(bond);
-
if (bond_is_lb(bond)) {
/* bond_alb_initialize must be called before the timer
 * is started.
@@ -4690,6 +4688,8 @@ int bond_create(struct net *net, const char *name)
 
netif_carrier_off(bond_dev);
 
+   bond_work_init_all(bond);
+
rtnl_unlock();
if (res < 0)
bond_destructor(bond_dev);
-- 
2.11.0.483.g087da7b7c-goog



[PATCH next 0/4] bonding: winter cleanup

2017-02-21 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

Few cleanup patches that I have accumulated over some time now.

(a) First two patches are basically to move the work-queue initialization
from every ndo_open / bond_open operation to once at the beginning, during
port creation. Work-queue initialization is an unnecessary operation
for every 'ifup' operation. However we have some mode-specific work-queues
and mode can change anytime after port creation. So the second patch is
to ensure the correct work-handler is called based on the mode.

(b) Third patch is simple and straightforward that removes hard-coded value
that was added into the initial commit and replaces it with the default
value configured.

(c) The final patch in the series removes the unimplemented "port-moved" state
from the LACP state machine. This state is defined but never set, so
removing it from the state machine logic makes the code a little cleaner.

Note: None of these patches are making any functional changes.

Mahesh Bandewar (4):
  bonding: restructure arp-monitor
  bonding: initialize work-queues during creation of bond
  bonding: remove hardcoded value
  bonding: remove "port-moved" state that was never implemented

 drivers/net/bonding/bond_3ad.c  | 11 +++
 drivers/net/bonding/bond_main.c | 42 -
 2 files changed, 32 insertions(+), 21 deletions(-)

-- 
2.11.0.483.g087da7b7c-goog



[PATCH next 0/3] use netdev_is_rx_handler_busy() in few known cases

2017-01-18 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

netdev_rx_handler_register() was recently split into two parts - (a) check
if the handler is used, and (b) register the new handler. This is
helpful in scenarios like bonding where at the time of registration there
is too much state to unwind and it should check if the device is free
before building that state. The IPvlan and macvlan drivers don't have this
issue; however, they can make use of the same check instead of using a
device-specific check.

Mahesh Bandewar (3):
  net: remove duplicate code.
  ipvlan: use netdev_is_rx_handler_busy instead of checking specific
type
  macvlan: use netdev_is_rx_handler_busy instead of checking specific
type

 drivers/net/ipvlan/ipvlan_main.c | 4 ++--
 drivers/net/macvlan.c| 2 +-
 net/core/dev.c   | 4 +---
 3 files changed, 4 insertions(+), 6 deletions(-)

-- 
2.11.0.483.g087da7b7c-goog



[PATCH next 3/3] macvlan: use netdev_is_rx_handler_busy instead of checking specific type

2017-01-18 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

netdev_is_rx_handler_busy() check is a superset of netif_is_ipvlan_port()
check and hence should be preferred.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 drivers/net/macvlan.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 440ab3d8adf7..cbfc1be23a0e 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -1110,7 +1110,7 @@ static int macvlan_port_create(struct net_device *dev)
if (dev->type != ARPHRD_ETHER || dev->flags & IFF_LOOPBACK)
return -EINVAL;
 
-   if (netif_is_ipvlan_port(dev))
+   if (netdev_is_rx_handler_busy(dev))
return -EBUSY;
 
port = kzalloc(sizeof(*port), GFP_KERNEL);
-- 
2.11.0.483.g087da7b7c-goog



[PATCH next 1/3] net: remove duplicate code.

2017-01-18 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

netdev_rx_handler_register() checks to see if the handler is already
busy which was recently separated into netdev_is_rx_handler_busy(). So
use the same function inside register() to avoid code duplication.
Essentially this change should be a no-op.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 net/core/dev.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index ad5959e56116..c8f1f67ff16c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3961,9 +3961,7 @@ int netdev_rx_handler_register(struct net_device *dev,
   rx_handler_func_t *rx_handler,
   void *rx_handler_data)
 {
-   ASSERT_RTNL();
-
-   if (dev->rx_handler)
+   if (netdev_is_rx_handler_busy(dev))
return -EBUSY;
 
/* Note: rx_handler_data must be set before rx_handler */
-- 
2.11.0.483.g087da7b7c-goog



[PATCH next 2/3] ipvlan: use netdev_is_rx_handler_busy instead of checking specific type

2017-01-18 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

IPvlan checks if the master device is already used by checking for a
specific device type (here it's a macvlan device). This is technically not
sufficient; it should just check whether the rx_handler is busy or not.
This is a superset check that covers macvlan and any other device that
has already registered an rx-handler.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 drivers/net/ipvlan/ipvlan_main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index b5c390f0f2b3..95b18f4602cf 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -102,8 +102,8 @@ static int ipvlan_port_create(struct net_device *dev)
return -EINVAL;
}
 
-   if (netif_is_macvlan_port(dev)) {
-   netdev_err(dev, "Master is a macvlan port.\n");
+   if (netdev_is_rx_handler_busy(dev)) {
+   netdev_err(dev, "Device is already in use.\n");
return -EBUSY;
}
 
-- 
2.11.0.483.g087da7b7c-goog



[PATCH next] ipvlan: fix dev_id creation corner case.

2017-01-13 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

In the last patch da36e13cf65 ("ipvlan: improvise dev_id generation
logic in IPvlan") I missed some part of Dave's suggestion and because
of that the dev_id creation could fail in a corner case scenario. This
would happen when roughly 64k devices have already been created and
several have been deleted, while the devices that are still around
occupy the last n bits of the bitmap. In this scenario, even if lower
bits are available, the dev_id search is so narrow that it always fails.

Fixes: da36e13cf65 ("ipvlan: improvise dev_id generation logic in IPvlan")
CC: David Miller <da...@davemloft.org>
CC: Eric Dumazet <eduma...@google.com>
Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 drivers/net/ipvlan/ipvlan_main.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index 92b221a03350..b5c390f0f2b3 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -550,6 +550,9 @@ static int ipvlan_link_new(struct net *src_net, struct 
net_device *dev,
err = ida_simple_get(>ida, port->dev_id_start, 0xFFFE,
 GFP_KERNEL);
if (err < 0)
+   err = ida_simple_get(>ida, 0x1, port->dev_id_start,
+GFP_KERNEL);
+   if (err < 0)
goto destroy_ipvlan_port;
dev->dev_id = err;
/* Increment id-base to the next slot for the future assignment */
-- 
2.11.0.483.g087da7b7c-goog



[PATCH next v2] ipvlan: improvise dev_id generation logic in IPvlan

2017-01-09 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

The patch 009146d117b ("ipvlan: assign unique dev-id for each slave
device.") used ida_simple_get() to generate dev_ids assigned to the
slave devices. However (Eric has pointed out that) there is a shortcoming
with that approach as it always uses the first available ID. This
becomes a problem when a slave gets deleted and a new slave gets added.
The ID gets reassigned causing the new slave to get the same link-local
address. This side-effect is undesirable.

This patch adds a per-port variable that keeps track of the IDs
assigned and is used as the start-base for the IDR api. This base will be
wrapped around when it reaches the MAX (0xFFFE) value possibly on a
busy system where slaves are added and deleted routinely.

Fixes: 009146d117b ("ipvlan: assign unique dev-id for each slave device.")
Signed-off-by: Mahesh Bandewar <mahe...@google.com>
CC: Eric Dumazet <eduma...@google.com>
CC: David Miller <da...@davemloft.org>
---
v1 -> v2
  Implemented Dave's comments and reinstated IDR API.

 drivers/net/ipvlan/ipvlan.h  |  1 +
 drivers/net/ipvlan/ipvlan_main.c | 13 -
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h
index 0a9068fdee0f..406ae4ff0ae8 100644
--- a/drivers/net/ipvlan/ipvlan.h
+++ b/drivers/net/ipvlan/ipvlan.h
@@ -94,6 +94,7 @@ struct ipvl_port {
struct hlist_head   hlhead[IPVLAN_HASH_SIZE];
struct list_headipvlans;
u16 mode;
+   u16 dev_id_start;
struct work_struct  wq;
struct sk_buff_head backlog;
int count;
diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index ce7ca6a5aa8a..aca2885cf0b4 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -120,6 +120,7 @@ static int ipvlan_port_create(struct net_device *dev)
skb_queue_head_init(>backlog);
INIT_WORK(>wq, ipvlan_process_multicast);
ida_init(>ida);
+   port->dev_id_start = 1;
 
err = netdev_rx_handler_register(dev, ipvlan_handle_frame, port);
if (err)
@@ -535,15 +536,25 @@ static int ipvlan_link_new(struct net *src_net, struct 
net_device *dev,
ipvlan_adjust_mtu(ipvlan, phy_dev);
INIT_LIST_HEAD(>addrs);
 
+   /* If the port-id base is at the MAX value, then wrap it around and
+* begin from 0x1 again. This may be due to a busy system where lots
+* of slaves are getting created and deleted.
+*/
+   if (port->dev_id_start == 0xFFFE)
+   port->dev_id_start = 0x1;
+
/* Since L2 address is shared among all IPvlan slaves including
 * master, use unique 16 bit dev-ids to diffentiate among them.
 * Assign IDs between 0x1 and 0xFFFE (used by the master) to each
 * slave link [see addrconf_ifid_eui48()].
 */
-   err = ida_simple_get(>ida, 1, 0xFFFE, GFP_KERNEL);
+   err = ida_simple_get(>ida, port->dev_id_start, 0xFFFE,
+GFP_KERNEL);
if (err < 0)
goto destroy_ipvlan_port;
dev->dev_id = err;
+   /* Increment id-base to the next slot for the future assignment */
+   port->dev_id_start = err + 1;
 
/* TODO Probably put random address here to be presented to the
 * world but keep using the physical-dev address for the outgoing
-- 
2.11.0.390.gc69c2f50cf-goog



[PATCH next v1] ipvlan: don't use IDR for generating dev_id

2017-01-06 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

The patch 009146d117b ("ipvlan: assign unique dev-id for each slave
device.") used ida_simple_get() to generate dev_ids assigned to the
slave devices. However (Eric has pointed out that) there is a shortcoming
with that approach as it always uses the first available ID. This
becomes a problem when a slave gets deleted and a new slave gets added.
The ID gets reassigned causing the new slave to get the same link-local
address. This side-effect is undesirable.

This patch replaces IDR logic with a simple per-port variable that keeps
incrementing and wraps around when the MAX (0xFFFE) is reached. The
only downside is that the search is inefficient (n^2): if there are
64k (or close to 64k) slaves in the system, the dev-id search takes time.
However, having that many devices in the system has its own challenges.

Fixes: 009146d117b ("ipvlan: assign unique dev-id for each slave device.")
Signed-off-by: Mahesh Bandewar <mahe...@google.com>
CC: Eric Dumazet <eduma...@google.com>
---
 drivers/net/ipvlan/ipvlan.h  |  2 +-
 drivers/net/ipvlan/ipvlan_main.c | 69 ++--
 2 files changed, 61 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h
index 0a9068fdee0f..535f254324ba 100644
--- a/drivers/net/ipvlan/ipvlan.h
+++ b/drivers/net/ipvlan/ipvlan.h
@@ -94,10 +94,10 @@ struct ipvl_port {
struct hlist_head   hlhead[IPVLAN_HASH_SIZE];
struct list_headipvlans;
u16 mode;
+   u16 dev_id_base;
struct work_struct  wq;
struct sk_buff_head backlog;
int count;
-   struct ida  ida;
 };
 
 struct ipvl_skb_cb {
diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index ce7ca6a5aa8a..691662e02abb 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -119,7 +119,7 @@ static int ipvlan_port_create(struct net_device *dev)
 
skb_queue_head_init(>backlog);
INIT_WORK(>wq, ipvlan_process_multicast);
-   ida_init(>ida);
+   port->dev_id_base = 1;
 
err = netdev_rx_handler_register(dev, ipvlan_handle_frame, port);
if (err)
@@ -151,7 +151,6 @@ static void ipvlan_port_destroy(struct net_device *dev)
dev_put(skb->dev);
kfree_skb(skb);
}
-   ida_destroy(>ida);
kfree(port);
 }
 
@@ -496,6 +495,60 @@ static int ipvlan_nl_fillinfo(struct sk_buff *skb,
return ret;
 }
 
+static u16 ipvlan_get_dev_id(struct ipvl_port *port)
+{
+   struct ipvl_dev *ipvlan;
+   u16 tid, dev_id = 0;
+   bool found, exhausted = false, second_round = false;
+
+   /* Idea here is to get the next available ID while avoiding
+* reuse of already used IDs. Per port variable (dev_id_base)
+* is used to get the next available ID.
+*
+* Possibilities are from 0x1 to 0xfffe so finding next available
+* ID is not be that difficult. However if the system is up for a
+* long time and these slave devices are getting added / deleted
+* routinely and it reaches the MAX, then only the assignment will
+* attempt to recycle previously used IDs starting from 0x1 (Provided
+* these are not currently in use).
+*
+* In case of a failure, dev-id search will return 0. All IDs in-use
+* case is very inefficient (n^2 search) but having close to 64k
+* slaves has it's own system limitations.
+*/
+   while (!exhausted) {
+   tid = port->dev_id_base++;
+
+   if (tid == 0xFFFE) {
+   /* Reached MAX; restart from 1 */
+   port->dev_id_base = 1;
+   if (second_round)
+   exhausted = true;
+   else
+   second_round = true;
+   continue;
+   }
+   /* Should not be same as the masters' (if present) */
+   if (tid == port->dev->dev_id)
+   continue;
+   /* Make sure that it's already not assigned to any of the
+* existing slaves.
+*/
+   found = false;
+   list_for_each_entry(ipvlan, >ipvlans, pnode) {
+   if (ipvlan->dev->dev_id == tid) {
+   found = true;
+   break;
+   }
+   }
+   if (!found) {
+   dev_id = tid;
+   break;
+   }
+   }
+   return dev_id;
+}
+
 static int ipvlan_link_new(struct net *src_net, struct net_device *dev,
   struct nlattr *tb[], stru

[RFC PATCH next] ipv6: do not send RTM_DELADDR for tentative addresses

2017-01-04 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

RTM_NEWADDR notification is sent when IFA_F_TENTATIVE is cleared from
the address. So if the address is added and deleted before DAD probes
completes, the RTM_DELADDR will be sent for which there was no
RTM_NEWADDR causing asymmetry in notification. However if the same
logic is used while sending RTM_DELADDR notification, this asymmetry
can be avoided.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
CC: Hideaki YOSHIFUJI <yoshf...@linux-ipv6.org>
CC: Patrick McHardy <ka...@trash.net>
CC: Hannes Frederic Sowa <han...@stressinduktion.org>
---
 net/ipv6/addrconf.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index c1e124bc8e1e..ac9bd5620f81 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4888,6 +4888,13 @@ static void inet6_ifa_notify(int event, struct 
inet6_ifaddr *ifa)
struct net *net = dev_net(ifa->idev->dev);
int err = -ENOBUFS;
 
+   /* Don't send DELADDR notification for TENTATIVE address,
+* since NEWADDR notification is sent only after removing
+* TENTATIVE flag.
+*/
+   if (ifa->flags & IFA_F_TENTATIVE && event == RTM_DELADDR)
+   return;
+
skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC);
if (!skb)
goto errout;
-- 
2.11.0.390.gc69c2f50cf-goog



[PATCH next v1] ipvlan: assign unique dev-id for each slave device.

2017-01-03 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

IPvlan setup uses one mac-address (of master). The IPv6 link-local
addresses are derived using the mac-address on the link. Lack of
dev-ids makes these link-local addresses same for all slaves including
that of master device. dev-ids are necessary to add differentiation
when L2 address is shared.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 drivers/net/ipvlan/ipvlan.h  |  1 +
 drivers/net/ipvlan/ipvlan_main.c | 17 -
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h
index dbfbb33ac66c..0a9068fdee0f 100644
--- a/drivers/net/ipvlan/ipvlan.h
+++ b/drivers/net/ipvlan/ipvlan.h
@@ -97,6 +97,7 @@ struct ipvl_port {
struct work_struct  wq;
struct sk_buff_head backlog;
int count;
+   struct ida  ida;
 };
 
 struct ipvl_skb_cb {
diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index 975f9ddb9908..ce7ca6a5aa8a 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -119,6 +119,7 @@ static int ipvlan_port_create(struct net_device *dev)
 
skb_queue_head_init(>backlog);
INIT_WORK(>wq, ipvlan_process_multicast);
+   ida_init(>ida);
 
err = netdev_rx_handler_register(dev, ipvlan_handle_frame, port);
if (err)
@@ -150,6 +151,7 @@ static void ipvlan_port_destroy(struct net_device *dev)
dev_put(skb->dev);
kfree_skb(skb);
}
+   ida_destroy(>ida);
kfree(port);
 }
 
@@ -533,6 +535,16 @@ static int ipvlan_link_new(struct net *src_net, struct 
net_device *dev,
ipvlan_adjust_mtu(ipvlan, phy_dev);
INIT_LIST_HEAD(>addrs);
 
+   /* Since L2 address is shared among all IPvlan slaves including
+* master, use unique 16 bit dev-ids to diffentiate among them.
+* Assign IDs between 0x1 and 0xFFFE (used by the master) to each
+* slave link [see addrconf_ifid_eui48()].
+*/
+   err = ida_simple_get(>ida, 1, 0xFFFE, GFP_KERNEL);
+   if (err < 0)
+   goto destroy_ipvlan_port;
+   dev->dev_id = err;
+
/* TODO Probably put random address here to be presented to the
 * world but keep using the physical-dev address for the outgoing
 * packets.
@@ -543,7 +555,7 @@ static int ipvlan_link_new(struct net *src_net, struct 
net_device *dev,
 
err = register_netdevice(dev);
if (err < 0)
-   goto destroy_ipvlan_port;
+   goto remove_ida;
 
err = netdev_upper_dev_link(phy_dev, dev);
if (err) {
@@ -562,6 +574,8 @@ static int ipvlan_link_new(struct net *src_net, struct 
net_device *dev,
netdev_upper_dev_unlink(phy_dev, dev);
 unregister_netdev:
unregister_netdevice(dev);
+remove_ida:
+   ida_simple_remove(>ida, dev->dev_id);
 destroy_ipvlan_port:
if (create)
ipvlan_port_destroy(phy_dev);
@@ -579,6 +593,7 @@ static void ipvlan_link_delete(struct net_device *dev, 
struct list_head *head)
kfree_rcu(addr, rcu);
}
 
+   ida_simple_remove(>port->ida, dev->dev_id);
list_del_rcu(>pnode);
unregister_netdevice_queue(dev, head);
netdev_upper_dev_unlink(ipvlan->phy_dev, dev);
-- 
2.11.0.390.gc69c2f50cf-goog



[PATCH net] ipvlan: fix multicast processing

2016-12-21 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

In an IPvlan setup, a crash is seen when the master is set in loopback mode, e.g.

  ethtool -K eth0 set loopback on

  where eth0 is master device for IPvlan setup.

The failure is caused by the faulty logic that determines if the
packet is from TX-path vs. RX-path by just looking at the mac-
addresses on the packet while processing multicast packets.

In the loopback-mode where this crash was happening, the packets
that are sent out are reflected by the NIC and are processed on
the RX path, but mac-address check tricks into thinking this
packet is from TX path and falsely uses dev_forward_skb() to pass
packets to the slave (virtual) devices.

This patch records the path while queueing packets and eliminates
logic of looking at mac-addresses for the same decision.

[ cut here ]
kernel BUG at include/linux/skbuff.h:1737!
Call Trace:
 [] dev_forward_skb+0x92/0xd0
 [] ipvlan_process_multicast+0x395/0x4c0 [ipvlan]
 [] ? ipvlan_process_multicast+0xd7/0x4c0 [ipvlan]
 [] ? process_one_work+0x147/0x660
 [] process_one_work+0x1a9/0x660
 [] ? process_one_work+0x147/0x660
 [] worker_thread+0x11d/0x360
 [] ? rescuer_thread+0x350/0x350
 [] kthread+0xdb/0xe0
 [] ? _raw_spin_unlock_irq+0x30/0x50
 [] ? flush_kthread_worker+0xc0/0xc0
 [] ret_from_fork+0x9a/0xd0
 [] ? flush_kthread_worker+0xc0/0xc0

Fixes: ba35f8588f47 ("ipvlan: Defer multicast / broadcast processing to a 
work-queue")
Signed-off-by: Mahesh Bandewar <mahe...@google.com>
CC: Eric Dumazet <eduma...@google.com>
---
Note that this is on top of Eric's patch sent earlier.

 drivers/net/ipvlan/ipvlan.h  |  5 +
 drivers/net/ipvlan/ipvlan_core.c | 26 +++---
 2 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h
index 031093e1c25f..dbfbb33ac66c 100644
--- a/drivers/net/ipvlan/ipvlan.h
+++ b/drivers/net/ipvlan/ipvlan.h
@@ -99,6 +99,11 @@ struct ipvl_port {
int count;
 };
 
+struct ipvl_skb_cb {
+   bool tx_pkt;
+};
+#define IPVL_SKB_CB(_skb) ((struct ipvl_skb_cb *)&((_skb)->cb[0]))
+
 static inline struct ipvl_port *ipvlan_port_get_rcu(const struct net_device *d)
 {
return rcu_dereference(d->rx_handler_data);
diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c
index ea6bc1e12cdf..83ce74acf82d 100644
--- a/drivers/net/ipvlan/ipvlan_core.c
+++ b/drivers/net/ipvlan/ipvlan_core.c
@@ -198,7 +198,7 @@ void ipvlan_process_multicast(struct work_struct *work)
unsigned int mac_hash;
int ret;
u8 pkt_type;
-   bool hlocal, dlocal;
+   bool tx_pkt;
 
__skb_queue_head_init();
 
@@ -211,7 +211,7 @@ void ipvlan_process_multicast(struct work_struct *work)
bool consumed = false;
 
ethh = eth_hdr(skb);
-   hlocal = ether_addr_equal(ethh->h_source, port->dev->dev_addr);
+   tx_pkt = IPVL_SKB_CB(skb)->tx_pkt;
mac_hash = ipvlan_mac_hash(ethh->h_dest);
 
if (ether_addr_equal(ethh->h_dest, port->dev->broadcast))
@@ -219,13 +219,10 @@ void ipvlan_process_multicast(struct work_struct *work)
else
pkt_type = PACKET_MULTICAST;
 
-   dlocal = false;
rcu_read_lock();
list_for_each_entry_rcu(ipvlan, >ipvlans, pnode) {
-   if (hlocal && (ipvlan->dev == dev)) {
-   dlocal = true;
+   if (tx_pkt && (ipvlan->dev == skb->dev))
continue;
-   }
if (!test_bit(mac_hash, ipvlan->mac_filters))
continue;
if (!(ipvlan->dev->flags & IFF_UP))
@@ -238,7 +235,7 @@ void ipvlan_process_multicast(struct work_struct *work)
consumed = true;
nskb->pkt_type = pkt_type;
nskb->dev = ipvlan->dev;
-   if (hlocal)
+   if (tx_pkt)
ret = dev_forward_skb(ipvlan->dev, 
nskb);
else
ret = netif_rx(nskb);
@@ -248,7 +245,7 @@ void ipvlan_process_multicast(struct work_struct *work)
}
rcu_read_unlock();
 
-   if (dlocal) {
+   if (tx_pkt) {
/* If the packet originated here, send it out. */
skb->dev = port->dev;
skb->pkt_type = pkt_type;
@@ -480,13 +477,20 @@ static int ipvlan_process_outbound(struct sk_buff *skb)
 }
 
 static void ipvlan_multicast_enqueue(struct ipvl_port *port,
-

[PATCH net v2] ipvlan: fix crash when master is set in loopback mode

2016-12-18 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

In an IPvlan setup, a crash is seen when the master is set in loopback mode, e.g.

  ethtool -K eth0 set loopback on

  where eth0 is master device for IPvlan setup.

The failure actually happens while processing multicast packets
but that's a result of unconditionally queueing packets without
ensuring ether-header is part of the linear part of skb.

This patch forces this check at the reception and drops packets
which fail this check before queuing them.

[ cut here ]
kernel BUG at include/linux/skbuff.h:1737!
Call Trace:
 [] dev_forward_skb+0x92/0xd0
 [] ipvlan_process_multicast+0x395/0x4c0 [ipvlan]
 [] ? ipvlan_process_multicast+0xd7/0x4c0 [ipvlan]
 [] ? process_one_work+0x147/0x660
 [] process_one_work+0x1a9/0x660
 [] ? process_one_work+0x147/0x660
 [] worker_thread+0x11d/0x360
 [] ? rescuer_thread+0x350/0x350
 [] kthread+0xdb/0xe0
 [] ? _raw_spin_unlock_irq+0x30/0x50
 [] ? flush_kthread_worker+0xc0/0xc0
 [] ret_from_fork+0x9a/0xd0
 [] ? flush_kthread_worker+0xc0/0xc0

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
v1->v2: commit log update

 drivers/net/ipvlan/ipvlan_core.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c
index b4e990743e1d..4294fc1f5564 100644
--- a/drivers/net/ipvlan/ipvlan_core.c
+++ b/drivers/net/ipvlan/ipvlan_core.c
@@ -660,6 +660,9 @@ rx_handler_result_t ipvlan_handle_frame(struct sk_buff 
**pskb)
if (!port)
return RX_HANDLER_PASS;
 
+   if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr
+   goto out;
+
switch (port->mode) {
case IPVLAN_MODE_L2:
return ipvlan_handle_mode_l2(pskb, port);
@@ -672,6 +675,8 @@ rx_handler_result_t ipvlan_handle_frame(struct sk_buff 
**pskb)
/* Should not reach here */
WARN_ONCE(true, "ipvlan_handle_frame() called for mode = [%hx]\n",
  port->mode);
+
+out:
kfree_skb(skb);
return RX_HANDLER_CONSUMED;
 }
-- 
2.8.0.rc3.226.g39d4020



[PATCH net] ipvlan: fix crash

2016-12-17 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

[ cut here ]
kernel BUG at include/linux/skbuff.h:1737!
Call Trace:
 [] dev_forward_skb+0x92/0xd0
 [] ipvlan_process_multicast+0x395/0x4c0 [ipvlan]
 [] ? ipvlan_process_multicast+0xd7/0x4c0 [ipvlan]
 [] ? process_one_work+0x147/0x660
 [] process_one_work+0x1a9/0x660
 [] ? process_one_work+0x147/0x660
 [] worker_thread+0x11d/0x360
 [] ? rescuer_thread+0x350/0x350
 [] kthread+0xdb/0xe0
 [] ? _raw_spin_unlock_irq+0x30/0x50
 [] ? flush_kthread_worker+0xc0/0xc0
 [] ret_from_fork+0x9a/0xd0
 [] ? flush_kthread_worker+0xc0/0xc0

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 drivers/net/ipvlan/ipvlan_core.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c
index b4e990743e1d..4294fc1f5564 100644
--- a/drivers/net/ipvlan/ipvlan_core.c
+++ b/drivers/net/ipvlan/ipvlan_core.c
@@ -660,6 +660,9 @@ rx_handler_result_t ipvlan_handle_frame(struct sk_buff 
**pskb)
if (!port)
return RX_HANDLER_PASS;
 
+   if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr
+   goto out;
+
switch (port->mode) {
case IPVLAN_MODE_L2:
return ipvlan_handle_mode_l2(pskb, port);
@@ -672,6 +675,8 @@ rx_handler_result_t ipvlan_handle_frame(struct sk_buff 
**pskb)
/* Should not reach here */
WARN_ONCE(true, "ipvlan_handle_frame() called for mode = [%hx]\n",
  port->mode);
+
+out:
kfree_skb(skb);
return RX_HANDLER_CONSUMED;
 }
-- 
2.8.0.rc3.226.g39d4020



[PATCH next] arp: avoid sending ucast probes to 00:00:00:00:00:00

2016-12-01 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

If the initial broadcast probe(s) are lost, the neigh entry won't have a
valid address of the neighbour. In a situation like this, the fall
back should be to send a broadcast probe, however the code logic
continues sending ucast probes to 00:00:00:00:00:00. The default value
of ucast probes is 3 so system usually recovers after three such probes
but if the configured value is larger, it takes that many probes
(one probe is sent every second in the default config), i.e. that many
seconds, to recover, leaving the machine unavailable on the network meanwhile.

This patch just ensures that the unicast address is not all-zeros; otherwise
it falls back to sending a broadcast probe.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 net/ipv4/arp.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 89a8cac4726a..56fb33d5ed31 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -330,6 +330,7 @@ static void arp_solicit(struct neighbour *neigh, struct 
sk_buff *skb)
 {
__be32 saddr = 0;
u8 dst_ha[MAX_ADDR_LEN], *dst_hw = NULL;
+   u8 null_dev_hw_addr[MAX_ADDR_LEN];
struct net_device *dev = neigh->dev;
__be32 target = *(__be32 *)neigh->primary_key;
int probes = atomic_read(>probes);
@@ -371,10 +372,12 @@ static void arp_solicit(struct neighbour *neigh, struct 
sk_buff *skb)
 
probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES);
if (probes < 0) {
+   memset(_dev_hw_addr, 0, dev->addr_len);
if (!(neigh->nud_state & NUD_VALID))
pr_debug("trying to ucast probe in NUD_INVALID\n");
neigh_ha_snapshot(dst_ha, neigh, dev);
-   dst_hw = dst_ha;
+   if (memcmp(_ha, _dev_hw_addr, dev->addr_len) != 0)
+   dst_hw = dst_ha;
} else {
probes -= NEIGH_VAR(neigh->parms, APP_PROBES);
if (probes < 0) {
-- 
2.8.0.rc3.226.g39d4020



[PATCH] ip: (ipvlan) introduce L3s mode

2016-09-19 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

The new mode 'l3s' can be set like -

  ip link add link <master> dev <slave> type ipvlan mode l3s

  e.g. ip link add link eth0 dev ipvl0 type ipvlan mode l3s

Also did some trivial code restructuring.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 include/linux/if_link.h |  1 +
 ip/iplink_ipvlan.c  | 32 +---
 2 files changed, 14 insertions(+), 19 deletions(-)

diff --git a/include/linux/if_link.h b/include/linux/if_link.h
index 1feb708902ac..78ef2c6ae04e 100644
--- a/include/linux/if_link.h
+++ b/include/linux/if_link.h
@@ -461,6 +461,7 @@ enum {
 enum ipvlan_mode {
IPVLAN_MODE_L2 = 0,
IPVLAN_MODE_L3,
+   IPVLAN_MODE_L3S,
IPVLAN_MODE_MAX
 };
 
diff --git a/ip/iplink_ipvlan.c b/ip/iplink_ipvlan.c
index a6273be88a2a..f7735f3a13ef 100644
--- a/ip/iplink_ipvlan.c
+++ b/ip/iplink_ipvlan.c
@@ -20,18 +20,7 @@
 
 static void ipvlan_explain(FILE *f)
 {
-   fprintf(f, "Usage: ... ipvlan [ mode { l2 | l3 } ]\n");
-}
-
-static void explain(void)
-{
-   ipvlan_explain(stderr);
-}
-
-static int mode_arg(void)
-{
-   fprintf(stderr, "Error: argument of \"mode\" must be either \"l2\", or 
\"l3\"\n");
-   return -1;
+   fprintf(f, "Usage: ... ipvlan [ mode { l2 | l3  | l3s } ]\n");
 }
 
 static int ipvlan_parse_opt(struct link_util *lu, int argc, char **argv,
@@ -47,20 +36,24 @@ static int ipvlan_parse_opt(struct link_util *lu, int argc, 
char **argv,
mode = IPVLAN_MODE_L2;
else if (strcmp(*argv, "l3") == 0)
mode = IPVLAN_MODE_L3;
-   else
-   return mode_arg();
-
+   else if (strcmp(*argv, "l3s") == 0)
+   mode = IPVLAN_MODE_L3S;
+   else {
+   fprintf(stderr, "Error: argument of \"mode\" 
must be either \"l2\", \"l3\" or \"l3s\"\n");
+   return -1;
+   }
addattr16(n, 1024, IFLA_IPVLAN_MODE, mode);
} else if (matches(*argv, "help") == 0) {
-   explain();
+   ipvlan_explain(stderr);
return -1;
} else {
fprintf(stderr, "ipvlan: unknown option \"%s\"?\n",
*argv);
-   explain();
+   ipvlan_explain(stderr);
return -1;
}
-   argc--, argv++;
+   argc--;
+   argv++;
}
 
return 0;
@@ -78,7 +71,8 @@ static void ipvlan_print_opt(struct link_util *lu, FILE *f, 
struct rtattr *tb[])
 
fprintf(f, " mode %s ",
mode == IPVLAN_MODE_L2 ? "l2" :
-   mode == IPVLAN_MODE_L3 ? "l3" : "unknown");
+   mode == IPVLAN_MODE_L3 ? "l3" :
+   mode == IPVLAN_MODE_L3S ? "l3s" : "unknown");
}
}
 }
-- 
2.8.0.rc3.226.g39d4020



[PATCH next] ipvlan: Fix dependency issue

2016-09-19 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

kbuild-build-bot reported that if NETFILTER is not selected, the
build fails pointing to netfilter symbols.

Fixes: 4fbae7d83c98 ("ipvlan: Introduce l3s mode")

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 drivers/net/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 8768a625350d..95c32f2d7601 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -149,6 +149,7 @@ config IPVLAN
 tristate "IP-VLAN support"
 depends on INET
 depends on IPV6
+depends on NETFILTER
 depends on NET_L3_MASTER_DEV
 ---help---
   This allows one to create virtual devices off of a main interface
-- 
2.8.0.rc3.226.g39d4020



[PATCHv4 next 2/3] net: Add _nf_(un)register_hooks symbols

2016-09-16 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

Add _nf_register_hooks() and _nf_unregister_hooks() calls which allow
caller to hold RTNL mutex.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
CC: Pablo Neira Ayuso <pa...@netfilter.org>
---
 include/linux/netfilter.h |  2 ++
 net/netfilter/core.c  | 51 ++-
 2 files changed, 48 insertions(+), 5 deletions(-)

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 9230f9aee896..e82b76781bf6 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -133,6 +133,8 @@ int nf_register_hook(struct nf_hook_ops *reg);
 void nf_unregister_hook(struct nf_hook_ops *reg);
 int nf_register_hooks(struct nf_hook_ops *reg, unsigned int n);
 void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n);
+int _nf_register_hooks(struct nf_hook_ops *reg, unsigned int n);
+void _nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n);
 
 /* Functions to register get/setsockopt ranges (non-inclusive).  You
need to check permissions yourself! */
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index f39276d1c2d7..2c5327e43a88 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -188,19 +188,17 @@ EXPORT_SYMBOL(nf_unregister_net_hooks);
 
 static LIST_HEAD(nf_hook_list);
 
-int nf_register_hook(struct nf_hook_ops *reg)
+static int _nf_register_hook(struct nf_hook_ops *reg)
 {
struct net *net, *last;
int ret;
 
-   rtnl_lock();
for_each_net(net) {
ret = nf_register_net_hook(net, reg);
if (ret && ret != -ENOENT)
goto rollback;
}
list_add_tail(>list, _hook_list);
-   rtnl_unlock();
 
return 0;
 rollback:
@@ -210,19 +208,34 @@ rollback:
break;
nf_unregister_net_hook(net, reg);
}
+   return ret;
+}
+
+int nf_register_hook(struct nf_hook_ops *reg)
+{
+   int ret;
+
+   rtnl_lock();
+   ret = _nf_register_hook(reg);
rtnl_unlock();
+
return ret;
 }
 EXPORT_SYMBOL(nf_register_hook);
 
-void nf_unregister_hook(struct nf_hook_ops *reg)
+static void _nf_unregister_hook(struct nf_hook_ops *reg)
 {
struct net *net;
 
-   rtnl_lock();
list_del(>list);
for_each_net(net)
nf_unregister_net_hook(net, reg);
+}
+
+void nf_unregister_hook(struct nf_hook_ops *reg)
+{
+   rtnl_lock();
+   _nf_unregister_hook(reg);
rtnl_unlock();
 }
 EXPORT_SYMBOL(nf_unregister_hook);
@@ -246,6 +259,26 @@ err:
 }
 EXPORT_SYMBOL(nf_register_hooks);
 
+/* Caller MUST take rtnl_lock() */
+int _nf_register_hooks(struct nf_hook_ops *reg, unsigned int n)
+{
+   unsigned int i;
+   int err = 0;
+
+   for (i = 0; i < n; i++) {
+   err = _nf_register_hook([i]);
+   if (err)
+   goto err;
+   }
+   return err;
+
+err:
+   if (i > 0)
+   _nf_unregister_hooks(reg, i);
+   return err;
+}
+EXPORT_SYMBOL(_nf_register_hooks);
+
 void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n)
 {
while (n-- > 0)
@@ -253,6 +286,14 @@ void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned 
int n)
 }
 EXPORT_SYMBOL(nf_unregister_hooks);
 
+/* Caller MUST take rtnl_lock */
+void _nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n)
+{
+   while (n-- > 0)
+   _nf_unregister_hook([n]);
+}
+EXPORT_SYMBOL(_nf_unregister_hooks);
+
 unsigned int nf_iterate(struct list_head *head,
struct sk_buff *skb,
struct nf_hook_state *state,
-- 
2.8.0.rc3.226.g39d4020



[PATCHv4 next 3/3] ipvlan: Introduce l3s mode

2016-09-16 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

In a typical IPvlan L3 setup, the master is in the default-ns and
each slave is in a different (slave) ns. In this setup egress
packet processing for traffic originating from slave-ns will
hit all NF_HOOKs in slave-ns as well as default-ns. However same
is not true for ingress processing. All these NF_HOOKs are
hit only in the slave-ns skipping them in the default-ns.
IPvlan in L3 mode is restrictive and if admins want to deploy
iptables rules in default-ns, this asymmetric data path makes it
impossible to do so.

This patch makes use of the l3_rcv() (added as part of l3mdev
enhancements) to perform input route lookup on RX packets without
changing the skb->dev and then uses nf_hook at NF_INET_LOCAL_IN
to change the skb->dev just before handing over skb to L4.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
CC: David Ahern <d...@cumulusnetworks.com>
---
 Documentation/networking/ipvlan.txt |  7 ++-
 drivers/net/Kconfig |  1 +
 drivers/net/ipvlan/ipvlan.h |  6 +++
 drivers/net/ipvlan/ipvlan_core.c| 94 +
 drivers/net/ipvlan/ipvlan_main.c| 87 +++---
 include/uapi/linux/if_link.h|  1 +
 6 files changed, 188 insertions(+), 8 deletions(-)

diff --git a/Documentation/networking/ipvlan.txt 
b/Documentation/networking/ipvlan.txt
index 14422f8fcdc4..24196cef7c91 100644
--- a/Documentation/networking/ipvlan.txt
+++ b/Documentation/networking/ipvlan.txt
@@ -22,7 +22,7 @@ The driver can be built into the kernel (CONFIG_IPVLAN=y) or 
as a module
There are no module parameters for this driver and it can be configured
 using IProute2/ip utility.
 
-   ip link add link   type ipvlan mode { l2 | L3 }
+   ip link add link   type ipvlan mode { l2 | l3 | 
l3s }
 
e.g. ip link add link ipvl0 eth0 type ipvlan mode l2
 
@@ -48,6 +48,11 @@ master device for the L2 processing and routing from that 
instance will be
 used before packets are queued on the outbound device. In this mode the slaves
 will not receive nor can send multicast / broadcast traffic.
 
+4.3 L3S mode:
+   This is very similar to the L3 mode except that iptables (conn-tracking)
+works in this mode and hence it is L3-symmetric (L3s). This will have slightly 
less
+performance but that shouldn't matter since you are choosing this mode over 
plain-L3
+mode to make conn-tracking work.
 
 5. What to choose (macvlan vs. ipvlan)?
These two devices are very similar in many regards and the specific use
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 0c5415b05ea9..8768a625350d 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -149,6 +149,7 @@ config IPVLAN
 tristate "IP-VLAN support"
 depends on INET
 depends on IPV6
+depends on NET_L3_MASTER_DEV
 ---help---
   This allows one to create virtual devices off of a main interface
   and packets will be delivered based on the dest L3 (IPv6/IPv4 addr)
diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h
index 695a5dc9ace3..7e0732f5ea07 100644
--- a/drivers/net/ipvlan/ipvlan.h
+++ b/drivers/net/ipvlan/ipvlan.h
@@ -23,11 +23,13 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
 #include 
 #include 
+#include 
 
 #define IPVLAN_DRV "ipvlan"
 #define IPV_DRV_VER"0.1"
@@ -124,4 +126,8 @@ struct ipvl_addr *ipvlan_find_addr(const struct ipvl_dev 
*ipvlan,
   const void *iaddr, bool is_v6);
 bool ipvlan_addr_busy(struct ipvl_port *port, void *iaddr, bool is_v6);
 void ipvlan_ht_addr_del(struct ipvl_addr *addr);
+struct sk_buff *ipvlan_l3_rcv(struct net_device *dev, struct sk_buff *skb,
+ u16 proto);
+unsigned int ipvlan_nf_input(void *priv, struct sk_buff *skb,
+const struct nf_hook_state *state);
 #endif /* __IPVLAN_H */
diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c
index b5f9511d819e..b4e990743e1d 100644
--- a/drivers/net/ipvlan/ipvlan_core.c
+++ b/drivers/net/ipvlan/ipvlan_core.c
@@ -560,6 +560,7 @@ int ipvlan_queue_xmit(struct sk_buff *skb, struct 
net_device *dev)
case IPVLAN_MODE_L2:
return ipvlan_xmit_mode_l2(skb, dev);
case IPVLAN_MODE_L3:
+   case IPVLAN_MODE_L3S:
return ipvlan_xmit_mode_l3(skb, dev);
}
 
@@ -664,6 +665,8 @@ rx_handler_result_t ipvlan_handle_frame(struct sk_buff 
**pskb)
return ipvlan_handle_mode_l2(pskb, port);
case IPVLAN_MODE_L3:
return ipvlan_handle_mode_l3(pskb, port);
+   case IPVLAN_MODE_L3S:
+   return RX_HANDLER_PASS;
}
 
/* Should not reach here */
@@ -672,3 +675,94 @@ rx_handler_result_t ipvlan_handle_frame(struct sk_buff 
**pskb)
kfree_skb(skb);
return RX_HANDLER_CONSUMED;
 }
+
+static st

[PATCHv4 next 0/3] IPvlan introduce l3s mode

2016-09-16 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

Same old problem with new approach especially from suggestions from
earlier patch-series.

First thing is that this is introduced as a new mode rather than
modifying the old (L3) mode. So the behavior of the existing modes is
preserved as it is and the new L3s mode obeys iptables so that intended
conn-tracking can work. 

To do this, the code uses newly added l3mdev_rcv() handler and an
Iptables hook. l3mdev_rcv() to perform an inbound route lookup with the
correct (IPvlan slave) interface and then IPtable-hook at LOCAL_INPUT
to change the input device from master to the slave to complete the
formality.

Supporting stack changes are trivial changes to export symbol to get
IPv4 equivalent code exported for IPv6 and to allow netfilter hook
registration code to allow caller to hold RTNL. Please look into
individual patches for details.

Mahesh Bandewar (3):
  ipv6: Export ip6_route_input_lookup symbol
  net: Add _nf_(un)register_hooks symbols
  ipvlan: Introduce l3s mode

 Documentation/networking/ipvlan.txt |  7 ++-
 drivers/net/Kconfig |  1 +
 drivers/net/ipvlan/ipvlan.h |  6 +++
 drivers/net/ipvlan/ipvlan_core.c| 94 +
 drivers/net/ipvlan/ipvlan_main.c| 87 +++---
 include/linux/netfilter.h   |  2 +
 include/net/ip6_route.h |  3 ++
 include/uapi/linux/if_link.h|  1 +
 net/ipv6/route.c|  7 +--
 net/netfilter/core.c| 51 ++--
 10 files changed, 243 insertions(+), 16 deletions(-)

v1: Initial post
v2: Text correction and config changed from "select" to "depends on"
v3: separated nf_hook registration logic and made it independent of port
as nf_hook registration is independent of how many IPvlan ports are
present in the system.
v4: Eliminated need to have "hooks_attached" per port and rely just on
the mode. Also change BUG_ON to WARN_ON

-- 
2.8.0.rc3.226.g39d4020



[PATCHv4 next 1/3] ipv6: Export ip6_route_input_lookup symbol

2016-09-16 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

Make ip6_route_input_lookup available outside of the ipv6 module,
similar to ip_route_input_noref in the IPv4 world.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 include/net/ip6_route.h | 3 +++
 net/ipv6/route.c| 7 ---
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index d97305d0e71f..e0cd318d5103 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -64,6 +64,9 @@ static inline bool rt6_need_strict(const struct in6_addr 
*daddr)
 }
 
 void ip6_route_input(struct sk_buff *skb);
+struct dst_entry *ip6_route_input_lookup(struct net *net,
+struct net_device *dev,
+struct flowi6 *fl6, int flags);
 
 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock 
*sk,
 struct flowi6 *fl6, int flags);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index ad4a7ff301fc..4dab585f7642 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1147,15 +1147,16 @@ static struct rt6_info *ip6_pol_route_input(struct net 
*net, struct fib6_table *
return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
 }
 
-static struct dst_entry *ip6_route_input_lookup(struct net *net,
-   struct net_device *dev,
-   struct flowi6 *fl6, int flags)
+struct dst_entry *ip6_route_input_lookup(struct net *net,
+struct net_device *dev,
+struct flowi6 *fl6, int flags)
 {
if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
flags |= RT6_LOOKUP_F_IFACE;
 
return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
 }
+EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
 
 void ip6_route_input(struct sk_buff *skb)
 {
-- 
2.8.0.rc3.226.g39d4020



[PATCHv3 next 0/3] IPvlan introduce l3s mode

2016-09-15 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

Same old problem with new approach especially from suggestions from
earlier patch-series.

First thing is that this is introduced as a new mode rather than
modifying the old (L3) mode. So the behavior of the existing modes is
preserved as it is and the new L3s mode obeys iptables so that intended
conn-tracking can work. 

To do this, the code uses newly added l3mdev_rcv() handler and an
Iptables hook. l3mdev_rcv() to perform an inbound route lookup with the
correct (IPvlan slave) interface and then IPtable-hook at LOCAL_INPUT
to change the input device from master to the slave to complete the
formality.

Supporting stack changes are trivial changes to export symbol to get
IPv4 equivalent code exported for IPv6 and to allow netfilter hook
registration code to allow caller to hold RTNL. Please look into
individual patches for details.

Mahesh Bandewar (3):
  ipv6: Export ip6_route_input_lookup symbol
  net: Add _nf_(un)register_hooks symbols
  ipvlan: Introduce l3s mode

 Documentation/networking/ipvlan.txt |  7 ++-
 drivers/net/Kconfig |  1 +
 drivers/net/ipvlan/ipvlan.h |  7 +++
 drivers/net/ipvlan/ipvlan_core.c| 94 +
 drivers/net/ipvlan/ipvlan_main.c| 92 +---
 include/linux/netfilter.h   |  2 +
 include/net/ip6_route.h |  3 ++
 include/uapi/linux/if_link.h|  1 +
 net/ipv6/route.c|  7 +--
 net/netfilter/core.c| 51 ++--
 10 files changed, 249 insertions(+), 16 deletions(-)

v1: Initial post
v2: Text correction and config changed from "select" to "depends on"
v3: separated nf_hook registration logic and made it independent of port
as nf_hook registration is independent of how many IPvlan ports are
present in the system.

-- 
2.8.0.rc3.226.g39d4020



[PATCHv3 next 2/3] net: Add _nf_(un)register_hooks symbols

2016-09-15 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

Add _nf_register_hooks() and _nf_unregister_hooks() calls which allow
caller to hold RTNL mutex.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
CC: Pablo Neira Ayuso <pa...@netfilter.org>
---
 include/linux/netfilter.h |  2 ++
 net/netfilter/core.c  | 51 ++-
 2 files changed, 48 insertions(+), 5 deletions(-)

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 9230f9aee896..e82b76781bf6 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -133,6 +133,8 @@ int nf_register_hook(struct nf_hook_ops *reg);
 void nf_unregister_hook(struct nf_hook_ops *reg);
 int nf_register_hooks(struct nf_hook_ops *reg, unsigned int n);
 void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n);
+int _nf_register_hooks(struct nf_hook_ops *reg, unsigned int n);
+void _nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n);
 
 /* Functions to register get/setsockopt ranges (non-inclusive).  You
need to check permissions yourself! */
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index f39276d1c2d7..2c5327e43a88 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -188,19 +188,17 @@ EXPORT_SYMBOL(nf_unregister_net_hooks);
 
 static LIST_HEAD(nf_hook_list);
 
-int nf_register_hook(struct nf_hook_ops *reg)
+static int _nf_register_hook(struct nf_hook_ops *reg)
 {
struct net *net, *last;
int ret;
 
-   rtnl_lock();
for_each_net(net) {
ret = nf_register_net_hook(net, reg);
if (ret && ret != -ENOENT)
goto rollback;
}
list_add_tail(&reg->list, &nf_hook_list);
-   rtnl_unlock();
 
return 0;
 rollback:
@@ -210,19 +208,34 @@ rollback:
break;
nf_unregister_net_hook(net, reg);
}
+   return ret;
+}
+
+int nf_register_hook(struct nf_hook_ops *reg)
+{
+   int ret;
+
+   rtnl_lock();
+   ret = _nf_register_hook(reg);
rtnl_unlock();
+
return ret;
 }
 EXPORT_SYMBOL(nf_register_hook);
 
-void nf_unregister_hook(struct nf_hook_ops *reg)
+static void _nf_unregister_hook(struct nf_hook_ops *reg)
 {
struct net *net;
 
-   rtnl_lock();
list_del(&reg->list);
for_each_net(net)
nf_unregister_net_hook(net, reg);
+}
+
+void nf_unregister_hook(struct nf_hook_ops *reg)
+{
+   rtnl_lock();
+   _nf_unregister_hook(reg);
rtnl_unlock();
 }
 EXPORT_SYMBOL(nf_unregister_hook);
@@ -246,6 +259,26 @@ err:
 }
 EXPORT_SYMBOL(nf_register_hooks);
 
+/* Caller MUST take rtnl_lock() */
+int _nf_register_hooks(struct nf_hook_ops *reg, unsigned int n)
+{
+   unsigned int i;
+   int err = 0;
+
+   for (i = 0; i < n; i++) {
+   err = _nf_register_hook(&reg[i]);
+   if (err)
+   goto err;
+   }
+   return err;
+
+err:
+   if (i > 0)
+   _nf_unregister_hooks(reg, i);
+   return err;
+}
+EXPORT_SYMBOL(_nf_register_hooks);
+
 void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n)
 {
while (n-- > 0)
@@ -253,6 +286,14 @@ void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned 
int n)
 }
 EXPORT_SYMBOL(nf_unregister_hooks);
 
+/* Caller MUST take rtnl_lock */
+void _nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n)
+{
+   while (n-- > 0)
+   _nf_unregister_hook(&reg[n]);
+}
+EXPORT_SYMBOL(_nf_unregister_hooks);
+
 unsigned int nf_iterate(struct list_head *head,
struct sk_buff *skb,
struct nf_hook_state *state,
-- 
2.8.0.rc3.226.g39d4020



[PATCHv3 next 1/3] ipv6: Export ip6_route_input_lookup symbol

2016-09-15 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

Make ip6_route_input_lookup available outside of the ipv6 module,
similar to ip_route_input_noref in the IPv4 world.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 include/net/ip6_route.h | 3 +++
 net/ipv6/route.c| 7 ---
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index d97305d0e71f..e0cd318d5103 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -64,6 +64,9 @@ static inline bool rt6_need_strict(const struct in6_addr 
*daddr)
 }
 
 void ip6_route_input(struct sk_buff *skb);
+struct dst_entry *ip6_route_input_lookup(struct net *net,
+struct net_device *dev,
+struct flowi6 *fl6, int flags);
 
 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock 
*sk,
 struct flowi6 *fl6, int flags);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index ad4a7ff301fc..4dab585f7642 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1147,15 +1147,16 @@ static struct rt6_info *ip6_pol_route_input(struct net 
*net, struct fib6_table *
return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
 }
 
-static struct dst_entry *ip6_route_input_lookup(struct net *net,
-   struct net_device *dev,
-   struct flowi6 *fl6, int flags)
+struct dst_entry *ip6_route_input_lookup(struct net *net,
+struct net_device *dev,
+struct flowi6 *fl6, int flags)
 {
if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
flags |= RT6_LOOKUP_F_IFACE;
 
return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
 }
+EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
 
 void ip6_route_input(struct sk_buff *skb)
 {
-- 
2.8.0.rc3.226.g39d4020



[PATCHv3 next 3/3] ipvlan: Introduce l3s mode

2016-09-15 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

In a typical IPvlan L3 setup the master is in default-ns and
each slave is in a different (slave) ns. In this setup egress
packet processing for traffic originating from slave-ns will
hit all NF_HOOKs in slave-ns as well as default-ns. However same
is not true for ingress processing. All these NF_HOOKs are
hit only in the slave-ns skipping them in the default-ns.
IPvlan in L3 mode is restrictive and if admins want to deploy
iptables rules in default-ns, this asymmetric data path makes it
impossible to do so.

This patch makes use of the l3_rcv() (added as part of l3mdev
enhancements) to perform input route lookup on RX packets without
changing the skb->dev and then uses nf_hook at NF_INET_LOCAL_IN
to change the skb->dev just before handing over skb to L4.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
CC: David Ahern <d...@cumulusnetworks.com>
---
 Documentation/networking/ipvlan.txt |  7 ++-
 drivers/net/Kconfig |  1 +
 drivers/net/ipvlan/ipvlan.h |  7 +++
 drivers/net/ipvlan/ipvlan_core.c| 94 +
 drivers/net/ipvlan/ipvlan_main.c| 92 +---
 include/uapi/linux/if_link.h|  1 +
 6 files changed, 194 insertions(+), 8 deletions(-)

diff --git a/Documentation/networking/ipvlan.txt 
b/Documentation/networking/ipvlan.txt
index 14422f8fcdc4..24196cef7c91 100644
--- a/Documentation/networking/ipvlan.txt
+++ b/Documentation/networking/ipvlan.txt
@@ -22,7 +22,7 @@ The driver can be built into the kernel (CONFIG_IPVLAN=y) or 
as a module
There are no module parameters for this driver and it can be configured
 using IProute2/ip utility.
 
-   ip link add link   type ipvlan mode { l2 | L3 }
+   ip link add link   type ipvlan mode { l2 | l3 | 
l3s }
 
e.g. ip link add link ipvl0 eth0 type ipvlan mode l2
 
@@ -48,6 +48,11 @@ master device for the L2 processing and routing from that 
instance will be
 used before packets are queued on the outbound device. In this mode the slaves
 will not receive nor can send multicast / broadcast traffic.
 
+4.3 L3S mode:
+   This is very similar to the L3 mode except that iptables (conn-tracking)
+works in this mode and hence it is L3-symmetric (L3s). This will have slightly 
less
+performance but that shouldn't matter since you are choosing this mode over 
plain-L3
+mode to make conn-tracking work.
 
 5. What to choose (macvlan vs. ipvlan)?
These two devices are very similar in many regards and the specific use
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 0c5415b05ea9..8768a625350d 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -149,6 +149,7 @@ config IPVLAN
 tristate "IP-VLAN support"
 depends on INET
 depends on IPV6
+depends on NET_L3_MASTER_DEV
 ---help---
   This allows one to create virtual devices off of a main interface
   and packets will be delivered based on the dest L3 (IPv6/IPv4 addr)
diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h
index 695a5dc9ace3..371f4548c42d 100644
--- a/drivers/net/ipvlan/ipvlan.h
+++ b/drivers/net/ipvlan/ipvlan.h
@@ -23,11 +23,13 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
 #include 
 #include 
+#include 
 
 #define IPVLAN_DRV "ipvlan"
 #define IPV_DRV_VER"0.1"
@@ -96,6 +98,7 @@ struct ipvl_port {
struct work_struct  wq;
struct sk_buff_head backlog;
int count;
+   boolhooks_attached;
struct rcu_head rcu;
 };
 
@@ -124,4 +127,8 @@ struct ipvl_addr *ipvlan_find_addr(const struct ipvl_dev 
*ipvlan,
   const void *iaddr, bool is_v6);
 bool ipvlan_addr_busy(struct ipvl_port *port, void *iaddr, bool is_v6);
 void ipvlan_ht_addr_del(struct ipvl_addr *addr);
+struct sk_buff *ipvlan_l3_rcv(struct net_device *dev, struct sk_buff *skb,
+ u16 proto);
+unsigned int ipvlan_nf_input(void *priv, struct sk_buff *skb,
+const struct nf_hook_state *state);
 #endif /* __IPVLAN_H */
diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c
index b5f9511d819e..b4e990743e1d 100644
--- a/drivers/net/ipvlan/ipvlan_core.c
+++ b/drivers/net/ipvlan/ipvlan_core.c
@@ -560,6 +560,7 @@ int ipvlan_queue_xmit(struct sk_buff *skb, struct 
net_device *dev)
case IPVLAN_MODE_L2:
return ipvlan_xmit_mode_l2(skb, dev);
case IPVLAN_MODE_L3:
+   case IPVLAN_MODE_L3S:
return ipvlan_xmit_mode_l3(skb, dev);
}
 
@@ -664,6 +665,8 @@ rx_handler_result_t ipvlan_handle_frame(struct sk_buff 
**pskb)
return ipvlan_handle_mode_l2(pskb, port);
case IPVLAN_MODE_L3:
return ipvlan_handle_mode_l3(pskb, port);
+   case I

[PATCHv2 next 3/3] ipvlan: Introduce l3s mode

2016-09-12 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

In a typical IPvlan L3 setup the master is in default-ns and
each slave is in a different (slave) ns. In this setup egress
packet processing for traffic originating from slave-ns will
hit all NF_HOOKs in slave-ns as well as default-ns. However same
is not true for ingress processing. All these NF_HOOKs are
hit only in the slave-ns skipping them in the default-ns.
IPvlan in L3 mode is restrictive and if admins want to deploy
iptables rules in default-ns, this asymmetric data path makes it
impossible to do so.

This patch makes use of the l3_rcv() (added as part of l3mdev
enhancements) to perform input route lookup on RX packets without
changing the skb->dev and then uses nf_hook at NF_INET_LOCAL_IN
to change the skb->dev just before handing over skb to L4.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 Documentation/networking/ipvlan.txt |  7 ++-
 drivers/net/Kconfig |  1 +
 drivers/net/ipvlan/ipvlan.h |  7 +++
 drivers/net/ipvlan/ipvlan_core.c| 94 +
 drivers/net/ipvlan/ipvlan_main.c| 60 ---
 include/uapi/linux/if_link.h|  1 +
 6 files changed, 162 insertions(+), 8 deletions(-)

diff --git a/Documentation/networking/ipvlan.txt 
b/Documentation/networking/ipvlan.txt
index 14422f8fcdc4..24196cef7c91 100644
--- a/Documentation/networking/ipvlan.txt
+++ b/Documentation/networking/ipvlan.txt
@@ -22,7 +22,7 @@ The driver can be built into the kernel (CONFIG_IPVLAN=y) or 
as a module
There are no module parameters for this driver and it can be configured
 using IProute2/ip utility.
 
-   ip link add link   type ipvlan mode { l2 | L3 }
+   ip link add link   type ipvlan mode { l2 | l3 | 
l3s }
 
e.g. ip link add link ipvl0 eth0 type ipvlan mode l2
 
@@ -48,6 +48,11 @@ master device for the L2 processing and routing from that 
instance will be
 used before packets are queued on the outbound device. In this mode the slaves
 will not receive nor can send multicast / broadcast traffic.
 
+4.3 L3S mode:
+   This is very similar to the L3 mode except that iptables (conn-tracking)
+works in this mode and hence it is L3-symmetric (L3s). This will have slightly 
less
+performance but that shouldn't matter since you are choosing this mode over 
plain-L3
+mode to make conn-tracking work.
 
 5. What to choose (macvlan vs. ipvlan)?
These two devices are very similar in many regards and the specific use
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 0c5415b05ea9..8768a625350d 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -149,6 +149,7 @@ config IPVLAN
 tristate "IP-VLAN support"
 depends on INET
 depends on IPV6
+depends on NET_L3_MASTER_DEV
 ---help---
   This allows one to create virtual devices off of a main interface
   and packets will be delivered based on the dest L3 (IPv6/IPv4 addr)
diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h
index 695a5dc9ace3..68b270b59ba9 100644
--- a/drivers/net/ipvlan/ipvlan.h
+++ b/drivers/net/ipvlan/ipvlan.h
@@ -23,11 +23,13 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
 #include 
 #include 
+#include 
 
 #define IPVLAN_DRV "ipvlan"
 #define IPV_DRV_VER"0.1"
@@ -96,6 +98,7 @@ struct ipvl_port {
struct work_struct  wq;
struct sk_buff_head backlog;
int count;
+   boolipt_hook_added;
struct rcu_head rcu;
 };
 
@@ -124,4 +127,8 @@ struct ipvl_addr *ipvlan_find_addr(const struct ipvl_dev 
*ipvlan,
   const void *iaddr, bool is_v6);
 bool ipvlan_addr_busy(struct ipvl_port *port, void *iaddr, bool is_v6);
 void ipvlan_ht_addr_del(struct ipvl_addr *addr);
+struct sk_buff *ipvlan_l3_rcv(struct net_device *dev, struct sk_buff *skb,
+ u16 proto);
+unsigned int ipvlan_nf_input(void *priv, struct sk_buff *skb,
+const struct nf_hook_state *state);
 #endif /* __IPVLAN_H */
diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c
index b5f9511d819e..b4e990743e1d 100644
--- a/drivers/net/ipvlan/ipvlan_core.c
+++ b/drivers/net/ipvlan/ipvlan_core.c
@@ -560,6 +560,7 @@ int ipvlan_queue_xmit(struct sk_buff *skb, struct 
net_device *dev)
case IPVLAN_MODE_L2:
return ipvlan_xmit_mode_l2(skb, dev);
case IPVLAN_MODE_L3:
+   case IPVLAN_MODE_L3S:
return ipvlan_xmit_mode_l3(skb, dev);
}
 
@@ -664,6 +665,8 @@ rx_handler_result_t ipvlan_handle_frame(struct sk_buff 
**pskb)
return ipvlan_handle_mode_l2(pskb, port);
case IPVLAN_MODE_L3:
return ipvlan_handle_mode_l3(pskb, port);
+   case IPVLAN_MODE_L3S:
+   return RX_HANDLER_PASS;
}
 
 

[PATCHv2 next 2/3] net: Add _nf_(un)register_hooks symbols

2016-09-12 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

Add _nf_register_hooks() and _nf_unregister_hooks() calls which allow
caller to hold RTNL mutex.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
CC: Pablo Neira Ayuso <pa...@netfilter.org>
---
 include/linux/netfilter.h |  2 ++
 net/netfilter/core.c  | 51 ++-
 2 files changed, 48 insertions(+), 5 deletions(-)

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 9230f9aee896..e82b76781bf6 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -133,6 +133,8 @@ int nf_register_hook(struct nf_hook_ops *reg);
 void nf_unregister_hook(struct nf_hook_ops *reg);
 int nf_register_hooks(struct nf_hook_ops *reg, unsigned int n);
 void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n);
+int _nf_register_hooks(struct nf_hook_ops *reg, unsigned int n);
+void _nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n);
 
 /* Functions to register get/setsockopt ranges (non-inclusive).  You
need to check permissions yourself! */
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index f39276d1c2d7..2c5327e43a88 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -188,19 +188,17 @@ EXPORT_SYMBOL(nf_unregister_net_hooks);
 
 static LIST_HEAD(nf_hook_list);
 
-int nf_register_hook(struct nf_hook_ops *reg)
+static int _nf_register_hook(struct nf_hook_ops *reg)
 {
struct net *net, *last;
int ret;
 
-   rtnl_lock();
for_each_net(net) {
ret = nf_register_net_hook(net, reg);
if (ret && ret != -ENOENT)
goto rollback;
}
list_add_tail(&reg->list, &nf_hook_list);
-   rtnl_unlock();
 
return 0;
 rollback:
@@ -210,19 +208,34 @@ rollback:
break;
nf_unregister_net_hook(net, reg);
}
+   return ret;
+}
+
+int nf_register_hook(struct nf_hook_ops *reg)
+{
+   int ret;
+
+   rtnl_lock();
+   ret = _nf_register_hook(reg);
rtnl_unlock();
+
return ret;
 }
 EXPORT_SYMBOL(nf_register_hook);
 
-void nf_unregister_hook(struct nf_hook_ops *reg)
+static void _nf_unregister_hook(struct nf_hook_ops *reg)
 {
struct net *net;
 
-   rtnl_lock();
list_del(&reg->list);
for_each_net(net)
nf_unregister_net_hook(net, reg);
+}
+
+void nf_unregister_hook(struct nf_hook_ops *reg)
+{
+   rtnl_lock();
+   _nf_unregister_hook(reg);
rtnl_unlock();
 }
 EXPORT_SYMBOL(nf_unregister_hook);
@@ -246,6 +259,26 @@ err:
 }
 EXPORT_SYMBOL(nf_register_hooks);
 
+/* Caller MUST take rtnl_lock() */
+int _nf_register_hooks(struct nf_hook_ops *reg, unsigned int n)
+{
+   unsigned int i;
+   int err = 0;
+
+   for (i = 0; i < n; i++) {
+   err = _nf_register_hook(&reg[i]);
+   if (err)
+   goto err;
+   }
+   return err;
+
+err:
+   if (i > 0)
+   _nf_unregister_hooks(reg, i);
+   return err;
+}
+EXPORT_SYMBOL(_nf_register_hooks);
+
 void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n)
 {
while (n-- > 0)
@@ -253,6 +286,14 @@ void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned 
int n)
 }
 EXPORT_SYMBOL(nf_unregister_hooks);
 
+/* Caller MUST take rtnl_lock */
+void _nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n)
+{
+   while (n-- > 0)
+   _nf_unregister_hook(&reg[n]);
+}
+EXPORT_SYMBOL(_nf_unregister_hooks);
+
 unsigned int nf_iterate(struct list_head *head,
struct sk_buff *skb,
struct nf_hook_state *state,
-- 
2.8.0.rc3.226.g39d4020



[PATCHv2 next 1/3] ipv6: Export ip6_route_input_lookup symbol

2016-09-12 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

Make ip6_route_input_lookup available outside of the ipv6 module,
similar to ip_route_input_noref in the IPv4 world.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 include/net/ip6_route.h | 3 +++
 net/ipv6/route.c| 7 ---
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index d97305d0e71f..e0cd318d5103 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -64,6 +64,9 @@ static inline bool rt6_need_strict(const struct in6_addr 
*daddr)
 }
 
 void ip6_route_input(struct sk_buff *skb);
+struct dst_entry *ip6_route_input_lookup(struct net *net,
+struct net_device *dev,
+struct flowi6 *fl6, int flags);
 
 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock 
*sk,
 struct flowi6 *fl6, int flags);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 09d43ff11a8d..9563eedd4f97 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1147,15 +1147,16 @@ static struct rt6_info *ip6_pol_route_input(struct net 
*net, struct fib6_table *
return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
 }
 
-static struct dst_entry *ip6_route_input_lookup(struct net *net,
-   struct net_device *dev,
-   struct flowi6 *fl6, int flags)
+struct dst_entry *ip6_route_input_lookup(struct net *net,
+struct net_device *dev,
+struct flowi6 *fl6, int flags)
 {
if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
flags |= RT6_LOOKUP_F_IFACE;
 
return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
 }
+EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
 
 void ip6_route_input(struct sk_buff *skb)
 {
-- 
2.8.0.rc3.226.g39d4020



[PATCHv2 next 0/3] ipvlan introduce l3s mode

2016-09-12 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

Same old problem with new approach especially from suggestions from
earlier patch-series.

First thing is that this is introduced as a new mode rather than
modifying the old (L3) mode. So the behavior of the existing modes is
preserved as it is and the new L3s mode obeys iptables so that intended
conn-tracking can work. 

To do this, the code uses newly added l3mdev_rcv() handler and an
Iptables hook. l3mdev_rcv() to perform an inbound route lookup with the
correct (IPvlan slave) interface and then IPtable-hook at LOCAL_INPUT
to change the input device from master to the slave to complete the
formality.

Supporting stack changes are trivial changes to export symbol to get
IPv4 equivalent code exported for IPv6 and to allow netfilter hook
registration code to allow caller to hold RTNL. Please look into
individual patches for details.

Mahesh Bandewar (3):
  ipv6: Export ip6_route_input_lookup symbol
  net: Add _nf_(un)register_hooks symbols
  ipvlan: Introduce l3s mode

 Documentation/networking/ipvlan.txt |  7 ++-
 drivers/net/Kconfig |  1 +
 drivers/net/ipvlan/ipvlan.h |  7 +++
 drivers/net/ipvlan/ipvlan_core.c| 94 +
 drivers/net/ipvlan/ipvlan_main.c| 60 ---
 include/linux/netfilter.h   |  2 +
 include/net/ip6_route.h |  3 ++
 include/uapi/linux/if_link.h|  1 +
 net/ipv6/route.c|  7 +--
 net/netfilter/core.c| 51 ++--
 10 files changed, 217 insertions(+), 16 deletions(-)


v1: Initial post
v2: Text correction and config changed from "select" to "depends on"
-- 
2.8.0.rc3.226.g39d4020



[PATCH next 2/3] net: Add _nf_(un)register_hooks symbols

2016-09-09 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

Add _nf_register_hooks() and _nf_unregister_hooks() calls which allow
caller to hold RTNL mutex.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 include/linux/netfilter.h |  2 ++
 net/netfilter/core.c  | 51 ++-
 2 files changed, 48 insertions(+), 5 deletions(-)

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 9230f9aee896..e82b76781bf6 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -133,6 +133,8 @@ int nf_register_hook(struct nf_hook_ops *reg);
 void nf_unregister_hook(struct nf_hook_ops *reg);
 int nf_register_hooks(struct nf_hook_ops *reg, unsigned int n);
 void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n);
+int _nf_register_hooks(struct nf_hook_ops *reg, unsigned int n);
+void _nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n);
 
 /* Functions to register get/setsockopt ranges (non-inclusive).  You
need to check permissions yourself! */
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index f39276d1c2d7..2c5327e43a88 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -188,19 +188,17 @@ EXPORT_SYMBOL(nf_unregister_net_hooks);
 
 static LIST_HEAD(nf_hook_list);
 
-int nf_register_hook(struct nf_hook_ops *reg)
+static int _nf_register_hook(struct nf_hook_ops *reg)
 {
struct net *net, *last;
int ret;
 
-   rtnl_lock();
for_each_net(net) {
ret = nf_register_net_hook(net, reg);
if (ret && ret != -ENOENT)
goto rollback;
}
list_add_tail(&reg->list, &nf_hook_list);
-   rtnl_unlock();
 
return 0;
 rollback:
@@ -210,19 +208,34 @@ rollback:
break;
nf_unregister_net_hook(net, reg);
}
+   return ret;
+}
+
+int nf_register_hook(struct nf_hook_ops *reg)
+{
+   int ret;
+
+   rtnl_lock();
+   ret = _nf_register_hook(reg);
rtnl_unlock();
+
return ret;
 }
 EXPORT_SYMBOL(nf_register_hook);
 
-void nf_unregister_hook(struct nf_hook_ops *reg)
+static void _nf_unregister_hook(struct nf_hook_ops *reg)
 {
struct net *net;
 
-   rtnl_lock();
list_del(&reg->list);
for_each_net(net)
nf_unregister_net_hook(net, reg);
+}
+
+void nf_unregister_hook(struct nf_hook_ops *reg)
+{
+   rtnl_lock();
+   _nf_unregister_hook(reg);
rtnl_unlock();
 }
 EXPORT_SYMBOL(nf_unregister_hook);
@@ -246,6 +259,26 @@ err:
 }
 EXPORT_SYMBOL(nf_register_hooks);
 
+/* Caller MUST take rtnl_lock() */
+int _nf_register_hooks(struct nf_hook_ops *reg, unsigned int n)
+{
+   unsigned int i;
+   int err = 0;
+
+   for (i = 0; i < n; i++) {
+   err = _nf_register_hook(&reg[i]);
+   if (err)
+   goto err;
+   }
+   return err;
+
+err:
+   if (i > 0)
+   _nf_unregister_hooks(reg, i);
+   return err;
+}
+EXPORT_SYMBOL(_nf_register_hooks);
+
 void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n)
 {
while (n-- > 0)
@@ -253,6 +286,14 @@ void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned 
int n)
 }
 EXPORT_SYMBOL(nf_unregister_hooks);
 
+/* Caller MUST take rtnl_lock */
+void _nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n)
+{
+   while (n-- > 0)
+   _nf_unregister_hook(&reg[n]);
+}
+EXPORT_SYMBOL(_nf_unregister_hooks);
+
 unsigned int nf_iterate(struct list_head *head,
struct sk_buff *skb,
struct nf_hook_state *state,
-- 
2.8.0.rc3.226.g39d4020



[PATCH next 1/3] ipv6: Export ip6_route_input_lookup symbol

2016-09-09 Thread Mahesh Bandewar
From: Mahesh Bandewar <mahe...@google.com>

Make ip6_route_input_lookup available outside of the ipv6 module,
similar to ip_route_input_noref in the IPv4 world.

Signed-off-by: Mahesh Bandewar <mahe...@google.com>
---
 include/net/ip6_route.h | 3 +++
 net/ipv6/route.c| 7 ---
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index d97305d0e71f..e0cd318d5103 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -64,6 +64,9 @@ static inline bool rt6_need_strict(const struct in6_addr 
*daddr)
 }
 
 void ip6_route_input(struct sk_buff *skb);
+struct dst_entry *ip6_route_input_lookup(struct net *net,
+struct net_device *dev,
+struct flowi6 *fl6, int flags);
 
 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock 
*sk,
 struct flowi6 *fl6, int flags);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 09d43ff11a8d..9563eedd4f97 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1147,15 +1147,16 @@ static struct rt6_info *ip6_pol_route_input(struct net 
*net, struct fib6_table *
return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
 }
 
-static struct dst_entry *ip6_route_input_lookup(struct net *net,
-   struct net_device *dev,
-   struct flowi6 *fl6, int flags)
+struct dst_entry *ip6_route_input_lookup(struct net *net,
+struct net_device *dev,
+struct flowi6 *fl6, int flags)
 {
if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
flags |= RT6_LOOKUP_F_IFACE;
 
return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
 }
+EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
 
 void ip6_route_input(struct sk_buff *skb)
 {
-- 
2.8.0.rc3.226.g39d4020



  1   2   >