Re: [PATCH RESEND net-next 1/2] br_netfilter: add struct netns_brnf

2019-06-06 Thread Pablo Neira Ayuso
On Thu, Jun 06, 2019 at 05:19:39PM +0200, Christian Brauner wrote:
> On Thu, Jun 06, 2019 at 08:14:40AM -0700, Stephen Hemminger wrote:
> > On Thu,  6 Jun 2019 13:41:41 +0200
> > Christian Brauner  wrote:
> > 
> > > +struct netns_brnf {
> > > +#ifdef CONFIG_SYSCTL
> > > + struct ctl_table_header *ctl_hdr;
> > > +#endif
> > > +
> > > + /* default value is 1 */
> > > + int call_iptables;
> > > + int call_ip6tables;
> > > + int call_arptables;
> > > +
> > > + /* default value is 0 */
> > > + int filter_vlan_tagged;
> > > + int filter_pppoe_tagged;
> > > + int pass_vlan_indev;
> > > +};
> > 
> > Do you really need to waste four bytes for each
> > flag value. If you use a u8 that would work just as well.
> 
> I think we had discussed something like this but the problem why we
> can't do this stems from how the sysctl-table stuff is implemented.
> I distinctly remember that it couldn't be done with a flag due to that.

Could you define a pernet_operations object? I mean, define the id and size
fields, then pass it to register_pernet_subsys() for registration.
Similar to what we do in net/ipv4/netfilter/ipt_CLUSTERIP.c, see
clusterip_net_ops and clusterip_pernet() for instance.


Re: [PATCH RESEND net-next 1/2] br_netfilter: add struct netns_brnf

2019-06-06 Thread Christian Brauner
On Thu, Jun 06, 2019 at 08:14:40AM -0700, Stephen Hemminger wrote:
> On Thu,  6 Jun 2019 13:41:41 +0200
> Christian Brauner  wrote:
> 
> > +struct netns_brnf {
> > +#ifdef CONFIG_SYSCTL
> > +   struct ctl_table_header *ctl_hdr;
> > +#endif
> > +
> > +   /* default value is 1 */
> > +   int call_iptables;
> > +   int call_ip6tables;
> > +   int call_arptables;
> > +
> > +   /* default value is 0 */
> > +   int filter_vlan_tagged;
> > +   int filter_pppoe_tagged;
> > +   int pass_vlan_indev;
> > +};
> 
> Do you really need to waste four bytes for each
> flag value. If you use a u8 that would work just as well.

I think we had discussed something like this, but the reason we
can't do this stems from how the sysctl-table stuff is implemented.
I distinctly remember that it couldn't be done with a flag due to that.

Christian


Re: [PATCH RESEND net-next 1/2] br_netfilter: add struct netns_brnf

2019-06-06 Thread Stephen Hemminger
On Thu,  6 Jun 2019 13:41:41 +0200
Christian Brauner  wrote:

> +struct netns_brnf {
> +#ifdef CONFIG_SYSCTL
> + struct ctl_table_header *ctl_hdr;
> +#endif
> +
> + /* default value is 1 */
> + int call_iptables;
> + int call_ip6tables;
> + int call_arptables;
> +
> + /* default value is 0 */
> + int filter_vlan_tagged;
> + int filter_pppoe_tagged;
> + int pass_vlan_indev;
> +};

Do you really need to waste four bytes for each
flag value? If you use a u8 that would work just as well.

Bool would also work but the kernel developers frown on bool
in structures.


[Bridge] [PATCH RESEND net-next 1/2] br_netfilter: add struct netns_brnf

2019-06-06 Thread Christian Brauner
This adds struct netns_brnf in preparation for per-network-namespace
br_netfilter settings. The individual br_netfilter sysctl options are moved
into a central place in struct net. The struct is only included when
the CONFIG_BRIDGE_NETFILTER kconfig option is enabled in the kernel.

Signed-off-by: Christian Brauner 
Reviewed-by: Tyler Hicks 
---
 include/net/net_namespace.h |  3 ++
 include/net/netns/netfilter.h   | 16 
 net/bridge/br_netfilter_hooks.c | 68 -
 3 files changed, 52 insertions(+), 35 deletions(-)

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 12689ddfc24c..a958d09dc14d 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -127,6 +127,9 @@ struct net {
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
struct netns_ct ct;
 #endif
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+   struct netns_brnf   brnf;
+#endif
 #if defined(CONFIG_NF_TABLES) || defined(CONFIG_NF_TABLES_MODULE)
struct netns_nftables   nft;
 #endif
diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h
index ca043342c0eb..eedbd1ac940e 100644
--- a/include/net/netns/netfilter.h
+++ b/include/net/netns/netfilter.h
@@ -35,4 +35,20 @@ struct netns_nf {
bool defrag_ipv6;
 #endif
 };
+
+struct netns_brnf {
+#ifdef CONFIG_SYSCTL
+   struct ctl_table_header *ctl_hdr;
+#endif
+
+   /* default value is 1 */
+   int call_iptables;
+   int call_ip6tables;
+   int call_arptables;
+
+   /* default value is 0 */
+   int filter_vlan_tagged;
+   int filter_pppoe_tagged;
+   int pass_vlan_indev;
+};
 #endif
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 34fa72c72ad8..b51c6b49fc6f 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -49,23 +49,6 @@ struct brnf_net {
bool enabled;
 };
 
-#ifdef CONFIG_SYSCTL
-static struct ctl_table_header *brnf_sysctl_header;
-static int brnf_call_iptables __read_mostly = 1;
-static int brnf_call_ip6tables __read_mostly = 1;
-static int brnf_call_arptables __read_mostly = 1;
-static int brnf_filter_vlan_tagged __read_mostly;
-static int brnf_filter_pppoe_tagged __read_mostly;
-static int brnf_pass_vlan_indev __read_mostly;
-#else
-#define brnf_call_iptables 1
-#define brnf_call_ip6tables 1
-#define brnf_call_arptables 1
-#define brnf_filter_vlan_tagged 0
-#define brnf_filter_pppoe_tagged 0
-#define brnf_pass_vlan_indev 0
-#endif
-
 #define IS_IP(skb) \
(!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IP))
 
@@ -87,15 +70,15 @@ static inline __be16 vlan_proto(const struct sk_buff *skb)
 
 #define IS_VLAN_IP(skb) \
(vlan_proto(skb) == htons(ETH_P_IP) && \
-brnf_filter_vlan_tagged)
+init_net.brnf.filter_vlan_tagged)
 
 #define IS_VLAN_IPV6(skb) \
(vlan_proto(skb) == htons(ETH_P_IPV6) && \
-brnf_filter_vlan_tagged)
+init_net.brnf.filter_vlan_tagged)
 
 #define IS_VLAN_ARP(skb) \
(vlan_proto(skb) == htons(ETH_P_ARP) && \
-brnf_filter_vlan_tagged)
+init_net.brnf.filter_vlan_tagged)
 
 static inline __be16 pppoe_proto(const struct sk_buff *skb)
 {
@@ -106,12 +89,12 @@ static inline __be16 pppoe_proto(const struct sk_buff *skb)
 #define IS_PPPOE_IP(skb) \
(skb->protocol == htons(ETH_P_PPP_SES) && \
 pppoe_proto(skb) == htons(PPP_IP) && \
-brnf_filter_pppoe_tagged)
+init_net.brnf.filter_pppoe_tagged)
 
 #define IS_PPPOE_IPV6(skb) \
(skb->protocol == htons(ETH_P_PPP_SES) && \
 pppoe_proto(skb) == htons(PPP_IPV6) && \
-brnf_filter_pppoe_tagged)
+init_net.brnf.filter_pppoe_tagged)
 
 /* largest possible L2 header, see br_nf_dev_queue_xmit() */
 #define NF_BRIDGE_MAX_MAC_HEADER_LENGTH (PPPOE_SES_HLEN + ETH_HLEN)
@@ -413,7 +396,7 @@ static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct
struct net_device *vlan, *br;
 
br = bridge_parent(dev);
-   if (brnf_pass_vlan_indev == 0 || !skb_vlan_tag_present(skb))
+   if (init_net.brnf.pass_vlan_indev == 0 || !skb_vlan_tag_present(skb))
return br;
 
vlan = __vlan_find_dev_deep_rcu(br, skb->vlan_proto,
@@ -470,7 +453,7 @@ static unsigned int br_nf_pre_routing(void *priv,
br = p->br;
 
if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) {
-   if (!brnf_call_ip6tables &&
+   if (!init_net.brnf.call_ip6tables &&
!br_opt_get(br, BROPT_NF_CALL_IP6TABLES))
return NF_ACCEPT;
 
@@ -478,7 +461,8 @@ static unsigned int br_nf_pre_routing(void *priv,
return br_nf_pre_routing_ipv6(priv, skb, state);
}
 
-   if (!brnf_call_iptables && !br_opt_get(br, BROPT_NF_CALL_IPTABLES))
+   if (!init_net.brnf.call_iptables &&
+   !br_opt_get(br, 

[PATCH RESEND net-next 0/2] br_netfilter: enable in non-initial netns

2019-06-06 Thread Christian Brauner
Hey everyone,

This is another resend of the same patch series. I have received so many
requests, pings, and questions that I would really like to push for this
again.

Over time I have seen multiple reports by users who want to run applications
(Kubernetes e.g. via [1]) that require the br_netfilter module in
non-initial network namespaces. There are *a lot* of issues for this. A
shortlist including ChromeOS and other big users is found below under
[2]! Even non-devs already tried to get more traction on this by
commenting on the patchset (cf. [3]).

Currently, the /proc/sys/net/bridge folder is only created in the
initial network namespace. This patch series ensures that the
/proc/sys/net/bridge folder is available in each network namespace if
the module is loaded and disappears from all network namespaces when the
module is unloaded.
The patch series also makes the sysctls:

bridge-nf-call-arptables
bridge-nf-call-ip6tables
bridge-nf-call-iptables
bridge-nf-filter-pppoe-tagged
bridge-nf-filter-vlan-tagged
bridge-nf-pass-vlan-input-dev

apply per network namespace. This unblocks some use-cases where users
would like to e.g. not do bridge filtering for bridges in a specific
network namespace while doing so for bridges located in another network
namespace.
The netfilter rules are afaict already per network namespace so it
should be safe for users to specify whether a bridge device inside their
network namespace is supposed to go through iptables et al. or not.
Also, this can already be done by setting an option for each individual
bridge via Netlink. It should also be possible to do this for all
bridges in a network namespace via sysctls.

Thanks!
Christian

[1]: https://github.com/zimmertr/Bootstrap-Kubernetes-with-Ansible
[2]: https://bugs.chromium.org/p/chromium/issues/detail?id=878034
     https://github.com/lxc/lxd/issues/5193
     https://discuss.linuxcontainers.org/t/bridge-nf-call-iptables-and-swap-error-on-lxd-with-kubeadm/2204
     https://github.com/lxc/lxd/issues/3306
     https://gitlab.com/gitlab-org/gitlab-runner/issues/3705
     https://ubuntuforums.org/showthread.php?t=2415032
     https://medium.com/@thomaszimmerman93/hi-im-unable-to-get-kubeadm-init-to-run-due-to-br-netfilter-not-being-loaded-within-the-5642a4ccfece
[3]: https://lkml.org/lkml/2019/3/7/365

Christian Brauner (2):
  br_netfilter: add struct netns_brnf
  br_netfilter: namespace bridge netfilter sysctls

 include/net/net_namespace.h  |   3 +
 include/net/netfilter/br_netfilter.h |   3 +-
 include/net/netns/netfilter.h|  16 +++
 net/bridge/br_netfilter_hooks.c  | 166 ++-
 net/bridge/br_netfilter_ipv6.c   |   2 +-
 5 files changed, 134 insertions(+), 56 deletions(-)

-- 
2.21.0



[PATCH RESEND net-next 2/2] br_netfilter: namespace bridge netfilter sysctls

2019-06-06 Thread Christian Brauner
Currently, the /proc/sys/net/bridge folder is only created in the initial
network namespace. This patch ensures that the /proc/sys/net/bridge folder
is available in each network namespace if the module is loaded and
disappears from all network namespaces when the module is unloaded.

In doing so the patch makes the sysctls:

bridge-nf-call-arptables
bridge-nf-call-ip6tables
bridge-nf-call-iptables
bridge-nf-filter-pppoe-tagged
bridge-nf-filter-vlan-tagged
bridge-nf-pass-vlan-input-dev

apply per network namespace. This unblocks some use-cases where users would
like to e.g. not do bridge filtering for bridges in a specific network
namespace while doing so for bridges located in another network namespace.

The netfilter rules are afaict already per network namespace so it should
be safe for users to specify whether bridge devices inside a network
namespace are supposed to go through iptables et al. or not. Also, this can
already be done per-bridge by setting an option for each individual bridge
via Netlink. It should also be possible to do this for all bridges in a
network namespace via sysctls.

Signed-off-by: Christian Brauner 
Reviewed-by: Tyler Hicks 
---
 include/net/netfilter/br_netfilter.h |   3 +-
 net/bridge/br_netfilter_hooks.c  | 116 ---
 net/bridge/br_netfilter_ipv6.c   |   2 +-
 3 files changed, 91 insertions(+), 30 deletions(-)

diff --git a/include/net/netfilter/br_netfilter.h b/include/net/netfilter/br_netfilter.h
index 89808ce293c4..302fcd3aade2 100644
--- a/include/net/netfilter/br_netfilter.h
+++ b/include/net/netfilter/br_netfilter.h
@@ -42,7 +42,8 @@ static inline struct rtable *bridge_parent_rtable(const struct net_device *dev)
return port ? &port->br->fake_rtable : NULL;
 }
 
-struct net_device *setup_pre_routing(struct sk_buff *skb);
+struct net_device *setup_pre_routing(struct sk_buff *skb,
+const struct net *net);
 
 #if IS_ENABLED(CONFIG_IPV6)
 int br_validate_ipv6(struct net *net, struct sk_buff *skb);
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index b51c6b49fc6f..02960259e51b 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -68,17 +68,17 @@ static inline __be16 vlan_proto(const struct sk_buff *skb)
return 0;
 }
 
-#define IS_VLAN_IP(skb) \
+#define IS_VLAN_IP(skb, net) \
(vlan_proto(skb) == htons(ETH_P_IP) && \
-init_net.brnf.filter_vlan_tagged)
+net->brnf.filter_vlan_tagged)
 
-#define IS_VLAN_IPV6(skb) \
+#define IS_VLAN_IPV6(skb, net) \
(vlan_proto(skb) == htons(ETH_P_IPV6) && \
-init_net.brnf.filter_vlan_tagged)
+net->brnf.filter_vlan_tagged)
 
-#define IS_VLAN_ARP(skb) \
+#define IS_VLAN_ARP(skb, net) \
(vlan_proto(skb) == htons(ETH_P_ARP) && \
-init_net.brnf.filter_vlan_tagged)
+net->brnf.filter_vlan_tagged)
 
 static inline __be16 pppoe_proto(const struct sk_buff *skb)
 {
@@ -86,15 +86,15 @@ static inline __be16 pppoe_proto(const struct sk_buff *skb)
sizeof(struct pppoe_hdr)));
 }
 
-#define IS_PPPOE_IP(skb) \
+#define IS_PPPOE_IP(skb, net) \
(skb->protocol == htons(ETH_P_PPP_SES) && \
 pppoe_proto(skb) == htons(PPP_IP) && \
-init_net.brnf.filter_pppoe_tagged)
+net->brnf.filter_pppoe_tagged)
 
-#define IS_PPPOE_IPV6(skb) \
+#define IS_PPPOE_IPV6(skb, net) \
(skb->protocol == htons(ETH_P_PPP_SES) && \
 pppoe_proto(skb) == htons(PPP_IPV6) && \
-init_net.brnf.filter_pppoe_tagged)
+net->brnf.filter_pppoe_tagged)
 
 /* largest possible L2 header, see br_nf_dev_queue_xmit() */
 #define NF_BRIDGE_MAX_MAC_HEADER_LENGTH (PPPOE_SES_HLEN + ETH_HLEN)
@@ -391,12 +391,14 @@ static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_
return 0;
 }
 
-static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct net_device *dev)
+static struct net_device *brnf_get_logical_dev(struct sk_buff *skb,
+  const struct net_device *dev,
+  const struct net *net)
 {
struct net_device *vlan, *br;
 
br = bridge_parent(dev);
-   if (init_net.brnf.pass_vlan_indev == 0 || !skb_vlan_tag_present(skb))
+   if (net->brnf.pass_vlan_indev == 0 || !skb_vlan_tag_present(skb))
return br;
 
vlan = __vlan_find_dev_deep_rcu(br, skb->vlan_proto,
@@ -406,7 +408,7 @@ static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct
 }
 
 /* Some common code for IPv4/IPv6 */
-struct net_device *setup_pre_routing(struct sk_buff *skb)
+struct net_device *setup_pre_routing(struct sk_buff *skb, const struct net *net)
 {
struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
 
@@ -417,7 +419,7 @@ struct net_device *setup_pre_routing(struct sk_buff *skb)
 
nf_bridge->in_prerouting = 1;