Re: [PATCH RESEND net-next 1/2] br_netfilter: add struct netns_brnf
On Thu, Jun 06, 2019 at 05:19:39PM +0200, Christian Brauner wrote: > On Thu, Jun 06, 2019 at 08:14:40AM -0700, Stephen Hemminger wrote: > > On Thu, 6 Jun 2019 13:41:41 +0200 > > Christian Brauner wrote: > > > > > +struct netns_brnf { > > > +#ifdef CONFIG_SYSCTL > > > + struct ctl_table_header *ctl_hdr; > > > +#endif > > > + > > > + /* default value is 1 */ > > > + int call_iptables; > > > + int call_ip6tables; > > > + int call_arptables; > > > + > > > + /* default value is 0 */ > > > + int filter_vlan_tagged; > > > + int filter_pppoe_tagged; > > > + int pass_vlan_indev; > > > +}; > > > > Do you really need to waste four bytes for each > > flag value. If you use a u8 that would work just as well. > > I think we had discussed something like this but the problem why we > can't do this stems from how the sysctl-table stuff is implemented. > I distinctly remember that it couldn't be done with a flag due to that. Could you define a pernet_operations object? I mean, define the id and size fields, then pass it to register_pernet_subsys() for registration. Similar to what we do in net/ipv4/netfilter/ipt_CLUSTER.c, see clusterip_net_ops and clusterip_pernet() for instance.
Re: [PATCH RESEND net-next 1/2] br_netfilter: add struct netns_brnf
On Thu, Jun 06, 2019 at 08:14:40AM -0700, Stephen Hemminger wrote: > On Thu, 6 Jun 2019 13:41:41 +0200 > Christian Brauner wrote: > > > +struct netns_brnf { > > +#ifdef CONFIG_SYSCTL > > + struct ctl_table_header *ctl_hdr; > > +#endif > > + > > + /* default value is 1 */ > > + int call_iptables; > > + int call_ip6tables; > > + int call_arptables; > > + > > + /* default value is 0 */ > > + int filter_vlan_tagged; > > + int filter_pppoe_tagged; > > + int pass_vlan_indev; > > +}; > > Do you really need to waste four bytes for each > flag value. If you use a u8 that would work just as well. I think we had discussed something like this but the problem why we can't do this stems from how the sysctl-table stuff is implemented. I distinctly remember that it couldn't be done with a flag due to that. Christian
Re: [PATCH RESEND net-next 1/2] br_netfilter: add struct netns_brnf
On Thu, 6 Jun 2019 13:41:41 +0200 Christian Brauner wrote: > +struct netns_brnf { > +#ifdef CONFIG_SYSCTL > + struct ctl_table_header *ctl_hdr; > +#endif > + > + /* default value is 1 */ > + int call_iptables; > + int call_ip6tables; > + int call_arptables; > + > + /* default value is 0 */ > + int filter_vlan_tagged; > + int filter_pppoe_tagged; > + int pass_vlan_indev; > +}; Do you really need to waste four bytes for each flag value. If you use a u8 that would work just as well. Bool would also work but the kernel developers frown on bool in structures.
[Bridge] [PATCH RESEND net-next 1/2] br_netfilter: add struct netns_brnf
This adds struct netns_brnf in preparation for per-network-namespace br_netfilter settings. The individual br_netfilter sysctl options are moved into a central place in struct net. The struct is only included when the CONFIG_BRIDGE_NETFILTER kconfig option is enabled in the kernel. Signed-off-by: Christian Brauner Reviewed-by: Tyler Hicks --- include/net/net_namespace.h | 3 ++ include/net/netns/netfilter.h | 16 net/bridge/br_netfilter_hooks.c | 68 - 3 files changed, 52 insertions(+), 35 deletions(-) diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 12689ddfc24c..a958d09dc14d 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -127,6 +127,9 @@ struct net { #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) struct netns_ct ct; #endif +#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + struct netns_brnf brnf; +#endif #if defined(CONFIG_NF_TABLES) || defined(CONFIG_NF_TABLES_MODULE) struct netns_nftables nft; #endif diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h index ca043342c0eb..eedbd1ac940e 100644 --- a/include/net/netns/netfilter.h +++ b/include/net/netns/netfilter.h @@ -35,4 +35,20 @@ struct netns_nf { booldefrag_ipv6; #endif }; + +struct netns_brnf { +#ifdef CONFIG_SYSCTL + struct ctl_table_header *ctl_hdr; +#endif + + /* default value is 1 */ + int call_iptables; + int call_ip6tables; + int call_arptables; + + /* default value is 0 */ + int filter_vlan_tagged; + int filter_pppoe_tagged; + int pass_vlan_indev; +}; #endif diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 34fa72c72ad8..b51c6b49fc6f 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -49,23 +49,6 @@ struct brnf_net { bool enabled; }; -#ifdef CONFIG_SYSCTL -static struct ctl_table_header *brnf_sysctl_header; -static int brnf_call_iptables __read_mostly = 1; -static int brnf_call_ip6tables __read_mostly = 1; -static int brnf_call_arptables __read_mostly = 1; -static int brnf_filter_vlan_tagged __read_mostly; -static int brnf_filter_pppoe_tagged __read_mostly; -static int brnf_pass_vlan_indev __read_mostly; -#else -#define brnf_call_iptables 1 -#define brnf_call_ip6tables 1 -#define brnf_call_arptables 1 -#define brnf_filter_vlan_tagged 0 -#define brnf_filter_pppoe_tagged 0 -#define brnf_pass_vlan_indev 0 -#endif - #define IS_IP(skb) \ (!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IP)) @@ -87,15 +70,15 @@ static inline __be16 vlan_proto(const struct sk_buff *skb) #define IS_VLAN_IP(skb) \ (vlan_proto(skb) == htons(ETH_P_IP) && \ -brnf_filter_vlan_tagged) +init_net.brnf.filter_vlan_tagged) #define IS_VLAN_IPV6(skb) \ (vlan_proto(skb) == htons(ETH_P_IPV6) && \ -brnf_filter_vlan_tagged) +init_net.brnf.filter_vlan_tagged) #define IS_VLAN_ARP(skb) \ (vlan_proto(skb) == htons(ETH_P_ARP) && \ -brnf_filter_vlan_tagged) +init_net.brnf.filter_vlan_tagged) static inline __be16 pppoe_proto(const struct sk_buff *skb) { @@ -106,12 +89,12 @@ static inline __be16 pppoe_proto(const struct sk_buff *skb) #define IS_PPPOE_IP(skb) \ (skb->protocol == htons(ETH_P_PPP_SES) && \ pppoe_proto(skb) == htons(PPP_IP) && \ -brnf_filter_pppoe_tagged) +init_net.brnf.filter_pppoe_tagged) #define IS_PPPOE_IPV6(skb) \ (skb->protocol == htons(ETH_P_PPP_SES) && \ pppoe_proto(skb) == htons(PPP_IPV6) && \ -brnf_filter_pppoe_tagged) +init_net.brnf.filter_pppoe_tagged) /* largest possible L2 header, see br_nf_dev_queue_xmit() */ #define NF_BRIDGE_MAX_MAC_HEADER_LENGTH (PPPOE_SES_HLEN + ETH_HLEN) @@ -413,7 +396,7 @@ static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct struct net_device *vlan, *br; br = bridge_parent(dev); - if (brnf_pass_vlan_indev == 0 || !skb_vlan_tag_present(skb)) + if (init_net.brnf.pass_vlan_indev == 0 || !skb_vlan_tag_present(skb)) return br; vlan = __vlan_find_dev_deep_rcu(br, skb->vlan_proto, @@ -470,7 +453,7 @@ static unsigned int br_nf_pre_routing(void *priv, br = p->br; if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) { - if (!brnf_call_ip6tables && + if (!init_net.brnf.call_ip6tables && !br_opt_get(br, BROPT_NF_CALL_IP6TABLES)) return NF_ACCEPT; @@ -478,7 +461,8 @@ static unsigned int br_nf_pre_routing(void *priv, return br_nf_pre_routing_ipv6(priv, skb, state); } - if (!brnf_call_iptables && !br_opt_get(br, BROPT_NF_CALL_IPTABLES)) + if (!init_net.brnf.call_iptables && + !br_opt_get(br,
[PATCH RESEND net-next 0/2] br_netfilter: enable in non-initial netns
Hey everyone, This is another resend of the same patch series. I have received so many requests, pings, and questions that I would really like to push for this again. Over time I have seen multiple reports by users who want to run applications (Kubernetes e.g. via [1]) that require the br_netfilter module in non-initial network namespaces. There are *a lot* of issues for this. A shortlist including ChromeOS and other big users is found below under [2]! Even non-devs already tried to get more traction on this by commenting on the patchset (cf. [3]). Currently, the /proc/sys/net/bridge folder is only created in the initial network namespace. This patch series ensures that the /proc/sys/net/bridge folder is available in each network namespace if the module is loaded and disappears from all network namespaces when the module is unloaded. The patch series also makes the sysctls: bridge-nf-call-arptables bridge-nf-call-ip6tables bridge-nf-call-iptables bridge-nf-filter-pppoe-tagged bridge-nf-filter-vlan-tagged bridge-nf-pass-vlan-input-dev apply per network namespace. This unblocks some use-cases where users would like to e.g. not do bridge filtering for bridges in a specific network namespace while doing so for bridges located in another network namespace. The netfilter rules are afaict already per network namespace so it should be safe for users to specify whether a bridge device inside their network namespace is supposed to go through iptables et al. or not. Also, this can already be done by setting an option for each individual bridge via Netlink. It should also be possible to do this for all bridges in a network namespace via sysctls. Thanks! Christian [1]: https://github.com/zimmertr/Bootstrap-Kubernetes-with-Ansible [2]: https://bugs.chromium.org/p/chromium/issues/detail?id=878034 https://github.com/lxc/lxd/issues/5193 https://discuss.linuxcontainers.org/t/bridge-nf-call-iptables-and-swap-error-on-lxd-with-kubeadm/2204 https://github.com/lxc/lxd/issues/3306 https://gitlab.com/gitlab-org/gitlab-runner/issues/3705 https://ubuntuforums.org/showthread.php?t=2415032 https://medium.com/@thomaszimmerman93/hi-im-unable-to-get-kubeadm-init-to-run-due-to-br-netfilter-not-being-loaded-within-the-5642a4ccfece [3]: https://lkml.org/lkml/2019/3/7/365 Christian Brauner (2): br_netfilter: add struct netns_brnf br_netfilter: namespace bridge netfilter sysctls include/net/net_namespace.h | 3 + include/net/netfilter/br_netfilter.h | 3 +- include/net/netns/netfilter.h| 16 +++ net/bridge/br_netfilter_hooks.c | 166 ++- net/bridge/br_netfilter_ipv6.c | 2 +- 5 files changed, 134 insertions(+), 56 deletions(-) -- 2.21.0
[PATCH RESEND net-next 2/2] br_netfilter: namespace bridge netfilter sysctls
Currently, the /proc/sys/net/bridge folder is only created in the initial network namespace. This patch ensures that the /proc/sys/net/bridge folder is available in each network namespace if the module is loaded and disappears from all network namespaces when the module is unloaded. In doing so the patch makes the sysctls: bridge-nf-call-arptables bridge-nf-call-ip6tables bridge-nf-call-iptables bridge-nf-filter-pppoe-tagged bridge-nf-filter-vlan-tagged bridge-nf-pass-vlan-input-dev apply per network namespace. This unblocks some use-cases where users would like to e.g. not do bridge filtering for bridges in a specific network namespace while doing so for bridges located in another network namespace. The netfilter rules are afaict already per network namespace so it should be safe for users to specify whether bridge devices inside a network namespace are supposed to go through iptables et al. or not. Also, this can already be done per-bridge by setting an option for each individual bridge via Netlink. It should also be possible to do this for all bridges in a network namespace via sysctls. Signed-off-by: Christian Brauner Reviewed-by: Tyler Hicks --- include/net/netfilter/br_netfilter.h | 3 +- net/bridge/br_netfilter_hooks.c | 116 --- net/bridge/br_netfilter_ipv6.c | 2 +- 3 files changed, 91 insertions(+), 30 deletions(-) diff --git a/include/net/netfilter/br_netfilter.h b/include/net/netfilter/br_netfilter.h index 89808ce293c4..302fcd3aade2 100644 --- a/include/net/netfilter/br_netfilter.h +++ b/include/net/netfilter/br_netfilter.h @@ -42,7 +42,8 @@ static inline struct rtable *bridge_parent_rtable(const struct net_device *dev) return port ? >br->fake_rtable : NULL; } -struct net_device *setup_pre_routing(struct sk_buff *skb); +struct net_device *setup_pre_routing(struct sk_buff *skb, +const struct net *net); #if IS_ENABLED(CONFIG_IPV6) int br_validate_ipv6(struct net *net, struct sk_buff *skb); diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index b51c6b49fc6f..02960259e51b 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -68,17 +68,17 @@ static inline __be16 vlan_proto(const struct sk_buff *skb) return 0; } -#define IS_VLAN_IP(skb) \ +#define IS_VLAN_IP(skb, net) \ (vlan_proto(skb) == htons(ETH_P_IP) && \ -init_net.brnf.filter_vlan_tagged) +net->brnf.filter_vlan_tagged) -#define IS_VLAN_IPV6(skb) \ +#define IS_VLAN_IPV6(skb, net) \ (vlan_proto(skb) == htons(ETH_P_IPV6) && \ -init_net.brnf.filter_vlan_tagged) +net->brnf.filter_vlan_tagged) -#define IS_VLAN_ARP(skb) \ +#define IS_VLAN_ARP(skb, net) \ (vlan_proto(skb) == htons(ETH_P_ARP) && \ -init_net.brnf.filter_vlan_tagged) +net->brnf.filter_vlan_tagged) static inline __be16 pppoe_proto(const struct sk_buff *skb) { @@ -86,15 +86,15 @@ static inline __be16 pppoe_proto(const struct sk_buff *skb) sizeof(struct pppoe_hdr))); } -#define IS_PPPOE_IP(skb) \ +#define IS_PPPOE_IP(skb, net) \ (skb->protocol == htons(ETH_P_PPP_SES) && \ pppoe_proto(skb) == htons(PPP_IP) && \ -init_net.brnf.filter_pppoe_tagged) +net->brnf.filter_pppoe_tagged) -#define IS_PPPOE_IPV6(skb) \ +#define IS_PPPOE_IPV6(skb, net) \ (skb->protocol == htons(ETH_P_PPP_SES) && \ pppoe_proto(skb) == htons(PPP_IPV6) && \ -init_net.brnf.filter_pppoe_tagged) +net->brnf.filter_pppoe_tagged) /* largest possible L2 header, see br_nf_dev_queue_xmit() */ #define NF_BRIDGE_MAX_MAC_HEADER_LENGTH (PPPOE_SES_HLEN + ETH_HLEN) @@ -391,12 +391,14 @@ static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_ return 0; } -static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct net_device *dev) +static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, + const struct net_device *dev, + const struct net *net) { struct net_device *vlan, *br; br = bridge_parent(dev); - if (init_net.brnf.pass_vlan_indev == 0 || !skb_vlan_tag_present(skb)) + if (net->brnf.pass_vlan_indev == 0 || !skb_vlan_tag_present(skb)) return br; vlan = __vlan_find_dev_deep_rcu(br, skb->vlan_proto, @@ -406,7 +408,7 @@ static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct } /* Some common code for IPv4/IPv6 */ -struct net_device *setup_pre_routing(struct sk_buff *skb) +struct net_device *setup_pre_routing(struct sk_buff *skb, const struct net *net) { struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); @@ -417,7 +419,7 @@ struct net_device *setup_pre_routing(struct sk_buff *skb) nf_bridge->in_prerouting = 1;