From: Taehee Yoo <ap420...@gmail.com>

[ Upstream commit 5a86d68bcf02f2d1e9a5897dd482079fd5f75e7f ]

When a network namespace is destroyed, cleanup_net() is called.
cleanup_net() holds pernet_ops_rwsem and then calls each ->exit callback,
so clusterip_tg_destroy() ends up being called from cleanup_net().
clusterip_tg_destroy() in turn calls unregister_netdevice_notifier().

But both cleanup_net() and clusterip_tg_destroy() take the same
lock (pernet_ops_rwsem), hence a deadlock occurs.

After this patch, only one notifier is registered, when the module is
inserted, and all configs are added to a per-net list.

test commands:
   %ip netns add vm1
   %ip netns exec vm1 bash
   %ip link set lo up
   %iptables -A INPUT -p tcp -i lo -d 192.168.0.5 --dport 80 \
        -j CLUSTERIP --new --hashmode sourceip \
        --clustermac 01:00:5e:00:00:20 --total-nodes 2 --local-node 1
   %exit
   %ip netns del vm1

splat looks like:
[  341.809674] ============================================
[  341.809674] WARNING: possible recursive locking detected
[  341.809674] 4.19.0-rc5+ #16 Tainted: G        W
[  341.809674] --------------------------------------------
[  341.809674] kworker/u4:2/87 is trying to acquire lock:
[  341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: unregister_netdevice_notifier+0x8c/0x460
[  341.809674]
[  341.809674] but task is already holding lock:
[  341.809674] 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900
[  341.809674]
[  341.809674] other info that might help us debug this:
[  341.809674]  Possible unsafe locking scenario:
[  341.809674]
[  341.809674]        CPU0
[  341.809674]        ----
[  341.809674]   lock(pernet_ops_rwsem);
[  341.809674]   lock(pernet_ops_rwsem);
[  341.809674]
[  341.809674]  *** DEADLOCK ***
[  341.809674]
[  341.809674]  May be due to missing lock nesting notation
[  341.809674]
[  341.809674] 3 locks held by kworker/u4:2/87:
[  341.809674]  #0: 00000000d9df6c92 ((wq_completion)"%s""netns"){+.+.}, at: process_one_work+0xafe/0x1de0
[  341.809674]  #1: 00000000c2cbcee2 (net_cleanup_work){+.+.}, at: process_one_work+0xb60/0x1de0
[  341.809674]  #2: 000000005da2d519 (pernet_ops_rwsem){++++}, at: cleanup_net+0x119/0x900
[  341.809674]
[  341.809674] stack backtrace:
[  341.809674] CPU: 1 PID: 87 Comm: kworker/u4:2 Tainted: G        W         4.19.0-rc5+ #16
[  341.809674] Workqueue: netns cleanup_net
[  341.809674] Call Trace:
[ ... ]
[  342.070196]  down_write+0x93/0x160
[  342.070196]  ? unregister_netdevice_notifier+0x8c/0x460
[  342.070196]  ? down_read+0x1e0/0x1e0
[  342.070196]  ? sched_clock_cpu+0x126/0x170
[  342.070196]  ? find_held_lock+0x39/0x1c0
[  342.070196]  unregister_netdevice_notifier+0x8c/0x460
[  342.070196]  ? register_netdevice_notifier+0x790/0x790
[  342.070196]  ? __local_bh_enable_ip+0xe9/0x1b0
[  342.070196]  ? __local_bh_enable_ip+0xe9/0x1b0
[  342.070196]  ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP]
[  342.070196]  ? trace_hardirqs_on+0x93/0x210
[  342.070196]  ? __bpf_trace_preemptirq_template+0x10/0x10
[  342.070196]  ? clusterip_tg_destroy+0x372/0x650 [ipt_CLUSTERIP]
[  342.123094]  clusterip_tg_destroy+0x3ad/0x650 [ipt_CLUSTERIP]
[  342.123094]  ? clusterip_net_init+0x3d0/0x3d0 [ipt_CLUSTERIP]
[  342.123094]  ? cleanup_match+0x17d/0x200 [ip_tables]
[  342.123094]  ? xt_unregister_table+0x215/0x300 [x_tables]
[  342.123094]  ? kfree+0xe2/0x2a0
[  342.123094]  cleanup_entry+0x1d5/0x2f0 [ip_tables]
[  342.123094]  ? cleanup_match+0x200/0x200 [ip_tables]
[  342.123094]  __ipt_unregister_table+0x9b/0x1a0 [ip_tables]
[  342.123094]  iptable_filter_net_exit+0x43/0x80 [iptable_filter]
[  342.123094]  ops_exit_list.isra.10+0x94/0x140
[  342.123094]  cleanup_net+0x45b/0x900
[ ... ]

Fixes: 202f59afd441 ("netfilter: ipt_CLUSTERIP: do not hold dev")
Signed-off-by: Taehee Yoo <ap420...@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pa...@netfilter.org>
Signed-off-by: Sasha Levin <sas...@kernel.org>
---
 net/ipv4/netfilter/ipt_CLUSTERIP.c | 155 ++++++++++++++++-------------
 1 file changed, 87 insertions(+), 68 deletions(-)

diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 33491cb6b9d1..fb1e7f237f53 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -57,17 +57,14 @@ struct clusterip_config {
        enum clusterip_hashmode hash_mode;      /* which hashing mode */
        u_int32_t hash_initval;                 /* hash initialization */
        struct rcu_head rcu;
-
+       struct net *net;                        /* netns for pernet list */
        char ifname[IFNAMSIZ];                  /* device ifname */
-       struct notifier_block notifier;         /* refresh c->ifindex in it */
 };
 
 #ifdef CONFIG_PROC_FS
 static const struct file_operations clusterip_proc_fops;
 #endif
 
-static unsigned int clusterip_net_id __read_mostly;
-
 struct clusterip_net {
        struct list_head configs;
        /* lock protects the configs list */
@@ -78,16 +75,30 @@ struct clusterip_net {
 #endif
 };
 
+static unsigned int clusterip_net_id __read_mostly;
+static inline struct clusterip_net *clusterip_pernet(struct net *net)
+{
+       return net_generic(net, clusterip_net_id);
+}
+
 static inline void
 clusterip_config_get(struct clusterip_config *c)
 {
        refcount_inc(&c->refcount);
 }
 
-
 static void clusterip_config_rcu_free(struct rcu_head *head)
 {
-       kfree(container_of(head, struct clusterip_config, rcu));
+       struct clusterip_config *config;
+       struct net_device *dev;
+
+       config = container_of(head, struct clusterip_config, rcu);
+       dev = dev_get_by_name(config->net, config->ifname);
+       if (dev) {
+               dev_mc_del(dev, config->clustermac);
+               dev_put(dev);
+       }
+       kfree(config);
 }
 
 static inline void
@@ -101,9 +112,9 @@ clusterip_config_put(struct clusterip_config *c)
  * entry(rule) is removed, remove the config from lists, but don't free it
  * yet, since proc-files could still be holding references */
 static inline void
-clusterip_config_entry_put(struct net *net, struct clusterip_config *c)
+clusterip_config_entry_put(struct clusterip_config *c)
 {
-       struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+       struct clusterip_net *cn = clusterip_pernet(c->net);
 
        local_bh_disable();
        if (refcount_dec_and_lock(&c->entries, &cn->lock)) {
@@ -118,8 +129,6 @@ clusterip_config_entry_put(struct net *net, struct 
clusterip_config *c)
                spin_unlock(&cn->lock);
                local_bh_enable();
 
-               unregister_netdevice_notifier(&c->notifier);
-
                return;
        }
        local_bh_enable();
@@ -129,7 +138,7 @@ static struct clusterip_config *
 __clusterip_config_find(struct net *net, __be32 clusterip)
 {
        struct clusterip_config *c;
-       struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+       struct clusterip_net *cn = clusterip_pernet(net);
 
        list_for_each_entry_rcu(c, &cn->configs, list) {
                if (c->clusterip == clusterip)
@@ -181,32 +190,37 @@ clusterip_netdev_event(struct notifier_block *this, unsigned long event,
                       void *ptr)
 {
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+       struct net *net = dev_net(dev);
+       struct clusterip_net *cn = clusterip_pernet(net);
        struct clusterip_config *c;
 
-       c = container_of(this, struct clusterip_config, notifier);
-       switch (event) {
-       case NETDEV_REGISTER:
-               if (!strcmp(dev->name, c->ifname)) {
-                       c->ifindex = dev->ifindex;
-                       dev_mc_add(dev, c->clustermac);
-               }
-               break;
-       case NETDEV_UNREGISTER:
-               if (dev->ifindex == c->ifindex) {
-                       dev_mc_del(dev, c->clustermac);
-                       c->ifindex = -1;
-               }
-               break;
-       case NETDEV_CHANGENAME:
-               if (!strcmp(dev->name, c->ifname)) {
-                       c->ifindex = dev->ifindex;
-                       dev_mc_add(dev, c->clustermac);
-               } else if (dev->ifindex == c->ifindex) {
-                       dev_mc_del(dev, c->clustermac);
-                       c->ifindex = -1;
+       spin_lock_bh(&cn->lock);
+       list_for_each_entry_rcu(c, &cn->configs, list) {
+               switch (event) {
+               case NETDEV_REGISTER:
+                       if (!strcmp(dev->name, c->ifname)) {
+                               c->ifindex = dev->ifindex;
+                               dev_mc_add(dev, c->clustermac);
+                       }
+                       break;
+               case NETDEV_UNREGISTER:
+                       if (dev->ifindex == c->ifindex) {
+                               dev_mc_del(dev, c->clustermac);
+                               c->ifindex = -1;
+                       }
+                       break;
+               case NETDEV_CHANGENAME:
+                       if (!strcmp(dev->name, c->ifname)) {
+                               c->ifindex = dev->ifindex;
+                               dev_mc_add(dev, c->clustermac);
+                       } else if (dev->ifindex == c->ifindex) {
+                               dev_mc_del(dev, c->clustermac);
+                               c->ifindex = -1;
+                       }
+                       break;
                }
-               break;
        }
+       spin_unlock_bh(&cn->lock);
 
        return NOTIFY_DONE;
 }
@@ -215,30 +229,44 @@ static struct clusterip_config *
 clusterip_config_init(struct net *net, const struct ipt_clusterip_tgt_info *i,
                      __be32 ip, const char *iniface)
 {
-       struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+       struct clusterip_net *cn = clusterip_pernet(net);
        struct clusterip_config *c;
+       struct net_device *dev;
        int err;
 
+       if (iniface[0] == '\0') {
+               pr_info("Please specify an interface name\n");
+               return ERR_PTR(-EINVAL);
+       }
+
        c = kzalloc(sizeof(*c), GFP_ATOMIC);
        if (!c)
                return ERR_PTR(-ENOMEM);
 
-       strcpy(c->ifname, iniface);
-       c->ifindex = -1;
-       c->clusterip = ip;
+       dev = dev_get_by_name(net, iniface);
+       if (!dev) {
+               pr_info("no such interface %s\n", iniface);
+               kfree(c);
+               return ERR_PTR(-ENOENT);
+       }
+       c->ifindex = dev->ifindex;
+       strcpy(c->ifname, dev->name);
        memcpy(&c->clustermac, &i->clustermac, ETH_ALEN);
+       dev_mc_add(dev, c->clustermac);
+       dev_put(dev);
+
+       c->clusterip = ip;
        c->num_total_nodes = i->num_total_nodes;
        clusterip_config_init_nodelist(c, i);
        c->hash_mode = i->hash_mode;
        c->hash_initval = i->hash_initval;
+       c->net = net;
        refcount_set(&c->refcount, 1);
 
        spin_lock_bh(&cn->lock);
        if (__clusterip_config_find(net, ip)) {
-               spin_unlock_bh(&cn->lock);
-               kfree(c);
-
-               return ERR_PTR(-EBUSY);
+               err = -EBUSY;
+               goto out_config_put;
        }
 
        list_add_rcu(&c->list, &cn->configs);
@@ -260,22 +288,17 @@ clusterip_config_init(struct net *net, const struct ipt_clusterip_tgt_info *i,
        }
 #endif
 
-       c->notifier.notifier_call = clusterip_netdev_event;
-       err = register_netdevice_notifier(&c->notifier);
-       if (!err) {
-               refcount_set(&c->entries, 1);
-               return c;
-       }
+       refcount_set(&c->entries, 1);
+       return c;
 
 #ifdef CONFIG_PROC_FS
-       proc_remove(c->pde);
 err:
 #endif
        spin_lock_bh(&cn->lock);
        list_del_rcu(&c->list);
+out_config_put:
        spin_unlock_bh(&cn->lock);
        clusterip_config_put(c);
-
        return ERR_PTR(err);
 }
 
@@ -475,21 +498,6 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
                                &e->ip.dst.s_addr);
                        return -EINVAL;
                } else {
-                       struct net_device *dev;
-
-                       if (e->ip.iniface[0] == '\0') {
-                               pr_info("Please specify an interface name\n");
-                               return -EINVAL;
-                       }
-
-                       dev = dev_get_by_name(par->net, e->ip.iniface);
-                       if (!dev) {
-                               pr_info("no such interface %s\n",
-                                       e->ip.iniface);
-                               return -ENOENT;
-                       }
-                       dev_put(dev);
-
                        config = clusterip_config_init(par->net, cipinfo,
                                                       e->ip.dst.s_addr,
                                                       e->ip.iniface);
@@ -503,7 +511,7 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
        if (ret < 0) {
                pr_info("cannot load conntrack support for proto=%u\n",
                        par->family);
-               clusterip_config_entry_put(par->net, config);
+               clusterip_config_entry_put(config);
                clusterip_config_put(config);
                return ret;
        }
@@ -525,7 +533,7 @@ static void clusterip_tg_destroy(const struct xt_tgdtor_param *par)
 
        /* if no more entries are referencing the config, remove it
         * from the list and destroy the proc entry */
-       clusterip_config_entry_put(par->net, cipinfo->config);
+       clusterip_config_entry_put(cipinfo->config);
 
        clusterip_config_put(cipinfo->config);
 
@@ -807,7 +815,7 @@ static const struct file_operations clusterip_proc_fops = {
 
 static int clusterip_net_init(struct net *net)
 {
-       struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+       struct clusterip_net *cn = clusterip_pernet(net);
        int ret;
 
        INIT_LIST_HEAD(&cn->configs);
@@ -832,7 +840,7 @@ static int clusterip_net_init(struct net *net)
 
 static void clusterip_net_exit(struct net *net)
 {
-       struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+       struct clusterip_net *cn = clusterip_pernet(net);
 #ifdef CONFIG_PROC_FS
        proc_remove(cn->procdir);
        cn->procdir = NULL;
@@ -847,6 +855,10 @@ static struct pernet_operations clusterip_net_ops = {
        .size = sizeof(struct clusterip_net),
 };
 
+struct notifier_block cip_netdev_notifier = {
+       .notifier_call = clusterip_netdev_event
+};
+
 static int __init clusterip_tg_init(void)
 {
        int ret;
@@ -859,11 +871,17 @@ static int __init clusterip_tg_init(void)
        if (ret < 0)
                goto cleanup_subsys;
 
+       ret = register_netdevice_notifier(&cip_netdev_notifier);
+       if (ret < 0)
+               goto unregister_target;
+
        pr_info("ClusterIP Version %s loaded successfully\n",
                CLUSTERIP_VERSION);
 
        return 0;
 
+unregister_target:
+       xt_unregister_target(&clusterip_tg_reg);
 cleanup_subsys:
        unregister_pernet_subsys(&clusterip_net_ops);
        return ret;
@@ -873,6 +891,7 @@ static void __exit clusterip_tg_exit(void)
 {
        pr_info("ClusterIP Version %s unloading\n", CLUSTERIP_VERSION);
 
+       unregister_netdevice_notifier(&cip_netdev_notifier);
        xt_unregister_target(&clusterip_tg_reg);
        unregister_pernet_subsys(&clusterip_net_ops);
 
-- 
2.19.1

Reply via email to