Curently mutex is used to protect pernet operations list. It makes cleanup_net() to execute ->exit methods of the same operations set, which was used on the time of ->init, even after net namespace is unlinked from net_namespace_list.
But the problem is it's need to synchronize_rcu() after net is removed from net_namespace_list(): Destroy net_ns: cleanup_net() mutex_lock(&net_mutex) list_del_rcu(&net->list) synchronize_rcu() <--- Sleep there for ages list_for_each_entry_reverse(ops, &pernet_list, list) ops_exit_list(ops, &net_exit_list) list_for_each_entry_reverse(ops, &pernet_list, list) ops_free_list(ops, &net_exit_list) mutex_unlock(&net_mutex) This primitive is not fast, especially on the systems with many processors and/or when preemptible RCU is enabled in config. So, all the time, while cleanup_net() is waiting for RCU grace period, creation of new net namespaces is not possible, the tasks, who makes it, are sleeping on the same mutex: Create net_ns: copy_net_ns() mutex_lock_killable(&net_mutex) <--- Sleep there for ages The solution is to convert net_mutex to the rw_semaphore. Then, pernet_operations::init/::exit methods, modifying the net-related data, will require down_read() locking only, while down_write() will be used for changing pernet_list. This gives signify performance increase, like you may see below. There is measured sequential net namespace creation in a cycle, in single thread, without other tasks (single user mode): 1)int main(int argc, char *argv[]) { unsigned nr; if (argc < 2) { fprintf(stderr, "Provide nr iterations arg\n"); return 1; } nr = atoi(argv[1]); while (nr-- > 0) { if (unshare(CLONE_NEWNET)) { perror("Can't unshare"); return 1; } } return 0; } Origin, 100000 unshare(): 0.03user 23.14system 1:39.85elapsed 23%CPU Patched, 100000 unshare(): 0.03user 67.49system 1:08.34elapsed 98%CPU 2)for i in {1..10000}; do unshare -n bash -c exit; done Origin: real 1m24,190s user 0m6,225s sys 0m15,132s Patched: real 0m18,235s (4.6 times faster) user 0m4,544s sys 0m13,796s This patch requires commit 76f8507f7a64 "locking/rwsem: Add down_read_killable()" from Linus tree (not in net-next yet). Signed-off-by: Kirill Tkhai <ktk...@virtuozzo.com> --- include/linux/rtnetlink.h | 2 +- include/net/net_namespace.h | 9 +++++++-- net/core/net_namespace.c | 40 ++++++++++++++++++++-------------------- net/core/rtnetlink.c | 4 ++-- 4 files changed, 30 insertions(+), 25 deletions(-) diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 54bcd970bfd3..36cc009f4580 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -32,7 +32,7 @@ extern int rtnl_trylock(void); extern int rtnl_is_locked(void); extern wait_queue_head_t netdev_unregistering_wq; -extern struct mutex net_mutex; +extern struct rw_semaphore net_sem; #ifdef CONFIG_PROVE_LOCKING extern bool lockdep_rtnl_is_held(void); diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 10f99dafd5ac..aaed826ccbe7 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -60,7 +60,7 @@ struct net { struct list_head list; /* list of network namespaces */ struct list_head cleanup_list; /* namespaces on death row */ - struct list_head exit_list; /* Use only net_mutex */ + struct list_head exit_list; /* Use only net_sem */ struct user_namespace *user_ns; /* Owning user namespace */ struct ucounts *ucounts; @@ -89,7 +89,12 @@ struct net { /* core fib_rules */ struct list_head rules_ops; - struct list_head fib_notifier_ops; /* protected by net_mutex */ + /* + * RCU-protected list, modifiable by pernet-init and -exit methods. + * When net namespace is alive (net::count > 0), all the changes + * are made under rw_sem held on write. + */ + struct list_head fib_notifier_ops; struct net_device *loopback_dev; /* The loopback */ struct netns_core core; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 6cfdc7c84c48..f502b11b507e 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -29,7 +29,7 @@ static LIST_HEAD(pernet_list); static struct list_head *first_device = &pernet_list; -DEFINE_MUTEX(net_mutex); +DECLARE_RWSEM(net_sem); LIST_HEAD(net_namespace_list); EXPORT_SYMBOL_GPL(net_namespace_list); @@ -65,11 +65,11 @@ static int net_assign_generic(struct net *net, unsigned int id, void *data) { struct net_generic *ng, *old_ng; - BUG_ON(!mutex_is_locked(&net_mutex)); + BUG_ON(!rwsem_is_locked(&net_sem)); BUG_ON(id < MIN_PERNET_OPS_ID); old_ng = rcu_dereference_protected(net->gen, - lockdep_is_held(&net_mutex)); + lockdep_is_held(&net_sem)); if (old_ng->s.len > id) { old_ng->ptr[id] = data; return 0; @@ -278,7 +278,7 @@ struct net *get_net_ns_by_id(struct net *net, int id) */ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) { - /* Must be called with net_mutex held */ + /* Must be called with net_sem held */ const struct pernet_operations *ops, *saved_ops; int error = 0; LIST_HEAD(net_exit_list); @@ -406,7 +406,7 @@ struct net *copy_net_ns(unsigned long flags, get_user_ns(user_ns); - rv = mutex_lock_killable(&net_mutex); + rv = down_read_killable(&net_sem); if (rv < 0) { net_free(net); dec_net_namespaces(ucounts); @@ -421,7 +421,7 @@ struct net *copy_net_ns(unsigned long flags, list_add_tail_rcu(&net->list, &net_namespace_list); rtnl_unlock(); } - mutex_unlock(&net_mutex); + up_read(&net_sem); if (rv < 0) { dec_net_namespaces(ucounts); put_user_ns(user_ns); @@ -446,7 +446,7 @@ static void cleanup_net(struct work_struct *work) list_replace_init(&cleanup_list, &net_kill_list); spin_unlock_irq(&cleanup_list_lock); - mutex_lock(&net_mutex); + down_read(&net_sem); /* Don't let anyone else find us. */ rtnl_lock(); @@ -486,7 +486,7 @@ static void cleanup_net(struct work_struct *work) list_for_each_entry_reverse(ops, &pernet_list, list) ops_free_list(ops, &net_exit_list); - mutex_unlock(&net_mutex); + up_read(&net_sem); /* Ensure there are no outstanding rcu callbacks using this * network namespace. @@ -513,8 +513,8 @@ static void cleanup_net(struct work_struct *work) */ void net_ns_barrier(void) { - mutex_lock(&net_mutex); - mutex_unlock(&net_mutex); + down_write(&net_sem); + up_write(&net_sem); } EXPORT_SYMBOL(net_ns_barrier); @@ -841,7 +841,7 @@ static int __init net_ns_init(void) rcu_assign_pointer(init_net.gen, ng); - mutex_lock(&net_mutex); + down_read(&net_sem); if (setup_net(&init_net, &init_user_ns)) panic("Could not setup the initial network namespace"); @@ -851,7 +851,7 @@ static int __init net_ns_init(void) list_add_tail_rcu(&init_net.list, &net_namespace_list); rtnl_unlock(); - mutex_unlock(&net_mutex); + up_read(&net_sem); register_pernet_subsys(&net_ns_ops); @@ -991,9 +991,9 @@ static void unregister_pernet_operations(struct pernet_operations *ops) int register_pernet_subsys(struct pernet_operations *ops) { int error; - mutex_lock(&net_mutex); + down_write(&net_sem); error = register_pernet_operations(first_device, ops); - mutex_unlock(&net_mutex); + up_write(&net_sem); return error; } EXPORT_SYMBOL_GPL(register_pernet_subsys); @@ -1009,9 +1009,9 @@ EXPORT_SYMBOL_GPL(register_pernet_subsys); */ void unregister_pernet_subsys(struct pernet_operations *ops) { - mutex_lock(&net_mutex); + down_write(&net_sem); unregister_pernet_operations(ops); - mutex_unlock(&net_mutex); + up_write(&net_sem); } EXPORT_SYMBOL_GPL(unregister_pernet_subsys); @@ -1037,11 +1037,11 @@ EXPORT_SYMBOL_GPL(unregister_pernet_subsys); int register_pernet_device(struct pernet_operations *ops) { int error; - mutex_lock(&net_mutex); + down_write(&net_sem); error = register_pernet_operations(&pernet_list, ops); if (!error && (first_device == &pernet_list)) first_device = &ops->list; - mutex_unlock(&net_mutex); + up_write(&net_sem); return error; } EXPORT_SYMBOL_GPL(register_pernet_device); @@ -1057,11 +1057,11 @@ EXPORT_SYMBOL_GPL(register_pernet_device); */ void unregister_pernet_device(struct pernet_operations *ops) { - mutex_lock(&net_mutex); + down_write(&net_sem); if (&ops->list == first_device) first_device = first_device->next; unregister_pernet_operations(ops); - mutex_unlock(&net_mutex); + up_write(&net_sem); } EXPORT_SYMBOL_GPL(unregister_pernet_device); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 5ace48926b19..caa215fd170b 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -390,11 +390,11 @@ static void rtnl_lock_unregistering_all(void) void rtnl_link_unregister(struct rtnl_link_ops *ops) { /* Close the race with cleanup_net() */ - mutex_lock(&net_mutex); + down_write(&net_sem); rtnl_lock_unregistering_all(); __rtnl_link_unregister(ops); rtnl_unlock(); - mutex_unlock(&net_mutex); + up_write(&net_sem); } EXPORT_SYMBOL_GPL(rtnl_link_unregister);