[Devel] [PATCH vz8] userns: associate user_struct with the user_namespace

2020-10-27 Thread Andrey Ryabinin
user_struct contains per-user counters like processes, files,
sigpending etc., which we don't want to share across different
namespaces. Make the uid hashtable per-userns instead of global.
This is a partial revert of commit 7b44ab978b77a
 ("userns: Disassociate user_struct from the user_namespace.")

Signed-off-by: Andrey Ryabinin 
---
 include/linux/sched/user.h     |  1 +
 include/linux/user_namespace.h |  4 ++++
 kernel/user.c                  | 22 +++++++++++++---------
 kernel/user_namespace.c        | 13 +++++++++++++
 4 files changed, 31 insertions(+), 9 deletions(-)
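A side note on bucket selection: the hash itself is unchanged by this
patch; only the table it indexes moves into the namespace. As a
standalone illustration (plain userspace C mirroring the UIDHASH_*
macros below; not kernel code), this is how a uid picks one of the
128 buckets in the !CONFIG_BASE_SMALL case:

#include <stdio.h>

#define UIDHASH_BITS	7			/* !CONFIG_BASE_SMALL case */
#define UIDHASH_SZ	(1 << UIDHASH_BITS)	/* 128 buckets */
#define UIDHASH_MASK	(UIDHASH_SZ - 1)
#define __uidhashfn(uid)	(((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)

int main(void)
{
	unsigned int uids[] = { 0, 1000, 1001, 65534 };
	int i;

	/* print which bucket each uid hashes into */
	for (i = 0; i < 4; i++)
		printf("uid %u -> bucket %u of %d\n",
		       uids[i], __uidhashfn(uids[i]), UIDHASH_SZ);
	return 0;
}

Compile with gcc and run to see the bucket each uid lands in; after
this patch every user namespace has its own such table.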

diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h
index 9a9536fd4fe3..4bf5a723f138 100644
--- a/include/linux/sched/user.h
+++ b/include/linux/sched/user.h
@@ -60,6 +60,7 @@ extern struct user_struct *find_user(kuid_t);
 extern struct user_struct root_user;
 #define INIT_USER (&root_user)
 
+extern struct user_struct * alloc_uid_ns(struct user_namespace *ns, kuid_t);
 
 /* per-UID process charging. */
 extern struct user_struct * alloc_uid(kuid_t);
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index b004d5aeba1f..30493179b756 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -15,6 +15,9 @@
 #define UID_GID_MAP_MAX_BASE_EXTENTS 5
 #define UID_GID_MAP_MAX_EXTENTS 340
 
+#define UIDHASH_BITS   (CONFIG_BASE_SMALL ? 3 : 7)
+#define UIDHASH_SZ (1 << UIDHASH_BITS)
+
 struct uid_gid_extent {
u32 first;
u32 lower_first;
@@ -73,6 +76,7 @@ struct user_namespace {
struct uid_gid_map  gid_map;
struct uid_gid_map  projid_map;
atomic_tcount;
+   struct hlist_head   uidhash_table[UIDHASH_SZ];
struct user_namespace   *parent;
int level;
kuid_t  owner;
diff --git a/kernel/user.c b/kernel/user.c
index 0df9b1640b2a..f9f540484499 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -8,6 +8,7 @@
  * able to have per-user limits for system resources. 
  */
 
+#include 
 #include 
 #include 
 #include 
@@ -74,14 +75,11 @@ EXPORT_SYMBOL_GPL(init_user_ns);
  * when changing user ID's (ie setuid() and friends).
  */
 
-#define UIDHASH_BITS   (CONFIG_BASE_SMALL ? 3 : 7)
-#define UIDHASH_SZ (1 << UIDHASH_BITS)
 #define UIDHASH_MASK   (UIDHASH_SZ - 1)
 #define __uidhashfn(uid)   (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)
-#define uidhashentry(uid)	(uidhash_table + __uidhashfn((__kuid_val(uid))))
+#define uidhashentry(ns, uid)	((ns)->uidhash_table + __uidhashfn((__kuid_val(uid))))
 
 static struct kmem_cache *uid_cachep;
-struct hlist_head uidhash_table[UIDHASH_SZ];
 
 /*
  * The uidhash_lock is mostly taken from process context, but it is
@@ -155,9 +153,10 @@ struct user_struct *find_user(kuid_t uid)
 {
struct user_struct *ret;
unsigned long flags;
+   struct user_namespace *ns = current_user_ns();
 
spin_lock_irqsave(&uidhash_lock, flags);
-   ret = uid_hash_find(uid, uidhashentry(uid));
+   ret = uid_hash_find(uid, uidhashentry(ns, uid));
spin_unlock_irqrestore(&uidhash_lock, flags);
return ret;
 }
@@ -173,9 +172,9 @@ void free_uid(struct user_struct *up)
free_user(up, flags);
 }
 
-struct user_struct *alloc_uid(kuid_t uid)
+struct user_struct *alloc_uid_ns(struct user_namespace *ns, kuid_t uid)
 {
-   struct hlist_head *hashent = uidhashentry(uid);
+   struct hlist_head *hashent = uidhashentry(ns, uid);
struct user_struct *up, *new;
 
spin_lock_irq(&uidhash_lock);
@@ -215,6 +214,11 @@ struct user_struct *alloc_uid(kuid_t uid)
return NULL;
 }
 
+struct user_struct *alloc_uid(kuid_t uid)
+{
+   return alloc_uid_ns(current_user_ns(), uid);
+}
+
 static int __init uid_cache_init(void)
 {
int n;
@@ -223,11 +227,11 @@ static int __init uid_cache_init(void)
0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
 
for(n = 0; n < UIDHASH_SZ; ++n)
-   INIT_HLIST_HEAD(uidhash_table + n);
+   INIT_HLIST_HEAD(init_user_ns.uidhash_table + n);
 
/* Insert the root user immediately (init already runs as root) */
spin_lock_irq(&uidhash_lock);
-   uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID));
+   uid_hash_insert(&root_user, uidhashentry(&init_user_ns, GLOBAL_ROOT_UID));
spin_unlock_irq(&uidhash_lock);
 
return 0;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 243fb390d744..459b88044c62 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -74,6 +74,7 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
 int create_user_ns(struct cred *new)
 {
struct user_namespace *ns, *parent_ns = new->user_ns;
+   struct user_struct *new_user;
kuid_t owner = new->euid;
kgid_t group = new->egid;
struct ucounts *ucounts;
@@ -116,6 +117,17 @@ int create_user_ns(struct cred *new)
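
The archived message breaks off here. Judging by the new_user
declaration above and the pre-7b44ab978b77a code this partially
reverts, the missing hunk presumably initializes the namespace-local
hashtable and charges the creator against it. A hypothetical sketch
of that shape (names and error path are guesses, not the actual
patch body):

	int i;

	/* give the new namespace an empty uid hashtable */
	for (i = 0; i < UIDHASH_SZ; ++i)
		INIT_HLIST_HEAD(ns->uidhash_table + i);

	/* hypothetical: charge the creator against the new table */
	new_user = alloc_uid_ns(ns, new->uid);
	if (!new_user)
		return -ENOMEM;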

[Devel] [PATCH RH7] cgroup: do not use cgroup_mutex in cgroup_show_options

2020-10-27 Thread Valeriy Vdovin
In commit 87cb5fdb5b5c77ac617b46a0fe118a7d50a77b1c,
cgroup_show_options() started taking cgroup_mutex, which introduced
a new deadlock possibility, described below:

Thread A:
  m_start() --> down_read(&namespace_sem);
    cgroup_show_options() --> mutex_lock(&cgroup_mutex);

Thread B:
  attach_task_by_pid()
    cgroup_lock_live_group() --> mutex_lock(&cgroup_mutex);
    threadgroup_lock() --> down_write(&tsk->signal->group_rwsem);

Thread C:
  copy_process()
    threadgroup_change_begin() --> down_read(&tsk->signal->group_rwsem);
    copy_namespaces()
      create_new_namespaces()
        copy_mnt_ns()
          namespace_lock() --> down_write(&namespace_sem);
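
Together the traces close a cycle: A holds namespace_sem and wants
cgroup_mutex, B holds cgroup_mutex and wants group_rwsem, C holds
group_rwsem and wants namespace_sem. A minimal userspace model of the
same cycle (pthread locks standing in for the kernel primitives, the
sleeps only force this interleaving; illustrative, not kernel code):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_rwlock_t namespace_sem = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t  cgroup_mutex  = PTHREAD_MUTEX_INITIALIZER;
static pthread_rwlock_t group_rwsem   = PTHREAD_RWLOCK_INITIALIZER;

static void *thread_a(void *arg)	/* m_start -> cgroup_show_options */
{
	pthread_rwlock_rdlock(&namespace_sem);
	sleep(1);			/* let B and C take their first locks */
	puts("A: holds namespace_sem, wants cgroup_mutex");
	pthread_mutex_lock(&cgroup_mutex);	/* held by B */
	return NULL;
}

static void *thread_b(void *arg)	/* attach_task_by_pid */
{
	pthread_mutex_lock(&cgroup_mutex);
	sleep(1);
	puts("B: holds cgroup_mutex, wants group_rwsem (write)");
	pthread_rwlock_wrlock(&group_rwsem);	/* read-held by C */
	return NULL;
}

static void *thread_c(void *arg)	/* copy_process -> copy_mnt_ns */
{
	pthread_rwlock_rdlock(&group_rwsem);
	sleep(1);
	puts("C: holds group_rwsem, wants namespace_sem (write)");
	pthread_rwlock_wrlock(&namespace_sem);	/* read-held by A */
	return NULL;
}

int main(void)
{
	pthread_t a, b, c;

	pthread_create(&a, NULL, thread_a, NULL);
	pthread_create(&b, NULL, thread_b, NULL);
	pthread_create(&c, NULL, thread_c, NULL);
	pthread_join(a, NULL);	/* never returns: A -> B -> C -> A */
	return 0;
}

Build with 'gcc -pthread': all three threads print their "wants" line
and then hang, which is exactly the dependency cycle above.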

Clearly, cgroup_mutex cannot be taken right after namespace_sem,
because the opposite locking order is also present in the code. To
get rid of cgroup_mutex here, its synchronization role should
transition to cgroup_root_mutex, which was created specifically to
defeat this kind of locking order problem.

In this location cgroup_mutex provided two guarantees:
  1. A safe call to task_cgroup_from_root(), which asserts
     cgroup_mutex ownership.
  2. Safe read access to cgroup->ve_owner.

These guarantees are now transferred to cgroup_root_mutex in the
following way:
  1. The assertion in task_cgroup_from_root() is modified to accept
     cgroup_root_mutex as one of the two possible locks.
  2. cgroup->ve_owner is now only modified with cgroup_root_mutex
     also held.

https://jira.sw.ru/browse/PSBM-121438

Signed-Off-By: Valeriy Vdovin 
---
 kernel/cgroup.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 27d7a5e..db6a5ba 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -669,7 +669,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
struct css_set *css;
struct cgroup *res = NULL;
 
-   BUG_ON(!mutex_is_locked(&cgroup_mutex));
+   BUG_ON(!mutex_is_locked(&cgroup_mutex) && !mutex_is_locked(&cgroup_root_mutex));
read_lock(&css_set_lock);
/*
 * No need to lock the task - since we hold cgroup_mutex the
@@ -1100,7 +1100,6 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
struct cgroup_subsys *ss;
struct cgroup *root_cgrp = &root->top_cgroup;
 
-   mutex_lock(&cgroup_mutex);
mutex_lock(&cgroup_root_mutex);
for_each_subsys(root, ss)
seq_printf(seq, ",%s", ss->name);
@@ -1142,7 +1141,6 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
if (strlen(root->name))
seq_show_option(seq, "name", root->name);
mutex_unlock(&cgroup_root_mutex);
-   mutex_unlock(&cgroup_mutex);
return 0;
 }
 
@@ -2529,6 +2527,8 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
if (!cgroup_lock_live_group(cgrp))
return -ENODEV;
 
+   mutex_lock(&cgroup_root_mutex);
+
/*
 * Call to cgroup_get_local_root is a way to make sure
 * that cgrp in the argument is valid "virtual root"
@@ -2551,6 +2551,7 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
ret = -ENODEV;
 
 out:
+   mutex_unlock(&cgroup_root_mutex);
mutex_unlock(&cgroup_mutex);
return ret;
 }
-- 
1.8.3.1
