Allow unprivileged processes to control subtrees of their associated
processes, a necessary feature if a rootless container wishes to take
advantage of cgroups for its own processes.

As cgroups are hierarchical, having the ability to set limits in a
subtree does not preclude the ability to modify the limits imposed by
parent cgroups. In addition, in the default hierarchy a process must
have write access to the common ancestor of the two (src and dest)
cgroups' cgroup.procs file. This makes this change safe against cgroup
escape.

There isn't a way to disable this at the moment.

Signed-off-by: Aleksa Sarai <asa...@suse.de>
---
 kernel/cgroup.c | 143 +++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 120 insertions(+), 23 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f1c798b69561..f455488dc899 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -62,6 +62,7 @@
 #include <linux/proc_ns.h>
 #include <linux/nsproxy.h>
 #include <linux/proc_ns.h>
+#include <linux/time.h>
 #include <net/sock.h>
 
 /*
@@ -5269,34 +5270,40 @@ out_destroy:
        return ERR_PTR(ret);
 }
 
-static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
-                       umode_t mode)
+/**
+ * cgroup_create_subtree - creates a new subtree of a cgroup
+ * @parent: the parent cgroup to create the subtree under
+ * @name: the name of the cgroup in kernfs
+ * @mode: the mode of the cgroup in kernfs
+ *
+ * Creates a new cgroup under the given @parent, with the given @name and 
@mode.
+ * The caller must hold cgroup_mutex, and must not be under active protection 
of
+ * kernfs.
+ */
+static struct cgroup *cgroup_create_subtree(struct cgroup *parent,
+                                           const char *name, umode_t mode)
 {
-       struct cgroup *parent, *cgrp;
+       struct cgroup *child;
        struct kernfs_node *kn;
-       int ret;
+       int ret = 0;
+
+       lockdep_assert_held(&cgroup_mutex);
 
        /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
        if (strchr(name, '\n'))
-               return -EINVAL;
-
-       parent = cgroup_kn_lock_live(parent_kn, false);
-       if (!parent)
-               return -ENODEV;
+               return ERR_PTR(-EINVAL);
 
-       cgrp = cgroup_create(parent);
-       if (IS_ERR(cgrp)) {
-               ret = PTR_ERR(cgrp);
-               goto out_unlock;
-       }
+       child = cgroup_create(parent);
+       if (IS_ERR(child))
+               return child;
 
        /* create the directory */
-       kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
+       kn = kernfs_create_dir(parent->kn, name, mode, child);
        if (IS_ERR(kn)) {
                ret = PTR_ERR(kn);
                goto out_destroy;
        }
-       cgrp->kn = kn;
+       child->kn = kn;
 
        /*
         * This extra ref will be put in cgroup_free_fn() and guarantees
@@ -5308,22 +5315,51 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, 
const char *name,
        if (ret)
                goto out_destroy;
 
-       ret = css_populate_dir(&cgrp->self);
+       ret = css_populate_dir(&child->self);
        if (ret)
                goto out_destroy;
 
-       ret = cgroup_apply_control_enable(cgrp);
+       ret = cgroup_apply_control_enable(child);
        if (ret)
                goto out_destroy;
 
        /* let's create and online css's */
        kernfs_activate(kn);
 
-       ret = 0;
-       goto out_unlock;
+       return child;
 
 out_destroy:
-       cgroup_destroy_locked(cgrp);
+       cgroup_destroy_locked(child);
+       return ERR_PTR(ret);
+}
+
+/*
+ * cgroup directories starting with this prefix are forbidden from being 
created
+ * from userspace. This prefix is used internally to make sure that there's no
+ * conflicts with userspace when creating cgroups inside copy_cgroup_ns().
+ */
+#define CGROUPNS_INTERNAL_PREFIX ".__cgroupns_subtree:"
+
+static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
+                       umode_t mode)
+{
+       struct cgroup *parent, *cgrp;
+       int ret = 0;
+
+       if (strncmp(CGROUPNS_INTERNAL_PREFIX, name,
+                   strlen(CGROUPNS_INTERNAL_PREFIX)) == 0)
+               return -EINVAL;
+
+       parent = cgroup_kn_lock_live(parent_kn, false);
+       if (!parent)
+               return -ENODEV;
+
+       cgrp = cgroup_create_subtree(parent, name, mode);
+       if (IS_ERR(cgrp)) {
+               ret = PTR_ERR(cgrp);
+               goto out_unlock;
+       }
+
 out_unlock:
        cgroup_kn_unlock(parent_kn);
        return ret;
@@ -6298,7 +6334,9 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long 
flags,
                                        struct cgroup_namespace *old_ns)
 {
        struct cgroup_namespace *new_ns;
+       struct cgroup_root *root;
        struct css_set *cset;
+       char id[16], id_string[1+2*ARRAY_SIZE(id)] = {0};
 
        BUG_ON(!old_ns);
 
@@ -6311,12 +6349,71 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long 
flags,
        if (!ns_capable(user_ns, CAP_SYS_ADMIN))
                return ERR_PTR(-EPERM);
 
+       /*
+        * In order to make sure that the dirname we create is unique, we use a
+        * random id for all of the subtrees. The ID is the same to reduce
+        * confusion when reading /proc/<pid>/cgroup.
+        */
+       get_random_bytes(id, ARRAY_SIZE(id));
+       bin2hex(id_string, id, ARRAY_SIZE(id));
+
+       /*
+        * Create a new subtree in every cgroup the task is associated with.
+        * The cgroup is owned by the task uid and gid, to allow for management
+        * of subtrees in cgroup namespaces. This is safe because:
+        *
+        * 1. cgroups are hierarchical, so having the ability to set limits in
+        *    a subtree does not preclude the ability to modify the limits
+        *    imposed by parent cgroups.
+        *
+        * 2. cgroup_procs_write_permission() does checks to ensure that a
+        *    task cannot move other tasks into its cgroup unless they are both
+        *    running as the same user (or the task moving the process has
+        *    CAP_SYS_ADMIN in the user namespace of the process being moved).
+        *    This means that a misbehaving process can't start messing around
+        *    with other processes' cgroup associations.
+        *
+        * 3. On the default hierarchy, you cannot migrate a process to a
+        *    non-descendant cgroup unless you have write access to the
+        *    cgroup.procs file in the common ancestor of the two cgroups. This
+        *    means that two cooperative processes in the default hierarchy
+        *    can't move processes between their cgroups (if the admin
+        *    disallows it). Unfortunately, this functionality doesn't exist in
+        *    the other hierarchies (for backwards compatibility reasons).
+        *    However, this requirement isn't as important as the previous two.
+        */
        mutex_lock(&cgroup_mutex);
-       spin_lock_bh(&css_set_lock);
+       for_each_root(root) {
+               struct cgroup *parent, *child;
+               char namebuf[CGROUP_FILE_NAME_MAX];
+               bool is_dfl = cgroup_on_dfl(&root->cgrp);
+
+               spin_lock_bh(&css_set_lock);
+               parent = task_cgroup_from_root(current, root);
+               spin_unlock_bh(&css_set_lock);
+
+               snprintf(namebuf, CGROUP_FILE_NAME_MAX,
+                        CGROUPNS_INTERNAL_PREFIX "%s", id_string);
+
+               /* This should not fail, since we're under &cgroup_mutex. */
+               child = cgroup_create_subtree(parent, namebuf, 0755);
+               if (WARN_ON(IS_ERR(child)))
+                       continue;
 
+               /*
+                * Move the task to the new cgroup, which is owned by the user.
+                * Should never fail, since we're under &cgroup_mutex here.
+                */
+               rcu_read_lock();
+               if (WARN_ON(cgroup_attach_task(child, current, is_dfl)))
+                       cgroup_destroy_locked(child);
+               rcu_read_unlock();
+
+       }
+
+       spin_lock_bh(&css_set_lock);
        cset = task_css_set(current);
        get_css_set(cset);
-
        spin_unlock_bh(&css_set_lock);
        mutex_unlock(&cgroup_mutex);
 
-- 
2.8.2

Reply via email to