Hello, Linus.

This pull request contains fixes for two long-standing subtle bugs.

* kthread_bind() on a new kthread binds it to specific CPUs and
  prevents userland from messing with the affinity or cgroup
  membership.  Unfortunately, for cgroup membership, there's a window
  between kthread creation and kthread_bind*() invocation where the
  kthread can be moved into a non-root cgroup by userland.

  Depending on what controllers are in effect, this can assign the
  kthread unexpected attributes.  For example, in the reported case,
  workqueue workers ended up in non-root cpuset cgroups and had
  their CPU affinities overridden.  This broke workqueue invariants
  and led to workqueue stalls.

  Fixed by closing the window between kthread creation and
  kthread_bind() as suggested by Oleg.

* There was a bug in the cgroup mount path which could allow two competing
  mount attempts to attach the same cgroup_root to two different
  superblocks.  This was caused by mishandling the return value from
  kernfs_pin_sb().  Fixed.

Thanks.

The following changes since commit b6a6759daf55dade2b65089957832759d502acfb:

  cgroups: censor kernel pointer in debug files (2017-03-06 15:16:03 -0500)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git for-4.11-fixes

for you to fetch changes up to bfb0b80db5f9dca5ac0a5fd0edb765ee555e5a8e:

  cgroup: avoid attaching a cgroup root to two different superblocks (2017-04-11 09:00:57 +0900)

----------------------------------------------------------------
Tejun Heo (1):
      cgroup, kthread: close race window where new kthreads can be migrated to non-root cgroups

Zefan Li (1):
      cgroup: avoid attaching a cgroup root to two different superblocks

 include/linux/cgroup.h    | 21 +++++++++++++++++++++
 include/linux/sched.h     |  4 ++++
 kernel/cgroup/cgroup-v1.c |  2 +-
 kernel/cgroup/cgroup.c    |  9 +++++----
 kernel/kthread.c          |  3 +++
 5 files changed, 34 insertions(+), 5 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index f6b43fb..af9c86e 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -570,6 +570,25 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
        pr_cont_kernfs_path(cgrp->kn);
 }
 
+static inline void cgroup_init_kthreadd(void)
+{
+       /*
+        * kthreadd is inherited by all kthreads, keep it in the root so
+        * that the new kthreads are guaranteed to stay in the root until
+        * initialization is finished.
+        */
+       current->no_cgroup_migration = 1;
+}
+
+static inline void cgroup_kthread_ready(void)
+{
+       /*
+        * This kthread finished initialization.  The creator should have
+        * set PF_NO_SETAFFINITY if this kthread should stay in the root.
+        */
+       current->no_cgroup_migration = 0;
+}
+
 #else /* !CONFIG_CGROUPS */
 
 struct cgroup_subsys_state;
@@ -590,6 +609,8 @@ static inline void cgroup_free(struct task_struct *p) {}
 
 static inline int cgroup_init_early(void) { return 0; }
 static inline int cgroup_init(void) { return 0; }
+static inline void cgroup_init_kthreadd(void) {}
+static inline void cgroup_kthread_ready(void) {}
 
 static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
                                               struct cgroup *ancestor)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d67eee8..4cf9a59 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -604,6 +604,10 @@ struct task_struct {
 #ifdef CONFIG_COMPAT_BRK
        unsigned                        brk_randomized:1;
 #endif
+#ifdef CONFIG_CGROUPS
+       /* disallow userland-initiated cgroup migration */
+       unsigned                        no_cgroup_migration:1;
+#endif
 
        unsigned long                   atomic_flags; /* Flags requiring atomic access. */
 
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 1dc22f6..12e19f0 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -1146,7 +1146,7 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
                 * path is super cold.  Let's just sleep a bit and retry.
                 */
                pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
-               if (IS_ERR(pinned_sb) ||
+               if (IS_ERR_OR_NULL(pinned_sb) ||
                    !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
                        mutex_unlock(&cgroup_mutex);
                        if (!IS_ERR_OR_NULL(pinned_sb))
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 0125589..638ef75 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2425,11 +2425,12 @@ ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
                tsk = tsk->group_leader;
 
        /*
-        * Workqueue threads may acquire PF_NO_SETAFFINITY and become
-        * trapped in a cpuset, or RT worker may be born in a cgroup
-        * with no rt_runtime allocated.  Just say no.
+        * kthreads may acquire PF_NO_SETAFFINITY during initialization.
+        * If userland migrates such a kthread to a non-root cgroup, it can
+        * become trapped in a cpuset, or RT kthread may be born in a
+        * cgroup with no rt_runtime allocated.  Just say no.
         */
-       if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
+       if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
                ret = -EINVAL;
                goto out_unlock_rcu;
        }
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 2f26ade..26db528 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -20,6 +20,7 @@
 #include <linux/freezer.h>
 #include <linux/ptrace.h>
 #include <linux/uaccess.h>
+#include <linux/cgroup.h>
 #include <trace/events/sched.h>
 
 static DEFINE_SPINLOCK(kthread_create_lock);
@@ -225,6 +226,7 @@ static int kthread(void *_create)
 
        ret = -EINTR;
        if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) {
+               cgroup_kthread_ready();
                __kthread_parkme(self);
                ret = threadfn(data);
        }
@@ -538,6 +540,7 @@ int kthreadd(void *unused)
        set_mems_allowed(node_states[N_MEMORY]);
 
        current->flags |= PF_NOFREEZE;
+       cgroup_init_kthreadd();
 
        for (;;) {
                set_current_state(TASK_INTERRUPTIBLE);

Reply via email to