On Fri, Aug 07, 2015 at 11:38:28AM -0400, Tejun Heo wrote:
> Hello,
>
> On Fri, Aug 07, 2015 at 05:29:56PM +0200, Peter Zijlstra wrote:
> > Even if we were to strictly order those stores you could have (note
> > there is no matching barrier, as there is only the one load, so ordering
> > cannot help):
> >
> >     __kthread_bind()
> >                                     <SYSCALL>
> >                                       sched_setaffinity()
> >                                         if (p->flags &
> >                                             PF_NO_SETAFFINITY) /* false-not-taken */
> >       p->flags |= PF_NO_SETAFFINITY;
> >       smp_wmb();
> >       do_set_cpus_allowed();
> >                                         set_cpus_allowed_ptr()
> >
> > > I think the code was better before. Can't we just revert workqueue.c
> > > part?
> >
> > I agree that the new argument isn't pretty, but I cannot see how
> > workqueues would not be affected by this.
>
> So, the problem there is that __kthread_bind() doesn't grab the same
> lock that the syscall side grabs but workqueue used
> set_cpus_allowed_ptr() which goes through the rq locking, so as long
> as the check on syscall side is moved inside rq lock, it should be
> fine.
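To make the failure mode concrete, here is a minimal userspace sketch of the
same pattern (plain C with pthreads, not kernel code; bind_worker(),
setaffinity_syscall() and the flag/mask variables are made-up stand-ins for
PF_NO_SETAFFINITY and the cpumask). With the check and the update under one
shared lock the binding can no longer be undone; doing the flag check outside
the lock reintroduces exactly the window shown in the diagram above, no matter
how the binder orders its stores:

/* Hypothetical userspace analogue of the __kthread_bind() vs.
 * sched_setaffinity() race; not kernel code.  Build with:
 *   gcc -O2 -pthread race.c -o race
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static bool no_setaffinity;     /* stands in for PF_NO_SETAFFINITY */
static int  mask = -1;          /* stands in for p->cpus_allowed   */

/* The "__kthread_bind()" side: pin the mask and forbid further changes. */
static void *bind_worker(void *arg)
{
        pthread_mutex_lock(&lock);
        no_setaffinity = true;
        mask = 1;               /* do_set_cpus_allowed() */
        pthread_mutex_unlock(&lock);
        return NULL;
}

/* The "sched_setaffinity()" side: only honour the request if allowed.
 * Because the check and the update sit under the same lock as the bind
 * side, this thread can no longer observe a stale flag and then undo
 * the binding afterwards.
 */
static void *setaffinity_syscall(void *arg)
{
        pthread_mutex_lock(&lock);
        if (!no_setaffinity)
                mask = 0xff;    /* set_cpus_allowed_ptr() */
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t a, b;

        pthread_create(&a, NULL, bind_worker, NULL);
        pthread_create(&b, NULL, setaffinity_syscall, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);

        /* With the shared lock, a bound mask is never overwritten. */
        printf("no_setaffinity=%d mask=%#x\n", no_setaffinity, mask);
        return 0;
}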
Currently neither site uses any lock, and that is what the patch fixes
(it uses the per-task ->pi_lock instead of the rq->lock, but that is
immaterial). What matters though is that you now must hold a scheduler
lock while setting PF_NO_SETAFFINITY. In order to avoid spreading that
knowledge around I've taught kthread_bind*() about this and made the
workqueue code use that API (rather than having the workqueue code take
scheduler locks).

Hmm.. a better solution: have the worker thread creation call
kthread_bind_mask() before attach_to_pool() and have attach_to_pool()
keep using set_cpus_allowed_ptr(). Less ugly.

---
Subject: sched: Fix a race between __kthread_bind() and sched_setaffinity()
From: Peter Zijlstra <[email protected]>
Date: Fri, 15 May 2015 17:43:34 +0200

Because sched_setaffinity() checks p->flags & PF_NO_SETAFFINITY
without locks, a caller might observe an old value and race with the
set_cpus_allowed_ptr() call from __kthread_bind() and effectively undo
it:

        __kthread_bind()
          do_set_cpus_allowed()
                                                <SYSCALL>
                                                  sched_setaffinity()
                                                    if (p->flags & PF_NO_SETAFFINITY)
                                                    set_cpus_allowed_ptr()
          p->flags |= PF_NO_SETAFFINITY

Fix the issue by putting everything under the regular scheduler locks.

This also closes a hole in the serialization of
task_struct::{nr_,}cpus_allowed.

Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: Oleg Nesterov <[email protected]>
Cc: [email protected]
Acked-by: Tejun Heo <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
---
 include/linux/kthread.h |    1 +
 include/linux/sched.h   |    7 -------
 kernel/kthread.c        |   20 +++++++++++++++++---
 kernel/sched/core.c     |   36 ++++++++++++++++++++++++++++++++----
 kernel/workqueue.c      |    6 ++----
 5 files changed, 52 insertions(+), 18 deletions(-)

--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -38,6 +38,7 @@ struct task_struct *kthread_create_on_cp
 })

 void kthread_bind(struct task_struct *k, unsigned int cpu);
+void kthread_bind_mask(struct task_struct *k, const struct cpumask *mask);
 int kthread_stop(struct task_struct *k);
 bool kthread_should_stop(void);
 bool kthread_should_park(void);
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2215,13 +2215,6 @@ static inline void calc_load_enter_idle(
 static inline void calc_load_exit_idle(void) { }
 #endif /* CONFIG_NO_HZ_COMMON */

-#ifndef CONFIG_CPUMASK_OFFSTACK
-static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
-{
-        return set_cpus_allowed_ptr(p, &new_mask);
-}
-#endif
-
 /*
  * Do not use outside of architecture code which knows its limitations.
  *
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -325,16 +325,30 @@ struct task_struct *kthread_create_on_no
 }
 EXPORT_SYMBOL(kthread_create_on_node);

-static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
+static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, long state)
 {
-        /* Must have done schedule() in kthread() before we set_task_cpu */
+        unsigned long flags;
+
         if (!wait_task_inactive(p, state)) {
                 WARN_ON(1);
                 return;
         }
+
         /* It's safe because the task is inactive. */
-        do_set_cpus_allowed(p, cpumask_of(cpu));
+        raw_spin_lock_irqsave(&p->pi_lock, flags);
+        do_set_cpus_allowed(p, mask);
         p->flags |= PF_NO_SETAFFINITY;
+        raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+}
+
+static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
+{
+        __kthread_bind_mask(p, cpumask_of(cpu), state);
+}
+
+void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask)
+{
+        __kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE);
 }

 /**
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1151,6 +1151,8 @@ static int migration_cpu_stop(void *data

 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 {
+        lockdep_assert_held(&p->pi_lock);
+
         if (p->sched_class->set_cpus_allowed)
                 p->sched_class->set_cpus_allowed(p, new_mask);

@@ -1167,7 +1169,8 @@ void do_set_cpus_allowed(struct task_str
  * task must not exit() & deallocate itself prematurely. The
  * call is not atomic; no spinlocks may be held.
  */
-int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+                                  const struct cpumask *new_mask, bool check)
 {
         unsigned long flags;
         struct rq *rq;
@@ -1176,6 +1179,15 @@ int set_cpus_allowed_ptr(struct task_str

         rq = task_rq_lock(p, &flags);

+        /*
+         * Must re-check here, to close a race against __kthread_bind(),
+         * sched_setaffinity() is not guaranteed to observe the flag.
+         */
+        if (check && (p->flags & PF_NO_SETAFFINITY)) {
+                ret = -EINVAL;
+                goto out;
+        }
+
         if (cpumask_equal(&p->cpus_allowed, new_mask))
                 goto out;

@@ -1212,6 +1224,11 @@ int set_cpus_allowed_ptr(struct task_str

         return ret;
 }
+
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+        return __set_cpus_allowed_ptr(p, new_mask, false);
+}
 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);

 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
@@ -1593,6 +1610,15 @@ static void update_avg(u64 *avg, u64 sam
         s64 diff = sample - *avg;
         *avg += diff >> 3;
 }
+
+#else
+
+static inline int __set_cpus_allowed_ptr(struct task_struct *p,
+                                         const struct cpumask *new_mask, bool check)
+{
+        return set_cpus_allowed_ptr(p, new_mask);
+}
+
 #endif /* CONFIG_SMP */

 static void
@@ -4338,7 +4364,7 @@ long sched_setaffinity(pid_t pid, const
         }
 #endif
 again:
-        retval = set_cpus_allowed_ptr(p, new_mask);
+        retval = __set_cpus_allowed_ptr(p, new_mask, true);

         if (!retval) {
                 cpuset_cpus_allowed(p, cpus_allowed);
@@ -4863,7 +4889,8 @@ void init_idle(struct task_struct *idle,
         struct rq *rq = cpu_rq(cpu);
         unsigned long flags;

-        raw_spin_lock_irqsave(&rq->lock, flags);
+        raw_spin_lock_irqsave(&idle->pi_lock, flags);
+        raw_spin_lock(&rq->lock);

         __sched_fork(0, idle);
         idle->state = TASK_RUNNING;
@@ -4889,7 +4916,8 @@ void init_idle(struct task_struct *idle,
 #if defined(CONFIG_SMP)
         idle->on_cpu = 1;
 #endif
-        raw_spin_unlock_irqrestore(&rq->lock, flags);
+        raw_spin_unlock(&rq->lock);
+        raw_spin_unlock_irqrestore(&idle->pi_lock, flags);

         /* Set the preempt count _outside_ the spinlocks! */
         init_idle_preempt_count(idle, cpu);
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1714,9 +1714,7 @@ static struct worker *create_worker(stru
                 goto fail;

         set_user_nice(worker->task, pool->attrs->nice);
-
-        /* prevent userland from meddling with cpumask of workqueue workers */
-        worker->task->flags |= PF_NO_SETAFFINITY;
+        kthread_bind_mask(worker->task, pool->attrs->cpumask);

         /* successful, attach the worker to the pool */
         worker_attach_to_pool(worker, pool);
@@ -3856,7 +3854,7 @@ struct workqueue_struct *__alloc_workque
                 }

                 wq->rescuer = rescuer;
-                rescuer->task->flags |= PF_NO_SETAFFINITY;
+                kthread_bind_mask(rescuer->task, cpu_possible_mask);
                 wake_up_process(rescuer->task);
         }

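For completeness, a rough sketch of how a caller would pick up the new
kthread_bind_mask() helper, mirroring the create-then-bind-then-wake order
used for the rescuer above (my_worker_fn() and start_pinned_worker() are
hypothetical, for illustration only; this is not part of the patch):

/* Hypothetical user of kthread_bind_mask(); illustration only. */
#include <linux/kthread.h>
#include <linux/cpumask.h>
#include <linux/sched.h>
#include <linux/err.h>

static int my_worker_fn(void *data)
{
        set_current_state(TASK_INTERRUPTIBLE);
        while (!kthread_should_stop()) {
                schedule();
                set_current_state(TASK_INTERRUPTIBLE);
        }
        __set_current_state(TASK_RUNNING);
        return 0;
}

static struct task_struct *start_pinned_worker(const struct cpumask *mask)
{
        struct task_struct *tsk;

        tsk = kthread_create(my_worker_fn, NULL, "my_worker");
        if (IS_ERR(tsk))
                return tsk;

        /*
         * Bind before the first wakeup: kthread_bind_mask() installs the
         * cpumask and sets PF_NO_SETAFFINITY under ->pi_lock, so a racing
         * sched_setaffinity() can no longer undo the binding.
         */
        kthread_bind_mask(tsk, mask);
        wake_up_process(tsk);
        return tsk;
}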

