On Fri, Aug 07, 2015 at 11:38:28AM -0400, Tejun Heo wrote:
> Hello,
>
> On Fri, Aug 07, 2015 at 05:29:56PM +0200, Peter Zijlstra wrote:
> > Even if we were to strictly order those stores you could have (note
> > there is no matching barrier, as there is only the one load, so ordering
> > cannot help):
> >
> >     __kthread_bind()
> >                                     <SYSCALL>
> >                                       sched_setaffinity()
> >                                         if (p->flags &
> >                                             PF_NO_SETAFFINITY) /* false-not-taken */
> >       p->flags |= PF_NO_SETAFFINITY;
> >       smp_wmb();
> >       do_set_cpus_allowed();
> >                                         set_cpus_allowed_ptr()
> >
> > > I think the code was better before. Can't we just revert workqueue.c
> > > part?
> >
> > I agree that the new argument isn't pretty, but I cannot see how
> > workqueues would not be affected by this.
>
> So, the problem there is that __kthread_bind() doesn't grab the same
> lock that the syscall side grabs but workqueue used
> set_cpus_allowed_ptr() which goes through the rq locking, so as long
> as the check on syscall side is moved inside rq lock, it should be
> fine.
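To make the failure mode concrete, here is a minimal userspace sketch of the
same pattern (plain C with pthreads, not kernel code; bind_worker(),
setaffinity_syscall() and the flag/mask variables are made-up stand-ins for
PF_NO_SETAFFINITY and the cpumask). With the check and the update under one
shared lock the binding can no longer be undone; doing the flag check outside
the lock reintroduces exactly the window shown in the diagram above, no matter
how the binder orders its stores:

/* Hypothetical userspace analogue of the __kthread_bind() vs.
 * sched_setaffinity() race; not kernel code.  Build with:
 *   gcc -O2 -pthread race.c -o race
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static bool no_setaffinity;     /* stands in for PF_NO_SETAFFINITY */
static int  mask = -1;          /* stands in for p->cpus_allowed   */

/* The "__kthread_bind()" side: pin the mask and forbid further changes. */
static void *bind_worker(void *arg)
{
        pthread_mutex_lock(&lock);
        no_setaffinity = true;
        mask = 1;               /* do_set_cpus_allowed() */
        pthread_mutex_unlock(&lock);
        return NULL;
}

/* The "sched_setaffinity()" side: only honour the request if allowed.
 * Because the check and the update sit under the same lock as the bind
 * side, this thread can no longer observe a stale flag and then undo
 * the binding afterwards.
 */
static void *setaffinity_syscall(void *arg)
{
        pthread_mutex_lock(&lock);
        if (!no_setaffinity)
                mask = 0xff;    /* set_cpus_allowed_ptr() */
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t a, b;

        pthread_create(&a, NULL, bind_worker, NULL);
        pthread_create(&b, NULL, setaffinity_syscall, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);

        /* With the shared lock, a bound mask is never overwritten. */
        printf("no_setaffinity=%d mask=%#x\n", no_setaffinity, mask);
        return 0;
}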
Currently neither site uses any lock, and that is what the patch fixes
(it uses the per-task ->pi_lock instead of the rq->lock, but that is
immaterial). What matters though is that you now must hold a scheduler
lock while setting PF_NO_SETAFFINITY. In order to avoid spreading that
knowledge around I've taught kthread_bind*() about this and made the
workqueue code use that API (rather than having the workqueue code take
scheduler locks).

Hmm.. a better solution: have the worker thread creation call
kthread_bind_mask() before attach_to_pool() and have attach_to_pool()
keep using set_cpus_allowed_ptr(). Less ugly.

---
Subject: sched: Fix a race between __kthread_bind() and sched_setaffinity()
From: Peter Zijlstra <[email protected]>
Date: Fri, 15 May 2015 17:43:34 +0200

Because sched_setaffinity() checks p->flags & PF_NO_SETAFFINITY
without locks, a caller might observe an old value and race with the
set_cpus_allowed_ptr() call from __kthread_bind() and effectively undo
it:

        __kthread_bind()
          do_set_cpus_allowed()
                                                <SYSCALL>
                                                  sched_setaffinity()
                                                    if (p->flags & PF_NO_SETAFFINITY)
                                                    set_cpus_allowed_ptr()
          p->flags |= PF_NO_SETAFFINITY

Fix the issue by putting everything under the regular scheduler locks.

This also closes a hole in the serialization of
task_struct::{nr_,}cpus_allowed.

Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: [email protected]
Cc: Oleg Nesterov <[email protected]>
Cc: [email protected]
Acked-by: Tejun Heo <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
---
 include/linux/kthread.h |    1 +
 include/linux/sched.h   |    7 -------
 kernel/kthread.c        |   20 +++++++++++++++++---
 kernel/sched/core.c     |   36 ++++++++++++++++++++++++++++++++----
 kernel/workqueue.c      |    6 ++----
 5 files changed, 52 insertions(+), 18 deletions(-)

--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -38,6 +38,7 @@ struct task_struct *kthread_create_on_cp
 })

 void kthread_bind(struct task_struct *k, unsigned int cpu);
+void kthread_bind_mask(struct task_struct *k, const struct cpumask *mask);
 int kthread_stop(struct task_struct *k);
 bool kthread_should_stop(void);
 bool kthread_should_park(void);
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2215,13 +2215,6 @@ static inline void calc_load_enter_idle(
 static inline void calc_load_exit_idle(void) { }
 #endif /* CONFIG_NO_HZ_COMMON */

-#ifndef CONFIG_CPUMASK_OFFSTACK
-static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
-{
-        return set_cpus_allowed_ptr(p, &new_mask);
-}
-#endif
-
 /*
  * Do not use outside of architecture code which knows its limitations.
  *
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -325,16 +325,30 @@ struct task_struct *kthread_create_on_no
 }
 EXPORT_SYMBOL(kthread_create_on_node);

-static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
+static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, long state)
 {
-        /* Must have done schedule() in kthread() before we set_task_cpu */
+        unsigned long flags;
+
         if (!wait_task_inactive(p, state)) {
                 WARN_ON(1);
                 return;
         }
+
         /* It's safe because the task is inactive. */
-        do_set_cpus_allowed(p, cpumask_of(cpu));
+        raw_spin_lock_irqsave(&p->pi_lock, flags);
+        do_set_cpus_allowed(p, mask);
         p->flags |= PF_NO_SETAFFINITY;
+        raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+}
+
+static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
+{
+        __kthread_bind_mask(p, cpumask_of(cpu), state);
+}
+
+void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask)
+{
+        __kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE);
 }

 /**
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1151,6 +1151,8 @@ static int migration_cpu_stop(void *data

 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 {
+        lockdep_assert_held(&p->pi_lock);
+
         if (p->sched_class->set_cpus_allowed)
                 p->sched_class->set_cpus_allowed(p, new_mask);

@@ -1167,7 +1169,8 @@ void do_set_cpus_allowed(struct task_str
  * task must not exit() & deallocate itself prematurely. The
  * call is not atomic; no spinlocks may be held.
  */
-int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+                                  const struct cpumask *new_mask, bool check)
 {
         unsigned long flags;
         struct rq *rq;
@@ -1176,6 +1179,15 @@ int set_cpus_allowed_ptr(struct task_str

         rq = task_rq_lock(p, &flags);

+        /*
+         * Must re-check here, to close a race against __kthread_bind(),
+         * sched_setaffinity() is not guaranteed to observe the flag.
+         */
+        if (check && (p->flags & PF_NO_SETAFFINITY)) {
+                ret = -EINVAL;
+                goto out;
+        }
+
         if (cpumask_equal(&p->cpus_allowed, new_mask))
                 goto out;

@@ -1212,6 +1224,11 @@ int set_cpus_allowed_ptr(struct task_str

         return ret;
 }
+
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+        return __set_cpus_allowed_ptr(p, new_mask, false);
+}
 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);

 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
@@ -1593,6 +1610,15 @@ static void update_avg(u64 *avg, u64 sam
         s64 diff = sample - *avg;
         *avg += diff >> 3;
 }
+
+#else
+
+static inline int __set_cpus_allowed_ptr(struct task_struct *p,
+                                         const struct cpumask *new_mask, bool check)
+{
+        return set_cpus_allowed_ptr(p, new_mask);
+}
+
 #endif /* CONFIG_SMP */

 static void
@@ -4338,7 +4364,7 @@ long sched_setaffinity(pid_t pid, const
         }
 #endif
 again:
-        retval = set_cpus_allowed_ptr(p, new_mask);
+        retval = __set_cpus_allowed_ptr(p, new_mask, true);

         if (!retval) {
                 cpuset_cpus_allowed(p, cpus_allowed);
@@ -4863,7 +4889,8 @@ void init_idle(struct task_struct *idle,
         struct rq *rq = cpu_rq(cpu);
         unsigned long flags;

-        raw_spin_lock_irqsave(&rq->lock, flags);
+        raw_spin_lock_irqsave(&idle->pi_lock, flags);
+        raw_spin_lock(&rq->lock);

         __sched_fork(0, idle);
         idle->state = TASK_RUNNING;
@@ -4889,7 +4916,8 @@ void init_idle(struct task_struct *idle,
 #if defined(CONFIG_SMP)
         idle->on_cpu = 1;
 #endif
-        raw_spin_unlock_irqrestore(&rq->lock, flags);
+        raw_spin_unlock(&rq->lock);
+        raw_spin_unlock_irqrestore(&idle->pi_lock, flags);

         /* Set the preempt count _outside_ the spinlocks! */
         init_idle_preempt_count(idle, cpu);
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1714,9 +1714,7 @@ static struct worker *create_worker(stru
                 goto fail;

         set_user_nice(worker->task, pool->attrs->nice);
-
-        /* prevent userland from meddling with cpumask of workqueue workers */
-        worker->task->flags |= PF_NO_SETAFFINITY;
+        kthread_bind_mask(worker->task, pool->attrs->cpumask);

         /* successful, attach the worker to the pool */
         worker_attach_to_pool(worker, pool);
@@ -3856,7 +3854,7 @@ struct workqueue_struct *__alloc_workque
                 }

                 wq->rescuer = rescuer;
-                rescuer->task->flags |= PF_NO_SETAFFINITY;
+                kthread_bind_mask(rescuer->task, cpu_possible_mask);
                 wake_up_process(rescuer->task);
         }

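For completeness, a rough sketch of how a caller would pick up the new
kthread_bind_mask() helper, mirroring the create-then-bind-then-wake order
used for the rescuer above (my_worker_fn() and start_pinned_worker() are
hypothetical, for illustration only; this is not part of the patch):

/* Hypothetical user of kthread_bind_mask(); illustration only. */
#include <linux/kthread.h>
#include <linux/cpumask.h>
#include <linux/sched.h>
#include <linux/err.h>

static int my_worker_fn(void *data)
{
        set_current_state(TASK_INTERRUPTIBLE);
        while (!kthread_should_stop()) {
                schedule();
                set_current_state(TASK_INTERRUPTIBLE);
        }
        __set_current_state(TASK_RUNNING);
        return 0;
}

static struct task_struct *start_pinned_worker(const struct cpumask *mask)
{
        struct task_struct *tsk;

        tsk = kthread_create(my_worker_fn, NULL, "my_worker");
        if (IS_ERR(tsk))
                return tsk;

        /*
         * Bind before the first wakeup: kthread_bind_mask() installs the
         * cpumask and sets PF_NO_SETAFFINITY under ->pi_lock, so a racing
         * sched_setaffinity() can no longer undo the binding.
         */
        kthread_bind_mask(tsk, mask);
        wake_up_process(tsk);
        return tsk;
}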

