Concurrent migrate_disable() and set_cpus_allowed_ptr() have
interesting features. We rely on set_cpus_allowed_ptr() to not return
until the task runs inside the provided mask. This expectation is
exported to userspace.
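
For context, a minimal userspace sketch of that exported expectation
(purely illustrative, not part of the patch; assumes CPU 0 is online
and the usual glibc sched_setaffinity()/sched_getcpu() wrappers):

	#define _GNU_SOURCE
	#include <assert.h>
	#include <sched.h>
	#include <stdio.h>

	int main(void)
	{
		cpu_set_t mask;

		/* Restrict the calling thread to CPU 0. */
		CPU_ZERO(&mask);
		CPU_SET(0, &mask);
		if (sched_setaffinity(0, sizeof(mask), &mask)) {
			perror("sched_setaffinity");
			return 1;
		}

		/*
		 * The kernel-side set_cpus_allowed_ptr() does not return
		 * until the task runs inside the new mask, so by the time
		 * the syscall returns we are already on an allowed CPU.
		 */
		assert(CPU_ISSET(sched_getcpu(), &mask));
		printf("now on CPU %d\n", sched_getcpu());
		return 0;
	}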
This means that any set_cpus_allowed_ptr() caller must wait until
migrate_enable() allows migrations.

At the same time, we don't want migrate_enable() to schedule, due to
patterns like:

	preempt_disable();
	migrate_disable();
	...
	migrate_enable();
	preempt_enable();

And:

	raw_spin_lock(&B);
	spin_unlock(&A);

this means that when migrate_enable() must restore the affinity mask,
it cannot wait for completion thereof. Luck will have it that that is
exactly the case where there is a pending set_cpus_allowed_ptr(), so
let that provide storage for the async stop machine.

Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org>
---
 include/linux/sched.h |    1 
 kernel/sched/core.c   |  142 ++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 123 insertions(+), 20 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -713,6 +713,7 @@ struct task_struct {
 	int				nr_cpus_allowed;
 	const cpumask_t			*cpus_ptr;
 	cpumask_t			cpus_mask;
+	void				*migration_pending;
 #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT)
 	int				migration_disabled;
 #endif
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1734,15 +1734,28 @@ EXPORT_SYMBOL_GPL(migrate_disable);
 
 void migrate_enable(void)
 {
-	if (--current->migration_disabled)
-		return;
-
-	barrier();
+	struct task_struct *p = current;
 
-	if (p->cpus_ptr == &p->cpus_mask)
+	if (p->migration_disabled > 1) {
+		p->migration_disabled--;
 		return;
+	}
 
-	__set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
+	/*
+	 * Ensure stop_task runs either before or after this, and that
+	 * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
+	 */
+	preempt_disable();
+	if (p->cpus_ptr != &p->cpus_mask)
+		__set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
+	/*
+	 * Mustn't clear migration_disabled() until cpus_ptr points back at the
+	 * regular cpus_mask, otherwise things that race (eg.
+	 * select_fallback_rq) get confused.
+	 */
+	barrier();
+	p->migration_disabled = 0;
+	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(migrate_enable);
@@ -1818,6 +1831,7 @@ static struct rq *move_queued_task(struc
 struct migration_arg {
 	struct task_struct *task;
 	int dest_cpu;
+	struct completion *done;
 };
 
 /*
@@ -1852,6 +1866,7 @@ static int migration_cpu_stop(void *data
 	struct migration_arg *arg = data;
 	struct task_struct *p = arg->task;
 	struct rq *rq = this_rq();
+	bool complete = false;
 	struct rq_flags rf;
 
 	/*
@@ -1874,15 +1889,27 @@ static int migration_cpu_stop(void *data
 	 * we're holding p->pi_lock.
	 */
 	if (task_rq(p) == rq) {
+		if (is_migration_disabled(p))
+			goto out;
+
 		if (task_on_rq_queued(p))
 			rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
 		else
 			p->wake_cpu = arg->dest_cpu;
+
+		if (arg->done) {
+			p->migration_pending = NULL;
+			complete = true;
+		}
 	}
+out:
 	rq_unlock(rq, &rf);
 	raw_spin_unlock(&p->pi_lock);
-	local_irq_enable();
+
+	if (complete)
+		complete_all(arg->done);
+
 	return 0;
 }
@@ -1957,6 +1984,92 @@ void do_set_cpus_allowed(struct task_str
 	__do_set_cpus_allowed(p, new_mask, 0);
 }
 
+struct set_affinity_pending {
+	refcount_t		refs;
+	struct completion	done;
+	struct cpu_stop_work	stop_work;
+	struct migration_arg	arg;
+};
+
+static int move_task(struct rq *rq, struct rq_flags *rf, struct task_struct *p,
+		     int dest_cpu, unsigned int flags)
+{
+	struct set_affinity_pending my_pending, *pending = NULL;
+	struct migration_arg arg = {
+		.task = p,
+		.dest_cpu = dest_cpu,
+	};
+
+	if (!(flags & SCA_MIGRATE_ENABLE)) {
+		/* serialized by p->pi_lock */
+		if (!p->migration_pending) {
+			refcount_set(&my_pending.refs, 1);
+			init_completion(&my_pending.done);
+			p->migration_pending = &my_pending;
+		} else {
+			pending = p->migration_pending;
+			refcount_inc(&pending->refs);
+		}
+	}
+	pending = p->migration_pending;
+	/*
+	 * - !MIGRATE_ENABLE:
+	 *   we'll have installed a pending if there wasn't one already.
+	 *
+	 * - MIGRATE_ENABLE:
+	 *   we're here because the current CPU isn't matching anymore,
+	 *   the only way that can happen is because of a concurrent
+	 *   set_cpus_allowed_ptr() call, which should then still be
+	 *   pending completion.
+	 *
+	 * Either way, we really should have a @pending here.
+	 */
+	if (WARN_ON_ONCE(!pending))
+		return -EINVAL;
+
+	arg.done = &pending->done;
+
+	if (flags & SCA_MIGRATE_ENABLE) {
+
+		task_rq_unlock(rq, p, rf);
+		pending->arg = arg;
+		stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
+				    &pending->arg, &pending->stop_work);
+
+		return 0;
+	}
+
+	if (task_running(rq, p) || p->state == TASK_WAKING) {
+
+		task_rq_unlock(rq, p, rf);
+		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+
+	} else {
+		bool complete = false;
+
+		if (!is_migration_disabled(p)) {
+			if (task_on_rq_queued(p))
+				rq = move_queued_task(rq, rf, p, dest_cpu);
+
+			p->migration_pending = NULL;
+			complete = true;
+		}
+		task_rq_unlock(rq, p, rf);
+
+		if (complete)
+			complete_all(&pending->done);
+	}
+
+	wait_for_completion(&pending->done);
+
+	if (refcount_dec_and_test(&pending->refs))
+		wake_up_var(&pending->refs);
+
+	wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
+
+	return 0;
+}
+
 /*
  * Change a given task's CPU affinity. Migrate the thread to a
  * proper CPU and schedule it away if the CPU it's executing on
@@ -2025,19 +2138,8 @@ static int __set_cpus_allowed_ptr(struct
 	if (cpumask_test_cpu(task_cpu(p), new_mask))
 		goto out;
 
-	if (task_running(rq, p) || p->state == TASK_WAKING) {
-		struct migration_arg arg = { p, dest_cpu };
-		/* Need help from migration thread: drop lock and wait. */
-		task_rq_unlock(rq, p, &rf);
-		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
-		return 0;
-	} else if (task_on_rq_queued(p)) {
-		/*
-		 * OK, since we're going to drop the lock immediately
-		 * afterwards anyway.
-		 */
-		rq = move_queued_task(rq, &rf, p, dest_cpu);
-	}
+	return move_task(rq, &rf, p, dest_cpu, flags);
+
 out:
 	task_rq_unlock(rq, p, &rf);
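
The lifetime rule move_task() relies on is that the blocked
set_cpus_allowed_ptr() caller donates its stack frame (my_pending) as
storage for the asynchronous stop-machine work, and must not return
until every other user has dropped its reference. Below is a rough
userspace analogue of that pattern (a sketch only, not kernel code): a
mutex/condvar stands in for struct completion, refcount_t and
wait_var_event(), and all names (struct pending, slot, requester(),
worker()) are invented for illustration:

	#include <pthread.h>
	#include <stdbool.h>
	#include <stdio.h>
	#include <unistd.h>

	/* Poor man's completion + refcount, both guarded by one lock. */
	struct pending {
		pthread_mutex_t lock;
		pthread_cond_t cond;
		bool done;
		int refs;
	};

	static struct pending *slot;	/* analogue of p->migration_pending */
	static pthread_mutex_t slot_lock = PTHREAD_MUTEX_INITIALIZER;

	/* The stopper side: finish the work, then signal whoever waits. */
	static void *worker(void *arg)
	{
		struct pending *p;

		(void)arg;
		/* Crudely wait until the requester installed its pending. */
		for (;;) {
			pthread_mutex_lock(&slot_lock);
			p = slot;
			slot = NULL;
			pthread_mutex_unlock(&slot_lock);
			if (p)
				break;
			usleep(1000);
		}

		pthread_mutex_lock(&p->lock);
		p->done = true;			/* complete_all() analogue */
		pthread_cond_broadcast(&p->cond);
		pthread_mutex_unlock(&p->lock);
		return NULL;
	}

	/* The set_cpus_allowed_ptr() side: storage lives on this stack. */
	static void *requester(void *arg)
	{
		struct pending my_pending = {
			.lock = PTHREAD_MUTEX_INITIALIZER,
			.cond = PTHREAD_COND_INITIALIZER,
			.done = false,
			.refs = 1,
		};

		(void)arg;
		pthread_mutex_lock(&slot_lock);
		slot = &my_pending;		/* install our stack storage */
		pthread_mutex_unlock(&slot_lock);

		pthread_mutex_lock(&my_pending.lock);
		while (!my_pending.done)	/* wait_for_completion() */
			pthread_cond_wait(&my_pending.cond, &my_pending.lock);

		my_pending.refs--;		/* drop our own reference */
		pthread_cond_broadcast(&my_pending.cond);

		/*
		 * wait_var_event() analogue: with more waiters each would
		 * take and drop a reference; the frame may only go away
		 * once they are all gone.
		 */
		while (my_pending.refs)
			pthread_cond_wait(&my_pending.cond, &my_pending.lock);
		pthread_mutex_unlock(&my_pending.lock);

		return NULL;
	}

	int main(void)
	{
		pthread_t r, w;

		pthread_create(&r, NULL, requester, NULL);
		pthread_create(&w, NULL, worker, NULL);
		pthread_join(r, NULL);
		pthread_join(w, NULL);
		puts("pending completed and drained");
		return 0;
	}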