In order to enable symmetric hotplug, we must mirror the online &&
!active state of cpu-down on the cpu-up side.

However, to retain sanity, limit this state to per-cpu kthreads.

Aside from the change to set_cpus_allowed_ptr(), which allows moving
the per-cpu kthreads onto such a cpu, the other critical piece is the
cpu selection for pinned tasks in select_task_rq(). This avoids
dropping into select_fallback_rq().

select_fallback_rq() cannot be allowed to select !active cpus because
it is used to migrate user tasks away, and we do not want to move user
tasks onto cpus that are in transition.

Cc: Lai Jiangshan <la...@cn.fujitsu.com>
Cc: Jan H. Schönherr <jscho...@amazon.de>
Cc: Oleg Nesterov <o...@redhat.com>
Requested-by: Thomas Gleixner <t...@linutronix.de>
Tested-by: Thomas Gleixner <t...@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org>
---
 arch/powerpc/kernel/smp.c |    2 -
 arch/s390/kernel/smp.c    |    2 -
 include/linux/cpumask.h   |    6 +----
 kernel/sched/core.c       |   49 +++++++++++++++++++++++++++++++++++++++-------
 4 files changed, 46 insertions(+), 13 deletions(-)

--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -542,7 +542,7 @@ int __cpu_up(unsigned int cpu, struct ta
                smp_ops->give_timebase();
 
        /* Wait until cpu puts itself in the online & active maps */
-       while (!cpu_online(cpu) || !cpu_active(cpu))
+       while (!cpu_online(cpu))
                cpu_relax();
 
        return 0;
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -832,7 +832,7 @@ int __cpu_up(unsigned int cpu, struct ta
        pcpu_attach_task(pcpu, tidle);
        pcpu_start_fn(pcpu, smp_start_secondary, NULL);
        /* Wait until cpu puts itself in the online & active maps */
-       while (!cpu_online(cpu) || !cpu_active(cpu))
+       while (!cpu_online(cpu))
                cpu_relax();
        return 0;
 }
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -745,12 +745,10 @@ set_cpu_present(unsigned int cpu, bool p
 static inline void
 set_cpu_online(unsigned int cpu, bool online)
 {
-       if (online) {
+       if (online)
                cpumask_set_cpu(cpu, &__cpu_online_mask);
-               cpumask_set_cpu(cpu, &__cpu_active_mask);
-       } else {
+       else
                cpumask_clear_cpu(cpu, &__cpu_online_mask);
-       }
 }
 
 static inline void
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1205,13 +1205,21 @@ void do_set_cpus_allowed(struct task_str
 static int __set_cpus_allowed_ptr(struct task_struct *p,
                                  const struct cpumask *new_mask, bool check)
 {
+       const struct cpumask *cpu_valid_mask = cpu_active_mask;
+       unsigned int dest_cpu;
        unsigned long flags;
        struct rq *rq;
-       unsigned int dest_cpu;
        int ret = 0;
 
        rq = task_rq_lock(p, &flags);
 
+       if (p->flags & PF_KTHREAD) {
+               /*
+                * Kernel threads are allowed on online && !active CPUs
+                */
+               cpu_valid_mask = cpu_online_mask;
+       }
+
        /*
         * Must re-check here, to close a race against __kthread_bind(),
         * sched_setaffinity() is not guaranteed to observe the flag.
@@ -1224,18 +1232,28 @@ static int __set_cpus_allowed_ptr(struct
        if (cpumask_equal(&p->cpus_allowed, new_mask))
                goto out;
 
-       if (!cpumask_intersects(new_mask, cpu_active_mask)) {
+       if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
                ret = -EINVAL;
                goto out;
        }
 
        do_set_cpus_allowed(p, new_mask);
 
+       if (p->flags & PF_KTHREAD) {
+               /*
+                * For kernel threads that do indeed end up on online &&
+                * !active we want to ensure they are strict per-cpu threads.
+                */
+               WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
+                       !cpumask_intersects(new_mask, cpu_active_mask) &&
+                       p->nr_cpus_allowed != 1);
+       }
+
        /* Can the task run on the task's current CPU? If so, we're done */
        if (cpumask_test_cpu(task_cpu(p), new_mask))
                goto out;
 
-       dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
+       dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
        if (task_running(rq, p) || p->state == TASK_WAKING) {
                struct migration_arg arg = { p, dest_cpu };
                /* Need help from migration thread: drop lock and wait. */
@@ -1554,6 +1572,25 @@ EXPORT_SYMBOL_GPL(kick_process);
 
 /*
  * ->cpus_allowed is protected by both rq->lock and p->pi_lock
+ *
+ * A few notes on cpu_active vs cpu_online:
+ *
+ *  - cpu_active must be a subset of cpu_online
+ *
+ *  - on cpu-up we allow per-cpu kthreads on the online && !active cpu,
+ *    see __set_cpus_allowed_ptr(). At this point the newly online
+ *    cpu isn't yet part of the sched domains, and balancing will not
+ *    see it.
+ *
+ *  - on cpu-down we clear cpu_active() to mask the sched domains and
+ *    prevent the load balancer from placing new tasks on the
+ *    to-be-removed cpu. Existing tasks will remain running there and
+ *    will be taken off.
+ *
+ * This means that fallback selection must not select !active CPUs.
+ * And can assume that any active CPU must be online. Conversely
+ * select_task_rq() below may allow selection of !active CPUs in order
+ * to satisfy the above rules.
  */
 static int select_fallback_rq(int cpu, struct task_struct *p)
 {
@@ -1572,8 +1609,6 @@ static int select_fallback_rq(int cpu, s
 
                /* Look for allowed, online CPU in same node. */
                for_each_cpu(dest_cpu, nodemask) {
-                       if (!cpu_online(dest_cpu))
-                               continue;
                        if (!cpu_active(dest_cpu))
                                continue;
                        if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
@@ -1584,8 +1619,6 @@ static int select_fallback_rq(int cpu, s
        for (;;) {
                /* Any allowed, online CPU? */
                for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
-                       if (!cpu_online(dest_cpu))
-                               continue;
                        if (!cpu_active(dest_cpu))
                                continue;
                        goto out;
@@ -1637,6 +1670,8 @@ int select_task_rq(struct task_struct *p
 
        if (p->nr_cpus_allowed > 1)
                 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
+       else
+               cpu = cpumask_any(tsk_cpus_allowed(p));
 
        /*
         * In order not to call set_task_cpu() on a blocking task we need

Reply via email to