In preparation for migrate_disable(), make sure only per-cpu kthreads
are allowed to run on !active CPUs.
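
For reference, the test for "per-cpu kthread" used below is the existing
is_per_cpu_kthread() helper, which (roughly, as of this series) lives in
kernel/sched/sched.h and checks for a kthread affine to exactly one CPU:

	static inline bool is_per_cpu_kthread(struct task_struct *p)
	{
		if (!(p->flags & PF_KTHREAD))
			return false;

		if (p->nr_cpus_allowed != 1)
			return false;

		return true;
	}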

This is run (as one of the very first steps) from the cpu-hotplug
task, which is a per-cpu kthread, and completion of the hotplug
operation only requires such tasks.
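
For context, sched_cpu_deactivate() (which sets the new BALANCE_PUSH flag
below) is the teardown callback of the CPUHP_AP_ACTIVE state, registered
in kernel/cpu.c roughly as:

	[CPUHP_AP_ACTIVE] = {
		.name			= "sched:active",
		.startup.single		= sched_cpu_activate,
		.teardown.single	= sched_cpu_deactivate,
	},

so the push behaviour is armed early in the hotplug-down sequence, from
the cpuhp/%u thread running on the outgoing CPU.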

This constraint enables the migrate_disable() implementation to wait
for completion of all migrate_disable() regions on this CPU at hotplug
time without fear of any new ones starting.
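
As a sketch only (the migrate_disable() implementation itself comes
later in the series; rq->nr_pinned and rq->hotplug_wait are assumed
fields, not part of this patch), such a hotplug-time wait could look
like:

	/*
	 * Sketch, not part of this patch: once the CPU is !active only
	 * per-cpu kthreads run on it, so no new migrate_disable()
	 * sections can start and waiting for the assumed pinned count
	 * to drain is guaranteed to terminate.
	 */
	static void balance_hotplug_wait(void)
	{
		struct rq *rq = this_rq();

		rcuwait_wait_event(&rq->hotplug_wait, !rq->nr_pinned,
				   TASK_UNINTERRUPTIBLE);
	}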

This replaces the unlikely(rq->balance_callback) test at the tail of
context_switch() with an unlikely(rq->balance_flags) test; the fast
path is not affected.

Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org>
---
 kernel/sched/core.c  |  103 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 kernel/sched/sched.h |    5 ++
 2 files changed, 106 insertions(+), 2 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3513,8 +3513,10 @@ static inline struct callback_head *spli
        struct callback_head *head = rq->balance_callback;
 
        lockdep_assert_held(&rq->lock);
-       if (head)
+       if (head) {
                rq->balance_callback = NULL;
+               rq->balance_flags &= ~BALANCE_WORK;
+       }
 
        return head;
 }
@@ -3569,6 +3571,8 @@ prepare_lock_switch(struct rq *rq, struc
 #endif
 }
 
+static bool balance_push(struct rq *rq);
+
 static inline void finish_lock_switch(struct rq *rq)
 {
        /*
@@ -3577,7 +3581,16 @@ static inline void finish_lock_switch(st
         * prev into current:
         */
        spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
-       __balance_callbacks(rq);
+       if (unlikely(rq->balance_flags)) {
+               /*
+                * Run the balance_callbacks, except on hotplug
+                * when we need to push the current task away.
+                */
+               if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
+                   !(rq->balance_flags & BALANCE_PUSH) ||
+                   !balance_push(rq))
+                       __balance_callbacks(rq);
+       }
        raw_spin_unlock_irq(&rq->lock);
 }
 
@@ -6836,6 +6849,87 @@ static void migrate_tasks(struct rq *dea
 
        rq->stop = stop;
 }
+
+static int __balance_push_stop(void *arg)
+{
+       struct task_struct *p = arg;
+       struct rq *rq = this_rq();
+       struct rq_flags rf;
+       int cpu;
+
+       raw_spin_lock_irq(&p->pi_lock);
+       rq_lock(rq, &rf);
+
+       if (task_rq(p) == rq && task_on_rq_queued(p)) {
+               cpu = select_fallback_rq(rq->cpu, p);
+               rq = __migrate_task(rq, &rf, p, cpu);
+       }
+
+       rq_unlock(rq, &rf);
+       raw_spin_unlock_irq(&p->pi_lock);
+
+       put_task_struct(p);
+
+       return 0;
+}
+
+static DEFINE_PER_CPU(struct cpu_stop_work, push_work);
+
+/*
+ * Ensure we only run per-cpu kthreads once the CPU goes !active.
+ */
+static bool balance_push(struct rq *rq)
+{
+       struct task_struct *push_task = rq->curr;
+
+       lockdep_assert_held(&rq->lock);
+       SCHED_WARN_ON(rq->cpu != smp_processor_id());
+
+       /*
+        * Both the cpu-hotplug and stop task are in this class and are
+        * required to complete the hotplug process.
+        */
+       if (is_per_cpu_kthread(push_task))
+               return false;
+
+       get_task_struct(push_task);
+       /*
+        * Temporarily drop rq->lock such that we can wake-up the stop task.
+        * Both preemption and IRQs are still disabled.
+        */
+       raw_spin_unlock(&rq->lock);
+       stop_one_cpu_nowait(rq->cpu, __balance_push_stop, push_task,
+                           this_cpu_ptr(&push_work));
+       /*
+        * At this point need_resched() is true and we'll take the loop in
+        * schedule(). The next pick is obviously going to be the stop task
+        * which is_per_cpu_kthread() and will push this task away.
+        */
+       raw_spin_lock(&rq->lock);
+
+       return true;
+}
+
+static void balance_push_set(int cpu, bool on)
+{
+       struct rq *rq = cpu_rq(cpu);
+       struct rq_flags rf;
+
+       rq_lock_irqsave(rq, &rf);
+       if (on)
+               rq->balance_flags |= BALANCE_PUSH;
+       else
+               rq->balance_flags &= ~BALANCE_PUSH;
+       rq_unlock_irqrestore(rq, &rf);
+}
+
+#else
+
+static inline bool balance_push(struct rq *rq)
+{
+       return false;
+}
+
 #endif /* CONFIG_HOTPLUG_CPU */
 
 void set_rq_online(struct rq *rq)
@@ -6921,6 +7015,8 @@ int sched_cpu_activate(unsigned int cpu)
        struct rq *rq = cpu_rq(cpu);
        struct rq_flags rf;
 
+       balance_push_set(cpu, false);
+
 #ifdef CONFIG_SCHED_SMT
        /*
         * When going up, increment the number of cores with SMT present.
@@ -6968,6 +7064,8 @@ int sched_cpu_deactivate(unsigned int cp
         */
        synchronize_rcu();
 
+       balance_push_set(cpu, true);
+
 #ifdef CONFIG_SCHED_SMT
        /*
         * When going down, decrement the number of cores with SMT present.
@@ -6981,6 +7079,7 @@ int sched_cpu_deactivate(unsigned int cp
 
        ret = cpuset_cpu_inactive(cpu);
        if (ret) {
+               balance_push_set(cpu, false);
                set_cpu_active(cpu, true);
                return ret;
        }
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -973,6 +973,7 @@ struct rq {
        unsigned long           cpu_capacity_orig;
 
        struct callback_head    *balance_callback;
+       unsigned char           balance_flags;
 
        unsigned char           nohz_idle_balance;
        unsigned char           idle_balance;
@@ -1384,6 +1385,9 @@ init_numa_balancing(unsigned long clone_
 
 #ifdef CONFIG_SMP
 
+#define BALANCE_WORK   0x01
+#define BALANCE_PUSH   0x02
+
 static inline void
 queue_balance_callback(struct rq *rq,
                       struct callback_head *head,
@@ -1397,6 +1401,7 @@ queue_balance_callback(struct rq *rq,
        head->func = (void (*)(struct callback_head *))func;
        head->next = rq->balance_callback;
        rq->balance_callback = head;
+       rq->balance_flags |= BALANCE_WORK;
 }
 
 #define rcu_dereference_check_sched_domain(p) \

