This lock is itself a source of hotplug deadlocks for RT:

1. kernfs_mutex is taken during hotplug, so any path other than hotplug
meeting hotplug_lock() deadlocks us (the inversion is sketched below).

2. notifiers depend upon RCU GP threads; those threads meeting hotplug_lock()
likewise deadlock us.
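
To make 1. concrete, here is a minimal userspace sketch of that AB-BA shape,
with plain pthread mutexes merely standing in for kernfs_mutex and the per-cpu
hotplug lock; the real kernel call chains are more involved than shown, and
trylock is used so the demo reports the inversion instead of hanging:

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t kernfs_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t hotplug_lock = PTHREAD_MUTEX_INITIALIZER;

/* hotplug: holds the hotplug lock, then takes kernfs_mutex */
static void *hotplug_path(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&hotplug_lock);
	sleep(1);			/* let the other path get kernfs_mutex */
	if (pthread_mutex_trylock(&kernfs_mutex))
		puts("hotplug blocked on kernfs_mutex");
	else
		pthread_mutex_unlock(&kernfs_mutex);
	pthread_mutex_unlock(&hotplug_lock);
	return NULL;
}

/* any other path: holds kernfs_mutex, then meets hotplug_lock() */
static void *other_path(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&kernfs_mutex);
	sleep(1);			/* let hotplug get its lock */
	if (pthread_mutex_trylock(&hotplug_lock))
		puts("other path blocked on hotplug_lock => AB-BA deadlock");
	else
		pthread_mutex_unlock(&hotplug_lock);
	pthread_mutex_unlock(&kernfs_mutex);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, hotplug_path, NULL);
	pthread_create(&b, NULL, other_path, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}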

Remove it.  Start migration immediately, and do not stop migrating until the
cpu is down.  Have the caller poll ->refcount before actually taking it down.
If someone manages to sneak in before we wake the migration thread,
take_cpu_down() returns -EBUSY, and the caller polls once more before retrying.
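
For illustration, a minimal userspace model of that scheme.  The names mirror
the patch, but migrate_me()/cpu_chill()/stop_machine() are only hinted at in
comments, and C11 atomics replace preempt disabling and per-cpu data; this is
a sketch of the control flow, not the kernel code:

#include <errno.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static atomic_int refcount;	/* tasks currently in pinned sections */
static atomic_bool migrate;	/* set once unplug begins, cleared when done */

static void pin_current_cpu(void)
{
	if (atomic_load(&migrate)) {
		/* kernel: migrate_me() tries to push us off the dying CPU */
	}
	atomic_fetch_add(&refcount, 1);
}

static void unpin_current_cpu(void)
{
	atomic_fetch_sub(&refcount, 1);
}

/* take_cpu_down(): too bad, some task snuck in on the way here */
static int take_cpu_down(void)
{
	if (atomic_load(&refcount))
		return -EBUSY;
	/* ... __cpu_disable() and friends would run here ... */
	return 0;
}

static int takedown_cpu(void)
{
	int err, once = 0;

	atomic_store(&migrate, true);		/* cpu_unplug_begin() */
again:
	/* wait_for_pinned_cpus(): poll until late references drain */
	while (atomic_load(&refcount))
		usleep(1000);			/* stands in for cpu_chill() */

	err = take_cpu_down();			/* kernel: via stop_machine() */
	if (err == -EBUSY && !once++)
		goto again;			/* poll one more time */

	atomic_store(&migrate, false);		/* cpu_unplug_done() */
	return err;
}

int main(void)
{
	pin_current_cpu();
	/* ... work in a pinned section ... */
	unpin_current_cpu();

	printf("takedown_cpu() -> %d\n", takedown_cpu());
	return 0;
}

The point of the -EBUSY path is that polling ->refcount before stop_machine()
is inherently racy; take_cpu_down() re-checks once interrupts are stopped, and
the caller simply polls again instead of serializing against a lock.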

Signed-off-by: Mike Galbraith <umgwanakikb...@gmail.com>
---
 kernel/cpu.c |  267 ++++++++++-------------------------------------------------
 1 file changed, 47 insertions(+), 220 deletions(-)

--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -23,6 +23,7 @@
 #include <linux/tick.h>
 #include <linux/irq.h>
 #include <linux/smpboot.h>
+#include <linux/delay.h>
 
 #include <trace/events/power.h>
 #define CREATE_TRACE_POINTS
@@ -166,49 +167,14 @@ static struct {
 
 /**
  * hotplug_pcp - per cpu hotplug descriptor
- * @unplug:    set when pin_current_cpu() needs to sync tasks
- * @sync_tsk:  the task that waits for tasks to finish pinned sections
  * @refcount:  counter of tasks in pinned sections
- * @grab_lock: set when the tasks entering pinned sections should wait
- * @synced:    notifier for @sync_tsk to tell cpu_down it's finished
- * @mutex:     the mutex to make tasks wait (used when @grab_lock is true)
- * @mutex_init:        zero if the mutex hasn't been initialized yet.
- *
- * Although @unplug and @sync_tsk may point to the same task, the @unplug
- * is used as a flag and still exists after @sync_tsk has exited and
- * @sync_tsk set to NULL.
+ * @migrate:   set when the tasks entering/leaving pinned sections should migrate
  */
 struct hotplug_pcp {
-       struct task_struct *unplug;
-       struct task_struct *sync_tsk;
        int refcount;
-       int grab_lock;
-       struct completion synced;
-       struct completion unplug_wait;
-#ifdef CONFIG_PREEMPT_RT_FULL
-       /*
-        * Note, on PREEMPT_RT, the hotplug lock must save the state of
-        * the task, otherwise the mutex will cause the task to fail
-        * to sleep when required. (Because it's called from migrate_disable())
-        *
-        * The spinlock_t on PREEMPT_RT is a mutex that saves the task's
-        * state.
-        */
-       spinlock_t lock;
-#else
-       struct mutex mutex;
-#endif
-       int mutex_init;
+       int migrate;
 };
 
-#ifdef CONFIG_PREEMPT_RT_FULL
-# define hotplug_lock(hp) rt_spin_lock__no_mg(&(hp)->lock)
-# define hotplug_unlock(hp) rt_spin_unlock__no_mg(&(hp)->lock)
-#else
-# define hotplug_lock(hp) mutex_lock(&(hp)->mutex)
-# define hotplug_unlock(hp) mutex_unlock(&(hp)->mutex)
-#endif
-
 static DEFINE_PER_CPU(struct hotplug_pcp, hotplug_pcp);
 
 /**
@@ -221,42 +187,20 @@ static DEFINE_PER_CPU(struct hotplug_pcp
  */
 void pin_current_cpu(void)
 {
-       struct hotplug_pcp *hp;
-       int force = 0;
-
-retry:
-       hp = this_cpu_ptr(&hotplug_pcp);
+       struct hotplug_pcp *hp = this_cpu_ptr(&hotplug_pcp);
 
-       if (!hp->unplug || hp->refcount || force || preempt_count() > 1 ||
-           hp->unplug == current) {
+       if (!hp->migrate) {
                hp->refcount++;
                return;
        }
-       if (hp->grab_lock) {
-               preempt_enable();
-               hotplug_lock(hp);
-               hotplug_unlock(hp);
-       } else {
-               preempt_enable();
-               /*
-                * Try to push this task off of this CPU.
-                */
-               if (!migrate_me()) {
-                       preempt_disable();
-                       hp = this_cpu_ptr(&hotplug_pcp);
-                       if (!hp->grab_lock) {
-                               /*
-                                * Just let it continue it's already pinned
-                                * or about to sleep.
-                                */
-                               force = 1;
-                               goto retry;
-                       }
-                       preempt_enable();
-               }
-       }
+
+       /*
+        * Try to push this task off of this CPU.
+        */
+       preempt_enable_no_resched();
+       migrate_me();
        preempt_disable();
-       goto retry;
+       this_cpu_ptr(&hotplug_pcp)->refcount++;
 }
 
 /**
@@ -268,182 +212,54 @@ void unpin_current_cpu(void)
 {
        struct hotplug_pcp *hp = this_cpu_ptr(&hotplug_pcp);
 
-       WARN_ON(hp->refcount <= 0);
-
-       /* This is safe. sync_unplug_thread is pinned to this cpu */
-       if (!--hp->refcount && hp->unplug && hp->unplug != current)
-               wake_up_process(hp->unplug);
-}
-
-static void wait_for_pinned_cpus(struct hotplug_pcp *hp)
-{
-       set_current_state(TASK_UNINTERRUPTIBLE);
-       while (hp->refcount) {
-               schedule_preempt_disabled();
-               set_current_state(TASK_UNINTERRUPTIBLE);
-       }
-}
-
-static int sync_unplug_thread(void *data)
-{
-       struct hotplug_pcp *hp = data;
-
-       wait_for_completion(&hp->unplug_wait);
-       preempt_disable();
-       hp->unplug = current;
-       wait_for_pinned_cpus(hp);
-
-       /*
-        * This thread will synchronize the cpu_down() with threads
-        * that have pinned the CPU. When the pinned CPU count reaches
-        * zero, we inform the cpu_down code to continue to the next step.
-        */
-       set_current_state(TASK_UNINTERRUPTIBLE);
-       preempt_enable();
-       complete(&hp->synced);
-
-       /*
-        * If all succeeds, the next step will need tasks to wait till
-        * the CPU is offline before continuing. To do this, the grab_lock
-        * is set and tasks going into pin_current_cpu() will block on the
-        * mutex. But we still need to wait for those that are already in
-        * pinned CPU sections. If the cpu_down() failed, the kthread_should_stop()
-        * will kick this thread out.
-        */
-       while (!hp->grab_lock && !kthread_should_stop()) {
-               schedule();
-               set_current_state(TASK_UNINTERRUPTIBLE);
-       }
-
-       /* Make sure grab_lock is seen before we see a stale completion */
-       smp_mb();
-
-       /*
-        * Now just before cpu_down() enters stop machine, we need to make
-        * sure all tasks that are in pinned CPU sections are out, and new
-        * tasks will now grab the lock, keeping them from entering pinned
-        * CPU sections.
-        */
-       if (!kthread_should_stop()) {
-               preempt_disable();
-               wait_for_pinned_cpus(hp);
-               preempt_enable();
-               complete(&hp->synced);
-       }
+       WARN_ON(hp->refcount-- <= 0);
 
-       set_current_state(TASK_UNINTERRUPTIBLE);
-       while (!kthread_should_stop()) {
-               schedule();
-               set_current_state(TASK_UNINTERRUPTIBLE);
-       }
-       set_current_state(TASK_RUNNING);
+       if (!hp->migrate)
+               return;
 
        /*
-        * Force this thread off this CPU as it's going down and
-        * we don't want any more work on this CPU.
+        * Try to push this task off of this CPU.
         */
-       current->flags &= ~PF_NO_SETAFFINITY;
-       set_cpus_allowed_ptr(current, cpu_present_mask);
+       preempt_enable_no_resched();
        migrate_me();
-       return 0;
-}
-
-static void __cpu_unplug_sync(struct hotplug_pcp *hp)
-{
-       wake_up_process(hp->sync_tsk);
-       wait_for_completion(&hp->synced);
+       preempt_disable();
 }
 
-static void __cpu_unplug_wait(unsigned int cpu)
+static void wait_for_pinned_cpus(struct hotplug_pcp *hp)
 {
-       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
-
-       complete(&hp->unplug_wait);
-       wait_for_completion(&hp->synced);
+       while (hp->refcount) {
+               trace_printk("CHILL\n");
+               cpu_chill();
+       }
 }
 
 /*
- * Start the sync_unplug_thread on the target cpu and wait for it to
- * complete.
+ * Start migration of pinned tasks on the target cpu.
  */
 static int cpu_unplug_begin(unsigned int cpu)
 {
        struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
-       int err;
-
-       /* Protected by cpu_hotplug.lock */
-       if (!hp->mutex_init) {
-#ifdef CONFIG_PREEMPT_RT_FULL
-               spin_lock_init(&hp->lock);
-#else
-               mutex_init(&hp->mutex);
-#endif
-               hp->mutex_init = 1;
-       }
 
        /* Inform the scheduler to migrate tasks off this CPU */
        tell_sched_cpu_down_begin(cpu);
+       hp->migrate = 1;
 
-       init_completion(&hp->synced);
-       init_completion(&hp->unplug_wait);
-
-       hp->sync_tsk = kthread_create(sync_unplug_thread, hp, "sync_unplug/%d", cpu);
-       if (IS_ERR(hp->sync_tsk)) {
-               err = PTR_ERR(hp->sync_tsk);
-               hp->sync_tsk = NULL;
-               return err;
-       }
-       kthread_bind(hp->sync_tsk, cpu);
+       /* Let all tasks know cpu unplug is starting */
+       smp_rmb();
 
-       /*
-        * Wait for tasks to get out of the pinned sections,
-        * it's still OK if new tasks enter. Some CPU notifiers will
-        * wait for tasks that are going to enter these sections and
-        * we must not have them block.
-        */
-       wake_up_process(hp->sync_tsk);
        return 0;
 }
 
-static void cpu_unplug_sync(unsigned int cpu)
-{
-       struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
-
-       init_completion(&hp->synced);
-       /* The completion needs to be initialzied before setting grab_lock */
-       smp_wmb();
-
-       /* Grab the mutex before setting grab_lock */
-       hotplug_lock(hp);
-       hp->grab_lock = 1;
-
-       /*
-        * The CPU notifiers have been completed.
-        * Wait for tasks to get out of pinned CPU sections and have new
-        * tasks block until the CPU is completely down.
-        */
-       __cpu_unplug_sync(hp);
-
-       /* All done with the sync thread */
-       kthread_stop(hp->sync_tsk);
-       hp->sync_tsk = NULL;
-}
-
 static void cpu_unplug_done(unsigned int cpu)
 {
        struct hotplug_pcp *hp = &per_cpu(hotplug_pcp, cpu);
 
-       hp->unplug = NULL;
-       /* Let all tasks know cpu unplug is finished before cleaning up */
+       /* Let all tasks know cpu unplug is finished */
        smp_wmb();
 
-       if (hp->sync_tsk)
-               kthread_stop(hp->sync_tsk);
-
-       if (hp->grab_lock) {
-               hotplug_unlock(hp);
+       if (hp->migrate) {
                /* protected by cpu_hotplug.lock */
-               hp->grab_lock = 0;
+               hp->migrate = 0;
        }
        tell_sched_cpu_down_done(cpu);
 }
@@ -951,6 +767,10 @@ static int take_cpu_down(void *_param)
        enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE);
        int err, cpu = smp_processor_id();
 
+       /* RT: too bad, some task snuck in on the way here */
+       if (this_cpu_ptr(&hotplug_pcp)->refcount)
+               return -EBUSY;
+
        /* Ensure this CPU doesn't handle any more interrupts. */
        err = __cpu_disable();
        if (err < 0)
@@ -972,7 +792,7 @@ static int take_cpu_down(void *_param)
 static int takedown_cpu(unsigned int cpu)
 {
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
-       int err;
+       int err, once = 0;
 
        /*
         * By now we've cleared cpu_active_mask, wait for all preempt-disabled
@@ -989,25 +809,32 @@ static int takedown_cpu(unsigned int cpu
        else
                synchronize_rcu();
 
-       __cpu_unplug_wait(cpu);
-
        /* Park the smpboot threads */
        kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread);
        smpboot_park_threads(cpu);
 
-       /* Notifiers are done. Don't let any more tasks pin this CPU. */
-       cpu_unplug_sync(cpu);
-
        /*
         * Prevent irq alloc/free while the dying cpu reorganizes the
         * interrupt affinities.
         */
        irq_lock_sparse();
 
+again:
+       /* Notifiers are done.  Check for late references */
+       wait_for_pinned_cpus(&per_cpu(hotplug_pcp, cpu));
+
        /*
         * So now all preempt/rcu users must observe !cpu_active().
         */
        err = stop_machine(take_cpu_down, NULL, cpumask_of(cpu));
+       if (err == -EBUSY) {
+               if (!once) {
+                       trace_printk("BUSY, trying again\n");
+                       once++;
+                       goto again;
+               }
+               trace_printk("CRAP, still busy.  Deal with it caller\n");
+       }
        if (err) {
                /* CPU didn't die: tell everyone.  Can't complain. */
                cpu_notify_nofail(CPU_DOWN_FAILED, cpu);

Reply via email to