On Tue, May 01, 2018 at 12:18:45PM +0200, Peter Zijlstra wrote:
> Aaaah... I think I've spotted a problem there. We clear SHOULD_PARK
> before we rebind, so if the thread lost the first PARKED store,
> does the completion, gets migrated, cycles through the loop and now
> observes !SHOULD_PARK and bails the wait-loop, then __kthread_bind()
> will forever wait.


Something like so perhaps...

--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -451,6 +451,21 @@ void kthread_unpark(struct task_struct *
 {
        struct kthread *kthread = to_kthread(k);
 
+       if (test_bit(KTHREAD_IS_PARKED)) {
+               /*
+                * Newly created kthread was parked when the CPU was offline.
+                * The binding was lost and we need to set it again.
+                */
+               if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
+                       __kthread_bind(k, kthread->cpu, TASK_PARKED);
+       }
+
+       /*
+        * Ensures the IS_PARKED load precedes the !SHOULD_PARK store.
+        * matched by the smp_mb() from test_and_set_bit() in 
__kthread_parkme().
+        */
+       smp_mb__before_atomic();
+
        clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
        /*
         * We clear the IS_PARKED bit here as we don't wait
@@ -458,15 +473,8 @@ void kthread_unpark(struct task_struct *
         * park before that happens we'd see the IS_PARKED bit
         * which might be about to be cleared.
         */
-       if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
-               /*
-                * Newly created kthread was parked when the CPU was offline.
-                * The binding was lost and we need to set it again.
-                */
-               if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
-                       __kthread_bind(k, kthread->cpu, TASK_PARKED);
+       if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags))
                wake_up_state(k, TASK_PARKED);
-       }
 }
 EXPORT_SYMBOL_GPL(kthread_unpark);
 

Reply via email to