On Wed, Feb 12, 2014 at 05:40:12PM -0800, Andy Lutomirski wrote:
> This is a strawman proposal to simplify the idle implementation, eliminate
> a race
> 
> Benefits over current code:
>  - ttwu_queue_remote doesn't use an IPI unless needed
>  - The diffstat should speak for itself :)
>  - Less racy.  Spurious IPIs are possible, but only in narrow windows or
>    when two wakeups occur in rapid succession.
>  - Seems to work (?)
> 
> Issues:
>  - Am I doing the percpu stuff right?
>  - Needs work on non-x86 architectures
>  - The !CONFIG_SMP case needs to be checked
>  - Is "idlepoll" a good name for the new code?  It doesn't have *that*
>    much to do with the idle state.  Maybe cpukick?
> 
> If this turns out okay, TIF_NEED_RESCHED could possibly be deleted as well.

No, we can't do away with that; it's used in some fairly critical paths
(return to userspace), and adding a second cacheline load there would be
unfortunate.

I also don't really like how the polling state is an atomic; it's a
CPU-local property.

Now, given that we can't get rid of TIF_NEED_RESCHED and we need an atomic
op on a remote cacheline anyhow, the simplest solution would be to convert
all TS_POLLING users to TIF_POLLING_NRFLAG and use an atomic_or_return()-like
construct to do:

  atomic_or_return(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG

and avoid the IPI when that is true, i.e. when the remote CPU was polling
and will notice NEED_RESCHED on its own.
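
As a rough sketch of the idea (the helper name below is made up; the
cmpxchg() loop is what the patch open-codes in __resched_cpu()):

  /*
   * Hypothetical helper: atomically OR @mask into ti->flags and return
   * the resulting value so the caller can inspect the other bits.
   */
  static inline unsigned long ti_flags_or_return(struct thread_info *ti,
                                                 unsigned long mask)
  {
          unsigned long old, val = ti->flags;

          for (;;) {
                  old = cmpxchg(&ti->flags, val, val | mask);
                  if (old == val)
                          break;
                  val = old;
          }
          return val | mask;
  }

  /* wakeup path: only kick the remote CPU when it isn't polling */
  if (!(ti_flags_or_return(ti, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG))
          smp_send_reschedule(cpu);

That makes the NEED_RESCHED store and the polling test a single atomic op
on the remote cacheline: a polling CPU sees the flag through its monitor,
everybody else still gets the IPI.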

Something a little like the patch below; it does require a lot of auditing,
but it boots on my x86_64.

---
 arch/x86/include/asm/mwait.h       | 19 ++++----
 arch/x86/include/asm/thread_info.h |  4 +-
 arch/x86/kernel/process.c          |  8 ++--
 include/linux/sched.h              | 90 +++-----------------------------------
 kernel/sched/core.c                | 70 ++++++++++++++++++-----------
 kernel/sched/idle.c                | 40 ++++++++---------
 kernel/sched/sched.h               |  3 ++
 7 files changed, 88 insertions(+), 146 deletions(-)

diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index 1da25a5f96f9..d6a7b7cdc478 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -42,18 +42,15 @@ static inline void __mwait(unsigned long eax, unsigned long ecx)
  */
 static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
 {
-       if (!current_set_polling_and_test()) {
-               if (static_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) {
-                       mb();
-                       clflush((void *)&current_thread_info()->flags);
-                       mb();
-               }
-
-               __monitor((void *)&current_thread_info()->flags, 0, 0);
-               if (!need_resched())
-                       __mwait(eax, ecx);
+       if (static_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) {
+               mb();
+               clflush((void *)&current_thread_info()->flags);
+               mb();
        }
-       current_clr_polling();
+
+       __monitor((void *)&current_thread_info()->flags, 0, 0);
+       if (!need_resched())
+               __mwait(eax, ecx);
 }
 
 #endif /* _ASM_X86_MWAIT_H */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e1940c06ed02..1f2fe575cfca 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -88,6 +88,7 @@ struct thread_info {
 #define TIF_FORK               18      /* ret_from_fork */
 #define TIF_NOHZ               19      /* in adaptive nohz mode */
 #define TIF_MEMDIE             20      /* is terminating due to OOM killer */
+#define TIF_POLLING_NRFLAG     21
 #define TIF_IO_BITMAP          22      /* uses I/O bitmap */
 #define TIF_FORCED_TF          24      /* true if TF in eflags artificially */
 #define TIF_BLOCKSTEP          25      /* set when we want DEBUGCTLMSR_BTF */
@@ -111,6 +112,7 @@ struct thread_info {
 #define _TIF_IA32              (1 << TIF_IA32)
 #define _TIF_FORK              (1 << TIF_FORK)
 #define _TIF_NOHZ              (1 << TIF_NOHZ)
+#define _TIF_POLLING_NRFLAG    (1 << TIF_POLLING_NRFLAG)
 #define _TIF_IO_BITMAP         (1 << TIF_IO_BITMAP)
 #define _TIF_FORCED_TF         (1 << TIF_FORCED_TF)
 #define _TIF_BLOCKSTEP         (1 << TIF_BLOCKSTEP)
@@ -234,8 +236,6 @@ static inline struct thread_info *current_thread_info(void)
  * have to worry about atomic accesses.
  */
 #define TS_COMPAT              0x0002  /* 32bit syscall active (64BIT)*/
-#define TS_POLLING             0x0004  /* idle task polling need_resched,
-                                          skip sending interrupt */
 #define TS_RESTORE_SIGMASK     0x0008  /* restore signal mask in do_signal() */
 
 #ifndef __ASSEMBLY__
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 4505e2a950d8..298c002629c6 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -306,9 +306,11 @@ void arch_cpu_idle(void)
  */
 void default_idle(void)
 {
-       trace_cpu_idle_rcuidle(1, smp_processor_id());
-       safe_halt();
-       trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
+       if (!current_clr_polling_and_test()) {
+               trace_cpu_idle_rcuidle(1, smp_processor_id());
+               safe_halt();
+               trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
+       }
 }
 #ifdef CONFIG_APM_MODULE
 EXPORT_SYMBOL(default_idle);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c49a2585ff7d..b326abe95978 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2692,85 +2692,26 @@ static inline int spin_needbreak(spinlock_t *lock)
  * thread_info.status and one based on TIF_POLLING_NRFLAG in
  * thread_info.flags
  */
-#ifdef TS_POLLING
-static inline int tsk_is_polling(struct task_struct *p)
-{
-       return task_thread_info(p)->status & TS_POLLING;
-}
-static inline void __current_set_polling(void)
-{
-       current_thread_info()->status |= TS_POLLING;
-}
-
-static inline bool __must_check current_set_polling_and_test(void)
-{
-       __current_set_polling();
-
-       /*
-        * Polling state must be visible before we test NEED_RESCHED,
-        * paired by resched_task()
-        */
-       smp_mb();
-
-       return unlikely(tif_need_resched());
-}
-
-static inline void __current_clr_polling(void)
-{
-       current_thread_info()->status &= ~TS_POLLING;
-}
-
-static inline bool __must_check current_clr_polling_and_test(void)
-{
-       __current_clr_polling();
-
-       /*
-        * Polling state must be visible before we test NEED_RESCHED,
-        * paired by resched_task()
-        */
-       smp_mb();
-
-       return unlikely(tif_need_resched());
-}
-#elif defined(TIF_POLLING_NRFLAG)
+#ifdef CONFIG_SMP
 static inline int tsk_is_polling(struct task_struct *p)
 {
        return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
 }
 
-static inline void __current_set_polling(void)
+static inline void current_set_polling(void)
 {
        set_thread_flag(TIF_POLLING_NRFLAG);
 }
 
-static inline bool __must_check current_set_polling_and_test(void)
-{
-       __current_set_polling();
-
-       /*
-        * Polling state must be visible before we test NEED_RESCHED,
-        * paired by resched_task()
-        *
-        * XXX: assumes set/clear bit are identical barrier wise.
-        */
-       smp_mb__after_clear_bit();
-
-       return unlikely(tif_need_resched());
-}
-
-static inline void __current_clr_polling(void)
+static inline void current_clr_polling(void)
 {
        clear_thread_flag(TIF_POLLING_NRFLAG);
 }
 
 static inline bool __must_check current_clr_polling_and_test(void)
 {
-       __current_clr_polling();
+       current_clr_polling();
 
-       /*
-        * Polling state must be visible before we test NEED_RESCHED,
-        * paired by resched_task()
-        */
        smp_mb__after_clear_bit();
 
        return unlikely(tif_need_resched());
@@ -2778,34 +2719,15 @@ static inline bool __must_check current_clr_polling_and_test(void)
 
 #else
 static inline int tsk_is_polling(struct task_struct *p) { return 0; }
-static inline void __current_set_polling(void) { }
-static inline void __current_clr_polling(void) { }
+static inline void current_set_polling(void) { }
+static inline void current_clr_polling(void) { }
 
-static inline bool __must_check current_set_polling_and_test(void)
-{
-       return unlikely(tif_need_resched());
-}
 static inline bool __must_check current_clr_polling_and_test(void)
 {
        return unlikely(tif_need_resched());
 }
 #endif
 
-static inline void current_clr_polling(void)
-{
-       __current_clr_polling();
-
-       /*
-        * Ensure we check TIF_NEED_RESCHED after we clear the polling bit.
-        * Once the bit is cleared, we'll get IPIs with every new
-        * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also
-        * fold.
-        */
-       smp_mb(); /* paired with resched_task() */
-
-       preempt_fold_need_resched();
-}
-
 static __always_inline bool need_resched(void)
 {
        return unlikely(tif_need_resched());
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fb9764fbc537..19de9a1cc96c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -504,45 +504,63 @@ static inline void init_hrtick(void)
 }
 #endif /* CONFIG_SCHED_HRTICK */
 
-/*
- * resched_task - mark a task 'to be rescheduled now'.
- *
- * On UP this means the setting of the need_resched flag, on SMP it
- * might also involve a cross-CPU call to trigger the scheduler on
- * the target CPU.
- */
-void resched_task(struct task_struct *p)
+static void __resched_cpu(int cpu)
 {
-       int cpu;
+       struct rq *rq = cpu_rq(cpu);
+       struct task_struct *p;
+       struct thread_info *ti;
+       unsigned long val, old, new;
+       bool ipi;
 
-       lockdep_assert_held(&task_rq(p)->lock);
+       rcu_read_lock();
+       do {
+               p = ACCESS_ONCE(rq->curr);
+               ti = task_thread_info(p);
+
+               val = ti->flags;
+               for (;;) {
+                       new = val | _TIF_NEED_RESCHED;
+                       old = cmpxchg(&ti->flags, val, new);
+                       if (old == val)
+                               break;
+                       val = old;
+               }
 
-       if (test_tsk_need_resched(p))
-               return;
+               ipi = !(old & _TIF_POLLING_NRFLAG);
 
-       set_tsk_need_resched(p);
+       } while (p != ACCESS_ONCE(rq->curr));
+       rcu_read_unlock();
 
-       cpu = task_cpu(p);
+       if (ipi)
+               smp_send_reschedule(cpu);
+}
+
+void resched_cpu(int cpu)
+{
        if (cpu == smp_processor_id()) {
+               set_tsk_need_resched(current);
                set_preempt_need_resched();
                return;
        }
 
-       /* NEED_RESCHED must be visible before we test polling */
-       smp_mb();
-       if (!tsk_is_polling(p))
-               smp_send_reschedule(cpu);
+       __resched_cpu(cpu);
 }
 
-void resched_cpu(int cpu)
+/*
+ * resched_task - mark a task 'to be rescheduled now'.
+ *
+ * On UP this means the setting of the need_resched flag, on SMP it
+ * might also involve a cross-CPU call to trigger the scheduler on
+ * the target CPU.
+ */
+void resched_task(struct task_struct *p)
 {
-       struct rq *rq = cpu_rq(cpu);
-       unsigned long flags;
+       lockdep_assert_held(&task_rq(p)->lock);
 
-       if (!raw_spin_trylock_irqsave(&rq->lock, flags))
+       if (test_tsk_need_resched(p))
                return;
-       resched_task(cpu_curr(cpu));
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+       resched_cpu(task_cpu(p));
 }
 
 #ifdef CONFIG_SMP
@@ -1476,7 +1494,7 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
 }
 
 #ifdef CONFIG_SMP
-static void sched_ttwu_pending(void)
+void sched_ttwu_pending(void)
 {
        struct rq *rq = this_rq();
        struct llist_node *llist = llist_del_all(&rq->wake_list);
@@ -2689,7 +2707,7 @@ static void __sched __schedule(void)
                update_rq_clock(rq);
 
        next = pick_next_task(rq, prev);
-       clear_tsk_need_resched(prev);
+       clear_tsk_need_resched(next);
        clear_preempt_need_resched();
        rq->skip_clock_update = 0;
 
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 14ca43430aee..03748737473e 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -11,6 +11,7 @@
 #include <asm/tlb.h>
 
 #include <trace/events/power.h>
+#include "sched.h"
 
 static int __read_mostly cpu_idle_force_poll;
 
@@ -71,6 +72,8 @@ static void cpu_idle_loop(void)
        while (1) {
                tick_nohz_idle_enter();
 
+               current_set_polling();
+
                while (!need_resched()) {
                        check_pgt_cache();
                        rmb();
@@ -93,29 +96,27 @@ static void cpu_idle_loop(void)
                        if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
                                cpu_idle_poll();
                        } else {
-                               if (!current_clr_polling_and_test()) {
-                                       stop_critical_timings();
-                                       rcu_idle_enter();
-                                       if (cpuidle_idle_call())
-                                               arch_cpu_idle();
-                                       if (WARN_ON_ONCE(irqs_disabled()))
-                                               local_irq_enable();
-                                       rcu_idle_exit();
-                                       start_critical_timings();
-                               } else {
+                               stop_critical_timings();
+                               rcu_idle_enter();
+                               if (cpuidle_idle_call())
+                                       arch_cpu_idle();
+                               if (WARN_ON_ONCE(irqs_disabled()))
                                        local_irq_enable();
-                               }
-                               __current_set_polling();
+                               rcu_idle_exit();
+                               start_critical_timings();
                        }
                        arch_cpu_idle_exit();
-                       /*
-                        * We need to test and propagate the TIF_NEED_RESCHED
-                        * bit here because we might not have send the
-                        * reschedule IPI to idle tasks.
-                        */
-                       if (tif_need_resched())
-                               set_preempt_need_resched();
                }
+
+               current_clr_polling();
+
+               /*
+                * We need to test and propagate the TIF_NEED_RESCHED bit here
+                * because we might not have send the reschedule IPI to idle
+                * tasks.
+                */
+               set_preempt_need_resched();
+               sched_ttwu_pending();
                tick_nohz_idle_exit();
                schedule_preempt_disabled();
        }
@@ -138,7 +139,6 @@ void cpu_startup_entry(enum cpuhp_state state)
         */
        boot_init_stack_canary();
 #endif
-       __current_set_polling();
        arch_cpu_idle_prepare();
        cpu_idle_loop();
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1bf34c257d3b..269b1a4e0bdf 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1160,6 +1160,7 @@ extern const struct sched_class idle_sched_class;
 
 #ifdef CONFIG_SMP
 
+extern void sched_ttwu_pending(void);
 extern void update_group_power(struct sched_domain *sd, int cpu);
 
 extern void trigger_load_balance(struct rq *rq);
@@ -1170,6 +1171,8 @@ extern void idle_exit_fair(struct rq *this_rq);
 
 #else  /* CONFIG_SMP */
 
+static inline void sched_ttwu_pending(void) { }
+
 static inline void idle_balance(int cpu, struct rq *rq)
 {
 }


