On Fri, Feb 14, 2014 at 12:01 PM, Andy Lutomirski <l...@amacapital.net> wrote:
> On Thu, Feb 13, 2014 at 6:50 AM, Peter Zijlstra <pet...@infradead.org> wrote:
>> On Wed, Feb 12, 2014 at 05:40:12PM -0800, Andy Lutomirski wrote:
>>> This is a strawman proposal to simplify the idle implementation and eliminate
>>> a race.
>>>
>>> Benefits over current code:
>>>  - ttwu_queue_remote doesn't use an IPI unless needed
>>>  - The diffstat should speak for itself :)
>>>  - Less racy.  Spurious IPIs are possible, but only in narrow windows or
>>>    when two wakeups occur in rapid succession.
>>>  - Seems to work (?)
>>>
>>> Issues:
>>>  - Am I doing the percpu stuff right?
>>>  - Needs work on non-x86 architectures
>>>  - The !CONFIG_SMP case needs to be checked
>>>  - Is "idlepoll" a good name for the new code?  It doesn't have *that*
>>>    much to do with the idle state.  Maybe cpukick?
>>>
>>> If this turns out okay, TIF_NEED_RESCHED could possibly be deleted as well.
>>
>> No, we can't do away with that; it's used in some fairly critical paths
>> (return to userspace) and adding a second cacheline load there would be
>> unfortunate.
>>
>> I also don't really like how the polling state is an atomic; it's a
>> CPU-local property.
>>
>> Now, given that we can't get rid of TIF_NEED_RESCHED and we need an atomic op
>> on a remote cacheline anyhow, the simplest solution would be to convert
>> all TS_POLLING users to TIF_POLLING_NRFLAG and use an atomic_or_return()-like
>> construct to do:
>>
>>   atomic_or_return(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG
>>
>> and avoid the IPI if that is false.
>>
>> Something a little like this; it does require a lot of auditing, but it
>> boots on my x86_64.
>
> On further consideration, I think I like this approach.  It's a
> simpler change than mine and it appears to work (unlike mine, and I
> still haven't figured out what I'm doing wrong).  If anyone wants to
> get rid of the cmpxchg loop, a trick like that would likely work well built
> on top of this.
>
> That being said, I can't really test this, because I can't seem to
> boot recent 3.14-based kernels on real hardware; they die
> before producing any output.  They work fine in QEMU.

Either you have a bug or I rebased it wrong.  With the attached
rebased version, I hit WARN_ON_ONCE(irqs_disabled()) in
cpu_idle_loop() on a semi-regular basis when I boot with:

virtme-runkernel arch/x86/boot/bzImage -smp 2 -cpu host

It also sometimes hangs after starting a shell.

(shameless plug: this is virtme from
https://git.kernel.org/cgit/utils/kernel/virtme/virtme.git  )
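
FWIW, the fetch_or trick I alluded to above would look roughly like the
sketch below.  This is untested and purely illustrative: "fetch_or" is a
placeholder for an atomic or-the-bits-in-and-return-the-old-flags helper
that this tree doesn't have yet; everything else mirrors __resched_cpu()
from the attached patch.

static void __resched_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	struct task_struct *p;
	struct thread_info *ti;
	unsigned long old;
	bool ipi;

	rcu_read_lock();
	do {
		p = ACCESS_ONCE(rq->curr);
		ti = task_thread_info(p);

		/* Set NEED_RESCHED and sample the previous flags in one RMW. */
		old = fetch_or(&ti->flags, _TIF_NEED_RESCHED);

		/* Only IPI if the target wasn't already polling on its flags. */
		ipi = !(old & _TIF_POLLING_NRFLAG);
	} while (p != ACCESS_ONCE(rq->curr));
	rcu_read_unlock();

	if (ipi)
		smp_send_reschedule(cpu);
}

The retry on rq->curr is kept so that a wakeup racing with a context
switch still marks whichever task ends up running on that CPU.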

--Andy
commit bdf333e0ebb4bd33ea20ee1825314381681208b0
Author: Peter Zijlstra <pet...@infradead.org>
Date:   Thu Feb 13 15:50:16 2014 +0100

    peterz's IPI patch, rebased by Andy to -linus

diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index 1da25a5..d6a7b7c 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -42,18 +42,15 @@ static inline void __mwait(unsigned long eax, unsigned long ecx)
  */
 static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
 {
-	if (!current_set_polling_and_test()) {
-		if (static_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) {
-			mb();
-			clflush((void *)&current_thread_info()->flags);
-			mb();
-		}
-
-		__monitor((void *)&current_thread_info()->flags, 0, 0);
-		if (!need_resched())
-			__mwait(eax, ecx);
+	if (static_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) {
+		mb();
+		clflush((void *)&current_thread_info()->flags);
+		mb();
 	}
-	current_clr_polling();
+
+	__monitor((void *)&current_thread_info()->flags, 0, 0);
+	if (!need_resched())
+		__mwait(eax, ecx);
 }
 
 #endif /* _ASM_X86_MWAIT_H */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e1940c0..1f2fe57 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -88,6 +88,7 @@ struct thread_info {
 #define TIF_FORK		18	/* ret_from_fork */
 #define TIF_NOHZ		19	/* in adaptive nohz mode */
 #define TIF_MEMDIE		20	/* is terminating due to OOM killer */
+#define TIF_POLLING_NRFLAG	21
 #define TIF_IO_BITMAP		22	/* uses I/O bitmap */
 #define TIF_FORCED_TF		24	/* true if TF in eflags artificially */
 #define TIF_BLOCKSTEP		25	/* set when we want DEBUGCTLMSR_BTF */
@@ -111,6 +112,7 @@ struct thread_info {
 #define _TIF_IA32		(1 << TIF_IA32)
 #define _TIF_FORK		(1 << TIF_FORK)
 #define _TIF_NOHZ		(1 << TIF_NOHZ)
+#define _TIF_POLLING_NRFLAG	(1 << TIF_POLLING_NRFLAG)
 #define _TIF_IO_BITMAP		(1 << TIF_IO_BITMAP)
 #define _TIF_FORCED_TF		(1 << TIF_FORCED_TF)
 #define _TIF_BLOCKSTEP		(1 << TIF_BLOCKSTEP)
@@ -234,8 +236,6 @@ static inline struct thread_info *current_thread_info(void)
  * have to worry about atomic accesses.
  */
 #define TS_COMPAT		0x0002	/* 32bit syscall active (64BIT)*/
-#define TS_POLLING		0x0004	/* idle task polling need_resched,
-					   skip sending interrupt */
 #define TS_RESTORE_SIGMASK	0x0008	/* restore signal mask in do_signal() */
 
 #ifndef __ASSEMBLY__
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 3fb8d95..4939eb7 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -309,9 +309,11 @@ void arch_cpu_idle(void)
  */
 void default_idle(void)
 {
-	trace_cpu_idle_rcuidle(1, smp_processor_id());
-	safe_halt();
-	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
+	if (!current_clr_polling_and_test()) {
+		trace_cpu_idle_rcuidle(1, smp_processor_id());
+		safe_halt();
+		trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
+	}
 }
 #ifdef CONFIG_APM_MODULE
 EXPORT_SYMBOL(default_idle);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a781dec..cae2249 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2667,85 +2667,26 @@ static inline int spin_needbreak(spinlock_t *lock)
  * thread_info.status and one based on TIF_POLLING_NRFLAG in
  * thread_info.flags
  */
-#ifdef TS_POLLING
-static inline int tsk_is_polling(struct task_struct *p)
-{
-	return task_thread_info(p)->status & TS_POLLING;
-}
-static inline void __current_set_polling(void)
-{
-	current_thread_info()->status |= TS_POLLING;
-}
-
-static inline bool __must_check current_set_polling_and_test(void)
-{
-	__current_set_polling();
-
-	/*
-	 * Polling state must be visible before we test NEED_RESCHED,
-	 * paired by resched_task()
-	 */
-	smp_mb();
-
-	return unlikely(tif_need_resched());
-}
-
-static inline void __current_clr_polling(void)
-{
-	current_thread_info()->status &= ~TS_POLLING;
-}
-
-static inline bool __must_check current_clr_polling_and_test(void)
-{
-	__current_clr_polling();
-
-	/*
-	 * Polling state must be visible before we test NEED_RESCHED,
-	 * paired by resched_task()
-	 */
-	smp_mb();
-
-	return unlikely(tif_need_resched());
-}
-#elif defined(TIF_POLLING_NRFLAG)
+#ifdef CONFIG_SMP
 static inline int tsk_is_polling(struct task_struct *p)
 {
 	return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
 }
 
-static inline void __current_set_polling(void)
+static inline void current_set_polling(void)
 {
 	set_thread_flag(TIF_POLLING_NRFLAG);
 }
 
-static inline bool __must_check current_set_polling_and_test(void)
-{
-	__current_set_polling();
-
-	/*
-	 * Polling state must be visible before we test NEED_RESCHED,
-	 * paired by resched_task()
-	 *
-	 * XXX: assumes set/clear bit are identical barrier wise.
-	 */
-	smp_mb__after_clear_bit();
-
-	return unlikely(tif_need_resched());
-}
-
-static inline void __current_clr_polling(void)
+static inline void current_clr_polling(void)
 {
 	clear_thread_flag(TIF_POLLING_NRFLAG);
 }
 
 static inline bool __must_check current_clr_polling_and_test(void)
 {
-	__current_clr_polling();
+	current_clr_polling();
 
-	/*
-	 * Polling state must be visible before we test NEED_RESCHED,
-	 * paired by resched_task()
-	 */
 	smp_mb__after_clear_bit();
 
 	return unlikely(tif_need_resched());
@@ -2753,34 +2694,15 @@ static inline bool __must_check current_clr_polling_and_test(void)
 
 #else
 static inline int tsk_is_polling(struct task_struct *p) { return 0; }
-static inline void __current_set_polling(void) { }
-static inline void __current_clr_polling(void) { }
+static inline void current_set_polling(void) { }
+static inline void current_clr_polling(void) { }
 
-static inline bool __must_check current_set_polling_and_test(void)
-{
-	return unlikely(tif_need_resched());
-}
 static inline bool __must_check current_clr_polling_and_test(void)
 {
 	return unlikely(tif_need_resched());
 }
 #endif
 
-static inline void current_clr_polling(void)
-{
-	__current_clr_polling();
-
-	/*
-	 * Ensure we check TIF_NEED_RESCHED after we clear the polling bit.
-	 * Once the bit is cleared, we'll get IPIs with every new
-	 * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also
-	 * fold.
-	 */
-	smp_mb(); /* paired with resched_task() */
-
-	preempt_fold_need_resched();
-}
-
 static __always_inline bool need_resched(void)
 {
 	return unlikely(tif_need_resched());
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
index 277f494..88d2b40 100644
--- a/kernel/cpu/idle.c
+++ b/kernel/cpu/idle.c
@@ -10,6 +10,7 @@
 #include <asm/tlb.h>
 
 #include <trace/events/power.h>
+#include "../sched/sched.h"
 
 static int __read_mostly cpu_idle_force_poll;
 
@@ -70,6 +71,8 @@ static void cpu_idle_loop(void)
 	while (1) {
 		tick_nohz_idle_enter();
 
+		current_set_polling();
+
 		while (!need_resched()) {
 			check_pgt_cache();
 			rmb();
@@ -92,30 +95,25 @@ static void cpu_idle_loop(void)
 			if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
 				cpu_idle_poll();
 			} else {
-				if (!current_clr_polling_and_test()) {
-					stop_critical_timings();
-					rcu_idle_enter();
-					arch_cpu_idle();
-					WARN_ON_ONCE(irqs_disabled());
-					rcu_idle_exit();
-					start_critical_timings();
-				} else {
-					local_irq_enable();
-				}
-				__current_set_polling();
+				stop_critical_timings();
+				rcu_idle_enter();
+				arch_cpu_idle();
+				WARN_ON_ONCE(irqs_disabled());
+				rcu_idle_exit();
+				start_critical_timings();
 			}
 			arch_cpu_idle_exit();
 		}
 
+		current_clr_polling();
+
 		/*
-		 * Since we fell out of the loop above, we know
-		 * TIF_NEED_RESCHED must be set, propagate it into
-		 * PREEMPT_NEED_RESCHED.
-		 *
-		 * This is required because for polling idle loops we will
-		 * not have had an IPI to fold the state for us.
+		 * We need to test and propagate the TIF_NEED_RESCHED bit here
+		 * because we might not have sent the reschedule IPI to idle
+		 * tasks.
 		 */
-		preempt_set_need_resched();
+		set_preempt_need_resched();
+		sched_ttwu_pending();
 		tick_nohz_idle_exit();
 		schedule_preempt_disabled();
 	}
@@ -138,7 +136,6 @@ void cpu_startup_entry(enum cpuhp_state state)
 	 */
 	boot_init_stack_canary();
 #endif
-	__current_set_polling();
 	arch_cpu_idle_prepare();
 	cpu_idle_loop();
 }
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b46131e..4b5f168 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -504,45 +504,63 @@ static inline void init_hrtick(void)
 }
 #endif	/* CONFIG_SCHED_HRTICK */
 
-/*
- * resched_task - mark a task 'to be rescheduled now'.
- *
- * On UP this means the setting of the need_resched flag, on SMP it
- * might also involve a cross-CPU call to trigger the scheduler on
- * the target CPU.
- */
-void resched_task(struct task_struct *p)
+static void __resched_cpu(int cpu)
 {
-	int cpu;
+	struct rq *rq = cpu_rq(cpu);
+	struct task_struct *p;
+	struct thread_info *ti;
+	unsigned long val, old, new;
+	bool ipi;
 
-	lockdep_assert_held(&task_rq(p)->lock);
+	rcu_read_lock();
+	do {
+		p = ACCESS_ONCE(rq->curr);
+		ti = task_thread_info(p);
+
+		val = ti->flags;
+		for (;;) {
+			new = val | _TIF_NEED_RESCHED;
+			old = cmpxchg(&ti->flags, val, new);
+			if (old == val)
+				break;
+			val = old;
+		}
 
-	if (test_tsk_need_resched(p))
-		return;
+		ipi = !(old & _TIF_POLLING_NRFLAG);
 
-	set_tsk_need_resched(p);
+	} while (p != ACCESS_ONCE(rq->curr));
+	rcu_read_unlock();
 
-	cpu = task_cpu(p);
+	if (ipi)
+		smp_send_reschedule(cpu);
+}
+
+void resched_cpu(int cpu)
+{
 	if (cpu == smp_processor_id()) {
+		set_tsk_need_resched(current);
 		set_preempt_need_resched();
 		return;
 	}
 
-	/* NEED_RESCHED must be visible before we test polling */
-	smp_mb();
-	if (!tsk_is_polling(p))
-		smp_send_reschedule(cpu);
+	__resched_cpu(cpu);
 }
 
-void resched_cpu(int cpu)
+/*
+ * resched_task - mark a task 'to be rescheduled now'.
+ *
+ * On UP this means the setting of the need_resched flag, on SMP it
+ * might also involve a cross-CPU call to trigger the scheduler on
+ * the target CPU.
+ */
+void resched_task(struct task_struct *p)
 {
-	struct rq *rq = cpu_rq(cpu);
-	unsigned long flags;
+	lockdep_assert_held(&task_rq(p)->lock);
 
-	if (!raw_spin_trylock_irqsave(&rq->lock, flags))
+	if (test_tsk_need_resched(p))
 		return;
-	resched_task(cpu_curr(cpu));
-	raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+	resched_cpu(task_cpu(p));
 }
 
 #ifdef CONFIG_SMP
@@ -1476,7 +1494,7 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
 }
 
 #ifdef CONFIG_SMP
-static void sched_ttwu_pending(void)
+void sched_ttwu_pending(void)
 {
 	struct rq *rq = this_rq();
 	struct llist_node *llist = llist_del_all(&rq->wake_list);
@@ -2707,7 +2725,7 @@ need_resched:
 
 	put_prev_task(rq, prev);
 	next = pick_next_task(rq);
-	clear_tsk_need_resched(prev);
+	clear_tsk_need_resched(next);
 	clear_preempt_need_resched();
 	rq->skip_clock_update = 0;
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c2119fd..f4cb63a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1173,6 +1173,7 @@ extern const struct sched_class idle_sched_class;
 
 #ifdef CONFIG_SMP
 
+extern void sched_ttwu_pending(void);
 extern void update_group_power(struct sched_domain *sd, int cpu);
 
 extern void trigger_load_balance(struct rq *rq);
@@ -1183,6 +1184,8 @@ extern void idle_exit_fair(struct rq *this_rq);
 
 #else	/* CONFIG_SMP */
 
+static inline void sched_ttwu_pending(void) { }
+
 static inline void idle_balance(int cpu, struct rq *rq)
 {
 }
