On Tue, Aug 13, 2013 at 12:30:56PM +0200, Ingo Molnar wrote:

> But we could perhaps do something else and push the overhead into
> resched_task(): instead of using atomics we could use the resched IPI to
> set the local preempt_count(). That way preempt_count() will only ever be
> updated CPU-locally and we could merge need_resched into preempt_count()
> just fine.
>
> [ Some care has to be taken with polling-idle threads: those could simply
>   use another signalling mechanism, another field in task struct, no need
>   to abuse need_resched for that. ]
>
> We could still _read_ the preempt count non-destructively from other CPUs,
> to avoid having to send a resched IPI for already marked tasks. [ OTOH it
> might be faster to never do that and assume that an IPI has to be sent in
> 99.9% of the cases - that would have to be re-measured. ]
>
> Using this method we could have a really lightweight, minimal, percpu
> based preempt count mechanism in all the default fastpath cases, both for
> nested and for non-nested preempt_enable()s.
Indeed; however we need to keep TIF_NEED_RESCHED as is and simply transfer it into preempt_count() on the IPI. Every NEED_RESCHED already causes the IPI, except indeed for the 'polling' idle threads; those we need to teach to do the transfer as well. The reason is that the reschedule IPI is slightly over-used and we wouldn't want to unconditionally cause a reschedule.

The patch below again boots up to wanting to mount a root filesystem, and is only tested with kvm -smp 4, CONFIG_PREEMPT=y.

So we're now down to something like (a small illustrative user-space sketch of this fastpath is appended after the patch):

	decl	fs:preempt_count
	cmpl	$PREEMPT_NEED_RESCHED, fs:preempt_count
	jnz	1f
	call	preempt_schedule
1:

---
 arch/x86/include/asm/thread_info.h |    9 ++++----
 arch/x86/kernel/asm-offsets.c      |    2 +-
 arch/x86/kernel/entry_64.S         |    4 +---
 include/linux/hardirq.h            |    4 ++--
 include/linux/preempt.h            |   11 ++++++----
 include/linux/sched.h              |    2 +-
 include/linux/thread_info.h        |    1 +
 include/trace/events/sched.h       |    2 +-
 init/main.c                        |    7 ++++---
 kernel/context_tracking.c          |    3 +--
 kernel/cpu/idle.c                  |    3 +++
 kernel/sched/core.c                |   43 ++++++++++++++++++++++++++------------
 kernel/softirq.c                   |    8 +++----
 kernel/timer.c                     |    9 ++++----
 lib/smp_processor_id.c             |    3 +--
 15 files changed, 67 insertions(+), 44 deletions(-)

diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 2781119..232d512 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -28,8 +28,7 @@ struct thread_info {
 	__u32			flags;		/* low level flags */
 	__u32			status;		/* thread synchronous flags */
 	__u32			cpu;		/* current CPU */
-	int			preempt_count;	/* 0 => preemptable,
-						   <0 => BUG */
+	int			saved_preempt_count;
 	mm_segment_t		addr_limit;
 	struct restart_block	restart_block;
 	void __user		*sysenter_return;
@@ -49,7 +48,7 @@ struct thread_info {
 	.exec_domain	= &default_exec_domain,	\
 	.flags		= 0,			\
 	.cpu		= 0,			\
-	.preempt_count	= INIT_PREEMPT_COUNT,	\
+	.saved_preempt_count = INIT_PREEMPT_COUNT,	\
 	.addr_limit	= KERNEL_DS,		\
 	.restart_block	= {			\
 		.fn = do_no_restart_syscall,	\
@@ -100,8 +99,8 @@ struct thread_info {
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
 #define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
-#define _TIF_SINGLESTEP		(1 << TIF_SINGLESTEP)
 #define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
+#define _TIF_SINGLESTEP		(1 << TIF_SINGLESTEP)
 #define _TIF_SYSCALL_EMU	(1 << TIF_SYSCALL_EMU)
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
@@ -112,6 +111,7 @@ struct thread_info {
 #define _TIF_IA32		(1 << TIF_IA32)
 #define _TIF_FORK		(1 << TIF_FORK)
 #define _TIF_NOHZ		(1 << TIF_NOHZ)
+#define _TIF_MEMDIE		(1 << TIF_MEMDIE)
 #define _TIF_IO_BITMAP		(1 << TIF_IO_BITMAP)
 #define _TIF_FORCED_TF		(1 << TIF_FORCED_TF)
 #define _TIF_BLOCKSTEP		(1 << TIF_BLOCKSTEP)
@@ -155,6 +155,7 @@ struct thread_info {
 #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
 
 #define PREEMPT_ACTIVE		0x10000000
+#define PREEMPT_NEED_RESCHED	0x20000000
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 2861082..46d889f 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -32,7 +32,7 @@ void common(void) {
 	OFFSET(TI_flags, thread_info, flags);
 	OFFSET(TI_status, thread_info, status);
 	OFFSET(TI_addr_limit, thread_info, addr_limit);
-	OFFSET(TI_preempt_count, thread_info, preempt_count);
+//	OFFSET(TI_preempt_count, thread_info, preempt_count);
 
 	BLANK();
 	OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 1b69951..011b6d3 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1118,10 +1118,8 @@ ENTRY(native_iret)
 	/* Returning to kernel space. Check if we need preemption */
 	/* rcx:	 threadinfo. interrupts off. */
 ENTRY(retint_kernel)
-	cmpl $0,TI_preempt_count(%rcx)
+	cmpl $PREEMPT_NEED_RESCHED,PER_CPU_VAR(__preempt_count_var)
 	jnz  retint_restore_args
-	bt   $TIF_NEED_RESCHED,TI_flags(%rcx)
-	jnc  retint_restore_args
 	bt   $9,EFLAGS-ARGOFFSET(%rsp)	/* interrupts off? */
 	jnc  retint_restore_args
 	call preempt_schedule_irq
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 05bcc09..5d7f83c 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -107,14 +107,14 @@
  * used in the general case to determine whether sleeping is possible.
  * Do not use in_atomic() in driver code.
  */
-#define in_atomic()	((preempt_count() & ~PREEMPT_ACTIVE) != 0)
+#define in_atomic()	((preempt_count() & ~(PREEMPT_ACTIVE|PREEMPT_NEED_RESCHED)) != 0)
 
 /*
  * Check whether we were atomic before we did preempt_disable():
  * (used by the scheduler, *after* releasing the kernel lock)
  */
 #define in_atomic_preempt_off() \
-		((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET)
+		((preempt_count() & ~(PREEMPT_ACTIVE|PREEMPT_NEED_RESCHED)) != PREEMPT_CHECK_OFFSET)
 
 #ifdef CONFIG_PREEMPT_COUNT
 # define preemptible()	(preempt_count() == 0 && !irqs_disabled())
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index f5d4723..724ee32 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -6,7 +6,8 @@
  * preempt_count (used for kernel preemption, interrupt count, etc.)
  */
 
-#include <linux/thread_info.h>
+#include <linux/thread_info.h> /* for PREEMPT_NEED_RESCHED */
+#include <asm/percpu.h>
 #include <linux/linkage.h>
 #include <linux/list.h>
 
@@ -21,7 +22,9 @@
 #define inc_preempt_count() add_preempt_count(1)
 #define dec_preempt_count() sub_preempt_count(1)
 
-#define preempt_count()	(current_thread_info()->preempt_count)
+DECLARE_PER_CPU(int, __preempt_count_var);
+
+#define preempt_count()	__raw_get_cpu_var(__preempt_count_var)
 
 #ifdef CONFIG_PREEMPT
 
@@ -29,7 +32,7 @@ asmlinkage void preempt_schedule(void);
 
 #define preempt_check_resched() \
 do { \
-	if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \
+	if (unlikely(preempt_count() == PREEMPT_NEED_RESCHED)) \
		preempt_schedule(); \
 } while (0)
 
@@ -39,7 +42,7 @@ void preempt_schedule_context(void);
 
 #define preempt_check_resched_context() \
 do { \
-	if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \
+	if (unlikely(preempt_count() == PREEMPT_NEED_RESCHED)) \
		preempt_schedule_context(); \
 } while (0)
 #else
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f79ced7..6d2ee58 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2405,7 +2405,7 @@ static inline int signal_pending_state(long state, struct task_struct *p)
 
 static inline int need_resched(void)
 {
-	return unlikely(test_thread_flag(TIF_NEED_RESCHED));
+	return preempt_count() & PREEMPT_NEED_RESCHED;
 }
 
 /*
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index e7e0473..477c301 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -104,6 +104,7 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag)
 #define test_thread_flag(flag) \
 	test_ti_thread_flag(current_thread_info(), flag)
 
+#define test_need_resched()	test_thread_flag(TIF_NEED_RESCHED)
 #define set_need_resched()	set_thread_flag(TIF_NEED_RESCHED)
 #define clear_need_resched()	clear_thread_flag(TIF_NEED_RESCHED)
 
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index e5586ca..816e2d6 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -103,7 +103,7 @@ static inline long __trace_sched_switch_state(struct task_struct *p)
 	/*
 	 * For all intents and purposes a preempted task is a running task.
 	 */
-	if (task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)
+	if (task_thread_info(p)->saved_preempt_count & PREEMPT_ACTIVE)
 		state = TASK_RUNNING | TASK_STATE_MAX;
 #endif
 
diff --git a/init/main.c b/init/main.c
index d03d2ec..6948f23 100644
--- a/init/main.c
+++ b/init/main.c
@@ -677,7 +677,7 @@ static int __init_or_module do_one_initcall_debug(initcall_t fn)
 
 int __init_or_module do_one_initcall(initcall_t fn)
 {
-	int count = preempt_count();
+	int count = preempt_count() & PREEMPT_MASK;
 	int ret;
 	char msgbuf[64];
 
@@ -688,9 +688,10 @@ int __init_or_module do_one_initcall(initcall_t fn)
 
 	msgbuf[0] = 0;
 
-	if (preempt_count() != count) {
+	if ((preempt_count() & PREEMPT_MASK) != count) {
 		sprintf(msgbuf, "preemption imbalance ");
-		preempt_count() = count;
+		preempt_count() &= ~PREEMPT_MASK;
+		preempt_count() |= count;
 	}
 	if (irqs_disabled()) {
 		strlcat(msgbuf, "disabled interrupts ", sizeof(msgbuf));
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 383f823..6d113d8 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -87,10 +87,9 @@ void user_enter(void)
  */
 void __sched notrace preempt_schedule_context(void)
 {
-	struct thread_info *ti = current_thread_info();
 	enum ctx_state prev_ctx;
 
-	if (likely(ti->preempt_count || irqs_disabled()))
+	if (likely(preempt_count() || irqs_disabled()))
 		return;
 
 	/*
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
index e695c0a..7a1afab 100644
--- a/kernel/cpu/idle.c
+++ b/kernel/cpu/idle.c
@@ -106,6 +106,9 @@ static void cpu_idle_loop(void)
 				current_set_polling();
 			}
 			arch_cpu_idle_exit();
+
+			if (test_need_resched())
+				preempt_count() |= PREEMPT_NEED_RESCHED;
 		}
 		tick_nohz_idle_exit();
 		schedule_preempt_disabled();
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 54957a6..e52e468 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -89,6 +89,8 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
 
+DEFINE_PER_CPU(int, __preempt_count_var) = INIT_PREEMPT_COUNT;
+
 void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
 {
 	unsigned long delta;
@@ -526,8 +528,10 @@ void resched_task(struct task_struct *p)
 	set_tsk_need_resched(p);
 
 	cpu = task_cpu(p);
-	if (cpu == smp_processor_id())
+	if (cpu == smp_processor_id()) {
+		preempt_count() |= PREEMPT_NEED_RESCHED;
 		return;
+	}
 
 	/* NEED_RESCHED must be visible before we test polling */
 	smp_mb();
@@ -994,7 +998,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 	 * ttwu() will sort out the placement.
 	 */
 	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
-			!(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
+			!(task_thread_info(p)->saved_preempt_count & PREEMPT_ACTIVE));
 
 #ifdef CONFIG_LOCKDEP
 	/*
@@ -1411,6 +1415,9 @@ static void sched_ttwu_pending(void)
 
 void scheduler_ipi(void)
 {
+	if (test_need_resched())
+		preempt_count() |= PREEMPT_NEED_RESCHED;
+
 	if (llist_empty(&this_rq()->wake_list)
 			&& !tick_nohz_full_cpu(smp_processor_id())
 			&& !got_nohz_idle_kick())
@@ -1728,7 +1735,7 @@ void sched_fork(struct task_struct *p)
 #endif
 #ifdef CONFIG_PREEMPT_COUNT
 	/* Want to start with kernel preemption disabled. */
-	task_thread_info(p)->preempt_count = 1;
+	task_thread_info(p)->saved_preempt_count = 1;
 #endif
 #ifdef CONFIG_SMP
 	plist_node_init(&p->pushable_tasks, MAX_PRIO);
@@ -2013,6 +2020,16 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
 #endif
 
+#ifdef CONFIG_PREEMPT_COUNT
+	/*
+	 * If it weren't for PREEMPT_ACTIVE we could guarantee that the
+	 * preempt_count() of all tasks was equal here and this wouldn't be
+	 * needed at all -- try and move PREEMPT_ACTIVE into TI_flags?
+	 */
+	task_thread_info(prev)->saved_preempt_count = preempt_count();
+	preempt_count() = task_thread_info(next)->saved_preempt_count;
+#endif
+
 	context_tracking_task_switch(prev, next);
 	/* Here we just switch the register state and the stack. */
 	switch_to(prev, next, prev);
@@ -2241,7 +2258,7 @@ void __kprobes add_preempt_count(int val)
 	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
 				PREEMPT_MASK - 10);
 #endif
-	if (preempt_count() == val)
+	if ((preempt_count() & ~PREEMPT_NEED_RESCHED) == val)
 		trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
 }
 EXPORT_SYMBOL(add_preempt_count);
@@ -2252,7 +2269,7 @@ void __kprobes sub_preempt_count(int val)
 	/*
 	 * Underflow?
 	 */
-	if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
+	if (DEBUG_LOCKS_WARN_ON(val > (preempt_count() & ~PREEMPT_NEED_RESCHED)))
 		return;
 	/*
 	 * Is the spinlock portion underflowing?
@@ -2262,7 +2279,7 @@ void __kprobes sub_preempt_count(int val)
 		return;
 #endif
 
-	if (preempt_count() == val)
+	if ((preempt_count() & ~PREEMPT_NEED_RESCHED) == val)
 		trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
 	preempt_count() -= val;
 }
@@ -2433,6 +2450,7 @@ static void __sched __schedule(void)
 	put_prev_task(rq, prev);
 	next = pick_next_task(rq);
 	clear_tsk_need_resched(prev);
+	preempt_count() &= ~PREEMPT_NEED_RESCHED;
 	rq->skip_clock_update = 0;
 
 	if (likely(prev != next)) {
@@ -2515,13 +2533,11 @@ void __sched schedule_preempt_disabled(void)
  */
 asmlinkage void __sched notrace preempt_schedule(void)
 {
-	struct thread_info *ti = current_thread_info();
-
 	/*
 	 * If there is a non-zero preempt_count or interrupts are disabled,
 	 * we do not want to preempt the current task. Just return..
 	 */
-	if (likely(ti->preempt_count || irqs_disabled()))
+	if (likely((preempt_count() & ~PREEMPT_NEED_RESCHED) || irqs_disabled()))
 		return;
 
 	do {
@@ -2546,11 +2562,10 @@ EXPORT_SYMBOL(preempt_schedule);
  */
 asmlinkage void __sched preempt_schedule_irq(void)
 {
-	struct thread_info *ti = current_thread_info();
 	enum ctx_state prev_state;
 
 	/* Catch callers which need to be fixed */
-	BUG_ON(ti->preempt_count || !irqs_disabled());
+	BUG_ON((preempt_count() & ~PREEMPT_NEED_RESCHED) || !irqs_disabled());
 
 	prev_state = exception_enter();
 
@@ -4217,7 +4232,8 @@ void init_idle(struct task_struct *idle, int cpu)
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 
 	/* Set the preempt count _outside_ the spinlocks! */
-	task_thread_info(idle)->preempt_count = 0;
+	task_thread_info(idle)->saved_preempt_count = 0;
+	per_cpu(__preempt_count_var, cpu) = 0;
 
 	/*
 	 * The idle tasks have their own, simple scheduling class:
@@ -6562,7 +6578,8 @@ void __init sched_init(void)
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 static inline int preempt_count_equals(int preempt_offset)
 {
-	int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
+	int nested = (preempt_count() & ~(PREEMPT_ACTIVE|PREEMPT_NEED_RESCHED)) +
+			rcu_preempt_depth();
 
 	return (nested == preempt_offset);
 }
diff --git a/kernel/softirq.c b/kernel/softirq.c
index be3d351..87ede78 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -114,7 +114,7 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt)
 		trace_softirqs_off(ip);
 	raw_local_irq_restore(flags);
 
-	if (preempt_count() == cnt)
+	if ((preempt_count() & ~PREEMPT_NEED_RESCHED) == cnt)
 		trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
 }
 #else /* !CONFIG_TRACE_IRQFLAGS */
@@ -243,20 +243,20 @@ asmlinkage void __do_softirq(void)
 	do {
 		if (pending & 1) {
 			unsigned int vec_nr = h - softirq_vec;
-			int prev_count = preempt_count();
+			int prev_count = preempt_count() & ~PREEMPT_NEED_RESCHED;
 
 			kstat_incr_softirqs_this_cpu(vec_nr);
 
 			trace_softirq_entry(vec_nr);
 			h->action(h);
 			trace_softirq_exit(vec_nr);
-			if (unlikely(prev_count != preempt_count())) {
+			if (unlikely(prev_count != (preempt_count() & ~PREEMPT_NEED_RESCHED))) {
 				printk(KERN_ERR "huh, entered softirq %u %s %p"
 				       "with preempt_count %08x,"
 				       " exited with %08x?\n", vec_nr,
 				       softirq_to_name[vec_nr], h->action,
 				       prev_count, preempt_count());
-				preempt_count() = prev_count;
+				preempt_count() = prev_count | (preempt_count() & PREEMPT_NEED_RESCHED);
 			}
 
 			rcu_bh_qs(cpu);
diff --git a/kernel/timer.c b/kernel/timer.c
index 4296d13..4c6be29 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1092,7 +1092,7 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index)
 static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
 			  unsigned long data)
 {
-	int preempt_count = preempt_count();
+	int count = preempt_count() & PREEMPT_MASK;
 
 #ifdef CONFIG_LOCKDEP
 	/*
@@ -1119,16 +1119,17 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
 
 	lock_map_release(&lockdep_map);
 
-	if (preempt_count != preempt_count()) {
+	if (count != (preempt_count() & PREEMPT_MASK)) {
 		WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n",
-			  fn, preempt_count, preempt_count());
+			  fn, count, preempt_count());
 		/*
 		 * Restore the preempt count. That gives us a decent
 		 * chance to survive and extract information. If the
 		 * callback kept a lock held, bad luck, but not worse
 		 * than the BUG() we had.
 		 */
-		preempt_count() = preempt_count;
+		preempt_count() &= ~PREEMPT_MASK;
+		preempt_count() |= count;
 	}
 }
diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
index 4c0d0e5..04abe53 100644
--- a/lib/smp_processor_id.c
+++ b/lib/smp_processor_id.c
@@ -9,10 +9,9 @@
 
 notrace unsigned int debug_smp_processor_id(void)
 {
-	unsigned long preempt_count = preempt_count();
 	int this_cpu = raw_smp_processor_id();
 
-	if (likely(preempt_count))
+	if (likely(preempt_count()))
 		goto out;
 
 	if (irqs_disabled())
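
For reference, here is a minimal user-space C sketch of the invariant the patch relies on. This is illustrative only, not kernel code: preempt_count_var and the functions below are stand-ins for the per-CPU variable, the preempt_disable()/preempt_enable() fastpath and the IPI-side transfer described above.

/*
 * Minimal user-space model of the folded counter -- illustrative only.
 * One int per CPU (a single variable here) carries both the preempt
 * nesting depth and, in a high bit, the need-resched flag that the
 * resched IPI would transfer over from TIF_NEED_RESCHED.
 */
#include <stdio.h>

#define PREEMPT_NEED_RESCHED	0x20000000

static int preempt_count_var;	/* one of these per CPU in the real thing */

static void preempt_schedule(void)
{
	/* __schedule() clears the flag once it actually reschedules. */
	preempt_count_var &= ~PREEMPT_NEED_RESCHED;
	printf("rescheduled\n");
}

static void preempt_disable(void)
{
	preempt_count_var++;
}

static void preempt_enable(void)
{
	preempt_count_var--;				/* the "decl" above */
	if (preempt_count_var == PREEMPT_NEED_RESCHED)	/* the "cmpl" above */
		preempt_schedule();
}

/* Models what scheduler_ipi() / the local case in resched_task() do. */
static void transfer_need_resched(void)
{
	preempt_count_var |= PREEMPT_NEED_RESCHED;
}

int main(void)
{
	preempt_disable();
	transfer_need_resched();	/* wakeup arrives while non-preemptible */
	preempt_enable();		/* depth 0 and flag set: reschedules */

	preempt_disable();
	preempt_disable();
	transfer_need_resched();
	preempt_enable();		/* still nested: no reschedule here */
	preempt_enable();		/* outermost enable reschedules */
	return 0;
}

The point of folding the flag into the counter is that the common preempt_enable() path needs only the decrement plus one compare against PREEMPT_NEED_RESCHED; TIF_NEED_RESCHED stays the cross-CPU signal, and only the IPI (or the polling idle loop) moves it into the per-CPU word, so all updates remain CPU-local.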