On Wed, Apr 23, 2014 at 09:00:36PM +0200, Denys Vlasenko wrote: > diff --git a/kernel/sched/core.c b/kernel/sched/core.c > index 9cae286..08dd220 100644 > --- a/kernel/sched/core.c > +++ b/kernel/sched/core.c > @@ -4255,6 +4255,9 @@ EXPORT_SYMBOL_GPL(yield_to); > */ > void __sched io_schedule(void) > { > +#ifdef CONFIG_NO_HZ_COMMON > + int cpu_on_entry = smp_processor_id(); > +#endif > struct rq *rq = raw_rq(); > > delayacct_blkio_start(); > @@ -4263,13 +4266,23 @@ void __sched io_schedule(void) > current->in_iowait = 1; > schedule(); > current->in_iowait = 0; > +#ifdef CONFIG_NO_HZ_COMMON > + if (atomic_dec_and_test(&rq->nr_iowait)) { > + if (smp_processor_id() != cpu_on_entry) > + tick_nohz_iowait_to_idle(cpu_on_entry); > + } > +#else > atomic_dec(&rq->nr_iowait); > +#endif > delayacct_blkio_end(); > } > EXPORT_SYMBOL(io_schedule); > > long __sched io_schedule_timeout(long timeout) > { > +#ifdef CONFIG_NO_HZ_COMMON > + int cpu_on_entry = smp_processor_id(); > +#endif > struct rq *rq = raw_rq(); > long ret; > > @@ -4279,7 +4292,14 @@ long __sched io_schedule_timeout(long timeout) > current->in_iowait = 1; > ret = schedule_timeout(timeout); > current->in_iowait = 0; > +#ifdef CONFIG_NO_HZ_COMMON > + if (atomic_dec_and_test(&rq->nr_iowait)) { > + if (smp_processor_id() != cpu_on_entry) > + tick_nohz_iowait_to_idle(cpu_on_entry); > + } > +#else > atomic_dec(&rq->nr_iowait); > +#endif > delayacct_blkio_end(); > return ret; > }
Why do you insist on writing the same (buggy, see later) code twice? Really, get lazy already and write it once but use it twice! It's buggy because the smp_processor_id() is used in preemptible context, it's further buggy because the raw_rq() does it again and could get a rq on a different cpu. What you want is something like: static inline void io_wait_start(struct rq *rq) { atomic_inc(&rq->nr_iowait); current->in_iowait = 1; } static inline void io_wait_end(struct rq *rq) { current->in_iowait = 0; #ifdef CONFIG_NO_HZ_COMMON if (atomic_dec_and_test(&rq->nr_iowait) && cpu_of(rq) != raw_smp_processor_id()) { tick_nohz_iowait_end(cpu_of(rq)); } #else atomic_dec(&rq->nr_iowait); #endif } Anyway, I suspect that's still broken and you really need that lock around the state like I did earlier, because the above isn't serialized between remote wakeup and the cpu waking from nohz. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/