On Wed, 4 Jun 2014 10:54:18 +0200 Peter Zijlstra <pet...@infradead.org> wrote:
> I'm still sitting on this patch. Jacob you were going to make it play
> nice with QoS?

I had a patchset that works through system PM QoS and still maintains
the idle injection efficiency. When I saw you did not merge the patch
below, I thought you had abandoned it :)

The only issue, as per our last discussion, is the lack of notification
when a PM QoS constraint cannot be met. But that is intrinsic to PM QoS
itself.

I also consulted with Arjan and looked at hooking directly into
intel_idle, since both intel_powerclamp and intel_idle are arch-specific
drivers. But I think that is hard to do on a per-idle-period basis,
since we should still allow "natural" idle during the forced idle time.

So I think we can take a two-step approach:

1. integrate your patch with an updated version of
   https://lkml.org/lkml/2013/11/26/534 so that there is no
   performance/efficiency regression.
2. add a notification mechanism to system PM QoS for when constraints
   cannot be met (rough sketch below).
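Roughly, what I have in mind for 2. on the driver side is something like
the sketch below. This is untested and only illustrative: the latency
cutoff and the helper names are made up, and the real threshold would
have to come from the cpuidle state data; pm_qos_request() and
PM_QOS_CPU_DMA_LATENCY are the existing interfaces, play_idle() is the
new call from your patch.

#include <linux/cpu.h>
#include <linux/pm_qos.h>

/* made-up cutoff for illustration; should come from the cpuidle table */
#define DEEP_CSTATE_EXIT_LATENCY_US	100

static bool idle_injection_allowed(void)
{
	/* current system-wide cpu_dma_latency constraint, in usec */
	int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);

	return latency_req >= DEEP_CSTATE_EXIT_LATENCY_US;
}

static void inject_idle(unsigned long duration_jiffies)
{
	if (idle_injection_allowed()) {
		play_idle(duration_jiffies);
		return;
	}
	/*
	 * This is where step 2 comes in: notify whoever requested the
	 * idle injection that the target cannot be met under the
	 * current QoS constraint. No such notification exists today.
	 */
}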
Thanks,
Jacob

> ---
> Subject: idle, thermal, acpi: Remove home grown idle implementations
> From: Peter Zijlstra <pet...@infradead.org>
> Date: Wed Nov 20 14:32:37 CET 2013
>
> People are starting to grow their own idle implementations in various
> disgusting ways. Collapse the lot and use the generic idle code to
> provide a proper idle cycle implementation.
>
> This does not fully preserve existing behaviour in that the generic
> idle cycle function calls into the normal cpuidle governed idle
> routines and should thus respect things like QoS parameters and the
> like.
>
> If people want to over-ride the idle state they should talk to the
> cpuidle folks about extending the interface and attempt to preserve
> QoS guarantees, instead of jumping straight to the deepest possible C
> state -- Jacob Pan said he was going to do this.
>
> This is reported to work for intel_powerclamp by Jacob Pan, the
> acpi_pad driver is untested.
>
> Cc: rui.zh...@intel.com
> Cc: jacob.jun....@linux.intel.com
> Cc: l...@kernel.org
> Cc: Rafael J. Wysocki <rafael.j.wyso...@intel.com>
> Cc: Ingo Molnar <mi...@kernel.org>
> Cc: Thomas Gleixner <t...@linutronix.de>
> Cc: h...@zytor.com
> Cc: ar...@linux.intel.com
> Signed-off-by: Peter Zijlstra <pet...@infradead.org>
> ---
>  drivers/acpi/acpi_pad.c            |   41 -----------
>  drivers/thermal/intel_powerclamp.c |   38 ----------
>  include/linux/cpu.h                |    2
>  include/linux/sched.h              |    3
>  kernel/sched/core.c                |    9 ++
>  kernel/sched/idle.c                |  129 ++++++++++++++++++++++---------------
>  kernel/time/tick-sched.c           |    2
>  7 files changed, 95 insertions(+), 129 deletions(-)
>
> --- a/drivers/acpi/acpi_pad.c
> +++ b/drivers/acpi/acpi_pad.c
> @@ -40,9 +40,7 @@ static DEFINE_MUTEX(round_robin_lock);
>  static unsigned long power_saving_mwait_eax;
>
>  static unsigned char tsc_detected_unstable;
> -static unsigned char tsc_marked_unstable;
>  static unsigned char lapic_detected_unstable;
> -static unsigned char lapic_marked_unstable;
>
>  static void power_saving_mwait_init(void)
>  {
> @@ -152,10 +150,9 @@ static int power_saving_thread(void *dat
>  	unsigned int tsk_index = (unsigned long)data;
>  	u64 last_jiffies = 0;
>
> -	sched_setscheduler(current, SCHED_RR, &param);
> +	sched_setscheduler(current, SCHED_FIFO, &param);
>
>  	while (!kthread_should_stop()) {
> -		int cpu;
>  		u64 expire_time;
>
>  		try_to_freeze();
> @@ -170,41 +167,7 @@ static int power_saving_thread(void *dat
>
>  		expire_time = jiffies + HZ * (100 - idle_pct) / 100;
>
> -		while (!need_resched()) {
> -			if (tsc_detected_unstable && !tsc_marked_unstable) {
> -				/* TSC could halt in idle, so notify users */
> -				mark_tsc_unstable("TSC halts in idle");
> -				tsc_marked_unstable = 1;
> -			}
> -			if (lapic_detected_unstable && !lapic_marked_unstable) {
> -				int i;
> -				/* LAPIC could halt in idle, so notify users */
> -				for_each_online_cpu(i)
> -					clockevents_notify(
> -						CLOCK_EVT_NOTIFY_BROADCAST_ON,
> -						&i);
> -				lapic_marked_unstable = 1;
> -			}
> -			local_irq_disable();
> -			cpu = smp_processor_id();
> -			if (lapic_marked_unstable)
> -				clockevents_notify(
> -					CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
> -			stop_critical_timings();
> -
> -			mwait_idle_with_hints(power_saving_mwait_eax, 1);
> -
> -			start_critical_timings();
> -			if (lapic_marked_unstable)
> -				clockevents_notify(
> -					CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
> -			local_irq_enable();
> -
> -			if (jiffies > expire_time) {
> -				do_sleep = 1;
> -				break;
> -			}
> -		}
> +		play_idle(expire_time);
>
>  		/*
>  		 * current sched_rt has threshold for rt task running time.
> --- a/drivers/thermal/intel_powerclamp.c
> +++ b/drivers/thermal/intel_powerclamp.c
> @@ -256,11 +256,6 @@ static u64 pkg_state_counter(void)
>  	return count;
>  }
>
> -static void noop_timer(unsigned long foo)
> -{
> -	/* empty... just the fact that we get the interrupt wakes us up */
> -}
> -
>  static unsigned int get_compensation(int ratio)
>  {
>  	unsigned int comp = 0;
> @@ -365,7 +360,6 @@ static bool powerclamp_adjust_controls(u
>  static int clamp_thread(void *arg)
>  {
>  	int cpunr = (unsigned long)arg;
> -	DEFINE_TIMER(wakeup_timer, noop_timer, 0, 0);
>  	static const struct sched_param param = {
>  		.sched_priority = MAX_USER_RT_PRIO/2,
>  	};
> @@ -374,11 +368,9 @@ static int clamp_thread(void *arg)
>
>  	set_bit(cpunr, cpu_clamping_mask);
>  	set_freezable();
> -	init_timer_on_stack(&wakeup_timer);
>  	sched_setscheduler(current, SCHED_FIFO, &param);
>
> -	while (true == clamping && !kthread_should_stop() &&
> -		cpu_online(cpunr)) {
> +	while (clamping && !kthread_should_stop() && cpu_online(cpunr)) {
>  		int sleeptime;
>  		unsigned long target_jiffies;
>  		unsigned int guard;
> @@ -426,35 +418,11 @@ static int clamp_thread(void *arg)
>  		if (should_skip)
>  			continue;
>
> -		target_jiffies = jiffies + duration_jiffies;
> -		mod_timer(&wakeup_timer, target_jiffies);
>  		if (unlikely(local_softirq_pending()))
>  			continue;
> -		/*
> -		 * stop tick sched during idle time, interrupts are still
> -		 * allowed. thus jiffies are updated properly.
> -		 */
> -		preempt_disable();
> -		tick_nohz_idle_enter();
> -		/* mwait until target jiffies is reached */
> -		while (time_before(jiffies, target_jiffies)) {
> -			unsigned long ecx = 1;
> -			unsigned long eax = target_mwait;
> -
> -			/*
> -			 * REVISIT: may call enter_idle() to notify drivers who
> -			 * can save power during cpu idle. same for exit_idle()
> -			 */
> -			local_touch_nmi();
> -			stop_critical_timings();
> -			mwait_idle_with_hints(eax, ecx);
> -			start_critical_timings();
> -			atomic_inc(&idle_wakeup_counter);
> -		}
> -		tick_nohz_idle_exit();
> -		preempt_enable();
> +
> +		play_idle(duration_jiffies);
>  	}
> -	del_timer_sync(&wakeup_timer);
>  	clear_bit(cpunr, cpu_clamping_mask);
>
>  	return 0;
> --- a/include/linux/cpu.h
> +++ b/include/linux/cpu.h
> @@ -255,6 +255,8 @@ enum cpuhp_state {
>  	CPUHP_ONLINE,
>  };
>
> +void play_idle(unsigned long jiffies);
> +
>  void cpu_startup_entry(enum cpuhp_state state);
>  void cpu_idle(void);
>
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1892,6 +1892,7 @@ extern void thread_group_cputime_adjuste
>  /*
>   * Per process flags
>   */
> +#define PF_IDLE		0x00000002	/* I am an IDLE thread */
>  #define PF_EXITING	0x00000004	/* getting shut down */
>  #define PF_EXITPIDONE	0x00000008	/* pi exit done on shut down */
>  #define PF_VCPU		0x00000010	/* I'm a virtual CPU */
> @@ -2204,7 +2205,7 @@ extern struct task_struct *idle_task(int
>   */
>  static inline bool is_idle_task(const struct task_struct *p)
>  {
> -	return p->pid == 0;
> +	return !!(p->flags & PF_IDLE);
>  }
>  extern struct task_struct *curr_task(int cpu);
>  extern void set_curr_task(int cpu, struct task_struct *p);
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -676,10 +676,12 @@ static void wake_up_idle_cpu(int cpu)
>  	if (cpu == smp_processor_id())
>  		return;
>
> -	if (set_nr_and_not_polling(rq->idle))
> +	rcu_read_lock();
> +	if (set_nr_and_not_polling(rq->curr))
>  		smp_send_reschedule(cpu);
>  	else
>  		trace_sched_wake_polling_cpu(cpu);
> +	rcu_read_unlock();
>  }
>
>  static bool wake_up_full_nohz_cpu(int cpu)
> @@ -1605,10 +1607,12 @@ static void ttwu_queue_remote(struct tas
>  	struct rq *rq = cpu_rq(cpu);
>
>  	if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
> -		if (!set_nr_if_polling(rq->idle))
> +		rcu_read_lock();
> +		if (!set_nr_if_polling(rq->curr))
>  			smp_send_reschedule(cpu);
>  		else
>  			trace_sched_wake_polling_cpu(cpu);
> +		rcu_read_unlock();
>  	}
>  }
>
> @@ -4537,6 +4541,7 @@ void init_idle(struct task_struct *idle,
>  	__sched_fork(0, idle);
>  	idle->state = TASK_RUNNING;
>  	idle->se.exec_start = sched_clock();
> +	idle->flags |= PF_IDLE;
>
>  	do_set_cpus_allowed(idle, cpumask_of(cpu));
>  	/*
> --- a/kernel/sched/idle.c
> +++ b/kernel/sched/idle.c
> @@ -184,66 +184,94 @@ static void cpuidle_idle_call(void)
>   *
>   * Called with polling cleared.
>   */
> -static void cpu_idle_loop(void)
> +static void do_idle(void)
>  {
> -	while (1) {
> -		/*
> -		 * If the arch has a polling bit, we maintain an invariant:
> -		 *
> -		 * The polling bit is clear if we're not scheduled (i.e. if
> -		 * rq->curr != rq->idle). This means that, if rq->idle has
> -		 * the polling bit set, then setting need_resched is
> -		 * guaranteed to cause the cpu to reschedule.
> -		 */
> +	/*
> +	 * If the arch has a polling bit, we maintain an invariant:
> +	 *
> +	 * The polling bit is clear if we're not scheduled (i.e. if rq->curr !=
> +	 * rq->idle). This means that, if rq->idle has the polling bit set,
> +	 * then setting need_resched is guaranteed to cause the cpu to
> +	 * reschedule.
> +	 */
>
> -		__current_set_polling();
> -		tick_nohz_idle_enter();
> +	__current_set_polling();
> +	tick_nohz_idle_enter();
>
> -		while (!need_resched()) {
> -			check_pgt_cache();
> -			rmb();
> -
> -			if (cpu_is_offline(smp_processor_id()))
> -				arch_cpu_idle_dead();
> -
> -			local_irq_disable();
> -			arch_cpu_idle_enter();
> -
> -			/*
> -			 * In poll mode we reenable interrupts and spin.
> -			 *
> -			 * Also if we detected in the wakeup from idle
> -			 * path that the tick broadcast device expired
> -			 * for us, we don't want to go deep idle as we
> -			 * know that the IPI is going to arrive right
> -			 * away
> -			 */
> -			if (cpu_idle_force_poll || tick_check_broadcast_expired())
> -				cpu_idle_poll();
> -			else
> -				cpuidle_idle_call();
> +	while (!need_resched()) {
> +		check_pgt_cache();
> +		rmb();
>
> -			arch_cpu_idle_exit();
> -		}
> +		if (cpu_is_offline(smp_processor_id()))
> +			arch_cpu_idle_dead();
> +
> +		local_irq_disable();
> +		arch_cpu_idle_enter();
>
>  		/*
> -		 * Since we fell out of the loop above, we know
> -		 * TIF_NEED_RESCHED must be set, propagate it into
> -		 * PREEMPT_NEED_RESCHED.
> +		 * In poll mode we reenable interrupts and spin.
>  		 *
> -		 * This is required because for polling idle loops we will
> -		 * not have had an IPI to fold the state for us.
> +		 * Also if we detected in the wakeup from idle path that the
> +		 * tick broadcast device expired for us, we don't want to go
> +		 * deep idle as we know that the IPI is going to arrive right
> +		 * away
>  		 */
> -		preempt_set_need_resched();
> -		tick_nohz_idle_exit();
> -		__current_clr_polling();
> -		smp_mb__after_clear_bit();
> -		sched_ttwu_pending();
> -		schedule_preempt_disabled();
> +		if (cpu_idle_force_poll || tick_check_broadcast_expired())
> +			cpu_idle_poll();
> +		else
> +			cpuidle_idle_call();
> +
> +		arch_cpu_idle_exit();
>  	}
> +
> +	/*
> +	 * Since we fell out of the loop above, we know TIF_NEED_RESCHED must
> +	 * be set, propagate it into PREEMPT_NEED_RESCHED.
> +	 *
> +	 * This is required because for polling idle loops we will not have had
> +	 * an IPI to fold the state for us.
> +	 */
> +	preempt_set_need_resched();
> +	tick_nohz_idle_exit();
> +	__current_clr_polling();
> +	smp_mb__after_clear_bit();
> +	sched_ttwu_pending();
> +	schedule_preempt_disabled();
> +}
> +
> +static void play_idle_timer(unsigned long foo)
> +{
> +	set_tsk_need_resched(current);
> +}
> +
> +void play_idle(unsigned long duration)
> +{
> +	DEFINE_TIMER(wakeup_timer, play_idle_timer, 0, 0);
> +
> +	/*
> +	 * Only FIFO tasks can disable the tick since they don't need the forced
> +	 * preemption.
> +	 */
> +	WARN_ON_ONCE(current->policy != SCHED_FIFO);
> +	WARN_ON_ONCE(current->nr_cpus_allowed != 1);
> +	WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY));
> +	WARN_ON_ONCE(!(current->flags & PF_KTHREAD));
> +	rcu_sleep_check();
> +
> +	init_timer_on_stack(&wakeup_timer);
> +	mod_timer_pinned(&wakeup_timer, jiffies + duration);
> +
> +	preempt_disable();
> +	current->flags |= PF_IDLE;
> +	do_idle();
> +	current->flags &= ~PF_IDLE;
> +	del_timer_sync(&wakeup_timer);
> +	preempt_fold_need_resched();
> +	preempt_enable();
>  }
> +EXPORT_SYMBOL_GPL(play_idle);
>
> -void cpu_startup_entry(enum cpuhp_state state)
> +__noreturn void cpu_startup_entry(enum cpuhp_state state)
>  {
>  	/*
>  	 * This #ifdef needs to die, but it's too late in the cycle to
> @@ -261,5 +289,6 @@ void cpu_startup_entry(enum cpuhp_state
>  	boot_init_stack_canary();
>  #endif
>  	arch_cpu_idle_prepare();
> -	cpu_idle_loop();
> +	while (1)
> +		do_idle();
>  }
> --- a/kernel/time/tick-sched.c
> +++ b/kernel/time/tick-sched.c
> @@ -807,7 +807,6 @@ void tick_nohz_idle_enter(void)
>
>  	local_irq_enable();
>  }
> -EXPORT_SYMBOL_GPL(tick_nohz_idle_enter);
>
>  /**
>   * tick_nohz_irq_exit - update next tick event from interrupt exit
> @@ -934,7 +933,6 @@ void tick_nohz_idle_exit(void)
>
>  	local_irq_enable();
>  }
> -EXPORT_SYMBOL_GPL(tick_nohz_idle_exit);
>
>  static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
>  {

[Jacob Pan]