(Please ignore the previous patch, as it never really prevented
the last idler from sleeping longer than timekeeping_max_deferment()
allows.)

An adaptive nohz (AHZ) CPU may not call do_timer() for a while
despite being non-idle. When all other CPUs are idle, AHZ
CPUs might be using stale jiffies values. To prevent this,
always keep one CPU ticking while there are one or more AHZ
CPUs.

A new function, check_drop_timer_duty(), handles the updates
to the tick_do_timer_cpu value and makes sure that the jiffies
update is done when there are non-idle adaptive-nohz CPUs.
Also added is a new field in struct tick_sched, user_nohz, that
indicates whether the CPU is ready to run something other than the
idle task without ticks once it drops the do_timer() duty. This
also facilitates the system-wide tick shutdown when all CPUs,
including AHZ CPUs, are idle.

Signed-off-by: Hakan Akkan <hakanak...@gmail.com>
Cc: Frederic Weisbecker <fweis...@gmail.com>
Cc: Thomas Gleixner <t...@linutronix.de>
Cc: Steven Rostedt <rost...@goodmis.org>
Cc: Peter Zijlstra <pet...@infradead.org>
Cc: Ingo Molnar <mi...@kernel.org>
---
 include/linux/tick.h     |    2 +
 kernel/time/tick-sched.c |  219 ++++++++++++++++++++++++++++++++++++----------
 2 files changed, 174 insertions(+), 47 deletions(-)

diff --git a/include/linux/tick.h b/include/linux/tick.h
index 93add37..0a65dfb 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -54,6 +54,7 @@ enum tick_saved_jiffies {
  * @iowait_sleeptime:          Sum of the time slept in idle with sched tick stopped, with IO outstanding
  * @sleep_length:              Duration of the current idle sleep
  * @do_timer_lst:              CPU was the last one doing do_timer before going idle
+ * @user_nohz:                 CPU wants to switch to adaptive nohz mode
  */
 struct tick_sched {
        struct hrtimer                  sched_timer;
@@ -77,6 +78,7 @@ struct tick_sched {
        unsigned long                   next_jiffies;
        ktime_t                         idle_expires;
        int                             do_timer_last;
+       int                             user_nohz;
 };
 
 extern void __init tick_init(void);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index bdc8aeb..f9a85e0 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -172,6 +172,115 @@ update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_upda
 
 }
 
+#ifdef CONFIG_CPUSETS_NO_HZ
+/*
+ * This defines the number of CPUs currently in (or wanting to
+ * be in) adaptive nohz mode. Greater than 0 means at least
+ * one CPU is ready to shut down its tick for non-idle purposes.
+ */
+static atomic_t __read_mostly nr_cpus_user_nohz = ATOMIC_INIT(0);
+
+static inline int update_do_timer_cpu(int current_handler,
+                       int new_handler)
+{
+       return cmpxchg(&tick_do_timer_cpu, current_handler, new_handler);
+}
+#else
+static inline int update_do_timer_cpu(int current_handler,
+                       int new_handler)
+{
+       int tmp = ACCESS_ONCE(tick_do_timer_cpu);
+       tick_do_timer_cpu = new_handler;
+       return tmp;
+}
+#endif
+
+/*
+ * check_drop_timer_duty: Check if this cpu can shut down
+ * ticks without worrying about who is going to handle
+ * timekeeping. The duty is dropped here as well if possible.
+ * When there are adaptive nohz cpus in the system ready to
+ * run user tasks without ticks, this function makes sure
+ * that timekeeping is handled by a cpu. A non-adaptive-nohz
+ * cpu, if any, will claim the duty as soon as it discovers
+ * that some adaptive-nohz cpu is stuck with it.
+ *
+ * Returns the new value of tick_do_timer_cpu.
+ */
+static int check_drop_timer_duty(int cpu)
+{
+       int curr_handler, prev_handler, new_handler;
+       int nrepeat = -1;
+       bool drop_recheck;
+
+repeat:
+       WARN_ON_ONCE(++nrepeat > 1);
+       drop_recheck = false;
+       curr_handler = cpu;
+       new_handler = TICK_DO_TIMER_NONE;
+
+#ifdef CONFIG_CPUSETS_NO_HZ
+       if (atomic_read(&nr_cpus_user_nohz) > 0) {
+               curr_handler = ACCESS_ONCE(tick_do_timer_cpu);
+               /*
+                * Keep the duty until someone takes it away.
+                * FIXME: Make nr_cpus_user_nohz an atomic cpumask
+                * to find an idle CPU to dump the duty at.
+                */
+               if (curr_handler == cpu)
+                       return cpu;
+               /*
+                * This cpu will try to take the duty if 1) there is
+                * no handler or 2) current handler seems to be an
+                * adaptive-nohz cpu. We take the duty from others
+                * only if we are idle or not part of an
+                * adaptive-nohz cpuset.
+                * Once we take the duty, the check above ensures that
+                * we stick with it.
+                */
+               if (unlikely(curr_handler == TICK_DO_TIMER_NONE)
+                       || (per_cpu(tick_cpu_sched, curr_handler).user_nohz
+                               && (is_idle_task(current)
+                                       || !cpuset_cpu_adaptive_nohz(cpu))))
+                       new_handler = cpu;
+               else
+                       /*
+                        * A regular CPU is updating the jiffies and we don't
+                        * have to take it away from her.
+                        */
+                       new_handler = curr_handler;
+       } else {
+               /*
+                * We might miss an nr_cpus_user_nohz update and drop the
+                * duty while other CPUs think that we keep handling the
+                * timekeeping. To prevent this, we recheck its value after
+                * we update tick_do_timer_cpu and start over if
+                * necessary.
+                */
+               drop_recheck = true;
+       }
+#endif
+
+       prev_handler = update_do_timer_cpu(curr_handler, new_handler);
+
+       if (drop_recheck && atomic_read(&nr_cpus_user_nohz) > 0)
+               goto repeat;
+
+       if (likely(new_handler != TICK_DO_TIMER_NONE)) {
+               if (prev_handler == curr_handler)
+                       return new_handler;
+               /*
+                * Handler was probably changed under us. Whoever has
+                * the duty might just drop it and we wouldn't know.
+                * So, let's try again...
+                */
+               goto repeat;
+       } else {
+               /* We either just dropped the duty or didn't have it. */
+               return prev_handler == cpu ? TICK_DO_TIMER_NONE : prev_handler;
+       }
+}
+
 static void tick_nohz_stop_idle(int cpu, ktime_t now)
 {
        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
@@ -187,6 +296,14 @@ static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts)
        ktime_t now = ktime_get();
 
        ts->idle_entrytime = now;
+
+#ifdef CONFIG_CPUSETS_NO_HZ
+       if (ts->user_nohz) {
+               ts->user_nohz = 0;
+               WARN_ON_ONCE(atomic_add_negative(-1, &nr_cpus_user_nohz));
+       }
+#endif
+
        ts->idle_active = 1;
        sched_clock_idle_sleep_event();
        return now;
@@ -280,6 +397,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
        ktime_t last_update, expires, ret = { .tv64 = 0 };
        struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
        u64 time_delta;
+       int new_handler, prev_handler;
 
 
        /* Read jiffies and the time when jiffies were updated last */
@@ -308,24 +426,25 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 
        /* Schedule the tick, if we are at least one jiffie off */
        if ((long)delta_jiffies >= 1) {
+               /*
+                * Check if adaptive nohz needs this CPU to take care
+                * of the jiffies update. We also drop the duty in this
+                * function if we can.
+                */
+               prev_handler = ACCESS_ONCE(tick_do_timer_cpu);
+               new_handler = check_drop_timer_duty(cpu);
+               if (new_handler == cpu)
+                       goto out;
 
                /*
-                * If this cpu is the one which updates jiffies, then
-                * give up the assignment and let it be taken by the
-                * cpu which runs the tick timer next, which might be
-                * this cpu as well. If we don't drop this here the
-                * jiffies might be stale and do_timer() never
-                * invoked. Keep track of the fact that it was the one
-                * which had the do_timer() duty last. If this cpu is
-                * the one which had the do_timer() duty last, we
-                * limit the sleep time to the timekeeping
-                * max_deferement value which we retrieved
+                * If this cpu is the one which had the do_timer()
+                * duty last, we limit the sleep time to the
+                * timekeeping max_deferment value which we retrieved
                 * above. Otherwise we can sleep as long as we want.
                 */
-               if (cpu == tick_do_timer_cpu) {
-                       tick_do_timer_cpu = TICK_DO_TIMER_NONE;
+               if (prev_handler == cpu) {
                        ts->do_timer_last = 1;
-               } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
+               } else if (new_handler != TICK_DO_TIMER_NONE) {
                        time_delta = KTIME_MAX;
                        ts->do_timer_last = 0;
                } else if (!ts->do_timer_last) {
@@ -419,6 +538,10 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
         * invoked.
         */
        if (unlikely(!cpu_online(cpu))) {
+               /*
+                * FIXME: Might need some sort of protection
+                * against CPU hotunplug for adaptive nohz.
+                */
                if (cpu == tick_do_timer_cpu)
                        tick_do_timer_cpu = TICK_DO_TIMER_NONE;
        }
@@ -510,19 +633,24 @@ void tick_nohz_idle_enter(void)
 }
 
 #ifdef CONFIG_CPUSETS_NO_HZ
-static bool can_stop_adaptive_tick(void)
+static bool can_stop_adaptive_tick(struct tick_sched *ts)
 {
-       if (!sched_can_stop_tick())
-               return false;
-
-       if (posix_cpu_timers_running(current))
-               return false;
-
-       /* Is there a grace period to complete ? */
-       if (rcu_pending(smp_processor_id()))
-               return false;
+       int ret = true;
+
+       if (!sched_can_stop_tick()
+               || posix_cpu_timers_running(current)
+               || rcu_pending(smp_processor_id()))
+               ret = false;
+
+       if (ret && !ts->user_nohz) {
+               ts->user_nohz = 1;
+               atomic_inc(&nr_cpus_user_nohz);
+       } else if (!ret && ts->user_nohz) {
+               ts->user_nohz = 0;
+               WARN_ON_ONCE(atomic_add_negative(-1, &nr_cpus_user_nohz));
+       }
 
-       return true;
+       return ret;
 }
 
 static void tick_nohz_cpuset_stop_tick(struct tick_sched *ts)
@@ -541,7 +669,7 @@ static void tick_nohz_cpuset_stop_tick(struct tick_sched *ts)
        if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
                return;
 
-       if (!can_stop_adaptive_tick())
+       if (!can_stop_adaptive_tick(ts))
                return;
 
        /*
@@ -990,27 +1118,14 @@ void tick_nohz_exit_exception(struct pt_regs *regs)
                tick_nohz_exit_kernel();
 }
 
-/*
- * Take the timer duty if nobody is taking care of it.
- * If a CPU already does and and it's in a nohz cpuset,
- * then take the charge so that it can switch to nohz mode.
- */
-static void tick_do_timer_check_handler(int cpu)
+static void tick_nohz_restart_adaptive(struct tick_sched *ts)
 {
-       int handler = tick_do_timer_cpu;
+       tick_nohz_flush_current_times(true);
 
-       if (unlikely(handler == TICK_DO_TIMER_NONE)) {
-               tick_do_timer_cpu = cpu;
-       } else {
-               if (!cpuset_adaptive_nohz() &&
-                   cpuset_cpu_adaptive_nohz(handler))
-                       tick_do_timer_cpu = cpu;
+       if (ts->user_nohz) {
+               ts->user_nohz = 0;
+               WARN_ON_ONCE(atomic_add_negative(-1, &nr_cpus_user_nohz));
        }
-}
-
-static void tick_nohz_restart_adaptive(void)
-{
-       tick_nohz_flush_current_times(true);
        tick_nohz_restart_sched_tick();
        clear_thread_flag(TIF_NOHZ);
        trace_printk("clear TIF_NOHZ\n");
@@ -1022,8 +1137,8 @@ void tick_nohz_check_adaptive(void)
        struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
 
        if (ts->tick_stopped && !is_idle_task(current)) {
-               if (!can_stop_adaptive_tick())
-                       tick_nohz_restart_adaptive();
+               if (!can_stop_adaptive_tick(ts))
+                       tick_nohz_restart_adaptive(ts);
        }
 }
 
@@ -1033,7 +1148,7 @@ void cpuset_exit_nohz_interrupt(void *unused)
 
        trace_printk("IPI: Nohz exit\n");
        if (ts->tick_stopped && !is_idle_task(current))
-               tick_nohz_restart_adaptive();
+               tick_nohz_restart_adaptive(ts);
 }
 
 /*
@@ -1122,7 +1237,17 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
        ktime_t now = ktime_get();
        int cpu = smp_processor_id();
 
-       tick_do_timer_check_handler(cpu);
+#ifdef CONFIG_NO_HZ
+       /*
+        * Check if the do_timer duty was dropped. We don't care about
+        * concurrency: This happens only when the cpu in charge went
+        * into a long sleep. If two cpus happen to assign themself to
+        * this duty, then the jiffies update is still serialized by
+        * xtime_lock.
+        */
+       if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
+               tick_do_timer_cpu = cpu;
+#endif
 
        /* Check, if the jiffies need an update */
        if (tick_do_timer_cpu == cpu)
-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to