Linus,

Please pull the latest timers-urgent-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 
timers-urgent-for-linus

   # HEAD: 2fe59f507a65dbd734b990a11ebc7488f6f87a24 timers: Fix excessive 
granularity of new timers after a nohz idle

Fix a timer granularity handling race+bug, which would manifest itself by 
spuriously increasing timeouts of some timers (from 1 jiffy to ~500 jiffies
in the worst case measured) in certain nohz states.

 Thanks,

        Ingo

------------------>
Nicholas Piggin (1):
      timers: Fix excessive granularity of new timers after a nohz idle


 kernel/time/timer.c | 50 +++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 41 insertions(+), 9 deletions(-)

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 8f5d1bf18854..f2674a056c26 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -203,6 +203,7 @@ struct timer_base {
        bool                    migration_enabled;
        bool                    nohz_active;
        bool                    is_idle;
+       bool                    must_forward_clk;
        DECLARE_BITMAP(pending_map, WHEEL_SIZE);
        struct hlist_head       vectors[WHEEL_SIZE];
 } ____cacheline_aligned;
@@ -856,13 +857,19 @@ get_target_base(struct timer_base *base, unsigned tflags)
 
 static inline void forward_timer_base(struct timer_base *base)
 {
-       unsigned long jnow = READ_ONCE(jiffies);
+       unsigned long jnow;
 
        /*
-        * We only forward the base when it's idle and we have a delta between
-        * base clock and jiffies.
+        * We only forward the base when we are idle or have just come out of
+        * idle (must_forward_clk logic), and have a delta between base clock
+        * and jiffies. In the common case, run_timers will take care of it.
         */
-       if (!base->is_idle || (long) (jnow - base->clk) < 2)
+       if (likely(!base->must_forward_clk))
+               return;
+
+       jnow = READ_ONCE(jiffies);
+       base->must_forward_clk = base->is_idle;
+       if ((long)(jnow - base->clk) < 2)
                return;
 
        /*
@@ -938,6 +945,11 @@ __mod_timer(struct timer_list *timer, unsigned long 
expires, bool pending_only)
         * same array bucket then just return:
         */
        if (timer_pending(timer)) {
+               /*
+                * The downside of this optimization is that it can result in
+                * larger granularity than you would get from adding a new
+                * timer with this expiry.
+                */
                if (timer->expires == expires)
                        return 1;
 
@@ -948,6 +960,7 @@ __mod_timer(struct timer_list *timer, unsigned long 
expires, bool pending_only)
                 * dequeue/enqueue dance.
                 */
                base = lock_timer_base(timer, &flags);
+               forward_timer_base(base);
 
                clk = base->clk;
                idx = calc_wheel_index(expires, clk);
@@ -964,6 +977,7 @@ __mod_timer(struct timer_list *timer, unsigned long 
expires, bool pending_only)
                }
        } else {
                base = lock_timer_base(timer, &flags);
+               forward_timer_base(base);
        }
 
        ret = detach_if_pending(timer, base, false);
@@ -991,12 +1005,10 @@ __mod_timer(struct timer_list *timer, unsigned long 
expires, bool pending_only)
                        raw_spin_lock(&base->lock);
                        WRITE_ONCE(timer->flags,
                                   (timer->flags & ~TIMER_BASEMASK) | 
base->cpu);
+                       forward_timer_base(base);
                }
        }
 
-       /* Try to forward a stale timer base clock */
-       forward_timer_base(base);
-
        timer->expires = expires;
        /*
         * If 'idx' was calculated above and the base time did not advance
@@ -1112,6 +1124,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
                WRITE_ONCE(timer->flags,
                           (timer->flags & ~TIMER_BASEMASK) | cpu);
        }
+       forward_timer_base(base);
 
        debug_activate(timer, timer->expires);
        internal_add_timer(base, timer);
@@ -1497,10 +1510,16 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 
basem)
                if (!is_max_delta)
                        expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
                /*
-                * If we expect to sleep more than a tick, mark the base idle:
+                * If we expect to sleep more than a tick, mark the base idle.
+                * Also the tick is stopped so any added timer must forward
+                * the base clk itself to keep granularity small. This idle
+                * logic is only maintained for the BASE_STD base, deferrable
+                * timers may still see large granularity skew (by design).
                 */
-               if ((expires - basem) > TICK_NSEC)
+               if ((expires - basem) > TICK_NSEC) {
+                       base->must_forward_clk = true;
                        base->is_idle = true;
+               }
        }
        raw_spin_unlock(&base->lock);
 
@@ -1611,6 +1630,19 @@ static __latent_entropy void run_timer_softirq(struct 
softirq_action *h)
 {
        struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
 
+       /*
+        * must_forward_clk must be cleared before running timers so that any
+        * timer functions that call mod_timer will not try to forward the
+        * base. idle trcking / clock forwarding logic is only used with
+        * BASE_STD timers.
+        *
+        * The deferrable base does not do idle tracking at all, so we do
+        * not forward it. This can result in very large variations in
+        * granularity for deferrable timers, but they can be deferred for
+        * long periods due to idle.
+        */
+       base->must_forward_clk = false;
+
        __run_timers(base);
        if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active)
                __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));

Reply via email to