Hi!,

On Fri, Sep 11 2020 at 23:48, Steven Rostedt wrote:
> The VMware PhotonOS team is evaluating 4.19-rt compared to CentOS
> 3.10-rt (franken kernel from Red Hat). They found a regression between
> the two kernels that was found to be introduced by:
>
>  d25408756accb ("clockevents: Stop unused clockevent devices")
>
> The issue is running this on a guest, and it causes a noticeable wake
> up latency in cyclictest. The 4.19-rt kernel has two extra apic
> instructions causing for two extra VMEXITs to occur over the 3.10-rt
> kernel. I found out the reason why, and this is true for vanilla 5.9-rc
> as well.
>
> When running isocpus with NOHZ_FULL, I see the following.
>
>   tick_nohz_idle_stop_tick() {
>       hrtimer_start_range_ns() {
>               remove_hrtimer(timer)
>                       /* no more timers on the base */
>                       expires = KTIME_MAX;
>                       tick_program_event() {
>                               clock_switch_state(ONESHOT_STOPPED);
>                               /* call to apic to shutdown timer */
>                       }
>               }
>               [..]
>               hrtimer_reprogram(timer) {
>                       tick_program_event() {
>                               clock_switch_state(ONESHOT);
>                               /* call to apic to enable timer again! */
>               }
>       }
>  }
>
>
> Thus, we are needlessly shutting down and restarting the apic every
> time we call tick_nohz_stop_tick() if there is a timer still on the
> queue.
>
> I'm not exactly sure how to fix this. Is there a way we can hold off
> disabling the clock here until we know that it isn't going to be
> immediately enabled again?

For the hrtimer_start_range_ns() case we can do that. Something like the
completely untested below.

Thanks,

        tglx
---
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 95b6a708b040..9931a7f66e47 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -209,6 +209,9 @@ struct hrtimer_cpu_base *get_target_base(struct 
hrtimer_cpu_base *base,
        return base;
 }
 
+static void
+hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal);
+
 /*
  * We switch the timer base to a power-optimized selected CPU target,
  * if:
@@ -223,7 +226,7 @@ struct hrtimer_cpu_base *get_target_base(struct 
hrtimer_cpu_base *base,
  */
 static inline struct hrtimer_clock_base *
 switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
-                   int pinned)
+                   int pinned, bool *reprogram_old_base)
 {
        struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base;
        struct hrtimer_clock_base *new_base;
@@ -247,6 +250,23 @@ switch_hrtimer_base(struct hrtimer *timer, struct 
hrtimer_clock_base *base,
                if (unlikely(hrtimer_callback_running(timer)))
                        return base;
 
+               /*
+                * The caller has removed the first expiring timer from
+                * @base, but avoided reprogramming the clocksource as it
+                * immediately enqueues a timer again. If the base stays
+                * the same and the removed timer was the only timer on
+                * that CPU base then reprogramming in hrtimer_remove()
+                * would shut down the clock event device just to restart
+                * it when the timer is enqueued.
+                *
+                * timer->base->lock is about to be dropped. Check whether
+                * the current base needs an update.
+                */
+               if (*reprogram_old_base) {
+                       *reprogram_old_base = false;
+                       hrtimer_force_reprogram(base->cpu_base, 1);
+               }
+
                /* See the comment in lock_hrtimer_base() */
                WRITE_ONCE(timer->base, &migration_base);
                raw_spin_unlock(&base->cpu_base->lock);
@@ -288,7 +308,12 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned 
long *flags)
        return base;
 }
 
-# define switch_hrtimer_base(t, b, p)  (b)
+static inline struct hrtimer_clock_base *
+switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
+                   int pinned, bool *reprogram_old_base)
+{
+       return base;
+}
 
 #endif /* !CONFIG_SMP */
 
@@ -1090,9 +1115,20 @@ static int __hrtimer_start_range_ns(struct hrtimer 
*timer, ktime_t tim,
                                    struct hrtimer_clock_base *base)
 {
        struct hrtimer_clock_base *new_base;
+       bool reprogram_old_base;
+       int ret;
+
+       /*
+        * If this is the first expiring timer then after removing the
+        * timer the clock event needs to be reprogrammed. But if the timer
+        * stays on the same base then this might be a pointless exercise
+        * because it's immediately enqueued again. Store the state and
+        * delay reprogramming. See below.
+        */
+       reprogram_old_base = timer == base->cpu_base->next_timer;
 
        /* Remove an active timer from the queue: */
-       remove_hrtimer(timer, base, true);
+       remove_hrtimer(timer, base, false);
 
        if (mode & HRTIMER_MODE_REL)
                tim = ktime_add_safe(tim, base->get_time());
@@ -1101,10 +1137,21 @@ static int __hrtimer_start_range_ns(struct hrtimer 
*timer, ktime_t tim,
 
        hrtimer_set_expires_range_ns(timer, tim, delta_ns);
 
-       /* Switch the timer base, if necessary: */
-       new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
+       /*
+        * Switch the timer base, if necessary. It the timer was the first
+        * expiring timer and the timer base is switched then the old base
+        * is reprogrammed before dropping the cpu base lock. In that case
+        * reprogram_old_base is then set to false.
+        */
+       new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED,
+                                      &reprogram_old_base);
 
-       return enqueue_hrtimer(timer, new_base, mode);
+       ret = enqueue_hrtimer(timer, new_base, mode);
+       if (reprogram_old_base) {
+               hrtimer_force_reprogram(base->cpu_base, 1);
+               ret = 0;
+       }
+       return ret;
 }
 
 /**

Reply via email to