Allow the boot CPU to be full nohz, and have it take the do_timer duty
temporarily during boot.

nohz_full has been successful at significantly reducing jitter for a
large supercomputer customer, but their job control system requires
CPU0 to be for housekeeping.

Note that this will cause the suspend / kexec freeze to occur on a
non-boot CPU, so perhaps the option needs to be made conditional per arch?

Signed-off-by: Nicholas Piggin <npig...@gmail.com>
---
 kernel/time/tick-common.c | 50 +++++++++++++++++++++++++++++++++++----
 kernel/time/tick-sched.c  | 27 +++++++++++----------
 2 files changed, 60 insertions(+), 17 deletions(-)

diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 529143b4c8d2..31146c13226e 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -46,6 +46,14 @@ ktime_t tick_period;
  *    procedure also covers cpu hotplug.
  */
 int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
+#ifdef CONFIG_NO_HZ_FULL
+/*
+ * tick_do_timer_boot_cpu indicates the boot CPU temporarily owns
+ * tick_do_timer_cpu and it should be taken over by an eligible secondary
+ * when one comes online.
+ */
+static int tick_do_timer_boot_cpu __read_mostly = -1;
+#endif
 
 /*
  * Debugging: see timer_list.c
@@ -167,6 +175,26 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
        }
 }
 
+#ifdef CONFIG_NO_HZ_FULL
+static void giveup_do_timer(void *info)
+{
+       int cpu = *(unsigned int *)info;
+
+       WARN_ON(tick_do_timer_cpu != smp_processor_id());
+
+       tick_do_timer_cpu = cpu;
+}
+
+static void tick_take_do_timer_from_boot(void)
+{
+       int cpu = smp_processor_id();
+       int from = tick_do_timer_boot_cpu;
+
+       if (from >= 0 && from != cpu)
+               smp_call_function_single(from, giveup_do_timer, &cpu, 1);
+}
+#endif
+
 /*
  * Setup the tick device
  */
@@ -186,12 +214,26 @@ static void tick_setup_device(struct tick_device *td,
                 * this cpu:
                 */
                if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
-                       if (!tick_nohz_full_cpu(cpu))
-                               tick_do_timer_cpu = cpu;
-                       else
-                               tick_do_timer_cpu = TICK_DO_TIMER_NONE;
+                       tick_do_timer_cpu = cpu;
+
                        tick_next_period = ktime_get();
                        tick_period = NSEC_PER_SEC / HZ;
+#ifdef CONFIG_NO_HZ_FULL
+                       /*
+                        * The boot CPU may be nohz_full, in which case set
+                        * tick_do_timer_boot_cpu so the first housekeeping
+                        * secondary that comes up will take do_timer from
+                        * us.
+                        */
+                       if (tick_nohz_full_cpu(cpu))
+                               tick_do_timer_boot_cpu = cpu;
+
+               } else if (tick_do_timer_boot_cpu != -1 &&
+                                               !tick_nohz_full_cpu(cpu)) {
+                       tick_take_do_timer_from_boot();
+                       tick_do_timer_boot_cpu = -1;
+                       WARN_ON(tick_do_timer_cpu != cpu);
+#endif
                }
 
                /*
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 6fa52cd6df0b..c0105bf4ecd9 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -121,10 +121,14 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
         * into a long sleep. If two CPUs happen to assign themselves to
         * this duty, then the jiffies update is still serialized by
         * jiffies_lock.
+        *
+        * If nohz_full is enabled, this should not happen because the
+        * tick_do_timer_cpu never relinquishes the do_timer duty.
         */
-       if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)
-           && !tick_nohz_full_cpu(cpu))
+       if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) {
+               WARN_ON(tick_nohz_full_running);
                tick_do_timer_cpu = cpu;
+       }
 #endif
 
        /* Check, if the jiffies need an update */
@@ -395,8 +399,8 @@ void __init tick_nohz_full_setup(cpumask_var_t cpumask)
 static int tick_nohz_cpu_down(unsigned int cpu)
 {
        /*
-        * The boot CPU handles housekeeping duty (unbound timers,
-        * workqueues, timekeeping, ...) on behalf of full dynticks
+        * The tick_do_timer_cpu CPU handles housekeeping duty (unbound
+        * timers, workqueues, timekeeping, ...) on behalf of full dynticks
         * CPUs. It must remain online when nohz full is enabled.
         */
        if (tick_nohz_full_running && tick_do_timer_cpu == cpu)
@@ -423,14 +427,6 @@ void __init tick_nohz_init(void)
                return;
        }
 
-       cpu = smp_processor_id();
-
-       if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
-               pr_warn("NO_HZ: Clearing %d from nohz_full range for timekeeping\n",
-                       cpu);
-               cpumask_clear_cpu(cpu, tick_nohz_full_mask);
-       }
-
        for_each_cpu(cpu, tick_nohz_full_mask)
                context_tracking_cpu_set(cpu);
 
@@ -904,8 +900,13 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
                /*
                 * Boot safety: make sure the timekeeping duty has been
                 * assigned before entering dyntick-idle mode,
+                * i.e. tick_do_timer_cpu is no longer TICK_DO_TIMER_BOOT.
                 */
-               if (tick_do_timer_cpu == TICK_DO_TIMER_NONE)
+               if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_BOOT))
+                       return false;
+
+               /* Should not happen for nohz-full */
+               if (WARN_ON_ONCE(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
                        return false;
        }
 
-- 
2.20.1

Reply via email to