The non-vtime dynticks-idle cputime accounting is a big mess that
accumulates into two concurrent statistics, each with its own
shortcomings:
* The accounting for online CPUs, which is based on the delta between
tick_nohz_start_idle() and tick_nohz_stop_idle().
Pros:
- Works when the tick is off
- Has nsecs granularity
Cons:
- Accounts idle steal time but doesn't subtract it from idle
cputime.
- Assumes CONFIG_IRQ_TIME_ACCOUNTING=y by not accounting IRQs
itself, so the IRQ time is simply ignored when
CONFIG_IRQ_TIME_ACCOUNTING=n.
- The windows between 1) the idle task being scheduled in and the
first call to tick_nohz_start_idle() and 2) the last call to
tick_nohz_stop_idle() and the rest of the idle task's runtime are
blind spots wrt. cputime accounting (though the amounts involved
are mostly insignificant).
- Relies on private fields outside of kernel stats, with specific
accessors.
* The accounting for offline CPUs, which is based on ticks and the
jiffies delta during which the tick was stopped.
Pros:
- Handles steal time correctly
- Handles both CONFIG_IRQ_TIME_ACCOUNTING=y and
CONFIG_IRQ_TIME_ACCOUNTING=n correctly.
- Handles the whole idle task
- Accounts directly to kernel stats, without a midlayer accumulator.
Cons:
- Doesn't elapse when the tick is off, which makes it unsuitable
for online CPUs.
- Has TICK_NSEC granularity (jiffies)
- Needs to track the dynticks-idle ticks that were accounted and
subtract them from the total jiffies time spent while the tick
was stopped. This is an ugly workaround.
Having two different accountings for a single context is not the only
problem: since those accountings are of different natures, it is
possible to observe the global idle time going backward after a CPU
goes offline.
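For reference, here is a condensed sketch of the two existing paths,
paraphrased from the code touched by the diff below (declarations
omitted, so this is illustrative pseudo-code rather than a compilable
unit):

    /* Online CPUs: nsec-granular ktime delta folded on idle exit */
    delta = ktime_sub(now, ts->idle_entrytime);
    if (nr_iowait_cpu(smp_processor_id()) > 0)
            ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
    else
            ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);

    /* Offline CPUs: tick-granular, steal-aware, fed to kernel stats */
    cputime = ticks * TICK_NSEC;
    steal = steal_account_process_time(ULONG_MAX);
    if (steal < cputime)
            account_idle_time(cputime - steal);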
Clean up the situation by introducing a hybrid approach that stays
coherent and works for both online and offline CPUs:
* Tick-based or native vtime accounting operates before the idle loop
is entered and resumes once the idle loop prepares to exit.
* When the idle loop starts, switch to dynticks-idle accounting as is
done currently, except that the statistics accumulate directly into
the relevant kernel stat fields.
* Private dyntick cputime accounting fields are removed.
* Works for both the online and offline cases.
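A condensed sketch of the resulting idle enter/exit flow (locals and
the rest of the functions omitted; the exit side assumes the usual
idle_active/tick_stopped snapshots taken earlier in
tick_nohz_idle_exit()):

    /* tick_nohz_idle_enter() */
    tick_sched_flag_set(ts, TS_FLAG_INIDLE);
    kcpustat_dyntick_start();     /* accumulate idle time into kcpustat */
    tick_nohz_start_idle(ts);     /* snapshot ts->idle_entrytime */

    /* tick_nohz_idle_exit() */
    if (idle_active)
            tick_nohz_stop_idle(ts, now);  /* fold delta into CPUTIME_IDLE/IOWAIT */
    if (tick_stopped)
            tick_nohz_idle_update_tick(ts, now);
    kcpustat_dyntick_stop();      /* resume tick/vtime accounting, account steal */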
Further improvements will include:
* Only switch to dynticks-idle cputime accounting when the tick actually
goes into dynticks mode.
* Handle CONFIG_IRQ_TIME_ACCOUNTING=n correctly such that the
dynticks-idle accounting still elapses while IRQs are serviced.
* Correctly subtract idle steal cputime from idle time
Reported-by: Xin Zhao <[email protected]>
Signed-off-by: Frederic Weisbecker <[email protected]>
---
include/linux/kernel_stat.h | 24 ++++++++++---
include/linux/vtime.h | 7 +++-
kernel/sched/cputime.c | 62 ++++++++++++++++----------------
kernel/time/tick-sched.c | 72 +++++++++++--------------------------
4 files changed, 77 insertions(+), 88 deletions(-)
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index dd020ecaf67b..ba65aad308a1 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -34,6 +34,9 @@ enum cpu_usage_stat {
};
struct kernel_cpustat {
+#ifdef CONFIG_NO_HZ_COMMON
+ int idle_dyntick;
+#endif
u64 cpustat[NR_STATS];
};
@@ -99,6 +102,20 @@ static inline unsigned long kstat_cpu_irqs_sum(unsigned int cpu)
return kstat_cpu(cpu).irqs_sum;
}
+#ifdef CONFIG_NO_HZ_COMMON
+extern void kcpustat_dyntick_start(void);
+extern void kcpustat_dyntick_stop(void);
+static inline bool kcpustat_idle_dyntick(void)
+{
+ return __this_cpu_read(kernel_cpustat.idle_dyntick);
+}
+#else
+static inline bool kcpustat_idle_dyntick(void)
+{
+ return false;
+}
+#endif /* CONFIG_NO_HZ_COMMON */
+
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
extern u64 kcpustat_field(enum cpu_usage_stat usage, int cpu);
extern void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu);
@@ -113,7 +130,7 @@ static inline void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu)
*dst = kcpustat_cpu(cpu);
}
-#endif
+#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN */
extern void account_user_time(struct task_struct *, u64);
extern void account_guest_time(struct task_struct *, u64);
@@ -127,14 +144,13 @@ extern u64 get_idle_time(struct kernel_cpustat *kcs, int cpu);
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
static inline void account_process_tick(struct task_struct *tsk, int user)
{
- vtime_flush(tsk);
+ if (!kcpustat_idle_dyntick())
+ vtime_flush(tsk);
}
#else
extern void account_process_tick(struct task_struct *, int user);
#endif
-extern void account_idle_ticks(unsigned long ticks);
-
#ifdef CONFIG_SCHED_CORE
extern void __account_forceidle_time(struct task_struct *tsk, u64 delta);
#endif
diff --git a/include/linux/vtime.h b/include/linux/vtime.h
index 10cdb08f960b..43934ff20c4a 100644
--- a/include/linux/vtime.h
+++ b/include/linux/vtime.h
@@ -32,6 +32,11 @@ static inline bool vtime_generic_enabled_cpu(int cpu)
return context_tracking_enabled_cpu(cpu);
}
+static inline bool vtime_generic_enabled_this_cpu(void)
+{
+ return context_tracking_enabled_this_cpu();
+}
+
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
extern void vtime_account_irq(struct task_struct *tsk, unsigned int offset);
extern void vtime_account_softirq(struct task_struct *tsk);
@@ -90,7 +95,7 @@ static inline bool vtime_accounting_enabled_cpu(int cpu)
static inline bool vtime_accounting_enabled_this_cpu(void)
{
- return context_tracking_enabled_this_cpu();
+ return vtime_generic_enabled_this_cpu();
}
extern void vtime_task_switch_generic(struct task_struct *prev);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index f32c169da11a..c10fcc3d65b3 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -400,16 +400,30 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
}
}
-static void irqtime_account_idle_ticks(int ticks)
-{
- irqtime_account_process_tick(current, 0, ticks);
-}
#else /* !CONFIG_IRQ_TIME_ACCOUNTING: */
-static inline void irqtime_account_idle_ticks(int ticks) { }
static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
int nr_ticks) { }
#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
+#ifdef CONFIG_NO_HZ_COMMON
+void kcpustat_dyntick_start(void)
+{
+ if (!vtime_generic_enabled_this_cpu()) {
+ vtime_dyntick_start();
+ __this_cpu_write(kernel_cpustat.idle_dyntick, 1);
+ }
+}
+
+void kcpustat_dyntick_stop(void)
+{
+ if (!vtime_generic_enabled_this_cpu()) {
+ __this_cpu_write(kernel_cpustat.idle_dyntick, 0);
+ vtime_dyntick_stop();
+ steal_account_process_time(ULONG_MAX);
+ }
+}
+#endif /* CONFIG_NO_HZ_COMMON */
+
/*
* Use precise platform statistics if available:
*/
@@ -423,11 +437,15 @@ void vtime_account_irq(struct task_struct *tsk, unsigned int offset)
vtime_account_hardirq(tsk);
} else if (pc & SOFTIRQ_OFFSET) {
vtime_account_softirq(tsk);
- } else if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) &&
- is_idle_task(tsk)) {
- vtime_account_idle(tsk);
+ } else if (!kcpustat_idle_dyntick()) {
+ if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) &&
+ is_idle_task(tsk)) {
+ vtime_account_idle(tsk);
+ } else {
+ vtime_account_kernel(tsk);
+ }
} else {
- vtime_account_kernel(tsk);
+ vtime_reset();
}
}
@@ -469,6 +487,9 @@ void account_process_tick(struct task_struct *p, int user_tick)
if (vtime_accounting_enabled_this_cpu())
return;
+ if (kcpustat_idle_dyntick())
+ return;
+
if (irqtime_enabled()) {
irqtime_account_process_tick(p, user_tick, 1);
return;
@@ -490,29 +511,6 @@ void account_process_tick(struct task_struct *p, int user_tick)
account_idle_time(cputime);
}
-/*
- * Account multiple ticks of idle time.
- * @ticks: number of stolen ticks
- */
-void account_idle_ticks(unsigned long ticks)
-{
- u64 cputime, steal;
-
- if (irqtime_enabled()) {
- irqtime_account_idle_ticks(ticks);
- return;
- }
-
- cputime = ticks * TICK_NSEC;
- steal = steal_account_process_time(ULONG_MAX);
-
- if (steal >= cputime)
- return;
-
- cputime -= steal;
- account_idle_time(cputime);
-}
-
/*
* Adjust tick based cputime random precision against scheduler runtime
* accounting.
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f1d07a0276a5..74c97ad75856 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -285,8 +285,6 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
if (IS_ENABLED(CONFIG_NO_HZ_COMMON) &&
tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
touch_softlockup_watchdog_sched();
- if (is_idle_task(current))
- ts->idle_jiffies++;
/*
* In case the current tick fired too early past its expected
* expiration, make sure we don't bypass the next clock reprogramming
@@ -744,8 +742,12 @@ static void tick_nohz_update_jiffies(ktime_t now)
static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
ktime_t delta;
+ if (vtime_generic_enabled_this_cpu())
+ return;
+
if (WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE)))
return;
@@ -753,9 +755,9 @@ static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
write_seqcount_begin(&ts->idle_sleeptime_seq);
if (nr_iowait_cpu(smp_processor_id()) > 0)
- ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
+ cpustat[CPUTIME_IOWAIT] = ktime_add(cpustat[CPUTIME_IOWAIT], delta);
else
- ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
+ cpustat[CPUTIME_IDLE] = ktime_add(cpustat[CPUTIME_IDLE], delta);
ts->idle_entrytime = now;
tick_sched_flag_clear(ts, TS_FLAG_IDLE_ACTIVE);
@@ -766,17 +768,21 @@ static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
static void tick_nohz_start_idle(struct tick_sched *ts)
{
+ if (vtime_generic_enabled_this_cpu())
+ return;
+
write_seqcount_begin(&ts->idle_sleeptime_seq);
ts->idle_entrytime = ktime_get();
tick_sched_flag_set(ts, TS_FLAG_IDLE_ACTIVE);
write_seqcount_end(&ts->idle_sleeptime_seq);
-
sched_clock_idle_sleep_event();
}
-static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime,
+static u64 get_cpu_sleep_time_us(int cpu, enum cpu_usage_stat idx,
bool compute_delta, u64 *last_update_time)
{
+ struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
+ u64 *cpustat = kcpustat_cpu(cpu).cpustat;
ktime_t now, idle;
unsigned int seq;
@@ -793,9 +799,9 @@ static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime,
if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE) &&
compute_delta) {
ktime_t delta = ktime_sub(now, ts->idle_entrytime);
- idle = ktime_add(*sleeptime, delta);
+ idle = ktime_add(cpustat[idx], delta);
} else {
- idle = *sleeptime;
+ idle = cpustat[idx];
}
} while (read_seqcount_retry(&ts->idle_sleeptime_seq, seq));
@@ -822,9 +828,7 @@ static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime,
*/
u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
{
- struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
-
- return get_cpu_sleep_time_us(ts, &ts->idle_sleeptime,
+ return get_cpu_sleep_time_us(cpu, CPUTIME_IDLE,
!nr_iowait_cpu(cpu), last_update_time);
}
EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
@@ -848,9 +852,7 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
*/
u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
{
- struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
-
- return get_cpu_sleep_time_us(ts, &ts->iowait_sleeptime,
+ return get_cpu_sleep_time_us(cpu, CPUTIME_IOWAIT,
nr_iowait_cpu(cpu), last_update_time);
}
EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
@@ -1250,10 +1252,8 @@ void tick_nohz_idle_stop_tick(void)
ts->idle_sleeps++;
ts->idle_expires = expires;
- if (!was_stopped && tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
- ts->idle_jiffies = ts->last_jiffies;
+ if (!was_stopped && tick_sched_flag_test(ts, TS_FLAG_STOPPED))
nohz_balance_enter_idle(cpu);
- }
} else {
tick_nohz_retain_tick(ts);
}
@@ -1282,6 +1282,7 @@ void tick_nohz_idle_enter(void)
WARN_ON_ONCE(ts->timer_expires_base);
tick_sched_flag_set(ts, TS_FLAG_INIDLE);
+ kcpustat_dyntick_start();
tick_nohz_start_idle(ts);
local_irq_enable();
@@ -1407,37 +1408,12 @@ unsigned long tick_nohz_get_idle_calls_cpu(int cpu)
return ts->idle_calls;
}
-static void tick_nohz_account_idle_time(struct tick_sched *ts,
- ktime_t now)
-{
- unsigned long ticks;
-
- ts->idle_exittime = now;
-
- if (vtime_accounting_enabled_this_cpu())
- return;
- /*
- * We stopped the tick in idle. update_process_times() would miss the
- * time we slept, as it does only a 1 tick accounting.
- * Enforce that this is accounted to idle !
- */
- ticks = jiffies - ts->idle_jiffies;
- /*
- * We might be one off. Do not randomly account a huge number of ticks!
- */
- if (ticks && ticks < LONG_MAX)
- account_idle_ticks(ticks);
-}
-
void tick_nohz_idle_restart_tick(void)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
- if (tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
- ktime_t now = ktime_get();
- tick_nohz_restart_sched_tick(ts, now);
- tick_nohz_account_idle_time(ts, now);
- }
+ if (tick_sched_flag_test(ts, TS_FLAG_STOPPED))
+ tick_nohz_restart_sched_tick(ts, ktime_get());
}
static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now)
@@ -1446,8 +1422,6 @@ static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now)
__tick_nohz_full_update_tick(ts, now);
else
tick_nohz_restart_sched_tick(ts, now);
-
- tick_nohz_account_idle_time(ts, now);
}
/**
@@ -1489,6 +1463,7 @@ void tick_nohz_idle_exit(void)
if (tick_stopped)
tick_nohz_idle_update_tick(ts, now);
+ kcpustat_dyntick_stop();
local_irq_enable();
}
@@ -1625,20 +1600,15 @@ void tick_setup_sched_timer(bool hrtimer)
void tick_sched_timer_dying(int cpu)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
- ktime_t idle_sleeptime, iowait_sleeptime;
unsigned long idle_calls, idle_sleeps;
/* This must happen before hrtimers are migrated! */
if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES))
hrtimer_cancel(&ts->sched_timer);
- idle_sleeptime = ts->idle_sleeptime;
- iowait_sleeptime = ts->iowait_sleeptime;
idle_calls = ts->idle_calls;
idle_sleeps = ts->idle_sleeps;
memset(ts, 0, sizeof(*ts));
- ts->idle_sleeptime = idle_sleeptime;
- ts->iowait_sleeptime = iowait_sleeptime;
ts->idle_calls = idle_calls;
ts->idle_sleeps = idle_sleeps;
}
--
2.51.1