On 2/6/26 7:52 PM, Frederic Weisbecker wrote:
Currently the tick subsystem stores the idle cputime accounting in
private fields, allowing cohabitation with architecture idle vtime
accounting. The former is fetched on online CPUs, the latter on offline
CPUs.
For consolidation purpose, architecture vtime accounting will continue
to account the cputime but will make a break when the idle tick is
stopped. The dyntick cputime accounting will then be relayed by the tick
subsystem so that the idle cputime is still seen advancing coherently
even when the tick isn't there to flush the idle vtime.
Prepare for that and introduce three new APIs which will be used in
subsequent patches:
_ vtime_dynticks_start() is deemed to be called when idle enters in
dyntick mode. The idle cputime that elapsed so far is accumulated.
- vtime_dynticks_stop() is deemed to be called when idle exits from
dyntick mode. The vtime entry clocks are fast-forward to current time
so that idle accounting restarts elapsing from now.
- vtime_reset() is deemed to be called from dynticks idle IRQ entry to
fast-forward the clock to current time so that the IRQ time is still
accounted by vtime while nohz cputime is paused.
Also accumulated vtime won't be flushed from dyntick-idle ticks to avoid
accounting twice the idle cputime, along with nohz accounting.
Signed-off-by: Frederic Weisbecker <[email protected]>
Reviewed-by: Shrikanth Hegde <[email protected]>
---
arch/powerpc/kernel/time.c | 41 ++++++++++++++++++++++++++++++++++++++
include/linux/vtime.h | 6 ++++++
2 files changed, 47 insertions(+)
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 4bbeb8644d3d..18506740f4a4 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -376,6 +376,47 @@ void vtime_task_switch(struct task_struct *prev)
acct->starttime = acct0->starttime;
}
}
+
+#ifdef CONFIG_NO_HZ_COMMON
+/**
+ * vtime_reset - Fast forward vtime entry clocks
+ *
+ * Called from dynticks idle IRQ entry to fast-forward the clocks to current
time
+ * so that the IRQ time is still accounted by vtime while nohz cputime is
paused.
+ */
+void vtime_reset(void)
+{
+ struct cpu_accounting_data *acct = get_accounting(current);
+
+ acct->starttime = mftb();
I figured out why those huge values happen.
This happens because mftb is from when the system is booted.
I was doing kexec to start the new kernel and mftb wasn't getting
reset.
I thought about this. This is concern for pseries too, where LPAR's
restart but system won't restart and mftb will continue to run instead of
reset.
I think we should be using sched_clock instead of mftb here.
Though we need it a few more places and some cosmetic changes around it.
Note: Some values being huge exists without series for few CPUs, with series it
shows up in most of the CPUs.
So I am planning send out fix below fix separately keeping your
series as dependency.
---
arch/powerpc/include/asm/accounting.h | 4 ++--
arch/powerpc/include/asm/cputime.h | 14 +++++++-------
arch/powerpc/kernel/time.c | 22 +++++++++++-----------
3 files changed, 20 insertions(+), 20 deletions(-)
diff --git a/arch/powerpc/include/asm/accounting.h
b/arch/powerpc/include/asm/accounting.h
index 6d79c31700e2..50f120646e6d 100644
--- a/arch/powerpc/include/asm/accounting.h
+++ b/arch/powerpc/include/asm/accounting.h
@@ -21,8 +21,8 @@ struct cpu_accounting_data {
unsigned long steal_time;
unsigned long idle_time;
/* Internal counters */
- unsigned long starttime; /* TB value snapshot */
- unsigned long starttime_user; /* TB value on exit to usermode */
+ unsigned long starttime; /* Time value snapshot */
+ unsigned long starttime_user; /* Time value on exit to usermode */
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
unsigned long startspurr; /* SPURR value snapshot */
unsigned long utime_sspurr; /* ->user_time when ->startspurr set */
diff --git a/arch/powerpc/include/asm/cputime.h
b/arch/powerpc/include/asm/cputime.h
index aff858ca99c0..eb6b629b113f 100644
--- a/arch/powerpc/include/asm/cputime.h
+++ b/arch/powerpc/include/asm/cputime.h
@@ -20,9 +20,9 @@
#include <asm/time.h>
#include <asm/param.h>
#include <asm/firmware.h>
+#include <linux/sched/clock.h>
#ifdef __KERNEL__
-#define cputime_to_nsecs(cputime) tb_to_ns(cputime)
/*
* PPC64 uses PACA which is task independent for storing accounting data while
@@ -44,20 +44,20 @@
*/
static notrace inline void account_cpu_user_entry(void)
{
- unsigned long tb = mftb();
+ unsigned long now = sched_clock();
struct cpu_accounting_data *acct = raw_get_accounting(current);
- acct->utime += (tb - acct->starttime_user);
- acct->starttime = tb;
+ acct->utime += (now - acct->starttime_user);
+ acct->starttime = now;
}
static notrace inline void account_cpu_user_exit(void)
{
- unsigned long tb = mftb();
+ unsigned long now = sched_clock();
struct cpu_accounting_data *acct = raw_get_accounting(current);
- acct->stime += (tb - acct->starttime);
- acct->starttime_user = tb;
+ acct->stime += (now - acct->starttime);
+ acct->starttime_user = now;
}
static notrace inline void account_stolen_time(void)
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 18506740f4a4..fb67cdae3bcb 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -215,7 +215,7 @@ static unsigned long vtime_delta(struct cpu_accounting_data
*acct,
WARN_ON_ONCE(!irqs_disabled());
- now = mftb();
+ now = sched_clock();
stime = now - acct->starttime;
acct->starttime = now;
@@ -299,9 +299,9 @@ static void vtime_flush_scaled(struct task_struct *tsk,
{
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
if (acct->utime_scaled)
- tsk->utimescaled += cputime_to_nsecs(acct->utime_scaled);
+ tsk->utimescaled += acct->utime_scaled;
if (acct->stime_scaled)
- tsk->stimescaled += cputime_to_nsecs(acct->stime_scaled);
+ tsk->stimescaled += acct->stime_scaled;
acct->utime_scaled = 0;
acct->utime_sspurr = 0;
@@ -321,28 +321,28 @@ void vtime_flush(struct task_struct *tsk)
struct cpu_accounting_data *acct = get_accounting(tsk);
if (acct->utime)
- account_user_time(tsk, cputime_to_nsecs(acct->utime));
+ account_user_time(tsk, acct->utime);
if (acct->gtime)
- account_guest_time(tsk, cputime_to_nsecs(acct->gtime));
+ account_guest_time(tsk, acct->gtime);
if (IS_ENABLED(CONFIG_PPC_SPLPAR) && acct->steal_time) {
- account_steal_time(cputime_to_nsecs(acct->steal_time));
+ account_steal_time(acct->steal_time);
acct->steal_time = 0;
}
if (acct->idle_time)
- account_idle_time(cputime_to_nsecs(acct->idle_time));
+ account_idle_time(acct->idle_time);
if (acct->stime)
- account_system_index_time(tsk, cputime_to_nsecs(acct->stime),
+ account_system_index_time(tsk, acct->stime,
CPUTIME_SYSTEM);
if (acct->hardirq_time)
- account_system_index_time(tsk,
cputime_to_nsecs(acct->hardirq_time),
+ account_system_index_time(tsk, acct->hardirq_time,
CPUTIME_IRQ);
if (acct->softirq_time)
- account_system_index_time(tsk,
cputime_to_nsecs(acct->softirq_time),
+ account_system_index_time(tsk, acct->softirq_time,
CPUTIME_SOFTIRQ);
vtime_flush_scaled(tsk, acct);
@@ -388,7 +388,7 @@ void vtime_reset(void)
{
struct cpu_accounting_data *acct = get_accounting(current);
- acct->starttime = mftb();
+ acct->starttime = sched_clock();
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
acct->startspurr = read_spurr(acct->starttime);
#endif
--
2.43.0