In order to correctly implement kcpustat under nohz_full, we need to
track the task running on a given CPU and read its vtime state safely,
reliably and locklessly.

This leaves us with tracking and fetching that task under RCU, which will
be done in a later patch. Until then, we need to prepare vtime to handle
that properly and close the accounting before the earliest opportunity
for the RCU-delayed put_task_struct() to be queued. That point happens to
be in exit_notify(), in the auto-reaping case.

Therefore we need to finish the accounting right before exit_notify().
After that we shouldn't track the exiting task any further.

Signed-off-by: Frederic Weisbecker <frede...@kernel.org>
Cc: Yauheni Kaliuta <yauheni.kali...@redhat.com>
Cc: Thomas Gleixner <t...@linutronix.de>
Cc: Rik van Riel <r...@redhat.com>
Cc: Peter Zijlstra <pet...@infradead.org>
Cc: Wanpeng Li <wanpen...@tencent.com>
Cc: Ingo Molnar <mi...@kernel.org>
---
 include/linux/sched.h  |  2 ++
 include/linux/vtime.h  |  2 ++
 kernel/exit.c          |  1 +
 kernel/sched/cputime.c | 56 ++++++++++++++++++++++++++++++++++++++++++--------
 4 files changed, 52 insertions(+), 9 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d458d65..27e0544 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -265,6 +265,8 @@ struct task_cputime {
 enum vtime_state {
        /* Task is sleeping or running in a CPU with VTIME inactive: */
        VTIME_INACTIVE = 0,
+       /* Task has passed vtime_exit_task(), called right before exit_notify() */
+       VTIME_DEAD,
        /* Task is idle */
        VTIME_IDLE,
        /* Task runs in kernelspace in a CPU with VTIME active: */
diff --git a/include/linux/vtime.h b/include/linux/vtime.h
index d9160ab..8350a0b 100644
--- a/include/linux/vtime.h
+++ b/include/linux/vtime.h
@@ -73,12 +73,14 @@ extern void vtime_user_exit(struct task_struct *tsk);
 extern void vtime_guest_enter(struct task_struct *tsk);
 extern void vtime_guest_exit(struct task_struct *tsk);
 extern void vtime_init_idle(struct task_struct *tsk, int cpu);
+extern void vtime_exit_task(struct task_struct *tsk);
 #else /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN  */
 static inline void vtime_user_enter(struct task_struct *tsk) { }
 static inline void vtime_user_exit(struct task_struct *tsk) { }
 static inline void vtime_guest_enter(struct task_struct *tsk) { }
 static inline void vtime_guest_exit(struct task_struct *tsk) { }
 static inline void vtime_init_idle(struct task_struct *tsk, int cpu) { }
+static inline void vtime_exit_task(struct task_struct *tsk) { }
 #endif
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
diff --git a/kernel/exit.c b/kernel/exit.c
index 0e21e6d..cae3fe9 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -883,6 +883,7 @@ void __noreturn do_exit(long code)
         */
        flush_ptrace_hw_breakpoint(tsk);
 
+       vtime_exit_task(tsk);
        exit_tasks_rcu_start();
        exit_notify(tsk, group_dead);
        proc_exit_connector(tsk);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index f64afd7..a0c3a82 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -702,7 +702,7 @@ static u64 get_vtime_delta(struct vtime *vtime)
         * errors from causing elapsed vtime to go negative.
         */
        other = account_other_time(delta);
-       WARN_ON_ONCE(vtime->state == VTIME_INACTIVE);
+       WARN_ON_ONCE(vtime->state < VTIME_IDLE);
        vtime->starttime += delta;
 
        return delta - other;
@@ -813,17 +813,31 @@ void vtime_task_switch_generic(struct task_struct *prev)
 {
        struct vtime *vtime = &prev->vtime;
 
-       write_seqcount_begin(&vtime->seqcount);
-       if (vtime->state == VTIME_IDLE)
-               vtime_account_idle(prev);
-       else
-               __vtime_account_kernel(prev, vtime);
-       vtime->state = VTIME_INACTIVE;
-       vtime->cpu = -1;
-       write_seqcount_end(&vtime->seqcount);
+       /*
+        * Flush the prev task vtime, unless it has passed
+        * vtime_exit_task(), in which case there is nothing
+        * left to account.
+        */
+       if (vtime->state != VTIME_DEAD) {
+               write_seqcount_begin(&vtime->seqcount);
+               if (vtime->state == VTIME_IDLE)
+                       vtime_account_idle(prev);
+               else
+                       __vtime_account_kernel(prev, vtime);
+               vtime->state = VTIME_INACTIVE;
+               vtime->cpu = -1;
+               write_seqcount_end(&vtime->seqcount);
+       }
 
        vtime = &current->vtime;
 
+       /*
+        * Ignore the next task if it has been preempted after
+        * vtime_exit_task().
+        */
+       if (vtime->state == VTIME_DEAD)
+               return;
+
        write_seqcount_begin(&vtime->seqcount);
        if (is_idle_task(current))
                vtime->state = VTIME_IDLE;
@@ -850,6 +864,30 @@ void vtime_init_idle(struct task_struct *t, int cpu)
        local_irq_restore(flags);
 }
 
+/*
+ * Final settlement point: flush the pending vtime of the exiting task
+ * and mark it VTIME_DEAD so that no further vtime is accounted to it.
+ */
+void vtime_exit_task(struct task_struct *t)
+{
+       struct vtime *vtime = &t->vtime;
+       unsigned long flags;
+
+       local_irq_save(flags);
+       write_seqcount_begin(&vtime->seqcount);
+       /*
+        * A task that has never run on a nohz_full CPU is still in
+        * VTIME_INACTIVE state: nothing to flush. Otherwise flush with
+        * the same helper as vtime_task_switch_generic().
+        */
+       if (vtime->state != VTIME_INACTIVE)
+               __vtime_account_kernel(t, vtime);
+       vtime->state = VTIME_DEAD;
+       vtime->cpu = -1;
+       write_seqcount_end(&vtime->seqcount);
+       local_irq_restore(flags);
+}
+
 u64 task_gtime(struct task_struct *t)
 {
        struct vtime *vtime = &t->vtime;
-- 
2.7.4

Reply via email to