Commit-ID:  8d4c00dc38a8aa30dae8402955e55e7b34e74bc8
Gitweb:     https://git.kernel.org/tip/8d4c00dc38a8aa30dae8402955e55e7b34e74bc8
Author:     Xunlei Pang <xlp...@linux.alibaba.com>
AuthorDate: Mon, 9 Jul 2018 22:58:43 +0800
Committer:  Ingo Molnar <mi...@kernel.org>
CommitDate: Mon, 16 Jul 2018 00:28:31 +0200

sched/cputime: Ensure accurate utime and stime ratio in cputime_adjust()

When users read "/proc/pid/stat", they expect the utime and stime
ratio of the current SAMPLE period, but cputime_adjust() always
calculates with the ratio of the WHOLE lifetime of the process.

This results in inaccurate utime and stime in "/proc/pid/stat". For
example, a process runs for a while with "50% usr, 0% sys", followed
by a period of "100% sys". For that later period, the expected split is:

  0.0 usr,  100.0 sys

but we get:

  10.0 usr,  90.0 sys

because cputime_adjust() redistributes the precise rtime according to
the tick-based utime:stime ratio accumulated over the whole lifetime,
so the long user-heavy history keeps dragging the split toward user
time.

This patch makes cputime_adjust() use the accurate per-period ratio
to address the issue: a new 'task_cputime' field is added to struct
prev_cputime to record the previous 'task_cputime', so that the
elapsed times of the sample period, and with them the accurate
ratio, can be computed.
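
For illustration, here is a minimal userspace model of the two
approaches (a sketch, not kernel code: scale() is a simplified
stand-in for the kernel's scale_stime(), the monotonicity clamping at
'update:' is omitted, and the tick/rtime numbers are invented, so it
reproduces the blending effect in general rather than the exact 10/90
split above):

#include <stdio.h>

typedef unsigned long long u64;

/* Split rtime according to stime : (stime + utime); simplified
 * stand-in for the kernel's scale_stime(), no overflow handling. */
static u64 scale(u64 stime, u64 rtime, u64 total)
{
	return total ? rtime * stime / total : 0;
}

int main(void)
{
	/* Invented lifetime so far: tick sampling reports 90s usr +
	 * 90s sys while the precise rtime is 100s (tick totals
	 * routinely disagree with rtime). */
	u64 utick = 90, stick = 90, rtime = 100;
	u64 st0 = scale(stick, rtime, utick + stick);	/* 50 */
	u64 ut0 = rtime - st0;				/* 50 */

	/* ...then 10 seconds of pure system time: */
	u64 utick2 = utick, stick2 = stick + 10, rtime2 = rtime + 10;

	/* Old behaviour: rescale the WHOLE lifetime, then diff the
	 * reported samples, as top would. */
	u64 st1 = scale(stick2, rtime2, utick2 + stick2);	/* 57 */
	u64 ut1 = rtime2 - st1;					/* 53 */
	printf("old: +%llu usr, +%llu sys\n", ut1 - ut0, st1 - st0);

	/* New behaviour: scale only the deltas of the sample period. */
	u64 std = scale(stick2 - stick, rtime2 - rtime,
			(utick2 - utick) + (stick2 - stick));	/* 10 */
	u64 utd = (rtime2 - rtime) - std;			/* 0 */
	printf("new: +%llu usr, +%llu sys\n", utd, std);
	return 0;
}

The whole-lifetime rescale still attributes "+3 usr, +7 sys" to the
pure-sys interval, while the delta version reports the expected
"+0 usr, +10 sys".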

Signed-off-by: Xunlei Pang <xlp...@linux.alibaba.com>
Cc: Frederic Weisbecker <frede...@kernel.org>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: Luiz Capitulino <lcapitul...@redhat.com>
Cc: Peter Zijlstra <pet...@infradead.org>
Cc: Tejun Heo <t...@kernel.org>
Cc: Thomas Gleixner <t...@linutronix.de>
Cc: baoyou....@gmail.com
Link: http://lkml.kernel.org/r/20180709145843.126583-1-xlp...@linux.alibaba.com
Signed-off-by: Ingo Molnar <mi...@kernel.org>
---
 include/linux/sched.h         | 34 ++++++++++++------------
 include/linux/sched/cputime.h | 12 ++++++++-
 kernel/sched/cputime.c        | 61 ++++++++++++++++---------------------------
 3 files changed, 52 insertions(+), 55 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 43731fe51c97..fedc69d4a425 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -223,10 +223,27 @@ extern void io_schedule_finish(int token);
 extern long io_schedule_timeout(long timeout);
 extern void io_schedule(void);
 
+/**
+ * struct task_cputime - collected CPU time counts
+ * @utime:             time spent in user mode, in nanoseconds
+ * @stime:             time spent in kernel mode, in nanoseconds
+ * @sum_exec_runtime:  total time spent on the CPU, in nanoseconds
+ *
+ * This structure groups together three kinds of CPU time that are tracked for
+ * threads and thread groups.  Most things considering CPU time want to group
+ * these counts together and treat all three of them in parallel.
+ */
+struct task_cputime {
+       u64                             utime;
+       u64                             stime;
+       unsigned long long              sum_exec_runtime;
+};
+
 /**
  * struct prev_cputime - snapshot of system and user cputime
  * @utime: time spent in user mode
  * @stime: time spent in system mode
+ * @cputime: previous task_cputime to calculate utime/stime
  * @lock: protects the above two fields
  *
  * Stores previous user/system time values such that we can guarantee
@@ -236,26 +253,11 @@ struct prev_cputime {
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
        u64                             utime;
        u64                             stime;
+       struct task_cputime             cputime;
        raw_spinlock_t                  lock;
 #endif
 };
 
-/**
- * struct task_cputime - collected CPU time counts
- * @utime:             time spent in user mode, in nanoseconds
- * @stime:             time spent in kernel mode, in nanoseconds
- * @sum_exec_runtime:  total time spent on the CPU, in nanoseconds
- *
- * This structure groups together three kinds of CPU time that are tracked for
- * threads and thread groups.  Most things considering CPU time want to group
- * these counts together and treat all three of them in parallel.
- */
-struct task_cputime {
-       u64                             utime;
-       u64                             stime;
-       unsigned long long              sum_exec_runtime;
-};
-
 /* Alternate field names when used on cache expirations: */
 #define virt_exp                       utime
 #define prof_exp                       stime
diff --git a/include/linux/sched/cputime.h b/include/linux/sched/cputime.h
index 53f883f5a2fd..49f8fd2564ed 100644
--- a/include/linux/sched/cputime.h
+++ b/include/linux/sched/cputime.h
@@ -175,10 +175,20 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
        atomic64_add(ns, &cputimer->cputime_atomic.sum_exec_runtime);
 }
 
-static inline void prev_cputime_init(struct prev_cputime *prev)
+static inline void prev_cputime_clear(struct prev_cputime *prev)
 {
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
        prev->utime = prev->stime = 0;
+       prev->cputime.utime = 0;
+       prev->cputime.stime = 0;
+       prev->cputime.sum_exec_runtime = 0;
+#endif
+}
+
+static inline void prev_cputime_init(struct prev_cputime *prev)
+{
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+       prev_cputime_clear(prev);
        raw_spin_lock_init(&prev->lock);
 #endif
 }
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 0796f938c4f0..a68483ee3ad7 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -590,69 +590,54 @@ drop_precision:
 void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
                    u64 *ut, u64 *st)
 {
-       u64 rtime, stime, utime;
+       u64 rtime_delta, stime_delta, utime_delta;
        unsigned long flags;
 
        /* Serialize concurrent callers such that we can honour our guarantees */
        raw_spin_lock_irqsave(&prev->lock, flags);
-       rtime = curr->sum_exec_runtime;
 
        /*
         * This is possible under two circumstances:
-        *  - rtime isn't monotonic after all (a bug);
+        *  - task_cputime isn't monotonic after all (a bug);
         *  - we got reordered by the lock.
         *
         * In both cases this acts as a filter such that the rest of the code
         * can assume it is monotonic regardless of anything else.
         */
-       if (prev->stime + prev->utime >= rtime)
+       if (prev->cputime.utime > curr->utime ||
+           prev->cputime.stime > curr->stime ||
+           prev->cputime.sum_exec_runtime >= curr->sum_exec_runtime)
                goto out;
 
-       stime = curr->stime;
-       utime = curr->utime;
+       stime_delta = curr->stime - prev->cputime.stime;
+       utime_delta = curr->utime - prev->cputime.utime;
+       rtime_delta = curr->sum_exec_runtime - prev->cputime.sum_exec_runtime;
 
        /*
-        * If either stime or utime are 0, assume all runtime is userspace.
-        * Once a task gets some ticks, the monotonicy code at 'update:'
-        * will ensure things converge to the observed ratio.
+        * If either the stime or utime increase is 0, assume all runtime
+        * is userspace. Once a task gets some ticks, the monotonicity code
+        * at 'update:' will ensure things converge to the observed ratio.
         */
-       if (stime == 0) {
-               utime = rtime;
+       if (stime_delta == 0) {
+               utime_delta = rtime_delta;
                goto update;
        }
 
-       if (utime == 0) {
-               stime = rtime;
+       if (utime_delta == 0) {
+               stime_delta = rtime_delta;
                goto update;
        }
 
-       stime = scale_stime(stime, rtime, stime + utime);
+       stime_delta = scale_stime(stime_delta, rtime_delta,
+                               stime_delta + utime_delta);
+       if (stime_delta > rtime_delta)
+               stime_delta = rtime_delta;
+       utime_delta = rtime_delta - stime_delta;
 
 update:
-       /*
-        * Make sure stime doesn't go backwards; this preserves monotonicity
-        * for utime because rtime is monotonic.
-        *
-        *  utime_i+1 = rtime_i+1 - stime_i
-        *            = rtime_i+1 - (rtime_i - utime_i)
-        *            = (rtime_i+1 - rtime_i) + utime_i
-        *            >= utime_i
-        */
-       if (stime < prev->stime)
-               stime = prev->stime;
-       utime = rtime - stime;
-
-       /*
-        * Make sure utime doesn't go backwards; this still preserves
-        * monotonicity for stime, analogous argument to above.
-        */
-       if (utime < prev->utime) {
-               utime = prev->utime;
-               stime = rtime - utime;
-       }
-
-       prev->stime = stime;
-       prev->utime = utime;
+       prev->cputime = *curr;
+       prev->utime += utime_delta;
+       prev->stime += stime_delta;
 out:
        *ut = prev->utime;
        *st = prev->stime;

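For reference, the per-period split can be observed from userspace by
sampling utime and stime (fields 14 and 15 of /proc/<pid>/stat, in
clock ticks) across an interval, which is essentially what top does.
A minimal sketch, assuming the standard procfs stat layout and with
error handling kept to a minimum:

#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int read_cputime(const char *path, unsigned long *ut, unsigned long *st)
{
	char buf[4096], *p;
	FILE *f = fopen(path, "r");

	if (!f)
		return -1;
	p = fgets(buf, sizeof(buf), f) ? strrchr(buf, ')') : NULL;
	fclose(f);

	/* comm (field 2) may contain spaces, so parse after the last ')'
	 * and skip the 11 tokens before utime (14) and stime (15). */
	if (!p || sscanf(p + 1,
			 " %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %lu %lu",
			 ut, st) != 2)
		return -1;
	return 0;
}

int main(int argc, char **argv)
{
	char path[64];
	unsigned long ut1, st1, ut2, st2;

	snprintf(path, sizeof(path), "/proc/%s/stat",
		 argc > 1 ? argv[1] : "self");
	if (read_cputime(path, &ut1, &st1))
		return 1;
	sleep(1);	/* one sample period */
	if (read_cputime(path, &ut2, &st2))
		return 1;
	printf("interval: +%lu usr ticks, +%lu sys ticks\n",
	       ut2 - ut1, st2 - st1);
	return 0;
}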