Re: [PATCH 18/25] vtime: Track nice-ness on top of context switch

2018-11-20 Thread Peter Zijlstra
On Wed, Nov 14, 2018 at 03:46:02AM +0100, Frederic Weisbecker wrote:
> We need to read the nice value of the task running on any CPU, possibly
> remotely, in order to correctly support kcpustat on nohz_full.
> Unfortunately we can't just read task_nice(tsk) when tsk runs on another
> CPU because its nice value may be changed concurrently. The risk is that
> a recently modified nice value is assumed to have applied for longer
> than it actually did.
> 
> For example, if a task runs at T0 with nice = -10 and its nice value is
> changed to 10 at T0 + 1 second, a reader at T0 + 1 second could think
> that the task has had this "nice == 10" value since the beginning (T0)
> and spuriously account 1 second of nice time on kcpustat instead of
> 1 second of user time.
> 
> So we need to track nice value changes under the vtime seqcount. Start
> with context switches and account the vtime nice-ness on top of that.

Huh, what!? That doesn't make any sense..


[PATCH 18/25] vtime: Track nice-ness on top of context switch

2018-11-13 Thread Frederic Weisbecker
We need to read the nice value of the task running on any CPU, possibly
remotely, in order to correctly support kcpustat on nohz_full.
Unfortunately we can't just read task_nice(tsk) when tsk runs on another
CPU because its nice value may be changed concurrently. The risk is that
a recently modified nice value is assumed to have applied for longer
than it actually did.

For example, if a task runs at T0 with nice = -10 and its nice value is
changed to 10 at T0 + 1 second, a reader at T0 + 1 second could think
that the task has had this "nice == 10" value since the beginning (T0)
and spuriously account 1 second of nice time on kcpustat instead of
1 second of user time.

So we need to track nice value changes under the vtime seqcount. Start
with context switches and account the vtime nice-ness on top of that.
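
For illustration only (this is not part of the patch and the helper name
is made up), a remote kcpustat reader could then sample this snapshot
under the seqcount, so that elapsed vtime is always paired with the
nice-ness that was actually in effect while it accumulated:

    /*
     * Illustrative sketch: read the nice snapshot of a remotely running
     * task. Retried if a context switch updates the vtime fields
     * concurrently.
     */
    static int vtime_nice_snapshot(struct vtime *vtime)
    {
            unsigned int seq;
            int nice;

            do {
                    seq = read_seqcount_begin(&vtime->seqcount);
                    nice = vtime->nice;
            } while (read_seqcount_retry(&vtime->seqcount, seq));

            return nice;
    }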

Signed-off-by: Frederic Weisbecker 
Cc: Yauheni Kaliuta 
Cc: Thomas Gleixner 
Cc: Rik van Riel 
Cc: Peter Zijlstra 
Cc: Wanpeng Li 
Cc: Ingo Molnar 
---
 include/linux/sched.h  |  1 +
 kernel/sched/cputime.c | 44 +++++++++++++++++++++++++++++++++++---------
 2 files changed, 36 insertions(+), 9 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 27e0544..356326f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -280,6 +280,7 @@ enum vtime_state {
 struct vtime {
seqcount_t  seqcount;
unsigned long long  starttime;
+   int nice;
enum vtime_statestate;
unsigned intcpu;
u64 utime;
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 8f5dee2..07c2e7f 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -735,13 +735,42 @@ static void vtime_account_system(struct task_struct *tsk,
 static void vtime_account_guest(struct task_struct *tsk,
struct vtime *vtime)
 {
+   enum cpu_usage_stat index;
+
vtime->gtime += get_vtime_delta(vtime);
-   if (vtime->gtime >= TICK_NSEC) {
-   account_guest_time(tsk, vtime->gtime);
-   vtime->gtime = 0;
-   }
+
+   if (vtime->gtime < TICK_NSEC)
+   return;
+
+   if (vtime->nice)
+   index = CPUTIME_GUEST_NICE;
+   else
+   index = CPUTIME_GUEST;
+
+   account_guest_time_index(tsk, vtime->gtime, index);
+   vtime->gtime = 0;
 }
 
+static void vtime_account_user(struct task_struct *tsk,
+  struct vtime *vtime)
+{
+   enum cpu_usage_stat index;
+
+   vtime->utime += get_vtime_delta(vtime);
+
+   if (vtime->utime < TICK_NSEC)
+   return;
+
+   if (vtime->nice)
+   index = CPUTIME_NICE;
+   else
+   index = CPUTIME_USER;
+
+   account_user_time_index(tsk, vtime->utime, index);
+   vtime->utime = 0;
+}
+
+
 static void __vtime_account_kernel(struct task_struct *tsk,
   struct vtime *vtime)
 {
@@ -779,11 +808,7 @@ void vtime_user_exit(struct task_struct *tsk)
 struct vtime *vtime = &tsk->vtime;
 
 write_seqcount_begin(&vtime->seqcount);
-   vtime->utime += get_vtime_delta(vtime);
-   if (vtime->utime >= TICK_NSEC) {
-   account_user_time(tsk, vtime->utime);
-   vtime->utime = 0;
-   }
+   vtime_account_user(tsk, vtime);
vtime->state = VTIME_SYS;
 write_seqcount_end(&vtime->seqcount);
 }
@@ -864,6 +889,7 @@ void vtime_task_switch_generic(struct task_struct *prev)
vtime->state = VTIME_SYS;
vtime->starttime = sched_clock();
vtime->cpu = smp_processor_id();
+   vtime->nice = (task_nice(current) > 0) ? 1 : 0;
 write_seqcount_end(&vtime->seqcount);
 
rcu_assign_pointer(kcpustat->curr, current);
-- 
2.7.4
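
The "start with context switches" above implies that nice value changes
on an already running vtime task still need handling. Purely as a
hypothetical sketch (the helper name is invented, it is not part of the
patch above, and it glosses over guest time, sub-TICK_NSEC remainders
and how the rest of the series actually hooks and serializes this), such
an update would flush the time accumulated at the old nice level under
the same seqcount before publishing the new snapshot:

    /* Hypothetical illustration only, not part of the patch above. */
    static void vtime_set_nice_sketch(struct task_struct *tsk, long nice)
    {
            struct vtime *vtime = &tsk->vtime;

            write_seqcount_begin(&vtime->seqcount);
            /* Flush time accumulated at the old nice level... */
            if (vtime->state == VTIME_USER)
                    vtime_account_user(tsk, vtime);
            /* ...before remote readers can see the new snapshot. */
            vtime->nice = (nice > 0) ? 1 : 0;
            write_seqcount_end(&vtime->seqcount);
    }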


