Re: [PATCH 18/25] vtime: Track nice-ness on top of context switch
On Wed, Nov 14, 2018 at 03:46:02AM +0100, Frederic Weisbecker wrote: > We need to read the nice value of the task running on any CPU, possibly > remotely, in order to correctly support kcpustat on nohz_full. > Unfortunately we can't just read task_nice(tsk) when tsk runs on another > CPU because its nice value may be concurrently changed. There could be a > risk that a recently modified nice value is thought to apply for a longer > while than is supposed to. > > For example if a task runs at T0 with nice = -10, then its nice value > is changed at T0 + 1 second with nice = 10, a reader at T0 + 1 second > could think that the task had this "nice == 10" value since the beginning > (T0) and spuriously account 1 second nice time on kcpustat instead of 1 > second user time. > > So we need to track the nice value changes under vtime seqcount. Start > with context switches and account the vtime nice-ness on top of it. Huh, what!? That doesn't make any sense..
Re: [PATCH 18/25] vtime: Track nice-ness on top of context switch
On Wed, Nov 14, 2018 at 03:46:02AM +0100, Frederic Weisbecker wrote: > We need to read the nice value of the task running on any CPU, possibly > remotely, in order to correctly support kcpustat on nohz_full. > Unfortunately we can't just read task_nice(tsk) when tsk runs on another > CPU because its nice value may be concurrently changed. There could be a > risk that a recently modified nice value is thought to apply for a longer > while than is supposed to. > > For example if a task runs at T0 with nice = -10, then its nice value > is changed at T0 + 1 second with nice = 10, a reader at T0 + 1 second > could think that the task had this "nice == 10" value since the beginning > (T0) and spuriously account 1 second nice time on kcpustat instead of 1 > second user time. > > So we need to track the nice value changes under vtime seqcount. Start > with context switches and account the vtime nice-ness on top of it. Huh, what!? That doesn't make any sense..
[PATCH 18/25] vtime: Track nice-ness on top of context switch
We need to read the nice value of the task running on any CPU, possibly remotely, in order to correctly support kcpustat on nohz_full. Unfortunately we can't just read task_nice(tsk) when tsk runs on another CPU because its nice value may be concurrently changed. There could be a risk that a recently modified nice value is thought to apply for a longer while than is supposed to. For example if a task runs at T0 with nice = -10, then its nice value is changed at T0 + 1 second with nice = 10, a reader at T0 + 1 second could think that the task had this "nice == 10" value since the beginning (T0) and spuriously account 1 second nice time on kcpustat instead of 1 second user time. So we need to track the nice value changes under vtime seqcount. Start with context switches and account the vtime nice-ness on top of it. Signed-off-by: Frederic Weisbecker Cc: Yauheni Kaliuta Cc: Thomas Gleixner Cc: Rik van Riel Cc: Peter Zijlstra Cc: Wanpeng Li Cc: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched/cputime.c | 44 +++- 2 files changed, 36 insertions(+), 9 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 27e0544..356326f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -280,6 +280,7 @@ enum vtime_state { struct vtime { seqcount_t seqcount; unsigned long long starttime; + int nice; enum vtime_state state; unsigned int cpu; u64 utime; diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 8f5dee2..07c2e7f 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -735,13 +735,42 @@ static void vtime_account_system(struct task_struct *tsk, static void vtime_account_guest(struct task_struct *tsk, struct vtime *vtime) { + enum cpu_usage_stat index; + vtime->gtime += get_vtime_delta(vtime); - if (vtime->gtime >= TICK_NSEC) { - account_guest_time(tsk, vtime->gtime); - vtime->gtime = 0; - } + + if (vtime->gtime < TICK_NSEC) + return; + + if (vtime->nice) + index = CPUTIME_GUEST_NICE; + else + index = 
CPUTIME_GUEST; + + account_guest_time_index(tsk, vtime->gtime, index); + vtime->gtime = 0; } +static void vtime_account_user(struct task_struct *tsk, + struct vtime *vtime) +{ + enum cpu_usage_stat index; + + vtime->utime += get_vtime_delta(vtime); + + if (vtime->utime < TICK_NSEC) + return; + + if (vtime->nice) + index = CPUTIME_NICE; + else + index = CPUTIME_USER; + + account_user_time_index(tsk, vtime->utime, index); + vtime->utime = 0; +} + + static void __vtime_account_kernel(struct task_struct *tsk, struct vtime *vtime) { @@ -779,11 +808,7 @@ void vtime_user_exit(struct task_struct *tsk) struct vtime *vtime = &tsk->vtime; write_seqcount_begin(&vtime->seqcount); - vtime->utime += get_vtime_delta(vtime); - if (vtime->utime >= TICK_NSEC) { - account_user_time(tsk, vtime->utime); - vtime->utime = 0; - } + vtime_account_user(tsk, vtime); vtime->state = VTIME_SYS; write_seqcount_end(&vtime->seqcount); } @@ -864,6 +889,7 @@ void vtime_task_switch_generic(struct task_struct *prev) vtime->state = VTIME_SYS; vtime->starttime = sched_clock(); vtime->cpu = smp_processor_id(); + vtime->nice = (task_nice(current) > 0) ? 1 : 0; write_seqcount_end(&vtime->seqcount); rcu_assign_pointer(kcpustat->curr, current); -- 2.7.4
[PATCH 18/25] vtime: Track nice-ness on top of context switch
We need to read the nice value of the task running on any CPU, possibly remotely, in order to correctly support kcpustat on nohz_full. Unfortunately we can't just read task_nice(tsk) when tsk runs on another CPU because its nice value may be concurrently changed. There could be a risk that a recently modified nice value is thought to apply for a longer while than is supposed to. For example if a task runs at T0 with nice = -10, then its nice value is changed at T0 + 1 second with nice = 10, a reader at T0 + 1 second could think that the task had this "nice == 10" value since the beginning (T0) and spuriously account 1 second nice time on kcpustat instead of 1 second user time. So we need to track the nice value changes under vtime seqcount. Start with context switches and account the vtime nice-ness on top of it. Signed-off-by: Frederic Weisbecker Cc: Yauheni Kaliuta Cc: Thomas Gleixner Cc: Rik van Riel Cc: Peter Zijlstra Cc: Wanpeng Li Cc: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched/cputime.c | 44 +++- 2 files changed, 36 insertions(+), 9 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 27e0544..356326f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -280,6 +280,7 @@ enum vtime_state { struct vtime { seqcount_t seqcount; unsigned long long starttime; + int nice; enum vtime_state state; unsigned int cpu; u64 utime; diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 8f5dee2..07c2e7f 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -735,13 +735,42 @@ static void vtime_account_system(struct task_struct *tsk, static void vtime_account_guest(struct task_struct *tsk, struct vtime *vtime) { + enum cpu_usage_stat index; + vtime->gtime += get_vtime_delta(vtime); - if (vtime->gtime >= TICK_NSEC) { - account_guest_time(tsk, vtime->gtime); - vtime->gtime = 0; - } + + if (vtime->gtime < TICK_NSEC) + return; + + if (vtime->nice) + index = CPUTIME_GUEST_NICE; + else + index = 
CPUTIME_GUEST; + + account_guest_time_index(tsk, vtime->gtime, index); + vtime->gtime = 0; } +static void vtime_account_user(struct task_struct *tsk, + struct vtime *vtime) +{ + enum cpu_usage_stat index; + + vtime->utime += get_vtime_delta(vtime); + + if (vtime->utime < TICK_NSEC) + return; + + if (vtime->nice) + index = CPUTIME_NICE; + else + index = CPUTIME_USER; + + account_user_time_index(tsk, vtime->utime, index); + vtime->utime = 0; +} + + static void __vtime_account_kernel(struct task_struct *tsk, struct vtime *vtime) { @@ -779,11 +808,7 @@ void vtime_user_exit(struct task_struct *tsk) struct vtime *vtime = &tsk->vtime; write_seqcount_begin(&vtime->seqcount); - vtime->utime += get_vtime_delta(vtime); - if (vtime->utime >= TICK_NSEC) { - account_user_time(tsk, vtime->utime); - vtime->utime = 0; - } + vtime_account_user(tsk, vtime); vtime->state = VTIME_SYS; write_seqcount_end(&vtime->seqcount); } @@ -864,6 +889,7 @@ void vtime_task_switch_generic(struct task_struct *prev) vtime->state = VTIME_SYS; vtime->starttime = sched_clock(); vtime->cpu = smp_processor_id(); + vtime->nice = (task_nice(current) > 0) ? 1 : 0; write_seqcount_end(&vtime->seqcount); rcu_assign_pointer(kcpustat->curr, current); -- 2.7.4