[tip: sched/core] psi: Optimize task switch inside shared cgroups
The following commit has been merged into the sched/core branch of tip: Commit-ID: 4117cebf1a9fcbf35b9aabf0e37b6c5eea296798 Gitweb: https://git.kernel.org/tip/4117cebf1a9fcbf35b9aabf0e37b6c5eea296798 Author:Chengming Zhou AuthorDate:Wed, 03 Mar 2021 11:46:59 +08:00 Committer: Ingo Molnar CommitterDate: Sat, 06 Mar 2021 12:40:23 +01:00 psi: Optimize task switch inside shared cgroups The commit 36b238d57172 ("psi: Optimize switching tasks inside shared cgroups") only updates cgroups whose state actually changes during a task switch in the task preempt case, not in the task sleep case. We actually don't need to clear and set the TSK_ONCPU state for common cgroups of the next and prev task in the sleep case; that can save many psi_group_change() calls, especially when most activity comes from one leaf cgroup. sleep before: psi_dequeue() while ((group = iterate_groups(prev))) # all ancestors psi_group_change(prev, .clear=TSK_RUNNING|TSK_ONCPU) psi_task_switch() while ((group = iterate_groups(next))) # all ancestors psi_group_change(next, .set=TSK_ONCPU) sleep after: psi_dequeue() nop psi_task_switch() while ((group = iterate_groups(next))) # until (prev & next) psi_group_change(next, .set=TSK_ONCPU) while ((group = iterate_groups(prev))) # all ancestors psi_group_change(prev, .clear=common?TSK_RUNNING:TSK_RUNNING|TSK_ONCPU) When a voluntary sleep switches to another task, we remove one call of psi_group_change() for every common cgroup ancestor of the two tasks. 
Co-developed-by: Muchun Song Signed-off-by: Muchun Song Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Johannes Weiner Link: https://lkml.kernel.org/r/20210303034659.91735-5-zhouchengm...@bytedance.com --- kernel/sched/psi.c | 35 +-- kernel/sched/stats.h | 28 2 files changed, 37 insertions(+), 26 deletions(-) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 3907a6b..ee3c5b4 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -840,20 +840,35 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, } } - /* -* If this is a voluntary sleep, dequeue will have taken care -* of the outgoing TSK_ONCPU alongside TSK_RUNNING already. We -* only need to deal with it during preemption. -*/ - if (sleep) - return; - if (prev->pid) { - psi_flags_change(prev, TSK_ONCPU, 0); + int clear = TSK_ONCPU, set = 0; + + /* +* When we're going to sleep, psi_dequeue() lets us handle +* TSK_RUNNING and TSK_IOWAIT here, where we can combine it +* with TSK_ONCPU and save walking common ancestors twice. +*/ + if (sleep) { + clear |= TSK_RUNNING; + if (prev->in_iowait) + set |= TSK_IOWAIT; + } + + psi_flags_change(prev, clear, set); iter = NULL; while ((group = iterate_groups(prev, )) && group != common) - psi_group_change(group, cpu, TSK_ONCPU, 0, true); + psi_group_change(group, cpu, clear, set, true); + + /* +* TSK_ONCPU is handled up to the common ancestor. If we're tasked +* with dequeuing too, finish that for the rest of the hierarchy. 
+*/ + if (sleep) { + clear &= ~TSK_ONCPU; + for (; group; group = iterate_groups(prev, )) + psi_group_change(group, cpu, clear, set, true); + } } } diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 9e4e67a..dc218e9 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -84,28 +84,24 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup) static inline void psi_dequeue(struct task_struct *p, bool sleep) { - int clear = TSK_RUNNING, set = 0; + int clear = TSK_RUNNING; if (static_branch_likely(_disabled)) return; - if (!sleep) { - if (p->in_memstall) - clear |= TSK_MEMSTALL; - } else { - /* -* When a task sleeps, schedule() dequeues it before -* switching to the next one. Merge the clearing of -* TSK_RUNNING and TSK_ONCPU to save an unnecessary -* psi_task_change() call in psi_sched_switch(). -*/ - clear |= TSK_ONCPU; + /* +* A voluntary sleep is a dequeue followed by a task switch. To +* avoid walking all ancestors twice, psi_task_switch() handles +* TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU. +*
[tip: sched/core] psi: Add PSI_CPU_FULL state
The following commit has been merged into the sched/core branch of tip: Commit-ID: e7fcd762282332f765af2035a9568fb126fa3c01 Gitweb: https://git.kernel.org/tip/e7fcd762282332f765af2035a9568fb126fa3c01 Author:Chengming Zhou AuthorDate:Wed, 03 Mar 2021 11:46:56 +08:00 Committer: Ingo Molnar CommitterDate: Sat, 06 Mar 2021 12:40:22 +01:00 psi: Add PSI_CPU_FULL state The FULL state doesn't exist for the CPU resource at the system level, but exist at the cgroup level, means all non-idle tasks in a cgroup are delayed on the CPU resource which used by others outside of the cgroup or throttled by the cgroup cpu.max configuration. Co-developed-by: Muchun Song Signed-off-by: Muchun Song Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Johannes Weiner Link: https://lkml.kernel.org/r/20210303034659.91735-2-zhouchengm...@bytedance.com --- include/linux/psi_types.h | 3 ++- kernel/sched/psi.c| 14 +++--- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index b95f321..0a23300 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -50,9 +50,10 @@ enum psi_states { PSI_MEM_SOME, PSI_MEM_FULL, PSI_CPU_SOME, + PSI_CPU_FULL, /* Only per-CPU, to weigh the CPU in the global average: */ PSI_NONIDLE, - NR_PSI_STATES = 6, + NR_PSI_STATES = 7, }; enum psi_aggregators { diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 967732c..2293c45 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -34,7 +34,10 @@ * delayed on that resource such that nobody is advancing and the CPU * goes idle. This leaves both workload and CPU unproductive. * - * (Naturally, the FULL state doesn't exist for the CPU resource.) 
+ * Naturally, the FULL state doesn't exist for the CPU resource at the + * system level, but exist at the cgroup level, means all non-idle tasks + * in a cgroup are delayed on the CPU resource which used by others outside + * of the cgroup or throttled by the cgroup cpu.max configuration. * * SOME = nr_delayed_tasks != 0 * FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0 @@ -225,6 +228,8 @@ static bool test_state(unsigned int *tasks, enum psi_states state) return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]; case PSI_CPU_SOME: return tasks[NR_RUNNING] > tasks[NR_ONCPU]; + case PSI_CPU_FULL: + return tasks[NR_RUNNING] && !tasks[NR_ONCPU]; case PSI_NONIDLE: return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || tasks[NR_RUNNING]; @@ -678,8 +683,11 @@ static void record_times(struct psi_group_cpu *groupc, int cpu, } } - if (groupc->state_mask & (1 << PSI_CPU_SOME)) + if (groupc->state_mask & (1 << PSI_CPU_SOME)) { groupc->times[PSI_CPU_SOME] += delta; + if (groupc->state_mask & (1 << PSI_CPU_FULL)) + groupc->times[PSI_CPU_FULL] += delta; + } if (groupc->state_mask & (1 << PSI_NONIDLE)) groupc->times[PSI_NONIDLE] += delta; @@ -1018,7 +1026,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) group->avg_next_update = update_averages(group, now); mutex_unlock(>avgs_lock); - for (full = 0; full < 2 - (res == PSI_CPU); full++) { + for (full = 0; full < 2; full++) { unsigned long avg[3]; u64 total; int w;
[tip: sched/core] psi: Use ONCPU state tracking machinery to detect reclaim
The following commit has been merged into the sched/core branch of tip: Commit-ID: 7fae6c8171d20ac55402930ee8ae760cf85dff7b Gitweb: https://git.kernel.org/tip/7fae6c8171d20ac55402930ee8ae760cf85dff7b Author:Chengming Zhou AuthorDate:Wed, 03 Mar 2021 11:46:57 +08:00 Committer: Ingo Molnar CommitterDate: Sat, 06 Mar 2021 12:40:22 +01:00 psi: Use ONCPU state tracking machinery to detect reclaim Move the reclaim detection from the timer tick to the task state tracking machinery using the recently added ONCPU state. And we also add task psi_flags changes checking in the psi_task_switch() optimization to update the parents properly. In terms of performance and cost, this ONCPU task state tracking is not cheaper than the previous timer tick in aggregate. But the code is simpler and shorter this way, so it's a maintainability win. And Johannes did some testing with perf bench; the performance and cost changes would be acceptable for real workloads. Thanks to Johannes Weiner for pointing out the psi_task_switch() optimization things and the clearer changelog. 
Co-developed-by: Muchun Song Signed-off-by: Muchun Song Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Johannes Weiner Link: https://lkml.kernel.org/r/20210303034659.91735-3-zhouchengm...@bytedance.com --- include/linux/psi.h | 1 +- kernel/sched/core.c | 1 +- kernel/sched/psi.c | 65 +++ kernel/sched/stats.h | 9 +-- 4 files changed, 24 insertions(+), 52 deletions(-) diff --git a/include/linux/psi.h b/include/linux/psi.h index 7361023..65eb147 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -20,7 +20,6 @@ void psi_task_change(struct task_struct *task, int clear, int set); void psi_task_switch(struct task_struct *prev, struct task_struct *next, bool sleep); -void psi_memstall_tick(struct task_struct *task, int cpu); void psi_memstall_enter(unsigned long *flags); void psi_memstall_leave(unsigned long *flags); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 361974e..d2629fd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4551,7 +4551,6 @@ void scheduler_tick(void) update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure); curr->sched_class->task_tick(rq, curr, 0); calc_global_load_tick(rq); - psi_task_tick(rq); rq_unlock(rq, ); diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 2293c45..0fe6ff6 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -644,8 +644,7 @@ static void poll_timer_fn(struct timer_list *t) wake_up_interruptible(>poll_wait); } -static void record_times(struct psi_group_cpu *groupc, int cpu, -bool memstall_tick) +static void record_times(struct psi_group_cpu *groupc, int cpu) { u32 delta; u64 now; @@ -664,23 +663,6 @@ static void record_times(struct psi_group_cpu *groupc, int cpu, groupc->times[PSI_MEM_SOME] += delta; if (groupc->state_mask & (1 << PSI_MEM_FULL)) groupc->times[PSI_MEM_FULL] += delta; - else if (memstall_tick) { - u32 sample; - /* -* Since we care about lost potential, a -* memstall is FULL when there are no 
other -* working tasks, but also when the CPU is -* actively reclaiming and nothing productive -* could run even if it were runnable. -* -* When the timer tick sees a reclaiming CPU, -* regardless of runnable tasks, sample a FULL -* tick (or less if it hasn't been a full tick -* since the last state change). -*/ - sample = min(delta, (u32)jiffies_to_nsecs(1)); - groupc->times[PSI_MEM_FULL] += sample; - } } if (groupc->state_mask & (1 << PSI_CPU_SOME)) { @@ -714,7 +696,7 @@ static void psi_group_change(struct psi_group *group, int cpu, */ write_seqcount_begin(>seq); - record_times(groupc, cpu, false); + record_times(groupc, cpu); for (t = 0, m = clear; m; m &= ~(1 << t), t++) { if (!(m & (1 << t))) @@ -738,6 +720,18 @@ static void psi_group_change(struct psi_group *group, int cpu, if (test_state(groupc->tasks, s)) state_mask |= (1 << s); } + + /* +* Since we care about lost potential, a memstall is FULL +* when there are no other working tasks, but also when +* the CPU is actively reclaiming and nothing productive +* could run even if it were runnable. So when the current +* task in a cgroup is
[tip: sched/core] psi: Optimize task switch inside shared cgroups
The following commit has been merged into the sched/core branch of tip: Commit-ID: e6560d58334ca463061ade733674abc8dd0df9bd Gitweb: https://git.kernel.org/tip/e6560d58334ca463061ade733674abc8dd0df9bd Author:Chengming Zhou AuthorDate:Wed, 03 Mar 2021 11:46:59 +08:00 Committer: Peter Zijlstra CommitterDate: Thu, 04 Mar 2021 09:56:02 +01:00 psi: Optimize task switch inside shared cgroups The commit 36b238d57172 ("psi: Optimize switching tasks inside shared cgroups") only updates cgroups whose state actually changes during a task switch in the task preempt case, not in the task sleep case. We actually don't need to clear and set the TSK_ONCPU state for common cgroups of the next and prev task in the sleep case; that can save many psi_group_change() calls, especially when most activity comes from one leaf cgroup. sleep before: psi_dequeue() while ((group = iterate_groups(prev))) # all ancestors psi_group_change(prev, .clear=TSK_RUNNING|TSK_ONCPU) psi_task_switch() while ((group = iterate_groups(next))) # all ancestors psi_group_change(next, .set=TSK_ONCPU) sleep after: psi_dequeue() nop psi_task_switch() while ((group = iterate_groups(next))) # until (prev & next) psi_group_change(next, .set=TSK_ONCPU) while ((group = iterate_groups(prev))) # all ancestors psi_group_change(prev, .clear=common?TSK_RUNNING:TSK_RUNNING|TSK_ONCPU) When a voluntary sleep switches to another task, we remove one call of psi_group_change() for every common cgroup ancestor of the two tasks. 
Co-developed-by: Muchun Song Signed-off-by: Muchun Song Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lkml.kernel.org/r/20210303034659.91735-5-zhouchengm...@bytedance.com --- kernel/sched/psi.c | 35 +-- kernel/sched/stats.h | 28 2 files changed, 37 insertions(+), 26 deletions(-) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 3907a6b..ee3c5b4 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -840,20 +840,35 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, } } - /* -* If this is a voluntary sleep, dequeue will have taken care -* of the outgoing TSK_ONCPU alongside TSK_RUNNING already. We -* only need to deal with it during preemption. -*/ - if (sleep) - return; - if (prev->pid) { - psi_flags_change(prev, TSK_ONCPU, 0); + int clear = TSK_ONCPU, set = 0; + + /* +* When we're going to sleep, psi_dequeue() lets us handle +* TSK_RUNNING and TSK_IOWAIT here, where we can combine it +* with TSK_ONCPU and save walking common ancestors twice. +*/ + if (sleep) { + clear |= TSK_RUNNING; + if (prev->in_iowait) + set |= TSK_IOWAIT; + } + + psi_flags_change(prev, clear, set); iter = NULL; while ((group = iterate_groups(prev, )) && group != common) - psi_group_change(group, cpu, TSK_ONCPU, 0, true); + psi_group_change(group, cpu, clear, set, true); + + /* +* TSK_ONCPU is handled up to the common ancestor. If we're tasked +* with dequeuing too, finish that for the rest of the hierarchy. 
+*/ + if (sleep) { + clear &= ~TSK_ONCPU; + for (; group; group = iterate_groups(prev, )) + psi_group_change(group, cpu, clear, set, true); + } } } diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 9e4e67a..dc218e9 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -84,28 +84,24 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup) static inline void psi_dequeue(struct task_struct *p, bool sleep) { - int clear = TSK_RUNNING, set = 0; + int clear = TSK_RUNNING; if (static_branch_likely(_disabled)) return; - if (!sleep) { - if (p->in_memstall) - clear |= TSK_MEMSTALL; - } else { - /* -* When a task sleeps, schedule() dequeues it before -* switching to the next one. Merge the clearing of -* TSK_RUNNING and TSK_ONCPU to save an unnecessary -* psi_task_change() call in psi_sched_switch(). -*/ - clear |= TSK_ONCPU; + /* +* A voluntary sleep is a dequeue followed by a task switch. To +* avoid walking all ancestors twice, psi_task_switch() handles +* TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU. +* Do nothing here. +
[tip: sched/core] psi: Use ONCPU state tracking machinery to detect reclaim
The following commit has been merged into the sched/core branch of tip: Commit-ID: f3f7feec57b9141dfed9825874d0191b1ac18ad2 Gitweb: https://git.kernel.org/tip/f3f7feec57b9141dfed9825874d0191b1ac18ad2 Author:Chengming Zhou AuthorDate:Wed, 03 Mar 2021 11:46:57 +08:00 Committer: Peter Zijlstra CommitterDate: Thu, 04 Mar 2021 09:56:01 +01:00 psi: Use ONCPU state tracking machinery to detect reclaim Move the reclaim detection from the timer tick to the task state tracking machinery using the recently added ONCPU state. And we also add task psi_flags changes checking in the psi_task_switch() optimization to update the parents properly. In terms of performance and cost, this ONCPU task state tracking is not cheaper than the previous timer tick in aggregate. But the code is simpler and shorter this way, so it's a maintainability win. And Johannes did some testing with perf bench; the performance and cost changes would be acceptable for real workloads. Thanks to Johannes Weiner for pointing out the psi_task_switch() optimization things and the clearer changelog. 
Co-developed-by: Muchun Song Signed-off-by: Muchun Song Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lkml.kernel.org/r/20210303034659.91735-3-zhouchengm...@bytedance.com --- include/linux/psi.h | 1 +- kernel/sched/core.c | 1 +- kernel/sched/psi.c | 65 +++ kernel/sched/stats.h | 9 +-- 4 files changed, 24 insertions(+), 52 deletions(-) diff --git a/include/linux/psi.h b/include/linux/psi.h index 7361023..65eb147 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -20,7 +20,6 @@ void psi_task_change(struct task_struct *task, int clear, int set); void psi_task_switch(struct task_struct *prev, struct task_struct *next, bool sleep); -void psi_memstall_tick(struct task_struct *task, int cpu); void psi_memstall_enter(unsigned long *flags); void psi_memstall_leave(unsigned long *flags); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 361974e..d2629fd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4551,7 +4551,6 @@ void scheduler_tick(void) update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure); curr->sched_class->task_tick(rq, curr, 0); calc_global_load_tick(rq); - psi_task_tick(rq); rq_unlock(rq, ); diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 2293c45..0fe6ff6 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -644,8 +644,7 @@ static void poll_timer_fn(struct timer_list *t) wake_up_interruptible(>poll_wait); } -static void record_times(struct psi_group_cpu *groupc, int cpu, -bool memstall_tick) +static void record_times(struct psi_group_cpu *groupc, int cpu) { u32 delta; u64 now; @@ -664,23 +663,6 @@ static void record_times(struct psi_group_cpu *groupc, int cpu, groupc->times[PSI_MEM_SOME] += delta; if (groupc->state_mask & (1 << PSI_MEM_FULL)) groupc->times[PSI_MEM_FULL] += delta; - else if (memstall_tick) { - u32 sample; - /* -* Since we care about lost potential, a -* memstall is FULL when there are no other -* working tasks, but 
also when the CPU is -* actively reclaiming and nothing productive -* could run even if it were runnable. -* -* When the timer tick sees a reclaiming CPU, -* regardless of runnable tasks, sample a FULL -* tick (or less if it hasn't been a full tick -* since the last state change). -*/ - sample = min(delta, (u32)jiffies_to_nsecs(1)); - groupc->times[PSI_MEM_FULL] += sample; - } } if (groupc->state_mask & (1 << PSI_CPU_SOME)) { @@ -714,7 +696,7 @@ static void psi_group_change(struct psi_group *group, int cpu, */ write_seqcount_begin(>seq); - record_times(groupc, cpu, false); + record_times(groupc, cpu); for (t = 0, m = clear; m; m &= ~(1 << t), t++) { if (!(m & (1 << t))) @@ -738,6 +720,18 @@ static void psi_group_change(struct psi_group *group, int cpu, if (test_state(groupc->tasks, s)) state_mask |= (1 << s); } + + /* +* Since we care about lost potential, a memstall is FULL +* when there are no other working tasks, but also when +* the CPU is actively reclaiming and nothing productive +* could run even if it were runnable. So when the current +* task in a cgroup is in_memstall, the
[tip: sched/core] psi: Add PSI_CPU_FULL state
The following commit has been merged into the sched/core branch of tip: Commit-ID: 311b293811a31929c72c790eff48cf767561589f Gitweb: https://git.kernel.org/tip/311b293811a31929c72c790eff48cf767561589f Author:Chengming Zhou AuthorDate:Wed, 03 Mar 2021 11:46:56 +08:00 Committer: Peter Zijlstra CommitterDate: Thu, 04 Mar 2021 09:56:01 +01:00 psi: Add PSI_CPU_FULL state The FULL state doesn't exist for the CPU resource at the system level, but exist at the cgroup level, means all non-idle tasks in a cgroup are delayed on the CPU resource which used by others outside of the cgroup or throttled by the cgroup cpu.max configuration. Co-developed-by: Muchun Song Signed-off-by: Muchun Song Signed-off-by: Chengming Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Johannes Weiner Link: https://lkml.kernel.org/r/20210303034659.91735-2-zhouchengm...@bytedance.com --- include/linux/psi_types.h | 3 ++- kernel/sched/psi.c| 14 +++--- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index b95f321..0a23300 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -50,9 +50,10 @@ enum psi_states { PSI_MEM_SOME, PSI_MEM_FULL, PSI_CPU_SOME, + PSI_CPU_FULL, /* Only per-CPU, to weigh the CPU in the global average: */ PSI_NONIDLE, - NR_PSI_STATES = 6, + NR_PSI_STATES = 7, }; enum psi_aggregators { diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 967732c..2293c45 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -34,7 +34,10 @@ * delayed on that resource such that nobody is advancing and the CPU * goes idle. This leaves both workload and CPU unproductive. * - * (Naturally, the FULL state doesn't exist for the CPU resource.) 
+ * Naturally, the FULL state doesn't exist for the CPU resource at the + * system level, but exist at the cgroup level, means all non-idle tasks + * in a cgroup are delayed on the CPU resource which used by others outside + * of the cgroup or throttled by the cgroup cpu.max configuration. * * SOME = nr_delayed_tasks != 0 * FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0 @@ -225,6 +228,8 @@ static bool test_state(unsigned int *tasks, enum psi_states state) return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]; case PSI_CPU_SOME: return tasks[NR_RUNNING] > tasks[NR_ONCPU]; + case PSI_CPU_FULL: + return tasks[NR_RUNNING] && !tasks[NR_ONCPU]; case PSI_NONIDLE: return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || tasks[NR_RUNNING]; @@ -678,8 +683,11 @@ static void record_times(struct psi_group_cpu *groupc, int cpu, } } - if (groupc->state_mask & (1 << PSI_CPU_SOME)) + if (groupc->state_mask & (1 << PSI_CPU_SOME)) { groupc->times[PSI_CPU_SOME] += delta; + if (groupc->state_mask & (1 << PSI_CPU_FULL)) + groupc->times[PSI_CPU_FULL] += delta; + } if (groupc->state_mask & (1 << PSI_NONIDLE)) groupc->times[PSI_NONIDLE] += delta; @@ -1018,7 +1026,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) group->avg_next_update = update_averages(group, now); mutex_unlock(>avgs_lock); - for (full = 0; full < 2 - (res == PSI_CPU); full++) { + for (full = 0; full < 2; full++) { unsigned long avg[3]; u64 total; int w;