[tip: sched/core] psi: Optimize task switch inside shared cgroups

2021-03-06 Thread tip-bot2 for Chengming Zhou
The following commit has been merged into the sched/core branch of tip:

Commit-ID: 4117cebf1a9fcbf35b9aabf0e37b6c5eea296798
Gitweb:
https://git.kernel.org/tip/4117cebf1a9fcbf35b9aabf0e37b6c5eea296798
Author: Chengming Zhou 
AuthorDate: Wed, 03 Mar 2021 11:46:59 +08:00
Committer: Ingo Molnar 
CommitterDate: Sat, 06 Mar 2021 12:40:23 +01:00

psi: Optimize task switch inside shared cgroups

Commit 36b238d57172 ("psi: Optimize switching tasks inside shared
cgroups") restricted the task-switch update to cgroups whose state
actually changes, but only in the preemption case, not in the sleep case.

In the sleep case we likewise don't need to clear and re-set the TSK_ONCPU
state for the cgroups common to the prev and next task; skipping that saves
many psi_group_change() calls, especially when most activity comes from one
leaf cgroup.

sleep before:
psi_dequeue()
  while ((group = iterate_groups(prev)))  # all ancestors
psi_group_change(prev, .clear=TSK_RUNNING|TSK_ONCPU)
psi_task_switch()
  while ((group = iterate_groups(next)))  # all ancestors
psi_group_change(next, .set=TSK_ONCPU)

sleep after:
psi_dequeue()
  nop
psi_task_switch()
  while ((group = iterate_groups(next)))  # until (prev & next)
psi_group_change(next, .set=TSK_ONCPU)
  while ((group = iterate_groups(prev)))  # all ancestors
psi_group_change(prev, .clear=common?TSK_RUNNING:TSK_RUNNING|TSK_ONCPU)

When a voluntary sleep switches to another task, we remove one call of
psi_group_change() for every common cgroup ancestor of the two tasks.
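
For illustration, the sketch below is a small stand-alone userspace C model
of the walk above, not the kernel code; everything in it (struct cgroup,
change(), task_switch(), the counter, the cgroup names) is made up for the
example. TSK_ONCPU is set on next's ancestors only up to the first group
shared with prev, and prev's common ancestors are then walked once with just
TSK_RUNNING in the clear mask:

  #include <stdio.h>

  #define TSK_ONCPU   (1 << 0)
  #define TSK_RUNNING (1 << 1)

  struct cgroup { const char *name; struct cgroup *parent; };

  static int changes;    /* counts psi_group_change()-equivalent calls */

  static void change(struct cgroup *c, int clear, int set)
  {
      changes++;
      printf("  %-5s clear=0x%x set=0x%x\n", c->name, clear, set);
  }

  /* Voluntary sleep: prev is dequeued while next takes the CPU. */
  static void task_switch(struct cgroup *prev, struct cgroup *next)
  {
      struct cgroup *common = NULL, *g, *p;

      for (g = next; g; g = g->parent) {
          for (p = prev; p && p != g; p = p->parent)
              ;
          if (p) {            /* first ancestor shared with prev */
              common = g;
              break;
          }
          change(g, 0, TSK_ONCPU);
      }
      for (g = prev; g && g != common; g = g->parent)
          change(g, TSK_ONCPU | TSK_RUNNING, 0);
      for (g = common; g; g = g->parent)
          change(g, TSK_RUNNING, 0);  /* ONCPU stays set for next */
  }

  int main(void)
  {
      struct cgroup root = { "root", NULL };
      struct cgroup leaf = { "leaf", &root };
      struct cgroup a = { "A", &leaf }, b = { "B", &leaf };

      task_switch(&a, &b);
      printf("psi_group_change() calls: %d\n", changes);
      return 0;
  }

With prev in A and next in B, the shared ancestors (leaf, root) are each
changed once with TSK_RUNNING only, whereas previously they would also have
had TSK_ONCPU cleared and re-set, i.e. one extra call per common ancestor.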

Co-developed-by: Muchun Song 
Signed-off-by: Muchun Song 
Signed-off-by: Chengming Zhou 
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Ingo Molnar 
Acked-by: Johannes Weiner 
Link: 
https://lkml.kernel.org/r/20210303034659.91735-5-zhouchengm...@bytedance.com
---
 kernel/sched/psi.c   | 35 +--
 kernel/sched/stats.h | 28 
 2 files changed, 37 insertions(+), 26 deletions(-)

diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 3907a6b..ee3c5b4 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -840,20 +840,35 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
}
}
 
-   /*
-* If this is a voluntary sleep, dequeue will have taken care
-* of the outgoing TSK_ONCPU alongside TSK_RUNNING already. We
-* only need to deal with it during preemption.
-*/
-   if (sleep)
-   return;
-
if (prev->pid) {
-   psi_flags_change(prev, TSK_ONCPU, 0);
+   int clear = TSK_ONCPU, set = 0;
+
+   /*
+* When we're going to sleep, psi_dequeue() lets us handle
+* TSK_RUNNING and TSK_IOWAIT here, where we can combine it
+* with TSK_ONCPU and save walking common ancestors twice.
+*/
+   if (sleep) {
+   clear |= TSK_RUNNING;
+   if (prev->in_iowait)
+   set |= TSK_IOWAIT;
+   }
+
+   psi_flags_change(prev, clear, set);
 
iter = NULL;
while ((group = iterate_groups(prev, &iter)) && group != common)
-   psi_group_change(group, cpu, TSK_ONCPU, 0, true);
+   psi_group_change(group, cpu, clear, set, true);
+
+   /*
+* TSK_ONCPU is handled up to the common ancestor. If we're tasked
+* with dequeuing too, finish that for the rest of the hierarchy.
+*/
+   if (sleep) {
+   clear &= ~TSK_ONCPU;
+   for (; group; group = iterate_groups(prev, &iter))
+   psi_group_change(group, cpu, clear, set, true);
+   }
}
 }
 
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 9e4e67a..dc218e9 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -84,28 +84,24 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
 
 static inline void psi_dequeue(struct task_struct *p, bool sleep)
 {
-   int clear = TSK_RUNNING, set = 0;
+   int clear = TSK_RUNNING;
 
if (static_branch_likely(&psi_disabled))
return;
 
-   if (!sleep) {
-   if (p->in_memstall)
-   clear |= TSK_MEMSTALL;
-   } else {
-   /*
-* When a task sleeps, schedule() dequeues it before
-* switching to the next one. Merge the clearing of
-* TSK_RUNNING and TSK_ONCPU to save an unnecessary
-* psi_task_change() call in psi_sched_switch().
-*/
-   clear |= TSK_ONCPU;
+   /*
+* A voluntary sleep is a dequeue followed by a task switch. To
+* avoid walking all ancestors twice, psi_task_switch() handles
+* TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU.
+* Do nothing here.

[tip: sched/core] psi: Add PSI_CPU_FULL state

2021-03-06 Thread tip-bot2 for Chengming Zhou
The following commit has been merged into the sched/core branch of tip:

Commit-ID: e7fcd762282332f765af2035a9568fb126fa3c01
Gitweb:
https://git.kernel.org/tip/e7fcd762282332f765af2035a9568fb126fa3c01
Author: Chengming Zhou 
AuthorDate: Wed, 03 Mar 2021 11:46:56 +08:00
Committer: Ingo Molnar 
CommitterDate: Sat, 06 Mar 2021 12:40:22 +01:00

psi: Add PSI_CPU_FULL state

The FULL state doesn't exist for the CPU resource at the system level,
but it does exist at the cgroup level: it means all non-idle tasks in a
cgroup are delayed on the CPU resource, which is either used by others
outside of the cgroup or throttled by the cgroup's cpu.max configuration.
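
As a rough sketch (the helpers and counter names below are simplified
stand-ins for the psi_group_cpu fields, not the kernel's exact code), the
two CPU states and their time accounting nest as follows; note that FULL
(running && !oncpu) implies SOME (running > oncpu), which is why the FULL
time is accumulated inside the SOME branch of record_times() in the diff:

  enum { NR_RUNNING, NR_ONCPU, NR_TASK_COUNTS };
  enum { PSI_CPU_SOME, PSI_CPU_FULL, NR_CPU_STATES };

  static int test_cpu_state(const unsigned int *tasks, int state)
  {
      switch (state) {
      case PSI_CPU_SOME:  /* at least one runnable task is waiting for CPU */
          return tasks[NR_RUNNING] > tasks[NR_ONCPU];
      case PSI_CPU_FULL:  /* runnable tasks exist but none of them owns the
                             CPU, e.g. all throttled by cpu.max */
          return tasks[NR_RUNNING] && !tasks[NR_ONCPU];
      }
      return 0;
  }

  static void account_cpu(const unsigned int *tasks,
                          unsigned long long *times, unsigned int delta)
  {
      if (test_cpu_state(tasks, PSI_CPU_SOME)) {
          times[PSI_CPU_SOME] += delta;
          if (test_cpu_state(tasks, PSI_CPU_FULL))
              times[PSI_CPU_FULL] += delta;
      }
  }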

Co-developed-by: Muchun Song 
Signed-off-by: Muchun Song 
Signed-off-by: Chengming Zhou 
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Ingo Molnar 
Acked-by: Johannes Weiner 
Link: 
https://lkml.kernel.org/r/20210303034659.91735-2-zhouchengm...@bytedance.com
---
 include/linux/psi_types.h |  3 ++-
 kernel/sched/psi.c| 14 +++---
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index b95f321..0a23300 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -50,9 +50,10 @@ enum psi_states {
PSI_MEM_SOME,
PSI_MEM_FULL,
PSI_CPU_SOME,
+   PSI_CPU_FULL,
/* Only per-CPU, to weigh the CPU in the global average: */
PSI_NONIDLE,
-   NR_PSI_STATES = 6,
+   NR_PSI_STATES = 7,
 };
 
 enum psi_aggregators {
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 967732c..2293c45 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -34,7 +34,10 @@
  * delayed on that resource such that nobody is advancing and the CPU
  * goes idle. This leaves both workload and CPU unproductive.
  *
- * (Naturally, the FULL state doesn't exist for the CPU resource.)
+ * Naturally, the FULL state doesn't exist for the CPU resource at the
+ * system level, but exist at the cgroup level, means all non-idle tasks
+ * in a cgroup are delayed on the CPU resource which used by others outside
+ * of the cgroup or throttled by the cgroup cpu.max configuration.
  *
  * SOME = nr_delayed_tasks != 0
  * FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0
@@ -225,6 +228,8 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING];
case PSI_CPU_SOME:
return tasks[NR_RUNNING] > tasks[NR_ONCPU];
+   case PSI_CPU_FULL:
+   return tasks[NR_RUNNING] && !tasks[NR_ONCPU];
case PSI_NONIDLE:
return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
tasks[NR_RUNNING];
@@ -678,8 +683,11 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,
}
}
 
-   if (groupc->state_mask & (1 << PSI_CPU_SOME))
+   if (groupc->state_mask & (1 << PSI_CPU_SOME)) {
groupc->times[PSI_CPU_SOME] += delta;
+   if (groupc->state_mask & (1 << PSI_CPU_FULL))
+   groupc->times[PSI_CPU_FULL] += delta;
+   }
 
if (groupc->state_mask & (1 << PSI_NONIDLE))
groupc->times[PSI_NONIDLE] += delta;
@@ -1018,7 +1026,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
group->avg_next_update = update_averages(group, now);
mutex_unlock(&group->avgs_lock);
 
-   for (full = 0; full < 2 - (res == PSI_CPU); full++) {
+   for (full = 0; full < 2; full++) {
unsigned long avg[3];
u64 total;
int w;


[tip: sched/core] psi: Use ONCPU state tracking machinery to detect reclaim

2021-03-06 Thread tip-bot2 for Chengming Zhou
The following commit has been merged into the sched/core branch of tip:

Commit-ID: 7fae6c8171d20ac55402930ee8ae760cf85dff7b
Gitweb:
https://git.kernel.org/tip/7fae6c8171d20ac55402930ee8ae760cf85dff7b
Author: Chengming Zhou 
AuthorDate: Wed, 03 Mar 2021 11:46:57 +08:00
Committer: Ingo Molnar 
CommitterDate: Sat, 06 Mar 2021 12:40:22 +01:00

psi: Use ONCPU state tracking machinery to detect reclaim

Move the reclaim detection from the timer tick to the task state
tracking machinery using the recently added ONCPU state. We also check
for task psi_flags changes in the psi_task_switch() optimization so
that the parent cgroups are updated properly.

In terms of performance and cost, this ONCPU task state tracking is
not cheaper than the previous timer tick in aggregate. But the code is
simpler and shorter this way, so it's a maintainability win. Johannes
did some testing with perf bench, and the performance and cost changes
should be acceptable for real workloads.

Thanks to Johannes Weiner for pointing out the psi_task_switch()
optimization and for the clearer changelog.
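
As a simplified sketch of the state-based detection (the names below are
illustrative stand-ins, not the exact kernel expressions): with the ONCPU
counter available per cgroup and per CPU, a memory stall can be marked FULL
either in the classic way, when no runnable task can make progress, or when
the task currently on the CPU is itself reclaiming, since nothing productive
runs during reclaim even though that task is nominally running:

  enum { NR_IOWAIT, NR_MEMSTALL, NR_RUNNING, NR_ONCPU, NR_TASK_COUNTS };

  /* curr_in_memstall: is the task currently on this CPU in memory reclaim? */
  static int mem_full(const unsigned int *tasks, int curr_in_memstall)
  {
      if (tasks[NR_MEMSTALL] && !tasks[NR_RUNNING])
          return 1;       /* nobody left to make progress */
      if (tasks[NR_ONCPU] && curr_in_memstall)
          return 1;       /* the CPU itself is busy reclaiming */
      return 0;
  }

This replaces the previous approach of sampling up to one tick of FULL time
from the timer interrupt whenever it observed a reclaiming CPU.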

Co-developed-by: Muchun Song 
Signed-off-by: Muchun Song 
Signed-off-by: Chengming Zhou 
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Ingo Molnar 
Acked-by: Johannes Weiner 
Link: 
https://lkml.kernel.org/r/20210303034659.91735-3-zhouchengm...@bytedance.com
---
 include/linux/psi.h  |  1 +-
 kernel/sched/core.c  |  1 +-
 kernel/sched/psi.c   | 65 +++
 kernel/sched/stats.h |  9 +--
 4 files changed, 24 insertions(+), 52 deletions(-)

diff --git a/include/linux/psi.h b/include/linux/psi.h
index 7361023..65eb147 100644
--- a/include/linux/psi.h
+++ b/include/linux/psi.h
@@ -20,7 +20,6 @@ void psi_task_change(struct task_struct *task, int clear, int set);
 void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 bool sleep);
 
-void psi_memstall_tick(struct task_struct *task, int cpu);
 void psi_memstall_enter(unsigned long *flags);
 void psi_memstall_leave(unsigned long *flags);
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 361974e..d2629fd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4551,7 +4551,6 @@ void scheduler_tick(void)
update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
curr->sched_class->task_tick(rq, curr, 0);
calc_global_load_tick(rq);
-   psi_task_tick(rq);
 
rq_unlock(rq, &rf);
 
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 2293c45..0fe6ff6 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -644,8 +644,7 @@ static void poll_timer_fn(struct timer_list *t)
wake_up_interruptible(&group->poll_wait);
 }
 
-static void record_times(struct psi_group_cpu *groupc, int cpu,
-bool memstall_tick)
+static void record_times(struct psi_group_cpu *groupc, int cpu)
 {
u32 delta;
u64 now;
@@ -664,23 +663,6 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,
groupc->times[PSI_MEM_SOME] += delta;
if (groupc->state_mask & (1 << PSI_MEM_FULL))
groupc->times[PSI_MEM_FULL] += delta;
-   else if (memstall_tick) {
-   u32 sample;
-   /*
-* Since we care about lost potential, a
-* memstall is FULL when there are no other
-* working tasks, but also when the CPU is
-* actively reclaiming and nothing productive
-* could run even if it were runnable.
-*
-* When the timer tick sees a reclaiming CPU,
-* regardless of runnable tasks, sample a FULL
-* tick (or less if it hasn't been a full tick
-* since the last state change).
-*/
-   sample = min(delta, (u32)jiffies_to_nsecs(1));
-   groupc->times[PSI_MEM_FULL] += sample;
-   }
}
 
if (groupc->state_mask & (1 << PSI_CPU_SOME)) {
@@ -714,7 +696,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
 */
write_seqcount_begin(&groupc->seq);
 
-   record_times(groupc, cpu, false);
+   record_times(groupc, cpu);
 
for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
if (!(m & (1 << t)))
@@ -738,6 +720,18 @@ static void psi_group_change(struct psi_group *group, int cpu,
if (test_state(groupc->tasks, s))
state_mask |= (1 << s);
}
+
+   /*
+* Since we care about lost potential, a memstall is FULL
+* when there are no other working tasks, but also when
+* the CPU is actively reclaiming and nothing productive
+* could run even if it were runnable. So when the current
+* task in a cgroup is 

[tip: sched/core] psi: Optimize task switch inside shared cgroups

2021-03-04 Thread tip-bot2 for Chengming Zhou
The following commit has been merged into the sched/core branch of tip:

Commit-ID: e6560d58334ca463061ade733674abc8dd0df9bd
Gitweb:
https://git.kernel.org/tip/e6560d58334ca463061ade733674abc8dd0df9bd
Author: Chengming Zhou 
AuthorDate: Wed, 03 Mar 2021 11:46:59 +08:00
Committer: Peter Zijlstra 
CommitterDate: Thu, 04 Mar 2021 09:56:02 +01:00

psi: Optimize task switch inside shared cgroups

Commit 36b238d57172 ("psi: Optimize switching tasks inside shared
cgroups") restricted the task-switch update to cgroups whose state
actually changes, but only in the preemption case, not in the sleep case.

In the sleep case we likewise don't need to clear and re-set the TSK_ONCPU
state for the cgroups common to the prev and next task; skipping that saves
many psi_group_change() calls, especially when most activity comes from one
leaf cgroup.

sleep before:
psi_dequeue()
  while ((group = iterate_groups(prev)))  # all ancestors
psi_group_change(prev, .clear=TSK_RUNNING|TSK_ONCPU)
psi_task_switch()
  while ((group = iterate_groups(next)))  # all ancestors
psi_group_change(next, .set=TSK_ONCPU)

sleep after:
psi_dequeue()
  nop
psi_task_switch()
  while ((group = iterate_groups(next)))  # until (prev & next)
psi_group_change(next, .set=TSK_ONCPU)
  while ((group = iterate_groups(prev)))  # all ancestors
psi_group_change(prev, .clear=common?TSK_RUNNING:TSK_RUNNING|TSK_ONCPU)

When a voluntary sleep switches to another task, we remove one call of
psi_group_change() for every common cgroup ancestor of the two tasks.

Co-developed-by: Muchun Song 
Signed-off-by: Muchun Song 
Signed-off-by: Chengming Zhou 
Signed-off-by: Peter Zijlstra (Intel) 
Acked-by: Johannes Weiner 
Link: 
https://lkml.kernel.org/r/20210303034659.91735-5-zhouchengm...@bytedance.com
---
 kernel/sched/psi.c   | 35 +--
 kernel/sched/stats.h | 28 
 2 files changed, 37 insertions(+), 26 deletions(-)

diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 3907a6b..ee3c5b4 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -840,20 +840,35 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
}
}
 
-   /*
-* If this is a voluntary sleep, dequeue will have taken care
-* of the outgoing TSK_ONCPU alongside TSK_RUNNING already. We
-* only need to deal with it during preemption.
-*/
-   if (sleep)
-   return;
-
if (prev->pid) {
-   psi_flags_change(prev, TSK_ONCPU, 0);
+   int clear = TSK_ONCPU, set = 0;
+
+   /*
+* When we're going to sleep, psi_dequeue() lets us handle
+* TSK_RUNNING and TSK_IOWAIT here, where we can combine it
+* with TSK_ONCPU and save walking common ancestors twice.
+*/
+   if (sleep) {
+   clear |= TSK_RUNNING;
+   if (prev->in_iowait)
+   set |= TSK_IOWAIT;
+   }
+
+   psi_flags_change(prev, clear, set);
 
iter = NULL;
while ((group = iterate_groups(prev, &iter)) && group != common)
-   psi_group_change(group, cpu, TSK_ONCPU, 0, true);
+   psi_group_change(group, cpu, clear, set, true);
+
+   /*
+* TSK_ONCPU is handled up to the common ancestor. If we're tasked
+* with dequeuing too, finish that for the rest of the hierarchy.
+*/
+   if (sleep) {
+   clear &= ~TSK_ONCPU;
+   for (; group; group = iterate_groups(prev, &iter))
+   psi_group_change(group, cpu, clear, set, true);
+   }
}
 }
 
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 9e4e67a..dc218e9 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -84,28 +84,24 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
 
 static inline void psi_dequeue(struct task_struct *p, bool sleep)
 {
-   int clear = TSK_RUNNING, set = 0;
+   int clear = TSK_RUNNING;
 
if (static_branch_likely(&psi_disabled))
return;
 
-   if (!sleep) {
-   if (p->in_memstall)
-   clear |= TSK_MEMSTALL;
-   } else {
-   /*
-* When a task sleeps, schedule() dequeues it before
-* switching to the next one. Merge the clearing of
-* TSK_RUNNING and TSK_ONCPU to save an unnecessary
-* psi_task_change() call in psi_sched_switch().
-*/
-   clear |= TSK_ONCPU;
+   /*
+* A voluntary sleep is a dequeue followed by a task switch. To
+* avoid walking all ancestors twice, psi_task_switch() handles
+* TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU.
+* Do nothing here.
+   

[tip: sched/core] psi: Use ONCPU state tracking machinery to detect reclaim

2021-03-04 Thread tip-bot2 for Chengming Zhou
The following commit has been merged into the sched/core branch of tip:

Commit-ID: f3f7feec57b9141dfed9825874d0191b1ac18ad2
Gitweb:
https://git.kernel.org/tip/f3f7feec57b9141dfed9825874d0191b1ac18ad2
Author: Chengming Zhou 
AuthorDate: Wed, 03 Mar 2021 11:46:57 +08:00
Committer: Peter Zijlstra 
CommitterDate: Thu, 04 Mar 2021 09:56:01 +01:00

psi: Use ONCPU state tracking machinery to detect reclaim

Move the reclaim detection from the timer tick to the task state
tracking machinery using the recently added ONCPU state. We also check
for task psi_flags changes in the psi_task_switch() optimization so
that the parent cgroups are updated properly.

In terms of performance and cost, this ONCPU task state tracking is
not cheaper than the previous timer tick in aggregate. But the code is
simpler and shorter this way, so it's a maintainability win. Johannes
did some testing with perf bench, and the performance and cost changes
should be acceptable for real workloads.

Thanks to Johannes Weiner for pointing out the psi_task_switch()
optimization and for the clearer changelog.

Co-developed-by: Muchun Song 
Signed-off-by: Muchun Song 
Signed-off-by: Chengming Zhou 
Signed-off-by: Peter Zijlstra (Intel) 
Acked-by: Johannes Weiner 
Link: 
https://lkml.kernel.org/r/20210303034659.91735-3-zhouchengm...@bytedance.com
---
 include/linux/psi.h  |  1 +-
 kernel/sched/core.c  |  1 +-
 kernel/sched/psi.c   | 65 +++
 kernel/sched/stats.h |  9 +--
 4 files changed, 24 insertions(+), 52 deletions(-)

diff --git a/include/linux/psi.h b/include/linux/psi.h
index 7361023..65eb147 100644
--- a/include/linux/psi.h
+++ b/include/linux/psi.h
@@ -20,7 +20,6 @@ void psi_task_change(struct task_struct *task, int clear, int set);
 void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 bool sleep);
 
-void psi_memstall_tick(struct task_struct *task, int cpu);
 void psi_memstall_enter(unsigned long *flags);
 void psi_memstall_leave(unsigned long *flags);
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 361974e..d2629fd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4551,7 +4551,6 @@ void scheduler_tick(void)
update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
curr->sched_class->task_tick(rq, curr, 0);
calc_global_load_tick(rq);
-   psi_task_tick(rq);
 
rq_unlock(rq, &rf);
 
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 2293c45..0fe6ff6 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -644,8 +644,7 @@ static void poll_timer_fn(struct timer_list *t)
wake_up_interruptible(&group->poll_wait);
 }
 
-static void record_times(struct psi_group_cpu *groupc, int cpu,
-bool memstall_tick)
+static void record_times(struct psi_group_cpu *groupc, int cpu)
 {
u32 delta;
u64 now;
@@ -664,23 +663,6 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,
groupc->times[PSI_MEM_SOME] += delta;
if (groupc->state_mask & (1 << PSI_MEM_FULL))
groupc->times[PSI_MEM_FULL] += delta;
-   else if (memstall_tick) {
-   u32 sample;
-   /*
-* Since we care about lost potential, a
-* memstall is FULL when there are no other
-* working tasks, but also when the CPU is
-* actively reclaiming and nothing productive
-* could run even if it were runnable.
-*
-* When the timer tick sees a reclaiming CPU,
-* regardless of runnable tasks, sample a FULL
-* tick (or less if it hasn't been a full tick
-* since the last state change).
-*/
-   sample = min(delta, (u32)jiffies_to_nsecs(1));
-   groupc->times[PSI_MEM_FULL] += sample;
-   }
}
 
if (groupc->state_mask & (1 << PSI_CPU_SOME)) {
@@ -714,7 +696,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
 */
write_seqcount_begin(&groupc->seq);
 
-   record_times(groupc, cpu, false);
+   record_times(groupc, cpu);
 
for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
if (!(m & (1 << t)))
@@ -738,6 +720,18 @@ static void psi_group_change(struct psi_group *group, int cpu,
if (test_state(groupc->tasks, s))
state_mask |= (1 << s);
}
+
+   /*
+* Since we care about lost potential, a memstall is FULL
+* when there are no other working tasks, but also when
+* the CPU is actively reclaiming and nothing productive
+* could run even if it were runnable. So when the current
+* task in a cgroup is in_memstall, the 

[tip: sched/core] psi: Add PSI_CPU_FULL state

2021-03-04 Thread tip-bot2 for Chengming Zhou
The following commit has been merged into the sched/core branch of tip:

Commit-ID: 311b293811a31929c72c790eff48cf767561589f
Gitweb:
https://git.kernel.org/tip/311b293811a31929c72c790eff48cf767561589f
Author: Chengming Zhou 
AuthorDate: Wed, 03 Mar 2021 11:46:56 +08:00
Committer: Peter Zijlstra 
CommitterDate: Thu, 04 Mar 2021 09:56:01 +01:00

psi: Add PSI_CPU_FULL state

The FULL state doesn't exist for the CPU resource at the system level,
but it does exist at the cgroup level: it means all non-idle tasks in a
cgroup are delayed on the CPU resource, which is either used by others
outside of the cgroup or throttled by the cgroup's cpu.max configuration.

Co-developed-by: Muchun Song 
Signed-off-by: Muchun Song 
Signed-off-by: Chengming Zhou 
Signed-off-by: Peter Zijlstra (Intel) 
Acked-by: Johannes Weiner 
Link: 
https://lkml.kernel.org/r/20210303034659.91735-2-zhouchengm...@bytedance.com
---
 include/linux/psi_types.h |  3 ++-
 kernel/sched/psi.c| 14 +++---
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index b95f321..0a23300 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -50,9 +50,10 @@ enum psi_states {
PSI_MEM_SOME,
PSI_MEM_FULL,
PSI_CPU_SOME,
+   PSI_CPU_FULL,
/* Only per-CPU, to weigh the CPU in the global average: */
PSI_NONIDLE,
-   NR_PSI_STATES = 6,
+   NR_PSI_STATES = 7,
 };
 
 enum psi_aggregators {
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 967732c..2293c45 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -34,7 +34,10 @@
  * delayed on that resource such that nobody is advancing and the CPU
  * goes idle. This leaves both workload and CPU unproductive.
  *
- * (Naturally, the FULL state doesn't exist for the CPU resource.)
+ * Naturally, the FULL state doesn't exist for the CPU resource at the
+ * system level, but exist at the cgroup level, means all non-idle tasks
+ * in a cgroup are delayed on the CPU resource which used by others outside
+ * of the cgroup or throttled by the cgroup cpu.max configuration.
  *
  * SOME = nr_delayed_tasks != 0
  * FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0
@@ -225,6 +228,8 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING];
case PSI_CPU_SOME:
return tasks[NR_RUNNING] > tasks[NR_ONCPU];
+   case PSI_CPU_FULL:
+   return tasks[NR_RUNNING] && !tasks[NR_ONCPU];
case PSI_NONIDLE:
return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
tasks[NR_RUNNING];
@@ -678,8 +683,11 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,
}
}
 
-   if (groupc->state_mask & (1 << PSI_CPU_SOME))
+   if (groupc->state_mask & (1 << PSI_CPU_SOME)) {
groupc->times[PSI_CPU_SOME] += delta;
+   if (groupc->state_mask & (1 << PSI_CPU_FULL))
+   groupc->times[PSI_CPU_FULL] += delta;
+   }
 
if (groupc->state_mask & (1 << PSI_NONIDLE))
groupc->times[PSI_NONIDLE] += delta;
@@ -1018,7 +1026,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
group->avg_next_update = update_averages(group, now);
mutex_unlock(&group->avgs_lock);
 
-   for (full = 0; full < 2 - (res == PSI_CPU); full++) {
+   for (full = 0; full < 2; full++) {
unsigned long avg[3];
u64 total;
int w;