[tip: sched/core] psi: Optimize task switch inside shared cgroups

2021-03-06 Thread tip-bot2 for Chengming Zhou
The following commit has been merged into the sched/core branch of tip:

Commit-ID: 4117cebf1a9fcbf35b9aabf0e37b6c5eea296798
Gitweb:
https://git.kernel.org/tip/4117cebf1a9fcbf35b9aabf0e37b6c5eea296798
Author:Chengming Zhou 
AuthorDate:Wed, 03 Mar 2021 11:46:59 +08:00
Committer: Ingo Molnar 
CommitterDate: Sat, 06 Mar 2021 12:40:23 +01:00

psi: Optimize task switch inside shared cgroups

Commit 36b238d57172 ("psi: Optimize switching tasks inside shared
cgroups") made task switches update only the cgroups whose state
actually changes, but only in the preemption case, not in the sleep
case.

In the sleep case we don't actually need to clear and then set the
TSK_ONCPU state for the cgroups common to prev and next; skipping
that saves many psi_group_change() calls, especially when most
activity comes from one leaf cgroup.

sleep before:
psi_dequeue()
  while ((group = iterate_groups(prev)))  # all ancestors
    psi_group_change(prev, .clear=TSK_RUNNING|TSK_ONCPU)
psi_task_switch()
  while ((group = iterate_groups(next)))  # all ancestors
    psi_group_change(next, .set=TSK_ONCPU)

sleep after:
psi_dequeue()
  nop
psi_task_switch()
  while ((group = iterate_groups(next)))  # until (prev & next)
    psi_group_change(next, .set=TSK_ONCPU)
  while ((group = iterate_groups(prev)))  # all ancestors
    psi_group_change(prev, .clear=common?TSK_RUNNING:TSK_RUNNING|TSK_ONCPU)

When a voluntary sleep switches to another task, we remove one call of
psi_group_change() for every common cgroup ancestor of the two tasks.
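
To make the savings concrete, here is a minimal userspace model that
counts psi_group_change() calls for the before/after walks shown above,
with both tasks in the same leaf cgroup. This is illustrative only, not
kernel code; the struct layout and the walk_until() helper are
assumptions:

#include <stdio.h>

struct group { struct group *parent; };

/* One psi_group_change() per ancestor, stopping early at 'stop'
 * (NULL means walk all the way past the root). */
static int walk_until(struct group *g, struct group *stop)
{
        int calls = 0;

        for (; g && g != stop; g = g->parent)
                calls++;
        return calls;
}

int main(void)
{
        /* root <- a <- leaf; prev and next both live in 'leaf' */
        struct group root = { NULL };
        struct group a = { &root };
        struct group leaf = { &a };
        struct group *prev = &leaf, *next = &leaf;
        struct group *common = &leaf;   /* deepest group shared by both */

        /* before: psi_dequeue() walks all of prev's ancestors, then
         * psi_task_switch() walks all of next's ancestors again */
        int before = walk_until(prev, NULL) + walk_until(next, NULL);

        /* after: next stops at the common ancestor; prev is still
         * walked fully, but only once */
        int after = walk_until(next, common) + walk_until(prev, NULL);

        printf("psi_group_change() calls: before=%d after=%d\n",
               before, after);  /* prints before=6 after=3 */
        return 0;
}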

Co-developed-by: Muchun Song 
Signed-off-by: Muchun Song 
Signed-off-by: Chengming Zhou 
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Ingo Molnar 
Acked-by: Johannes Weiner 
Link: https://lkml.kernel.org/r/20210303034659.91735-5-zhouchengm...@bytedance.com
---
 kernel/sched/psi.c   | 35 +--
 kernel/sched/stats.h | 28 
 2 files changed, 37 insertions(+), 26 deletions(-)

diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 3907a6b..ee3c5b4 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -840,20 +840,35 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
}
}
 
-   /*
-* If this is a voluntary sleep, dequeue will have taken care
-* of the outgoing TSK_ONCPU alongside TSK_RUNNING already. We
-* only need to deal with it during preemption.
-*/
-   if (sleep)
-   return;
-
if (prev->pid) {
-   psi_flags_change(prev, TSK_ONCPU, 0);
+   int clear = TSK_ONCPU, set = 0;
+
+   /*
+* When we're going to sleep, psi_dequeue() lets us handle
+* TSK_RUNNING and TSK_IOWAIT here, where we can combine it
+* with TSK_ONCPU and save walking common ancestors twice.
+*/
+   if (sleep) {
+   clear |= TSK_RUNNING;
+   if (prev->in_iowait)
+   set |= TSK_IOWAIT;
+   }
+
+   psi_flags_change(prev, clear, set);
 
iter = NULL;
while ((group = iterate_groups(prev, &iter)) && group != common)
-   psi_group_change(group, cpu, TSK_ONCPU, 0, true);
+   psi_group_change(group, cpu, clear, set, true);
+
+   /*
+* TSK_ONCPU is handled up to the common ancestor. If we're tasked
+* with dequeuing too, finish that for the rest of the hierarchy.
+*/
+   if (sleep) {
+   clear &= ~TSK_ONCPU;
+   for (; group; group = iterate_groups(prev, &iter))
+   psi_group_change(group, cpu, clear, set, true);
+   }
}
 }
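
For context: "common" in the hunk above is the deepest of next's
ancestors that also contains prev; psi_task_switch() finds it earlier in
the function. A hedged userspace model of that relationship follows (the
struct layout and helpers are illustrative assumptions, not the kernel's
iterate_groups() machinery):

#include <stdbool.h>
#include <stddef.h>

struct group { struct group *parent; };

/* Model: is 'anc' equal to 'g' or one of g's ancestors? */
static bool is_ancestor(const struct group *g, const struct group *anc)
{
        for (; g; g = g->parent)
                if (g == anc)
                        return true;
        return false;
}

/*
 * Deepest group on next's chain that also contains prev. At this group
 * and above, prev going off-CPU and next going on-CPU cancel out, so
 * TSK_ONCPU never needs to be cleared or set there.
 */
static struct group *common_ancestor(struct group *prev, struct group *next)
{
        for (; next; next = next->parent)
                if (is_ancestor(prev, next))
                        return next;
        return NULL;
}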
 
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 9e4e67a..dc218e9 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -84,28 +84,24 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
 
 static inline void psi_dequeue(struct task_struct *p, bool sleep)
 {
-   int clear = TSK_RUNNING, set = 0;
+   int clear = TSK_RUNNING;
 
if (static_branch_likely(&psi_disabled))
return;
 
-   if (!sleep) {
-   if (p->in_memstall)
-   clear |= TSK_MEMSTALL;
-   } else {
-   /*
-* When a task sleeps, schedule() dequeues it before
-* switching to the next one. Merge the clearing of
-* TSK_RUNNING and TSK_ONCPU to save an unnecessary
-* psi_task_change() call in psi_sched_switch().
-*/
-   clear |= TSK_ONCPU;
+   /*
+* A voluntary sleep is a dequeue followed by a task switch. To
+* avoid walking all ancestors twice, psi_task_switch() handles
+* TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU.
+* Do nothing here.
+*/
+   if (sleep)
+   return;
 
-   if (p->in_iowait)
-   set |= TSK_IOWAIT;
-   }
+   if (p->in_memstall)
+   clear |= TSK_MEMSTALL;
 
-   psi_task_change(p, clear, set);
+   psi_task_change(p, clear, 0);
 }
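
Taken together: on a voluntary sleep, psi_dequeue() now does nothing and
psi_task_switch() folds the dequeue flags into prev's single ancestor
walk. A hedged model of the prev-side flag math follows (the bit values
are illustrative; the kernel defines its own):

#include <stdio.h>

#define TSK_IOWAIT      (1 << 0)        /* illustrative values */
#define TSK_MEMSTALL    (1 << 1)
#define TSK_RUNNING     (1 << 2)
#define TSK_ONCPU       (1 << 3)

int main(void)
{
        int in_iowait = 1;              /* prev sleeps waiting on I/O */
        int clear = TSK_ONCPU, set = 0;

        /* sleep: fold psi_dequeue()'s work into this walk */
        clear |= TSK_RUNNING;
        if (in_iowait)
                set |= TSK_IOWAIT;

        /* groups below the common ancestor lose prev's ONCPU too */
        printf("below common:  clear=%#x set=%#x\n", clear, set);

        /* at the common ancestor and above, next is ONCPU there now,
         * so prev's walk must only finish the dequeue part */
        clear &= ~TSK_ONCPU;
        printf("common and up: clear=%#x set=%#x\n", clear, set);
        return 0;
}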