Hello Chengming,

This patch looks useful to me. A couple of comments below:

On Tue, Feb 09, 2021 at 03:14:13PM +0800, Chengming Zhou wrote:
> The commit 36b238d57172 ("psi: Optimize switching tasks inside shared
> cgroups") only update cgroups whose state actually changes during a
> task switch only in task preempt case, not in task sleep case.
> 
> We actually don't need to clear and set TSK_ONCPU state for common cgroups
> of next and prev task in sleep case, that can save many psi_group_change
> especially when most activity comes from one leaf cgroup.

Can you please make this a bit more concrete? Maybe include this:

sleep before:
psi_dequeue()
  while ((group = iterate_groups(prev)))                        # all ancestors
    psi_group_change(prev, .clear=TSK_RUNNING|TSK_ONCPU)
psi_task_switch()
  while ((group = iterate_groups(next)))                        # all ancestors
    psi_group_change(next, .set=TSK_ONCPU)

sleep after:
psi_dequeue()
  nop
psi_task_switch()
  while ((group = iterate_groups(next)))                        # until (prev & next)
    psi_group_change(next, .set=TSK_ONCPU)
  while ((group = iterate_groups(prev)))                        # all ancestors
    psi_group_change(prev, .clear = common ? TSK_RUNNING : TSK_RUNNING|TSK_ONCPU)

When a voluntary sleep switches to another task, we remove one call of
psi_group_change() for every common cgroup ancestor of the two tasks.

> Signed-off-by: Muchun Song <songmuc...@bytedance.com>
> Signed-off-by: Chengming Zhou <zhouchengm...@bytedance.com>
> ---
>  kernel/sched/psi.c   | 27 +++++++++++++++++----------
>  kernel/sched/stats.h | 17 +++--------------
>  2 files changed, 20 insertions(+), 24 deletions(-)
> 
> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> index 6e46d9eb279b..6061e87089ac 100644
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -836,20 +836,27 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
>               }
>       }
>  
> -     /*
> -      * If this is a voluntary sleep, dequeue will have taken care
> -      * of the outgoing TSK_ONCPU alongside TSK_RUNNING already. We
> -      * only need to deal with it during preemption.
> -      */
> -     if (sleep)
> -             return;
> -
>       if (prev->pid) {
> -             psi_flags_change(prev, TSK_ONCPU, 0);
> +             int clear = 0, set = 0;
> +
> +             if (sleep) {
> +                     clear |= TSK_RUNNING;
> +                     if (prev->in_iowait)
> +                             set |= TSK_IOWAIT;
> +             }

This needs a comment why it's doing psi_dequeue()'s job. How about this?

                /*
                 * When we're going to sleep, psi_dequeue() lets us handle
                 * TSK_RUNNING and TSK_IOWAIT here, where we can combine it
                 * with TSK_ONCPU and save walking common ancestors twice.
                 */
                if (sleep) {
                        ...

> +             psi_flags_change(prev, clear | TSK_ONCPU, set);
>  
>               iter = NULL;
>               while ((group = iterate_groups(prev, &iter)) && group != common)
> -                     psi_group_change(group, cpu, TSK_ONCPU, 0, true);
> +                     psi_group_change(group, cpu, clear | TSK_ONCPU, set, true);
> +
> +             if (sleep) {
> +                     while (group) {
> +                             psi_group_change(group, cpu, clear, set, true);
> +                             group = iterate_groups(prev, &iter);
> +                     }
> +             }

This function is *primarily* about handling TSK_ONCPU and secondarily
optimizes the dequeue. It would be a bit clearer to do:

        int clear = TSK_ONCPU, set = 0;

        ...

        /*
         * TSK_ONCPU is handled up to the common ancestor. If we're tasked
         * with dequeuing too, finish that for the rest of the hierarchy.
         */
        if (sleep) {
                clear &= ~TSK_ONCPU;
                for (; group; group = iterate_groups(prev, &iter))
                        psi_group_change(group, cpu, clear, set, true);
        }


> diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
> index 9e4e67a94731..2d92c8467678 100644
> --- a/kernel/sched/stats.h
> +++ b/kernel/sched/stats.h
> @@ -84,28 +84,17 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
>  
>  static inline void psi_dequeue(struct task_struct *p, bool sleep)
>  {
> -     int clear = TSK_RUNNING, set = 0;
> -
>       if (static_branch_likely(&psi_disabled))
>               return;
>  
>       if (!sleep) {
> +             int clear = TSK_RUNNING;
> +
>               if (p->in_memstall)
>                       clear |= TSK_MEMSTALL;
> -     } else {
> -             /*
> -              * When a task sleeps, schedule() dequeues it before
> -              * switching to the next one. Merge the clearing of
> -              * TSK_RUNNING and TSK_ONCPU to save an unnecessary
> -              * psi_task_change() call in psi_sched_switch().
> -              */
> -             clear |= TSK_ONCPU;
>  
> -             if (p->in_iowait)
> -                     set |= TSK_IOWAIT;
> +             psi_task_change(p, clear, 0);
>       }

Likewise, this really should have a comment for why it's not handling
TSK_RUNNING to match psi_enqueue()!

        int clear = TSK_RUNNING;

        /*
         * A voluntary sleep is a dequeue followed by a task switch. To
         * avoid walking all ancestors twice, psi_task_switch() handles
         * TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU.
         * Do nothing here.
         */
        if (sleep)
                return;

        if (p->in_memstall)
                clear |= TSK_MEMSTALL;

        psi_task_change(p, clear, 0);
        
Thanks

Reply via email to