Hello Johannes,

在 2021/2/9 下午11:48, Johannes Weiner 写道:
> Hello Chengming,
>
> On Tue, Feb 09, 2021 at 03:10:33PM +0800, Chengming Zhou wrote:
>> When the current task in a cgroup is in_memstall, the corresponding groupc
>> on that cpu is in PSI_MEM_FULL state, so we can exploit that to remove the
>> redundant psi_task_tick from scheduler_tick to save this periodic cost.
> Can you please update the patch name and the changelog to the new
> version of the patch? It's not removing the redundant tick, it's
> moving the reclaim detection from the timer tick to the task state
> tracking machinery using the recently added ONCPU state.

Yes, I will change the name and changelog, it will be clearer for this patch : )

>> Signed-off-by: Muchun Song <songmuc...@bytedance.com>
>> Signed-off-by: Chengming Zhou <zhouchengm...@bytedance.com>
>> ---
>>  include/linux/psi.h  |  1 -
>>  kernel/sched/core.c  |  1 -
>>  kernel/sched/psi.c   | 49 ++++++++++++++-----------------------------------
>>  kernel/sched/stats.h |  9 ---------
>>  4 files changed, 14 insertions(+), 46 deletions(-)
>>
>> diff --git a/include/linux/psi.h b/include/linux/psi.h
>> index 7361023f3fdd..65eb1476ac70 100644
>> --- a/include/linux/psi.h
>> +++ b/include/linux/psi.h
>> @@ -20,7 +20,6 @@ void psi_task_change(struct task_struct *task, int clear, 
>> int set);
>>  void psi_task_switch(struct task_struct *prev, struct task_struct *next,
>>                   bool sleep);
>>  
>> -void psi_memstall_tick(struct task_struct *task, int cpu);
>>  void psi_memstall_enter(unsigned long *flags);
>>  void psi_memstall_leave(unsigned long *flags);
>>  
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index 15d2562118d1..31788a9b335b 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -4533,7 +4533,6 @@ void scheduler_tick(void)
>>      update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
>>      curr->sched_class->task_tick(rq, curr, 0);
>>      calc_global_load_tick(rq);
>> -    psi_task_tick(rq);
>>  
>>      rq_unlock(rq, &rf);
>>  
>> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
>> index 2293c45d289d..6e46d9eb279b 100644
>> --- a/kernel/sched/psi.c
>> +++ b/kernel/sched/psi.c
>> @@ -644,8 +644,7 @@ static void poll_timer_fn(struct timer_list *t)
>>      wake_up_interruptible(&group->poll_wait);
>>  }
>>  
>> -static void record_times(struct psi_group_cpu *groupc, int cpu,
>> -                     bool memstall_tick)
>> +static void record_times(struct psi_group_cpu *groupc, int cpu)
>>  {
>>      u32 delta;
>>      u64 now;
>> @@ -664,23 +663,6 @@ static void record_times(struct psi_group_cpu *groupc, 
>> int cpu,
>>              groupc->times[PSI_MEM_SOME] += delta;
>>              if (groupc->state_mask & (1 << PSI_MEM_FULL))
>>                      groupc->times[PSI_MEM_FULL] += delta;
>> -            else if (memstall_tick) {
>> -                    u32 sample;
>> -                    /*
>> -                     * Since we care about lost potential, a
>> -                     * memstall is FULL when there are no other
>> -                     * working tasks, but also when the CPU is
>> -                     * actively reclaiming and nothing productive
>> -                     * could run even if it were runnable.
>> -                     *
>> -                     * When the timer tick sees a reclaiming CPU,
>> -                     * regardless of runnable tasks, sample a FULL
>> -                     * tick (or less if it hasn't been a full tick
>> -                     * since the last state change).
>> -                     */
>> -                    sample = min(delta, (u32)jiffies_to_nsecs(1));
>> -                    groupc->times[PSI_MEM_FULL] += sample;
>> -            }
>>      }
>>  
>>      if (groupc->state_mask & (1 << PSI_CPU_SOME)) {
>> @@ -714,7 +696,7 @@ static void psi_group_change(struct psi_group *group, 
>> int cpu,
>>       */
>>      write_seqcount_begin(&groupc->seq);
>>  
>> -    record_times(groupc, cpu, false);
>> +    record_times(groupc, cpu);
>>  
>>      for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
>>              if (!(m & (1 << t)))
>> @@ -738,6 +720,18 @@ static void psi_group_change(struct psi_group *group, 
>> int cpu,
>>              if (test_state(groupc->tasks, s))
>>                      state_mask |= (1 << s);
>>      }
>> +
>> +    /*
>> +     * Since we care about lost potential, a memstall is FULL
>> +     * when there are no other working tasks, but also when
>> +     * the CPU is actively reclaiming and nothing productive
>> +     * could run even if it were runnable. So when the current
>> +     * task in a cgroup is in_memstall, the corresponding groupc
>> +     * on that cpu is in PSI_MEM_FULL state.
>> +     */
>> +    if (groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall)
>> +            state_mask |= (1 << PSI_MEM_FULL);
> This doesn't really work with the psi_task_switch() optimization. If
> we switch between two tasks inside a leaf group, where one is memstall
> and the other is not, we don't update the parents properly. So you
> need another branch in there as well for checking memstall. At which
> point the timer tick implementation is likely cheaper and simpler...

Thank you for pointing out this. I will add another branch there to check 
memstall to update

the parents properly and send a patch-v2.

Thanks.

>> @@ -144,14 +144,6 @@ static inline void psi_sched_switch(struct task_struct 
>> *prev,
>>      psi_task_switch(prev, next, sleep);
>>  }
>>  
>> -static inline void psi_task_tick(struct rq *rq)
>> -{
>> -    if (static_branch_likely(&psi_disabled))
>> -            return;
>> -
>> -    if (unlikely(rq->curr->in_memstall))
>> -            psi_memstall_tick(rq->curr, cpu_of(rq));
>> -}
>>  #else /* CONFIG_PSI */
>>  static inline void psi_enqueue(struct task_struct *p, bool wakeup) {}
>>  static inline void psi_dequeue(struct task_struct *p, bool sleep) {}

Reply via email to